From 93fa7636dfdc059b25df148f230c0991096afdef Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Tue, 8 Apr 2008 11:01:58 +0200 Subject: x86, ptrace: PEBS support Polish the ds.h interface and add support for PEBS. Ds.c is meant to be the resource allocator for per-thread and per-cpu BTS and PEBS recording. It is used by ptrace/utrace to provide execution tracing of debugged tasks. It will be used by profilers (e.g. perfmon2). It may be used by kernel debuggers to provide a kernel execution trace. Changes in detail: - guard DS and ptrace by CONFIG macros - separate DS and BTS more clearly - simplify field accesses - add functions to manage PEBS buffers - add simple protection/allocation mechanism - added support for Atom Opens: - buffer overflow handling Currently, only circular buffers are supported. This is all we need for debugging. Profilers would want an overflow notification. This is planned to be added when perfmon2 is made to use the ds.h interface. - utrace intermediate layer Signed-off-by: Markus Metzger Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig.cpu | 18 + arch/x86/kernel/cpu/intel.c | 3 +- arch/x86/kernel/ds.c | 953 ++++++++++++++++++++++++++++++------------- arch/x86/kernel/process_32.c | 25 +- arch/x86/kernel/process_64.c | 25 +- arch/x86/kernel/ptrace.c | 444 ++++++++++++-------- arch/x86/kernel/setup_64.c | 3 +- 7 files changed, 1009 insertions(+), 462 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2ad6301849a1..13f3f8bebfd1 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -415,3 +415,21 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y depends on !(M586MMX || M586TSC || M586 || M486 || M386) + +config X86_DS + bool "Debug Store support" + default y + help + Add support for Debug Store. + This allows the kernel to provide a memory buffer to the hardware + to store various profiling and tracing events. + +config X86_PTRACE_BTS + bool "ptrace interface to Branch Trace Store" + default y + depends on (X86_DS && X86_DEBUGCTLMSR) + help + Add a ptrace interface to allow collecting an execution trace + of the traced task. + This collects control flow changes in a (cyclic) buffer and allows + debuggers to fill in the gaps and show an execution trace of the debuggee. diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index fe9224c51d37..cbffa2a25a13 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -222,10 +222,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); } if (cpu_has_bts) - ds_init_intel(c); + ptrace_bts_init_intel(c); } static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 11c11b8ec48d..5b32b6d062b4 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -2,26 +2,48 @@ * Debug Store support * * This provides a low-level interface to the hardware's Debug Store - * feature that is used for last branch recording (LBR) and + * feature that is used for branch trace store (BTS) and * precise-event based sampling (PEBS). * - * Different architectures use a different DS layout/pointer size. - * The below functions therefore work on a void*. + * It manages: + * - per-thread and per-cpu allocation of BTS and PEBS + * - buffer memory allocation (optional) + * - buffer overflow handling + * - buffer access * + * It assumes: + * - get_task_struct on all parameter tasks + * - current is allowed to trace parameter tasks * - * Since there is no user for PEBS, yet, only LBR (or branch - * trace store, BTS) is supported. * - * - * Copyright (C) 2007 Intel Corporation. - * Markus Metzger , Dec 2007 + * Copyright (C) 2007-2008 Intel Corporation. + * Markus Metzger , 2007-2008 */ + +#ifdef CONFIG_X86_DS + #include #include #include #include +#include + + +/* + * The configuration for a particular DS hardware implementation. + */ +struct ds_configuration { + /* the size of the DS structure in bytes */ + unsigned char sizeof_ds; + /* the size of one pointer-typed field in the DS structure in bytes; + this covers the first 8 fields related to buffer management. */ + unsigned char sizeof_field; + /* the size of a BTS/PEBS record in bytes */ + unsigned char sizeof_rec[2]; +}; +static struct ds_configuration ds_cfg; /* @@ -44,378 +66,747 @@ * (interrupt occurs when write pointer passes interrupt pointer) * - value to which counter is reset following counter overflow * - * On later architectures, the last branch recording hardware uses - * 64bit pointers even in 32bit mode. - * - * - * Branch Trace Store (BTS) records store information about control - * flow changes. They at least provide the following information: - * - source linear address - * - destination linear address + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. * - * Netburst supported a predicated bit that had been dropped in later - * architectures. We do not suppor it. * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * - an offset giving the start of the respective region * - * In order to abstract from the actual DS and BTS layout, we describe - * the access to the relevant fields. - * Thanks to Andi Kleen for proposing this design. + * This offset is further used to index various arrays holding + * information for BTS and PEBS at the respective index. * - * The implementation, however, is not as general as it might seem. In - * order to stay somewhat simple and efficient, we assume an - * underlying unsigned type (mostly a pointer type) and we expect the - * field to be at least as big as that type. + * On later 32bit processors, we only access the lower 32bit of the + * 64bit pointer fields. The upper halves will be zeroed out. */ -/* - * A special from_ip address to indicate that the BTS record is an - * info record that needs to be interpreted or skipped. - */ -#define BTS_ESCAPE_ADDRESS (-1) +enum ds_field { + ds_buffer_base = 0, + ds_index, + ds_absolute_maximum, + ds_interrupt_threshold, +}; -/* - * A field access descriptor - */ -struct access_desc { - unsigned char offset; - unsigned char size; +enum ds_qualifier { + ds_bts = 0, + ds_pebs }; +static inline unsigned long ds_get(const unsigned char *base, + enum ds_qualifier qual, enum ds_field field) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + return *(unsigned long *)base; +} + +static inline void ds_set(unsigned char *base, enum ds_qualifier qual, + enum ds_field field, unsigned long value) +{ + base += (ds_cfg.sizeof_field * (field + (4 * qual))); + (*(unsigned long *)base) = value; +} + + /* - * The configuration for a particular DS/BTS hardware implementation. + * Locking is done only for allocating BTS or PEBS resources and for + * guarding context and buffer memory allocation. + * + * Most functions require the current task to own the ds context part + * they are going to access. All the locking is done when validating + * access to the context. */ -struct ds_configuration { - /* the DS configuration */ - unsigned char sizeof_ds; - struct access_desc bts_buffer_base; - struct access_desc bts_index; - struct access_desc bts_absolute_maximum; - struct access_desc bts_interrupt_threshold; - /* the BTS configuration */ - unsigned char sizeof_bts; - struct access_desc from_ip; - struct access_desc to_ip; - /* BTS variants used to store additional information like - timestamps */ - struct access_desc info_type; - struct access_desc info_data; - unsigned long debugctl_mask; -}; +static spinlock_t ds_lock = __SPIN_LOCK_UNLOCKED(ds_lock); /* - * The global configuration used by the below accessor functions + * Validate that the current task is allowed to access the BTS/PEBS + * buffer of the parameter task. + * + * Returns 0, if access is granted; -Eerrno, otherwise. */ -static struct ds_configuration ds_cfg; +static inline int ds_validate_access(struct ds_context *context, + enum ds_qualifier qual) +{ + if (!context) + return -EPERM; + + if (context->owner[qual] == current) + return 0; + + return -EPERM; +} + /* - * Accessor functions for some DS and BTS fields using the above - * global ptrace_bts_cfg. + * We either support (system-wide) per-cpu or per-thread allocation. + * We distinguish the two based on the task_struct pointer, where a + * NULL pointer indicates per-cpu allocation for the current cpu. + * + * Allocations are use-counted. As soon as resources are allocated, + * further allocations must be of the same type (per-cpu or + * per-thread). We model this by counting allocations (i.e. the number + * of tracers of a certain type) for one type negatively: + * =0 no tracers + * >0 number of per-thread tracers + * <0 number of per-cpu tracers + * + * The below functions to get and put tracers and to check the + * allocation type require the ds_lock to be held by the caller. + * + * Tracers essentially gives the number of ds contexts for a certain + * type of allocation. */ -static inline unsigned long get_bts_buffer_base(char *base) +static long tracers; + +static inline void get_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_buffer_base.offset); + tracers += (task ? 1 : -1); } -static inline void set_bts_buffer_base(char *base, unsigned long value) + +static inline void put_tracer(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_buffer_base.offset)) = value; + tracers -= (task ? 1 : -1); } -static inline unsigned long get_bts_index(char *base) + +static inline int check_tracer(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_index.offset); + return (task ? (tracers >= 0) : (tracers <= 0)); } -static inline void set_bts_index(char *base, unsigned long value) + + +/* + * The DS context is either attached to a thread or to a cpu: + * - in the former case, the thread_struct contains a pointer to the + * attached context. + * - in the latter case, we use a static array of per-cpu context + * pointers. + * + * Contexts are use-counted. They are allocated on first access and + * deallocated when the last user puts the context. + * + * We distinguish between an allocating and a non-allocating get of a + * context: + * - the allocating get is used for requesting BTS/PEBS resources. It + * requires the caller to hold the global ds_lock. + * - the non-allocating get is used for all other cases. A + * non-existing context indicates an error. It acquires and releases + * the ds_lock itself for obtaining the context. + * + * A context and its DS configuration are allocated and deallocated + * together. A context always has a DS configuration of the + * appropriate size. + */ +static DEFINE_PER_CPU(struct ds_context *, system_context); + +#define this_system_context per_cpu(system_context, smp_processor_id()) + +/* + * Returns the pointer to the parameter task's context or to the + * system-wide context, if task is NULL. + * + * Increases the use count of the returned context, if not NULL. + */ +static inline struct ds_context *ds_get_context(struct task_struct *task) { - (*(unsigned long *)(base + ds_cfg.bts_index.offset)) = value; + struct ds_context *context; + + spin_lock(&ds_lock); + + context = (task ? task->thread.ds_ctx : this_system_context); + if (context) + context->count++; + + spin_unlock(&ds_lock); + + return context; } -static inline unsigned long get_bts_absolute_maximum(char *base) + +/* + * Same as ds_get_context, but allocates the context and it's DS + * structure, if necessary; returns NULL; if out of memory. + * + * pre: requires ds_lock to be held + */ +static inline struct ds_context *ds_alloc_context(struct task_struct *task) { - return *(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset); + struct ds_context **p_context = + (task ? &task->thread.ds_ctx : &this_system_context); + struct ds_context *context = *p_context; + + if (!context) { + context = kzalloc(sizeof(*context), GFP_KERNEL); + + if (!context) + return 0; + + context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + if (!context->ds) { + kfree(context); + return 0; + } + + *p_context = context; + + context->this = p_context; + context->task = task; + + if (task) + set_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) + wrmsr(MSR_IA32_DS_AREA, (unsigned long)context->ds, 0); + + get_tracer(task); + } + + context->count++; + + return context; } -static inline void set_bts_absolute_maximum(char *base, unsigned long value) + +/* + * Decreases the use count of the parameter context, if not NULL. + * Deallocates the context, if the use count reaches zero. + */ +static inline void ds_put_context(struct ds_context *context) { - (*(unsigned long *)(base + ds_cfg.bts_absolute_maximum.offset)) = value; + if (!context) + return; + + spin_lock(&ds_lock); + + if (--context->count) + goto out; + + *(context->this) = 0; + + if (context->task) + clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + + if (!context->task || (context->task == current)) + wrmsrl(MSR_IA32_DS_AREA, 0); + + put_tracer(context->task); + + /* free any leftover buffers from tracers that did not + * deallocate them properly. */ + kfree(context->buffer[ds_bts]); + kfree(context->buffer[ds_pebs]); + kfree(context->ds); + kfree(context); + out: + spin_unlock(&ds_lock); } -static inline unsigned long get_bts_interrupt_threshold(char *base) + + +/* + * Handle a buffer overflow + * + * task: the task whose buffers are overflowing; + * NULL for a buffer overflow on the current cpu + * context: the ds context + * qual: the buffer type + */ +static void ds_overflow(struct task_struct *task, struct ds_context *context, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset); + if (!context) + return; + + if (context->callback[qual]) + (*context->callback[qual])(task); + + /* todo: do some more overflow handling */ } -static inline void set_bts_interrupt_threshold(char *base, unsigned long value) + + +/* + * Allocate a non-pageable buffer of the parameter size. + * Checks the memory and the locked memory rlimit. + * + * Returns the buffer, if successful; + * NULL, if out of memory or rlimit exceeded. + * + * size: the requested buffer size in bytes + * pages (out): if not NULL, contains the number of pages reserved + */ +static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) { - (*(unsigned long *)(base + ds_cfg.bts_interrupt_threshold.offset)) = value; + unsigned long rlim, vm, pgsz; + void *buffer; + + pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; + + rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = current->mm->total_vm + pgsz; + if (rlim < vm) + return 0; + + rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = current->mm->locked_vm + pgsz; + if (rlim < vm) + return 0; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + return 0; + + current->mm->total_vm += pgsz; + current->mm->locked_vm += pgsz; + + if (pages) + *pages = pgsz; + + return buffer; } -static inline unsigned long get_from_ip(char *base) + +static int ds_request(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl, enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.from_ip.offset); + struct ds_context *context; + unsigned long buffer, adj; + const unsigned long alignment = (1 << 3); + int error = 0; + + if (!ds_cfg.sizeof_ds) + return -EOPNOTSUPP; + + /* we require some space to do alignment adjustments below */ + if (size < (alignment + ds_cfg.sizeof_rec[qual])) + return -EINVAL; + + /* buffer overflow notification is not yet implemented */ + if (ovfl) + return -EOPNOTSUPP; + + + spin_lock(&ds_lock); + + if (!check_tracer(task)) + return -EPERM; + + error = -ENOMEM; + context = ds_alloc_context(task); + if (!context) + goto out_unlock; + + error = -EALREADY; + if (context->owner[qual] == current) + goto out_unlock; + error = -EPERM; + if (context->owner[qual] != 0) + goto out_unlock; + context->owner[qual] = current; + + spin_unlock(&ds_lock); + + + error = -ENOMEM; + if (!base) { + base = ds_allocate_buffer(size, &context->pages[qual]); + if (!base) + goto out_release; + + context->buffer[qual] = base; + } + error = 0; + + context->callback[qual] = ovfl; + + /* adjust the buffer address and size to meet alignment + * constraints: + * - buffer is double-word aligned + * - size is multiple of record size + * + * We checked the size at the very beginning; we have enough + * space to do the adjustment. + */ + buffer = (unsigned long)base; + + adj = ALIGN(buffer, alignment) - buffer; + buffer += adj; + size -= adj; + + size /= ds_cfg.sizeof_rec[qual]; + size *= ds_cfg.sizeof_rec[qual]; + + ds_set(context->ds, qual, ds_buffer_base, buffer); + ds_set(context->ds, qual, ds_index, buffer); + ds_set(context->ds, qual, ds_absolute_maximum, buffer + size); + + if (ovfl) { + /* todo: select a suitable interrupt threshold */ + } else + ds_set(context->ds, qual, + ds_interrupt_threshold, buffer + size + 1); + + /* we keep the context until ds_release */ + return error; + + out_release: + context->owner[qual] = 0; + ds_put_context(context); + return error; + + out_unlock: + spin_unlock(&ds_lock); + ds_put_context(context); + return error; } -static inline void set_from_ip(char *base, unsigned long value) + +int ds_request_bts(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - (*(unsigned long *)(base + ds_cfg.from_ip.offset)) = value; + return ds_request(task, base, size, ovfl, ds_bts); } -static inline unsigned long get_to_ip(char *base) + +int ds_request_pebs(struct task_struct *task, void *base, size_t size, + ds_ovfl_callback_t ovfl) { - return *(unsigned long *)(base + ds_cfg.to_ip.offset); + return ds_request(task, base, size, ovfl, ds_pebs); } -static inline void set_to_ip(char *base, unsigned long value) + +static int ds_release(struct task_struct *task, enum ds_qualifier qual) { - (*(unsigned long *)(base + ds_cfg.to_ip.offset)) = value; + struct ds_context *context; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + kfree(context->buffer[qual]); + context->buffer[qual] = 0; + + current->mm->total_vm -= context->pages[qual]; + current->mm->locked_vm -= context->pages[qual]; + context->pages[qual] = 0; + context->owner[qual] = 0; + + /* + * we put the context twice: + * once for the ds_get_context + * once for the corresponding ds_request + */ + ds_put_context(context); + out: + ds_put_context(context); + return error; } -static inline unsigned char get_info_type(char *base) + +int ds_release_bts(struct task_struct *task) { - return *(unsigned char *)(base + ds_cfg.info_type.offset); + return ds_release(task, ds_bts); } -static inline void set_info_type(char *base, unsigned char value) + +int ds_release_pebs(struct task_struct *task) { - (*(unsigned char *)(base + ds_cfg.info_type.offset)) = value; + return ds_release(task, ds_pebs); } -static inline unsigned long get_info_data(char *base) + +static int ds_get_index(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - return *(unsigned long *)(base + ds_cfg.info_data.offset); + struct ds_context *context; + unsigned long base, index; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + + error = ((index - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; } -static inline void set_info_data(char *base, unsigned long value) + +int ds_get_bts_index(struct task_struct *task, size_t *pos) { - (*(unsigned long *)(base + ds_cfg.info_data.offset)) = value; + return ds_get_index(task, pos, ds_bts); } +int ds_get_pebs_index(struct task_struct *task, size_t *pos) +{ + return ds_get_index(task, pos, ds_pebs); +} -int ds_allocate(void **dsp, size_t bts_size_in_bytes) +static int ds_get_end(struct task_struct *task, size_t *pos, + enum ds_qualifier qual) { - size_t bts_size_in_records; - unsigned long bts; - void *ds; + struct ds_context *context; + unsigned long base, end; + int error; + + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); + + error = ((end - base) / ds_cfg.sizeof_rec[qual]); + if (pos) + *pos = error; + out: + ds_put_context(context); + return error; +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +int ds_get_bts_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_bts); +} - if (bts_size_in_bytes < 0) - return -EINVAL; +int ds_get_pebs_end(struct task_struct *task, size_t *pos) +{ + return ds_get_end(task, pos, ds_pebs); +} - bts_size_in_records = - bts_size_in_bytes / ds_cfg.sizeof_bts; - bts_size_in_bytes = - bts_size_in_records * ds_cfg.sizeof_bts; +static int ds_access(struct task_struct *task, size_t index, + const void **record, enum ds_qualifier qual) +{ + struct ds_context *context; + unsigned long base, idx; + int error; - if (bts_size_in_bytes <= 0) + if (!record) return -EINVAL; - bts = (unsigned long)kzalloc(bts_size_in_bytes, GFP_KERNEL); + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - if (!bts) - return -ENOMEM; + base = ds_get(context->ds, qual, ds_buffer_base); + idx = base + (index * ds_cfg.sizeof_rec[qual]); - ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); + error = -EINVAL; + if (idx > ds_get(context->ds, qual, ds_absolute_maximum)) + goto out; - if (!ds) { - kfree((void *)bts); - return -ENOMEM; - } - - set_bts_buffer_base(ds, bts); - set_bts_index(ds, bts); - set_bts_absolute_maximum(ds, bts + bts_size_in_bytes); - set_bts_interrupt_threshold(ds, bts + bts_size_in_bytes + 1); - - *dsp = ds; - return 0; + *record = (const void *)idx; + error = ds_cfg.sizeof_rec[qual]; + out: + ds_put_context(context); + return error; } -int ds_free(void **dsp) +int ds_access_bts(struct task_struct *task, size_t index, const void **record) { - if (*dsp) { - kfree((void *)get_bts_buffer_base(*dsp)); - kfree(*dsp); - *dsp = NULL; - } - return 0; + return ds_access(task, index, record, ds_bts); } -int ds_get_bts_size(void *ds) +int ds_access_pebs(struct task_struct *task, size_t index, const void **record) { - int size_in_bytes; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (!ds) - return 0; - - size_in_bytes = - get_bts_absolute_maximum(ds) - - get_bts_buffer_base(ds); - return size_in_bytes; + return ds_access(task, index, record, ds_pebs); } -int ds_get_bts_end(void *ds) +static int ds_write(struct task_struct *task, const void *record, size_t size, + enum ds_qualifier qual, int force) { - int size_in_bytes = ds_get_bts_size(ds); - - if (size_in_bytes <= 0) - return size_in_bytes; + struct ds_context *context; + int error; - return size_in_bytes / ds_cfg.sizeof_bts; -} + if (!record) + return -EINVAL; -int ds_get_bts_index(void *ds) -{ - int index_offset_in_bytes; + error = -EPERM; + context = ds_get_context(task); + if (!context) + goto out; - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; + if (!force) { + error = ds_validate_access(context, qual); + if (error < 0) + goto out; + } - index_offset_in_bytes = - get_bts_index(ds) - - get_bts_buffer_base(ds); + error = 0; + while (size) { + unsigned long base, index, end, write_end, int_th; + unsigned long write_size, adj_write_size; + + /* + * write as much as possible without producing an + * overflow interrupt. + * + * interrupt_threshold must either be + * - bigger than absolute_maximum or + * - point to a record between buffer_base and absolute_maximum + * + * index points to a valid record. + */ + base = ds_get(context->ds, qual, ds_buffer_base); + index = ds_get(context->ds, qual, ds_index); + end = ds_get(context->ds, qual, ds_absolute_maximum); + int_th = ds_get(context->ds, qual, ds_interrupt_threshold); + + write_end = min(end, int_th); + + /* if we are already beyond the interrupt threshold, + * we fill the entire buffer */ + if (write_end <= index) + write_end = end; + + if (write_end <= index) + goto out; + + write_size = min((unsigned long) size, write_end - index); + memcpy((void *)index, record, write_size); + + record = (const char *)record + write_size; + size -= write_size; + error += write_size; + + adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; + adj_write_size *= ds_cfg.sizeof_rec[qual]; + + /* zero out trailing bytes */ + memset((char *)index + write_size, 0, + adj_write_size - write_size); + index += adj_write_size; + + if (index >= end) + index = base; + ds_set(context->ds, qual, ds_index, index); + + if (index >= int_th) + ds_overflow(task, context, qual); + } - return index_offset_in_bytes / ds_cfg.sizeof_bts; + out: + ds_put_context(context); + return error; } -int ds_set_overflow(void *ds, int method) +int ds_write_bts(struct task_struct *task, const void *record, size_t size) { - switch (method) { - case DS_O_SIGNAL: - return -EOPNOTSUPP; - case DS_O_WRAP: - return 0; - default: - return -EINVAL; - } + return ds_write(task, record, size, ds_bts, /* force = */ 0); } -int ds_get_overflow(void *ds) +int ds_write_pebs(struct task_struct *task, const void *record, size_t size) { - return DS_O_WRAP; + return ds_write(task, record, size, ds_pebs, /* force = */ 0); } -int ds_clear(void *ds) +int ds_unchecked_write_bts(struct task_struct *task, + const void *record, size_t size) { - int bts_size = ds_get_bts_size(ds); - unsigned long bts_base; - - if (bts_size <= 0) - return bts_size; - - bts_base = get_bts_buffer_base(ds); - memset((void *)bts_base, 0, bts_size); - - set_bts_index(ds, bts_base); - return 0; + return ds_write(task, record, size, ds_bts, /* force = */ 1); } -int ds_read_bts(void *ds, int index, struct bts_struct *out) +int ds_unchecked_write_pebs(struct task_struct *task, + const void *record, size_t size) { - void *bts; + return ds_write(task, record, size, ds_pebs, /* force = */ 1); +} - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; +static int ds_reset_or_clear(struct task_struct *task, + enum ds_qualifier qual, int clear) +{ + struct ds_context *context; + unsigned long base, end; + int error; - if (index < 0) - return -EINVAL; + context = ds_get_context(task); + error = ds_validate_access(context, qual); + if (error < 0) + goto out; - if (index >= ds_get_bts_size(ds)) - return -EINVAL; + base = ds_get(context->ds, qual, ds_buffer_base); + end = ds_get(context->ds, qual, ds_absolute_maximum); - bts = (void *)(get_bts_buffer_base(ds) + (index * ds_cfg.sizeof_bts)); + if (clear) + memset((void *)base, 0, end - base); - memset(out, 0, sizeof(*out)); - if (get_from_ip(bts) == BTS_ESCAPE_ADDRESS) { - out->qualifier = get_info_type(bts); - out->variant.jiffies = get_info_data(bts); - } else { - out->qualifier = BTS_BRANCH; - out->variant.lbr.from_ip = get_from_ip(bts); - out->variant.lbr.to_ip = get_to_ip(bts); - } + ds_set(context->ds, qual, ds_index, base); - return sizeof(*out);; + error = 0; + out: + ds_put_context(context); + return error; } -int ds_write_bts(void *ds, const struct bts_struct *in) +int ds_reset_bts(struct task_struct *task) { - unsigned long bts; - - if (!ds_cfg.sizeof_ds || !ds_cfg.sizeof_bts) - return -EOPNOTSUPP; - - if (ds_get_bts_size(ds) <= 0) - return -ENXIO; + return ds_reset_or_clear(task, ds_bts, /* clear = */ 0); +} - bts = get_bts_index(ds); +int ds_reset_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 0); +} - memset((void *)bts, 0, ds_cfg.sizeof_bts); - switch (in->qualifier) { - case BTS_INVALID: - break; +int ds_clear_bts(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_bts, /* clear = */ 1); +} - case BTS_BRANCH: - set_from_ip((void *)bts, in->variant.lbr.from_ip); - set_to_ip((void *)bts, in->variant.lbr.to_ip); - break; +int ds_clear_pebs(struct task_struct *task) +{ + return ds_reset_or_clear(task, ds_pebs, /* clear = */ 1); +} - case BTS_TASK_ARRIVES: - case BTS_TASK_DEPARTS: - set_from_ip((void *)bts, BTS_ESCAPE_ADDRESS); - set_info_type((void *)bts, in->qualifier); - set_info_data((void *)bts, in->variant.jiffies); - break; +int ds_get_pebs_reset(struct task_struct *task, u64 *value) +{ + struct ds_context *context; + int error; - default: + if (!value) return -EINVAL; - } - bts = bts + ds_cfg.sizeof_bts; - if (bts >= get_bts_absolute_maximum(ds)) - bts = get_bts_buffer_base(ds); - set_bts_index(ds, bts); + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; - return ds_cfg.sizeof_bts; + *value = *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)); + + error = 0; + out: + ds_put_context(context); + return error; } -unsigned long ds_debugctl_mask(void) +int ds_set_pebs_reset(struct task_struct *task, u64 value) { - return ds_cfg.debugctl_mask; -} + struct ds_context *context; + int error; -#ifdef __i386__ -static const struct ds_configuration ds_cfg_netburst = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<2)|(1<<3) -}; + context = ds_get_context(task); + error = ds_validate_access(context, ds_pebs); + if (error < 0) + goto out; -static const struct ds_configuration ds_cfg_pentium_m = { - .sizeof_ds = 9 * 4, - .bts_buffer_base = { 0, 4 }, - .bts_index = { 4, 4 }, - .bts_absolute_maximum = { 8, 4 }, - .bts_interrupt_threshold = { 12, 4 }, - .sizeof_bts = 3 * 4, - .from_ip = { 0, 4 }, - .to_ip = { 4, 4 }, - .info_type = { 4, 1 }, - .info_data = { 8, 4 }, - .debugctl_mask = (1<<6)|(1<<7) + *(u64 *)(context->ds + (ds_cfg.sizeof_field * 8)) = value; + + error = 0; + out: + ds_put_context(context); + return error; +} + +static const struct ds_configuration ds_cfg_var = { + .sizeof_ds = sizeof(long) * 12, + .sizeof_field = sizeof(long), + .sizeof_rec[ds_bts] = sizeof(long) * 3, + .sizeof_rec[ds_pebs] = sizeof(long) * 10 }; -#endif /* _i386_ */ - -static const struct ds_configuration ds_cfg_core2 = { - .sizeof_ds = 9 * 8, - .bts_buffer_base = { 0, 8 }, - .bts_index = { 8, 8 }, - .bts_absolute_maximum = { 16, 8 }, - .bts_interrupt_threshold = { 24, 8 }, - .sizeof_bts = 3 * 8, - .from_ip = { 0, 8 }, - .to_ip = { 8, 8 }, - .info_type = { 8, 1 }, - .info_data = { 16, 8 }, - .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +static const struct ds_configuration ds_cfg_64 = { + .sizeof_ds = 8 * 12, + .sizeof_field = 8, + .sizeof_rec[ds_bts] = 8 * 3, + .sizeof_rec[ds_pebs] = 8 * 10 }; static inline void @@ -429,14 +820,13 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86) { case 0x6: switch (c->x86_model) { -#ifdef __i386__ case 0xD: case 0xE: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ case 0xF: /* Core2 */ - ds_configure(&ds_cfg_core2); + case 0x1C: /* Atom */ + ds_configure(&ds_cfg_64); break; default: /* sorry, don't know about them */ @@ -445,13 +835,11 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; case 0xF: switch (c->x86_model) { -#ifdef __i386__ case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_var); break; -#endif /* _i386_ */ default: /* sorry, don't know about them */ break; @@ -462,3 +850,14 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) break; } } + +void ds_free(struct ds_context *context) +{ + /* This is called when the task owning the parameter context + * is dying. There should not be any user of that context left + * to disturb us, anymore. */ + unsigned long leftovers = context->count; + while (leftovers--) + ds_put_context(context); +} +#endif /* CONFIG_X86_DS */ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..5cec8a75a4e8 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -316,6 +316,14 @@ void exit_thread(void) tss->x86_tss.io_bitmap_base = INVALID_IO_BITMAP_OFFSET; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(current->thread.ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(current->thread.ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -482,18 +490,27 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, { struct thread_struct *prev, *next; unsigned long debugctl; + unsigned long ds_prev = 0, ds_next = 0; prev = &prev_p->thread; next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { + +#ifdef CONFIG_X86_DS + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { /* we clear debugctl to make sure DS * is not in use when we change it */ debugctl = 0; update_debugctlmsr(0); - wrmsr(MSR_IA32_DS_AREA, next->ds_area_msr, 0); + wrmsr(MSR_IA32_DS_AREA, ds_next, 0); } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -517,13 +534,13 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, hard_enable_TSC(); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ if (!test_tsk_thread_flag(next_p, TIF_IO_BITMAP)) { diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..ad213494a22f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -267,6 +267,14 @@ void exit_thread(void) t->io_bitmap_max = 0; put_cpu(); } +#ifdef CONFIG_X86_DS + /* Free any DS contexts that have not been properly released. */ + if (unlikely(t->ds_ctx)) { + /* we clear debugctl to make sure DS is not used. */ + update_debugctlmsr(0); + ds_free(t->ds_ctx); + } +#endif /* CONFIG_X86_DS */ } void flush_thread(void) @@ -492,18 +500,27 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, { struct thread_struct *prev, *next; unsigned long debugctl; + unsigned long ds_prev = 0, ds_next = 0; prev = &prev_p->thread, next = &next_p->thread; debugctl = prev->debugctlmsr; - if (next->ds_area_msr != prev->ds_area_msr) { + +#ifdef CONFIG_X86_DS + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { /* we clear debugctl to make sure DS * is not in use when we change it */ debugctl = 0; update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, next->ds_area_msr); + wrmsrl(MSR_IA32_DS_AREA, ds_next); } +#endif /* CONFIG_X86_DS */ if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); @@ -541,13 +558,13 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS if (test_tsk_thread_flag(prev_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(prev_p, BTS_TASK_DEPARTS); if (test_tsk_thread_flag(next_p, TIF_BTS_TRACE_TS)) ptrace_bts_take_timestamp(next_p, BTS_TASK_ARRIVES); -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ } /* diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fb03ef380f0e..b7ff783dc5fe 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -554,45 +554,115 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS +/* + * The configuration for a particular BTS hardware implementation. + */ +struct bts_configuration { + /* the size of a BTS record in bytes; at most BTS_MAX_RECORD_SIZE */ + unsigned char sizeof_bts; + /* the size of a field in the BTS record in bytes */ + unsigned char sizeof_field; + /* a bitmask to enable/disable BTS in DEBUGCTL MSR */ + unsigned long debugctl_mask; +}; +static struct bts_configuration bts_cfg; + +#define BTS_MAX_RECORD_SIZE (8 * 3) + + +/* + * Branch Trace Store (BTS) uses the following format. Different + * architectures vary in the size of those fields. + * - source linear address + * - destination linear address + * - flags + * + * Later architectures use 64bit pointers throughout, whereas earlier + * architectures use 32bit pointers in 32bit mode. + * + * We compute the base address for the first 8 fields based on: + * - the field size stored in the DS configuration + * - the relative field position + * + * In order to store additional information in the BTS buffer, we use + * a special source address to indicate that the record requires + * special interpretation. + * + * Netburst indicated via a bit in the flags field whether the branch + * was predicted; this is ignored. + */ -static int ptrace_bts_get_size(struct task_struct *child) +enum bts_field { + bts_from = 0, + bts_to, + bts_flags, + + bts_escape = (unsigned long)-1, + bts_qual = bts_to, + bts_jiffies = bts_flags +}; + +static inline unsigned long bts_get(const char *base, enum bts_field field) { - if (!child->thread.ds_area_msr) - return -ENXIO; + base += (bts_cfg.sizeof_field * field); + return *(unsigned long *)base; +} - return ds_get_bts_index((void *)child->thread.ds_area_msr); +static inline void bts_set(char *base, enum bts_field field, unsigned long val) +{ + base += (bts_cfg.sizeof_field * field);; + (*(unsigned long *)base) = val; } -static int ptrace_bts_read_record(struct task_struct *child, - long index, +/* + * Translate a BTS record from the raw format into the bts_struct format + * + * out (out): bts_struct interpretation + * raw: raw BTS record + */ +static void ptrace_bts_translate_record(struct bts_struct *out, const void *raw) +{ + memset(out, 0, sizeof(*out)); + if (bts_get(raw, bts_from) == bts_escape) { + out->qualifier = bts_get(raw, bts_qual); + out->variant.jiffies = bts_get(raw, bts_jiffies); + } else { + out->qualifier = BTS_BRANCH; + out->variant.lbr.from_ip = bts_get(raw, bts_from); + out->variant.lbr.to_ip = bts_get(raw, bts_to); + } +} + +static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { struct bts_struct ret; - int retval; - int bts_end; - int bts_index; - - if (!child->thread.ds_area_msr) - return -ENXIO; + const void *bts_record; + size_t bts_index, bts_end; + int error; - if (index < 0) - return -EINVAL; + error = ds_get_bts_end(child, &bts_end); + if (error < 0) + return error; - bts_end = ds_get_bts_end((void *)child->thread.ds_area_msr); if (bts_end <= index) return -EINVAL; + error = ds_get_bts_index(child, &bts_index); + if (error < 0) + return error; + /* translate the ptrace bts index into the ds bts index */ - bts_index = ds_get_bts_index((void *)child->thread.ds_area_msr); - bts_index -= (index + 1); - if (bts_index < 0) - bts_index += bts_end; + bts_index += bts_end - (index + 1); + if (bts_end <= bts_index) + bts_index -= bts_end; - retval = ds_read_bts((void *)child->thread.ds_area_msr, - bts_index, &ret); - if (retval < 0) - return retval; + error = ds_access_bts(child, bts_index, &bts_record); + if (error < 0) + return error; + + ptrace_bts_translate_record(&ret, bts_record); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; @@ -600,101 +670,106 @@ static int ptrace_bts_read_record(struct task_struct *child, return sizeof(ret); } -static int ptrace_bts_clear(struct task_struct *child) -{ - if (!child->thread.ds_area_msr) - return -ENXIO; - - return ds_clear((void *)child->thread.ds_area_msr); -} - static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { - int end, i; - void *ds = (void *)child->thread.ds_area_msr; - - if (!ds) - return -ENXIO; + struct bts_struct ret; + const unsigned char *raw; + size_t end, i; + int error; - end = ds_get_bts_index(ds); - if (end <= 0) - return end; + error = ds_get_bts_index(child, &end); + if (error < 0) + return error; if (size < (end * sizeof(struct bts_struct))) return -EIO; - for (i = 0; i < end; i++, out++) { - struct bts_struct ret; - int retval; + error = ds_access_bts(child, 0, (const void **)&raw); + if (error < 0) + return error; - retval = ds_read_bts(ds, i, &ret); - if (retval < 0) - return retval; + for (i = 0; i < end; i++, out++, raw += bts_cfg.sizeof_bts) { + ptrace_bts_translate_record(&ret, raw); if (copy_to_user(out, &ret, sizeof(ret))) return -EFAULT; } - ds_clear(ds); + error = ds_clear_bts(child); + if (error < 0) + return error; return end; } +static void ptrace_bts_ovfl(struct task_struct *child) +{ + send_sig(child->thread.bts_ovfl_signal, child, 0); +} + static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { struct ptrace_bts_config cfg; - int bts_size, ret = 0; - void *ds; + int error = 0; + + error = -EOPNOTSUPP; + if (!bts_cfg.sizeof_bts) + goto errout; + error = -EIO; if (cfg_size < sizeof(cfg)) - return -EIO; + goto errout; + error = -EFAULT; if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; + goto errout; - if ((int)cfg.size < 0) - return -EINVAL; + error = -EINVAL; + if ((cfg.flags & PTRACE_BTS_O_SIGNAL) && + !(cfg.flags & PTRACE_BTS_O_ALLOC)) + goto errout; - bts_size = 0; - ds = (void *)child->thread.ds_area_msr; - if (ds) { - bts_size = ds_get_bts_size(ds); - if (bts_size < 0) - return bts_size; - } - cfg.size = PAGE_ALIGN(cfg.size); + if (cfg.flags & PTRACE_BTS_O_ALLOC) { + ds_ovfl_callback_t ovfl = 0; + unsigned int sig = 0; - if (bts_size != cfg.size) { - ret = ptrace_bts_realloc(child, cfg.size, - cfg.flags & PTRACE_BTS_O_CUT_SIZE); - if (ret < 0) + /* we ignore the error in case we were not tracing child */ + (void)ds_release_bts(child); + + if (cfg.flags & PTRACE_BTS_O_SIGNAL) { + if (!cfg.signal) + goto errout; + + sig = cfg.signal; + ovfl = ptrace_bts_ovfl; + } + + error = ds_request_bts(child, /* base = */ 0, cfg.size, ovfl); + if (error < 0) goto errout; - ds = (void *)child->thread.ds_area_msr; + child->thread.bts_ovfl_signal = sig; } - if (cfg.flags & PTRACE_BTS_O_SIGNAL) - ret = ds_set_overflow(ds, DS_O_SIGNAL); - else - ret = ds_set_overflow(ds, DS_O_WRAP); - if (ret < 0) + error = -EINVAL; + if (!child->thread.ds_ctx && cfg.flags) goto errout; if (cfg.flags & PTRACE_BTS_O_TRACE) - child->thread.debugctlmsr |= ds_debugctl_mask(); + child->thread.debugctlmsr |= bts_cfg.debugctl_mask; else - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; if (cfg.flags & PTRACE_BTS_O_SCHED) set_tsk_thread_flag(child, TIF_BTS_TRACE_TS); else clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - ret = sizeof(cfg); + error = sizeof(cfg); out: if (child->thread.debugctlmsr) @@ -702,10 +777,10 @@ out: else clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - return ret; + return error; errout: - child->thread.debugctlmsr &= ~ds_debugctl_mask(); + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); goto out; } @@ -714,29 +789,40 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { - void *ds = (void *)child->thread.ds_area_msr; struct ptrace_bts_config cfg; + size_t end; + const void *base, *max; + int error; if (cfg_size < sizeof(cfg)) return -EIO; - memset(&cfg, 0, sizeof(cfg)); + error = ds_get_bts_end(child, &end); + if (error < 0) + return error; - if (ds) { - cfg.size = ds_get_bts_size(ds); + error = ds_access_bts(child, /* index = */ 0, &base); + if (error < 0) + return error; - if (ds_get_overflow(ds) == DS_O_SIGNAL) - cfg.flags |= PTRACE_BTS_O_SIGNAL; + error = ds_access_bts(child, /* index = */ end, &max); + if (error < 0) + return error; - if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && - child->thread.debugctlmsr & ds_debugctl_mask()) - cfg.flags |= PTRACE_BTS_O_TRACE; + memset(&cfg, 0, sizeof(cfg)); + cfg.size = (max - base); + cfg.signal = child->thread.bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); - if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) - cfg.flags |= PTRACE_BTS_O_SCHED; - } + if (cfg.signal) + cfg.flags |= PTRACE_BTS_O_SIGNAL; - cfg.bts_size = sizeof(struct bts_struct); + if (test_tsk_thread_flag(child, TIF_DEBUGCTLMSR) && + child->thread.debugctlmsr & bts_cfg.debugctl_mask) + cfg.flags |= PTRACE_BTS_O_TRACE; + + if (test_tsk_thread_flag(child, TIF_BTS_TRACE_TS)) + cfg.flags |= PTRACE_BTS_O_SCHED; if (copy_to_user(ucfg, &cfg, sizeof(cfg))) return -EFAULT; @@ -744,89 +830,38 @@ static int ptrace_bts_status(struct task_struct *child, return sizeof(cfg); } - static int ptrace_bts_write_record(struct task_struct *child, const struct bts_struct *in) { - int retval; + unsigned char bts_record[BTS_MAX_RECORD_SIZE]; - if (!child->thread.ds_area_msr) - return -ENXIO; + BUG_ON(BTS_MAX_RECORD_SIZE < bts_cfg.sizeof_bts); - retval = ds_write_bts((void *)child->thread.ds_area_msr, in); - if (retval) - return retval; + memset(bts_record, 0, bts_cfg.sizeof_bts); + switch (in->qualifier) { + case BTS_INVALID: + break; - return sizeof(*in); -} + case BTS_BRANCH: + bts_set(bts_record, bts_from, in->variant.lbr.from_ip); + bts_set(bts_record, bts_to, in->variant.lbr.to_ip); + break; -static int ptrace_bts_realloc(struct task_struct *child, - int size, int reduce_size) -{ - unsigned long rlim, vm; - int ret, old_size; + case BTS_TASK_ARRIVES: + case BTS_TASK_DEPARTS: + bts_set(bts_record, bts_from, bts_escape); + bts_set(bts_record, bts_qual, in->qualifier); + bts_set(bts_record, bts_jiffies, in->variant.jiffies); + break; - if (size < 0) + default: return -EINVAL; - - old_size = ds_get_bts_size((void *)child->thread.ds_area_msr); - if (old_size < 0) - return old_size; - - ret = ds_free((void **)&child->thread.ds_area_msr); - if (ret < 0) - goto out; - - size >>= PAGE_SHIFT; - old_size >>= PAGE_SHIFT; - - current->mm->total_vm -= old_size; - current->mm->locked_vm -= old_size; - - if (size == 0) - goto out; - - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->total_vm; - if (size <= 0) - goto out; } - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + size; - if (rlim < vm) { - ret = -ENOMEM; - - if (!reduce_size) - goto out; - - size = rlim - current->mm->locked_vm; - if (size <= 0) - goto out; - } - - ret = ds_allocate((void **)&child->thread.ds_area_msr, - size << PAGE_SHIFT); - if (ret < 0) - goto out; - - current->mm->total_vm += size; - current->mm->locked_vm += size; - -out: - if (child->thread.ds_area_msr) - set_tsk_thread_flag(child, TIF_DS_AREA_MSR); - else - clear_tsk_thread_flag(child, TIF_DS_AREA_MSR); - - return ret; + /* The writing task will be the switched-to task on a context + * switch. It needs to write into the switched-from task's BTS + * buffer. */ + return ds_unchecked_write_bts(child, bts_record, bts_cfg.sizeof_bts); } void ptrace_bts_take_timestamp(struct task_struct *tsk, @@ -839,7 +874,66 @@ void ptrace_bts_take_timestamp(struct task_struct *tsk, ptrace_bts_write_record(tsk, &rec); } -#endif /* X86_BTS */ + +static const struct bts_configuration bts_cfg_netburst = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<2)|(1<<3)|(1<<5) +}; + +static const struct bts_configuration bts_cfg_pentium_m = { + .sizeof_bts = sizeof(long) * 3, + .sizeof_field = sizeof(long), + .debugctl_mask = (1<<6)|(1<<7) +}; + +static const struct bts_configuration bts_cfg_core2 = { + .sizeof_bts = 8 * 3, + .sizeof_field = 8, + .debugctl_mask = (1<<6)|(1<<7)|(1<<9) +}; + +static inline void bts_configure(const struct bts_configuration *cfg) +{ + bts_cfg = *cfg; +} + +void __cpuinit ptrace_bts_init_intel(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 0x6: + switch (c->x86_model) { + case 0xD: + case 0xE: /* Pentium M */ + bts_configure(&bts_cfg_pentium_m); + break; + case 0xF: /* Core2 */ + case 0x1C: /* Atom */ + bts_configure(&bts_cfg_core2); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + case 0xF: + switch (c->x86_model) { + case 0x0: + case 0x1: + case 0x2: /* Netburst */ + bts_configure(&bts_cfg_netburst); + break; + default: + /* sorry, don't know about them */ + break; + } + break; + default: + /* sorry, don't know about them */ + break; + } +} +#endif /* CONFIG_X86_PTRACE_BTS */ /* * Called by kernel/ptrace.c when detaching.. @@ -852,15 +946,15 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - if (child->thread.ds_area_msr) { -#ifdef X86_BTS - ptrace_bts_realloc(child, 0, 0); -#endif - child->thread.debugctlmsr &= ~ds_debugctl_mask(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); - } +#ifdef CONFIG_X86_PTRACE_BTS + (void)ds_release_bts(child); + + child->thread.debugctlmsr &= ~bts_cfg.debugctl_mask; + if (!child->thread.debugctlmsr) + clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + + clear_tsk_thread_flag(child, TIF_BTS_TRACE_TS); +#endif /* CONFIG_X86_PTRACE_BTS */ } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -980,7 +1074,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* * These bits need more cooking - not enabled yet: */ -#ifdef X86_BTS +#ifdef CONFIG_X86_PTRACE_BTS case PTRACE_BTS_CONFIG: ret = ptrace_bts_config (child, data, (struct ptrace_bts_config __user *)addr); @@ -992,7 +1086,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ptrace_bts_get_size(child); + ret = ds_get_bts_index(child, /* pos = */ 0); break; case PTRACE_BTS_GET: @@ -1001,14 +1095,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); + ret = ds_clear_bts(child); break; case PTRACE_BTS_DRAIN: ret = ptrace_bts_drain (child, data, (struct bts_struct __user *) addr); break; -#endif +#endif /* CONFIG_X86_PTRACE_BTS */ default: ret = ptrace_request(child, request, addr, data); diff --git a/arch/x86/kernel/setup_64.c b/arch/x86/kernel/setup_64.c index f2fc8feb727d..f7aebe13c999 100644 --- a/arch/x86/kernel/setup_64.c +++ b/arch/x86/kernel/setup_64.c @@ -920,11 +920,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); } if (cpu_has_bts) - ds_init_intel(c); + ptrace_bts_init_intel(c); n = c->extended_cpuid_level; if (n >= 0x80000008) { -- cgit v1.2.2 From 970e725098a6da5a9c1f8128102c812e31a0444c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 16 Apr 2008 16:40:17 -0700 Subject: x86, ptrace: PEBS support, warning fix arch/x86/kernel/process_32.c:566: warning: unused variable 'ds_next' arch/x86/kernel/process_32.c:566: warning: unused variable 'ds_prev' Cc: Markus Metzger Cc: Andi Kleen Cc: "H. Peter Anvin" Cc: "Siddha, Suresh B" Cc: Roland McGrath Cc: Michael Kerrisk Cc: Cc: stephane eranian Cc: Jason Wessel Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 5cec8a75a4e8..496ea3110aa2 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -484,20 +484,13 @@ int set_tsc_mode(unsigned int val) return 0; } -static noinline void -__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss) +#ifdef CONFIG_X86_DS +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) { - struct thread_struct *prev, *next; - unsigned long debugctl; - unsigned long ds_prev = 0, ds_next = 0; + unsigned long ds_prev = 0; + unsigned long ds_next = 0; - prev = &prev_p->thread; - next = &next_p->thread; - - debugctl = prev->debugctlmsr; - -#ifdef CONFIG_X86_DS if (prev->ds_ctx) ds_prev = (unsigned long)prev->ds_ctx->ds; if (next->ds_ctx) @@ -510,8 +503,28 @@ __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, update_debugctlmsr(0); wrmsr(MSR_IA32_DS_AREA, ds_next, 0); } + return debugctl; +} +#else +static int update_debugctl(struct thread_struct *prev, + struct thread_struct *next, unsigned long debugctl) +{ + return debugctl; +} #endif /* CONFIG_X86_DS */ +static noinline void +__switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss) +{ + struct thread_struct *prev, *next; + unsigned long debugctl; + + prev = &prev_p->thread; + next = &next_p->thread; + + debugctl = update_debugctl(prev, next, prev->debugctlmsr); + if (next->debugctlmsr != debugctl) update_debugctlmsr(next->debugctlmsr); -- cgit v1.2.2 From 573da4224e8c3800e613d715e909c3179a7e3cb2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 28 Apr 2008 23:15:04 +0400 Subject: x86: DS cleanup - dont treat 0 as NULL Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ds.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5b32b6d062b4..24a323c95997 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -238,12 +238,12 @@ static inline struct ds_context *ds_alloc_context(struct task_struct *task) context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) - return 0; + return NULL; context->ds = kzalloc(ds_cfg.sizeof_ds, GFP_KERNEL); if (!context->ds) { kfree(context); - return 0; + return NULL; } *p_context = context; @@ -279,7 +279,7 @@ static inline void ds_put_context(struct ds_context *context) if (--context->count) goto out; - *(context->this) = 0; + *(context->this) = NULL; if (context->task) clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); @@ -341,16 +341,16 @@ static inline void *ds_allocate_buffer(size_t size, unsigned int *pages) rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; vm = current->mm->total_vm + pgsz; if (rlim < vm) - return 0; + return NULL; rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; vm = current->mm->locked_vm + pgsz; if (rlim < vm) - return 0; + return NULL; buffer = kzalloc(size, GFP_KERNEL); if (!buffer) - return 0; + return NULL; current->mm->total_vm += pgsz; current->mm->locked_vm += pgsz; @@ -395,7 +395,7 @@ static int ds_request(struct task_struct *task, void *base, size_t size, if (context->owner[qual] == current) goto out_unlock; error = -EPERM; - if (context->owner[qual] != 0) + if (context->owner[qual] != NULL) goto out_unlock; context->owner[qual] = current; @@ -445,7 +445,7 @@ static int ds_request(struct task_struct *task, void *base, size_t size, return error; out_release: - context->owner[qual] = 0; + context->owner[qual] = NULL; ds_put_context(context); return error; @@ -825,7 +825,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_var); break; case 0xF: /* Core2 */ - case 0x1C: /* Atom */ + case 0x1C: /* Atom */ ds_configure(&ds_cfg_64); break; default: -- cgit v1.2.2 From 34b2cd5b688b012975fcfc3b3970fc3508fa82c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 17 May 2008 08:30:07 +0200 Subject: x86: PEBS cleanup Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ad213494a22f..4a93c98a60a2 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -500,7 +500,6 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, { struct thread_struct *prev, *next; unsigned long debugctl; - unsigned long ds_prev = 0, ds_next = 0; prev = &prev_p->thread, next = &next_p->thread; @@ -508,17 +507,23 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, debugctl = prev->debugctlmsr; #ifdef CONFIG_X86_DS - if (prev->ds_ctx) - ds_prev = (unsigned long)prev->ds_ctx->ds; - if (next->ds_ctx) - ds_next = (unsigned long)next->ds_ctx->ds; - - if (ds_next != ds_prev) { - /* we clear debugctl to make sure DS - * is not in use when we change it */ - debugctl = 0; - update_debugctlmsr(0); - wrmsrl(MSR_IA32_DS_AREA, ds_next); + { + unsigned long ds_prev = 0, ds_next = 0; + + if (prev->ds_ctx) + ds_prev = (unsigned long)prev->ds_ctx->ds; + if (next->ds_ctx) + ds_next = (unsigned long)next->ds_ctx->ds; + + if (ds_next != ds_prev) { + /* + * We clear debugctl to make sure DS + * is not in use when we change it: + */ + debugctl = 0; + update_debugctlmsr(0); + wrmsrl(MSR_IA32_DS_AREA, ds_next); + } } #endif /* CONFIG_X86_DS */ -- cgit v1.2.2 From 48adcf148c83faa41999fb0b3524299c4e160fd9 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 20 May 2008 01:03:16 +0300 Subject: [CPUFREQ] cpufreq: remove CVS keywords This patch removes CVS keywords that weren't updated for a long time from comments. Signed-off-by: Adrian Bunk Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k7.h | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h index f8a63b3664e3..35fb4eaf6e1c 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h @@ -1,5 +1,4 @@ /* - * $Id: powernow-k7.h,v 1.2 2003/02/10 18:26:01 davej Exp $ * (C) 2003 Dave Jones. * * Licensed under the terms of the GNU GPL License version 2. -- cgit v1.2.2 From 334ef7a7ab8f80b689a2be95d5e62d2167900865 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Mon, 12 May 2008 21:21:13 +0200 Subject: x86: use performance variant for_each_cpu_mask_nr Change references from for_each_cpu_mask to for_each_cpu_mask_nr where appropriate Reviewed-by: Paul Jackson Reviewed-by: Christoph Lameter Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner commit 2d474871e2fb092eb46a0930aba5442e10eb96cc Author: Mike Travis Date: Mon May 12 21:21:13 2008 +0200 --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 6 +++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 8 ++++---- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 10 +++++----- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 4 ++-- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 4 ++-- arch/x86/kernel/io_apic_64.c | 8 ++++---- arch/x86/kernel/smpboot.c | 8 ++++---- arch/x86/xen/smp.c | 4 ++-- 10 files changed, 30 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index b0c8208df9fa..dd097b835839 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -202,7 +202,7 @@ static void drv_write(struct drv_cmd *cmd) cpumask_t saved_mask = current->cpus_allowed; unsigned int i; - for_each_cpu_mask(i, cmd->mask) { + for_each_cpu_mask_nr(i, cmd->mask) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -451,7 +451,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -466,7 +466,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu_mask(i, cmd.mask) { + for_each_cpu_mask_nr(i, cmd.mask) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 199e4e05e5dc..f1685fb91fbd 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -122,7 +122,7 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, return 0; /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -130,11 +130,11 @@ static int cpufreq_p4_target(struct cpufreq_policy *policy, /* run on each logical CPU, see section 13.15.3 of IA32 Intel Architecture Software * Developer's Manual, Volume 3 */ - for_each_cpu_mask(i, policy->cpus) + for_each_cpu_mask_nr(i, policy->cpus) cpufreq_p4_setdc(i, p4clockmod_table[newstate].index); /* notifiers */ - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 46d4034d9f37..06d6eea5e07a 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -966,7 +966,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_fid(data->currfid); freqs.new = find_khz_freq_from_fid(fid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -974,7 +974,7 @@ static int transition_frequency_fidvid(struct powernow_k8_data *data, unsigned i res = transition_fid_vid(data, fid, vid); freqs.new = find_khz_freq_from_fid(data->currfid); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -997,7 +997,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i freqs.old = find_khz_freq_from_pstate(data->powernow_table, data->currpstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -1005,7 +1005,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i res = transition_pstate(data, pstate); freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate); - for_each_cpu_mask(i, *(data->available_cores)) { + for_each_cpu_mask_nr(i, *(data->available_cores)) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 908dd347c67e..8b0dd6f2a1ac 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -476,7 +476,7 @@ static int centrino_target (struct cpufreq_policy *policy, saved_mask = current->cpus_allowed; first_cpu = 1; cpus_clear(covered_cpus); - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq @@ -517,7 +517,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -540,7 +540,7 @@ static int centrino_target (struct cpufreq_policy *policy, preempt_enable(); } - for_each_cpu_mask(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -554,7 +554,7 @@ static int centrino_target (struct cpufreq_policy *policy, */ if (!cpus_empty(covered_cpus)) { - for_each_cpu_mask(j, covered_cpus) { + for_each_cpu_mask_nr(j, covered_cpus) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -564,7 +564,7 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 1b50244b1fdf..191f7263c61d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -279,7 +279,7 @@ static int speedstep_target (struct cpufreq_policy *policy, cpus_allowed = current->cpus_allowed; - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -292,7 +292,7 @@ static int speedstep_target (struct cpufreq_policy *policy, /* allow to be run on all CPUs */ set_cpus_allowed_ptr(current, &cpus_allowed); - for_each_cpu_mask(i, policy->cpus) { + for_each_cpu_mask_nr(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 26d615dcb149..bfade3301c3a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -488,7 +488,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) int sibling; this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { + for_each_cpu_mask_nr(sibling, this_leaf->shared_cpu_map) { sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 7c9a813e1193..88736cadbaa6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -527,7 +527,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (err) goto out_free; - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; @@ -617,7 +617,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #endif /* remove all sibling symlinks before unregistering */ - for_each_cpu_mask(i, b->cpus) { + for_each_cpu_mask_nr(i, b->cpus) { if (i == cpu) continue; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index ef1a8dfcc529..e2838cbd2ff8 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -718,7 +718,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) return 0; } - for_each_cpu_mask(cpu, mask) { + for_each_cpu_mask_nr(cpu, mask) { cpumask_t domain, new_mask; int new_cpu; int vector, offset; @@ -739,7 +739,7 @@ next: continue; if (vector == IA32_SYSCALL_VECTOR) goto next; - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; /* Found one! */ @@ -749,7 +749,7 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask(new_cpu, new_mask) + for_each_cpu_mask_nr(new_cpu, new_mask) per_cpu(vector_irq, new_cpu)[vector] = irq; cfg->vector = vector; cfg->domain = domain; @@ -781,7 +781,7 @@ static void __clear_irq_vector(int irq) vector = cfg->vector; cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) per_cpu(vector_irq, cpu)[vector] = -1; cfg->vector = 0; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 38988491c622..fff8ebaa554f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -487,7 +487,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) cpu_set(cpu, cpu_sibling_setup_map); if (smp_num_siblings > 1) { - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (c->phys_proc_id == cpu_data(i).phys_proc_id && c->cpu_core_id == cpu_data(i).cpu_core_id) { cpu_set(i, per_cpu(cpu_sibling_map, cpu)); @@ -510,7 +510,7 @@ void __cpuinit set_cpu_sibling_map(int cpu) return; } - for_each_cpu_mask(i, cpu_sibling_setup_map) { + for_each_cpu_mask_nr(i, cpu_sibling_setup_map) { if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { cpu_set(i, c->llc_shared_map); @@ -1298,7 +1298,7 @@ static void remove_siblinginfo(int cpu) int sibling; struct cpuinfo_x86 *c = &cpu_data(cpu); - for_each_cpu_mask(sibling, per_cpu(cpu_core_map, cpu)) { + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); /*/ * last thread sibling in this cpu core going down @@ -1307,7 +1307,7 @@ static void remove_siblinginfo(int cpu) cpu_data(sibling).booted_cores--; } - for_each_cpu_mask(sibling, per_cpu(cpu_sibling_map, cpu)) + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); cpus_clear(per_cpu(cpu_sibling_map, cpu)); cpus_clear(per_cpu(cpu_core_map, cpu)); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 94e69000f982..7a70638797ed 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -345,7 +345,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) cpus_and(mask, mask, cpu_online_map); - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) xen_send_IPI_one(cpu, vector); } @@ -413,7 +413,7 @@ int xen_smp_call_function_mask(cpumask_t mask, void (*func)(void *), /* Make sure other vcpus get a chance to run if they need to. */ yield = false; - for_each_cpu_mask(cpu, mask) + for_each_cpu_mask_nr(cpu, mask) if (xen_vcpu_stolen(cpu)) yield = true; -- cgit v1.2.2 From 63cc8c75156462d4b42cbdd76c293b7eee7ddbfe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 12 May 2008 15:44:40 +0200 Subject: percpu: introduce DEFINE_PER_CPU_PAGE_ALIGNED() macro While examining holes in percpu section I found this : c05f5000 D per_cpu__current_task c05f5000 D __per_cpu_start c05f5004 D per_cpu__cpu_number c05f5008 D per_cpu__irq_regs c05f500c d per_cpu__cpu_devices c05f5040 D per_cpu__cyc2ns c05f6000 d per_cpu__cpuid4_info c05f6004 d per_cpu__cache_kobject c05f6008 d per_cpu__index_kobject c05f7000 D per_cpu__gdt_page This is because gdt_page is a percpu variable, defined with a page alignement, and linker is doing its job, two times because of .o nesting in the build process. I introduced a new macro DEFINE_PER_CPU_PAGE_ALIGNED() to avoid wasting this space. All page aligned variables (only one at this time) are put in a separate subsection .data.percpu.page_aligned, at the very begining of percpu zone. Before patch , on a x86_32 machine : .data.percpu 30232 3227471872 .data.percpu 22168 3227471872 Thats 8064 bytes saved for each CPU. Signed-off-by: Eric Dumazet Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/vmlinux_32.lds.S | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d0463a946247..b2f54fafb8bc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -21,7 +21,7 @@ #include "cpu.h" -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index ce5ed083a1e9..0f7c29a8c4ea 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -189,6 +189,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { __per_cpu_start = .; + *(.data.percpu.page_aligned) *(.data.percpu) *(.data.percpu.shared_aligned) __per_cpu_end = .; -- cgit v1.2.2 From 59ea746337c69f6a5f1bc4d5e8544b3cbf12f801 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: MM: virtual address debug Add some (configurable) expensive sanity checking to catch wrong address translations on x86. - create linux/mmdebug.h file to be able include this file in asm headers to not get unsolvable loops in header files - __phys_addr on x86_32 became a function in ioremap.c since PAGE_OFFSET, is_vmalloc_addr and VMALLOC_* non-constasts are undefined if declared in page_32.h - add __phys_addr_const for initializing doublefault_tss.__cr3 Tested on 386, 386pae, x86_64 and x86_64 numa=fake=2. Contains Andi's enable numa virtual address debug patch. Signed-off-by: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 2 +- arch/x86/mm/ioremap.c | 31 ++++++++++++++++++++++++------- 2 files changed, 25 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index a47798b59f07..395acb12b0d1 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __pa(swapper_pg_dir) + .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) } }; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 2b2bb3f9b683..a78ffef62a2b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -23,18 +23,26 @@ #ifdef CONFIG_X86_64 -unsigned long __phys_addr(unsigned long x) +static inline int phys_addr_valid(unsigned long addr) { - if (x >= __START_KERNEL_map) - return x - __START_KERNEL_map + phys_base; - return x - PAGE_OFFSET; + return addr < (1UL << boot_cpu_data.x86_phys_bits); } -EXPORT_SYMBOL(__phys_addr); -static inline int phys_addr_valid(unsigned long addr) +unsigned long __phys_addr(unsigned long x) { - return addr < (1UL << boot_cpu_data.x86_phys_bits); + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); + x += phys_base; + } else { + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + x -= PAGE_OFFSET; + VIRTUAL_BUG_ON(system_state == SYSTEM_BOOTING ? x > MAXMEM : + !phys_addr_valid(x)); + } + return x; } +EXPORT_SYMBOL(__phys_addr); #else @@ -43,6 +51,15 @@ static inline int phys_addr_valid(unsigned long addr) return 1; } +unsigned long __phys_addr(unsigned long x) +{ + /* VMALLOC_* aren't constants; not available at the boot time */ + VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *)x))); + return x - PAGE_OFFSET; +} +EXPORT_SYMBOL(__phys_addr); + #endif int page_is_ram(unsigned long pagenr) -- cgit v1.2.2 From a1bf9631be7332ce0641e299ddafad2d8223100f Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 12 Jun 2008 13:56:40 +0200 Subject: x86, MM: virtual address debug, v2 I've removed the test from phys_to_nid and made a function from __phys_addr only when the debugging is enabled (on x86_32). Signed-off-by: Jiri Slaby Cc: tglx@linutronix.de Cc: hpa@zytor.com Cc: Mike Travis Cc: Nick Piggin Cc: Cc: linux-mm@kvack.org Cc: Jiri Slaby Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index a78ffef62a2b..9dd3cb905971 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -51,6 +51,7 @@ static inline int phys_addr_valid(unsigned long addr) return 1; } +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { /* VMALLOC_* aren't constants; not available at the boot time */ @@ -59,6 +60,7 @@ unsigned long __phys_addr(unsigned long x) return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); +#endif #endif -- cgit v1.2.2 From 5e5a29bf2624a5984e1c36c3a2481ee91249ec9c Mon Sep 17 00:00:00 2001 From: Anders H Kaseorg Date: Sat, 28 Jun 2008 18:25:41 -0400 Subject: x86, 64-bit: patch paravirt inline replacements when loading modules small speedup. Paravirt replacements were added to the i386 module loader by commit 139ec7c416248b9ea227d21839235344edfee1e0. This adds the same code to the x86_64 module loader. Signed-off-by: Anders Kaseorg Acked-by: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/kernel/module_64.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index a888e67f5874..0e867676b5a5 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -150,7 +150,8 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL; + const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, + *para = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -160,6 +161,8 @@ int module_finalize(const Elf_Ehdr *hdr, alt = s; if (!strcmp(".smp_locks", secstrings + s->sh_name)) locks= s; + if (!strcmp(".parainstructions", secstrings + s->sh_name)) + para = s; } if (alt) { @@ -175,6 +178,11 @@ int module_finalize(const Elf_Ehdr *hdr, tseg, tseg + text->sh_size); } + if (para) { + void *pseg = (void *)para->sh_addr; + apply_paravirt(pseg, pseg + para->sh_size); + } + return module_bug_finalize(hdr, sechdrs, me); } -- cgit v1.2.2 From 323ec001c6bb98eeabb5abbdbb8c8055d9496554 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 29 Jun 2008 14:19:31 +0400 Subject: x86: use generic per-device dma coherent allocator Signed-off-by: Dmitry Baryshkov Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/pci-dma.c | 122 +--------------------------------------------- 2 files changed, 3 insertions(+), 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e0edaaa6920a..9d4714e0020c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_KRETPROBES select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER + select HAVE_GENERIC_DMA_COHERENT if X86_32 config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dc00a1331ace..b75c81a8d3e6 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -197,124 +197,6 @@ static __init int iommu_setup(char *p) } early_param("iommu", iommu_setup); -#ifdef CONFIG_X86_32 -int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr, - dma_addr_t device_addr, size_t size, int flags) -{ - void __iomem *mem_base = NULL; - int pages = size >> PAGE_SHIFT; - int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); - - if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) - goto out; - if (!size) - goto out; - if (dev->dma_mem) - goto out; - - /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */ - - mem_base = ioremap(bus_addr, size); - if (!mem_base) - goto out; - - dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL); - if (!dev->dma_mem) - goto out; - dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL); - if (!dev->dma_mem->bitmap) - goto free1_out; - - dev->dma_mem->virt_base = mem_base; - dev->dma_mem->device_base = device_addr; - dev->dma_mem->size = pages; - dev->dma_mem->flags = flags; - - if (flags & DMA_MEMORY_MAP) - return DMA_MEMORY_MAP; - - return DMA_MEMORY_IO; - - free1_out: - kfree(dev->dma_mem); - out: - if (mem_base) - iounmap(mem_base); - return 0; -} -EXPORT_SYMBOL(dma_declare_coherent_memory); - -void dma_release_declared_memory(struct device *dev) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - - if (!mem) - return; - dev->dma_mem = NULL; - iounmap(mem->virt_base); - kfree(mem->bitmap); - kfree(mem); -} -EXPORT_SYMBOL(dma_release_declared_memory); - -void *dma_mark_declared_memory_occupied(struct device *dev, - dma_addr_t device_addr, size_t size) -{ - struct dma_coherent_mem *mem = dev->dma_mem; - int pos, err; - int pages = (size + (device_addr & ~PAGE_MASK) + PAGE_SIZE - 1); - - pages >>= PAGE_SHIFT; - - if (!mem) - return ERR_PTR(-EINVAL); - - pos = (device_addr - mem->device_base) >> PAGE_SHIFT; - err = bitmap_allocate_region(mem->bitmap, pos, get_order(pages)); - if (err != 0) - return ERR_PTR(err); - return mem->virt_base + (pos << PAGE_SHIFT); -} -EXPORT_SYMBOL(dma_mark_declared_memory_occupied); - -static int dma_alloc_from_coherent_mem(struct device *dev, ssize_t size, - dma_addr_t *dma_handle, void **ret) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - int order = get_order(size); - - if (mem) { - int page = bitmap_find_free_region(mem->bitmap, mem->size, - order); - if (page >= 0) { - *dma_handle = mem->device_base + (page << PAGE_SHIFT); - *ret = mem->virt_base + (page << PAGE_SHIFT); - memset(*ret, 0, size); - } - if (mem->flags & DMA_MEMORY_EXCLUSIVE) - *ret = NULL; - } - return (mem != NULL); -} - -static int dma_release_coherent(struct device *dev, int order, void *vaddr) -{ - struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL; - - if (mem && vaddr >= mem->virt_base && vaddr < - (mem->virt_base + (mem->size << PAGE_SHIFT))) { - int page = (vaddr - mem->virt_base) >> PAGE_SHIFT; - - bitmap_release_region(mem->bitmap, page, order); - return 1; - } - return 0; -} -#else -#define dma_alloc_from_coherent_mem(dev, size, handle, ret) (0) -#define dma_release_coherent(dev, order, vaddr) (0) -#endif /* CONFIG_X86_32 */ - int dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI @@ -383,7 +265,7 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* ignore region specifiers */ gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); - if (dma_alloc_from_coherent_mem(dev, size, dma_handle, &memory)) + if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; if (!dev) { @@ -486,7 +368,7 @@ void dma_free_coherent(struct device *dev, size_t size, { int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_coherent(dev, order, vaddr)) + if (dma_release_from_coherent(dev, order, vaddr)) return; if (dma_ops->unmap_single) dma_ops->unmap_single(dev, bus, size, 0); -- cgit v1.2.2 From 299a140dacaa514be5e567b5851c187c42ec38c4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 8 Jul 2008 14:47:16 +0200 Subject: x86, AMD IOMMU: ignore detection of GART IOMMU One of the last IOMMU updates covered a bug in the AMD IOMMU code. The early detection code does not succeed if the GART is already detected. This patch fixes this. Cc: Robert Richter Cc: Bhavna Sarathy Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: Joerg Roedel Cc: Robert Richter Cc: Bhavna Sarathy Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437d..bb0280077a32 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -828,7 +828,7 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table) void __init amd_iommu_detect(void) { - if (swiotlb || no_iommu || iommu_detected) + if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) return; if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { -- cgit v1.2.2 From ab6bc3e343fbe3be4a0f67225e849d0db6b4b7ac Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:36 +0400 Subject: x86: idle process - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ba370dc8685b..58325a6604a4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -164,6 +164,9 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) static int __init idle_setup(char *str) { + if (!str) + return -EINVAL; + if (!strcmp(str, "poll")) { printk("using polling idle threads.\n"); pm_idle = poll_idle; -- cgit v1.2.2 From d6cd7effcc5e0047faf15ab0a54c980f1a616a07 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:37 +0400 Subject: x86: io delay - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_delay.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 5921e5f0a640..1c3a66a67f83 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -103,6 +103,9 @@ void __init io_delay_init(void) static int __init io_delay_param(char *s) { + if (!s) + return -EINVAL; + if (!strcmp(s, "0x80")) io_delay_type = CONFIG_IO_DELAY_TYPE_0X80; else if (!strcmp(s, "0xed")) -- cgit v1.2.2 From 4d8cc874d7ed43eda72765e9c0e141e170fee4f3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 5 Jul 2008 15:53:38 +0400 Subject: x86: smpboot maxcpus - add checking for NULL early param Signed-off-by: Cyrill Gorcunov Cc: akpm@linux-foundation.org Cc: andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fd933b5465b6..e47bfac70c38 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1452,7 +1452,8 @@ static int __init parse_maxcpus(char *arg) { extern unsigned int maxcpus; - maxcpus = simple_strtoul(arg, NULL, 0); + if (arg) + maxcpus = simple_strtoul(arg, NULL, 0); return 0; } early_param("maxcpus", parse_maxcpus); -- cgit v1.2.2 From 46a7fa270afbe5fddc6042a598cfe22977b0e989 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:42 +0900 Subject: x86: make only GART code include gart.h gart.h has only GART-specific stuff. Only GART code needs it. Other IOMMU stuff should include iommu.h instead of gart.h. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 2 +- arch/x86/kernel/aperture_64.c | 1 + arch/x86/kernel/early-quirks.c | 5 +---- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 2 +- arch/x86/kernel/pci-gart_64.c | 1 + arch/x86/kernel/pci-nommu.c | 2 +- arch/x86/kernel/pci-swiotlb_64.c | 2 +- arch/x86/kernel/setup.c | 2 +- 10 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a0..cf2f74bcde53 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2a13e430437d..66438284c699 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include /* * definitions for the ACPI scanning code diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 9f907806c1a5..44e21826db11 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index a4665f37cfc5..510b8e367732 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -16,10 +16,7 @@ #include #include #include - -#ifdef CONFIG_GART_IOMMU -#include -#endif +#include static void __init fix_hypertransport_config(int num, int slot, int func) { diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 6959b5c45df4..151f2d171f7c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f1..f581a4b63b43 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index d0d18db5d2a4..949ca985deb0 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index aec43d56f49c..792b9179eff3 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 82299cd1d04d..20df839b9c20 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include #include diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 86fc2d624270..e5d208934bfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -96,7 +96,7 @@ #include #include #include -#include +#include #include #include -- cgit v1.2.2 From ac7ded2adb2e43152fe7385ddd53bf45f5c92285 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:43 +0900 Subject: x86: remove ifdef CONFIG_GART_IOMMU in pci-dma.c Our way to handle gart_* functions for CONFIG_GART_IOMMU and !CONFIG_GART_IOMMU cases is inconsistent. We have some dummy gart_* functions in !CONFIG_GART_IOMMU case and also use ifdef CONFIG_GART_IOMMU tricks in pci-dma.c to call some gart_* functions in only CONFIG_GART_IOMMU case. This patch removes ifdef CONFIG_GART_IOMMU in pci-dma.c and always use dummy gart_* functions in iommu.h. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f581a4b63b43..dd57c5bbe2da 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -114,9 +114,7 @@ void __init pci_iommu_alloc(void) * The order of these functions is important for * fall-back/fail-over reasons */ -#ifdef CONFIG_GART_IOMMU gart_iommu_hole_init(); -#endif #ifdef CONFIG_CALGARY_IOMMU detect_calgary(); @@ -184,9 +182,7 @@ static __init int iommu_setup(char *p) swiotlb = 1; #endif -#ifdef CONFIG_GART_IOMMU gart_parse_options(p); -#endif #ifdef CONFIG_CALGARY_IOMMU if (!strncmp(p, "calgary", 7)) @@ -508,9 +504,7 @@ static int __init pci_iommu_init(void) amd_iommu_init(); -#ifdef CONFIG_GART_IOMMU gart_iommu_init(); -#endif no_iommu_init(); return 0; -- cgit v1.2.2 From b8b48326f312026af12799917383c54c25d05482 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:44 +0900 Subject: x86: remove ifdef CONFIG_CALGARY_IOMMU in pci-dma.c asm-x86/calgary.h has dummy calgary_iommu_init() and detect_calgary() in !CONFIG_CALGARY_IOMMU case. So we don't need ifdef CONFIG_CALGARY_IOMMU in pci-dma.c. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Alexis Bruemmer Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index dd57c5bbe2da..f16cbbe424a1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -116,9 +116,7 @@ void __init pci_iommu_alloc(void) */ gart_iommu_hole_init(); -#ifdef CONFIG_CALGARY_IOMMU detect_calgary(); -#endif detect_intel_iommu(); @@ -496,9 +494,7 @@ EXPORT_SYMBOL(dma_free_coherent); static int __init pci_iommu_init(void) { -#ifdef CONFIG_CALGARY_IOMMU calgary_iommu_init(); -#endif intel_iommu_init(); -- cgit v1.2.2 From be54f9d1c8df93c4998e134a306652caaa58f67f Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 11 Jul 2008 10:23:45 +0900 Subject: x86: remove ifdef CONFIG_SWIOTLB in pci-dma.c As other IOMMUs do, this puts dummy pci_swiotlb_init() in swiotlb.h and remove ifdef CONFIG_SWIOTLB in pci-dma.c. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f16cbbe424a1..d12945de0565 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -122,9 +122,7 @@ void __init pci_iommu_alloc(void) amd_iommu_detect(); -#ifdef CONFIG_SWIOTLB pci_swiotlb_init(); -#endif } #endif -- cgit v1.2.2 From b65233a9c1da587bf19ee161982f4f0ec59941c0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:21 +0200 Subject: x86, AMD IOMMU: add comments to the initialization code This patch adds some comments to the AMD IOMMU initialization code to increase its readability. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 214 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 206 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index bb0280077a32..9ddb46d7c524 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -71,6 +71,17 @@ #define ACPI_DEVFLAG_LINT1 0x80 #define ACPI_DEVFLAG_ATSDIS 0x10000000 +/* + * ACPI table definitions + * + * These data structures are laid over the table to parse the important values + * out of it. + */ + +/* + * structure describing one IOMMU in the ACPI table. Typically followed by one + * or more ivhd_entrys. + */ struct ivhd_header { u8 type; u8 flags; @@ -83,6 +94,10 @@ struct ivhd_header { u32 reserved; } __attribute__((packed)); +/* + * A device entry describing which devices a specific IOMMU translates and + * which requestor ids they use. + */ struct ivhd_entry { u8 type; u16 devid; @@ -90,6 +105,10 @@ struct ivhd_entry { u32 ext; } __attribute__((packed)); +/* + * An AMD IOMMU memory definition structure. It defines things like exclusion + * ranges for devices and regions that should be unity mapped. + */ struct ivmd_header { u8 type; u8 flags; @@ -103,22 +122,66 @@ struct ivmd_header { static int __initdata amd_iommu_detected; -u16 amd_iommu_last_bdf; -struct list_head amd_iommu_unity_map; -unsigned amd_iommu_aperture_order = 26; -int amd_iommu_isolate; +u16 amd_iommu_last_bdf; /* largest PCI device id we have + to handle */ +struct list_head amd_iommu_unity_map; /* a list of required unity mappings + we find in ACPI */ +unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ +int amd_iommu_isolate; /* if 1, device isolation is enabled */ -struct list_head amd_iommu_list; +struct list_head amd_iommu_list; /* list of all AMD IOMMUs in the + system */ + +/* + * Pointer to the device table which is shared by all AMD IOMMUs + * it is indexed by the PCI device id or the HT unit id and contains + * information about the domain the device belongs to as well as the + * page table root pointer. + */ struct dev_table_entry *amd_iommu_dev_table; + +/* + * The alias table is a driver specific data structure which contains the + * mappings of the PCI device ids to the actual requestor ids on the IOMMU. + * More than one device can share the same requestor id. + */ u16 *amd_iommu_alias_table; + +/* + * The rlookup table is used to find the IOMMU which is responsible + * for a specific device. It is also indexed by the PCI device id. + */ struct amd_iommu **amd_iommu_rlookup_table; + +/* + * The pd table (protection domain table) is used to find the protection domain + * data structure a device belongs to. Indexed with the PCI device id too. + */ struct protection_domain **amd_iommu_pd_table; + +/* + * AMD IOMMU allows up to 2^16 differend protection domains. This is a bitmap + * to know which ones are already in use. + */ unsigned long *amd_iommu_pd_alloc_bitmap; -static u32 dev_table_size; -static u32 alias_table_size; -static u32 rlookup_table_size; +static u32 dev_table_size; /* size of the device table */ +static u32 alias_table_size; /* size of the alias table */ +static u32 rlookup_table_size; /* size if the rlookup table */ + +/**************************************************************************** + * + * AMD IOMMU MMIO register space handling functions + * + * These functions are used to program the IOMMU device registers in + * MMIO space required for that driver. + * + ****************************************************************************/ +/* + * This function set the exclusion range in the IOMMU. DMA accesses to the + * exclusion range are passed through untranslated + */ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) { u64 start = iommu->exclusion_start & PAGE_MASK; @@ -137,6 +200,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { u32 entry; @@ -149,6 +213,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu) &entry, sizeof(entry)); } +/* Generic functions to enable/disable certain features of the IOMMU. */ static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; @@ -167,6 +232,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } +/* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); @@ -176,6 +242,10 @@ void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* + * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in + * the system has one. + */ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; @@ -199,6 +269,19 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) release_mem_region(iommu->mmio_phys, MMIO_REGION_LENGTH); } +/**************************************************************************** + * + * The functions below belong to the first pass of AMD IOMMU ACPI table + * parsing. In this pass we try to find out the highest device id this + * code has to handle. Upon this information the size of the shared data + * structures is determined later. + * + ****************************************************************************/ + +/* + * This function reads the last device id the IOMMU has to handle from the PCI + * capability header for this IOMMU + */ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) { u32 cap; @@ -209,6 +292,10 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) return 0; } +/* + * After reading the highest device id from the IOMMU PCI capability header + * this function looks if there is a higher device id defined in the ACPI table + */ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) { u8 *p = (void *)h, *end = (void *)h; @@ -229,6 +316,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_RANGE_END: case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: + /* all the above subfield types refer to device ids */ UPDATE_LAST_BDF(dev->devid); break; default: @@ -242,6 +330,11 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) return 0; } +/* + * Iterate over all IVHD entries in the ACPI table and find the highest device + * id which we need to handle. This is the first of three functions which parse + * the ACPI table. So we check the checksum here. + */ static int __init find_last_devid_acpi(struct acpi_table_header *table) { int i; @@ -277,6 +370,20 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions belong the the code path which parses the ACPI table + * the second time. In this ACPI parsing iteration we allocate IOMMU specific + * data structures, initialize the device/alias/rlookup table and also + * basically initialize the hardware. + * + ****************************************************************************/ + +/* + * Allocates the command buffer. This buffer is per AMD IOMMU. We can + * write commands to that buffer later and the IOMMU will execute them + * asynchronously + */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, @@ -307,6 +414,7 @@ static void __init free_command_buffer(struct amd_iommu *iommu) get_order(CMD_BUFFER_SIZE)); } +/* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { int i = (bit >> 5) & 0x07; @@ -315,6 +423,10 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +/* + * This function takes the device specific flags read from the ACPI + * table and sets up the device table entry with that information + */ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) @@ -333,11 +445,16 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); } +/* Writes the specific IOMMU for a device into the rlookup table */ static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) { amd_iommu_rlookup_table[devid] = iommu; } +/* + * Reads the device exclusion range from ACPI and initialize IOMMU with + * it + */ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) { struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; @@ -346,12 +463,22 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) return; if (iommu) { + /* + * We only can configure exclusion ranges per IOMMU, not + * per device. But we can enable the exclusion range per + * device. This is done here + */ set_dev_entry_bit(m->devid, DEV_ENTRY_EX); iommu->exclusion_start = m->range_start; iommu->exclusion_length = m->range_length; } } +/* + * This function reads some important data from the IOMMU PCI space and + * initializes the driver data structure with it. It reads the hardware + * capabilities and the first/last device entries + */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int bus = PCI_BUS(iommu->devid); @@ -367,6 +494,10 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); } +/* + * Takes a pointer to an AMD IOMMU entry in the ACPI table and + * initializes the hardware and our data structures with it. + */ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, struct ivhd_header *h) { @@ -467,6 +598,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, } } +/* Initializes the device->iommu mapping for the driver */ static int __init init_iommu_devices(struct amd_iommu *iommu) { u16 i; @@ -494,6 +626,11 @@ static void __init free_iommu_all(void) } } +/* + * This function clues the initialization function for one IOMMU + * together and also allocates the command buffer and programs the + * hardware. It does NOT enable the IOMMU. This is done afterwards. + */ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) { spin_lock_init(&iommu->lock); @@ -521,6 +658,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) return 0; } +/* + * Iterates over all IOMMU entries in the ACPI table, allocates the + * IOMMU structure and initializes it with init_iommu_one() + */ static int __init init_iommu_all(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -555,6 +696,14 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The next functions belong to the third pass of parsing the ACPI + * table. In this last pass the memory mapping requirements are + * gathered (like exclusion and unity mapping reanges). + * + ****************************************************************************/ + static void __init free_unity_maps(void) { struct unity_map_entry *entry, *next; @@ -565,6 +714,7 @@ static void __init free_unity_maps(void) } } +/* called when we find an exclusion range definition in ACPI */ static int __init init_exclusion_range(struct ivmd_header *m) { int i; @@ -588,6 +738,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) return 0; } +/* called for unity map ACPI definition */ static int __init init_unity_map_range(struct ivmd_header *m) { struct unity_map_entry *e = 0; @@ -619,6 +770,7 @@ static int __init init_unity_map_range(struct ivmd_header *m) return 0; } +/* iterates over all memory definitions we find in the ACPI table */ static int __init init_memory_definitions(struct acpi_table_header *table) { u8 *p = (u8 *)table, *end = (u8 *)table; @@ -642,6 +794,10 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * This function finally enables all IOMMUs found in the system after + * they have been initialized + */ static void __init enable_iommus(void) { struct amd_iommu *iommu; @@ -678,6 +834,34 @@ static struct sys_device device_amd_iommu = { .cls = &amd_iommu_sysdev_class, }; +/* + * This is the core init function for AMD IOMMU hardware in the system. + * This function is called from the generic x86 DMA layer initialization + * code. + * + * This function basically parses the ACPI table for AMD IOMMU (IVRS) + * three times: + * + * 1 pass) Find the highest PCI device id the driver has to handle. + * Upon this information the size of the data structures is + * determined that needs to be allocated. + * + * 2 pass) Initialize the data structures just allocated with the + * information in the ACPI table about available AMD IOMMUs + * in the system. It also maps the PCI devices in the + * system to specific IOMMUs + * + * 3 pass) After the basic data structures are allocated and + * initialized we update them with information about memory + * remapping requirements parsed out of the ACPI table in + * this last pass. + * + * After that the hardware is initialized and ready to go. In the last + * step we do some Linux specific things like registering the driver in + * the dma_ops interface and initializing the suspend/resume support + * functions. Finally it prints some information about AMD IOMMUs and + * the driver state and enables the hardware. + */ int __init amd_iommu_init(void) { int i, ret = 0; @@ -821,6 +1005,13 @@ free: goto out; } +/**************************************************************************** + * + * Early detect code. This code runs at IOMMU detection time in the DMA + * layer. It just looks if there is an IVRS ACPI table to detect AMD + * IOMMUs + * + ****************************************************************************/ static int __init early_amd_iommu_detect(struct acpi_table_header *table) { return 0; @@ -841,6 +1032,13 @@ void __init amd_iommu_detect(void) } } +/**************************************************************************** + * + * Parsing functions for the AMD IOMMU specific kernel command line + * options. + * + ****************************************************************************/ + static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { -- cgit v1.2.2 From 431b2a2015337533f1a9e39a840266a8a2c93144 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:22 +0200 Subject: x86, AMD IOMMU: add comments to core code This patch adds comments about how the AMD IOMMU core code works for the DMA remapping functionality. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 201 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 199 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f2766d84c7a0..4bae96ca7c11 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -34,6 +34,9 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); +/* + * general struct to manage commands send to an IOMMU + */ struct command { u32 data[4]; }; @@ -41,11 +44,22 @@ struct command { static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e); +/* returns !0 if the IOMMU is caching non-present entries in its TLB */ static int iommu_has_npcache(struct amd_iommu *iommu) { return iommu->cap & IOMMU_CAP_NPCACHE; } +/**************************************************************************** + * + * IOMMU command queuing functions + * + ****************************************************************************/ + +/* + * Writes the command to the IOMMUs command buffer and informs the + * hardware about the new command. Must be called with iommu->lock held. + */ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { u32 tail, head; @@ -63,6 +77,10 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return 0; } +/* + * General queuing function for commands. Takes iommu->lock and calls + * __iommu_queue_command(). + */ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) { unsigned long flags; @@ -75,6 +93,13 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) return ret; } +/* + * This function is called whenever we need to ensure that the IOMMU has + * completed execution of all commands we sent. It sends a + * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs + * us about that by writing a value to a physical address we pass with + * the command. + */ static int iommu_completion_wait(struct amd_iommu *iommu) { int ret; @@ -101,6 +126,9 @@ static int iommu_completion_wait(struct amd_iommu *iommu) return 0; } +/* + * Command send function for invalidating a device table entry + */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { struct command cmd; @@ -116,6 +144,9 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) return iommu_queue_command(iommu, &cmd); } +/* + * Generic command send function for invalidaing TLB entries + */ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { @@ -127,9 +158,9 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, cmd.data[1] |= domid; cmd.data[2] = LOW_U32(address); cmd.data[3] = HIGH_U32(address); - if (s) + if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; - if (pde) + if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; iommu->need_sync = 1; @@ -137,6 +168,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, return iommu_queue_command(iommu, &cmd); } +/* + * TLB invalidation function which is called from the mapping functions. + * It invalidates a single PTE if the range to flush is within a single + * page. Otherwise it flushes the whole TLB of the IOMMU. + */ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { @@ -159,6 +195,20 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/**************************************************************************** + * + * The functions below are used the create the page table mappings for + * unity mapped regions. + * + ****************************************************************************/ + +/* + * Generic mapping functions. It maps a physical address into a DMA + * address space. It allocates the page table pages if necessary. + * In the future it can be extended to a generic mapping function + * supporting all features of AMD IOMMU page tables like level skipping + * and full 64 bit address spaces. + */ static int iommu_map(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, @@ -209,6 +259,10 @@ static int iommu_map(struct protection_domain *dom, return 0; } +/* + * This function checks if a specific unity mapping entry is needed for + * this specific IOMMU. + */ static int iommu_for_unity_map(struct amd_iommu *iommu, struct unity_map_entry *entry) { @@ -223,6 +277,12 @@ static int iommu_for_unity_map(struct amd_iommu *iommu, return 0; } +/* + * Init the unity mappings for a specific IOMMU in the system + * + * Basically iterates over all unity mapping entries and applies them to + * the default domain DMA of that IOMMU if necessary. + */ static int iommu_init_unity_mappings(struct amd_iommu *iommu) { struct unity_map_entry *entry; @@ -239,6 +299,10 @@ static int iommu_init_unity_mappings(struct amd_iommu *iommu) return 0; } +/* + * This function actually applies the mapping to the page table of the + * dma_ops domain. + */ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, struct unity_map_entry *e) { @@ -261,6 +325,9 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, return 0; } +/* + * Inits the unity mappings required for a specific device + */ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, u16 devid) { @@ -278,12 +345,26 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, return 0; } +/**************************************************************************** + * + * The next functions belong to the address allocator for the dma_ops + * interface functions. They work like the allocators in the other IOMMU + * drivers. Its basically a bitmap which marks the allocated pages in + * the aperture. Maybe it could be enhanced in the future to a more + * efficient allocator. + * + ****************************************************************************/ static unsigned long dma_mask_to_pages(unsigned long mask) { return (mask >> PAGE_SHIFT) + (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); } +/* + * The address allocator core function. + * + * called with domain->lock held + */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages) @@ -317,6 +398,11 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, return address; } +/* + * The address free function. + * + * called with domain->lock held + */ static void dma_ops_free_addresses(struct dma_ops_domain *dom, unsigned long address, unsigned int pages) @@ -325,6 +411,16 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom, iommu_area_free(dom->bitmap, address, pages); } +/**************************************************************************** + * + * The next functions belong to the domain allocation. A domain is + * allocated for every IOMMU as the default domain. If device isolation + * is enabled, every device get its own domain. The most important thing + * about domains is the page table mapping the DMA address space they + * contain. + * + ****************************************************************************/ + static u16 domain_id_alloc(void) { unsigned long flags; @@ -342,6 +438,10 @@ static u16 domain_id_alloc(void) return id; } +/* + * Used to reserve address ranges in the aperture (e.g. for exclusion + * ranges. + */ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, unsigned long start_page, unsigned int pages) @@ -382,6 +482,10 @@ static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) free_page((unsigned long)p1); } +/* + * Free a domain, only used if something went wrong in the + * allocation path and we need to free an already allocated page table + */ static void dma_ops_domain_free(struct dma_ops_domain *dom) { if (!dom) @@ -396,6 +500,11 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom) kfree(dom); } +/* + * Allocates a new protection domain usable for the dma_ops functions. + * It also intializes the page table and the address allocator data + * structures required for the dma_ops interface + */ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, unsigned order) { @@ -436,6 +545,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; @@ -444,6 +554,11 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_ops_reserve_addresses(dma_dom, startpage, pages); } + /* + * At the last step, build the page tables so we don't need to + * allocate page table pages in the dma_ops mapping/unmapping + * path. + */ num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512); dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *), GFP_KERNEL); @@ -472,6 +587,10 @@ free_dma_dom: return NULL; } +/* + * Find out the protection domain structure for a given PCI device. This + * will give us the pointer to the page table root for example. + */ static struct protection_domain *domain_for_device(u16 devid) { struct protection_domain *dom; @@ -484,6 +603,10 @@ static struct protection_domain *domain_for_device(u16 devid) return dom; } +/* + * If a device is not yet associated with a domain, this function does + * assigns it visible for the hardware + */ static void set_device_domain(struct amd_iommu *iommu, struct protection_domain *domain, u16 devid) @@ -508,6 +631,19 @@ static void set_device_domain(struct amd_iommu *iommu, iommu->need_sync = 1; } +/***************************************************************************** + * + * The next functions belong to the dma_ops mapping/unmapping code. + * + *****************************************************************************/ + +/* + * In the dma_ops path we only have the struct device. This function + * finds the corresponding IOMMU, the protection domain and the + * requestor id for a given device. + * If the device is not yet associated with a domain this is also done + * in this function. + */ static int get_device_resources(struct device *dev, struct amd_iommu **iommu, struct protection_domain **domain, @@ -522,6 +658,7 @@ static int get_device_resources(struct device *dev, pcidev = to_pci_dev(dev); _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + /* device not translated by any IOMMU in the system? */ if (_bdf >= amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; @@ -547,6 +684,10 @@ static int get_device_resources(struct device *dev, return 1; } +/* + * This is the generic map function. It maps one 4kb page at paddr to + * the given address in the DMA address space for the domain. + */ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address, @@ -578,6 +719,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, return (dma_addr_t)address; } +/* + * The generic unmapping function for on page in the DMA address space. + */ static void dma_ops_domain_unmap(struct amd_iommu *iommu, struct dma_ops_domain *dom, unsigned long address) @@ -597,6 +741,12 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu, *pte = 0ULL; } +/* + * This function contains common code for mapping of a physically + * contiguous memory region into DMA address space. It is uses by all + * mapping functions provided by this IOMMU driver. + * Must be called with the domain lock held. + */ static dma_addr_t __map_single(struct device *dev, struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, @@ -628,6 +778,10 @@ out: return address; } +/* + * Does the reverse of the __map_single function. Must be called with + * the domain lock held too + */ static void __unmap_single(struct amd_iommu *iommu, struct dma_ops_domain *dma_dom, dma_addr_t dma_addr, @@ -652,6 +806,9 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); } +/* + * The exported map_single function for dma_ops. + */ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) { @@ -664,6 +821,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) + /* device not handled by any AMD IOMMU */ return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); @@ -683,6 +841,9 @@ out: return addr; } +/* + * The exported unmap_single function for dma_ops. + */ static void unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int dir) { @@ -692,6 +853,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, u16 devid; if (!get_device_resources(dev, &iommu, &domain, &devid)) + /* device not handled by any AMD IOMMU */ return; spin_lock_irqsave(&domain->lock, flags); @@ -706,6 +868,10 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * This is a special map_sg function which is used if we should map a + * device which is not handled by an AMD IOMMU in the system. + */ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -720,6 +886,10 @@ static int map_sg_no_iommu(struct device *dev, struct scatterlist *sglist, return nelems; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static int map_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -775,6 +945,10 @@ unmap: goto out; } +/* + * The exported map_sg function for dma_ops (handles scatter-gather + * lists). + */ static void unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int dir) { @@ -804,6 +978,9 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, spin_unlock_irqrestore(&domain->lock, flags); } +/* + * The exported alloc_coherent function for dma_ops. + */ static void *alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { @@ -851,6 +1028,11 @@ out: return virt_addr; } +/* + * The exported free_coherent function for dma_ops. + * FIXME: fix the generic x86 DMA layer so that it actually calls that + * function. + */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) { @@ -879,6 +1061,8 @@ free_mem: } /* + * The function for pre-allocating protection domains. + * * If the driver core informs the DMA layer if a driver grabs a device * we don't need to preallocate the protection domains anymore. * For now we have to. @@ -921,12 +1105,20 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_sg = unmap_sg, }; +/* + * The function which clues the AMD IOMMU driver into dma_ops. + */ int __init amd_iommu_init_dma_ops(void) { struct amd_iommu *iommu; int order = amd_iommu_aperture_order; int ret; + /* + * first allocate a default protection domain for every IOMMU we + * found in the system. Devices not assigned to any other + * protection domain will be assigned to the default one. + */ list_for_each_entry(iommu, &amd_iommu_list, list) { iommu->default_dom = dma_ops_domain_alloc(iommu, order); if (iommu->default_dom == NULL) @@ -936,6 +1128,10 @@ int __init amd_iommu_init_dma_ops(void) goto free_domains; } + /* + * If device isolation is enabled, pre-allocate the protection + * domains for each device. + */ if (amd_iommu_isolate) prealloc_protection_domains(); @@ -947,6 +1143,7 @@ int __init amd_iommu_init_dma_ops(void) gart_iommu_aperture = 0; #endif + /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; return 0; -- cgit v1.2.2 From 8ea80d783efd0c50577ec8d69757ae54c408eacd Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:23 +0200 Subject: x86, AMD IOMMU: replace HIGH_U32 macro with upper_32_bits function Removes a driver specific macro and replaces it with a generic function already available in Linux. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 4bae96ca7c11..9098f047c1a9 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -109,7 +109,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = HIGH_U32(ready_phys); + cmd.data[1] = upper_32_bits(ready_phys); cmd.data[2] = 1; /* value written to 'ready' */ CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); @@ -157,7 +157,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; cmd.data[2] = LOW_U32(address); - cmd.data[3] = HIGH_U32(address); + cmd.data[3] = upper_32_bits(address); if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ -- cgit v1.2.2 From 208ec8c94d818a3def0b424958493728871716d1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:24 +0200 Subject: x86, AMD IOMMU: replace UPDATE_LAST_BDF macro with a function This patch replaces the UPDATE_LAST_BDF macro in the init code with the update_last_devid function. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9ddb46d7c524..6e1c8ffc0c5b 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -30,11 +30,6 @@ /* * definitions for the ACPI scanning code */ -#define UPDATE_LAST_BDF(x) do {\ - if ((x) > amd_iommu_last_bdf) \ - amd_iommu_last_bdf = (x); \ - } while (0); - #define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 @@ -169,6 +164,12 @@ static u32 dev_table_size; /* size of the device table */ static u32 alias_table_size; /* size of the alias table */ static u32 rlookup_table_size; /* size if the rlookup table */ +static inline void update_last_devid(u16 devid) +{ + if (devid > amd_iommu_last_bdf) + amd_iommu_last_bdf = devid; +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -287,7 +288,7 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - UPDATE_LAST_BDF(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } @@ -317,7 +318,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) case IVHD_DEV_ALIAS: case IVHD_DEV_EXT_SELECT: /* all the above subfield types refer to device ids */ - UPDATE_LAST_BDF(dev->devid); + update_last_devid(dev->devid); break; default: break; -- cgit v1.2.2 From c571484e53f3e1d90bc5374528580c7419d28d4c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:25 +0200 Subject: x86, AMD IOMMU: replace TBL_SIZE macro with a function This patch converts the TBL_SIZE macro in the init code to a function. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6e1c8ffc0c5b..1f148393cf7a 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -33,7 +33,6 @@ #define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 -#define TBL_SIZE(x) (1 << (PAGE_SHIFT + get_order(amd_iommu_last_bdf * (x)))) #define ACPI_IVHD_TYPE 0x10 #define ACPI_IVMD_TYPE_ALL 0x20 @@ -170,6 +169,14 @@ static inline void update_last_devid(u16 devid) amd_iommu_last_bdf = devid; } +static inline unsigned long tbl_size(int entry_size) +{ + unsigned shift = PAGE_SHIFT + + get_order(amd_iommu_last_bdf * entry_size); + + return 1UL << shift; +} + /**************************************************************************** * * AMD IOMMU MMIO register space handling functions @@ -884,9 +891,9 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) return -ENODEV; - dev_table_size = TBL_SIZE(DEV_TABLE_ENTRY_SIZE); - alias_table_size = TBL_SIZE(ALIAS_TABLE_ENTRY_SIZE); - rlookup_table_size = TBL_SIZE(RLOOKUP_TABLE_ENTRY_SIZE); + dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); + alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); + rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); ret = -ENOMEM; -- cgit v1.2.2 From 9a836de0c9944c42d006ec241712c72e74737c73 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:26 +0200 Subject: x86, AMD IOMMU: remove unnecessary free checks from init code This patch removes unnecessary checks before memory is released. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 27 ++++++++++----------------- 1 file changed, 10 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1f148393cf7a..0f5a9115a694 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -417,9 +417,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - if (iommu->cmd_buf) - free_pages((unsigned long)iommu->cmd_buf, - get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } /* sets a specific bit in the device table entry. */ @@ -987,24 +985,19 @@ out: return ret; free: - if (amd_iommu_pd_alloc_bitmap) - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); - if (amd_iommu_pd_table) - free_pages((unsigned long)amd_iommu_pd_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_pd_table, + get_order(rlookup_table_size)); - if (amd_iommu_rlookup_table) - free_pages((unsigned long)amd_iommu_rlookup_table, - get_order(rlookup_table_size)); + free_pages((unsigned long)amd_iommu_rlookup_table, + get_order(rlookup_table_size)); - if (amd_iommu_alias_table) - free_pages((unsigned long)amd_iommu_alias_table, - get_order(alias_table_size)); + free_pages((unsigned long)amd_iommu_alias_table, + get_order(alias_table_size)); - if (amd_iommu_dev_table) - free_pages((unsigned long)amd_iommu_dev_table, - get_order(dev_table_size)); + free_pages((unsigned long)amd_iommu_dev_table, + get_order(dev_table_size)); free_iommu_all(); -- cgit v1.2.2 From 136f78a19cf94d469f31a4009c7c0ac2301fbbf0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:27 +0200 Subject: x86, AMD IOMMU: add an emergency exit to the completion wait loop To make the loop waiting for the completion wait command not wait forever this patch adds a limit of cycles that loop. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 9098f047c1a9..7fa2d5d57dd8 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -32,6 +32,8 @@ #define to_pages(addr, size) \ (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) +#define EXIT_LOOP_COUNT 10000000 + static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* @@ -106,6 +108,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) struct command cmd; volatile u64 ready = 0; unsigned long ready_phys = virt_to_phys(&ready); + unsigned long i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; @@ -120,8 +123,13 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (ret) return ret; - while (!ready) + while (!ready && (i < EXIT_LOOP_COUNT)) { + ++i; cpu_relax(); + } + + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) + printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); return 0; } -- cgit v1.2.2 From d64495366ff78fdbd5bd3176a7ada2f0c2cbfba6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:28 +0200 Subject: x86, AMD IOMMU: rename struct command to iommu_cmd This patch gives the struct command a more descriptive and not so generic name. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7fa2d5d57dd8..dec10e1a397c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -39,7 +39,7 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); /* * general struct to manage commands send to an IOMMU */ -struct command { +struct iommu_cmd { u32 data[4]; }; @@ -62,7 +62,7 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command. Must be called with iommu->lock held. */ -static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { u32 tail, head; u8 *target; @@ -83,7 +83,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) * General queuing function for commands. Takes iommu->lock and calls * __iommu_queue_command(). */ -static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) +static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) { unsigned long flags; int ret; @@ -105,7 +105,7 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct command *cmd) static int iommu_completion_wait(struct amd_iommu *iommu) { int ret; - struct command cmd; + struct iommu_cmd cmd; volatile u64 ready = 0; unsigned long ready_phys = virt_to_phys(&ready); unsigned long i = 0; @@ -139,7 +139,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) */ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { - struct command cmd; + struct iommu_cmd cmd; BUG_ON(iommu == NULL); @@ -158,7 +158,7 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { - struct command cmd; + struct iommu_cmd cmd; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; -- cgit v1.2.2 From d0312b2142ac7665031755c1cc3dba827d4eb711 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:29 +0200 Subject: x86, AMD IOMMU: remove unneeded initializations from command buffer allocation This patch removes an unneeded initialization from the alloc_command_buffer function and replaces a memset with __GFP_ZERO. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0f5a9115a694..0124995c7b5d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -394,17 +394,15 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table) */ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) { - u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL, + u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(CMD_BUFFER_SIZE)); - u64 entry = 0; + u64 entry; if (cmd_buf == NULL) return NULL; iommu->cmd_buf_size = CMD_BUFFER_SIZE; - memset(cmd_buf, 0, CMD_BUFFER_SIZE); - entry = (u64)virt_to_phys(cmd_buf); entry |= MMIO_CMD_SIZE_512; memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET, -- cgit v1.2.2 From 58a3bee567b588a84cdde05fecc45439b396362c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:30 +0200 Subject: x86, AMD IOMMU: use true/false instead of 0/1 for bool value This patch replaces the integer values used for the bool variable in ACPI scanning code with true and false. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0124995c7b5d..316fe2eaeefe 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -509,7 +509,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, u8 *end = p, flags = 0; u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; u32 ext_flags = 0; - bool alias = 0; + bool alias = false; struct ivhd_entry *e; /* @@ -559,7 +559,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid_start = e->devid; flags = e->flags; ext_flags = 0; - alias = 0; + alias = false; break; case IVHD_DEV_ALIAS: devid = e->devid; @@ -572,7 +572,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, flags = e->flags; devid_to = e->ext >> 8; ext_flags = 0; - alias = 1; + alias = true; break; case IVHD_DEV_EXT_SELECT: devid = e->devid; @@ -582,7 +582,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, devid_start = e->devid; flags = e->flags; ext_flags = e->ext; - alias = 0; + alias = false; break; case IVHD_DEV_RANGE_END: devid = e->devid; -- cgit v1.2.2 From 2e22847fbe05f2543ccebd0c2df94d9cf3c52aa5 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:31 +0200 Subject: x86, AMD IOMMU: do runtime list initialization at compile time This patch changes the list initialization for the iommu list and the unity map list from runtime to compile time. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 316fe2eaeefe..0c247032308e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -118,12 +118,12 @@ static int __initdata amd_iommu_detected; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ -struct list_head amd_iommu_unity_map; /* a list of required unity mappings +LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ int amd_iommu_isolate; /* if 1, device isolation is enabled */ -struct list_head amd_iommu_list; /* list of all AMD IOMMUs in the +LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ /* @@ -673,8 +673,6 @@ static int __init init_iommu_all(struct acpi_table_header *table) struct amd_iommu *iommu; int ret; - INIT_LIST_HEAD(&amd_iommu_list); - end += table->length; p += IVRS_HEADER_LENGTH; @@ -780,8 +778,6 @@ static int __init init_memory_definitions(struct acpi_table_header *table) u8 *p = (u8 *)table, *end = (u8 *)table; struct ivmd_header *m; - INIT_LIST_HEAD(&amd_iommu_unity_map); - end += table->length; p += IVRS_HEADER_LENGTH; -- cgit v1.2.2 From 5dc8bff0f6d0dfeb1f1c6e694294ba7c33d099f1 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:32 +0200 Subject: x86, AMD IOMMU: replace memset with __GFP_ZERO for table allocation This patch removes the memset from the data structure initialization code and allocate the structures with the __GFP_ZERO flag. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 0c247032308e..2efc3d59b7e6 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -890,7 +890,7 @@ int __init amd_iommu_init(void) ret = -ENOMEM; /* Device table - directly used by all IOMMUs */ - amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_dev_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(dev_table_size)); if (amd_iommu_dev_table == NULL) goto out; @@ -914,27 +914,23 @@ int __init amd_iommu_init(void) * Protection Domain table - maps devices to protection domains * This table has the same size as the rlookup_table */ - amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(rlookup_table_size)); if (amd_iommu_pd_table == NULL) goto free; - amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(GFP_KERNEL, + amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( + GFP_KERNEL | __GFP_ZERO, get_order(MAX_DOMAIN_ID/8)); if (amd_iommu_pd_alloc_bitmap == NULL) goto free; /* - * memory is allocated now; initialize the device table with all zeroes - * and let all alias entries point to itself + * let all alias entries point to itself */ - memset(amd_iommu_dev_table, 0, dev_table_size); for (i = 0; i < amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; - memset(amd_iommu_pd_table, 0, rlookup_table_size); - memset(amd_iommu_pd_alloc_bitmap, 0, MAX_DOMAIN_ID / 8); - /* * never allocate domain 0 because its used as the non-allocated and * error value placeholder -- cgit v1.2.2 From 0906372e6cf372f3162481f24a0b8ccae0eff4d7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:33 +0200 Subject: x86, AMD IOMMU: replace self made size parsing with memparse call This patch replaces the self-made parsing of the amd_iommu_size option with the generic memparse function call. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 2efc3d59b7e6..e0ff9404e6c9 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1042,20 +1042,10 @@ static int __init parse_amd_iommu_options(char *str) static int __init parse_amd_iommu_size_options(char *str) { - for (; *str; ++str) { - if (strcmp(str, "32M") == 0) - amd_iommu_aperture_order = 25; - if (strcmp(str, "64M") == 0) - amd_iommu_aperture_order = 26; - if (strcmp(str, "128M") == 0) - amd_iommu_aperture_order = 27; - if (strcmp(str, "256M") == 0) - amd_iommu_aperture_order = 28; - if (strcmp(str, "512M") == 0) - amd_iommu_aperture_order = 29; - if (strcmp(str, "1G") == 0) - amd_iommu_aperture_order = 30; - } + unsigned order = PAGE_SHIFT + get_order(memparse(str, &str)); + + if ((order > 24) && (order < 31)) + amd_iommu_aperture_order = order; return 1; } -- cgit v1.2.2 From d591b0a3ae25f587d0c4da1e1d1a425143590790 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 11 Jul 2008 17:14:35 +0200 Subject: x86, AMD IOMMU: replace DEVID macro with a function This patch replaces the DEVID macro with a function and uses them where apropriate (also in the core code). Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/amd_iommu_init.c | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index dec10e1a397c..8c3deb027d3a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -664,7 +664,7 @@ static int get_device_resources(struct device *dev, BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); pcidev = to_pci_dev(dev); - _bdf = (pcidev->bus->number << 8) | pcidev->devfn; + _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ if (_bdf >= amd_iommu_last_bdf) { diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index e0ff9404e6c9..9bf1b8111b08 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -30,7 +30,6 @@ /* * definitions for the ACPI scanning code */ -#define DEVID(bus, devfn) (((bus) << 8) | (devfn)) #define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 @@ -295,7 +294,7 @@ static int __init find_last_devid_on_pci(int bus, int dev, int fn, int cap_ptr) u32 cap; cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - update_last_devid(DEVID(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); + update_last_devid(calc_devid(MMIO_GET_BUS(cap), MMIO_GET_LD(cap))); return 0; } @@ -494,8 +493,10 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu) iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); - iommu->first_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_FD(range)); - iommu->last_device = DEVID(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->first_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_FD(range)); + iommu->last_device = calc_devid(MMIO_GET_BUS(range), + MMIO_GET_LD(range)); } /* -- cgit v1.2.2 From 2510495e208e7a69b64fcf5cdf8966d873536d9e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 Jul 2008 12:13:59 +0200 Subject: x86/pci: Removing pci-y in Makefile Cc: Sam Ravnborg Signed-off-by: Robert Richter Cc: Robert Richter Cc: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/pci/Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index e515e8db842a..28451f41e0e4 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -5,13 +5,13 @@ obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o obj-$(CONFIG_PCI_DIRECT) += direct.o obj-$(CONFIG_PCI_OLPC) += olpc.o -pci-y := fixup.o -pci-$(CONFIG_ACPI) += acpi.o -pci-y += legacy.o irq.o +obj-y += fixup.o +obj-$(CONFIG_ACPI) += acpi.o +obj-y += legacy.o irq.o -pci-$(CONFIG_X86_VISWS) += visws.o +obj-$(CONFIG_X86_VISWS) += visws.o -pci-$(CONFIG_X86_NUMAQ) += numa.o +obj-$(CONFIG_X86_NUMAQ) += numa.o -obj-y += $(pci-y) common.o early.o +obj-y += common.o early.o obj-y += amd_bus.o -- cgit v1.2.2 From 060b9708a0c04cf9af69c128ef7954b6f0a84180 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 Jul 2008 12:14:27 +0200 Subject: x86/pci: Changing subsystem initialization order for NUMA Cc: Yinghai Lu Signed-off-by: Robert Richter Cc: Robert Richter Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/pci/legacy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 132876cc6fca..60e8caa13563 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -57,14 +57,14 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { +#ifdef CONFIG_X86_NUMAQ + pci_numa_init(); +#endif #ifdef CONFIG_ACPI pci_acpi_init(); #endif pci_legacy_init(); pcibios_irq_init(); -#ifdef CONFIG_X86_NUMAQ - pci_numa_init(); -#endif pcibios_init(); return 0; -- cgit v1.2.2 From 9314d301390ad0d96986da3d893a21e81a287982 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 Jul 2008 12:18:40 +0200 Subject: x86/pci: renamed: numa.c -> numaq_32.c Cc: Yinghai Lu Signed-off-by: Robert Richter Cc: Robert Richter Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/pci/Makefile | 2 +- arch/x86/pci/numa.c | 178 ------------------------------------------------ arch/x86/pci/numaq_32.c | 178 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 179 insertions(+), 179 deletions(-) delete mode 100644 arch/x86/pci/numa.c create mode 100644 arch/x86/pci/numaq_32.c (limited to 'arch/x86') diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile index 28451f41e0e4..d49202e740ea 100644 --- a/arch/x86/pci/Makefile +++ b/arch/x86/pci/Makefile @@ -11,7 +11,7 @@ obj-y += legacy.o irq.o obj-$(CONFIG_X86_VISWS) += visws.o -obj-$(CONFIG_X86_NUMAQ) += numa.o +obj-$(CONFIG_X86_NUMAQ) += numaq_32.o obj-y += common.o early.o obj-y += amd_bus.o diff --git a/arch/x86/pci/numa.c b/arch/x86/pci/numa.c deleted file mode 100644 index 8b5ca1966731..000000000000 --- a/arch/x86/pci/numa.c +++ /dev/null @@ -1,178 +0,0 @@ -/* - * numa.c - Low-level PCI access for NUMA-Q machines - */ - -#include -#include -#include -#include -#include -#include "pci.h" - -#define XQUAD_PORTIO_BASE 0xfe400000 -#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ - -#define BUS2QUAD(global) (mp_bus_id_to_node[global]) - -#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) - -#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) - -/* Where the IO area was mapped on multiquad, always 0 otherwise */ -void *xquad_portio; -EXPORT_SYMBOL(xquad_portio); - -#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) - -#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ - (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) - -static void write_cf8(unsigned bus, unsigned devfn, unsigned reg) -{ - unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg); - if (xquad_portio) - writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus))); - else - outl(val, 0xCF8); -} - -static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 *value) -{ - unsigned long flags; - void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); - - if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) - return -EINVAL; - - spin_lock_irqsave(&pci_config_lock, flags); - - write_cf8(bus, devfn, reg); - - switch (len) { - case 1: - if (xquad_portio) - *value = readb(adr + (reg & 3)); - else - *value = inb(0xCFC + (reg & 3)); - break; - case 2: - if (xquad_portio) - *value = readw(adr + (reg & 2)); - else - *value = inw(0xCFC + (reg & 2)); - break; - case 4: - if (xquad_portio) - *value = readl(adr); - else - *value = inl(0xCFC); - break; - } - - spin_unlock_irqrestore(&pci_config_lock, flags); - - return 0; -} - -static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, - unsigned int devfn, int reg, int len, u32 value) -{ - unsigned long flags; - void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); - - if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) - return -EINVAL; - - spin_lock_irqsave(&pci_config_lock, flags); - - write_cf8(bus, devfn, reg); - - switch (len) { - case 1: - if (xquad_portio) - writeb(value, adr + (reg & 3)); - else - outb((u8)value, 0xCFC + (reg & 3)); - break; - case 2: - if (xquad_portio) - writew(value, adr + (reg & 2)); - else - outw((u16)value, 0xCFC + (reg & 2)); - break; - case 4: - if (xquad_portio) - writel(value, adr + reg); - else - outl((u32)value, 0xCFC); - break; - } - - spin_unlock_irqrestore(&pci_config_lock, flags); - - return 0; -} - -#undef PCI_CONF1_MQ_ADDRESS - -static struct pci_raw_ops pci_direct_conf1_mq = { - .read = pci_conf1_mq_read, - .write = pci_conf1_mq_write -}; - - -static void __devinit pci_fixup_i450nx(struct pci_dev *d) -{ - /* - * i450NX -- Find and scan all secondary buses on all PXB's. - */ - int pxb, reg; - u8 busno, suba, subb; - int quad = BUS2QUAD(d->bus->number); - - printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); - reg = 0xd0; - for(pxb=0; pxb<2; pxb++) { - pci_read_config_byte(d, reg++, &busno); - pci_read_config_byte(d, reg++, &suba); - pci_read_config_byte(d, reg++, &subb); - DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); - if (busno) { - /* Bus A */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); - } - if (suba < subb) { - /* Bus B */ - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1)); - } - } - pcibios_last_bus = -1; -} -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); - -int __init pci_numa_init(void) -{ - int quad; - - if (!found_numaq) - return 0; - - raw_pci_ops = &pci_direct_conf1_mq; - - if (pcibios_scanned++) - return 0; - - pci_root_bus = pcibios_scan_root(0); - if (pci_root_bus) - pci_bus_add_devices(pci_root_bus); - if (num_online_nodes() > 1) - for_each_online_node(quad) { - if (quad == 0) - continue; - printk("Scanning PCI bus %d for quad %d\n", - QUADLOCAL2BUS(quad,0), quad); - pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0)); - } - return 0; -} diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c new file mode 100644 index 000000000000..8b5ca1966731 --- /dev/null +++ b/arch/x86/pci/numaq_32.c @@ -0,0 +1,178 @@ +/* + * numa.c - Low-level PCI access for NUMA-Q machines + */ + +#include +#include +#include +#include +#include +#include "pci.h" + +#define XQUAD_PORTIO_BASE 0xfe400000 +#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */ + +#define BUS2QUAD(global) (mp_bus_id_to_node[global]) + +#define BUS2LOCAL(global) (mp_bus_id_to_local[global]) + +#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) + +/* Where the IO area was mapped on multiquad, always 0 otherwise */ +void *xquad_portio; +EXPORT_SYMBOL(xquad_portio); + +#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port) + +#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ + (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) + +static void write_cf8(unsigned bus, unsigned devfn, unsigned reg) +{ + unsigned val = PCI_CONF1_MQ_ADDRESS(bus, devfn, reg); + if (xquad_portio) + writel(val, XQUAD_PORT_ADDR(0xcf8, BUS2QUAD(bus))); + else + outl(val, 0xCF8); +} + +static int pci_conf1_mq_read(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 *value) +{ + unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); + + if (!value || (bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + write_cf8(bus, devfn, reg); + + switch (len) { + case 1: + if (xquad_portio) + *value = readb(adr + (reg & 3)); + else + *value = inb(0xCFC + (reg & 3)); + break; + case 2: + if (xquad_portio) + *value = readw(adr + (reg & 2)); + else + *value = inw(0xCFC + (reg & 2)); + break; + case 4: + if (xquad_portio) + *value = readl(adr); + else + *value = inl(0xCFC); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, + unsigned int devfn, int reg, int len, u32 value) +{ + unsigned long flags; + void *adr __iomem = XQUAD_PORT_ADDR(0xcfc, BUS2QUAD(bus)); + + if ((bus >= MAX_MP_BUSSES) || (devfn > 255) || (reg > 255)) + return -EINVAL; + + spin_lock_irqsave(&pci_config_lock, flags); + + write_cf8(bus, devfn, reg); + + switch (len) { + case 1: + if (xquad_portio) + writeb(value, adr + (reg & 3)); + else + outb((u8)value, 0xCFC + (reg & 3)); + break; + case 2: + if (xquad_portio) + writew(value, adr + (reg & 2)); + else + outw((u16)value, 0xCFC + (reg & 2)); + break; + case 4: + if (xquad_portio) + writel(value, adr + reg); + else + outl((u32)value, 0xCFC); + break; + } + + spin_unlock_irqrestore(&pci_config_lock, flags); + + return 0; +} + +#undef PCI_CONF1_MQ_ADDRESS + +static struct pci_raw_ops pci_direct_conf1_mq = { + .read = pci_conf1_mq_read, + .write = pci_conf1_mq_write +}; + + +static void __devinit pci_fixup_i450nx(struct pci_dev *d) +{ + /* + * i450NX -- Find and scan all secondary buses on all PXB's. + */ + int pxb, reg; + u8 busno, suba, subb; + int quad = BUS2QUAD(d->bus->number); + + printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); + reg = 0xd0; + for(pxb=0; pxb<2; pxb++) { + pci_read_config_byte(d, reg++, &busno); + pci_read_config_byte(d, reg++, &suba); + pci_read_config_byte(d, reg++, &subb); + DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + if (busno) { + /* Bus A */ + pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); + } + if (suba < subb) { + /* Bus B */ + pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, suba+1)); + } + } + pcibios_last_bus = -1; +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); + +int __init pci_numa_init(void) +{ + int quad; + + if (!found_numaq) + return 0; + + raw_pci_ops = &pci_direct_conf1_mq; + + if (pcibios_scanned++) + return 0; + + pci_root_bus = pcibios_scan_root(0); + if (pci_root_bus) + pci_bus_add_devices(pci_root_bus); + if (num_online_nodes() > 1) + for_each_online_node(quad) { + if (quad == 0) + continue; + printk("Scanning PCI bus %d for quad %d\n", + QUADLOCAL2BUS(quad,0), quad); + pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, 0)); + } + return 0; +} -- cgit v1.2.2 From e27cf3a2e151b79375efadf71a5d383ad416fb44 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 Jul 2008 12:18:41 +0200 Subject: x86/pci: renaming numa into numaq Cc: Yinghai Lu Signed-off-by: Robert Richter Cc: Robert Richter Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/pci/legacy.c | 2 +- arch/x86/pci/numaq_32.c | 4 ++-- arch/x86/pci/pci.h | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index 60e8caa13563..f405eb0b8911 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -58,7 +58,7 @@ static int __init pci_legacy_init(void) int __init pci_subsys_init(void) { #ifdef CONFIG_X86_NUMAQ - pci_numa_init(); + pci_numaq_init(); #endif #ifdef CONFIG_ACPI pci_acpi_init(); diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 8b5ca1966731..f4b16dc11dad 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -1,5 +1,5 @@ /* - * numa.c - Low-level PCI access for NUMA-Q machines + * numaq_32.c - Low-level PCI access for NUMA-Q machines */ #include @@ -151,7 +151,7 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_82451NX, pci_fixup_i450nx); -int __init pci_numa_init(void) +int __init pci_numaq_init(void) { int quad; diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index b2270a55b0cf..36b8dd019fab 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -107,7 +107,7 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); extern int __init pcibios_irq_init(void); -extern int __init pci_numa_init(void); +extern int __init pci_numaq_init(void); extern int __init pcibios_init(void); /* pci-mmconfig.c */ -- cgit v1.2.2 From 3cabf37f6167125cb5185db05f5061650f685ab7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 11 Jul 2008 12:26:59 +0200 Subject: x86/pci: Changing subsystem init for visws I don't know, if this new code boots, but at least it compiles. Someone should really test it. Signed-off-by: Robert Richter Cc: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/pci/legacy.c | 3 +++ arch/x86/pci/pci.h | 1 + arch/x86/pci/visws.c | 23 +++++++---------------- 3 files changed, 11 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index f405eb0b8911..ec9ce35e44d6 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -62,6 +62,9 @@ int __init pci_subsys_init(void) #endif #ifdef CONFIG_ACPI pci_acpi_init(); +#endif +#ifdef CONFIG_X86_VISWS + pci_visws_init(); #endif pci_legacy_init(); pcibios_irq_init(); diff --git a/arch/x86/pci/pci.h b/arch/x86/pci/pci.h index 36b8dd019fab..a2c55ee98aff 100644 --- a/arch/x86/pci/pci.h +++ b/arch/x86/pci/pci.h @@ -107,6 +107,7 @@ extern void __init dmi_check_skip_isa_align(void); /* some common used subsys_initcalls */ extern int __init pci_acpi_init(void); extern int __init pcibios_irq_init(void); +extern int __init pci_visws_init(void); extern int __init pci_numaq_init(void); extern int __init pcibios_init(void); diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 1a7bed492bb1..42f4cb19faca 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -86,8 +86,14 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -static int __init pci_visws_init(void) +int __init pci_visws_init(void) { + if (!is_visws_box()) + return -1; + + pcibios_enable_irq = &pci_visws_enable_irq; + pcibios_disable_irq = &pci_visws_disable_irq; + /* The VISWS supports configuration access type 1 only */ pci_probe = (pci_probe | PCI_PROBE_CONF1) & ~(PCI_PROBE_BIOS | PCI_PROBE_CONF2); @@ -105,18 +111,3 @@ static int __init pci_visws_init(void) pcibios_resource_survey(); return 0; } - -static __init int pci_subsys_init(void) -{ - if (!is_visws_box()) - return -1; - - pcibios_enable_irq = &pci_visws_enable_irq; - pcibios_disable_irq = &pci_visws_disable_irq; - - pci_visws_init(); - pcibios_init(); - - return 0; -} -subsys_initcall(pci_subsys_init); -- cgit v1.2.2 From d94d93ca5cc36cd78c532def62772c98fe8ba5d7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:46 -0700 Subject: x64, x2apic/intr-remap: 8259 specific mask/unmask routines 8259 specific mask/unmask routines which be used later while enabling interrupt-remapping. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/i8259.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index dc92b49d9204..4b8a53d841f7 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -282,6 +282,30 @@ static int __init i8259A_init_sysfs(void) device_initcall(i8259A_init_sysfs); +void mask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + +void unmask_8259A(void) +{ + unsigned long flags; + + spin_lock_irqsave(&i8259A_lock, flags); + + outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ + outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ + + spin_unlock_irqrestore(&i8259A_lock, flags); +} + void init_8259A(int auto_eoi) { unsigned long flags; -- cgit v1.2.2 From 4dc2f96cacd1e74c688f94348a3bfd0a980817d5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:47 -0700 Subject: x64, x2apic/intr-remap: ioapic routines which deal with initial io-apic RTE setup Generic ioapic specific routines which be used later during enabling interrupt-remapping. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 66 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9e645cba11c4..84dd63c13d63 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -114,6 +114,9 @@ DEFINE_SPINLOCK(vector_lock); */ int nr_ioapic_registers[MAX_IO_APICS]; +/* I/O APIC RTE contents at the OS boot up */ +struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + /* I/O APIC entries */ struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; int nr_ioapics; @@ -446,6 +449,69 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} + int skip_ioapic_setup; int ioapic_force; -- cgit v1.2.2 From 0c81c746f9bdbfaafe64322d540c8b7b59c27314 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:48 -0700 Subject: x64, x2apic/intr-remap: introduce read_apic_id() to genapic routines Move the read_apic_id() to genapic routines. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 5 +++++ arch/x86/kernel/genapic_64.c | 11 ----------- arch/x86/kernel/genapic_flat_64.c | 14 +++++++++++++- arch/x86/kernel/genx2apic_uv_x.c | 14 +++++++++++++- 4 files changed, 31 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c14..9dd4ee4735f5 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1096,6 +1096,11 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} + /* * Power management */ diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 1fa8be5bd217..7414871751a7 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -79,17 +79,6 @@ int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -unsigned int read_apic_id(void) -{ - unsigned int id; - - WARN_ON(preemptible() && num_online_cpus() > 1); - id = apic_read(APIC_ID); - if (uv_system_type >= UV_X2APIC) - id |= __get_cpu_var(x2apic_extra_bits); - return id; -} - enum uv_system_type get_uv_system_type(void) { return uv_system_type; diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee8..400ed8df8b4e 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -15,9 +15,11 @@ #include #include #include +#include #include #include #include +#include static cpumask_t flat_target_cpus(void) { @@ -95,9 +97,17 @@ static void flat_send_IPI_all(int vector) __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } +static unsigned int read_xapic_id(void) +{ + unsigned int id; + + id = GET_XAPIC_ID(apic_read(APIC_ID)); + return id; +} + static int flat_apic_id_registered(void) { - return physid_isset(GET_APIC_ID(read_apic_id()), phys_cpu_present_map); + return physid_isset(read_xapic_id(), phys_cpu_present_map); } static unsigned int flat_cpu_mask_to_apicid(cpumask_t cpumask) @@ -123,6 +133,7 @@ struct genapic apic_flat = { .send_IPI_mask = flat_send_IPI_mask, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .read_apic_id = read_xapic_id, }; /* @@ -187,4 +198,5 @@ struct genapic apic_physflat = { .send_IPI_mask = physflat_send_IPI_mask, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, + .read_apic_id = read_xapic_id, }; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b06..1ef99be18488 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -134,9 +135,19 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) return BAD_APICID; } +static unsigned int uv_read_apic_id(void) +{ + unsigned int id; + + WARN_ON(preemptible() && num_online_cpus() > 1); + id = apic_read(APIC_ID) | __get_cpu_var(x2apic_extra_bits); + + return id; +} + static unsigned int phys_pkg_id(int index_msb) { - return GET_APIC_ID(read_apic_id()) >> index_msb; + return uv_read_apic_id() >> index_msb; } #ifdef ZZZ /* Needs x2apic patch */ @@ -159,6 +170,7 @@ struct genapic apic_x2apic_uv_x = { /* ZZZ.send_IPI_self = uv_send_IPI_self, */ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ + .read_apic_id = uv_read_apic_id, }; static __cpuinit void set_x2apic_extra_bits(int pnode) -- cgit v1.2.2 From 2d7a66d02e11af9ab8e16c76d22767e622b4e3d7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 14:24:19 -0700 Subject: x64, x2apic/intr-remap: Interrupt-remapping and x2apic support, fix Yinghai Lu wrote: > Setting APIC routing to physical flat > Kernel panic - not syncing: Boot APIC ID in local APIC unexpected (0 vs 4) > Pid: 1, comm: swapper Not tainted 2.6.26-rc9-tip-01763-g74f94b1-dirty #320 > > Call Trace: > [] ? set_cpu_sibling_map+0x38c/0x3bd > [] ? read_xapic_id+0x25/0x3e > [] ? verify_local_APIC+0x139/0x1b9 > [] ? read_xapic_id+0x25/0x3e > [] ? native_smp_prepare_cpus+0x224/0x2e9 > [] ? kernel_init+0x64/0x341 > [] ? child_rip+0xa/0x11 > [] ? kernel_init+0x0/0x341 > [] ? child_rip+0x0/0x11 > > > guess read_apic_id changing cuase some problem... genapic's read_apic_id() returns the actual apic id extracted from the APIC_ID register. And in some cases like UV, read_apic_id() returns completely different values from APIC ID register. Use the native apic register read, rather than genapic read_apic_id() in verify_local_APIC() And also, lapic_suspend() should also use native apic register read. Reported-by: Yinghai Lu Signed-off-by: Suresh Siddha Cc: "akpm@linux-foundation.org" Cc: "arjan@linux.intel.com" Cc: "andi@firstfloor.org" Cc: "ebiederm@xmission.com" Cc: "jbarnes@virtuousgeek.org" Cc: "steiner@sgi.com" Cc: "jeremy@goop.org" Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 9dd4ee4735f5..3963f590c3d4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -626,10 +626,10 @@ int __init verify_local_APIC(void) /* * The ID register is read/write in a real APIC. */ - reg0 = read_apic_id(); + reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = read_apic_id(); + reg1 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); apic_write(APIC_ID, reg0); if (reg1 != (reg0 ^ APIC_ID_MASK)) @@ -1137,7 +1137,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) maxlvt = lapic_get_maxlvt(); - apic_pm_state.apic_id = read_apic_id(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); -- cgit v1.2.2 From 1b374e4d6f8b3eb2fcd034fcc24ea8ba1dfde7aa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:49 -0700 Subject: x64, x2apic/intr-remap: basic apic ops support Introduce basic apic operations which handle the apic programming. This will be used later to introduce another specific operations for x2apic. For the perfomance critial accesses like IPI's, EOI etc, we use the native operations as they are already referenced by different indirections like genapic, irq_chip etc. 64bit Paravirt ops can also define their apic operations accordingly. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 ++++++ arch/x86/kernel/apic_64.c | 34 ++++++++++++++++++++++++++++++++-- arch/x86/kernel/io_apic_64.c | 8 ++++---- arch/x86/kernel/paravirt.c | 8 +++++--- arch/x86/kernel/smpboot.c | 28 +++++++++++----------------- 5 files changed, 58 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3e58b676d23b..2a83c07bd887 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -145,6 +145,12 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +void apic_icr_write(u32 low, u32 id) +{ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write_around(APIC_ICR, low); +} + void apic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 3963f590c3d4..9bb040689b31 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -119,13 +119,13 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } -void apic_wait_icr_idle(void) +void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -u32 safe_apic_wait_icr_idle(void) +u32 safe_xapic_wait_icr_idle(void) { u32 send_status; int timeout; @@ -141,6 +141,36 @@ u32 safe_apic_wait_icr_idle(void) return send_status; } +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, id << 24); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return (icr1 | ((u64)icr2 << 32)); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .write_atomic = native_apic_mem_write_atomic, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; + +EXPORT_SYMBOL_GPL(apic_ops); + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 84dd63c13d63..b62d42ef9283 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1157,6 +1157,7 @@ static __apicdebuginit void print_APIC_bitfield (int base) void __apicdebuginit print_local_APIC(void * dummy) { unsigned int v, ver, maxlvt; + unsigned long icr; if (apic_verbosity == APIC_QUIET) return; @@ -1200,10 +1201,9 @@ void __apicdebuginit print_local_APIC(void * dummy) v = apic_read(APIC_ESR); printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..b80105a0f474 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -360,9 +360,11 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC - .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, - .apic_read = native_apic_read, +#ifnded CONFIG_X86_64 + .apic_write = native_apic_mem_write, + .apic_write_atomic = native_apic_mem_write_atomic, + .apic_read = native_apic_mem_read, +#endif .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f35c2d8016ac..c55263b3df02 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -123,7 +123,6 @@ EXPORT_PER_CPU_SYMBOL(cpu_info); static atomic_t init_deasserted; -static int boot_cpu_logical_apicid; /* representing cpus for which sibling maps can be computed */ static cpumask_t cpu_sibling_setup_map; @@ -165,6 +164,8 @@ static void unmap_cpu_to_node(int cpu) #endif #ifdef CONFIG_X86_32 +static int boot_cpu_logical_apicid; + u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; @@ -546,8 +547,7 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_icr_write(APIC_DM_REMRD | regs[i], apicid); timeout = 0; do { @@ -579,11 +579,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_icr_write(APIC_DM_NMI | APIC_DEST_LOGICAL, logical_apicid); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -639,13 +637,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT, + phys_apicid); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -655,10 +651,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_icr_write(APIC_INT_LEVELTRIG | APIC_DM_INIT, phys_apicid); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -703,12 +697,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_icr_write(APIC_DM_STARTUP | (start_eip >> 12), + phys_apicid); /* * Give the other CPU some time to accept the IPI. @@ -1147,7 +1139,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) * Setup boot CPU information */ smp_store_cpu_info(0); /* Final full version of the data */ +#ifdef CONFIG_X86_32 boot_cpu_logical_apicid = logical_smp_processor_id(); +#endif current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); -- cgit v1.2.2 From 32e1d0a0651004f5fe47f85a2a5c725ad579a90c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:50 -0700 Subject: x64, x2apic/intr-remap: cpuid bits for x2apic feature cpuid feature for x2apic. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/feature_names.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cba..0bf4d37a0483 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -45,7 +45,7 @@ const char * const x86_cap_flags[NCAPINTS*32] = { /* Intel-defined (#2) */ "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", + NULL, NULL, "dca", "sse4_1", "sse4_2", "x2apic", NULL, "popcnt", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ -- cgit v1.2.2 From 13c88fb58d0112d47f7839f24a755715c6218822 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:52 -0700 Subject: x64, x2apic/intr-remap: x2apic ops for x2apic mode support x2apic ops for x2apic mode support. This uses MSR interface and differs slightly from the xapic register layout. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 9bb040689b31..a969ef78e12a 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -171,6 +171,41 @@ struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .write_atomic = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ -- cgit v1.2.2 From cff73a6ffaed726780b001937d2a42efde553922 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:53 -0700 Subject: x64, x2apic/intr-remap: introcude self IPI to genapic routines Introduce self IPI op for genapic. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_64.c | 2 +- arch/x86/kernel/genapic_flat_64.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 7414871751a7..6657de609dcb 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -61,7 +61,7 @@ void __init setup_apic_routing(void) /* Same for both flat and physical. */ -void send_IPI_self(int vector) +void apic_send_IPI_self(int vector) { __send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); } diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 400ed8df8b4e..735586822135 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -131,6 +131,7 @@ struct genapic apic_flat = { .send_IPI_all = flat_send_IPI_all, .send_IPI_allbutself = flat_send_IPI_allbutself, .send_IPI_mask = flat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, .read_apic_id = read_xapic_id, @@ -196,6 +197,7 @@ struct genapic apic_physflat = { .send_IPI_all = physflat_send_IPI_all, .send_IPI_allbutself = physflat_send_IPI_allbutself, .send_IPI_mask = physflat_send_IPI_mask, + .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, .read_apic_id = read_xapic_id, -- cgit v1.2.2 From 12a67cf6851871ca8df42025c94f140c303d0f7f Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:54 -0700 Subject: x64, x2apic/intr-remap: x2apic cluster mode support x2apic cluster mode support. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/genapic_64.c | 2 + arch/x86/kernel/genx2apic_cluster.c | 135 ++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+) create mode 100644 arch/x86/kernel/genx2apic_cluster.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 55ff016e9f69..bde3e9b6fec9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -95,6 +95,7 @@ obj-$(CONFIG_OLPC) += olpc.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += genx2apic_cluster.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 6657de609dcb..1029e178cdf7 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -38,6 +38,8 @@ void __init setup_apic_routing(void) { if (uv_system_type == UV_NON_UNIQUE_APIC) genapic = &apic_x2apic_uv_x; + else if (cpu_has_x2apic && intr_remapping_enabled) + genapic = &apic_x2apic_cluster; else #ifdef CONFIG_ACPI /* diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c new file mode 100644 index 000000000000..ed0fdede800a --- /dev/null +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -0,0 +1,135 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +/* + * for now each logical cpu is in its own vector allocation domain. + */ +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +/* + * for now, we send the IPI's one by one in the cpumask. + * TBD: Based on the cpu mask, we can send the IPI's to the cluster group + * at once. We have 16 cpu's in a cluster. This will minimize IPI register + * writes. + */ +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_logical_apicid, query_cpu), + vector, APIC_DEST_LOGICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_logical_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int x2apic_read_id(void) +{ + return apic_read(APIC_ID); +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return x2apic_read_id() >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +static void init_x2apic_ldr(void) +{ + int cpu = smp_processor_id(); + + per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + return; +} + +struct genapic apic_x2apic_cluster = { + .name = "cluster x2apic", + .int_delivery_mode = dest_LowestPrio, + .int_dest_mode = (APIC_DEST_LOGICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .read_apic_id = x2apic_read_id, +}; -- cgit v1.2.2 From 5c520a6724e912a7e6153b7597192edad6752750 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:55 -0700 Subject: x64, x2apic/intr-remap: setup init_apic_ldr for UV Signed-off-by: Suresh Siddha Signed-off-by: Jack Steiner Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 1ef99be18488..dcd4fb219daa 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -120,6 +120,10 @@ static int uv_apic_id_registered(void) return 1; } +static inline void uv_init_apic_ldr(void) +{ +} + static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) { int cpu; @@ -164,6 +168,7 @@ struct genapic apic_x2apic_uv_x = { .target_cpus = uv_target_cpus, .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ .apic_id_registered = uv_apic_id_registered, + .init_apic_ldr = uv_init_apic_ldr, .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, -- cgit v1.2.2 From 89027d35aa5b8f45ce0f7fa0911db85b46563da0 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:56 -0700 Subject: x64, x2apic/intr-remap: IO-APIC support for interrupt-remapping IO-APIC support in the presence of interrupt-remapping infrastructure. IO-APIC RTE will be programmed with interrupt-remapping table entry(IRTE) index and the IRTE will contain information about the vector, cpu destination, trigger mode etc, which traditionally was present in the IO-APIC RTE. Introduce a new irq_chip for cleaner irq migration (in the process context as opposed to the current irq migration in the context of an interrupt. interrupt-remapping infrastructure will help us achieve this cleanly). For edge triggered, irq migration is a simple atomic update(of vector and cpu destination) of IRTE and flush the hardware cache. For level triggered, we need to modify the io-apic RTE aswell with the update vector information, along with modifying IRTE with vector and cpu destination. So irq migration for level triggered is little bit more complex compared to edge triggered migration. But the good news is, we use the same algorithm for level triggered migration as we have today, only difference being, we now initiate the irq migration from process context instead of the interrupt context. In future, when we do a directed EOI (combined with cpu EOI broadcast suppression) to the IO-APIC, level triggered irq migration will also be as simple as edge triggered migration and we can do the irq migration with a simple atomic update to IO-APIC RTE. TBD: some tests/changes needed in the presence of fixup_irqs() for level triggered irq migration. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 1 + arch/x86/kernel/io_apic_64.c | 300 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 279 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index a969ef78e12a..d5c06917b5b1 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -46,6 +46,7 @@ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; +int x2apic; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b62d42ef9283..9bd02ef049a0 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -37,6 +37,7 @@ #include #endif #include +#include #include #include @@ -48,6 +49,7 @@ #include #include #include +#include #include #include @@ -312,7 +314,12 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) pin = entry->pin; if (pin == -1) break; - io_apic_write(apic, 0x11 + pin*2, dest); + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -906,18 +913,98 @@ void setup_vector_irq(int cpu) static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif static void ioapic_register_intr(int irq, unsigned long trigger) { - if (trigger) { + if (trigger) irq_desc[irq].status |= IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); - } else { + else irq_desc[irq].status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if (trigger) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); +} + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, @@ -942,24 +1029,15 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, irq, trigger, polarity); - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest = cpu_mask_to_apicid(mask); - entry.mask = 0; /* enable IRQ */ - entry.trigger = trigger; - entry.polarity = polarity; - entry.vector = cfg->vector; - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry.mask = 1; + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } ioapic_register_intr(irq, trigger); if (irq < 16) @@ -1011,6 +1089,9 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; + if (intr_remapping_enabled) + return; + memset(&entry, 0, sizeof(entry)); /* @@ -1466,6 +1547,147 @@ static int ioapic_retrigger_irq(unsigned int irq) */ #ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + struct irq_desc *desc = irq_desc + irq; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte = desc->status & IRQ_LEVEL; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + irq_desc[irq].affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + + ret = 0; + irq_desc[irq].status &= ~IRQ_MOVE_PENDING; + cpus_clear(irq_desc[irq].pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + int irq; + + for (irq = 0; irq < NR_IRQS; irq++) { + struct irq_desc *desc = irq_desc + irq; + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, + irq_desc[irq].pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + if (irq_desc[irq].status & IRQ_LEVEL) { + irq_desc[irq].status |= IRQ_MOVE_PENDING; + irq_desc[irq].pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; @@ -1522,6 +1744,17 @@ static void irq_complete_move(unsigned int irq) #else static inline void irq_complete_move(unsigned int irq) {} #endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif static void ack_apic_edge(unsigned int irq) { @@ -1596,6 +1829,21 @@ static struct irq_chip ioapic_chip __read_mostly = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif + static inline void init_IO_APIC_traps(void) { int irq; @@ -1783,6 +2031,8 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -1809,6 +2059,8 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " @@ -2401,6 +2653,10 @@ void __init setup_ioapic_dest(void) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif else set_ioapic_affinity_irq(irq, TARGET_CPUS); } -- cgit v1.2.2 From 75c46fa61bc5b4ccd20a168ff325c58771248fcd Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:57 -0700 Subject: x64, x2apic/intr-remap: MSI and MSI-X support for interrupt remapping infrastructure MSI and MSI-X support for interrupt remapping infrastructure. MSI address register will be programmed with interrupt-remapping table entry(IRTE) index and the IRTE will contain information about the vector, cpu destination, etc. For MSI-X, all the IRTE's will be consecutively allocated in the table, and the address registers will contain the starting index to the block and the data register will contain the subindex with in that block. This also introduces a new irq_chip for cleaner irq migration (in the process context as opposed to the current irq migration in the context of an interrupt. interrupt-remapping infrastructure will help us achieve this). As MSI is edge triggered, irq migration is a simple atomic update(of vector and cpu destination) of IRTE and flushing the hardware cache. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 230 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 222 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 9bd02ef049a0..877aa2e9d7e8 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -2297,6 +2297,9 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); @@ -2315,10 +2318,41 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms tmp = TARGET_CPUS; err = assign_irq_vector(irq, tmp); - if (!err) { - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); + if (err) + return err; + + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { msg->address_hi = MSI_ADDR_BASE_HI; msg->address_lo = MSI_ADDR_BASE_LO | @@ -2369,6 +2403,55 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) write_msi_msg(irq, &msg); irq_desc[irq].affinity = mask; } + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg = irq_cfg + irq; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + irq_desc[irq].affinity = mask; +} +#endif #endif /* CONFIG_SMP */ /* @@ -2386,26 +2469,157 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) { + int ret; struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_desc + irq; + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ int irq, ret; + irq = create_irq(); if (irq < 0) return irq; - ret = msi_compose_msg(dev, irq, &msg); +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; } + return 0; - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + int irq, ret, sub_handle; + struct msi_desc *desc; +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq(); + if (irq < 0) + return irq; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } return 0; + +error: + destroy_irq(irq); + return ret; } void arch_teardown_msi_irq(unsigned int irq) -- cgit v1.2.2 From 6e1cb38a2aef7680975e71f23de187859ee8b158 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:58 -0700 Subject: x64, x2apic/intr-remap: add x2apic support, including enabling interrupt-remapping x2apic support. Interrupt-remapping must be enabled before enabling x2apic, this is needed to ensure that IO interrupts continue to work properly after the cpu mode is changed to x2apic(which uses 32bit extended physical/cluster apic id). On systems where apicid's are > 255, BIOS can handover the control to OS in x2apic mode. Or if the OS handover was in legacy xapic mode, check if it is capable of x2apic mode. And if we succeed in enabling Interrupt-remapping, then we can enable x2apic mode in the CPU. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 + arch/x86/kernel/apic_64.c | 154 ++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/common_64.c | 2 + arch/x86/kernel/genapic_64.c | 1 + arch/x86/kernel/mpparse.c | 2 + arch/x86/kernel/setup.c | 2 + arch/x86/kernel/smpboot.c | 5 ++ 7 files changed, 164 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 785700a08e9d..8705262ddcda 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1337,7 +1337,9 @@ static void __init acpi_process_madt(void) acpi_ioapic = 1; smp_found_config = 1; +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif } } if (error == -EINVAL) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d5c06917b5b1..dd0501039f09 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ #include #include #include +#include #include #include @@ -46,8 +48,12 @@ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; +int disable_x2apic; int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; + /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -896,6 +902,125 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + save_mask_IO_APIC_setup(); + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } +end: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} + /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. @@ -943,6 +1068,11 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { + if (x2apic) { + boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + return; + } + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -981,6 +1111,9 @@ int __init APIC_init_uniprocessor(void) return -1; } + enable_IR_x2apic(); + setup_apic_routing(); + verify_local_APIC(); connect_bsp_APIC(); @@ -1238,10 +1371,14 @@ static int lapic_resume(struct sys_device *dev) maxlvt = lapic_get_maxlvt(); local_irq_save(flags); - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); + if (!x2apic) { + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } else + enable_x2apic(); + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); apic_write(APIC_DFR, apic_pm_state.apic_dfr); @@ -1381,6 +1518,15 @@ __cpuinit int apic_is_clustered_box(void) return (clusters > 2); } +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); + + /* * APIC command line parameters */ diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 36537ab9e56a..e7bf3c2dc5fe 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -606,6 +606,8 @@ void __cpuinit cpu_init(void) barrier(); check_efer(); + if (cpu != 0 && x2apic) + enable_x2apic(); /* * set up and load the per-CPU TSS diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 1029e178cdf7..792e21ba1a81 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3b25e49380c6..70e1f3e287fb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -545,7 +545,9 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) generic_bigsmp_probe(); #endif +#ifdef CONFIG_X86_32 setup_apic_routing(); +#endif if (!num_processors) printk(KERN_ERR "MPTABLE: no processors registered!\n"); return num_processors; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 987b6fde3a99..2e78a143dec3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -730,6 +730,8 @@ void __init setup_arch(char **cmdline_p) num_physpages = max_pfn; check_efer(); + if (cpu_has_x2apic) + check_x2apic(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c55263b3df02..0c43e1f2e7d3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1145,6 +1145,11 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) current_thread_info()->cpu = 0; /* needed? */ set_cpu_sibling_map(0); +#ifdef CONFIG_X86_64 + enable_IR_x2apic(); + setup_apic_routing(); +#endif + if (smp_sanity_check(max_cpus) < 0) { printk(KERN_INFO "SMP disabled\n"); disable_smp(); -- cgit v1.2.2 From 2d9579a124d746a3e0e0ba45e57d80800ee80807 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:16:59 -0700 Subject: x64, x2apic/intr-remap: support for x2apic physical mode support x2apic Physical mode support. By default we will use x2apic cluster mode. x2apic physical mode can be selected using "x2apic_phys" boot parameter. Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/genapic_64.c | 18 +++++- arch/x86/kernel/genx2apic_phys.c | 122 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 138 insertions(+), 3 deletions(-) create mode 100644 arch/x86/kernel/genx2apic_phys.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index bde3e9b6fec9..81280e93e792 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_OLPC) += olpc.o ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o obj-y += genx2apic_cluster.o + obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 792e21ba1a81..3940d8161f8b 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -30,6 +30,15 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); struct genapic __read_mostly *genapic = &apic_flat; +static int x2apic_phys = 0; + +static int set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + static enum uv_system_type uv_system_type; /* @@ -39,9 +48,12 @@ void __init setup_apic_routing(void) { if (uv_system_type == UV_NON_UNIQUE_APIC) genapic = &apic_x2apic_uv_x; - else if (cpu_has_x2apic && intr_remapping_enabled) - genapic = &apic_x2apic_cluster; - else + else if (cpu_has_x2apic && intr_remapping_enabled) { + if (x2apic_phys) + genapic = &apic_x2apic_phys; + else + genapic = &apic_x2apic_cluster; + } else #ifdef CONFIG_ACPI /* * Quirk: some x86_64 machines can only use physical APIC mode diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c new file mode 100644 index 000000000000..3c70b9d692b2 --- /dev/null +++ b/arch/x86/kernel/genx2apic_phys.c @@ -0,0 +1,122 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ + +static cpumask_t x2apic_target_cpus(void) +{ + return cpumask_of_cpu(0); +} + +static cpumask_t x2apic_vector_allocation_domain(int cpu) +{ + cpumask_t domain = CPU_MASK_NONE; + cpu_set(cpu, domain); + return domain; +} + +static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, + unsigned int dest) +{ + unsigned long cfg; + + cfg = __prepare_ICR(0, vector, dest); + + /* + * send the IPI. + */ + x2apic_icr_write(cfg, apicid); +} + +static void x2apic_send_IPI_mask(cpumask_t mask, int vector) +{ + unsigned long flags; + unsigned long query_cpu; + + local_irq_save(flags); + for_each_cpu_mask(query_cpu, mask) { + __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), + vector, APIC_DEST_PHYSICAL); + } + local_irq_restore(flags); +} + +static void x2apic_send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + x2apic_send_IPI_mask(mask, vector); +} + +static void x2apic_send_IPI_all(int vector) +{ + x2apic_send_IPI_mask(cpu_online_map, vector); +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = first_cpu(cpumask); + if ((unsigned)cpu < NR_CPUS) + return per_cpu(x86_cpu_to_apicid, cpu); + else + return BAD_APICID; +} + +static unsigned int x2apic_read_id(void) +{ + return apic_read(APIC_ID); +} + +static unsigned int phys_pkg_id(int index_msb) +{ + return x2apic_read_id() >> index_msb; +} + +void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +void init_x2apic_ldr(void) +{ + return; +} + +struct genapic apic_x2apic_phys = { + .name = "physical x2apic", + .int_delivery_mode = dest_Fixed, + .int_dest_mode = (APIC_DEST_PHYSICAL != 0), + .target_cpus = x2apic_target_cpus, + .vector_allocation_domain = x2apic_vector_allocation_domain, + .apic_id_registered = x2apic_apic_id_registered, + .init_apic_ldr = init_x2apic_ldr, + .send_IPI_all = x2apic_send_IPI_all, + .send_IPI_allbutself = x2apic_send_IPI_allbutself, + .send_IPI_mask = x2apic_send_IPI_mask, + .send_IPI_self = x2apic_send_IPI_self, + .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, + .phys_pkg_id = phys_pkg_id, + .read_apic_id = x2apic_read_id, +}; -- cgit v1.2.2 From 9fa8c481b55e80edd8c637573f87853bb6b600f5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 11:17:00 -0700 Subject: x64, x2apic/intr-remap: introduce CONFIG_INTR_REMAP Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Cc: andi@firstfloor.org Cc: ebiederm@xmission.com Cc: jbarnes@virtuousgeek.org Cc: steiner@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2cfccc987a26..0bf8391a7f2d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1651,6 +1651,14 @@ config DMAR_FLOPPY_WA workaround will setup a 1:1 mapping for the first 16M to make floppy (an ISA device) work. +config INTR_REMAP + bool "Support for Interrupt Remapping (EXPERIMENTAL)" + depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL + help + Supports Interrupt remapping for IO-APIC and MSI devices. + To use x2apic mode in the CPU's which support x2APIC enhancements or + to support platforms with CPU's having > 8 bit APIC ID, say Y. + source "drivers/pci/pcie/Kconfig" source "drivers/pci/Kconfig" -- cgit v1.2.2 From 372e92d8b3e433888bf76c36f1c7e1405eae1584 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 10 Jul 2008 14:56:18 -0700 Subject: x64, x2apic/intr-remap: Interrupt-remapping and x2apic support On Thu, Jul 10, 2008 at 12:53:20PM -0700, Ingo Molnar wrote: > > Btw., i threw it at the -tip test-cluster and got back a quick build > bugreport: > > arch/x86/xen/enlighten.c: In function 'xen_patch': > arch/x86/xen/enlighten.c:1084: warning: label 'patch_site' defined but not used > arch/x86/xen/enlighten.c: At top level: > arch/x86/xen/enlighten.c:1272: error: expected identifier before '(' token > arch/x86/xen/enlighten.c:1273: error: expected '}' before '.' token > arch/x86/kernel/paravirt.c:376:2: error: invalid preprocessing directive > #ifndedarch/x86/kernel/paravirt.c:384:2: error: #endif without #if > > with this config: > > http://redhat.com/~mingo/misc/config-Thu_Jul_10_21_43_28_CEST_2008.bad fix the typo. Signed-off-by: Suresh Siddha Cc: "Siddha Cc: Suresh B" Cc: "akpm@linux-foundation.org" Cc: "arjan@linux.intel.com" Cc: "andi@firstfloor.org" Cc: "ebiederm@xmission.com" Cc: "jbarnes@virtuousgeek.org" Cc: "steiner@sgi.com" Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index b80105a0f474..4f29ff847ebe 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -360,7 +360,7 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC -#ifnded CONFIG_X86_64 +#ifndef CONFIG_X86_64 .apic_write = native_apic_mem_write, .apic_write_atomic = native_apic_mem_write_atomic, .apic_read = native_apic_mem_read, -- cgit v1.2.2 From 277d1f5846d84e16760131a93b7a67ebfa8eded4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 13:11:55 -0700 Subject: x2apic: uninline uv_init_apic_ldr() Andrew says: > There's no point in declaring it inline if it's always called indirectly. And point taken! Signed-off-by: Suresh Siddha Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index dcd4fb219daa..c915f750241e 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -120,7 +120,7 @@ static int uv_apic_id_registered(void) return 1; } -static inline void uv_init_apic_ldr(void) +static void uv_init_apic_ldr(void) { } -- cgit v1.2.2 From ad66dd340f561bdde2285992314d9e4fd9b6191e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 11 Jul 2008 13:11:56 -0700 Subject: x2apic: xen64 paravirt basic apic ops Define the Xen specific basic apic ops, in additon to paravirt apic ops, with some misc warning fixes. Signed-off-by: Suresh Siddha Cc: Jeremy Fitzhardinge Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 4 ++-- arch/x86/xen/enlighten.c | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 50dad44fb542..0c45df20e2b3 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -783,11 +783,11 @@ static void lguest_wbinvd(void) * code qualifies for Advanced. It will also never interrupt anything. It * does, however, allow us to get through the Linux boot code. */ #ifdef CONFIG_X86_LOCAL_APIC -static void lguest_apic_write(unsigned long reg, u32 v) +static void lguest_apic_write(u32 reg, u32 v) { } -static u32 lguest_apic_read(unsigned long reg) +static u32 lguest_apic_read(u32 reg) { return 0; } diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dcd4e51f2f16..54e255667530 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -548,16 +548,45 @@ static void xen_io_delay(void) } #ifdef CONFIG_X86_LOCAL_APIC -static u32 xen_apic_read(unsigned long reg) +static u32 xen_apic_read(u32 reg) { return 0; } -static void xen_apic_write(unsigned long reg, u32 val) +static void xen_apic_write(u32 reg, u32 val) { /* Warn to see if there's any stray references */ WARN_ON(1); } + +#ifdef CONFIG_X86_64 +static u64 xen_apic_icr_read(void) +{ + return 0; +} + +static void xen_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void xen_apic_wait_icr_idle(void) +{ + return; +} + +static struct apic_ops xen_basic_apic_ops = { + .read = xen_apic_read, + .write = xen_apic_write, + .write_atomic = xen_apic_write, + .icr_read = xen_apic_icr_read, + .icr_write = xen_apic_icr_write, + .wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_apic_wait_icr_idle, +}; +#endif + #endif static void xen_flush_tlb(void) @@ -1130,9 +1159,11 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC +#ifndef CONFIG_X86_64 .apic_write = xen_apic_write, .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, +#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1291,6 +1322,12 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; +#ifdef CONFIG_X86_64 + /* + * for 64bit, set up the basic apic ops aswell. + */ + apic_ops = &xen_basic_apic_ops; +#endif if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; -- cgit v1.2.2 From c535b6a1a685eb23f96e2c221777d6c1e05080d5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 11 Jul 2008 18:41:54 -0700 Subject: x86: let 32bit use apic_ops too Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 39 +++++++++++++++++++++++++++++++-------- 1 file changed, 31 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 2a83c07bd887..7413354c9ffd 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -145,19 +145,13 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } -void apic_icr_write(u32 low, u32 id) -{ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write_around(APIC_ICR, low); -} - -void apic_wait_icr_idle(void) +void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) cpu_relax(); } -u32 safe_apic_wait_icr_idle(void) +u32 safe_xapic_wait_icr_idle(void) { u32 send_status; int timeout; @@ -173,6 +167,35 @@ u32 safe_apic_wait_icr_idle(void) return send_status; } +void xapic_icr_write(u32 low, u32 id) +{ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write_around(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .write_atomic = native_apic_mem_write_atomic, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ -- cgit v1.2.2 From 4696ca5bfd2697f5686f96d59cf0b6de14869b4e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 11 Jul 2008 18:43:10 -0700 Subject: x86: mach_apicdef.h need to include before smp.h smp.h internal has include, so need to include that at first when genericarch use them need to have different apicdef.h Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mach-generic/bigsmp.c | 5 ++--- arch/x86/mach-generic/es7000.c | 3 +-- arch/x86/mach-generic/numaq.c | 4 ++-- arch/x86/mach-generic/summit.c | 5 ++--- 4 files changed, 7 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index 59d771714559..b31f2800638e 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -5,17 +5,16 @@ #define APIC_DEFINITION 1 #include #include -#include #include #include #include #include #include -#include #include #include -#include #include +#include +#include #include #include diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 4742626f08c4..9b30547d746e 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -4,16 +4,15 @@ #define APIC_DEFINITION 1 #include #include -#include #include #include #include #include #include #include -#include #include #include +#include #include #include #include diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8091e68764c4..95c07efff6b7 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -4,7 +4,6 @@ #define APIC_DEFINITION 1 #include #include -#include #include #include #include @@ -12,8 +11,9 @@ #include #include #include -#include #include +#include +#include #include #include #include diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index a97ea0f35b1e..752edd96b1bf 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -4,17 +4,16 @@ #define APIC_DEFINITION 1 #include #include -#include #include #include #include #include #include #include -#include #include -#include #include +#include +#include #include #include -- cgit v1.2.2 From 4c9961d56ec20c27ec5d02e49fd7427748312741 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 11 Jul 2008 18:44:16 -0700 Subject: x86: make read_apic_id return final apicid also remove GET_APIC_ID when read_apic_id is used. need to apply after [PATCH] x86: mach_apicdef.h need to include before smp.h Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/apic_32.c | 4 ++-- arch/x86/kernel/apic_64.c | 6 +++--- arch/x86/kernel/genapic_flat_64.c | 2 +- arch/x86/kernel/io_apic_32.c | 5 ++--- arch/x86/kernel/io_apic_64.c | 4 ++-- arch/x86/kernel/smpboot.c | 6 +++--- 7 files changed, 14 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8705262ddcda..b314bcd08406 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -761,7 +761,7 @@ static void __init acpi_register_lapic_address(unsigned long address) set_fixmap_nocache(FIX_APIC_BASE, address); if (boot_cpu_physical_apicid == -1U) { - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); #ifdef CONFIG_X86_32 apic_version[boot_cpu_physical_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 7413354c9ffd..47ff978aecfd 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1230,7 +1230,7 @@ void __init init_apic_mappings(void) * default configuration (or the MP table is broken). */ if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } @@ -1270,7 +1270,7 @@ int __init APIC_init_uniprocessor(void) * might be zero if read from MP tables. Get it from LAPIC. */ #ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index dd0501039f09..c75f58a66d8e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1060,7 +1060,7 @@ void __init early_init_lapic_mapping(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } /** @@ -1069,7 +1069,7 @@ void __init early_init_lapic_mapping(void) void __init init_apic_mappings(void) { if (x2apic) { - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); return; } @@ -1092,7 +1092,7 @@ void __init init_apic_mappings(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = GET_APIC_ID(read_apic_id()); + boot_cpu_physical_apicid = read_apic_id(); } /* diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 735586822135..7dac2f275fad 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -101,7 +101,7 @@ static unsigned int read_xapic_id(void) { unsigned int id; - id = GET_XAPIC_ID(apic_read(APIC_ID)); + id = GET_APIC_ID(apic_read(APIC_ID)); return id; } diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c50adb84ea6f..382208d11f8d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1501,7 +1501,7 @@ void /*__init*/ print_local_APIC(void *dummy) smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(read_apic_id())); + GET_APIC_ID(v)); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1709,8 +1709,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = - GET_APIC_ID(read_apic_id()); + entry.dest.physical.physical_dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 877aa2e9d7e8..2db0f98e2af5 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1246,7 +1246,7 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, GET_APIC_ID(read_apic_id())); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1439,7 +1439,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = GET_APIC_ID(read_apic_id()); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0c43e1f2e7d3..6cd002f3e20e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -211,7 +211,7 @@ static void __cpuinit smp_callin(void) /* * (This works even if the APIC is not enabled.) */ - phys_id = GET_APIC_ID(read_apic_id()); + phys_id = read_apic_id(); cpuid = smp_processor_id(); if (cpu_isset(cpuid, cpu_callin_map)) { panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, @@ -1157,9 +1157,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) } preempt_disable(); - if (GET_APIC_ID(read_apic_id()) != boot_cpu_physical_apicid) { + if (read_apic_id() != boot_cpu_physical_apicid) { panic("Boot APIC ID in local APIC unexpected (%d vs %d)", - GET_APIC_ID(read_apic_id()), boot_cpu_physical_apicid); + read_apic_id(), boot_cpu_physical_apicid); /* Or can we switch back to PIC here? */ } preempt_enable(); -- cgit v1.2.2 From f910a9dc7c865896815e2a95fe33363e9522f277 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 12 Jul 2008 01:01:20 -0700 Subject: x86: make 64bit have get_apic_id generalize the x2apic code some more. let read_apic_id become a macro (later on a function/inline) GET_APIC_ID(apic_read(APIC_ID)) +#define read_apic_id() (GET_APIC_ID(apic_read(APIC_ID))) instead of this weird construct: -#define read_apic_id (genapic->read_apic_id) Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 26 +++++++++++++++++++++++--- arch/x86/kernel/genx2apic_cluster.c | 20 +++++++++++++++++++- arch/x86/kernel/genx2apic_phys.c | 20 +++++++++++++++++++- arch/x86/kernel/genx2apic_uv_x.c | 23 ++++++++++++++++++++--- 4 files changed, 81 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 7dac2f275fad..2c973cbf054f 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -97,11 +97,27 @@ static void flat_send_IPI_all(int vector) __send_IPI_shortcut(APIC_DEST_ALLINC, vector, APIC_DEST_LOGICAL); } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = (((x)>>24) & 0xFFu); + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xFFu)<<24); + return x; +} + static unsigned int read_xapic_id(void) { unsigned int id; - id = GET_APIC_ID(apic_read(APIC_ID)); + id = get_apic_id(apic_read(APIC_ID)); return id; } @@ -134,7 +150,9 @@ struct genapic apic_flat = { .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = flat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, - .read_apic_id = read_xapic_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; /* @@ -200,5 +218,7 @@ struct genapic apic_physflat = { .send_IPI_self = apic_send_IPI_self, .cpu_mask_to_apicid = physflat_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, - .read_apic_id = read_xapic_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFu<<24), }; diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index ed0fdede800a..40bc0140d89f 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -94,6 +94,22 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) return BAD_APICID; } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + static unsigned int x2apic_read_id(void) { return apic_read(APIC_ID); @@ -131,5 +147,7 @@ struct genapic apic_x2apic_cluster = { .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, - .read_apic_id = x2apic_read_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), }; diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 3c70b9d692b2..2f3c6ca19de9 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -84,6 +84,22 @@ static unsigned int x2apic_cpu_mask_to_apicid(cpumask_t cpumask) return BAD_APICID; } +static unsigned int get_apic_id(unsigned long x) +{ + unsigned int id; + + id = x; + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = id; + return x; +} + static unsigned int x2apic_read_id(void) { return apic_read(APIC_ID); @@ -118,5 +134,7 @@ struct genapic apic_x2apic_phys = { .send_IPI_self = x2apic_send_IPI_self, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, - .read_apic_id = x2apic_read_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), }; diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index c915f750241e..3ca29cd8c23c 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -139,16 +139,31 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) return BAD_APICID; } -static unsigned int uv_read_apic_id(void) +static unsigned int get_apic_id(unsigned long x) { unsigned int id; WARN_ON(preemptible() && num_online_cpus() > 1); - id = apic_read(APIC_ID) | __get_cpu_var(x2apic_extra_bits); + id = x | __get_cpu_var(x2apic_extra_bits); return id; } +static long set_apic_id(unsigned int id) +{ + unsigned long x; + + /* maskout x2apic_extra_bits ? */ + x = id; + return x; +} + +static unsigned int uv_read_apic_id(void) +{ + + return get_apic_id(apic_read(APIC_ID)); +} + static unsigned int phys_pkg_id(int index_msb) { return uv_read_apic_id() >> index_msb; @@ -175,7 +190,9 @@ struct genapic apic_x2apic_uv_x = { /* ZZZ.send_IPI_self = uv_send_IPI_self, */ .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ - .read_apic_id = uv_read_apic_id, + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = (0xFFFFFFFFu), }; static __cpuinit void set_x2apic_extra_bits(int pnode) -- cgit v1.2.2 From 94a8c3c2437c8946f1b6c8e0b2c560a7db8ed3c6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 13 Jul 2008 22:19:35 -0700 Subject: x86: let 32bit use apic_ops too - fix fix for pv - clean up the namespace there too. Signed-off-by: Yinghai Lu Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 5 ----- arch/x86/kernel/vmi_32.c | 51 +++++++++++++++++++++++++++++++++++++++++++--- arch/x86/xen/enlighten.c | 19 ++++++++--------- 3 files changed, 57 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 4f29ff847ebe..e0f139106c7e 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -360,11 +360,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC -#ifndef CONFIG_X86_64 - .apic_write = native_apic_mem_write, - .apic_write_atomic = native_apic_mem_write_atomic, - .apic_read = native_apic_mem_read, -#endif .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, .startup_ipi_hook = paravirt_nop, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7b..cf3074354553 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -676,6 +676,50 @@ static inline int __init probe_vmi_rom(void) return 0; } +#ifdef CONFIG_X86_LOCAL_APIC +static u32 vmi_apic_read(u32 reg) +{ + return 0; +} + +static void vmi_apic_write(u32 reg, u32 val) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static u64 vmi_apic_icr_read(void) +{ + return 0; +} + +static void vmi_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void vmi_apic_wait_icr_idle(void) +{ + return; +} + +static u32 vmi_safe_apic_wait_icr_idle(void) +{ + return 0; +} + +static struct apic_ops vmi_basic_apic_ops = { + .read = vmi_apic_read, + .write = vmi_apic_write, + .write_atomic = vmi_apic_write, + .icr_read = vmi_apic_icr_read, + .icr_write = vmi_apic_icr_write, + .wait_icr_idle = vmi_apic_wait_icr_idle, + .safe_wait_icr_idle = vmi_safe_apic_wait_icr_idle, +}; +#endif + /* * VMI setup common to all processors */ @@ -904,9 +948,10 @@ static inline int __init activate_vmi(void) #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(pv_apic_ops.apic_read, APICRead); - para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); + para_fill(vmi_basic_apic_ops.read, APICRead); + para_fill(vmi_basic_apic_ops.write, APICWrite); + para_fill(vmi_basic_apic_ops.write_atomic, APICWrite); + apic_ops = &vmi_basic_apic_ops; #endif /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 54e255667530..d11dda7ebd7a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -559,7 +559,6 @@ static void xen_apic_write(u32 reg, u32 val) WARN_ON(1); } -#ifdef CONFIG_X86_64 static u64 xen_apic_icr_read(void) { return 0; @@ -576,6 +575,11 @@ static void xen_apic_wait_icr_idle(void) return; } +static u32 xen_safe_apic_wait_icr_idle(void) +{ + return 0; +} + static struct apic_ops xen_basic_apic_ops = { .read = xen_apic_read, .write = xen_apic_write, @@ -583,9 +587,8 @@ static struct apic_ops xen_basic_apic_ops = { .icr_read = xen_apic_icr_read, .icr_write = xen_apic_icr_write, .wait_icr_idle = xen_apic_wait_icr_idle, - .safe_wait_icr_idle = xen_apic_wait_icr_idle, + .safe_wait_icr_idle = xen_safe_apic_wait_icr_idle, }; -#endif #endif @@ -1159,11 +1162,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC -#ifndef CONFIG_X86_64 - .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, - .apic_read = xen_apic_read, -#endif .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, .startup_ipi_hook = paravirt_nop, @@ -1322,9 +1320,10 @@ asmlinkage void __init xen_start_kernel(void) pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; -#ifdef CONFIG_X86_64 + +#ifdef CONFIG_X86_LOCAL_APIC /* - * for 64bit, set up the basic apic ops aswell. + * set up the basic apic ops. */ apic_ops = &xen_basic_apic_ops; #endif -- cgit v1.2.2 From d54191b85e294c46f05a2249b1f55ae54930bcc7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 22 Apr 2008 15:09:30 +0200 Subject: Kprobe smoke test lockdep warning On Mon, 2008-04-21 at 18:54 -0400, Masami Hiramatsu wrote: > Thank you for reporting. > > Actually, kprobes tries to fixup thread's flags in post_kprobe_handler > (which is called from kprobe_exceptions_notify) by > trace_hardirqs_fixup_flags(pt_regs->flags). However, even the irq flag > is set in pt_regs->flags, true hardirq is still off until returning > from do_debug. Thus, lockdep assumes that hardirq is off without annotation. > > IMHO, one possible solution is that fixing hardirq flags right after > notify_die in do_debug instead of in post_kprobe_handler. My reply to BZ 10489: > [ 2.707509] Kprobe smoke test started > [ 2.709300] ------------[ cut here ]------------ > [ 2.709420] WARNING: at kernel/lockdep.c:2658 check_flags+0x4d/0x12c() > [ 2.709541] Modules linked in: > [ 2.709588] Pid: 1, comm: swapper Not tainted 2.6.25.jml.057 #1 > [ 2.709588] [] warn_on_slowpath+0x41/0x51 > [ 2.709588] [] ? save_stack_trace+0x1d/0x3b > [ 2.709588] [] ? save_trace+0x37/0x89 > [ 2.709588] [] ? kernel_map_pages+0x103/0x11c > [ 2.709588] [] ? native_sched_clock+0xca/0xea > [ 2.709588] [] ? mark_held_locks+0x41/0x5c > [ 2.709588] [] ? kprobe_exceptions_notify+0x322/0x3af > [ 2.709588] [] ? trace_hardirqs_on+0xf1/0x119 > [ 2.709588] [] ? kprobe_exceptions_notify+0x355/0x3af > [ 2.709588] [] check_flags+0x4d/0x12c > [ 2.709588] [] lock_release+0x58/0x195 > [ 2.709588] [] ? __atomic_notifier_call_chain+0x0/0x80 > [ 2.709588] [] __atomic_notifier_call_chain+0x5a/0x80 > [ 2.709588] [] atomic_notifier_call_chain+0xc/0xe > [ 2.709588] [] notify_die+0x2d/0x2f > [ 2.709588] [] do_debug+0x67/0xfe > [ 2.709588] [] debug_stack_correct+0x27/0x30 > [ 2.709588] [] ? kprobe_target+0x1/0x34 > [ 2.709588] [] ? init_test_probes+0x50/0x186 > [ 2.709588] [] init_kprobes+0x85/0x8c > [ 2.709588] [] kernel_init+0x13d/0x298 > [ 2.709588] [] ? kernel_init+0x0/0x298 > [ 2.709588] [] ? kernel_init+0x0/0x298 > [ 2.709588] [] kernel_thread_helper+0x7/0x10 > [ 2.709588] ======================= > [ 2.709588] ---[ end trace 778e504de7e3b1e3 ]--- > [ 2.709588] possible reason: unannotated irqs-off. > [ 2.709588] irq event stamp: 370065 > [ 2.709588] hardirqs last enabled at (370065): [] kprobe_exceptions_notify+0x322/0x3af > [ 2.709588] hardirqs last disabled at (370064): [] do_int3+0x1d/0x7d > [ 2.709588] softirqs last enabled at (370050): [] __do_softirq+0xfa/0x100 > [ 2.709588] softirqs last disabled at (370045): [] do_softirq+0x74/0xd9 > [ 2.714751] Kprobe smoke test passed successfully how I love this stuff... Ok, do_debug() is a trap, this can happen at any time regardless of the machine's IRQ state. So the first thing we do is fix up the IRQ state. Then we call this die notifier stuff; and return with messed up IRQ state... YAY. So, kprobes fudges it.. notify_die(DIE_DEBUG) kprobe_exceptions_notify() post_kprobe_handler() modify regs->flags trace_hardirqs_fixup_flags(regs->flags); <--- must be it So what's the use of modifying flags if they're not meant to take effect at some point. /me tries to reproduce issue; enable kprobes test thingy && boot OK, that reproduces.. So the below makes it work - but I'm not getting this code; at the time I wrote that stuff I CC'ed each and every kprobe maintainer listed in the usual places but got no reposonse - can some please explain this stuff to me? Are the saved flags only for the TF bit or are they made in full effect later (and if so, where) ? Signed-off-by: Peter Zijlstra Acked-by: Masami Hiramatsu Signed-off-by: Ingo Molnar --- arch/x86/kernel/kprobes.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index b8c6743a13da..43c019f85f0d 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -860,7 +860,6 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) resume_execution(cur, regs, kcb); regs->flags |= kcb->kprobe_saved_flags; - trace_hardirqs_fixup_flags(regs->flags); if ((kcb->kprobe_status != KPROBE_REENTER) && cur->post_handler) { kcb->kprobe_status = KPROBE_HIT_SSDONE; -- cgit v1.2.2 From a312b37b2a212fd2e227d1d6321f903b91b65ec7 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:23 -0700 Subject: x86/paravirt: call paravirt_pagetable_setup_{start, done} Call paravirt_pagetable_setup_{start,done} These paravirt_ops functions were not being called on x86_64. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/kernel/setup.c | 2 ++ arch/x86/xen/enlighten.c | 4 ++++ 3 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..2963ab5d91ee 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -373,6 +374,9 @@ struct pv_mmu_ops pv_mmu_ops = { #ifndef CONFIG_X86_64 .pagetable_setup_start = native_pagetable_setup_start, .pagetable_setup_done = native_pagetable_setup_done, +#else + .pagetable_setup_start = paravirt_nop, + .pagetable_setup_done = paravirt_nop, #endif .read_cr2 = native_read_cr2, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 36c540d4ac4b..8ce6a91ce108 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -819,7 +819,9 @@ void __init setup_arch(char **cmdline_p) vmi_init(); #endif + paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); + paravirt_pagetable_setup_done(swapper_pg_dir); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb508456ef52..eaab6c9b4a84 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -841,6 +841,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { +#ifdef CONFIG_X86_32 pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; int i; @@ -886,6 +887,7 @@ static __init void xen_pagetable_setup_start(pgd_t *base) /* Unpin initial Xen pagetable */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(xen_start_info->pt_base))); +#endif /* CONFIG_X86_32 */ } void xen_setup_shared_info(void) @@ -927,9 +929,11 @@ static __init void xen_pagetable_setup_done(pgd_t *base) xen_setup_shared_info(); +#ifdef CONFIG_X86_32 /* Actually pin the pagetable down, but we can't set PG_pinned yet because the page structures don't exist yet. */ pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); +#endif } static __init void xen_post_allocator_init(void) -- cgit v1.2.2 From c1f2f09ef66d5dadd5fe42ea909e708470c9636d Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:24 -0700 Subject: pvops-64: call paravirt_post_allocator_init() on setup_arch() Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 1 + arch/x86/mm/init_32.c | 2 -- arch/x86/xen/mmu.c | 8 +++++--- 3 files changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8ce6a91ce108..2ed504b97d47 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -822,6 +822,7 @@ void __init setup_arch(char **cmdline_p) paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); paravirt_pagetable_setup_done(swapper_pg_dir); + paravirt_post_allocator_init(); #ifdef CONFIG_X86_64 map_vsyscall(); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9689a5138e64..7113acd8ac45 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -868,8 +868,6 @@ void __init paging_init(void) */ sparse_init(); zone_sizes_init(); - - paravirt_post_allocator_init(); } /* diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ff0aa74afaa1..ebd6900e331c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -656,9 +656,11 @@ void xen_mm_pin_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } -/* The init_mm pagetable is really pinned as soon as its created, but - that's before we have page structures to store the bits. So do all - the book-keeping now. */ +/* + * The init_mm pagetable is really pinned as soon as its created, but + * that's before we have page structures to store the bits. So do all + * the book-keeping now. + */ static __init int mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); -- cgit v1.2.2 From 8840c0ccd763936a8e730ece118197a51be8dc8e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:25 -0700 Subject: x86_64: there's no need to preallocate level1_fixmap_pgt Early fixmap will allocate its own L1 pagetable page for fixmap mappings, so there's no need to preallocate one. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b07ac7b217cb..4b6bda21837f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -362,12 +362,6 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level2_fixmap_pgt) - .fill 506,8,0 - .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE - /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ - .fill 5,8,0 - -NEXT_PAGE(level1_fixmap_pgt) .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) -- cgit v1.2.2 From 87b935a0ef9a1ddf62f2f0c0fc17b10654ff41cd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:26 -0700 Subject: x86: clean up formatting of __switch_to process_64.c:__switch_to has some very old strange formatting, some of it dating back to pre-git. Fix it up. No functional changes. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 56 ++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index a8e53626ac9a..e8a8e1b99817 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -537,8 +537,8 @@ static inline void __switch_to_xtra(struct task_struct *prev_p, struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { - struct thread_struct *prev = &prev_p->thread, - *next = &next_p->thread; + struct thread_struct *prev = &prev_p->thread; + struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(init_tss, cpu); unsigned fsindex, gsindex; @@ -586,35 +586,34 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch FS and GS. + * + * Segment register != 0 always requires a reload. Also + * reload when it has changed. When prev process used 64bit + * base always reload to avoid an information leak. */ - { - /* segment register != 0 always requires a reload. - also reload when it has changed. - when prev process used 64bit base always reload - to avoid an information leak. */ - if (unlikely(fsindex | next->fsindex | prev->fs)) { - loadsegment(fs, next->fsindex); - /* check if the user used a selector != 0 - * if yes clear 64bit base, since overloaded base - * is always mapped to the Null selector - */ - if (fsindex) + if (unlikely(fsindex | next->fsindex | prev->fs)) { + loadsegment(fs, next->fsindex); + /* + * Check if the user used a selector != 0; if yes + * clear 64bit base, since overloaded base is always + * mapped to the Null selector + */ + if (fsindex) prev->fs = 0; - } - /* when next process has a 64bit base use it */ - if (next->fs) - wrmsrl(MSR_FS_BASE, next->fs); - prev->fsindex = fsindex; - - if (unlikely(gsindex | next->gsindex | prev->gs)) { - load_gs_index(next->gsindex); - if (gsindex) + } + /* when next process has a 64bit base use it */ + if (next->fs) + wrmsrl(MSR_FS_BASE, next->fs); + prev->fsindex = fsindex; + + if (unlikely(gsindex | next->gsindex | prev->gs)) { + load_gs_index(next->gsindex); + if (gsindex) prev->gs = 0; - } - if (next->gs) - wrmsrl(MSR_KERNEL_GS_BASE, next->gs); - prev->gsindex = gsindex; } + if (next->gs) + wrmsrl(MSR_KERNEL_GS_BASE, next->gs); + prev->gsindex = gsindex; /* Must be after DS reload */ unlazy_fpu(prev_p); @@ -627,7 +626,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) write_pda(pcurrent, next_p); write_pda(kernelstack, - (unsigned long)task_stack_page(next_p) + THREAD_SIZE - PDA_STACKOFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - PDA_STACKOFFSET); #ifdef CONFIG_CC_STACKPROTECTOR write_pda(stack_canary, next_p->stack_canary); /* -- cgit v1.2.2 From cbcd79c2e5b496b84845618cef734b4c40736576 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:27 -0700 Subject: x86: use __page_aligned_data/bss Update arch/x86's use of page-aligned variables. The change to arch/x86/xen/mmu.c fixes an actual bug, but the rest are cleanups and to set a precedent. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 4 ++-- arch/x86/kernel/irq_32.c | 7 ++----- arch/x86/xen/mmu.c | 15 ++++++--------- 3 files changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb40..15419cd3c5a4 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -517,8 +518,7 @@ void pda_init(int cpu) } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] -__attribute__((section(".bss.page_aligned"))); + DEBUG_STKSZ] __page_aligned_bss; extern asmlinkage void ignore_sysret(void); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 47a6f6f12478..1cf8c1fcc088 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -83,11 +83,8 @@ union irq_ctx { static union irq_ctx *hardirq_ctx[NR_CPUS] __read_mostly; static union irq_ctx *softirq_ctx[NR_CPUS] __read_mostly; -static char softirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); - -static char hardirq_stack[NR_CPUS * THREAD_SIZE] - __attribute__((__section__(".bss.page_aligned"))); +static char softirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; +static char hardirq_stack[NR_CPUS * THREAD_SIZE] __page_aligned_bss; static void call_on_stack(void *func, void *stack) { diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ebd6900e331c..4fca9d88bef0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -60,22 +61,18 @@ #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) /* Placeholder for holes in the address space */ -static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] - __attribute__((section(".data.page_aligned"))) = +static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data = { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL }; /* Array of pointers to pages containing p2m entries */ -static unsigned long *p2m_top[TOP_ENTRIES] - __attribute__((section(".data.page_aligned"))) = +static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data = { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] }; /* Arrays of p2m arrays expressed in mfns used for save/restore */ -static unsigned long p2m_top_mfn[TOP_ENTRIES] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss; -static unsigned long p2m_top_mfn_list[ - PAGE_ALIGN(TOP_ENTRIES / P2M_ENTRIES_PER_PAGE)] - __attribute__((section(".bss.page_aligned"))); +static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE] + __page_aligned_bss; static inline unsigned p2m_top_index(unsigned long pfn) { -- cgit v1.2.2 From 360c044eb1b985a9ef29d952276a3e14973bed93 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:28 -0700 Subject: x86_64: adjust exception frame in ia32entry The 32-bit compat int $0x80 entrypoint needs exception frame adjustment. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20371d0635e4..0ae1e77eae50 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -321,6 +321,7 @@ ENTRY(ia32_syscall) /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ /*CFI_REL_OFFSET cs,CS-RIP*/ CFI_REL_OFFSET rip,RIP-RIP + PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS /* * No need to follow this irqs on/off section: the syscall -- cgit v1.2.2 From 7c33b1e6ee26d67551109aca04d46544d0ce55b1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:29 -0700 Subject: x86_64: unstatic get_local_pda This allows Xen's xen_cpu_up() to allocate a pda for the new CPU. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e8..1deb3b624a79 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -768,7 +768,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) * * Must be called after the _cpu_pda pointer table is initialized. */ -static int __cpuinit get_local_pda(int cpu) +int __cpuinit get_local_pda(int cpu) { struct x8664_pda *oldpda, *newpda; unsigned long size = sizeof(struct x8664_pda); -- cgit v1.2.2 From 8ba6c2b0958c332d2f3336f4ca9c116ed81f38e9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:30 -0700 Subject: xen: print backtrace on multicall failure Print a backtrace if a multicall fails, to help with debugging. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/multicalls.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 3c63c4da7ed1..9efd1c6c9776 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -76,6 +76,7 @@ void xen_mc_flush(void) if (ret) { printk(KERN_ERR "%d multicall(s) failed: cpu %d\n", ret, smp_processor_id()); + dump_stack(); for (i = 0; i < b->mcidx; i++) { printk(" call %2d/%d: op=%lu arg=[%lx] result=%ld\n", i+1, b->mcidx, -- cgit v1.2.2 From ad55db9fed6d6cd09333045945cb03ba2c070085 Mon Sep 17 00:00:00 2001 From: Isaku Yamahata Date: Tue, 8 Jul 2008 15:06:32 -0700 Subject: xen: add xen_arch_resume()/xen_timer_resume hook for ia64 support add xen_timer_resume() hook. Timer resume should be done after event channel is resumed. add xen_arch_resume() hook when ipi becomes usable after resume. After resume, some cpu specific resource must be reinitialized on ia64 that can't be set by another cpu. However available hooks is run once on only one cpu so that ipi has to be used. During stop_machine_run() ipi can't be used because interrupt is masked. So add another hook after stop_machine_run(). Another approach might be use resume hook which is run by device_resume(). However device_resume() may be executed on suspend error recovery path. So it is necessary to determine whether it is executed on real resume path or error recovery path. Signed-off-by: Isaku Yamahata Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/suspend.c | 5 ++++- arch/x86/xen/xen-ops.h | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c index 251669a932d4..2a234db5949b 100644 --- a/arch/x86/xen/suspend.c +++ b/arch/x86/xen/suspend.c @@ -38,8 +38,11 @@ void xen_post_suspend(int suspend_cancelled) xen_cpu_initialized_map = cpu_online_map; #endif xen_vcpu_restore(); - xen_timer_resume(); } } +void xen_arch_resume(void) +{ + /* nothing */ +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 6f4b1045c1c2..77354d204257 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -37,7 +37,6 @@ void __init xen_time_init(void); unsigned long xen_get_wallclock(void); int xen_set_wallclock(unsigned long time); unsigned long long xen_sched_clock(void); -void xen_timer_resume(void); irqreturn_t xen_debug_interrupt(int irq, void *dev_id); -- cgit v1.2.2 From 851fa3c4e7b50d6a946d8b4c0a68683b5e56b2f1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:33 -0700 Subject: xen: define set_pte from the outset We need set_pte to work from a relatively early point, so enable it from the start. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index eaab6c9b4a84..c5f0b40aa39d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -845,9 +845,6 @@ static __init void xen_pagetable_setup_start(pgd_t *base) pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; int i; - /* special set_pte for pagetable initialization */ - pv_mmu_ops.set_pte = xen_set_pte_init; - init_mm.pgd = base; /* * copy top-level of Xen-supplied pagetable into place. This @@ -1174,7 +1171,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif - .set_pte = NULL, /* see xen_pagetable_setup_* */ + .set_pte = xen_set_pte_init, .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.2.2 From 48b5db20621388582ca11ac3c61d3403966dbe51 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:34 -0700 Subject: xen64: define asm/xen/interface for 64-bit Copy 64-bit definitions of various interface structures into place. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.h | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 297bf9f5b8bc..7856e37f6044 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -10,18 +10,6 @@ enum pt_level { PT_PTE }; -/* - * Page-directory addresses above 4GB do not fit into architectural %cr3. - * When accessing %cr3, or equivalent field in vcpu_guest_context, guests - * must use the following accessor macros to pack/unpack valid MFNs. - * - * Note that Xen is using the fact that the pagetable base is always - * page-aligned, and putting the 12 MSB of the address into the 12 LSB - * of cr3. - */ -#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20)) -#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20)) - void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); -- cgit v1.2.2 From 7077c33d81a8d790135ae87cd19e6efcb075c23a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:35 -0700 Subject: xen: make ELF notes work for 32 and 64 bit Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-head.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index 7c0cf6320a0a..a9cac9dc04be 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -5,7 +5,10 @@ #include #include + #include +#include + #include #include @@ -21,21 +24,21 @@ ENTRY(startup_xen) .pushsection .text .align PAGE_SIZE_asm ENTRY(hypercall_page) - .skip 0x1000 + .skip PAGE_SIZE_asm .popsection ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") - ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, .long __PAGE_OFFSET) - ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, .long startup_xen) - ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, .long hypercall_page) + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) + ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) + ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") ELFNOTE(Xen, XEN_ELFNOTE_PAE_MODE, .asciz "yes") ELFNOTE(Xen, XEN_ELFNOTE_LOADER, .asciz "generic") ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID, .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) - ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, .long __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) #endif /*CONFIG_XEN */ -- cgit v1.2.2 From f6e587325b3bc7e5c829a407ddc25b52c1e73851 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:38 -0700 Subject: xen64: add extra pv_mmu_ops We need extra pv_mmu_ops for 64-bit, to deal with the extra level of pagetable. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 33 ++++++++++++++++++++++++++++++- arch/x86/xen/mmu.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/mmu.h | 15 ++++++++++++-- 3 files changed, 95 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c5f0b40aa39d..afb047e30bdc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -803,6 +803,18 @@ static void xen_release_pmd(u32 pfn) xen_release_ptpage(pfn, PT_PMD); } +#if PAGETABLE_LEVELS == 4 +static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +{ + xen_alloc_ptpage(mm, pfn, PT_PUD); +} + +static void xen_release_pud(u32 pfn) +{ + xen_release_ptpage(pfn, PT_PUD); +} +#endif + #ifdef CONFIG_HIGHPTE static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) { @@ -922,6 +934,11 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.alloc_pmd = xen_alloc_pmd; pv_mmu_ops.release_pte = xen_release_pte; pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + pv_mmu_ops.set_pte = xen_set_pte; xen_setup_shared_info(); @@ -937,6 +954,9 @@ static __init void xen_post_allocator_init(void) { pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.set_pgd = xen_set_pgd; +#endif xen_mark_init_mm_pinned(); } @@ -1185,15 +1205,26 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pte = xen_make_pte, .make_pgd = xen_make_pgd, +#ifdef CONFIG_X86_PAE .set_pte_atomic = xen_set_pte_atomic, .set_pte_present = xen_set_pte_at, - .set_pud = xen_set_pud_hyper, .pte_clear = xen_pte_clear, .pmd_clear = xen_pmd_clear, +#endif /* CONFIG_X86_PAE */ + .set_pud = xen_set_pud_hyper, .make_pmd = xen_make_pmd, .pmd_val = xen_pmd_val, +#if PAGETABLE_LEVELS == 4 + .pud_val = xen_pud_val, + .make_pud = xen_make_pud, + .set_pgd = xen_set_pgd_hyper, + + .alloc_pud = xen_alloc_pte_init, + .release_pud = xen_release_pte_init, +#endif /* PAGETABLE_LEVELS == 4 */ + .activate_mm = xen_activate_mm, .dup_mmap = xen_dup_mmap, .exit_mmap = xen_exit_mmap, diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 4fca9d88bef0..d0976b87cd2c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -438,14 +438,19 @@ void xen_set_pud(pud_t *ptr, pud_t val) void xen_set_pte(pte_t *ptep, pte_t pte) { +#ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); ptep->pte_low = pte.pte_low; +#else + *ptep = pte; +#endif } +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - set_64bit((u64 *)ptep, pte_val_ma(pte)); + set_64bit((u64 *)ptep, native_pte_val(pte)); } void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -459,6 +464,7 @@ void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } +#endif /* CONFIG_X86_PAE */ pmd_t xen_make_pmd(pmdval_t pmd) { @@ -466,6 +472,49 @@ pmd_t xen_make_pmd(pmdval_t pmd) return native_make_pmd(pmd); } +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud) +{ + return pte_mfn_to_pfn(pud.pud); +} + +pud_t xen_make_pud(pudval_t pud) +{ + pud = pte_pfn_to_mfn(pud); + + return native_make_pud(pud); +} + +void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; + + preempt_disable(); + + xen_mc_batch(); + + u.ptr = virt_to_machine(ptr).maddr; + u.val = pgd_val_ma(val); + extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); + + preempt_enable(); +} + +void xen_set_pgd(pgd_t *ptr, pgd_t val) +{ + /* If page is not pinned, we can just update the entry + directly */ + if (!page_pinned(ptr)) { + *ptr = val; + return; + } + + xen_set_pgd_hyper(ptr, val); +} +#endif /* PAGETABLE_LEVELS == 4 */ + /* (Yet another) pagetable walker. This one is intended for pinning a pagetable. This means that it walks a pagetable and calls the diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 7856e37f6044..19d544b0b6c6 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -32,13 +32,24 @@ pgd_t xen_make_pgd(pgdval_t); void xen_set_pte(pte_t *ptep, pte_t pteval); void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); + +#ifdef CONFIG_X86_PAE void xen_set_pte_atomic(pte_t *ptep, pte_t pte); +void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +void xen_pmd_clear(pmd_t *pmdp); +#endif /* CONFIG_X86_PAE */ + void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud(pud_t *ptr, pud_t val); void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); void xen_set_pud_hyper(pud_t *ptr, pud_t val); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); + +#if PAGETABLE_LEVELS == 4 +pudval_t xen_pud_val(pud_t pud); +pud_t xen_make_pud(pudval_t pudval); +void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); +void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); +#endif pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.2 From f5d36de069f4b343f64e858e7377cfc9c772c4fb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:39 -0700 Subject: xen64: random ifdefs to mask out 32-bit only code Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index afb047e30bdc..ada2e1a141df 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1296,6 +1296,7 @@ static const struct machine_ops __initdata xen_machine_ops = { static void __init xen_reserve_top(void) { +#ifdef CONFIG_X86_32 unsigned long top = HYPERVISOR_VIRT_START; struct xen_platform_parameters pp; @@ -1303,6 +1304,7 @@ static void __init xen_reserve_top(void) top = pp.virt_start; reserve_top_address(-top + 2 * PAGE_SIZE); +#endif /* CONFIG_X86_32 */ } /* First C function to be called on Xen boot */ @@ -1333,6 +1335,11 @@ asmlinkage void __init xen_start_kernel(void) machine_ops = xen_machine_ops; +#ifdef CONFIG_X86_64 + /* Disable until direct per-cpu data access. */ + have_vcpu_info_placement = 0; +#endif + #ifdef CONFIG_SMP smp_ops = xen_smp_ops; #endif @@ -1343,9 +1350,11 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; +#ifdef CONFIG_X86_32 init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; +#endif init_mm.pgd = pgd; /* use the Xen pagetables to start */ @@ -1372,7 +1381,9 @@ asmlinkage void __init xen_start_kernel(void) /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); +#ifdef CONFIG_X86_32 new_cpu_data.hard_math = 1; +#endif new_cpu_data.x86_capability[0] = cpuid_edx(1); /* Poke various useful things into boot_params */ @@ -1388,5 +1399,9 @@ asmlinkage void __init xen_start_kernel(void) } /* Start the world */ +#ifdef CONFIG_X86_32 i386_start_kernel(); +#else + x86_64_start_kernel((char *)&boot_params); +#endif } -- cgit v1.2.2 From ce87b3d326de733c72b47662f106ee6cd699a20f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:40 -0700 Subject: xen64: get active_mm from the pda x86_64 stores the active_mm in the pda, so fetch it from there. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d0976b87cd2c..2579e70cdd08 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -805,8 +805,15 @@ void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) static void drop_other_mm_ref(void *info) { struct mm_struct *mm = info; + struct mm_struct *active_mm; - if (__get_cpu_var(cpu_tlbstate).active_mm == mm) +#ifdef CONFIG_X86_64 + active_mm = read_pda(active_mm); +#else + active_mm = __get_cpu_var(cpu_tlbstate).active_mm; +#endif + + if (active_mm == mm) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure -- cgit v1.2.2 From a9e7062d7339f1a1df2b6d7e5d595c7d55b56bfb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:41 -0700 Subject: xen: move smp setup into smp.c Move all the smp_ops setup into smp.c, allowing a lot of things to become static. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 +------------------ arch/x86/xen/smp.c | 34 ++++++++++++++++++++++++++-------- arch/x86/xen/xen-ops.h | 13 +++++-------- 3 files changed, 32 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ada2e1a141df..a85f447b8d00 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1237,21 +1237,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .set_fixmap = xen_set_fixmap, }; -#ifdef CONFIG_SMP -static const struct smp_ops xen_smp_ops __initdata = { - .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, - .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, - .smp_cpus_done = xen_smp_cpus_done, - - .smp_send_stop = xen_smp_send_stop, - .smp_send_reschedule = xen_smp_send_reschedule, - - .send_call_func_ipi = xen_smp_send_call_function_ipi, - .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, -}; -#endif /* CONFIG_SMP */ - static void xen_reboot(int reason) { struct sched_shutdown r = { .reason = reason }; @@ -1340,9 +1325,7 @@ asmlinkage void __init xen_start_kernel(void) have_vcpu_info_placement = 0; #endif -#ifdef CONFIG_SMP - smp_ops = xen_smp_ops; -#endif + xen_smp_init(); /* Get mfn list */ if (!xen_feature(XENFEAT_auto_translated_physmap)) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 233156f39b7f..91fae8ff756e 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -152,7 +152,7 @@ void __init xen_fill_possible_map(void) } } -void __init xen_smp_prepare_boot_cpu(void) +static void __init xen_smp_prepare_boot_cpu(void) { int cpu; @@ -176,7 +176,7 @@ void __init xen_smp_prepare_boot_cpu(void) xen_setup_vcpu_info_placement(); } -void __init xen_smp_prepare_cpus(unsigned int max_cpus) +static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; @@ -276,7 +276,7 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) return 0; } -int __cpuinit xen_cpu_up(unsigned int cpu) +static int __cpuinit xen_cpu_up(unsigned int cpu) { struct task_struct *idle = idle_task(cpu); int rc; @@ -319,7 +319,7 @@ int __cpuinit xen_cpu_up(unsigned int cpu) return 0; } -void xen_smp_cpus_done(unsigned int max_cpus) +static void xen_smp_cpus_done(unsigned int max_cpus) { } @@ -335,12 +335,12 @@ static void stop_self(void *v) BUG(); } -void xen_smp_send_stop(void) +static void xen_smp_send_stop(void) { smp_call_function(stop_self, NULL, 0); } -void xen_smp_send_reschedule(int cpu) +static void xen_smp_send_reschedule(int cpu) { xen_send_IPI_one(cpu, XEN_RESCHEDULE_VECTOR); } @@ -355,7 +355,7 @@ static void xen_send_IPI_mask(cpumask_t mask, enum ipi_vector vector) xen_send_IPI_one(cpu, vector); } -void xen_smp_send_call_function_ipi(cpumask_t mask) +static void xen_smp_send_call_function_ipi(cpumask_t mask) { int cpu; @@ -370,7 +370,7 @@ void xen_smp_send_call_function_ipi(cpumask_t mask) } } -void xen_smp_send_call_function_single_ipi(int cpu) +static void xen_smp_send_call_function_single_ipi(int cpu) { xen_send_IPI_mask(cpumask_of_cpu(cpu), XEN_CALL_FUNCTION_SINGLE_VECTOR); } @@ -394,3 +394,21 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } + +static const struct smp_ops xen_smp_ops __initdata = { + .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, + .smp_prepare_cpus = xen_smp_prepare_cpus, + .cpu_up = xen_cpu_up, + .smp_cpus_done = xen_smp_cpus_done, + + .smp_send_stop = xen_smp_send_stop, + .smp_send_reschedule = xen_smp_send_reschedule, + + .send_call_func_ipi = xen_smp_send_call_function_ipi, + .send_call_func_single_ipi = xen_smp_send_call_function_single_ipi, +}; + +void __init xen_smp_init(void) +{ + smp_ops = xen_smp_ops; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 77354d204257..81a779fc9b26 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -47,17 +47,14 @@ void xen_mark_init_mm_pinned(void); void __init xen_fill_possible_map(void); void __init xen_setup_vcpu_info_placement(void); -void xen_smp_prepare_boot_cpu(void); -void xen_smp_prepare_cpus(unsigned int max_cpus); -int xen_cpu_up(unsigned int cpu); -void xen_smp_cpus_done(unsigned int max_cpus); -void xen_smp_send_stop(void); -void xen_smp_send_reschedule(int cpu); -void xen_smp_send_call_function_ipi(cpumask_t mask); -void xen_smp_send_call_function_single_ipi(int cpu); +#ifdef CONFIG_SMP +void xen_smp_init(void); extern cpumask_t xen_cpu_initialized_map; +#else +static inline void xen_smp_init(void) {} +#endif /* Declare an asm function, along with symbols needed to make it -- cgit v1.2.2 From 5b09b2876ed1a8e34a0da8f069575fc6174e2077 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:42 -0700 Subject: x86_64: add workaround for no %gs-based percpu As a stopgap until Mike Travis's x86-64 gs-based percpu patches are ready, provide workaround functions for x86_read/write_percpu for Xen's use. Specifically, this means that we can't really make use of vcpu placement, because we can't use a single gs-based memory access to get to vcpu fields. So disable all that for now. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 11 ++++++++--- arch/x86/xen/enlighten.c | 5 +++++ 2 files changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c97819829146..1b318e903bf6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -39,6 +39,13 @@ static struct x8664_pda *__cpu_pda[NR_CPUS] __initdata; static struct x8664_pda *__cpu_pda[NR_CPUS] __read_mostly; #endif +void __init x86_64_init_pda(void) +{ + _cpu_pda = __cpu_pda; + cpu_pda(0) = &_boot_cpu_pda; + pda_init(0); +} + static void __init zap_identity_mappings(void) { pgd_t *pgd = pgd_offset_k(0UL); @@ -102,9 +109,7 @@ void __init x86_64_start_kernel(char * real_mode_data) early_printk("Kernel alive\n"); - _cpu_pda = __cpu_pda; - cpu_pda(0) = &_boot_cpu_pda; - pda_init(0); + x86_64_init_pda(); early_printk("Kernel really alive\n"); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a85f447b8d00..f3f11acf7856 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -971,6 +971,7 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ +#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -980,6 +981,7 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } +#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1000,10 +1002,12 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { +#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); +#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1323,6 +1327,7 @@ asmlinkage void __init xen_start_kernel(void) #ifdef CONFIG_X86_64 /* Disable until direct per-cpu data access. */ have_vcpu_info_placement = 0; + x86_64_init_pda(); #endif xen_smp_init(); -- cgit v1.2.2 From c7b75947f89d45493562ede6d9ee7311dfa5c4ce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:43 -0700 Subject: xen64: smp.c compile hacking A number of random changes to make xen/smp.c compile in 64-bit mode. Signed-off-by: Jeremy Fitzhardinge a Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 7 +--- arch/x86/xen/smp.c | 98 +++++++++++++++++++++++++++++--------------------- arch/x86/xen/xen-ops.h | 2 -- 3 files changed, 58 insertions(+), 49 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e0a39595bde3..f52f3855fb6b 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -98,7 +98,7 @@ void xen_enable_sysenter(void) /* Mask events on entry, even though they get enabled immediately */ static struct callback_register sysenter = { .type = CALLBACKTYPE_sysenter, - .address = { __KERNEL_CS, (unsigned long)xen_sysenter_target }, + .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target), .flags = CALLBACKF_mask_events, }; @@ -143,11 +143,6 @@ void __init xen_arch_setup(void) pm_idle = xen_idle; -#ifdef CONFIG_SMP - /* fill cpus_possible with all available cpus */ - xen_fill_possible_map(); -#endif - paravirt_disable_iospace(); fiddle_vdso(); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 91fae8ff756e..800bb2191e2a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -66,13 +66,21 @@ static __cpuinit void cpu_bringup_and_idle(void) int cpu = smp_processor_id(); cpu_init(); + preempt_disable(); + xen_enable_sysenter(); - preempt_disable(); - per_cpu(cpu_state, cpu) = CPU_ONLINE; + cpu = smp_processor_id(); + smp_store_cpu_info(cpu); + cpu_data(cpu).x86_max_cores = 1; + set_cpu_sibling_map(cpu); xen_setup_cpu_clockevents(); + cpu_set(cpu, cpu_online_map); + x86_write_percpu(cpu_state, CPU_ONLINE); + wmb(); + /* We can take interrupts now: we're officially "up". */ local_irq_enable(); @@ -141,7 +149,7 @@ static int xen_smp_intr_init(unsigned int cpu) return rc; } -void __init xen_fill_possible_map(void) +static void __init xen_fill_possible_map(void) { int i, rc; @@ -154,24 +162,12 @@ void __init xen_fill_possible_map(void) static void __init xen_smp_prepare_boot_cpu(void) { - int cpu; - BUG_ON(smp_processor_id() != 0); native_smp_prepare_boot_cpu(); /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu__gdt_page); - - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_map lives in a per cpu area that is cleared - * when the per cpu array is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } + make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } @@ -180,17 +176,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; - for_each_possible_cpu(cpu) { - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - /* - * cpu_core_ map will be zeroed when the per - * cpu area is allocated. - * - * cpus_clear(per_cpu(cpu_core_map, cpu)); - */ - } - smp_store_cpu_info(0); + cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); if (xen_smp_intr_init(0)) @@ -225,7 +212,7 @@ static __cpuinit int cpu_initialize_context(unsigned int cpu, struct task_struct *idle) { struct vcpu_guest_context *ctxt; - struct gdt_page *gdt = &per_cpu(gdt_page, cpu); + struct desc_struct *gdt; if (cpu_test_and_set(cpu, xen_cpu_initialized_map)) return 0; @@ -234,12 +221,15 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) if (ctxt == NULL) return -ENOMEM; + gdt = get_cpu_gdt_table(cpu); + ctxt->flags = VGCF_IN_KERNEL; ctxt->user_regs.ds = __USER_DS; ctxt->user_regs.es = __USER_DS; - ctxt->user_regs.fs = __KERNEL_PERCPU; - ctxt->user_regs.gs = 0; ctxt->user_regs.ss = __KERNEL_DS; +#ifdef CONFIG_X86_32 + ctxt->user_regs.fs = __KERNEL_PERCPU; +#endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ @@ -249,11 +239,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->ldt_ents = 0; - BUG_ON((unsigned long)gdt->gdt & ~PAGE_MASK); - make_lowmem_page_readonly(gdt->gdt); + BUG_ON((unsigned long)gdt & ~PAGE_MASK); + make_lowmem_page_readonly(gdt); - ctxt->gdt_frames[0] = virt_to_mfn(gdt->gdt); - ctxt->gdt_ents = ARRAY_SIZE(gdt->gdt); + ctxt->gdt_frames[0] = virt_to_mfn(gdt); + ctxt->gdt_ents = GDT_ENTRIES; ctxt->user_regs.cs = __KERNEL_CS; ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs); @@ -261,9 +251,11 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->kernel_ss = __KERNEL_DS; ctxt->kernel_sp = idle->thread.sp0; +#ifdef CONFIG_X86_32 ctxt->event_callback_cs = __KERNEL_CS; - ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_cs = __KERNEL_CS; +#endif + ctxt->event_callback_eip = (unsigned long)xen_hypervisor_callback; ctxt->failsafe_callback_eip = (unsigned long)xen_failsafe_callback; per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir); @@ -287,11 +279,28 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) return rc; #endif +#ifdef CONFIG_X86_64 + /* Allocate node local memory for AP pdas */ + WARN_ON(cpu == 0); + if (cpu > 0) { + rc = get_local_pda(cpu); + if (rc) + return rc; + } +#endif + +#ifdef CONFIG_X86_32 init_gdt(cpu); per_cpu(current_task, cpu) = idle; irq_ctx_init(cpu); +#else + cpu_pda(cpu)->pcurrent = idle; + clear_tsk_thread_flag(idle, TIF_FORK); +#endif xen_setup_timer(cpu); + per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* make sure interrupts start blocked */ per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1; @@ -306,16 +315,14 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) if (rc) return rc; - smp_store_cpu_info(cpu); - set_cpu_sibling_map(cpu); - /* This must be done before setting cpu_online_map */ - wmb(); - - cpu_set(cpu, cpu_online_map); - rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); BUG_ON(rc); + while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { + HYPERVISOR_sched_op(SCHEDOP_yield, 0); + barrier(); + } + return 0; } @@ -379,7 +386,11 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -389,7 +400,11 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) { irq_enter(); generic_smp_call_function_single_interrupt(); +#ifdef CONFIG_X86_32 __get_cpu_var(irq_stat).irq_call_count++; +#else + add_pda(irq_call_count, 1); +#endif irq_exit(); return IRQ_HANDLED; @@ -411,4 +426,5 @@ static const struct smp_ops xen_smp_ops __initdata = { void __init xen_smp_init(void) { smp_ops = xen_smp_ops; + xen_fill_possible_map(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 81a779fc9b26..aca4a7803e2c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -44,8 +44,6 @@ bool xen_vcpu_stolen(int vcpu); void xen_mark_init_mm_pinned(void); -void __init xen_fill_possible_map(void); - void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP -- cgit v1.2.2 From 8c5e5ac32fe08793246709fbb94c055ec76a7c0e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:44 -0700 Subject: xen64: add xen-head code to head_64.S Add the Xen entrypoint and ELF notes to head_64.S. Adapts xen-head.S to compile either 32-bit or 64-bit. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 3 +++ arch/x86/kernel/head_64.S | 1 + arch/x86/xen/xen-head.S | 15 +++++++++++++-- 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index bacf5deeec2d..0f7e1f09aa09 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -131,5 +131,8 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + + BLANK(); + DEFINE(PAGE_SIZE_asm, PAGE_SIZE); return 0; } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 4b6bda21837f..2240f823676a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -401,6 +401,7 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 +#include "../../x86/xen/xen-head.S" .section .bss, "aw", @nobits .align L1_CACHE_BYTES diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S index a9cac9dc04be..63d49a523ed3 100644 --- a/arch/x86/xen/xen-head.S +++ b/arch/x86/xen/xen-head.S @@ -8,15 +8,21 @@ #include #include +#include #include #include __INIT ENTRY(startup_xen) - movl %esi,xen_start_info cld - movl $(init_thread_union+THREAD_SIZE),%esp +#ifdef CONFIG_X86_32 + mov %esi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%esp +#else + mov %rsi,xen_start_info + mov $init_thread_union+THREAD_SIZE,%rsp +#endif jmp xen_start_kernel __FINIT @@ -30,7 +36,11 @@ ENTRY(hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux") ELFNOTE(Xen, XEN_ELFNOTE_GUEST_VERSION, .asciz "2.6") ELFNOTE(Xen, XEN_ELFNOTE_XEN_VERSION, .asciz "xen-3.0") +#ifdef CONFIG_X86_32 ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __PAGE_OFFSET) +#else + ELFNOTE(Xen, XEN_ELFNOTE_VIRT_BASE, _ASM_PTR __START_KERNEL_map) +#endif ELFNOTE(Xen, XEN_ELFNOTE_ENTRY, _ASM_PTR startup_xen) ELFNOTE(Xen, XEN_ELFNOTE_HYPERCALL_PAGE, _ASM_PTR hypercall_page) ELFNOTE(Xen, XEN_ELFNOTE_FEATURES, .asciz "!writable_page_tables|pae_pgdir_above_4gb") @@ -40,5 +50,6 @@ ENTRY(hypercall_page) .quad _PAGE_PRESENT; .quad _PAGE_PRESENT) ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1) ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW, _ASM_PTR __HYPERVISOR_VIRT_START) + ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET, _ASM_PTR 0) #endif /*CONFIG_XEN */ -- cgit v1.2.2 From 555cf2b5805a213ba262a2830c4d22ad635a249e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:45 -0700 Subject: xen64: add asm-offsets Add Xen vcpu_info offsets to asm-offsets_64. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/asm-offsets_64.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index 0f7e1f09aa09..aa89387006fe 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -18,6 +18,8 @@ #include #include +#include + #define __NO_STUBS 1 #undef __SYSCALL #undef _ASM_X86_64_UNISTD_H_ @@ -134,5 +136,11 @@ int main(void) BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); +#ifdef CONFIG_XEN + BLANK(); + OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); + OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); +#undef ENTRY +#endif return 0; } -- cgit v1.2.2 From cdacc1278b12d929f9a053c245ff3d16eb7af9f8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:46 -0700 Subject: xen64: add 64-bit assembler Split xen-asm into 32- and 64-bit files, and implement the 64-bit variants. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 2 +- arch/x86/xen/xen-asm.S | 305 ---------------------------------------------- arch/x86/xen/xen-asm_32.S | 305 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_64.S | 141 +++++++++++++++++++++ 4 files changed, 447 insertions(+), 306 deletions(-) delete mode 100644 arch/x86/xen/xen-asm.S create mode 100644 arch/x86/xen/xen-asm_32.S create mode 100644 arch/x86/xen/xen-asm_64.S (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 2ba2d1649131..59c1e539aed2 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,4 @@ obj-y := enlighten.o setup.o multicalls.o mmu.o \ - time.o xen-asm.o grant-table.o suspend.o + time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o diff --git a/arch/x86/xen/xen-asm.S b/arch/x86/xen/xen-asm.S deleted file mode 100644 index 2497a30f41de..000000000000 --- a/arch/x86/xen/xen-asm.S +++ /dev/null @@ -1,305 +0,0 @@ -/* - Asm versions of Xen pv-ops, suitable for either direct use or inlining. - The inline versions are the same as the direct-use versions, with the - pre- and post-amble chopped off. - - This code is encoded for size rather than absolute efficiency, - with a view to being able to inline as much as possible. - - We only bother with direct forms (ie, vcpu in pda) of the operations - here; the indirect forms are better handled in C, since they're - generally too large to inline anyway. - */ - -#include - -#include -#include -#include -#include -#include - -#include - -#define RELOC(x, v) .globl x##_reloc; x##_reloc=v -#define ENDPATCH(x) .globl x##_end; x##_end=. - -/* Pseudo-flag used for virtual NMI, which we don't implement yet */ -#define XEN_EFLAGS_NMI 0x80000000 - -/* - Enable events. This clears the event mask and tests the pending - event status with one and operation. If there are pending - events, then enter the hypervisor to get them handled. - */ -ENTRY(xen_irq_enable_direct) - /* Unmask events */ - movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* Test for pending */ - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f - -2: call check_events -1: -ENDPATCH(xen_irq_enable_direct) - ret - ENDPROC(xen_irq_enable_direct) - RELOC(xen_irq_enable_direct, 2b+1) - - -/* - Disabling events is simply a matter of making the event mask - non-zero. - */ -ENTRY(xen_irq_disable_direct) - movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask -ENDPATCH(xen_irq_disable_direct) - ret - ENDPROC(xen_irq_disable_direct) - RELOC(xen_irq_disable_direct, 0) - -/* - (xen_)save_fl is used to get the current interrupt enable status. - Callers expect the status to be in X86_EFLAGS_IF, and other bits - may be set in the return value. We take advantage of this by - making sure that X86_EFLAGS_IF has the right value (and other bits - in that byte are 0), but other bits in the return value are - undefined. We need to toggle the state of the bit, because - Xen and x86 use opposite senses (mask vs enable). - */ -ENTRY(xen_save_fl_direct) - testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - setz %ah - addb %ah,%ah -ENDPATCH(xen_save_fl_direct) - ret - ENDPROC(xen_save_fl_direct) - RELOC(xen_save_fl_direct, 0) - - -/* - In principle the caller should be passing us a value return - from xen_save_fl_direct, but for robustness sake we test only - the X86_EFLAGS_IF flag rather than the whole byte. After - setting the interrupt mask state, it checks for unmasked - pending events and enters the hypervisor to get them delivered - if so. - */ -ENTRY(xen_restore_fl_direct) - testb $X86_EFLAGS_IF>>8, %ah - setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask - /* Preempt here doesn't matter because that will deal with - any pending interrupts. The pending check may end up being - run on the wrong CPU, but that doesn't hurt. */ - - /* check for unmasked and pending */ - cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending - jz 1f -2: call check_events -1: -ENDPATCH(xen_restore_fl_direct) - ret - ENDPROC(xen_restore_fl_direct) - RELOC(xen_restore_fl_direct, 2b+1) - -/* - We can't use sysexit directly, because we're not running in ring0. - But we can easily fake it up using iret. Assuming xen_sysexit - is jumped to with a standard stack frame, we can just strip it - back to a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* - This is run where a normal iret would be run, with the same stack setup: - 8: eflags - 4: cs - esp-> 0: eip - - This attempts to make sure that any pending events are dealt - with on return to usermode, but there is a small window in - which an event can happen just before entering usermode. If - the nested interrupt ends up setting one of the TIF_WORK_MASK - pending work flags, they will not be tested again before - returning to usermode. This means that a process can end up - with pending work, which will be unprocessed until the process - enters and leaves the kernel again, which could be an - unbounded amount of time. This means that a pending signal or - reschedule event could be indefinitely delayed. - - The fix is to notice a nested interrupt in the critical - window, and if one occurs, then fold the nested interrupt into - the current interrupt stack frame, and re-process it - iteratively rather than recursively. This means that it will - exit via the normal path, and all pending work will be dealt - with appropriately. - - Because the nested interrupt handler needs to deal with the - current stack state in whatever form its in, we keep things - simple by only using a single register which is pushed/popped - on the stack. - */ -ENTRY(xen_iret) - /* test eflags for special cases */ - testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) - jnz hyper_iret - - push %eax - ESP_OFFSET=4 # bytes pushed onto stack - - /* Store vcpu_info pointer for easy access. Do it this - way to avoid having to reload %fs */ -#ifdef CONFIG_SMP - GET_THREAD_INFO(%eax) - movl TI_cpu(%eax),%eax - movl __per_cpu_offset(,%eax,4),%eax - mov per_cpu__xen_vcpu(%eax),%eax -#else - movl per_cpu__xen_vcpu, %eax -#endif - - /* check IF state we're restoring */ - testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) - - /* Maybe enable events. Once this happens we could get a - recursive event, so the critical region starts immediately - afterwards. However, if that happens we don't end up - resuming the code, so we don't have to be worried about - being preempted to another CPU. */ - setz XEN_vcpu_info_mask(%eax) -xen_iret_start_crit: - - /* check for unmasked and pending */ - cmpw $0x0001, XEN_vcpu_info_pending(%eax) - - /* If there's something pending, mask events again so we - can jump back into xen_hypervisor_callback */ - sete XEN_vcpu_info_mask(%eax) - - popl %eax - - /* From this point on the registers are restored and the stack - updated, so we don't need to worry about it if we're preempted */ -iret_restore_end: - - /* Jump to hypervisor_callback after fixing up the stack. - Events are masked, so jumping out of the critical - region is OK. */ - je xen_hypervisor_callback - -1: iret -xen_iret_end_crit: -.section __ex_table,"a" - .align 4 - .long 1b,iret_exc -.previous - -hyper_iret: - /* put this out of line since its very rarely used */ - jmp hypercall_page + __HYPERVISOR_iret * 32 - - .globl xen_iret_start_crit, xen_iret_end_crit - -/* - This is called by xen_hypervisor_callback in entry.S when it sees - that the EIP at the time of interrupt was between xen_iret_start_crit - and xen_iret_end_crit. We're passed the EIP in %eax so we can do - a more refined determination of what to do. - - The stack format at this point is: - ---------------- - ss : (ss/esp may be present if we came from usermode) - esp : - eflags } outer exception info - cs } - eip } - ---------------- <- edi (copy dest) - eax : outer eax if it hasn't been restored - ---------------- - eflags } nested exception info - cs } (no ss/esp because we're nested - eip } from the same ring) - orig_eax }<- esi (copy src) - - - - - - - - - - fs } - es } - ds } SAVE_ALL state - eax } - : : - ebx }<- esp - ---------------- - - In order to deliver the nested exception properly, we need to shift - everything from the return addr up to the error code so it - sits just under the outer exception info. This means that when we - handle the exception, we do it in the context of the outer exception - rather than starting a new one. - - The only caveat is that if the outer eax hasn't been - restored yet (ie, it's still on stack), we need to insert - its value into the SAVE_ALL state before going on, since - it's usermode state which we eventually need to restore. - */ -ENTRY(xen_iret_crit_fixup) - /* - Paranoia: Make sure we're really coming from kernel space. - One could imagine a case where userspace jumps into the - critical range address, but just before the CPU delivers a GP, - it decides to deliver an interrupt instead. Unlikely? - Definitely. Easy to avoid? Yes. The Intel documents - explicitly say that the reported EIP for a bad jump is the - jump instruction itself, not the destination, but some virtual - environments get this wrong. - */ - movl PT_CS(%esp), %ecx - andl $SEGMENT_RPL_MASK, %ecx - cmpl $USER_RPL, %ecx - je 2f - - lea PT_ORIG_EAX(%esp), %esi - lea PT_EFLAGS(%esp), %edi - - /* If eip is before iret_restore_end then stack - hasn't been restored yet. */ - cmp $iret_restore_end, %eax - jae 1f - - movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ - movl %eax, PT_EAX(%esp) - - lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ - - /* set up the copy */ -1: std - mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ - rep movsl - cld - - lea 4(%edi),%esp /* point esp to new frame */ -2: jmp xen_do_upcall - - -/* - Force an event check by making a hypercall, - but preserve regs before making the call. - */ -check_events: - push %eax - push %ecx - push %edx - call force_evtchn_callback - pop %edx - pop %ecx - pop %eax - ret diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S new file mode 100644 index 000000000000..2497a30f41de --- /dev/null +++ b/arch/x86/xen/xen-asm_32.S @@ -0,0 +1,305 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include + +#include +#include +#include +#include +#include + +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_mask + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info)+XEN_vcpu_info_pending + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + +/* + We can't use sysexit directly, because we're not running in ring0. + But we can easily fake it up using iret. Assuming xen_sysexit + is jumped to with a standard stack frame, we can just strip it + back to a standard iret frame and use iret. + */ +ENTRY(xen_sysexit) + movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ + orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) + lea PT_EIP(%esp), %esp + + jmp xen_iret +ENDPROC(xen_sysexit) + +/* + This is run where a normal iret would be run, with the same stack setup: + 8: eflags + 4: cs + esp-> 0: eip + + This attempts to make sure that any pending events are dealt + with on return to usermode, but there is a small window in + which an event can happen just before entering usermode. If + the nested interrupt ends up setting one of the TIF_WORK_MASK + pending work flags, they will not be tested again before + returning to usermode. This means that a process can end up + with pending work, which will be unprocessed until the process + enters and leaves the kernel again, which could be an + unbounded amount of time. This means that a pending signal or + reschedule event could be indefinitely delayed. + + The fix is to notice a nested interrupt in the critical + window, and if one occurs, then fold the nested interrupt into + the current interrupt stack frame, and re-process it + iteratively rather than recursively. This means that it will + exit via the normal path, and all pending work will be dealt + with appropriately. + + Because the nested interrupt handler needs to deal with the + current stack state in whatever form its in, we keep things + simple by only using a single register which is pushed/popped + on the stack. + */ +ENTRY(xen_iret) + /* test eflags for special cases */ + testl $(X86_EFLAGS_VM | XEN_EFLAGS_NMI), 8(%esp) + jnz hyper_iret + + push %eax + ESP_OFFSET=4 # bytes pushed onto stack + + /* Store vcpu_info pointer for easy access. Do it this + way to avoid having to reload %fs */ +#ifdef CONFIG_SMP + GET_THREAD_INFO(%eax) + movl TI_cpu(%eax),%eax + movl __per_cpu_offset(,%eax,4),%eax + mov per_cpu__xen_vcpu(%eax),%eax +#else + movl per_cpu__xen_vcpu, %eax +#endif + + /* check IF state we're restoring */ + testb $X86_EFLAGS_IF>>8, 8+1+ESP_OFFSET(%esp) + + /* Maybe enable events. Once this happens we could get a + recursive event, so the critical region starts immediately + afterwards. However, if that happens we don't end up + resuming the code, so we don't have to be worried about + being preempted to another CPU. */ + setz XEN_vcpu_info_mask(%eax) +xen_iret_start_crit: + + /* check for unmasked and pending */ + cmpw $0x0001, XEN_vcpu_info_pending(%eax) + + /* If there's something pending, mask events again so we + can jump back into xen_hypervisor_callback */ + sete XEN_vcpu_info_mask(%eax) + + popl %eax + + /* From this point on the registers are restored and the stack + updated, so we don't need to worry about it if we're preempted */ +iret_restore_end: + + /* Jump to hypervisor_callback after fixing up the stack. + Events are masked, so jumping out of the critical + region is OK. */ + je xen_hypervisor_callback + +1: iret +xen_iret_end_crit: +.section __ex_table,"a" + .align 4 + .long 1b,iret_exc +.previous + +hyper_iret: + /* put this out of line since its very rarely used */ + jmp hypercall_page + __HYPERVISOR_iret * 32 + + .globl xen_iret_start_crit, xen_iret_end_crit + +/* + This is called by xen_hypervisor_callback in entry.S when it sees + that the EIP at the time of interrupt was between xen_iret_start_crit + and xen_iret_end_crit. We're passed the EIP in %eax so we can do + a more refined determination of what to do. + + The stack format at this point is: + ---------------- + ss : (ss/esp may be present if we came from usermode) + esp : + eflags } outer exception info + cs } + eip } + ---------------- <- edi (copy dest) + eax : outer eax if it hasn't been restored + ---------------- + eflags } nested exception info + cs } (no ss/esp because we're nested + eip } from the same ring) + orig_eax }<- esi (copy src) + - - - - - - - - + fs } + es } + ds } SAVE_ALL state + eax } + : : + ebx }<- esp + ---------------- + + In order to deliver the nested exception properly, we need to shift + everything from the return addr up to the error code so it + sits just under the outer exception info. This means that when we + handle the exception, we do it in the context of the outer exception + rather than starting a new one. + + The only caveat is that if the outer eax hasn't been + restored yet (ie, it's still on stack), we need to insert + its value into the SAVE_ALL state before going on, since + it's usermode state which we eventually need to restore. + */ +ENTRY(xen_iret_crit_fixup) + /* + Paranoia: Make sure we're really coming from kernel space. + One could imagine a case where userspace jumps into the + critical range address, but just before the CPU delivers a GP, + it decides to deliver an interrupt instead. Unlikely? + Definitely. Easy to avoid? Yes. The Intel documents + explicitly say that the reported EIP for a bad jump is the + jump instruction itself, not the destination, but some virtual + environments get this wrong. + */ + movl PT_CS(%esp), %ecx + andl $SEGMENT_RPL_MASK, %ecx + cmpl $USER_RPL, %ecx + je 2f + + lea PT_ORIG_EAX(%esp), %esi + lea PT_EFLAGS(%esp), %edi + + /* If eip is before iret_restore_end then stack + hasn't been restored yet. */ + cmp $iret_restore_end, %eax + jae 1f + + movl 0+4(%edi),%eax /* copy EAX (just above top of frame) */ + movl %eax, PT_EAX(%esp) + + lea ESP_OFFSET(%edi),%edi /* move dest up over saved regs */ + + /* set up the copy */ +1: std + mov $PT_EIP / 4, %ecx /* saved regs up to orig_eax */ + rep movsl + cld + + lea 4(%edi),%esp /* point esp to new frame */ +2: jmp xen_do_upcall + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %eax + push %ecx + push %edx + call force_evtchn_callback + pop %edx + pop %ecx + pop %eax + ret diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S new file mode 100644 index 000000000000..4ec10827370b --- /dev/null +++ b/arch/x86/xen/xen-asm_64.S @@ -0,0 +1,141 @@ +/* + Asm versions of Xen pv-ops, suitable for either direct use or inlining. + The inline versions are the same as the direct-use versions, with the + pre- and post-amble chopped off. + + This code is encoded for size rather than absolute efficiency, + with a view to being able to inline as much as possible. + + We only bother with direct forms (ie, vcpu in pda) of the operations + here; the indirect forms are better handled in C, since they're + generally too large to inline anyway. + */ + +#include + +#include +#include + +#include + +#define RELOC(x, v) .globl x##_reloc; x##_reloc=v +#define ENDPATCH(x) .globl x##_end; x##_end=. + +/* Pseudo-flag used for virtual NMI, which we don't implement yet */ +#define XEN_EFLAGS_NMI 0x80000000 + +#if 0 +#include + +/* + Enable events. This clears the event mask and tests the pending + event status with one and operation. If there are pending + events, then enter the hypervisor to get them handled. + */ +ENTRY(xen_irq_enable_direct) + /* Unmask events */ + movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* Test for pending */ + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f + +2: call check_events +1: +ENDPATCH(xen_irq_enable_direct) + ret + ENDPROC(xen_irq_enable_direct) + RELOC(xen_irq_enable_direct, 2b+1) + +/* + Disabling events is simply a matter of making the event mask + non-zero. + */ +ENTRY(xen_irq_disable_direct) + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) +ENDPATCH(xen_irq_disable_direct) + ret + ENDPROC(xen_irq_disable_direct) + RELOC(xen_irq_disable_direct, 0) + +/* + (xen_)save_fl is used to get the current interrupt enable status. + Callers expect the status to be in X86_EFLAGS_IF, and other bits + may be set in the return value. We take advantage of this by + making sure that X86_EFLAGS_IF has the right value (and other bits + in that byte are 0), but other bits in the return value are + undefined. We need to toggle the state of the bit, because + Xen and x86 use opposite senses (mask vs enable). + */ +ENTRY(xen_save_fl_direct) + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + setz %ah + addb %ah,%ah +ENDPATCH(xen_save_fl_direct) + ret + ENDPROC(xen_save_fl_direct) + RELOC(xen_save_fl_direct, 0) + +/* + In principle the caller should be passing us a value return + from xen_save_fl_direct, but for robustness sake we test only + the X86_EFLAGS_IF flag rather than the whole byte. After + setting the interrupt mask state, it checks for unmasked + pending events and enters the hypervisor to get them delivered + if so. + */ +ENTRY(xen_restore_fl_direct) + testb $X86_EFLAGS_IF>>8, %ah + setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) + /* Preempt here doesn't matter because that will deal with + any pending interrupts. The pending check may end up being + run on the wrong CPU, but that doesn't hurt. */ + + /* check for unmasked and pending */ + cmpw $0x0001, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_pending) + jz 1f +2: call check_events +1: +ENDPATCH(xen_restore_fl_direct) + ret + ENDPROC(xen_restore_fl_direct) + RELOC(xen_restore_fl_direct, 2b+1) + + +/* + Force an event check by making a hypercall, + but preserve regs before making the call. + */ +check_events: + push %rax + push %rcx + push %rdx + push %rsi + push %rdi + push %r8 + push %r9 + push %r10 + push %r11 + call force_evtchn_callback + pop %r11 + pop %r10 + pop %r9 + pop %r8 + pop %rdi + pop %rsi + pop %rdx + pop %rcx + pop %rax + ret +#endif + +ENTRY(xen_iret) + pushq $0 + jmp hypercall_page + __HYPERVISOR_iret * 32 + +ENTRY(xen_sysexit) + ud2a -- cgit v1.2.2 From 15664f968a95d8fbf4a0d7b462fcc20f88906bb3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:47 -0700 Subject: xen64: use set_fixmap for shared_info structure Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f3f11acf7856..dbe3549fad40 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -902,18 +902,11 @@ static __init void xen_pagetable_setup_start(pgd_t *base) void xen_setup_shared_info(void) { if (!xen_feature(XENFEAT_auto_translated_physmap)) { - unsigned long addr = fix_to_virt(FIX_PARAVIRT_BOOTMAP); - - /* - * Create a mapping for the shared info page. - * Should be set_fixmap(), but shared_info is a machine - * address with no corresponding pseudo-phys address. - */ - set_pte_mfn(addr, - PFN_DOWN(xen_start_info->shared_info), - PAGE_KERNEL); - - HYPERVISOR_shared_info = (struct shared_info *)addr; + set_fixmap(FIX_PARAVIRT_BOOTMAP, + xen_start_info->shared_info); + + HYPERVISOR_shared_info = + (struct shared_info *)fix_to_virt(FIX_PARAVIRT_BOOTMAP); } else HYPERVISOR_shared_info = (struct shared_info *)__va(xen_start_info->shared_info); @@ -1050,8 +1043,13 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_F00F_BUG case FIX_F00F_IDT: #endif +#ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: + case FIX_KMAP_BEGIN ... FIX_KMAP_END: +#else + case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: +#endif #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif -- cgit v1.2.2 From 7d087b68d6ddb2398fb7f6e45990b7248de640ef Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:48 -0700 Subject: xen: cpu_detect is 32-bit only Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index dbe3549fad40..2b7bea3bb6f3 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1365,12 +1365,12 @@ asmlinkage void __init xen_start_kernel(void) /* set the limit of our address space */ xen_reserve_top(); +#ifdef CONFIG_X86_32 /* set up basic CPUID stuff */ cpu_detect(&new_cpu_data); -#ifdef CONFIG_X86_32 new_cpu_data.hard_math = 1; -#endif new_cpu_data.x86_capability[0] = cpuid_edx(1); +#endif /* Poke various useful things into boot_params */ boot_params.hdr.type_of_loader = (9 << 4) | 0; -- cgit v1.2.2 From 3d75e1b8ef1567348ceba93d4666a1c7c2333583 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:49 -0700 Subject: xen64: add hypervisor callbacks for events, etc Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 98 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c340..7cc2de796146 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1312,3 +1312,101 @@ KPROBE_ENTRY(ignore_sysret) sysret CFI_ENDPROC ENDPROC(ignore_sysret) + +#ifdef CONFIG_XEN +ENTRY(xen_hypervisor_callback) + zeroentry xen_do_hypervisor_callback +END(xen_hypervisor_callback) + +/* +# A note on the "critical region" in our callback handler. +# We want to avoid stacking callback handlers due to events occurring +# during handling of the last event. To do this, we keep events disabled +# until we've done all processing. HOWEVER, we must enable events before +# popping the stack frame (can't be done atomically) and so it would still +# be possible to get enough handler activations to overflow the stack. +# Although unlikely, bugs of that kind are hard to track down, so we'd +# like to avoid the possibility. +# So, on entry to the handler we detect whether we interrupted an +# existing activation in its critical region -- if so, we pop the current +# activation and restart the handler using the previous one. +*/ +ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs) + CFI_STARTPROC +/* Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will + see the correct pointer to the pt_regs */ + movq %rdi, %rsp # we don't return, adjust the stack frame + CFI_ENDPROC + CFI_DEFAULT_STACK +11: incl %gs:pda_irqcount + movq %rsp,%rbp + CFI_DEF_CFA_REGISTER rbp + cmovzq %gs:pda_irqstackptr,%rsp + pushq %rbp # backlink for old unwinder + call xen_evtchn_do_upcall + popq %rsp + CFI_DEF_CFA_REGISTER rsp + decl %gs:pda_irqcount + jmp error_exit + CFI_ENDPROC +END(do_hypervisor_callback) + +/* +# Hypervisor uses this for application faults while it executes. +# We get here for two reasons: +# 1. Fault while reloading DS, ES, FS or GS +# 2. Fault while executing IRET +# Category 1 we do not need to fix up as Xen has already reloaded all segment +# registers that could be reloaded and zeroed the others. +# Category 2 we fix up by killing the current process. We cannot use the +# normal Linux return path in this case because if we use the IRET hypercall +# to pop the stack frame we end up in an infinite loop of failsafe callbacks. +# We distinguish between categories by comparing each saved segment register +# with its current contents: any discrepancy means we in category 1. +*/ +ENTRY(xen_failsafe_callback) +#if 1 + ud2a +#else + _frame (RIP-0x30) + CFI_REL_OFFSET rcx, 0 + CFI_REL_OFFSET r11, 8 + movw %ds,%cx + cmpw %cx,0x10(%rsp) + CFI_REMEMBER_STATE + jne 1f + movw %es,%cx + cmpw %cx,0x18(%rsp) + jne 1f + movw %fs,%cx + cmpw %cx,0x20(%rsp) + jne 1f + movw %gs,%cx + cmpw %cx,0x28(%rsp) + jne 1f + /* All segments match their saved values => Category 2 (Bad IRET). */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + movq $11,%rdi /* SIGSEGV */ + jmp do_exit + CFI_RESTORE_STATE +1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ + movq (%rsp),%rcx + CFI_RESTORE rcx + movq 8(%rsp),%r11 + CFI_RESTORE r11 + addq $0x30,%rsp + CFI_ADJUST_CFA_OFFSET -0x30 + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + SAVE_ALL + jmp error_exit + CFI_ENDPROC +#endif +END(xen_failsafe_callback) + +#endif /* CONFIG_XEN */ -- cgit v1.2.2 From 084a2a4e7656209ea93aac9778defa03213ca31d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:50 -0700 Subject: xen64: early mapping setup Set up the initial pagetables to map the kernel mapping into the physical mapping space. This makes __va() usable, since it requires physical mappings. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 192 +++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 176 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 2b7bea3bb6f3..a991ee7ade9e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1294,6 +1295,157 @@ static void __init xen_reserve_top(void) #endif /* CONFIG_X86_32 */ } +#ifdef CONFIG_X86_64 +/* + * Like __va(), but returns address in the kernel mapping (which is + * all we have until the physical memory mapping has been set up. + */ +static void *__ka(phys_addr_t paddr) +{ + return (void *)(paddr + __START_KERNEL_map); +} + +/* Convert a machine address to physical address */ +static unsigned long m2p(phys_addr_t maddr) +{ + phys_addr_t paddr; + + maddr &= PTE_MASK; + paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; + + return paddr; +} + +/* Convert a machine address to kernel virtual */ +static void *m2v(phys_addr_t maddr) +{ + return __ka(m2p(maddr)); +} + +static void walk(pgd_t *pgd, unsigned long addr) +{ + unsigned l4idx = pgd_index(addr); + unsigned l3idx = pud_index(addr); + unsigned l2idx = pmd_index(addr); + unsigned l1idx = pte_index(addr); + pgd_t l4; + pud_t l3; + pmd_t l2; + pte_t l1; + + xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", + pgd, addr, l4idx, l3idx, l2idx, l1idx); + + l4 = pgd[l4idx]; + xen_raw_printk(" l4: %016lx\n", l4.pgd); + xen_raw_printk(" %016lx\n", pgd_val(l4)); + + l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; + xen_raw_printk(" l3: %016lx\n", l3.pud); + xen_raw_printk(" %016lx\n", pud_val(l3)); + + l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; + xen_raw_printk(" l2: %016lx\n", l2.pmd); + xen_raw_printk(" %016lx\n", pmd_val(l2)); + + l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; + xen_raw_printk(" l1: %016lx\n", l1.pte); + xen_raw_printk(" %016lx\n", pte_val(l1)); +} + +static void set_page_prot(void *addr, pgprot_t prot) +{ + unsigned long pfn = __pa(addr) >> PAGE_SHIFT; + pte_t pte = pfn_pte(pfn, prot); + + xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n", + addr, pfn, get_phys_to_machine(pfn), + pgprot_val(prot), pte.pte); + + if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) + BUG(); +} + +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); +} + +/* + * Set up the inital kernel pagetable. + * + * We can construct this by grafting the Xen provided pagetable into + * head_64.S's preconstructed pagetables. We copy the Xen L2's into + * level2_ident_pgt, level2_kernel_pgt and level2_fixmap_pgt. This + * means that only the kernel has a physical mapping to start with - + * but that's enough to get __va working. We need to fill in the rest + * of the physical mapping once some sort of allocator has been set + * up. + */ +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +{ + pud_t *l3; + pmd_t *l2; + + /* Zap identity mapping */ + init_level4_pgt[0] = __pgd(0); + + /* Pre-constructed entries are in pfn, so convert to mfn */ + convert_pfn_mfn(init_level4_pgt); + convert_pfn_mfn(level3_ident_pgt); + convert_pfn_mfn(level3_kernel_pgt); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud); + + memcpy(level2_ident_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + memcpy(level2_kernel_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + l3 = m2v(pgd[pgd_index(__START_KERNEL_map + PMD_SIZE)].pgd); + l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); + memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + + /* Make pagetable pieces RO */ + set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); + + /* Pin down new L4 */ + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt))); + + /* Unpin Xen-provided one */ + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + /* Switch over */ + pgd = init_level4_pgt; + xen_write_cr3(__pa(pgd)); + + max_pfn_mapped = PFN_DOWN(__pa(pgd) + + xen_start_info->nr_pt_frames*PAGE_SIZE + + 512*1024); + + return pgd; +} +#else +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +{ + init_pg_tables_start = __pa(pgd); + init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; + max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + + return pgd; +} +#endif /* CONFIG_X86_64 */ + /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -1336,32 +1488,29 @@ asmlinkage void __init xen_start_kernel(void) pgd = (pgd_t *)xen_start_info->pt_base; -#ifdef CONFIG_X86_32 - init_pg_tables_start = __pa(pgd); - init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; - max_pfn_mapped = (init_pg_tables_end + 512*1024) >> PAGE_SHIFT; -#endif + /* Prevent unwanted bits from being set in PTEs. */ + __supported_pte_mask &= ~_PAGE_GLOBAL; + if (!is_initial_xendomain()) + __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); + + /* Don't do the full vcpu_info placement stuff until we have a + possible map and a non-dummy shared_info. */ + per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; + + xen_raw_console_write("mapping kernel into physical memory\n"); + pgd = xen_setup_kernel_pagetable(pgd); - init_mm.pgd = pgd; /* use the Xen pagetables to start */ + init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ x86_write_percpu(xen_cr3, __pa(pgd)); x86_write_percpu(xen_current_cr3, __pa(pgd)); - /* Don't do the full vcpu_info placement stuff until we have a - possible map and a non-dummy shared_info. */ - per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; - pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; - /* Prevent unwanted bits from being set in PTEs. */ - __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) - __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); - /* set the limit of our address space */ xen_reserve_top(); @@ -1384,10 +1533,21 @@ asmlinkage void __init xen_start_kernel(void) add_preferred_console("hvc", 0, NULL); } + xen_raw_console_write("about to get started...\n"); + +#if 0 + xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", + &boot_params, __pa_symbol(&boot_params), + __va(__pa_symbol(&boot_params))); + + walk(pgd, &boot_params); + walk(pgd, __va(__pa(&boot_params))); +#endif + /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); #else - x86_64_start_kernel((char *)&boot_params); + x86_64_start_reservations((char *)__pa_symbol(&boot_params)); #endif } -- cgit v1.2.2 From 22911b3f1cf5431058e56b1727e8ef77be5e0ac9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:51 -0700 Subject: xen64: 64-bit starts using set_pte from very early It also doesn't need the 32-bit hack version of set_pte for initial pagetable construction, so just make it use the real thing. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index a991ee7ade9e..392450787aa9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1194,7 +1194,11 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .kmap_atomic_pte = xen_kmap_atomic_pte, #endif +#ifdef CONFIG_X86_64 + .set_pte = xen_set_pte, +#else .set_pte = xen_set_pte_init, +#endif .set_pte_at = xen_set_pte_at, .set_pmd = xen_set_pmd_hyper, -- cgit v1.2.2 From d114e1981cc1a51131230993a082c27c79ab370a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:52 -0700 Subject: xen64: map an initial chunk of physical memory Early in boot, map a chunk of extra physical memory for use later on. We need a pool of mapped pages to allocate further pages to construct pagetables mapping all physical memory. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 79 ++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 69 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 392450787aa9..e9e3bafe48cf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1381,6 +1381,61 @@ static void convert_pfn_mfn(void *v) pte[i] = xen_make_pte(pte[i].pte); } +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +static __init void xen_map_identity_early(unsigned long max_pfn) +{ + unsigned pmdidx, pteidx; + unsigned ident_pte; + unsigned long pfn; + + ident_pte = 0; + pfn = 0; + for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { + pte_t *pte_page; + + BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd); + + /* Reuse or allocate a page of ptes */ + if (pmd_present(level2_ident_pgt[pmdidx])) + pte_page = m2v(level2_ident_pgt[pmdidx].pmd); + else { + /* Check for free pte pages */ + if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) + break; + + pte_page = &level1_ident_pgt[ident_pte]; + ident_pte += PTRS_PER_PTE; + + /* Install new l1 in l2(s) */ + level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); + level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx]; + } + + /* Install mappings */ + for(pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { + pte_t pte; + + if (pfn > max_pfn_mapped) + max_pfn_mapped = pfn; + + if (!pte_none(pte_page[pteidx])) + continue; + + pte = pfn_pte(pfn, PAGE_KERNEL_EXEC); + pte_page[pteidx] = pte; + } + } + + for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) + set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); +} + /* * Set up the inital kernel pagetable. * @@ -1392,7 +1447,7 @@ static void convert_pfn_mfn(void *v) * of the physical mapping once some sort of allocator has been set * up. */ -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { pud_t *l3; pmd_t *l2; @@ -1415,6 +1470,9 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) l2 = m2v(l3[pud_index(__START_KERNEL_map + PMD_SIZE)].pud); memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); + /* Set up identity map */ + xen_map_identity_early(max_pfn); + /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); @@ -1424,7 +1482,7 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(init_level4_pgt))); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1433,19 +1491,23 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) pgd = init_level4_pgt; xen_write_cr3(__pa(pgd)); - max_pfn_mapped = PFN_DOWN(__pa(pgd) + - xen_start_info->nr_pt_frames*PAGE_SIZE + - 512*1024); + reserve_early(__pa(xen_start_info->pt_base), + __pa(xen_start_info->pt_base + + xen_start_info->nr_pt_frames * PAGE_SIZE), + "XEN PAGETABLES"); return pgd; } #else -static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd) +static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); + x86_write_percpu(xen_cr3, __pa(pgd)); + x86_write_percpu(xen_current_cr3, __pa(pgd)); + return pgd; } #endif /* CONFIG_X86_64 */ @@ -1502,15 +1564,12 @@ asmlinkage void __init xen_start_kernel(void) per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; xen_raw_console_write("mapping kernel into physical memory\n"); - pgd = xen_setup_kernel_pagetable(pgd); + pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); init_mm.pgd = pgd; /* keep using Xen gdt for now; no urgent need to change it */ - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); - pv_info.kernel_rpl = 1; if (xen_feature(XENFEAT_supervisor_mode_kernel)) pv_info.kernel_rpl = 0; -- cgit v1.2.2 From 39dbc5bd345ebf93e066dde7f8e29467eb61b42e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:53 -0700 Subject: xen32: create initial mappings like 64-bit Rearrange the pagetable initialization to share code with the 64-bit kernel. Rather than deferring anything to pagetable_setup_start, just set up an initial pagetable in swapper_pg_dir early at startup, and create an additional 8MB of physical memory mappings. This matches the native head_32.S mappings to a large degree, and allows the rest of the pagetable setup to continue without much Xen vs. native difference. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 130 +++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e9e3bafe48cf..19c12a6c7311 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,50 +854,6 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) static __init void xen_pagetable_setup_start(pgd_t *base) { -#ifdef CONFIG_X86_32 - pgd_t *xen_pgd = (pgd_t *)xen_start_info->pt_base; - int i; - - init_mm.pgd = base; - /* - * copy top-level of Xen-supplied pagetable into place. This - * is a stand-in while we copy the pmd pages. - */ - memcpy(base, xen_pgd, PTRS_PER_PGD * sizeof(pgd_t)); - - /* - * For PAE, need to allocate new pmds, rather than - * share Xen's, since Xen doesn't like pmd's being - * shared between address spaces. - */ - for (i = 0; i < PTRS_PER_PGD; i++) { - if (pgd_val_ma(xen_pgd[i]) & _PAGE_PRESENT) { - pmd_t *pmd = (pmd_t *)alloc_bootmem_low_pages(PAGE_SIZE); - - memcpy(pmd, (void *)pgd_page_vaddr(xen_pgd[i]), - PAGE_SIZE); - - make_lowmem_page_readonly(pmd); - - set_pgd(&base[i], __pgd(1 + __pa(pmd))); - } else - pgd_clear(&base[i]); - } - - /* make sure zero_page is mapped RO so we can use it in pagetables */ - make_lowmem_page_readonly(empty_zero_page); - make_lowmem_page_readonly(base); - /* - * Switch to new pagetable. This is done before - * pagetable_init has done anything so that the new pages - * added to the table can be prepared properly for Xen. - */ - xen_write_cr3(__pa(base)); - - /* Unpin initial Xen pagetable */ - pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, - PFN_DOWN(__pa(xen_start_info->pt_base))); -#endif /* CONFIG_X86_32 */ } void xen_setup_shared_info(void) @@ -936,12 +892,6 @@ static __init void xen_pagetable_setup_done(pgd_t *base) pv_mmu_ops.set_pte = xen_set_pte; xen_setup_shared_info(); - -#ifdef CONFIG_X86_32 - /* Actually pin the pagetable down, but we can't set PG_pinned - yet because the page structures don't exist yet. */ - pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(base))); -#endif } static __init void xen_post_allocator_init(void) @@ -1299,14 +1249,17 @@ static void __init xen_reserve_top(void) #endif /* CONFIG_X86_32 */ } -#ifdef CONFIG_X86_64 /* * Like __va(), but returns address in the kernel mapping (which is * all we have until the physical memory mapping has been set up. */ static void *__ka(phys_addr_t paddr) { +#ifdef CONFIG_X86_64 return (void *)(paddr + __START_KERNEL_map); +#else + return __va(paddr); +#endif } /* Convert a machine address to physical address */ @@ -1326,6 +1279,7 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } +#ifdef CONFIG_X86_64 static void walk(pgd_t *pgd, unsigned long addr) { unsigned l4idx = pgd_index(addr); @@ -1356,13 +1310,14 @@ static void walk(pgd_t *pgd, unsigned long addr) xen_raw_printk(" l1: %016lx\n", l1.pte); xen_raw_printk(" %016lx\n", pte_val(l1)); } +#endif static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016x pte=%016x\n", + xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", addr, pfn, get_phys_to_machine(pfn), pgprot_val(prot), pte.pte); @@ -1370,17 +1325,6 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -static void convert_pfn_mfn(void *v) -{ - pte_t *pte = v; - int i; - - /* All levels are converted the same way, so just treat them - as ptes. */ - for(i = 0; i < PTRS_PER_PTE; i++) - pte[i] = xen_make_pte(pte[i].pte); -} - /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1388,7 +1332,7 @@ static void convert_pfn_mfn(void *v) */ static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; -static __init void xen_map_identity_early(unsigned long max_pfn) +static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; unsigned ident_pte; @@ -1399,11 +1343,9 @@ static __init void xen_map_identity_early(unsigned long max_pfn) for(pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { pte_t *pte_page; - BUG_ON(level2_ident_pgt[pmdidx].pmd != level2_kernel_pgt[pmdidx].pmd); - /* Reuse or allocate a page of ptes */ - if (pmd_present(level2_ident_pgt[pmdidx])) - pte_page = m2v(level2_ident_pgt[pmdidx].pmd); + if (pmd_present(pmd[pmdidx])) + pte_page = m2v(pmd[pmdidx].pmd); else { /* Check for free pte pages */ if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) @@ -1412,9 +1354,7 @@ static __init void xen_map_identity_early(unsigned long max_pfn) pte_page = &level1_ident_pgt[ident_pte]; ident_pte += PTRS_PER_PTE; - /* Install new l1 in l2(s) */ - level2_ident_pgt[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); - level2_kernel_pgt[pmdidx] = level2_ident_pgt[pmdidx]; + pmd[pmdidx] = __pmd(__pa(pte_page) | _PAGE_TABLE); } /* Install mappings */ @@ -1434,6 +1374,20 @@ static __init void xen_map_identity_early(unsigned long max_pfn) for(pteidx = 0; pteidx < ident_pte; pteidx += PTRS_PER_PTE) set_page_prot(&level1_ident_pgt[pteidx], PAGE_KERNEL_RO); + + set_page_prot(pmd, PAGE_KERNEL_RO); +} + +#ifdef CONFIG_X86_64 +static void convert_pfn_mfn(void *v) +{ + pte_t *pte = v; + int i; + + /* All levels are converted the same way, so just treat them + as ptes. */ + for(i = 0; i < PTRS_PER_PTE; i++) + pte[i] = xen_make_pte(pte[i].pte); } /* @@ -1471,18 +1425,18 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf memcpy(level2_fixmap_pgt, l2, sizeof(pmd_t) * PTRS_PER_PMD); /* Set up identity map */ - xen_map_identity_early(max_pfn); + xen_map_identity_early(level2_ident_pgt, max_pfn); /* Make pagetable pieces RO */ set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); - set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); /* Pin down new L4 */ - pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa_symbol(init_level4_pgt))); + pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, + PFN_DOWN(__pa_symbol(init_level4_pgt))); /* Unpin Xen-provided one */ pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); @@ -1498,17 +1452,37 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf return pgd; } -#else +#else /* !CONFIG_X86_64 */ +static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; + static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) { + pmd_t *kernel_pmd; + init_pg_tables_start = __pa(pgd); init_pg_tables_end = __pa(pgd) + xen_start_info->nr_pt_frames*PAGE_SIZE; max_pfn_mapped = PFN_DOWN(init_pg_tables_end + 512*1024); - x86_write_percpu(xen_cr3, __pa(pgd)); - x86_write_percpu(xen_current_cr3, __pa(pgd)); + kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); + memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); - return pgd; + xen_map_identity_early(level2_kernel_pgt, max_pfn); + + memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); + set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], + __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); + + set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); + set_page_prot(empty_zero_page, PAGE_KERNEL_RO); + + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); + + xen_write_cr3(__pa(swapper_pg_dir)); + + pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); + + return swapper_pg_dir; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.2 From ebd879e397f6361727c36267a12d1650710e465a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:54 -0700 Subject: xen: fix truncation of machine address arbitrary_virt_to_machine can truncate a machine address if its above 4G. Cast the problem away. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 2579e70cdd08..05d7392a7a4c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -186,7 +186,7 @@ xmaddr_t arbitrary_virt_to_machine(unsigned long address) BUG_ON(pte == NULL); - return XMADDR((pte_mfn(*pte) << PAGE_SHIFT) + offset); + return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); } void make_lowmem_page_readonly(void *vaddr) -- cgit v1.2.2 From ce803e705f1cbdd2703e83061622089b5b4a5417 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:55 -0700 Subject: xen64: use arbitrary_virt_to_machine for xen_set_pmd When building initial pagetables in 64-bit kernel the pud/pmd pointer may be in ioremap/fixmap space, so we need to walk the pagetable to look up the physical address. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 05d7392a7a4c..a8f023271819 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -178,8 +178,9 @@ void set_phys_to_machine(unsigned long pfn, unsigned long mfn) p2m_top[topidx][idx] = mfn; } -xmaddr_t arbitrary_virt_to_machine(unsigned long address) +xmaddr_t arbitrary_virt_to_machine(void *vaddr) { + unsigned long address = (unsigned long)vaddr; unsigned int level; pte_t *pte = lookup_address(address, &level); unsigned offset = address & ~PAGE_MASK; @@ -253,7 +254,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); extend_mmu_update(&u); @@ -415,7 +417,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) xen_mc_batch(); - u.ptr = virt_to_machine(ptr).maddr; + /* ptr may be ioremapped for 64-bit pagetable setup */ + u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); extend_mmu_update(&u); -- cgit v1.2.2 From 4560a2947e32670fc6ede108c2b032c396180649 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:56 -0700 Subject: xen: set num_processors Someone's got to do it. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 800bb2191e2a..8310ca0ea375 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -155,8 +155,10 @@ static void __init xen_fill_possible_map(void) for (i = 0; i < NR_CPUS; i++) { rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); - if (rc >= 0) + if (rc >= 0) { + num_processors++; cpu_set(i, cpu_possible_map); + } } } -- cgit v1.2.2 From 8745f8b0b914cf1d617ecc49726c24011858c74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:57 -0700 Subject: xen64: defer setting pagetable alloc/release ops We need to wait until the page structure is available to use the proper pagetable page alloc/release operations, since they use struct page to determine if a pagetable is pinned. This happened to work in 32bit because nobody allocated new pagetable pages in the interim between xen_pagetable_setup_done and xen_post_allocator_init, but the 64-bit kenrel needs to allocate more pagetable levels. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 19c12a6c7311..da91404fc66c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -878,30 +878,29 @@ void xen_setup_shared_info(void) static __init void xen_pagetable_setup_done(pgd_t *base) { - /* This will work as long as patching hasn't happened yet - (which it hasn't) */ - pv_mmu_ops.alloc_pte = xen_alloc_pte; - pv_mmu_ops.alloc_pmd = xen_alloc_pmd; - pv_mmu_ops.release_pte = xen_release_pte; - pv_mmu_ops.release_pmd = xen_release_pmd; -#if PAGETABLE_LEVELS == 4 - pv_mmu_ops.alloc_pud = xen_alloc_pud; - pv_mmu_ops.release_pud = xen_release_pud; -#endif - - pv_mmu_ops.set_pte = xen_set_pte; - xen_setup_shared_info(); } static __init void xen_post_allocator_init(void) { + pv_mmu_ops.set_pte = xen_set_pte; pv_mmu_ops.set_pmd = xen_set_pmd; pv_mmu_ops.set_pud = xen_set_pud; #if PAGETABLE_LEVELS == 4 pv_mmu_ops.set_pgd = xen_set_pgd; #endif + /* This will work as long as patching hasn't happened yet + (which it hasn't) */ + pv_mmu_ops.alloc_pte = xen_alloc_pte; + pv_mmu_ops.alloc_pmd = xen_alloc_pmd; + pv_mmu_ops.release_pte = xen_release_pte; + pv_mmu_ops.release_pmd = xen_release_pmd; +#if PAGETABLE_LEVELS == 4 + pv_mmu_ops.alloc_pud = xen_alloc_pud; + pv_mmu_ops.release_pud = xen_release_pud; +#endif + xen_mark_init_mm_pinned(); } -- cgit v1.2.2 From 836fe2f291cb450a6193fa713878efe7d32bec6e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:06:58 -0700 Subject: xen: use set_pte_vaddr Make Xen's set_pte_mfn() use set_pte_vaddr rather than copying it. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Juan Quintela Signed-off-by: Mark McLoughlin Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 30 +----------------------------- 1 file changed, 1 insertion(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a8f023271819..eb31ed291b93 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -282,35 +282,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) */ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - pgd = swapper_pg_dir + pgd_index(vaddr); - if (pgd_none(*pgd)) { - BUG(); - return; - } - pud = pud_offset(pgd, vaddr); - if (pud_none(*pud)) { - BUG(); - return; - } - pmd = pmd_offset(pud, vaddr); - if (pmd_none(*pmd)) { - BUG(); - return; - } - pte = pte_offset_kernel(pmd, vaddr); - /* stored as-is, to permit clearing entries */ - xen_set_pte(pte, mfn_pte(mfn, flags)); - - /* - * It's enough to flush this one mapping. - * (PGE mappings get flushed as well) - */ - __flush_tlb_one(vaddr); + set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, -- cgit v1.2.2 From e176d367d0cc8b8efd2e0960c9edf5d2fe7cd9f1 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:06:59 -0700 Subject: xen64: xen_write_idt_entry() and cvt_gate_to_trap() Changed to use the (to-be-)unified descriptor structs. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da91404fc66c..f5e96f7a4c5c 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -401,23 +401,18 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, preempt_enable(); } -static int cvt_gate_to_trap(int vector, u32 low, u32 high, +static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - u8 type, dpl; - - type = (high >> 8) & 0x1f; - dpl = (high >> 13) & 3; - - if (type != 0xf && type != 0xe) + if (val->type != 0xf && val->type != 0xe) return 0; info->vector = vector; - info->address = (high & 0xffff0000) | (low & 0x0000ffff); - info->cs = low >> 16; - info->flags = dpl; + info->address = gate_offset(*val); + info->cs = gate_segment(*val); + info->flags = val->dpl; /* interrupt gates clear IF */ - if (type == 0xe) + if (val->type == 0xe) info->flags |= 4; return 1; @@ -444,11 +439,10 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) if (p >= start && (p + 8) <= end) { struct trap_info info[2]; - u32 *desc = (u32 *)g; info[1].address = 0; - if (cvt_gate_to_trap(entrynum, desc[0], desc[1], &info[0])) + if (cvt_gate_to_trap(entrynum, g, &info[0])) if (HYPERVISOR_set_trap_table(info)) BUG(); } @@ -461,13 +455,13 @@ static void xen_convert_trap_info(const struct desc_ptr *desc, { unsigned in, out, count; - count = (desc->size+1) / 8; + count = (desc->size+1) / sizeof(gate_desc); BUG_ON(count > 256); for (in = out = 0; in < count; in++) { - const u32 *entry = (u32 *)(desc->address + in * 8); + gate_desc *entry = (gate_desc*)(desc->address) + in; - if (cvt_gate_to_trap(in, entry[0], entry[1], &traps[out])) + if (cvt_gate_to_trap(in, entry, &traps[out])) out++; } traps[out].address = 0; -- cgit v1.2.2 From 997409d3d0bd6894f33e31ced251c0fdf523aa14 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:00 -0700 Subject: xen64: deal with extra words Xen pushes onto exception frames Xen pushes two extra words containing the values of rcx and r11. This pvop hook copies the words back into their appropriate registers, and cleans them off the stack. This leaves the stack in native form, so the normal handler can run unchanged. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/xen-asm_64.S | 5 +++++ arch/x86/xen/xen-ops.h | 2 ++ 3 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f5e96f7a4c5c..9d94483b3b5e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1091,7 +1091,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { .safe_halt = xen_safe_halt, .halt = xen_halt, #ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, + .adjust_exception_frame = xen_adjust_exception_frame, #endif }; diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 4ec10827370b..b147b495daef 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -133,6 +133,11 @@ check_events: ret #endif +ENTRY(xen_adjust_exception_frame) + mov 8+0(%rsp),%rcx + mov 8+8(%rsp),%r11 + ret $16 + ENTRY(xen_iret) pushq $0 jmp hypercall_page + __HYPERVISOR_iret * 32 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index aca4a7803e2c..c4800a2c5a41 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -67,7 +67,9 @@ DECL_ASM(void, xen_irq_disable_direct, void); DECL_ASM(unsigned long, xen_save_fl_direct, void); DECL_ASM(void, xen_restore_fl_direct, unsigned long); +/* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */ -- cgit v1.2.2 From 952d1d7055c8cbf95b4ad2f90be5ed37db8a48ee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:01 -0700 Subject: xen64: add pvop for swapgs swapgs is a no-op under Xen, because the hypervisor makes sure the right version of %gs is current when switching between user and kernel modes. This means that the swapgs "implementation" can be inlined and used when the stack is unsafe (usermode). Unfortunately, it means that disabling patching will result in a non-booting kernel... Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9d94483b3b5e..8b60982e457a 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1076,6 +1076,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .set_iopl_mask = xen_set_iopl_mask, .io_delay = xen_io_delay, + /* Xen takes care of %gs when switching to usermode for us */ + .swapgs = paravirt_nop, + .lazy_mode = { .enter = paravirt_enter_lazy_cpu, .leave = xen_leave_lazy, -- cgit v1.2.2 From 88459d4c7eb68c4a15609e00e5d100e2a305f040 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:02 -0700 Subject: xen64: register callbacks in arch-independent way Use callback_op hypercall to register callbacks in a 32/64-bit independent way (64-bit doesn't need a code segment, but that detail is hidden in XEN_CALLBACK). Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f52f3855fb6b..bea3d4f779db 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -91,19 +91,25 @@ static void __init fiddle_vdso(void) *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; } -void xen_enable_sysenter(void) +static __cpuinit int register_callback(unsigned type, const void *func) { - int cpu = smp_processor_id(); - extern void xen_sysenter_target(void); - /* Mask events on entry, even though they get enabled immediately */ - static struct callback_register sysenter = { - .type = CALLBACKTYPE_sysenter, - .address = XEN_CALLBACK(__KERNEL_CS, xen_sysenter_target), + struct callback_register callback = { + .type = type, + .address = XEN_CALLBACK(__KERNEL_CS, func), .flags = CALLBACKF_mask_events, }; + return HYPERVISOR_callback_op(CALLBACKOP_register, &callback); +} + +void __cpuinit xen_enable_sysenter(void) +{ + int cpu = smp_processor_id(); + extern void xen_sysenter_target(void); + if (!boot_cpu_has(X86_FEATURE_SEP) || - HYPERVISOR_callback_op(CALLBACKOP_register, &sysenter) != 0) { + register_callback(CALLBACKTYPE_sysenter, + xen_sysenter_target) != 0) { clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); } @@ -120,8 +126,9 @@ void __init xen_arch_setup(void) if (!xen_feature(XENFEAT_auto_translated_physmap)) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_pae_extended_cr3); - HYPERVISOR_set_callbacks(__KERNEL_CS, (unsigned long)xen_hypervisor_callback, - __KERNEL_CS, (unsigned long)xen_failsafe_callback); + if (register_callback(CALLBACKTYPE_event, xen_hypervisor_callback) || + register_callback(CALLBACKTYPE_failsafe, xen_failsafe_callback)) + BUG(); xen_enable_sysenter(); -- cgit v1.2.2 From 0725cbb97793d4e65bf148e4872959cdbb8c6ddd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:03 -0700 Subject: xen64: add identity irq->vector map The x86_64 interrupt subsystem is oriented towards vectors, as opposed to a flat irq space as it is in x86-32. This patch adds a simple identity irq->vector mapping so that we can continue to feed irqs into do_IRQ() and get a good result. Ideally x86_32 will unify with the 64-bit code and use vectors too. At that point we can move to mapping event channels to vectors, which will allow us to economise on irqs (so per-cpu event channels can share irqs, rather than having to allocte one per cpu, for example). Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8b60982e457a..52f2292672c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1085,8 +1085,25 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = xen_init_IRQ, + .init_IRQ = __xen_init_IRQ, .save_fl = xen_save_fl, .restore_fl = xen_restore_fl, .irq_disable = xen_irq_disable, -- cgit v1.2.2 From a8fc1089e49caa5dca346dfacb5c84abf9a22a0c Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:07:05 -0700 Subject: xen64: implement xen_load_gs_index() xen-64: implement xen_load_gs_index() Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 52f2292672c4..3b6b7fcf5b55 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -385,6 +385,14 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) loadsegment(gs, 0); } +#ifdef CONFIG_X86_64 +static void xen_load_gs_index(unsigned int idx) +{ + if (HYPERVISOR_set_segment_base(SEGBASE_GS_USER_SEL, idx)) + BUG(); +} +#endif + static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { @@ -1063,6 +1071,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gdt = xen_load_gdt, .load_idt = xen_load_idt, .load_tls = xen_load_tls, +#ifdef CONFIG_X86_64 + .load_gs_index = xen_load_gs_index, +#endif .store_gdt = native_store_gdt, .store_idt = native_store_idt, -- cgit v1.2.2 From 5deb30d194d28b6bf7dacfb758267a51bf7c5b78 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:06 -0700 Subject: xen: rework pgd_walk to deal with 32/64 bit Rewrite pgd_walk to deal with 64-bit address spaces. There are two notible features of 64-bit workspaces: 1. The physical address is only 48 bits wide, with the upper 16 bits being sign extension; kernel addresses are negative, and userspace is positive. 2. The Xen hypervisor mapping is at the negative-most address, just above the sign-extension hole. 1. means that we can't easily use addresses when traversing the space, since we must deal with sign extension. This rewrite expresses everything in terms of pgd/pud/pmd indices, which means we don't need to worry about the exact configuration of the virtual memory space. This approach works equally well in 32-bit. To deal with 2, assume the hole is between the uppermost userspace address and PAGE_OFFSET. For 64-bit this skips the Xen mapping hole. For 32-bit, the hole is zero-sized. In all cases, the uppermost kernel address is FIXADDR_TOP. A side-effect of this patch is that the upper boundary is actually handled properly, exposing a long-standing bug in 32-bit, which failed to pin kernel pmd page. The kernel pmd is not shared, and so must be explicitly pinned, even though the kernel ptes are shared and don't need pinning. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 115 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 75 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index eb31ed291b93..046c1f23dd6e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -44,6 +44,7 @@ #include #include +#include #include #include #include @@ -491,77 +492,103 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) #endif /* PAGETABLE_LEVELS == 4 */ /* - (Yet another) pagetable walker. This one is intended for pinning a - pagetable. This means that it walks a pagetable and calls the - callback function on each page it finds making up the page table, - at every level. It walks the entire pagetable, but it only bothers - pinning pte pages which are below pte_limit. In the normal case - this will be TASK_SIZE, but at boot we need to pin up to - FIXADDR_TOP. But the important bit is that we don't pin beyond - there, because then we start getting into Xen's ptes. -*/ -static int pgd_walk(pgd_t *pgd_base, int (*func)(struct page *, enum pt_level), + * (Yet another) pagetable walker. This one is intended for pinning a + * pagetable. This means that it walks a pagetable and calls the + * callback function on each page it finds making up the page table, + * at every level. It walks the entire pagetable, but it only bothers + * pinning pte pages which are below limit. In the normal case this + * will be STACK_TOP_MAX, but at boot we need to pin up to + * FIXADDR_TOP. + * + * For 32-bit the important bit is that we don't pin beyond there, + * because then we start getting into Xen's ptes. + * + * For 64-bit, we must skip the Xen hole in the middle of the address + * space, just after the big x86-64 virtual hole. + */ +static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), unsigned long limit) { - pgd_t *pgd = pgd_base; int flush = 0; - unsigned long addr = 0; - unsigned long pgd_next; + unsigned hole_low, hole_high; + unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; + unsigned pgdidx, pudidx, pmdidx; - BUG_ON(limit > FIXADDR_TOP); + /* The limit is the last byte to be touched */ + limit--; + BUG_ON(limit >= FIXADDR_TOP); if (xen_feature(XENFEAT_auto_translated_physmap)) return 0; - for (; addr != FIXADDR_TOP; pgd++, addr = pgd_next) { + /* + * 64-bit has a great big hole in the middle of the address + * space, which contains the Xen mappings. On 32-bit these + * will end up making a zero-sized hole and so is a no-op. + */ + hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1); + hole_high = pgd_index(PAGE_OFFSET); + + pgdidx_limit = pgd_index(limit); +#if PTRS_PER_PUD > 1 + pudidx_limit = pud_index(limit); +#else + pudidx_limit = 0; +#endif +#if PTRS_PER_PMD > 1 + pmdidx_limit = pmd_index(limit); +#else + pmdidx_limit = 0; +#endif + + flush |= (*func)(virt_to_page(pgd), PT_PGD); + + for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; - unsigned long pud_limit, pud_next; - pgd_next = pud_limit = pgd_addr_end(addr, FIXADDR_TOP); + if (pgdidx >= hole_low && pgdidx < hole_high) + continue; - if (!pgd_val(*pgd)) + if (!pgd_val(pgd[pgdidx])) continue; - pud = pud_offset(pgd, 0); + pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ flush |= (*func)(virt_to_page(pud), PT_PUD); - for (; addr != pud_limit; pud++, addr = pud_next) { + for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; - unsigned long pmd_limit; - pud_next = pud_addr_end(addr, pud_limit); - - if (pud_next < limit) - pmd_limit = pud_next; - else - pmd_limit = limit; + if (pgdidx == pgdidx_limit && + pudidx > pudidx_limit) + goto out; - if (pud_none(*pud)) + if (pud_none(pud[pudidx])) continue; - pmd = pmd_offset(pud, 0); + pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ flush |= (*func)(virt_to_page(pmd), PT_PMD); - for (; addr != pmd_limit; pmd++) { - addr += (PAGE_SIZE * PTRS_PER_PTE); - if ((pmd_limit-1) < (addr-1)) { - addr = pmd_limit; - break; - } + for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { + struct page *pte; + + if (pgdidx == pgdidx_limit && + pudidx == pudidx_limit && + pmdidx > pmdidx_limit) + goto out; - if (pmd_none(*pmd)) + if (pmd_none(pmd[pmdidx])) continue; - flush |= (*func)(pmd_page(*pmd), PT_PTE); + pte = pmd_page(pmd[pmdidx]); + flush |= (*func)(pte, PT_PTE); } } } - - flush |= (*func)(virt_to_page(pgd_base), PT_PGD); +out: return flush; } @@ -650,6 +677,11 @@ void xen_pgd_pin(pgd_t *pgd) xen_mc_batch(); } +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is pinnable */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif + xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); xen_mc_issue(0); } @@ -731,6 +763,10 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_PAE + /* Need to make sure unshared kernel PMD is unpinned */ + pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); +#endif pgd_walk(pgd, unpin_page, TASK_SIZE); xen_mc_issue(0); @@ -750,7 +786,6 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - printk("unpinning pinned %p\n", page_address(page)); xen_pgd_unpin((pgd_t *)page_address(page)); ClearPageSavePinned(page); } -- cgit v1.2.2 From b7c3c5c15936a40c79ef40af7b3bac801c7feb20 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:07 -0700 Subject: xen: make sure the kernel command line is right Point the boot params cmd_line_ptr to the domain-builder-provided command line. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 3b6b7fcf5b55..0172ba774523 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1587,6 +1587,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_image = xen_start_info->mod_start ? __pa(xen_start_info->mod_start) : 0; boot_params.hdr.ramdisk_size = xen_start_info->mod_len; + boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); if (!is_initial_xendomain()) { add_preferred_console("xenboot", 0, NULL); -- cgit v1.2.2 From 4a5c3e77f70b3ea8b361d7fa9eb2e4dad18f70ae Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:09 -0700 Subject: xen64: implement failsafe callback Implement the failsafe callback, so that iret and segment register load exceptions are reported to the kernel. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 7cc2de796146..6aa6932e21b1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1365,10 +1365,8 @@ END(do_hypervisor_callback) # with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) -#if 1 - ud2a -#else - _frame (RIP-0x30) + framesz = (RIP-0x30) /* workaround buggy gas */ + _frame framesz CFI_REL_OFFSET rcx, 0 CFI_REL_OFFSET r11, 8 movw %ds,%cx @@ -1391,8 +1389,13 @@ ENTRY(xen_failsafe_callback) CFI_RESTORE r11 addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 - movq $11,%rdi /* SIGSEGV */ - jmp do_exit + pushq $0 + CFI_ADJUST_CFA_OFFSET 8 + pushq %r11 + CFI_ADJUST_CFA_OFFSET 8 + pushq %rcx + CFI_ADJUST_CFA_OFFSET 8 + jmp general_protection CFI_RESTORE_STATE 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp),%rcx @@ -1406,7 +1409,6 @@ ENTRY(xen_failsafe_callback) SAVE_ALL jmp error_exit CFI_ENDPROC -#endif END(xen_failsafe_callback) #endif /* CONFIG_XEN */ -- cgit v1.2.2 From 8a95408e183b3e4aaf3b6a66fa34bff4db53011b Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Tue, 8 Jul 2008 15:07:10 -0700 Subject: xen64: Clear %fs on xen_load_tls() We need to do this, otherwise we can get a GPF on hypercall return after TLS descriptor is cleared but %fs is still pointing to it. Signed-off-by: Eduardo Habkost Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0172ba774523..c13698faae54 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -364,14 +364,6 @@ static void load_TLS_descriptor(struct thread_struct *t, static void xen_load_tls(struct thread_struct *t, unsigned int cpu) { - xen_mc_batch(); - - load_TLS_descriptor(t, cpu, 0); - load_TLS_descriptor(t, cpu, 1); - load_TLS_descriptor(t, cpu, 2); - - xen_mc_issue(PARAVIRT_LAZY_CPU); - /* * XXX sleazy hack: If we're being called in a lazy-cpu zone, * it means we're in a context switch, and %gs has just been @@ -380,9 +372,30 @@ static void xen_load_tls(struct thread_struct *t, unsigned int cpu) * Either way, it has been saved, and the new value will get * loaded properly. This will go away as soon as Xen has been * modified to not save/restore %gs for normal hypercalls. + * + * On x86_64, this hack is not used for %gs, because gs points + * to KERNEL_GS_BASE (and uses it for PDA references), so we + * must not zero %gs on x86_64 + * + * For x86_64, we need to zero %fs, otherwise we may get an + * exception between the new %fs descriptor being loaded and + * %fs being effectively cleared at __switch_to(). */ - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) + if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { +#ifdef CONFIG_X86_32 loadsegment(gs, 0); +#else + loadsegment(fs, 0); +#endif + } + + xen_mc_batch(); + + load_TLS_descriptor(t, cpu, 0); + load_TLS_descriptor(t, cpu, 1); + load_TLS_descriptor(t, cpu, 2); + + xen_mc_issue(PARAVIRT_LAZY_CPU); } #ifdef CONFIG_X86_64 -- cgit v1.2.2 From d6182fbf04164016cb6540db02eef3d6bdc967c3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:13 -0700 Subject: xen64: allocate and manage user pagetables Because the x86_64 architecture does not enforce segment limits, Xen cannot protect itself with them as it does in 32-bit mode. Therefore, to protect itself, it runs the guest kernel in ring 3. Since it also runs the guest userspace in ring3, the guest kernel must maintain a second pagetable for its userspace, which does not map kernel space. Naturally, the guest kernel pagetables map both kernel and userspace. The userspace pagetable is attached to the corresponding kernel pagetable via the pgd's page->private field. It is allocated and freed at the same time as the kernel pgd via the paravirt_pgd_alloc/free hooks. Fortunately, the user pagetable is almost entirely shared with the kernel pagetable; the only difference is the pgd page itself. set_pgd will populate all entries in the kernel pagetable, and also set the corresponding user pgd entry if the address is less than STACK_TOP_MAX. The user pagetable must be pinned and unpinned with the kernel one, but because the pagetables are aliased, pgd_walk() only needs to be called on the kernel pagetable. The user pgd page is then pinned/unpinned along with the kernel pgd page. xen_write_cr3 must write both the kernel and user cr3s. The init_mm.pgd pagetable never has a user pagetable allocated for it, because it can never be used while running usermode. One awkward area is that early in boot the page structures are not available. No user pagetable can exist at that point, but it complicates the logic to avoid looking at the page structure. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 99 ++++++++++++++++++++++++++++++++++++++++-------- arch/x86/xen/mmu.c | 91 +++++++++++++++++++++++++++++++++++++++----- arch/x86/xen/mmu.h | 2 + 3 files changed, 168 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index c13698faae54..48f1a7eca8b9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -46,7 +46,6 @@ #include #include #include -#include #include "xen-ops.h" #include "mmu.h" @@ -711,29 +710,57 @@ static void set_current_cr3(void *v) x86_write_percpu(xen_current_cr3, (unsigned long)v); } -static void xen_write_cr3(unsigned long cr3) +static void __xen_write_cr3(bool kernel, unsigned long cr3) { struct mmuext_op *op; struct multicall_space mcs; - unsigned long mfn = pfn_to_mfn(PFN_DOWN(cr3)); + unsigned long mfn; - BUG_ON(preemptible()); + if (cr3) + mfn = pfn_to_mfn(PFN_DOWN(cr3)); + else + mfn = 0; - mcs = xen_mc_entry(sizeof(*op)); /* disables interrupts */ + WARN_ON(mfn == 0 && kernel); - /* Update while interrupts are disabled, so its atomic with - respect to ipis */ - x86_write_percpu(xen_cr3, cr3); + mcs = __xen_mc_entry(sizeof(*op)); op = mcs.args; - op->cmd = MMUEXT_NEW_BASEPTR; + op->cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; op->arg1.mfn = mfn; MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); - /* Update xen_update_cr3 once the batch has actually - been submitted. */ - xen_mc_callback(set_current_cr3, (void *)cr3); + if (kernel) { + x86_write_percpu(xen_cr3, cr3); + + /* Update xen_current_cr3 once the batch has actually + been submitted. */ + xen_mc_callback(set_current_cr3, (void *)cr3); + } +} + +static void xen_write_cr3(unsigned long cr3) +{ + BUG_ON(preemptible()); + + xen_mc_batch(); /* disables interrupts */ + + /* Update while interrupts are disabled, so its atomic with + respect to ipis */ + x86_write_percpu(xen_cr3, cr3); + + __xen_write_cr3(true, cr3); + +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); + if (user_pgd) + __xen_write_cr3(false, __pa(user_pgd)); + else + __xen_write_cr3(false, 0); + } +#endif xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } @@ -794,6 +821,40 @@ static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) xen_alloc_ptpage(mm, pfn, PT_PMD); } +static int xen_pgd_alloc(struct mm_struct *mm) +{ + pgd_t *pgd = mm->pgd; + int ret = 0; + + BUG_ON(PagePinned(virt_to_page(pgd))); + +#ifdef CONFIG_X86_64 + { + struct page *page = virt_to_page(pgd); + + BUG_ON(page->private != 0); + + page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO); + if (page->private == 0) + ret = -ENOMEM; + + BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); + } +#endif + + return ret; +} + +static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) +{ +#ifdef CONFIG_X86_64 + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) + free_page((unsigned long)user_pgd); +#endif +} + /* This should never happen until we're OK to use struct page */ static void xen_release_ptpage(u32 pfn, unsigned level) { @@ -1168,8 +1229,8 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .pte_update = paravirt_nop, .pte_update_defer = paravirt_nop, - .pgd_alloc = __paravirt_pgd_alloc, - .pgd_free = paravirt_nop, + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, @@ -1480,7 +1541,15 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf /* Switch over */ pgd = init_level4_pgt; - xen_write_cr3(__pa(pgd)); + + /* + * At this stage there can be no user pgd, and no page + * structure to attach it to, so make sure we just set kernel + * pgd. + */ + xen_mc_batch(); + __xen_write_cr3(true, __pa(pgd)); + xen_mc_issue(PARAVIRT_LAZY_CPU); reserve_early(__pa(xen_start_info->pt_base), __pa(xen_start_info->pt_base + diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 046c1f23dd6e..a44d56e38bd1 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -58,6 +58,13 @@ #include "multicalls.h" #include "mmu.h" +/* + * Just beyond the highest usermode address. STACK_TOP_MAX has a + * redzone above it, so round it up to a PGD boundary. + */ +#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) + + #define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long)) #define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE) @@ -461,17 +468,45 @@ pud_t xen_make_pud(pudval_t pud) return native_make_pud(pud); } -void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +pgd_t *xen_get_user_pgd(pgd_t *pgd) { - struct mmu_update u; + pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); + unsigned offset = pgd - pgd_page; + pgd_t *user_ptr = NULL; - preempt_disable(); + if (offset < pgd_index(USER_LIMIT)) { + struct page *page = virt_to_page(pgd_page); + user_ptr = (pgd_t *)page->private; + if (user_ptr) + user_ptr += offset; + } - xen_mc_batch(); + return user_ptr; +} + +static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + struct mmu_update u; u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); extend_mmu_update(&u); +} + +/* + * Raw hypercall-based set_pgd, intended for in early boot before + * there's a page structure. This implies: + * 1. The only existing pagetable is the kernel's + * 2. It is always pinned + * 3. It has no user pagetable attached to it + */ +void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +{ + preempt_disable(); + + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -480,14 +515,28 @@ void xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) void xen_set_pgd(pgd_t *ptr, pgd_t val) { + pgd_t *user_ptr = xen_get_user_pgd(ptr); + /* If page is not pinned, we can just update the entry directly */ if (!page_pinned(ptr)) { *ptr = val; + if (user_ptr) { + WARN_ON(page_pinned(user_ptr)); + *user_ptr = val; + } return; } - xen_set_pgd_hyper(ptr, val); + /* If it's pinned, then we can at least batch the kernel and + user updates together. */ + xen_mc_batch(); + + __xen_set_pgd_hyper(ptr, val); + if (user_ptr) + __xen_set_pgd_hyper(user_ptr, val); + + xen_mc_issue(PARAVIRT_LAZY_MMU); } #endif /* PAGETABLE_LEVELS == 4 */ @@ -526,7 +575,7 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), * space, which contains the Xen mappings. On 32-bit these * will end up making a zero-sized hole and so is a no-op. */ - hole_low = pgd_index(STACK_TOP_MAX + PGDIR_SIZE - 1); + hole_low = pgd_index(USER_LIMIT); hole_high = pgd_index(PAGE_OFFSET); pgdidx_limit = pgd_index(limit); @@ -670,19 +719,31 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, TASK_SIZE)) { + if (pgd_walk(pgd, pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); xen_mc_batch(); } +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); + + if (user_pgd) { + pin_page(virt_to_page(user_pgd), PT_PGD); + xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); + } + } +#else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); +#endif /* CONFIG_X86_64 */ xen_mc_issue(0); } @@ -763,11 +824,23 @@ static void xen_pgd_unpin(pgd_t *pgd) xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); +#ifdef CONFIG_X86_64 + { + pgd_t *user_pgd = xen_get_user_pgd(pgd); + + if (user_pgd) { + xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); + unpin_page(virt_to_page(user_pgd), PT_PGD); + } + } +#endif + #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, TASK_SIZE); + + pgd_walk(pgd, unpin_page, USER_LIMIT); xen_mc_issue(0); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 19d544b0b6c6..0f59bd03f9e3 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -51,6 +51,8 @@ void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); #endif +pgd_t *xen_get_user_pgd(pgd_t *pgd); + pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -- cgit v1.2.2 From 6fcac6d305e8238939e169f4c52e8ec8a552a31f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:14 -0700 Subject: xen64: set up syscall and sysenter entrypoints for 64-bit We set up entrypoints for syscall and sysenter. sysenter is only used for 32-bit compat processes, whereas syscall can be used in by both 32 and 64-bit processes. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++ arch/x86/xen/setup.c | 42 +++++++++++++-- arch/x86/xen/smp.c | 1 + arch/x86/xen/xen-asm_64.S | 129 +++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/xen-ops.h | 3 ++ 5 files changed, 174 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 48f1a7eca8b9..87d36044054d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1139,6 +1139,10 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .iret = xen_iret, .irq_enable_sysexit = xen_sysexit, +#ifdef CONFIG_X86_64 + .usergs_sysret32 = xen_sysret32, + .usergs_sysret64 = xen_sysret64, +#endif .load_tr_desc = paravirt_nop, .set_ldt = xen_set_ldt, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index bea3d4f779db..9d7a14402895 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -86,9 +86,11 @@ static void xen_idle(void) */ static void __init fiddle_vdso(void) { +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern const char vdso32_default_start; u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; +#endif } static __cpuinit int register_callback(unsigned type, const void *func) @@ -106,15 +108,48 @@ void __cpuinit xen_enable_sysenter(void) { int cpu = smp_processor_id(); extern void xen_sysenter_target(void); + int ret; + +#ifdef CONFIG_X86_32 + if (!boot_cpu_has(X86_FEATURE_SEP)) { + return; + } +#else + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && + boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) { + return; + } +#endif - if (!boot_cpu_has(X86_FEATURE_SEP) || - register_callback(CALLBACKTYPE_sysenter, - xen_sysenter_target) != 0) { + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); + if(ret != 0) { clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); } } +void __cpuinit xen_enable_syscall(void) +{ +#ifdef CONFIG_X86_64 + int cpu = smp_processor_id(); + int ret; + extern void xen_syscall_target(void); + extern void xen_syscall32_target(void); + + ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); + if (ret != 0) { + printk("failed to set syscall: %d\n", ret); + clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SYSCALL); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SYSCALL); + } else { + ret = register_callback(CALLBACKTYPE_syscall32, + xen_syscall32_target); + if (ret != 0) + printk("failed to set 32-bit syscall: %d\n", ret); + } +#endif /* CONFIG_X86_64 */ +} + void __init xen_arch_setup(void) { struct physdev_set_iopl set_iopl; @@ -131,6 +166,7 @@ void __init xen_arch_setup(void) BUG(); xen_enable_sysenter(); + xen_enable_syscall(); set_iopl.iopl = 1; rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8310ca0ea375..f702199312a5 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -69,6 +69,7 @@ static __cpuinit void cpu_bringup_and_idle(void) preempt_disable(); xen_enable_sysenter(); + xen_enable_syscall(); cpu = smp_processor_id(); smp_store_cpu_info(cpu); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index b147b495daef..4038cbfe3331 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -15,6 +15,8 @@ #include #include +#include +#include #include @@ -138,9 +140,132 @@ ENTRY(xen_adjust_exception_frame) mov 8+8(%rsp),%r11 ret $16 +hypercall_iret = hypercall_page + __HYPERVISOR_iret * 32 +/* + Xen64 iret frame: + + ss + rsp + rflags + cs + rip <-- standard iret frame + + flags + + rcx } + r11 }<-- pushed by hypercall page +rsp -> rax } + */ ENTRY(xen_iret) pushq $0 - jmp hypercall_page + __HYPERVISOR_iret * 32 +1: jmp hypercall_iret +ENDPATCH(xen_iret) +RELOC(xen_iret, 1b+1) +/* + sysexit is not used for 64-bit processes, so it's + only ever used to return to 32-bit compat userspace. + */ ENTRY(xen_sysexit) - ud2a + pushq $__USER32_DS + pushq %rcx + pushq $X86_EFLAGS_IF + pushq $__USER32_CS + pushq %rdx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysexit) +RELOC(xen_sysexit, 1b+1) + +ENTRY(xen_sysret64) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack,%rsp + + pushq $__USER_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret64) +RELOC(xen_sysret64, 1b+1) + +ENTRY(xen_sysret32) + /* We're already on the usermode stack at this point, but still + with the kernel gs, so we can easily switch back */ + movq %rsp, %gs:pda_oldrsp + movq %gs:pda_kernelstack, %rsp + + pushq $__USER32_DS + pushq %gs:pda_oldrsp + pushq %r11 + pushq $__USER32_CS + pushq %rcx + + pushq $VGCF_in_syscall +1: jmp hypercall_iret +ENDPATCH(xen_sysret32) +RELOC(xen_sysret32, 1b+1) + +/* + Xen handles syscall callbacks much like ordinary exceptions, + which means we have: + - kernel gs + - kernel rsp + - an iret-like stack frame on the stack (including rcx and r11): + ss + rsp + rflags + cs + rip + r11 + rsp-> rcx + + In all the entrypoints, we undo all that to make it look + like a CPU-generated syscall/sysenter and jump to the normal + entrypoint. + */ + +.macro undo_xen_syscall + mov 0*8(%rsp),%rcx + mov 1*8(%rsp),%r11 + mov 5*8(%rsp),%rsp +.endm + +/* Normal 64-bit system call target */ +ENTRY(xen_syscall_target) + undo_xen_syscall + jmp system_call_after_swapgs +ENDPROC(xen_syscall_target) + +#ifdef CONFIG_IA32_EMULATION + +/* 32-bit compat syscall target */ +ENTRY(xen_syscall32_target) + undo_xen_syscall + jmp ia32_cstar_target +ENDPROC(xen_syscall32_target) + +/* 32-bit compat sysenter target */ +ENTRY(xen_sysenter_target) + undo_xen_syscall + jmp ia32_sysenter_target +ENDPROC(xen_sysenter_target) + +#else /* !CONFIG_IA32_EMULATION */ + +ENTRY(xen_syscall32_target) +ENTRY(xen_sysenter_target) + lea 16(%rsp), %rsp /* strip %rcx,%r11 */ + mov $-ENOSYS, %rax + pushq $VGCF_in_syscall + jmp hypercall_iret +ENDPROC(xen_syscall32_target) +ENDPROC(xen_sysenter_target) + +#endif /* CONFIG_IA32_EMULATION */ diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index c4800a2c5a41..dd3c23152a2e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -26,6 +26,7 @@ char * __init xen_memory_setup(void); void __init xen_arch_setup(void); void __init xen_init_IRQ(void); void xen_enable_sysenter(void); +void xen_enable_syscall(void); void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); @@ -70,6 +71,8 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ void xen_iret(void); void xen_sysexit(void); +void xen_sysret32(void); +void xen_sysret64(void); void xen_adjust_exception_frame(void); #endif /* XEN_OPS_H */ -- cgit v1.2.2 From bf18bf94dc72db998d0fbebc846c07c858a59c90 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:15 -0700 Subject: xen64: set up userspace syscall patch 64-bit userspace expects the vdso to be mapped at a specific fixed address, which happens to be in the middle of the kernel address space. Because we have split user and kernel pagetables, we need to make special arrangements for the vsyscall mapping to appear in the kernel part of the user pagetable. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 46 ++++++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 87d36044054d..f64b8729cd07 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,18 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +/* + * Identity map, in addition to plain kernel map. This needs to be + * large enough to allocate page table pages to allocate the rest. + * Each page can map 2MB. + */ +static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; + +#ifdef CONFIG_X86_64 +/* l3 pud for userspace vsyscall mapping */ +static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; +#endif /* CONFIG_X86_64 */ + /* * Note about cr3 (pagetable base) values: * @@ -831,12 +843,20 @@ static int xen_pgd_alloc(struct mm_struct *mm) #ifdef CONFIG_X86_64 { struct page *page = virt_to_page(pgd); + pgd_t *user_pgd; BUG_ON(page->private != 0); - page->private = __get_free_page(GFP_KERNEL | __GFP_ZERO); - if (page->private == 0) - ret = -ENOMEM; + ret = -ENOMEM; + + user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + page->private = (unsigned long)user_pgd; + + if (user_pgd != NULL) { + user_pgd[pgd_index(VSYSCALL_START)] = + __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); + ret = 0; + } BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); } @@ -977,6 +997,9 @@ static __init void xen_post_allocator_init(void) pv_mmu_ops.release_pud = xen_release_pud; #endif +#ifdef CONFIG_X86_64 + SetPagePinned(virt_to_page(level3_user_vsyscall)); +#endif xen_mark_init_mm_pinned(); } @@ -1088,6 +1111,15 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) } __native_set_fixmap(idx, pte); + +#ifdef CONFIG_X86_64 + /* Replicate changes to map the vsyscall page into the user + pagetable vsyscall mapping. */ + if (idx >= VSYSCALL_LAST_PAGE && idx <= VSYSCALL_FIRST_PAGE) { + unsigned long vaddr = __fix_to_virt(idx); + set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte); + } +#endif } static const struct pv_info xen_info __initdata = { @@ -1427,13 +1459,6 @@ static void set_page_prot(void *addr, pgprot_t prot) BUG(); } -/* - * Identity map, in addition to plain kernel map. This needs to be - * large enough to allocate page table pages to allocate the rest. - * Each page can map 2MB. - */ -static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; - static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) { unsigned pmdidx, pteidx; @@ -1533,6 +1558,7 @@ static __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pf set_page_prot(init_level4_pgt, PAGE_KERNEL_RO); set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO); set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO); + set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO); set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO); -- cgit v1.2.2 From 1153968a48e3ca3e2b7a437e8b82ec9e6f768e24 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:16 -0700 Subject: xen: implement Xen write_msr operation 64-bit uses MSRs for important things like the base for fs and gs-prefixed addresses. It's more efficient to use a hypercall to update these, rather than go via the trap and emulate path. Other MSR writes are just passed through; in an unprivileged domain they do nothing, but it might be useful later. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f64b8729cd07..776c0fb77d69 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -777,6 +778,34 @@ static void xen_write_cr3(unsigned long cr3) xen_mc_issue(PARAVIRT_LAZY_CPU); /* interrupts restored */ } +static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) +{ + int ret; + + ret = 0; + + switch(msr) { +#ifdef CONFIG_X86_64 + unsigned which; + u64 base; + + case MSR_FS_BASE: which = SEGBASE_FS; goto set; + case MSR_KERNEL_GS_BASE: which = SEGBASE_GS_USER; goto set; + case MSR_GS_BASE: which = SEGBASE_GS_KERNEL; goto set; + + set: + base = ((u64)high << 32) | low; + if (HYPERVISOR_set_segment_base(which, base) != 0) + ret = -EFAULT; + break; +#endif + default: + ret = native_write_msr_safe(msr, low, high); + } + + return ret; +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) @@ -1165,7 +1194,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, - .write_msr = native_write_msr_safe, + .write_msr = xen_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, -- cgit v1.2.2 From 51dd660a2cd6eab4d470cfe1009c7f473832b786 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 8 Jul 2008 15:07:17 -0700 Subject: xen: update Kconfig to allow 64-bit Xen Allow Xen to be enabled on 64-bit. Also extend domain size limit from 8 GB (on 32-bit) to 32 GB on 64-bit. Signed-off-by: Jeremy Fitzhardinge Cc: Stephen Tweedie Cc: Eduardo Habkost Cc: Mark McLoughlin Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index c2cc99580871..20b49729bed5 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -6,8 +6,8 @@ config XEN bool "Xen guest support" select PARAVIRT select PARAVIRT_CLOCK - depends on X86_32 - depends on X86_CMPXCHG && X86_TSC && X86_PAE && !(X86_VISWS || X86_VOYAGER) + depends on X86_64 || (X86_32 && X86_PAE && !(X86_VISWS || X86_VOYAGER)) + depends on X86_CMPXCHG && X86_TSC help This is the Linux Xen port. Enabling this will allow the kernel to boot in a paravirtualized environment under the @@ -15,10 +15,11 @@ config XEN config XEN_MAX_DOMAIN_MEMORY int "Maximum allowed size of a domain in gigabytes" - default 8 + default 8 if X86_32 + default 32 if X86_64 depends on XEN help The pseudo-physical to machine address array is sized according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no - need to be too stingy here. \ No newline at end of file + need to be too stingy here. -- cgit v1.2.2 From b3fe124389f9dd97f0bbd954da2910e286648f0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 13:45:33 +0200 Subject: xen64: fix build error on 32-bit + !HIGHMEM fix: arch/x86/xen/enlighten.c: In function 'xen_set_fixmap': arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_BEGIN' undeclared (first use in this function) arch/x86/xen/enlighten.c:1127: error: (Each undeclared identifier is reported only once arch/x86/xen/enlighten.c:1127: error: for each function it appears in.) arch/x86/xen/enlighten.c:1127: error: 'FIX_KMAP_END' undeclared (first use in this function) make[1]: *** [arch/x86/xen/enlighten.o] Error 1 make: *** [arch/x86/xen/enlighten.o] Error 2 FIX_KMAP_BEGIN is only available on HIGHMEM. Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 776c0fb77d69..3da6acb7eafc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1124,7 +1124,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_32 case FIX_WP_TEST: case FIX_VDSO: +# ifdef CONFIG_HIGHMEM case FIX_KMAP_BEGIN ... FIX_KMAP_END: +# endif #else case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: #endif -- cgit v1.2.2 From 6596f2422306a05be2170efc114da49f26a047dd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 22:32:33 +0200 Subject: Revert "x86_64: there's no need to preallocate level1_fixmap_pgt" This reverts commit 033786969d1d1b5af12a32a19d3a760314d05329. Suresh Siddha reported that this broke booting on his 2GB testbox. Reported-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 2240f823676a..db3280afe886 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -362,6 +362,12 @@ NEXT_PAGE(level3_kernel_pgt) .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) -- cgit v1.2.2 From 62541c376668042e20122864a044360707b2fb82 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 10 Jul 2008 16:24:08 -0700 Subject: xen64: disable 32-bit syscall/sysenter if not supported. Old versions of Xen (3.1 and before) don't support sysenter or syscall from 32-bit compat userspaces. If we can't set the appropriate syscall callback, then disable the corresponding feature bit, which will cause the vdso32 setup to fall back appropriately. Linux assumes that syscall is always available to 32-bit userspace, and installs it by default if sysenter isn't available. In that case, we just disable vdso altogether, forcing userspace libc to fall back to int $0x80. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9d7a14402895..9cce4a92aac0 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -106,46 +106,46 @@ static __cpuinit int register_callback(unsigned type, const void *func) void __cpuinit xen_enable_sysenter(void) { - int cpu = smp_processor_id(); extern void xen_sysenter_target(void); int ret; + unsigned sysenter_feature; #ifdef CONFIG_X86_32 - if (!boot_cpu_has(X86_FEATURE_SEP)) { - return; - } + sysenter_feature = X86_FEATURE_SEP; #else - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL && - boot_cpu_data.x86_vendor != X86_VENDOR_CENTAUR) { - return; - } + sysenter_feature = X86_FEATURE_SYSENTER32; #endif + if (!boot_cpu_has(sysenter_feature)) + return; + ret = register_callback(CALLBACKTYPE_sysenter, xen_sysenter_target); - if(ret != 0) { - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SEP); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP); - } + if(ret != 0) + setup_clear_cpu_cap(sysenter_feature); } void __cpuinit xen_enable_syscall(void) { #ifdef CONFIG_X86_64 - int cpu = smp_processor_id(); int ret; extern void xen_syscall_target(void); extern void xen_syscall32_target(void); ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { - printk("failed to set syscall: %d\n", ret); - clear_cpu_cap(&cpu_data(cpu), X86_FEATURE_SYSCALL); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SYSCALL); - } else { + printk(KERN_ERR "Failed to set syscall: %d\n", ret); + /* Pretty fatal; 64-bit userspace has no other + mechanism for syscalls. */ + } + + if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { ret = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target); - if (ret != 0) - printk("failed to set 32-bit syscall: %d\n", ret); + if (ret != 0) { + printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); + setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); + sysctl_vsyscall32 = 0; + } } #endif /* CONFIG_X86_64 */ } -- cgit v1.2.2 From 71415c6a0877d5944d5dc3060f3b03513746158d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Jul 2008 22:41:34 +0200 Subject: x86, xen, vdso: fix build error fix: arch/x86/xen/built-in.o: In function `xen_enable_syscall': (.cpuinit.text+0xdb): undefined reference to `sysctl_vsyscall32' Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 9cce4a92aac0..3e11779755c3 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -144,7 +144,9 @@ void __cpuinit xen_enable_syscall(void) if (ret != 0) { printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); +#ifdef CONFIG_COMPAT sysctl_vsyscall32 = 0; +#endif } } #endif /* CONFIG_X86_64 */ -- cgit v1.2.2 From 6a52e4b1cddd90fbfde8fb67021657936ee74b07 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:00 -0700 Subject: x86_64: further cleanup of 32-bit compat syscall mechanisms AMD only supports "syscall" from 32-bit compat usermode. Intel and Centaur(?) only support "sysenter" from 32-bit compat usermode. Set the X86 feature bits accordingly, and set up the vdso in accordance with those bits. On the offchance we run on in a 64-bit environment which supports neither syscall nor sysenter from 32-bit mode, then fall back to the int $0x80 vdso. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd_64.c | 2 ++ arch/x86/kernel/cpu/common_64.c | 3 --- arch/x86/vdso/Makefile | 2 +- arch/x86/vdso/vdso32-setup.c | 19 +++++++++---------- arch/x86/vdso/vdso32.S | 13 ++++++++----- arch/x86/xen/setup.c | 10 +++++++--- 6 files changed, 27 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7c36fb8a28d4..d1692b2a41ff 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -115,6 +115,8 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + + set_cpu_cap(c, X86_FEATURE_SYSCALL32); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 15419cd3c5a4..736f50fa433d 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -317,9 +317,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } - /* Assume all 64-bit CPUs support 32-bit syscall */ - set_cpu_cap(c, X86_FEATURE_SYSCALL32); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b7ad9f89d21f..4d6ef0a336d6 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -62,7 +62,7 @@ $(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE # Build multiple 32-bit vDSO images to choose from at boot time. # obj-$(VDSO32-y) += vdso32-syms.lds -vdso32.so-$(CONFIG_X86_32) += int80 +vdso32.so-$(VDSO32-y) += int80 vdso32.so-$(CONFIG_COMPAT) += syscall vdso32.so-$(VDSO32-y) += sysenter diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 0bce5429a515..513f330c5832 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -193,17 +193,12 @@ static __init void relocate_vdso(Elf32_Ehdr *ehdr) } } -/* - * These symbols are defined by vdso32.S to mark the bounds - * of the ELF DSO images included therein. - */ -extern const char vdso32_default_start, vdso32_default_end; -extern const char vdso32_sysenter_start, vdso32_sysenter_end; static struct page *vdso32_pages[1]; #ifdef CONFIG_X86_64 #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) +#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) /* May not be __init: called during resume */ void syscall32_cpu_init(void) @@ -226,6 +221,7 @@ static inline void map_compat_vdso(int map) #else /* CONFIG_X86_32 */ #define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) +#define vdso32_syscall() (0) void enable_sep_cpu(void) { @@ -296,12 +292,15 @@ int __init sysenter_setup(void) gate_vma_init(); #endif - if (!vdso32_sysenter()) { - vsyscall = &vdso32_default_start; - vsyscall_len = &vdso32_default_end - &vdso32_default_start; - } else { + if (vdso32_syscall()) { + vsyscall = &vdso32_syscall_start; + vsyscall_len = &vdso32_syscall_end - &vdso32_syscall_start; + } else if (vdso32_sysenter()){ vsyscall = &vdso32_sysenter_start; vsyscall_len = &vdso32_sysenter_end - &vdso32_sysenter_start; + } else { + vsyscall = &vdso32_int80_start; + vsyscall_len = &vdso32_int80_end - &vdso32_int80_start; } memcpy(syscall_page, vsyscall, vsyscall_len); diff --git a/arch/x86/vdso/vdso32.S b/arch/x86/vdso/vdso32.S index 1e36f72cab86..2ce5f82c333b 100644 --- a/arch/x86/vdso/vdso32.S +++ b/arch/x86/vdso/vdso32.S @@ -2,14 +2,17 @@ __INITDATA - .globl vdso32_default_start, vdso32_default_end -vdso32_default_start: -#ifdef CONFIG_X86_32 + .globl vdso32_int80_start, vdso32_int80_end +vdso32_int80_start: .incbin "arch/x86/vdso/vdso32-int80.so" -#else +vdso32_int80_end: + + .globl vdso32_syscall_start, vdso32_syscall_end +vdso32_syscall_start: +#ifdef CONFIG_COMPAT .incbin "arch/x86/vdso/vdso32-syscall.so" #endif -vdso32_default_end: +vdso32_syscall_end: .globl vdso32_sysenter_start, vdso32_sysenter_end vdso32_sysenter_start: diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 3e11779755c3..e3648e64a637 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -83,12 +83,16 @@ static void xen_idle(void) /* * Set the bit indicating "nosegneg" library variants should be used. + * We only need to bother in pure 32-bit mode; compat 32-bit processes + * can have un-truncated segments, so wrapping around is allowed. */ static void __init fiddle_vdso(void) { -#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) - extern const char vdso32_default_start; - u32 *mask = VDSO32_SYMBOL(&vdso32_default_start, NOTE_MASK); +#ifdef CONFIG_X86_32 + u32 *mask; + mask = VDSO32_SYMBOL(&vdso32_int80_start, NOTE_MASK); + *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; + mask = VDSO32_SYMBOL(&vdso32_sysenter_start, NOTE_MASK); *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } -- cgit v1.2.2 From d5303b811b9d6dad2e7396d545eb7db414d42a61 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:06 -0700 Subject: x86: xen: no need to disable vdso32 Now that the vdso32 code can cope with both syscall and sysenter missing for 32-bit compat processes, just disable the features without disabling vdso altogether. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/xen/setup.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e3648e64a637..b6acc3a0af46 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -137,7 +137,7 @@ void __cpuinit xen_enable_syscall(void) ret = register_callback(CALLBACKTYPE_syscall, xen_syscall_target); if (ret != 0) { - printk(KERN_ERR "Failed to set syscall: %d\n", ret); + printk(KERN_ERR "Failed to set syscall callback: %d\n", ret); /* Pretty fatal; 64-bit userspace has no other mechanism for syscalls. */ } @@ -145,13 +145,8 @@ void __cpuinit xen_enable_syscall(void) if (boot_cpu_has(X86_FEATURE_SYSCALL32)) { ret = register_callback(CALLBACKTYPE_syscall32, xen_syscall32_target); - if (ret != 0) { - printk(KERN_INFO "Xen: 32-bit syscall not supported: disabling vdso\n"); + if (ret != 0) setup_clear_cpu_cap(X86_FEATURE_SYSCALL32); -#ifdef CONFIG_COMPAT - sysctl_vsyscall32 = 0; -#endif - } } #endif /* CONFIG_X86_64 */ } -- cgit v1.2.2 From 094029479be8eb380447f42eff1b35362ef1a464 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 12 Jul 2008 02:22:12 -0700 Subject: x86_64: adjust exception frame on paranoid exceptions Exceptions using paranoidentry need to have their exception frames adjusted explicitly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_64.S | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6aa6932e21b1..80d5663db3bc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1189,6 +1189,7 @@ END(device_not_available) /* runs on exception stack */ KPROBE_ENTRY(debug) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_debug, DEBUG_STACK @@ -1198,6 +1199,7 @@ KPROBE_END(debug) /* runs on exception stack */ KPROBE_ENTRY(nmi) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $-1 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_nmi, 0, 0 @@ -1211,6 +1213,7 @@ KPROBE_END(nmi) KPROBE_ENTRY(int3) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_int3, DEBUG_STACK @@ -1237,6 +1240,7 @@ END(coprocessor_segment_overrun) /* runs on exception stack */ ENTRY(double_fault) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_double_fault jmp paranoid_exit1 CFI_ENDPROC @@ -1253,6 +1257,7 @@ END(segment_not_present) /* runs on exception stack */ ENTRY(stack_segment) XCPT_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME paranoidentry do_stack_segment jmp paranoid_exit1 CFI_ENDPROC @@ -1278,6 +1283,7 @@ END(spurious_interrupt_bug) /* runs on exception stack */ ENTRY(machine_check) INTR_FRAME + PARAVIRT_ADJUST_EXCEPTION_FRAME pushq $0 CFI_ADJUST_CFA_OFFSET 8 paranoidentry do_machine_check -- cgit v1.2.2 From 74d4affde8feb8d5bdebf7fba8e90e4eae3b7b1d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:50 -0700 Subject: x86/paravirt: add hooks for spinlock operations Ticket spinlocks have absolutely ghastly worst-case performance characteristics in a virtual environment. If there is any contention for physical CPUs (ie, there are more runnable vcpus than cpus), then ticket locks can cause the system to end up spending 90+% of its time spinning. The problem is that (v)cpus waiting on a ticket spinlock will be granted access to the lock in strict order they got their tickets. If the hypervisor scheduler doesn't give the vcpus time in that order, they will burn timeslices waiting for the scheduler to give the right vcpu some time. In the worst case it could take O(n^2) vcpu scheduler timeslices for everyone waiting on the lock to get it, not counting new cpus trying to take the lock while the log-jam is sorted out. These hooks allow a paravirt backend to replace the spinlock implementation. At the very least, this could revert the implementation back to the old lock algorithm, which allows the next scheduled vcpu to take the lock, and has basically fairly good performance. It also allows the spinlocks to take advantages of the hypervisor features to make locks more efficient (spin and block, for example). The cost to native execution is an extra direct call when using a spinlock function. There's no overhead if CONFIG_PARAVIRT is turned off. The lock structure is fixed at a single "unsigned int", initialized to zero, but the spinlock implementation can use it as it wishes. Thanks to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around" for pointing out this problem. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 2963ab5d91ee..f33816868707 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -124,6 +124,7 @@ static void *get_call_destination(u8 type) .pv_irq_ops = pv_irq_ops, .pv_apic_ops = pv_apic_ops, .pv_mmu_ops = pv_mmu_ops, + .pv_lock_ops = pv_lock_ops, }; return *((void **)&tmpl + type); } @@ -450,6 +451,15 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; +struct pv_lock_ops pv_lock_ops = { + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +}; + EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); -- cgit v1.2.2 From 8efcbab674de2bee45a2e4cdf97de16b8e609ac8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:51 -0700 Subject: paravirt: introduce a "lock-byte" spinlock implementation Implement a version of the old spinlock algorithm, in which everyone spins waiting for a lock byte. In order to be compatible with the ticket-lock's use of a zero initializer, this uses the convention of '0' for unlocked and '1' for locked. This algorithm is much better than ticket locks in a virtual envionment, because it doesn't interact badly with the vcpu scheduler. If there are multiple vcpus spinning on a lock and the lock is released, the next vcpu to be scheduled will take the lock, rather than cycling around until the next ticketed vcpu gets it. To use this, you must call paravirt_use_bytelocks() very early, before any spinlocks have been taken. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index f33816868707..bba4041bb7ff 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,6 +268,15 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } +void __init paravirt_use_bytelocks(void) +{ + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +} + struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, -- cgit v1.2.2 From 56397f8dadb40055479a8ffff23f21a890098a31 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:52 -0700 Subject: xen: use lock-byte spinlock implementation Switch to using the lock-byte spinlock implementation, to avoid the worst of the performance hit from ticket locks. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index f702199312a5..a8ebafc09d47 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -430,4 +430,5 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); + paravirt_use_bytelocks(); } -- cgit v1.2.2 From 2d9e1e2f58b5612aa4eab0ab54c84308a29dbd79 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 7 Jul 2008 12:07:53 -0700 Subject: xen: implement Xen-specific spinlocks The standard ticket spinlocks are very expensive in a virtual environment, because their performance depends on Xen's scheduler giving vcpus time in the order that they're supposed to take the spinlock. This implements a Xen-specific spinlock, which should be much more efficient. The fast-path is essentially the old Linux-x86 locks, using a single lock byte. The locker decrements the byte; if the result is 0, then they have the lock. If the lock is negative, then locker must spin until the lock is positive again. When there's contention, the locker spin for 2^16[*] iterations waiting to get the lock. If it fails to get the lock in that time, it adds itself to the contention count in the lock and blocks on a per-cpu event channel. When unlocking the spinlock, the locker looks to see if there's anyone blocked waiting for the lock by checking for a non-zero waiter count. If there's a waiter, it traverses the per-cpu "lock_spinners" variable, which contains which lock each CPU is waiting on. It picks one CPU waiting on the lock and sends it an event to wake it up. This allows efficient fast-path spinlock operation, while allowing spinning vcpus to give up their processor time while waiting for a contended lock. [*] 2^16 iterations is threshold at which 98% locks have been taken according to Thomas Friebel's Xen Summit talk "Preventing Guests from Spinning Around". Therefore, we'd expect the lock and unlock slow paths will only be entered 2% of the time. Signed-off-by: Jeremy Fitzhardinge Cc: Jens Axboe Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Petr Tesarik Cc: Virtualization Cc: Xen devel Cc: Thomas Friebel Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 172 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index a8ebafc09d47..e693812ac59a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,6 +15,7 @@ * This does not handle HOTPLUG_CPU yet. */ #include +#include #include #include @@ -35,6 +36,8 @@ #include "xen-ops.h" #include "mmu.h" +static void __cpuinit xen_init_lock_cpu(int cpu); + cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -179,6 +182,8 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) { unsigned cpu; + xen_init_lock_cpu(0); + smp_store_cpu_info(0); cpu_data(0).x86_max_cores = 1; set_cpu_sibling_map(0); @@ -301,6 +306,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) clear_tsk_thread_flag(idle, TIF_FORK); #endif xen_setup_timer(cpu); + xen_init_lock_cpu(cpu); per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; @@ -413,6 +419,170 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static __cpuinit void xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + xen_reschedule_interrupt, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +static void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} + static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, @@ -430,5 +600,5 @@ void __init xen_smp_init(void) { smp_ops = xen_smp_ops; xen_fill_possible_map(); - paravirt_use_bytelocks(); + xen_init_spinlocks(); } -- cgit v1.2.2 From 4bb689eee12ceb6d669a0c9a519037c049a8af38 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:33:33 +0200 Subject: x86: paravirt spinlocks, !CONFIG_SMP build fixes Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bba4041bb7ff..6aa8aed06d54 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,11 +270,13 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) void __init paravirt_use_bytelocks(void) { +#ifdef CONFIG_SMP pv_lock_ops.spin_is_locked = __byte_spin_is_locked; pv_lock_ops.spin_is_contended = __byte_spin_is_contended; pv_lock_ops.spin_lock = __byte_spin_lock; pv_lock_ops.spin_trylock = __byte_spin_trylock; pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif } struct pv_info pv_info = { @@ -461,12 +463,14 @@ struct pv_mmu_ops pv_mmu_ops = { }; struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP .spin_is_locked = __ticket_spin_is_locked, .spin_is_contended = __ticket_spin_is_contended, .spin_lock = __ticket_spin_lock, .spin_trylock = __ticket_spin_trylock, .spin_unlock = __ticket_spin_unlock, +#endif }; EXPORT_SYMBOL_GPL(pv_time_ops); -- cgit v1.2.2 From 9af98578d6af588f52d0dacd64fe42caa405a327 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 14:39:15 +0200 Subject: x86: paravirt spinlocks, modular build fix fix: MODPOST 408 modules ERROR: "pv_lock_ops" [net/dccp/dccp.ko] undefined! ERROR: "pv_lock_ops" [fs/jbd2/jbd2.ko] undefined! ERROR: "pv_lock_ops" [drivers/media/common/saa7146_vv.ko] undefined! Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 6aa8aed06d54..3edfd7af22ae 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -472,6 +472,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; +EXPORT_SYMBOL_GPL(pv_lock_ops); EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); -- cgit v1.2.2 From 34646bca474142e1424e5f6c4a33cb2ba0930ea1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 9 Jul 2008 15:42:09 +0200 Subject: x86, paravirt-spinlocks: fix boot hang the paravirt-spinlock patches caused a boot hang with this config: http://redhat.com/~mingo/misc/config-Wed_Jul__9_14_47_04_CEST_2008.bad i have bisected it down to: | commit e17b58c2e85bc2ad2afc07fb8d898017c2b75ed1 | Author: Jeremy Fitzhardinge | Date: Mon Jul 7 12:07:53 2008 -0700 | | xen: implement Xen-specific spinlocks i.e. applying that patch alone causes the hang. The hang happens in the ftrace self-test: initcall utsname_sysctl_init+0x0/0x19 returned 0 after 0 msecs calling init_sched_switch_trace+0x0/0x4c Testing tracer sched_switch: PASSED initcall init_sched_switch_trace+0x0/0x4c returned 0 after 167 msecs calling init_function_trace+0x0/0x12 Testing tracer ftrace: [hard hang] it should have continued like this: Testing tracer ftrace: PASSED initcall init_function_trace+0x0/0x12 returned 0 after 198 msecs calling init_irqsoff_tracer+0x0/0x14 Testing tracer irqsoff: PASSED initcall init_irqsoff_tracer+0x0/0x14 returned 0 after 3 msecs calling init_mmio_trace+0x0/0x12 initcall init_mmio_trace+0x0/0x12 returned 0 after 0 msecs the problem is that such lowlevel primitives as spinlocks should never be built with -pg (which ftrace does). Marking paravirt.o as non-pg and marking all spinlock ops as always-inline solve the hang. Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5112c84f5421..78d52171400b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -7,10 +7,11 @@ extra-y := head_$(BITS).o head$(BITS).o head.o init_task.o vmlinu CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FTRACE -# Do not profile debug utilities +# Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc_64.o = -pg CFLAGS_REMOVE_tsc_32.o = -pg CFLAGS_REMOVE_rtc.o = -pg +CFLAGS_REMOVE_paravirt.o = -pg endif # -- cgit v1.2.2 From 6718d0d6da2749d3bff522e6057e97e6aa85e4d1 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 01:07:02 -0700 Subject: x86 ptrace: block-step fix The enable_single_step() logic bails out early if TF is already set. That skips some of the bookkeeping that keeps things straight. This makes PTRACE_SINGLEBLOCK break the behavior of a user task that was already setting TF itself in user mode. Fix the bookkeeping to notice the old TF setting as it should. Test case at: http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/step-jump-cont-strict.c?cvsroot=systemtap Signed-off-by: Roland McGrath --- arch/x86/kernel/step.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 92c20fee6781..0d2cb363ea75 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -105,6 +105,7 @@ static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs) static int enable_single_step(struct task_struct *child) { struct pt_regs *regs = task_pt_regs(child); + unsigned long oflags; /* * Always set TIF_SINGLESTEP - this guarantees that @@ -113,11 +114,7 @@ static int enable_single_step(struct task_struct *child) */ set_tsk_thread_flag(child, TIF_SINGLESTEP); - /* - * If TF was already set, don't do anything else - */ - if (regs->flags & X86_EFLAGS_TF) - return 0; + oflags = regs->flags; /* Set TF on the kernel stack.. */ regs->flags |= X86_EFLAGS_TF; @@ -126,9 +123,22 @@ static int enable_single_step(struct task_struct *child) * ..but if TF is changed by the instruction we will trace, * don't mark it as being "us" that set it, so that we * won't clear it by hand later. + * + * Note that if we don't actually execute the popf because + * of a signal arriving right now or suchlike, we will lose + * track of the fact that it really was "us" that set it. */ - if (is_setting_trap_flag(child, regs)) + if (is_setting_trap_flag(child, regs)) { + clear_tsk_thread_flag(child, TIF_FORCED_TF); return 0; + } + + /* + * If TF was already set, check whether it was us who set it. + * If not, we should never attempt a block step. + */ + if (oflags & X86_EFLAGS_TF) + return test_tsk_thread_flag(child, TIF_FORCED_TF); set_tsk_thread_flag(child, TIF_FORCED_TF); -- cgit v1.2.2 From 64f097331928b01d704047c1dbc738bb6d2a9bf9 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 01:33:14 -0700 Subject: x86 ptrace: unify TIF_SINGLESTEP This unifies the treatment of TIF_SINGLESTEP on i386 and x86_64. The bit is now excluded from _TIF_WORK_MASK on i386 as it has been on x86_64. This means the do_notify_resume() path using it is never used, so TIF_SINGLESTEP is not cleared on returning to user mode. Both now leave TIF_SINGLESTEP set when returning to user, so that it's already set on an int $0x80 system call entry. This removes the need for testing TF on the system_call path. Doing it this way fixes the regression for PTRACE_SINGLESTEP into a sigreturn syscall, introduced by commit 1e2e99f0e4aa6363e8515ed17011c210c8f1b52a. The clear_TF_reenable case that sets TIF_SINGLESTEP can only happen on a non-exception kernel entry, i.e. sysenter/syscall instruction. That will always get to the syscall exit tracing path. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_32.S | 4 ---- arch/x86/kernel/signal_32.c | 6 ------ arch/x86/kernel/signal_64.c | 6 ------ 3 files changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f1202..0ad987d02b72 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -383,10 +383,6 @@ syscall_exit: # setting need_resched or sigpending # between sampling and the iret TRACE_IRQS_OFF - testl $X86_EFLAGS_TF,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit - jz no_singlestep - orl $_TIF_SINGLESTEP,TI_flags(%ebp) -no_singlestep: movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d92373630963..295b5f5c9389 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -657,12 +657,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e7..bf87684474f1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -487,12 +487,6 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { - /* Pending single-step? */ - if (thread_info_flags & _TIF_SINGLESTEP) { - regs->flags |= X86_EFLAGS_TF; - clear_thread_flag(TIF_SINGLESTEP); - } - #ifdef CONFIG_X86_MCE /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) -- cgit v1.2.2 From d4d67150165df8bf1cc05e532f6efca96f907cab Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 02:38:07 -0700 Subject: x86 ptrace: unify syscall tracing This unifies and cleans up the syscall tracing code on i386 and x86_64. Using a single function for entry and exit tracing on 32-bit made the do_syscall_trace() into some terrible spaghetti. The logic is clear and simple using separate syscall_trace_enter() and syscall_trace_leave() functions as on 64-bit. The unification adds PTRACE_SYSEMU and PTRACE_SYSEMU_SINGLESTEP support on x86_64, for 32-bit ptrace() callers and for 64-bit ptrace() callers tracing either 32-bit or 64-bit tasks. It behaves just like 32-bit. Changing syscall_trace_enter() to return the syscall number shortens all the assembly paths, while adding the SYSEMU feature in a simple way. Signed-off-by: Roland McGrath --- arch/x86/ia32/ia32entry.S | 17 +++--- arch/x86/kernel/entry_32.S | 19 +++--- arch/x86/kernel/entry_64.S | 14 +++-- arch/x86/kernel/ptrace.c | 141 +++++++++++++++------------------------------ 4 files changed, 71 insertions(+), 120 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20371d0635e4..8796d1905255 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -37,6 +37,11 @@ movq %rax,R8(%rsp) .endm + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %eax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ .macro LOAD_ARGS32 offset movl \offset(%rsp),%r11d movl \offset+8(%rsp),%r10d @@ -46,7 +51,6 @@ movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi movl \offset+64(%rsp),%edi - movl \offset+72(%rsp),%eax .endm .macro CFI_STARTPROC32 simple @@ -137,13 +141,12 @@ ENTRY(ia32_sysenter_target) .previous GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz sysenter_tracesys -sysenter_do_call: cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys +sysenter_do_call: IA32_ARG_FIXUP 1 call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -242,8 +245,7 @@ ENTRY(ia32_cstar_target) .previous GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys cstar_do_call: @@ -336,8 +338,7 @@ ENTRY(ia32_syscall) SAVE_ARGS 0,0,1 GET_THREAD_INFO(%r10) orl $TS_COMPAT,TI_status(%r10) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%r10) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) jnz ia32_tracesys ia32_do_syscall: cmpl $(IA32_NR_syscalls-1),%eax diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 0ad987d02b72..cadf73f70d33 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -332,7 +332,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -370,7 +370,7 @@ ENTRY(system_call) GET_THREAD_INFO(%ebp) # system call tracing in operation / emulation /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ - testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) jnz syscall_trace_entry cmpl $(nr_syscalls), %eax jae syscall_badsys @@ -510,12 +510,8 @@ END(work_pending) syscall_trace_entry: movl $-ENOSYS,PT_EAX(%esp) movl %esp, %eax - xorl %edx,%edx - call do_syscall_trace - cmpl $0, %eax - jne resume_userspace # ret != 0 -> running under PTRACE_SYSEMU, - # so must skip actual syscall - movl PT_ORIG_EAX(%esp), %eax + call syscall_trace_enter + /* What it returned is what we'll actually use. */ cmpl $(nr_syscalls), %eax jnae syscall_call jmp syscall_exit @@ -524,14 +520,13 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN syscall_exit_work: - testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl + testb $_TIF_WORK_SYSCALL_EXIT, %cl jz work_pending TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let do_syscall_trace() call + ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call # schedule() instead movl %esp, %eax - movl $1, %edx - call do_syscall_trace + call syscall_trace_leave jmp resume_userspace END(syscall_exit_work) CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ae63e584c340..63001c6ecf6d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -349,8 +349,7 @@ ENTRY(system_call_after_swapgs) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP), \ - TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -430,7 +429,12 @@ tracesys: FIXUP_TOP_OF_STACK %rdi movq %rsp,%rdi call syscall_trace_enter - LOAD_ARGS ARGOFFSET /* reload args from stack in case ptrace changed it */ + /* + * Reload arg registers from stack in case ptrace changed them. + * We don't reload %rax because syscall_trace_enter() returned + * the value it wants us to use in the table lookup. + */ + LOAD_ARGS ARGOFFSET, 1 RESTORE_REST cmpq $__NR_syscall_max,%rax ja int_ret_from_sys_call /* RAX(%rsp) set to -ENOSYS above */ @@ -483,7 +487,7 @@ int_very_careful: ENABLE_INTERRUPTS(CLBR_NONE) SAVE_REST /* Check for syscall exit trace */ - testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edx + testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal pushq %rdi CFI_ADJUST_CFA_OFFSET 8 @@ -491,7 +495,7 @@ int_very_careful: call syscall_trace_leave popq %rdi CFI_ADJUST_CFA_OFFSET -8 - andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi + andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi jmp int_restore_rest int_signal: diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 77040b6070e1..34e77b16a42a 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1357,8 +1357,6 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -#ifdef CONFIG_X86_32 - void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) { struct siginfo info; @@ -1377,89 +1375,10 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -/* notification of system call entry/exit - * - triggered by current->work.syscall_trace - */ -int do_syscall_trace(struct pt_regs *regs, int entryexit) -{ - int is_sysemu = test_thread_flag(TIF_SYSCALL_EMU); - /* - * With TIF_SYSCALL_EMU set we want to ignore TIF_SINGLESTEP for syscall - * interception - */ - int is_singlestep = !is_sysemu && test_thread_flag(TIF_SINGLESTEP); - int ret = 0; - - /* do the secure computing check first */ - if (!entryexit) - secure_computing(regs->orig_ax); - - if (unlikely(current->audit_context)) { - if (entryexit) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), - regs->ax); - /* Debug traps, when using PTRACE_SINGLESTEP, must be sent only - * on the syscall exit path. Normally, when TIF_SYSCALL_AUDIT is - * not used, entry.S will call us only on syscall exit, not - * entry; so when TIF_SYSCALL_AUDIT is used we must avoid - * calling send_sigtrap() on syscall entry. - * - * Note that when PTRACE_SYSEMU_SINGLESTEP is used, - * is_singlestep is false, despite his name, so we will still do - * the correct thing. - */ - else if (is_singlestep) - goto out; - } - - if (!(current->ptrace & PT_PTRACED)) - goto out; - - /* If a process stops on the 1st tracepoint with SYSCALL_TRACE - * and then is resumed with SYSEMU_SINGLESTEP, it will come in - * here. We have to check this and return */ - if (is_sysemu && entryexit) - return 0; - - /* Fake a debug trap */ - if (is_singlestep) - send_sigtrap(current, regs, 0); - - if (!test_thread_flag(TIF_SYSCALL_TRACE) && !is_sysemu) - goto out; - - /* the 0x80 provides a way for the tracing parent to distinguish - between a syscall stop and SIGTRAP delivery */ - /* Note that the debugger could change the result of test_thread_flag!*/ - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) ? 0x80:0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } - ret = is_sysemu; -out: - if (unlikely(current->audit_context) && !entryexit) - audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, - regs->bx, regs->cx, regs->dx, regs->si); - if (ret == 0) - return 0; - - regs->orig_ax = -1; /* force skip of syscall restarting */ - if (unlikely(current->audit_context)) - audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - return 1; -} - -#else /* CONFIG_X86_64 */ - static void syscall_trace(struct pt_regs *regs) { + if (!(current->ptrace & PT_PTRACED)) + return; #if 0 printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", @@ -1481,39 +1400,71 @@ static void syscall_trace(struct pt_regs *regs) } } -asmlinkage void syscall_trace_enter(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +# define IS_IA32 1 +#elif defined CONFIG_IA32_EMULATION +# define IS_IA32 test_thread_flag(TIF_IA32) +#else +# define IS_IA32 0 +#endif + +/* + * We must return the syscall number to actually look up in the table. + * This can be -1L to skip running any syscall at all. + */ +asmregparm long syscall_trace_enter(struct pt_regs *regs) { + long ret = 0; + /* do the secure computing check first */ secure_computing(regs->orig_ax); - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + ret = -1L; + + if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); if (unlikely(current->audit_context)) { - if (test_thread_flag(TIF_IA32)) { + if (IS_IA32) audit_syscall_entry(AUDIT_ARCH_I386, regs->orig_ax, regs->bx, regs->cx, regs->dx, regs->si); - } else { +#ifdef CONFIG_X86_64 + else audit_syscall_entry(AUDIT_ARCH_X86_64, regs->orig_ax, regs->di, regs->si, regs->dx, regs->r10); - } +#endif } + + return ret ?: regs->orig_ax; } -asmlinkage void syscall_trace_leave(struct pt_regs *regs) +asmregparm void syscall_trace_leave(struct pt_regs *regs) { if (unlikely(current->audit_context)) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); - if ((test_thread_flag(TIF_SYSCALL_TRACE) - || test_thread_flag(TIF_SINGLESTEP)) - && (current->ptrace & PT_PTRACED)) + if (test_thread_flag(TIF_SYSCALL_TRACE)) syscall_trace(regs); -} -#endif /* CONFIG_X86_32 */ + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(), so don't do any more now. + */ + if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) + return; + + /* + * If we are single-stepping, synthesize a trap to follow the + * system call instruction. + */ + if (test_thread_flag(TIF_SINGLESTEP) && + (current->ptrace & PT_PTRACED)) + send_sigtrap(current, regs, 0); +} -- cgit v1.2.2 From 380fdd7585a4c2f41b48925eba85c0654b7b858b Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Wed, 9 Jul 2008 02:39:29 -0700 Subject: x86 ptrace: user-sets-TF nits This closes some arcane holes in single-step handling that can arise only when user programs set TF directly (via popf or sigreturn) and then use vDSO (syscall/sysenter) system call entry. In those entry paths, the clear_TF_reenable case hits and we must check TIF_SINGLESTEP to be sure our bookkeeping stays correct wrt the user's view of TF. Signed-off-by: Roland McGrath --- arch/x86/kernel/ptrace.c | 10 ++++++++++ arch/x86/kernel/step.c | 13 +++++++++++++ 2 files changed, 23 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 34e77b16a42a..e37dccce85db 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1416,6 +1416,16 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (test_thread_flag(TIF_SINGLESTEP)) + regs->flags |= X86_EFLAGS_TF; + /* do the secure computing check first */ secure_computing(regs->orig_ax); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 0d2cb363ea75..e8b9863ef8c4 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -107,6 +107,19 @@ static int enable_single_step(struct task_struct *child) struct pt_regs *regs = task_pt_regs(child); unsigned long oflags; + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state so we don't wrongly set TIF_FORCED_TF below. + * If enable_single_step() was used last and that is what + * set TIF_SINGLESTEP, then both TF and TIF_FORCED_TF are + * already set and our bookkeeping is fine. + */ + if (unlikely(test_tsk_thread_flag(child, TIF_SINGLESTEP))) + regs->flags |= X86_EFLAGS_TF; + /* * Always set TIF_SINGLESTEP - this guarantees that * we single-step system calls etc.. This will also -- cgit v1.2.2 From fab3b58d3b242b5903f78d60d86803a8aecdf6de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 17 Jul 2008 13:50:15 +0200 Subject: x86 reboot quirks: add Dell Precision WorkStation T5400 as reported in: "reboot=bios is mandatory on Dell T5400 server." http://bugzilla.kernel.org/show_bug.cgi?id=11108 add a DMI reboot quirk. Signed-off-by: Ingo Molnar Cc: --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e151..9dcf39c02972 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -177,6 +177,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "PowerEdge 2400"), }, }, + { /* Handle problems with rebooting on Dell T5400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T5400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", -- cgit v1.2.2 From 93a0886e2368eafb9df5e2021fb185195cee88b2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:43:42 -0700 Subject: x86, xen, power: fix up config dependencies on PM Xen save/restore needs bits of code enabled by PM_SLEEP, and PM_SLEEP depends on PM. So make XEN_SAVE_RESTORE depend on PM and PM_SLEEP depend on XEN_SAVE_RESTORE. Signed-off-by: Jeremy Fitzhardinge Acked-by: Rafael J. Wysocki Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 20b49729bed5..3815e425f470 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -23,3 +23,8 @@ config XEN_MAX_DOMAIN_MEMORY according to the maximum possible memory size of a Xen domain. This array uses 1 page per gigabyte, so there's no need to be too stingy here. + +config XEN_SAVE_RESTORE + bool + depends on PM + default y \ No newline at end of file -- cgit v1.2.2 From 4fdf08b5bf8d449cc9897395895157c6ff8ddc41 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 17 Jul 2008 11:29:24 -0700 Subject: x86: unify and correct the GDT_ENTRY() macro Merge the GDT_ENTRY() macro between arch/x86/boot/pm.c and arch/x86/kernel/acpi/sleep.c and put the new one in . While we're at it, correct the bitmasks for the limit and flags. The new version relies on using ULL constants in order to cause type promotion rather than explicit casts; this avoids having to include in . Signed-off-by: H. Peter Anvin --- arch/x86/boot/pm.c | 6 ------ arch/x86/kernel/acpi/sleep.c | 10 +--------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/pm.c b/arch/x86/boot/pm.c index 328956fdb59e..85a1cd8a8ff8 100644 --- a/arch/x86/boot/pm.c +++ b/arch/x86/boot/pm.c @@ -98,12 +98,6 @@ static void reset_coprocessor(void) /* * Set up the GDT */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) struct gdt_ptr { u16 len; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 868de3d5c39d..a3ddad18aaa3 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -23,15 +24,6 @@ static unsigned long acpi_realmode; static char temp_stack[10240]; #endif -/* XXX: this macro should move to asm-x86/segment.h and be shared with the - boot code... */ -#define GDT_ENTRY(flags, base, limit) \ - (((u64)(base & 0xff000000) << 32) | \ - ((u64)flags << 40) | \ - ((u64)(limit & 0x00ff0000) << 32) | \ - ((u64)(base & 0x00ffffff) << 16) | \ - ((u64)(limit & 0x0000ffff))) - /** * acpi_save_state_mem - save kernel state * -- cgit v1.2.2 From 64d206d896ff70b828138577d5ff39deda5f1c4d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 00:26:59 +0200 Subject: x86: rename CONFIG_NONPROMISC_DEVMEM to CONFIG_PROMISC_DEVMEM Linus observed: > The real bug is that we shouldn't have "double negatives", and > certainly not negative config options. Making that "promiscuous > /dev/mem" option a negated thing as a config option was bad. right ... lets rename this option. There should never be a negation in config options. [ that reminds me of CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER, but that is for another commit ;-) ] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 7 ++++--- arch/x86/mm/pat.c | 6 +++--- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ae36bfa814e5..f0cf5d990794 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,10 +5,11 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config NONPROMISC_DEVMEM - bool "Filter access to /dev/mem" +config PROMISC_DEVMEM + bool "Allow unlimited access to /dev/mem" + default y help - If this option is left off, you allow userspace access to all + If this option is left on, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental access to this is obviously disastrous, but specific access can be used by people debugging the kernel. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d4585077977a..c34dc483839c 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -373,8 +373,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } -#ifdef CONFIG_NONPROMISC_DEVMEM -/* This check is done in drivers/char/mem.c in case of NONPROMISC_DEVMEM*/ +#ifndef CONFIG_PROMISC_DEVMEM +/* This check is done in drivers/char/mem.c in case of !PROMISC_DEVMEM*/ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; @@ -398,7 +398,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } return 1; } -#endif /* CONFIG_NONPROMISC_DEVMEM */ +#endif /* CONFIG_PROMISC_DEVMEM */ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) -- cgit v1.2.2 From 6879827f4e08da219c99b91e4e1d793a924103e3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 01:21:53 +0200 Subject: x86: remove arch/x86/kernel/smpcommon_32.c Yinghai Lu noticed that arch/x86/kernel/smpcommon_32.c got renamed to arch/x86/kernel/smpcommon.c but the old almost-empty file stayed around. Zap it. Reported-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpcommon_32.c | 1 - 1 file changed, 1 deletion(-) delete mode 100644 arch/x86/kernel/smpcommon_32.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpcommon_32.c b/arch/x86/kernel/smpcommon_32.c deleted file mode 100644 index 8b137891791f..000000000000 --- a/arch/x86/kernel/smpcommon_32.c +++ /dev/null @@ -1 +0,0 @@ - -- cgit v1.2.2 From 29cbeb0e17d9d2ca824f62f71cfa7360b3157112 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 17 Jul 2008 21:50:23 -0700 Subject: x86: use cpu_clear in remove_cpu_from_maps Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e8..27456574f070 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1311,7 +1311,7 @@ static void __ref remove_cpu_from_maps(int cpu) cpu_clear(cpu, cpu_callout_map); cpu_clear(cpu, cpu_callin_map); /* was set by cpu_init() */ - clear_bit(cpu, (unsigned long *)&cpu_initialized); + cpu_clear(cpu, cpu_initialized); numa_remove_cpu(cpu); } -- cgit v1.2.2 From 593f4a788e5d09e9f00182561437461b0b564de4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Wed, 16 Jul 2008 19:15:30 +0100 Subject: x86: APIC: remove apic_write_around(); use alternatives Use alternatives to select the workaround for the 11AP Pentium erratum for the affected steppings on the fly rather than build time. Remove the X86_GOOD_APIC configuration option and replace all the calls to apic_write_around() with plain apic_write(), protecting accesses to the ESR as appropriate due to the 3AP Pentium erratum. Remove apic_read_around() and all its invocations altogether as not needed. Remove apic_write_atomic() and all its implementing backends. The use of ASM_OUTPUT2() is not strictly needed for input constraints, but I have used it for readability's sake. I had the feeling no one else was brave enough to do it, so I went ahead and here it is. Verified by checking the generated assembly and tested with both a 32-bit and a 64-bit configuration, also with the 11AP "feature" forced on and verified with gdb on /proc/kcore to work as expected (as an 11AP machines are quite hard to get hands on these days). Some script complained about the use of "volatile", but apic_write() needs it for the same reason and is effectively a replacement for writel(), so I have disregarded it. I am not sure what the policy wrt defconfig files is, they are generated and there is risk of a conflict resulting from an unrelated change, so I have left changes to them out. The option will get removed from them at the next run. Some testing with machines other than mine will be needed to avoid some stupid mistake, but despite its volume, the change is not really that intrusive, so I am fairly confident that because it works for me, it will everywhere. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 4 --- arch/x86/kernel/apic_32.c | 75 ++++++++++++++++++++--------------------- arch/x86/kernel/cpu/bugs.c | 23 +------------ arch/x86/kernel/cpu/intel.c | 10 ++++++ arch/x86/kernel/cpu/mcheck/p4.c | 4 +-- arch/x86/kernel/io_apic_32.c | 14 ++++---- arch/x86/kernel/ipi.c | 6 ++-- arch/x86/kernel/nmi.c | 4 +-- arch/x86/kernel/paravirt.c | 1 - arch/x86/kernel/smpboot.c | 49 +++++++++++---------------- arch/x86/kernel/vmi_32.c | 1 - arch/x86/lguest/boot.c | 1 - arch/x86/xen/enlighten.c | 1 - 13 files changed, 80 insertions(+), 113 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index abff1b84ed5b..54b8c02c71e6 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -362,10 +362,6 @@ config X86_ALIGNMENT_16 def_bool y depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 -config X86_GOOD_APIC - def_bool y - depends on MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || MK8 || MEFFICEON || MCORE2 || MVIAC7 || X86_64 - config X86_INTEL_USERCOPY def_bool y depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20b..2bc1186cc95a 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -177,7 +177,7 @@ void __cpuinit enable_NMI_through_LVT0(void) /* Level triggered for 82489DX */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT0, v); + apic_write(APIC_LVT0, v); } /** @@ -212,9 +212,6 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -229,18 +226,18 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) if (!irqen) lvtt_value |= APIC_LVT_MASKED; - apic_write_around(APIC_LVTT, lvtt_value); + apic_write(APIC_LVTT, lvtt_value); /* * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write_around(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) - apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -249,7 +246,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) static int lapic_next_event(unsigned long delta, struct clock_event_device *evt) { - apic_write_around(APIC_TMICT, delta); + apic_write(APIC_TMICT, delta); return 0; } @@ -278,7 +275,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, case CLOCK_EVT_MODE_SHUTDOWN: v = apic_read(APIC_LVTT); v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, v); + apic_write(APIC_LVTT, v); break; case CLOCK_EVT_MODE_RESUME: /* Nothing to do here */ @@ -693,44 +690,44 @@ void clear_local_APIC(void) */ if (maxlvt >= 3) { v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write_around(APIC_LVTERR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); } /* * Careful: we have to set masks only first to deassert * any level-triggered sources. */ v = apic_read(APIC_LVTT); - apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); v = apic_read(APIC_LVT1); - apic_write_around(APIC_LVT1, v | APIC_LVT_MASKED); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); if (maxlvt >= 4) { v = apic_read(APIC_LVTPC); - apic_write_around(APIC_LVTPC, v | APIC_LVT_MASKED); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } /* lets not touch this if we didn't frob it */ #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, v | APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); } #endif /* * Clean APIC state for other OSs: */ - apic_write_around(APIC_LVTT, APIC_LVT_MASKED); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); - apic_write_around(APIC_LVT1, APIC_LVT_MASKED); + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); if (maxlvt >= 3) - apic_write_around(APIC_LVTERR, APIC_LVT_MASKED); + apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) - apic_write_around(APIC_LVTPC, APIC_LVT_MASKED); + apic_write(APIC_LVTPC, APIC_LVT_MASKED); #ifdef CONFIG_X86_MCE_P4THERMAL if (maxlvt >= 5) - apic_write_around(APIC_LVTTHMR, APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); #endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { @@ -756,7 +753,7 @@ void disable_local_APIC(void) */ value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * When LAPIC was disabled by the BIOS and enabled by the kernel, @@ -865,8 +862,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write_around(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* @@ -902,16 +899,16 @@ void __init init_bsp_APIC(void) else value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up the virtual wire mode. */ - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } static void __cpuinit lapic_setup_esr(void) @@ -926,7 +923,7 @@ static void __cpuinit lapic_setup_esr(void) /* enables sending errors */ value = ERROR_APIC_VECTOR; - apic_write_around(APIC_LVTERR, value); + apic_write(APIC_LVTERR, value); /* * spec says clear errors after enabling vector. */ @@ -989,7 +986,7 @@ void __cpuinit setup_local_APIC(void) */ value = apic_read(APIC_TASKPRI); value &= ~APIC_TPRI_MASK; - apic_write_around(APIC_TASKPRI, value); + apic_write(APIC_TASKPRI, value); /* * After a crash, we no longer service the interrupts and a pending @@ -1047,7 +1044,7 @@ void __cpuinit setup_local_APIC(void) * Set spurious IRQ vector */ value |= SPURIOUS_APIC_VECTOR; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); /* * Set up LVT0, LVT1: @@ -1069,7 +1066,7 @@ void __cpuinit setup_local_APIC(void) apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", smp_processor_id()); } - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); /* * only the BP should see the LINT1 NMI signal, obviously. @@ -1080,7 +1077,7 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI | APIC_LVT_MASKED; if (!integrated) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } void __cpuinit end_local_APIC_setup(void) @@ -1091,7 +1088,7 @@ void __cpuinit end_local_APIC_setup(void) /* Disable the local apic timer */ value = apic_read(APIC_LVTT); value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write_around(APIC_LVTT, value); + apic_write(APIC_LVTT, value); setup_apic_nmi_watchdog(NULL); apic_pm_activate(); @@ -1419,7 +1416,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; value |= 0xf; - apic_write_around(APIC_SPIV, value); + apic_write(APIC_SPIV, value); if (!virt_wire_setup) { /* @@ -1432,10 +1429,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write_around(APIC_LVT0, value); + apic_write(APIC_LVT0, value); } else { /* Disable LVT0 */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); } /* @@ -1449,7 +1446,7 @@ void disconnect_bsp_APIC(int virt_wire_setup) APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write_around(APIC_LVT1, value); + apic_write(APIC_LVT1, value); } } diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 1b1c56bb338f..c9b58a806e85 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -131,13 +131,7 @@ static void __init check_popad(void) * (for due to lack of "invlpg" and working WP on a i386) * - In order to run on anything without a TSC, we need to be * compiled for a i486. - * - In order to support the local APIC on a buggy Pentium machine, - * we need to be compiled with CONFIG_X86_GOOD_APIC disabled, - * which happens implicitly if compiled for a Pentium or lower - * (unless an advanced selection of CPU features is used) as an - * otherwise config implies a properly working local APIC without - * the need to do extra reads from the APIC. -*/ + */ static void __init check_config(void) { @@ -151,21 +145,6 @@ static void __init check_config(void) if (boot_cpu_data.x86 == 3) panic("Kernel requires i486+ for 'invlpg' and other features"); #endif - -/* - * If we were told we had a good local APIC, check for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_GOOD_APIC) - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL - && cpu_has_apic - && boot_cpu_data.x86 == 5 - && boot_cpu_data.x86_model == 2 - && (boot_cpu_data.x86_mask < 6 || boot_cpu_data.x86_mask == 11)) - panic("Kernel compiled for PMMX+, assumes a local APIC without the read-before-write bug!"); -#endif } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 70609efdf1da..b75f2569b8f8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -227,6 +227,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ds_init_intel(c); + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif diff --git a/arch/x86/kernel/cpu/mcheck/p4.c b/arch/x86/kernel/cpu/mcheck/p4.c index eef001ad3bde..9b60fce09f75 100644 --- a/arch/x86/kernel/cpu/mcheck/p4.c +++ b/arch/x86/kernel/cpu/mcheck/p4.c @@ -102,7 +102,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) /* The temperature transition interrupt handler setup */ h = THERMAL_APIC_VECTOR; /* our delivery vector */ h |= (APIC_DM_FIXED | APIC_LVT_MASKED); /* we'll mask till we're ready */ - apic_write_around(APIC_LVTTHMR, h); + apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); wrmsr(MSR_IA32_THERM_INTERRUPT, l | 0x03 , h); @@ -114,7 +114,7 @@ static void intel_init_thermal(struct cpuinfo_x86 *c) wrmsr(MSR_IA32_MISC_ENABLE, l | (1<<3), h); l = apic_read(APIC_LVTTHMR); - apic_write_around(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); + apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); printk(KERN_INFO "CPU%d: Thermal monitoring enabled\n", cpu); /* enable thermal throttle processing */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 558abf4c796a..eabaf9244f5b 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -756,7 +756,7 @@ void send_IPI_self(int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } #endif /* !CONFIG_SMP */ @@ -2030,7 +2030,7 @@ static void mask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } static void unmask_lapic_irq(unsigned int irq) @@ -2038,7 +2038,7 @@ static void unmask_lapic_irq(unsigned int irq) unsigned long v; v = apic_read(APIC_LVT0); - apic_write_around(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static struct irq_chip lapic_chip __read_mostly = { @@ -2168,7 +2168,7 @@ static inline void __init check_timer(void) * The AEOI mode will finish them in the 8259A * automatically. */ - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); @@ -2256,7 +2256,7 @@ static inline void __init check_timer(void) printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); lapic_register_intr(0, vector); - apic_write_around(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { @@ -2264,14 +2264,14 @@ static inline void __init check_timer(void) goto out; } disable_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); printk(" failed.\n"); printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); init_8259A(0); make_8259A_irq(0); - apic_write_around(APIC_LVT0, APIC_DM_EXTINT); + apic_write(APIC_LVT0, APIC_DM_EXTINT); unlock_ExtINT_logic(); diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 9d98cda39ad9..3f7537b669d3 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -70,7 +70,7 @@ void __send_IPI_shortcut(unsigned int shortcut, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } void send_IPI_self(int vector) @@ -98,7 +98,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) * prepare target chip field */ cfg = __prepare_ICR2(mask); - apic_write_around(APIC_ICR2, cfg); + apic_write(APIC_ICR2, cfg); /* * program the ICR @@ -108,7 +108,7 @@ static inline void __send_IPI_dest_field(unsigned long mask, int vector) /* * Send the IPI. The write to APIC_ICR fires this off. */ - apic_write_around(APIC_ICR, cfg); + apic_write(APIC_ICR, cfg); } /* diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad0..384b49fed598 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -263,7 +263,7 @@ late_initcall(init_lapic_nmi_sysfs); static void __acpi_nmi_enable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI); + apic_write(APIC_LVT0, APIC_DM_NMI); } /* @@ -277,7 +277,7 @@ void acpi_nmi_enable(void) static void __acpi_nmi_disable(void *__unused) { - apic_write_around(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED); } /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index e0f571d58c19..5d7326a60b7c 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -361,7 +361,6 @@ struct pv_cpu_ops pv_cpu_ops = { struct pv_apic_ops pv_apic_ops = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = native_apic_write, - .apic_write_atomic = native_apic_write_atomic, .apic_read = native_apic_read, .setup_boot_clock = setup_boot_APIC_clock, .setup_secondary_clock = setup_secondary_APIC_clock, diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 687376ab07e8..f251f5c38823 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -546,8 +546,8 @@ static inline void __inquire_remote_apic(int apicid) printk(KERN_CONT "a previous APIC delivery may have failed\n"); - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write(APIC_ICR, APIC_DM_REMRD | regs[i]); timeout = 0; do { @@ -579,11 +579,11 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) int maxlvt; /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -592,14 +592,9 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); Dprintk("NMI sent.\n"); @@ -625,12 +620,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) return send_status; } + maxlvt = lapic_get_maxlvt(); + /* * Be paranoid about clearing APIC errors. */ if (APIC_INTEGRATED(apic_version[phys_apicid])) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -639,13 +636,13 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Turn INIT on target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* * Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); + apic_write(APIC_ICR, + APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -655,10 +652,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) Dprintk("Deasserting INIT.\n"); /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Send IPI */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); Dprintk("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); @@ -689,12 +686,10 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ Dprintk("#startup loops: %d.\n", num_starts); - maxlvt = lapic_get_maxlvt(); - for (j = 1; j <= num_starts; j++) { Dprintk("Sending STARTUP #%d.\n", j); - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); apic_read(APIC_ESR); Dprintk("After apic_write.\n"); @@ -703,12 +698,11 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); /* Boot on the stack */ /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); + apic_write(APIC_ICR, APIC_DM_STARTUP | (start_eip >> 12)); /* * Give the other CPU some time to accept the IPI. @@ -724,13 +718,8 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - /* - * Due to the Pentium erratum 3AP. - */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); - } accept_status = (apic_read(APIC_ESR) & 0xEF); if (send_status || accept_status) break; diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index b15346092b7b..0a1b1a9d922d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -906,7 +906,6 @@ static inline int __init activate_vmi(void) #ifdef CONFIG_X86_LOCAL_APIC para_fill(pv_apic_ops.apic_read, APICRead); para_fill(pv_apic_ops.apic_write, APICWrite); - para_fill(pv_apic_ops.apic_write_atomic, APICWrite); #endif /* diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 50dad44fb542..0313a5eec412 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -991,7 +991,6 @@ __init void lguest_init(void) #ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ pv_apic_ops.apic_write = lguest_apic_write; - pv_apic_ops.apic_write_atomic = lguest_apic_write; pv_apic_ops.apic_read = lguest_apic_read; #endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb508456ef52..7f26c3718777 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1131,7 +1131,6 @@ static const struct pv_irq_ops xen_irq_ops __initdata = { static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, - .apic_write_atomic = xen_apic_write, .apic_read = xen_apic_read, .setup_boot_clock = paravirt_nop, .setup_secondary_clock = paravirt_nop, -- cgit v1.2.2 From 95c7c23b06bc92f1772b9c9460845f179ba8c39e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 15 Jul 2008 13:42:34 -0700 Subject: xen: report hypervisor version Various versions of the hypervisor have differences in what ABIs and features they support. Print some details into the boot log to help with remote debugging. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bb508456ef52..5328e46d9cf7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -167,10 +167,14 @@ void xen_vcpu_restore(void) static void __init xen_banner(void) { + unsigned version = HYPERVISOR_xen_version(XENVER_version, NULL); + struct xen_extraversion extra; + HYPERVISOR_xen_version(XENVER_extraversion, &extra); + printk(KERN_INFO "Booting paravirtualized kernel on %s\n", pv_info.name); - printk(KERN_INFO "Hypervisor signature: %s%s\n", - xen_start_info->magic, + printk(KERN_INFO "Xen version: %d.%d%s%s\n", + version >> 16, version & 0xffff, extra.extraversion, xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } -- cgit v1.2.2 From fbdb7da91b0382d4b148d8b43c2eb4bab642bb5b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 14 Jul 2008 15:34:09 -0700 Subject: x86_64: ia32_signal.c: use macro instead of immediate Make and use macro FIX_EFLAGS, instead of immediate value 0x40DD5 in ia32_restore_sigcontext(). Signed-off-by: Hiroshi Shimamoto Acked-by: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index cb3856a18c85..dc9b9b9803f6 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -36,6 +36,11 @@ #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) +#define FIX_EFLAGS (X86_EFLAGS_AC | X86_EFLAGS_OF | \ + X86_EFLAGS_DF | X86_EFLAGS_TF | X86_EFLAGS_SF | \ + X86_EFLAGS_ZF | X86_EFLAGS_AF | X86_EFLAGS_PF | \ + X86_EFLAGS_CF) + asmlinkage int do_signal(struct pt_regs *regs, sigset_t *oldset); void signal_fault(struct pt_regs *regs, void __user *frame, char *where); @@ -248,7 +253,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, regs->ss |= 3; err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~0x40DD5) | (tmpflags & 0x40DD5); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); /* disable syscall checks */ regs->orig_ax = -1; -- cgit v1.2.2 From 1f067167a83d1c7f80437fd1d32b55508aaca009 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Jul 2008 00:02:28 -0700 Subject: x86: seperate memtest from init_64.c it's separate functionality that deserves its own file. This also prepares 32-bit memtest support. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 1 + arch/x86/mm/init_64.c | 112 --------------------------------------------- arch/x86/mm/memtest.c | 123 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 124 insertions(+), 112 deletions(-) create mode 100644 arch/x86/mm/memtest.c (limited to 'arch/x86') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 9873716e9f76..1fbb844c3d7a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -21,3 +21,4 @@ obj-$(CONFIG_K8_NUMA) += k8topology_64.o endif obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o +obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 306049edd553..ec37121f6709 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -517,118 +517,6 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -#ifdef CONFIG_MEMTEST - -static void __init memtest(unsigned long start_phys, unsigned long size, - unsigned pattern) -{ - unsigned long i; - unsigned long *start; - unsigned long start_bad; - unsigned long last_bad; - unsigned long val; - unsigned long start_phys_aligned; - unsigned long count; - unsigned long incr; - - switch (pattern) { - case 0: - val = 0UL; - break; - case 1: - val = -1UL; - break; - case 2: - val = 0x5555555555555555UL; - break; - case 3: - val = 0xaaaaaaaaaaaaaaaaUL; - break; - default: - return; - } - - incr = sizeof(unsigned long); - start_phys_aligned = ALIGN(start_phys, incr); - count = (size - (start_phys_aligned - start_phys))/incr; - start = __va(start_phys_aligned); - start_bad = 0; - last_bad = 0; - - for (i = 0; i < count; i++) - start[i] = val; - for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { - if (*start != val) { - if (start_phys_aligned == last_bad + incr) { - last_bad += incr; - } else { - if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", - val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); - } - start_bad = last_bad = start_phys_aligned; - } - } - } - if (start_bad) { - printk(KERN_CONT "\n %016lx bad mem addr %016lx - %016lx reserved", - val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); - } - -} - -/* default is disabled */ -static int memtest_pattern __initdata; - -static int __init parse_memtest(char *arg) -{ - if (arg) - memtest_pattern = simple_strtoul(arg, NULL, 0); - return 0; -} - -early_param("memtest", parse_memtest); - -static void __init early_memtest(unsigned long start, unsigned long end) -{ - u64 t_start, t_size; - unsigned pattern; - - if (!memtest_pattern) - return; - - printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); - for (pattern = 0; pattern < memtest_pattern; pattern++) { - t_start = start; - t_size = 0; - while (t_start < end) { - t_start = find_e820_area_size(t_start, &t_size, 1); - - /* done ? */ - if (t_start >= end) - break; - if (t_start + t_size > end) - t_size = end - t_start; - - printk(KERN_CONT "\n %016llx - %016llx pattern %d", - (unsigned long long)t_start, - (unsigned long long)t_start + t_size, pattern); - - memtest(t_start, t_size, pattern); - - t_start += t_size; - } - } - printk(KERN_CONT "\n"); -} -#else -static void __init early_memtest(unsigned long start, unsigned long end) -{ -} -#endif - static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c new file mode 100644 index 000000000000..672e17f8262a --- /dev/null +++ b/arch/x86/mm/memtest.c @@ -0,0 +1,123 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static void __init memtest(unsigned long start_phys, unsigned long size, + unsigned pattern) +{ + unsigned long i; + unsigned long *start; + unsigned long start_bad; + unsigned long last_bad; + unsigned long val; + unsigned long start_phys_aligned; + unsigned long count; + unsigned long incr; + + switch (pattern) { + case 0: + val = 0UL; + break; + case 1: + val = -1UL; + break; + case 2: +#ifdef CONFIG_X86_64 + val = 0x5555555555555555UL; +#else + val = 0x55555555UL; +#endif + break; + case 3: +#ifdef CONFIG_X86_64 + val = 0xaaaaaaaaaaaaaaaaUL; +#else + val = 0xaaaaaaaaUL; +#endif + break; + default: + return; + } + + incr = sizeof(unsigned long); + start_phys_aligned = ALIGN(start_phys, incr); + count = (size - (start_phys_aligned - start_phys))/incr; + start = __va(start_phys_aligned); + start_bad = 0; + last_bad = 0; + + for (i = 0; i < count; i++) + start[i] = val; + for (i = 0; i < count; i++, start++, start_phys_aligned += incr) { + if (*start != val) { + if (start_phys_aligned == last_bad + incr) { + last_bad += incr; + } else { + if (start_bad) { + printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + start_bad = last_bad = start_phys_aligned; + } + } + } + if (start_bad) { + printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", + val, start_bad, last_bad + incr); + reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + } + +} + +/* default is disabled */ +static int memtest_pattern __initdata; + +static int __init parse_memtest(char *arg) +{ + if (arg) + memtest_pattern = simple_strtoul(arg, NULL, 0); + return 0; +} + +early_param("memtest", parse_memtest); + +void __init early_memtest(unsigned long start, unsigned long end) +{ + u64 t_start, t_size; + unsigned pattern; + + if (!memtest_pattern) + return; + + printk(KERN_INFO "early_memtest: pattern num %d", memtest_pattern); + for (pattern = 0; pattern < memtest_pattern; pattern++) { + t_start = start; + t_size = 0; + while (t_start < end) { + t_start = find_e820_area_size(t_start, &t_size, 1); + + /* done ? */ + if (t_start >= end) + break; + if (t_start + t_size > end) + t_size = end - t_start; + + printk(KERN_CONT "\n %010llx - %010llx pattern %d", + (unsigned long long)t_start, + (unsigned long long)t_start + t_size, pattern); + + memtest(t_start, t_size, pattern); + + t_start += t_size; + } + } + printk(KERN_CONT "\n"); +} -- cgit v1.2.2 From caadbdce240c43e3e46c82fce6c00eb7f01e1beb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 15 Jul 2008 00:03:44 -0700 Subject: x86: enable memory tester support on 32-bit only supports memory below max_low_pfn. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - arch/x86/mm/init_32.c | 3 +++ 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 96e0c2ebc388..03980cb04291 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -447,7 +447,6 @@ config PARAVIRT_DEBUG config MEMTEST bool "Memtest" - depends on X86_64 help This option adds a kernel parameter 'memtest', which allows memtest to be set. diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9689a5138e64..3eeab6d0065f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -844,6 +844,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT, "PGTABLE"); + if (!after_init_bootmem) + early_memtest(start, end); + return end >> PAGE_SHIFT; } -- cgit v1.2.2 From 89b3b1f41bd94085da2f08dcb719bdbf7e8e9d57 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 15 Jul 2008 21:02:54 +0400 Subject: x86: apic_64 - make calibrate_APIC_clock to return error code Make calibration_result to return error and check calibration_result to be sufficient inside calibrate_APIC_clock. Signed-off-by: Cyrill Gorcunov Cc: Cyrill Gorcunov Cc: macro@linux-mips.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c14..ce294d623e5f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -314,7 +314,7 @@ static void setup_APIC_timer(void) #define TICK_COUNT 100000000 -static void __init calibrate_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { unsigned apic, apic_start; unsigned long tsc, tsc_start; @@ -368,6 +368,17 @@ static void __init calibrate_APIC_clock(void) clockevent_delta2ns(0xF, &lapic_clockevent); calibration_result = result / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; } /* @@ -394,14 +405,7 @@ void __init setup_boot_APIC_clock(void) } printk(KERN_INFO "Using local APIC timer interrupts.\n"); - calibrate_APIC_clock(); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) setup_APIC_timer(); -- cgit v1.2.2 From 836c129de971d526b6e85b8ad760bd635a00215e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 15 Jul 2008 21:02:55 +0400 Subject: x86: apic_32 - introduce calibrate_APIC_clock Introduce calibrate_APIC_clock so it could help in further 32/64bit apic code merging. Signed-off-by: Cyrill Gorcunov Cc: Cyrill Gorcunov Cc: macro@linux-mips.org Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 93 +++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 2bc1186cc95a..d2a7eb511d6b 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -369,12 +369,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) +static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); const long pm_100ms = PMTMR_TICKS_PER_SEC/10; @@ -384,24 +379,6 @@ void __init setup_boot_APIC_clock(void) long delta, deltapm; int pm_referenced = 0; - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (local_apic_timer_disabled) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - local_irq_disable(); /* Replace the global interrupt handler */ @@ -486,8 +463,6 @@ void __init setup_boot_APIC_clock(void) calibration_result / (1000000 / HZ), calibration_result % (1000000 / HZ)); - local_apic_timer_verify_ok = 1; - /* * Do a sanity check on the APIC calibration result */ @@ -495,12 +470,11 @@ void __init setup_boot_APIC_clock(void) local_irq_enable(); printk(KERN_WARNING "APIC frequency too slow, disabling apic timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; + return -1; } + local_apic_timer_verify_ok = 1; + /* We trust the pm timer based calibration */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); @@ -540,22 +514,55 @@ void __init setup_boot_APIC_clock(void) if (!local_apic_timer_verify_ok) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (local_apic_timer_disabled) { /* No broadcast on UP ! */ - if (num_possible_cpus() == 1) - return; - } else { - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; } + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } -- cgit v1.2.2 From 17c44697f293cf24cbbf51b4a5bd15f4fbc88b90 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 17:18:04 +0100 Subject: x86: I/O APIC: Include required by some code Include for i8259A_lock used in print_PIC() -- #if-0-ed out by default. The 32-bit version gets it right already. The plan is to enable this code with "apic=debug" eventually. This will aid with debugging strange problems without the need to ask people to apply patches. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 6510cde36b35..834b06afef43 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From baa1318841d4bc95d783e6c15219b264720002c8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 18:44:51 +0100 Subject: x86: APIC: Make apic_verbosity unsigned As a microoptimisation, make apic_verbosity unsigned. This will make apic_printk(APIC_QUIET, ...) expand into just printk(...) with the surrounding condition and a reference to apic_verbosity removed. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d2a7eb511d6b..7f30c0f3dbe4 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -75,7 +75,7 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; int pic_mode; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ce294d623e5f..98c70f044e19 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -54,7 +54,7 @@ EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); /* * Debug level, exported for io_apic.c */ -int apic_verbosity; +unsigned int apic_verbosity; /* Have we found an MP table */ int smp_found_config; -- cgit v1.2.2 From 49a66a0bcea8737e82035a31ad0d8cd64d027a5d Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 14 Jul 2008 19:08:13 +0100 Subject: x86: I/O APIC: Always report how the timer has been set up Following recent (and less so) issues with the 8254 timer when routed through the I/O or local APIC, always report which configurations have been tried and which one has been set up eventually. This is so that logs posted by people for some other reason can be used as a cross-reference when investigating any possible future problems. The change unifies messages printed on 32-bit and 64-bit platforms and adds trailing newlines (removes leading ones), so that proper log level annotation can be used and any possible interspersed output will not cause a mess. I have chosen to use apic_printk(APIC_QUIET, ...) rather than printk(...) so that the distinction of these messages is maintained making possible future decisions about changes in this area easier. A change posted separately making apic_verbosity unsigned removes any extra code that would otherwise be generated as a result of this design decision. Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 39 ++++++++++++++++++++++----------------- arch/x86/kernel/io_apic_64.c | 40 ++++++++++++++++++++++------------------ 2 files changed, 44 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index eabaf9244f5b..de9aa0e3a9c5 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -2177,8 +2177,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - printk(KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2216,12 +2217,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - printk(KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); - printk(KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - printk("\n..... (found pin %d) ...", pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -2230,7 +2232,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - printk("works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -2244,30 +2246,33 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } timer_ack = 0; - printk(KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0, vector); apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); - printk(" failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - printk(KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -2276,12 +2281,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - printk(" works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - printk(" failed :(.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option"); + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 834b06afef43..64a46affd858 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1697,8 +1697,9 @@ static inline void __init check_timer(void) pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -1736,14 +1737,13 @@ static inline void __init check_timer(void) } clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: " + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " "8254 timer not connected to IO-APIC\n"); - apic_printk(APIC_VERBOSE,KERN_INFO - "...trying to set up timer (IRQ0) " - "through the 8259A ... "); - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); /* * legacy devices should be connected to IO APIC #0 */ @@ -1752,7 +1752,7 @@ static inline void __init check_timer(void) unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); timer_through_8259 = 1; if (nmi_watchdog == NMI_IO_APIC) { disable_8259A_irq(0); @@ -1766,29 +1766,32 @@ static inline void __init check_timer(void) */ disable_8259A_irq(0); clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); } if (nmi_watchdog == NMI_IO_APIC) { - printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as Virtual Wire IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); lapic_register_intr(0); apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } disable_8259A_irq(0); apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_VERBOSE," failed.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - apic_printk(APIC_VERBOSE, KERN_INFO "...trying to set up timer as ExtINT IRQ..."); + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); init_8259A(0); make_8259A_irq(0); @@ -1797,11 +1800,12 @@ static inline void __init check_timer(void) unlock_ExtINT_logic(); if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); goto out; } - apic_printk(APIC_VERBOSE," failed :(.\n"); - panic("IO-APIC + timer doesn't work! Try using the 'noapic' kernel parameter\n"); + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); out: local_irq_restore(flags); } -- cgit v1.2.2 From 7019cc2dd6fafcdc6b104005482dc910dcdbb797 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Wed, 9 Jul 2008 15:27:19 -0500 Subject: x86 BIOS interface for RTC on SGI UV Real-time code needs to know the number of cycles per second on SGI UV. The information is provided via a run time BIOS call. This patch provides the linux side of that interface. This is the first of several run time BIOS calls to be defined in uv/bios.h and bios_uv.c. Note that BIOS_CALL() is just a stub for now. The bios side is being worked on. Signed-off-by: Russ Anderson Cc: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/bios_uv.c | 48 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/genx2apic_uv_x.c | 23 +++++++++++++++++++ 3 files changed, 72 insertions(+) create mode 100644 arch/x86/kernel/bios_uv.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index da140611bb57..b78a17b12810 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_OLPC) += olpc.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o + obj-y += bios_uv.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o obj-$(CONFIG_AUDIT) += audit_64.o diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c new file mode 100644 index 000000000000..c639bd55391c --- /dev/null +++ b/arch/x86/kernel/bios_uv.c @@ -0,0 +1,48 @@ +/* + * BIOS run time interface routines. + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include + +const char * +x86_bios_strerror(long status) +{ + const char *str; + switch (status) { + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; + } + return str; +} + +long +x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + struct uv_bios_retval isrv; + + BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); + *ticks_per_second = isrv.v0; + *drift_info = isrv.v1; + return isrv.status; +} +EXPORT_SYMBOL_GPL(x86_bios_freq_base); diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b06..3c3929340692 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -24,6 +24,7 @@ #include #include #include +#include DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -40,6 +41,9 @@ EXPORT_SYMBOL_GPL(uv_cpu_to_blade); short uv_possible_blades; EXPORT_SYMBOL_GPL(uv_possible_blades); +unsigned long sn_rtc_cycles_per_second; +EXPORT_SYMBOL(sn_rtc_cycles_per_second); + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t uv_target_cpus(void) @@ -272,6 +276,23 @@ static __init void map_mmioh_high(int max_pnode) map_high("MMIOH", mmioh.s.base, shift, map_uc); } +static __init void uv_rtc_init(void) +{ + long status, ticks_per_sec, drift; + + status = + x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, + &drift); + if (status != 0 || ticks_per_sec < 100000) { + printk(KERN_WARNING + "unable to determine platform RTC clock frequency, " + "guessing.\n"); + /* BIOS gives wrong value for clock freq. so guess */ + sn_rtc_cycles_per_second = 1000000000000UL / 30000UL; + } else + sn_rtc_cycles_per_second = ticks_per_sec; +} + static __init void uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; @@ -326,6 +347,8 @@ static __init void uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_rtc_init(); + for_each_present_cpu(cpu) { nid = cpu_to_node(cpu); pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); -- cgit v1.2.2 From 78cbac65fd77242f3e5d77f4d7a71e8bc869fe4d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 10 Jul 2008 21:14:52 +0200 Subject: x86: traps_xx: refactor die() like in x86_64 Make the diff between the traps_32.c and traps_64.c a bit smaller. Change traps_32.c to look more like traps_64.c: - move lock information to file scope - split out oops_begin() and oops_end() from die() - increment nest counter in oops_begin Only whitespace change in traps_64.c No functional changes intended. Signed-off-by: Alexander van Heukelum Acked-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 95 +++++++++++++++++++++++++--------------------- arch/x86/kernel/traps_64.c | 2 +- 2 files changed, 52 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8a768973c4f0..51cccde376a5 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -383,6 +383,54 @@ int is_valid_bugaddr(unsigned long ip) return ud2 == 0x0b0f; } +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + int __kprobes __die(const char *str, struct pt_regs *regs, long err) { unsigned short ss; @@ -423,31 +471,9 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) */ void die(const char *str, struct pt_regs *regs, long err) { - static struct { - raw_spinlock_t lock; - u32 lock_owner; - int lock_owner_depth; - } die = { - .lock = __RAW_SPIN_LOCK_UNLOCKED, - .lock_owner = -1, - .lock_owner_depth = 0 - }; - unsigned long flags; - - oops_enter(); - - if (die.lock_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die.lock); - die.lock_owner = smp_processor_id(); - die.lock_owner_depth = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } + unsigned long flags = oops_begin(); - if (++die.lock_owner_depth < 3) { + if (die_nest_count < 3) { report_bug(regs->ip, regs); if (__die(str, regs, err)) @@ -456,26 +482,7 @@ void die(const char *str, struct pt_regs *regs, long err) printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); } - bust_spinlocks(0); - die.lock_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die.lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(SIGSEGV); + oops_end(flags, regs, SIGSEGV); } static inline void diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 2696a6837782..babdbe673b7f 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -518,7 +518,7 @@ unsigned __kprobes long oops_begin(void) } void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ +{ die_owner = -1; bust_spinlocks(0); die_nest_count--; -- cgit v1.2.2 From 7dedcee394a3f61475d08002bd12e8068d044216 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Thu, 10 Jul 2008 21:16:39 +0200 Subject: x86: traps_xx: modify x86_64 to use _log_lvl variants i386 has show_trace_log_lvl and show_stack_log_lvl, allowing traces to be emitted with log-level annotations. This patch introduces them to x86_64, but log_lvl is only ever set to an empty string. Output of traces is unchanged. i386-chunk is whitespace-only. Signed-off-by: Alexander van Heukelum Cc: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 2 +- arch/x86/kernel/traps_64.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 51cccde376a5..c971dce3847b 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -256,7 +256,7 @@ static const struct stacktrace_ops print_trace_ops = { static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) + unsigned long *stack, unsigned long bp, char *log_lvl) { dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("%s =======================\n", log_lvl); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index babdbe673b7f..c664e6962009 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -355,17 +355,24 @@ static const struct stacktrace_ops print_trace_ops = { .address = print_trace_address, }; -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) { printk("\nCall Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, NULL); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); printk("\n"); } +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + static void -_show_stack(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp) +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -399,12 +406,12 @@ _show_stack(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } - show_trace(task, regs, sp, bp); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - _show_stack(task, NULL, sp, 0); + show_stack_log_lvl(task, NULL, sp, 0, ""); } /* @@ -454,7 +461,8 @@ void show_registers(struct pt_regs *regs) u8 *ip; printk("Stack: "); - _show_stack(NULL, regs, (unsigned long *)sp, regs->bp); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); printk("\n"); printk(KERN_EMERG "Code: "); -- cgit v1.2.2 From 3f9b5cc018566ad9562df0648395649aebdbc5e0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 16:30:05 +0200 Subject: x86: re-enable OPTIMIZE_INLINING re-enable OPTIMIZE_INLINING more widely. Jeff Dike fixed the remaining outstanding issue in this commit: | commit 4f81c5350b44bcc501ab6f8a089b16d064b4d2f6 | Author: Jeff Dike | Date: Mon Jul 7 13:36:56 2008 -0400 | | [UML] fix gcc ICEs and unresolved externs [...] | This patch reintroduces unit-at-a-time for gcc >= 4.0, bringing back the | possibility of Uli's crash. If that happens, we'll debug it. it's still default-off and thus opt-in. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index ae36bfa814e5..ffd5913b35d1 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -287,7 +287,6 @@ config CPA_DEBUG config OPTIMIZE_INLINING bool "Allow gcc to uninline functions marked 'inline'" - depends on BROKEN help This option determines if the kernel forces gcc to inline the functions developers have marked 'inline'. Doing so takes away freedom from gcc to @@ -298,5 +297,7 @@ config OPTIMIZE_INLINING become the default in the future, until then this option is there to test gcc for this. + If unsure, say N. + endmenu -- cgit v1.2.2 From 8b2b9c1af065a45ef00c26964420489a53581779 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 15 Jul 2008 17:09:03 +0900 Subject: x86, intel_cacheinfo: fix use-after-free cache_kobject This avoids calling kobject_uevent() with cache_kobject that has already been deallocated in an error path. Signed-off-by: Akinobu Mita Cc: "H. Peter Anvin" Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 2c8afafa18e8..ff517f0b8cc4 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -780,15 +780,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) } kobject_put(per_cpu(cache_kobject, cpu)); cpuid4_cache_sysfs_exit(cpu); - break; + return retval; } kobject_uevent(&(this_object->kobj), KOBJ_ADD); } - if (!retval) - cpu_set(cpu, cache_dev_map); + cpu_set(cpu, cache_dev_map); kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); - return retval; + return 0; } static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) -- cgit v1.2.2 From b8f8c3cf0a4ac0632ec3f0e15e9dc0c29de917af Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 18 Jul 2008 17:27:28 +0200 Subject: nohz: prevent tick stop outside of the idle loop Jack Ren and Eric Miao tracked down the following long standing problem in the NOHZ code: scheduler switch to idle task enable interrupts Window starts here ----> interrupt happens (does not set NEED_RESCHED) irq_exit() stops the tick ----> interrupt happens (does set NEED_RESCHED) return from schedule() cpu_idle(): preempt_disable(); Window ends here The interrupts can happen at any point inside the race window. The first interrupt stops the tick, the second one causes the scheduler to rerun and switch away from idle again and we end up with the tick disabled. The fact that it needs two interrupts where the first one does not set NEED_RESCHED and the second one does made the bug obscure and extremly hard to reproduce and analyse. Kudos to Jack and Eric. Solution: Limit the NOHZ functionality to the idle loop to make sure that we can not run into such a situation ever again. cpu_idle() { preempt_disable(); while(1) { tick_nohz_stop_sched_tick(1); <- tell NOHZ code that we are in the idle loop while (!need_resched()) halt(); tick_nohz_restart_sched_tick(); <- disables NOHZ mode preempt_enable_no_resched(); schedule(); preempt_disable(); } } In hindsight we should have done this forever, but ... /me grabs a large brown paperbag. Debugged-by: Jack Ren , Debugged-by: eric miao Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f8476dfbb60d..1f5fa1cf16dd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -166,7 +166,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { void (*idle)(void); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e2319f39988b..c0a5c2a687e6 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -148,7 +148,7 @@ void cpu_idle(void) current_thread_info()->status |= TS_POLLING; /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(); + tick_nohz_stop_sched_tick(1); while (!need_resched()) { void (*idle)(void); -- cgit v1.2.2 From 47129654226b5bd418afe533ce4e11d6a0b6d6e4 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Sun, 6 Jul 2008 20:13:49 +0400 Subject: x86 setup.c: cleanup includes x86: remove double includes in setup.c Signed-off-by: Alexander Beregalov Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81a..4a2b8acc1d95 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -57,12 +57,8 @@ #include #include #include -#include #include -#include -#include -#include #include #include #include @@ -104,7 +100,6 @@ #include #include -#include #include #include #ifdef CONFIG_X86_64 -- cgit v1.2.2 From 9781f39fd209cd93ab98b669814191acc67f32fd Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 10 Jul 2008 17:13:19 +0200 Subject: x86: consolidate the definition of the force_mwait variable The force_mwait variable iss defined either in arch/x86/kernel/cpu/amd.c or in arch/x86/kernel/setup_64.c, but it is only initialized and used in arch/x86/kernel/process.c. This patch moves the declaration to arch/x86/kernel/process.c. Signed-off-by: Thomas Petazzoni Cc: michael@free-electrons.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/process.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 81a07ca65d44..cae9cabc3031 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -int force_mwait __cpuinitdata; - static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { if (cpuid_eax(0x80000000) >= 0x80000007) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4d629c62f4f8..74f2d196adb4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,6 +15,7 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.2 From 5ff4789d045cdaec7629e027e4f8ff8e34308b81 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jul 2008 20:11:18 +0200 Subject: AMD IOMMU: set iommu for device from ACPI code too The device<->iommu relationship has to be set from the information in the ACPI table too. This patch adds this logic to the driver. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 9bf1b8111b08..7661b02d7208 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -426,11 +426,18 @@ static void set_dev_entry_bit(u16 devid, u8 bit) amd_iommu_dev_table[devid].data[i] |= (1 << _bit); } +/* Writes the specific IOMMU for a device into the rlookup table */ +static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) +{ + amd_iommu_rlookup_table[devid] = iommu; +} + /* * This function takes the device specific flags read from the ACPI * table and sets up the device table entry with that information */ -static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) +static void __init set_dev_entry_from_acpi(struct amd_iommu *iommu, + u16 devid, u32 flags, u32 ext_flags) { if (flags & ACPI_DEVFLAG_INITPASS) set_dev_entry_bit(devid, DEV_ENTRY_INIT_PASS); @@ -446,12 +453,8 @@ static void __init set_dev_entry_from_acpi(u16 devid, u32 flags, u32 ext_flags) set_dev_entry_bit(devid, DEV_ENTRY_LINT0_PASS); if (flags & ACPI_DEVFLAG_LINT1) set_dev_entry_bit(devid, DEV_ENTRY_LINT1_PASS); -} -/* Writes the specific IOMMU for a device into the rlookup table */ -static void __init set_iommu_for_device(struct amd_iommu *iommu, u16 devid) -{ - amd_iommu_rlookup_table[devid] = iommu; + set_iommu_for_device(iommu, devid); } /* @@ -550,11 +553,12 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALL: for (dev_i = iommu->first_device; dev_i <= iommu->last_device; ++dev_i) - set_dev_entry_from_acpi(dev_i, e->flags, 0); + set_dev_entry_from_acpi(iommu, dev_i, + e->flags, 0); break; case IVHD_DEV_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); break; case IVHD_DEV_SELECT_RANGE_START: devid_start = e->devid; @@ -565,7 +569,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, case IVHD_DEV_ALIAS: devid = e->devid; devid_to = e->ext >> 8; - set_dev_entry_from_acpi(devid, e->flags, 0); + set_dev_entry_from_acpi(iommu, devid, e->flags, 0); amd_iommu_alias_table[devid] = devid_to; break; case IVHD_DEV_ALIAS_RANGE: @@ -577,7 +581,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, break; case IVHD_DEV_EXT_SELECT: devid = e->devid; - set_dev_entry_from_acpi(devid, e->flags, e->ext); + set_dev_entry_from_acpi(iommu, devid, e->flags, + e->ext); break; case IVHD_DEV_EXT_SELECT_RANGE: devid_start = e->devid; @@ -590,7 +595,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, for (dev_i = devid_start; dev_i <= devid; ++dev_i) { if (alias) amd_iommu_alias_table[dev_i] = devid_to; - set_dev_entry_from_acpi( + set_dev_entry_from_acpi(iommu, amd_iommu_alias_table[dev_i], flags, ext_flags); } -- cgit v1.2.2 From 6ac8d51f01d345af5ea4209004a9ea29b2f20891 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Tue, 15 Jul 2008 21:09:13 +0530 Subject: x86: introducing asm-x86/traps.h Declaring x86 traps under one hood. Declaring x86 do_traps before defining them. Signed-off-by: Jaswinder Singh Cc: Andi Kleen Cc: David Woodhouse Cc: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 21 +-------------------- arch/x86/kernel/traps_64.c | 22 +--------------------- 2 files changed, 2 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index c971dce3847b..03df8e45e5a1 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "mach_traps.h" @@ -77,26 +78,6 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 24; static unsigned int code_bytes = 64; diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index c664e6962009..3f18d73f420c 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -51,30 +51,10 @@ #include #include #include +#include #include -asmlinkage void divide_error(void); -asmlinkage void debug(void); -asmlinkage void nmi(void); -asmlinkage void int3(void); -asmlinkage void overflow(void); -asmlinkage void bounds(void); -asmlinkage void invalid_op(void); -asmlinkage void device_not_available(void); -asmlinkage void double_fault(void); -asmlinkage void coprocessor_segment_overrun(void); -asmlinkage void invalid_TSS(void); -asmlinkage void segment_not_present(void); -asmlinkage void stack_segment(void); -asmlinkage void general_protection(void); -asmlinkage void page_fault(void); -asmlinkage void coprocessor_error(void); -asmlinkage void simd_coprocessor_error(void); -asmlinkage void alignment_check(void); -asmlinkage void spurious_interrupt_bug(void); -asmlinkage void machine_check(void); - int panic_on_unrecovered_nmi; int kstack_depth_to_print = 12; static unsigned int code_bytes = 64; -- cgit v1.2.2 From 1181f8b5f0302580af0958169ef4497c3eb57a61 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 3 Jul 2008 13:12:13 -0700 Subject: x86_32: remove redundant KERN_INFO This printk has a KERN_ facility level in the format string. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d92373630963..d633d801f858 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -212,7 +212,7 @@ asmlinkage unsigned long sys_sigreturn(unsigned long __unused) badframe: if (show_unhandled_signals && printk_ratelimit()) { - printk(KERN_INFO "%s%s[%d] bad frame in sigreturn frame:" + printk("%s%s[%d] bad frame in sigreturn frame:" "%p ip:%lx sp:%lx oeax:%lx", task_pid_nr(current) > 1 ? KERN_INFO : KERN_EMERG, current->comm, task_pid_nr(current), frame, regs->ip, -- cgit v1.2.2 From 9a8f0e6b5dfe3b4f330fc82b16a4000f5688fce8 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 18 Jul 2008 09:59:40 -0700 Subject: x86: let 32bit use apic_ops too - fix Fix VMI apic_ops. Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 5 +++++ arch/x86/kernel/vmi_32.c | 51 +++-------------------------------------------- 2 files changed, 8 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 47ff978aecfd..cb54d9e20f94 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -145,6 +145,11 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index cf3074354553..237082833c14 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -676,50 +676,6 @@ static inline int __init probe_vmi_rom(void) return 0; } -#ifdef CONFIG_X86_LOCAL_APIC -static u32 vmi_apic_read(u32 reg) -{ - return 0; -} - -static void vmi_apic_write(u32 reg, u32 val) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static u64 vmi_apic_icr_read(void) -{ - return 0; -} - -static void vmi_apic_icr_write(u32 low, u32 id) -{ - /* Warn to see if there's any stray references */ - WARN_ON(1); -} - -static void vmi_apic_wait_icr_idle(void) -{ - return; -} - -static u32 vmi_safe_apic_wait_icr_idle(void) -{ - return 0; -} - -static struct apic_ops vmi_basic_apic_ops = { - .read = vmi_apic_read, - .write = vmi_apic_write, - .write_atomic = vmi_apic_write, - .icr_read = vmi_apic_icr_read, - .icr_write = vmi_apic_icr_write, - .wait_icr_idle = vmi_apic_wait_icr_idle, - .safe_wait_icr_idle = vmi_safe_apic_wait_icr_idle, -}; -#endif - /* * VMI setup common to all processors */ @@ -948,10 +904,9 @@ static inline int __init activate_vmi(void) #endif #ifdef CONFIG_X86_LOCAL_APIC - para_fill(vmi_basic_apic_ops.read, APICRead); - para_fill(vmi_basic_apic_ops.write, APICWrite); - para_fill(vmi_basic_apic_ops.write_atomic, APICWrite); - apic_ops = &vmi_basic_apic_ops; + para_fill(apic_ops->read, APICRead); + para_fill(apic_ops->write, APICWrite); + para_fill(apic_ops->write_atomic, APICWrite); #endif /* -- cgit v1.2.2 From 511d9d34183662aada3890883e860b151d707e22 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 14 Jul 2008 09:49:14 -0700 Subject: x86: apic_ops for lguest apic_ops for lguest. Signed-off-by: Suresh Siddha Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 35 ++++++++++++++++++++++++++++++++--- 1 file changed, 32 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 0c45df20e2b3..675ee7a6475e 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -791,6 +791,37 @@ static u32 lguest_apic_read(u32 reg) { return 0; } + +static u64 lguest_apic_icr_read(void) +{ + return 0; +} + +static void lguest_apic_icr_write(u32 low, u32 id) +{ + /* Warn to see if there's any stray references */ + WARN_ON(1); +} + +static void lguest_apic_wait_icr_idle(void) +{ + return; +} + +static u32 lguest_apic_safe_wait_icr_idle(void) +{ + return 0; +} + +static struct apic_ops lguest_basic_apic_ops = { + .read = lguest_apic_read, + .write = lguest_apic_write, + .write_atomic = lguest_apic_write, + .icr_read = lguest_apic_icr_read, + .icr_write = lguest_apic_icr_write, + .wait_icr_idle = lguest_apic_wait_icr_idle, + .safe_wait_icr_idle = lguest_apic_safe_wait_icr_idle, +}; #endif /* STOP! Until an interrupt comes in. */ @@ -990,9 +1021,7 @@ __init void lguest_init(void) #ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ - pv_apic_ops.apic_write = lguest_apic_write; - pv_apic_ops.apic_write_atomic = lguest_apic_write; - pv_apic_ops.apic_read = lguest_apic_read; + apic_ops = &lguest_basic_apic_ops; #endif /* time operations */ -- cgit v1.2.2 From fa10c51a04a43ced5fd6033f19a74d2c82198b34 Mon Sep 17 00:00:00 2001 From: Alexander Beregalov Date: Wed, 9 Jul 2008 22:28:24 +0400 Subject: arch/x86/kernel/cpu/common_64.c: remove double inclusions x86: remove double inclusions in arch/x86/kernel/cpu/common_64.c Signed-off-by: Alexander Beregalov Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb40..2a4475beea4a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -7,12 +7,9 @@ #include #include #include -#include #include #include -#include #include -#include #include #include #include -- cgit v1.2.2 From 65c011845316d3c1381f478ca0d8265c43b3b039 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:30 -0700 Subject: cpumask: Replace cpumask_of_cpu with cpumask_of_cpu_ptr * This patch replaces the dangerous lvalue version of cpumask_of_cpu with new cpumask_of_cpu_ptr macros. These are patterned after the node_to_cpumask_ptr macros. In general terms, if there is a cpumask_of_cpu_map[] then a pointer to the cpumask_of_cpu_map[cpu] entry is used. The cpumask_of_cpu_map is provided when there is a large NR_CPUS count, reducing greatly the amount of code generated and stack space used for cpumask_of_cpu(). The pointer to the cpumask_t value is needed for calling set_cpus_allowed_ptr() to reduce the amount of stack space needed to pass the cpumask_t value. If there isn't a cpumask_of_cpu_map[], then a temporary variable is declared and filled in with value from cpumask_of_cpu(cpu) as well as a pointer variable pointing to this temporary variable. Afterwards, the pointer is used to reference the cpumask value. The compiler will optimize out the extra dereference through the pointer as well as the stack space used for the pointer, resulting in identical code. A good example of the orthogonal usages is in net/sunrpc/svc.c: case SVC_POOL_PERCPU: { unsigned int cpu = m->pool_to[pidx]; cpumask_of_cpu_ptr(cpumask, cpu); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, cpumask); return 1; } case SVC_POOL_PERNODE: { unsigned int node = m->pool_to[pidx]; node_to_cpumask_ptr(nodecpumask, node); *oldmask = current->cpus_allowed; set_cpus_allowed_ptr(current, nodecpumask); return 1; } Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/cstate.c | 3 ++- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 10 +++++++--- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 ++++++++++----- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 9 ++++++--- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3 ++- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 ++- arch/x86/kernel/microcode.c | 13 +++++++++---- arch/x86/kernel/reboot.c | 14 +++++++++----- 8 files changed, 47 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index c2502eb9aa83..9220cf46aa10 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -73,6 +73,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); cpumask_t saved_mask; + cpumask_of_cpu_ptr(new_mask, cpu); int retval; unsigned int eax, ebx, ecx, edx; unsigned int edx_part; @@ -91,7 +92,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, /* Make sure we are running on right CPU */ saved_mask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + retval = set_cpus_allowed_ptr(current, new_mask); if (retval) return -1; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b835839..ff2fff56f0a8 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -200,10 +200,12 @@ static void drv_read(struct drv_cmd *cmd) static void drv_write(struct drv_cmd *cmd) { cpumask_t saved_mask = current->cpus_allowed; + cpumask_of_cpu_ptr_declare(cpu_mask); unsigned int i; for_each_cpu_mask_nr(i, cmd->mask) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); + cpumask_of_cpu_ptr_next(cpu_mask, i); + set_cpus_allowed_ptr(current, cpu_mask); do_drv_write(cmd); } @@ -267,11 +269,12 @@ static unsigned int get_measured_perf(unsigned int cpu) } aperf_cur, mperf_cur; cpumask_t saved_mask; + cpumask_of_cpu_ptr(cpu_mask, cpu); unsigned int perf_percent; unsigned int retval; saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (get_cpu() != cpu) { /* We were not able to run on requested processor */ put_cpu(); @@ -337,6 +340,7 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { + cpumask_of_cpu_ptr(cpu_mask, cpu); struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -349,7 +353,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) } cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); + freq = extract_freq(get_cur_val(cpu_mask), data); if (freq != cached_freq) { /* * The dreaded BIOS frequency change behind our back. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c45ca6d4dce1..53c7b6936973 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -479,11 +479,12 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi static int check_supported_cpu(unsigned int cpu) { cpumask_t oldmask; + cpumask_of_cpu_ptr(cpu_mask, cpu); u32 eax, ebx, ecx, edx; unsigned int rc = 0; oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); @@ -1016,6 +1017,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask; + cpumask_of_cpu_ptr(cpu_mask, pol->cpu); struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1030,7 +1032,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + set_cpus_allowed_ptr(current, cpu_mask); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1105,6 +1107,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; + cpumask_of_cpu_ptr_declare(newmask); int rc; if (!cpu_online(pol->cpu)) @@ -1156,7 +1159,8 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); + cpumask_of_cpu_ptr_next(newmask, pol->cpu); + set_cpus_allowed_ptr(current, newmask); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1178,7 +1182,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) set_cpus_allowed_ptr(current, &oldmask); if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); + pol->cpus = *newmask; else pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); @@ -1244,6 +1248,7 @@ static unsigned int powernowk8_get (unsigned int cpu) { struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; + cpumask_of_cpu_ptr(newmask, cpu); unsigned int khz = 0; unsigned int first; @@ -1253,7 +1258,7 @@ static unsigned int powernowk8_get (unsigned int cpu) if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 8b0dd6f2a1ac..fd561bb26c60 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -313,9 +313,10 @@ static unsigned int get_cur_freq(unsigned int cpu) unsigned l, h; unsigned clock_freq; cpumask_t saved_mask; + cpumask_of_cpu_ptr(new_mask, cpu); saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, new_mask); if (smp_processor_id() != cpu) return 0; @@ -554,9 +555,11 @@ static int centrino_target (struct cpufreq_policy *policy, */ if (!cpus_empty(covered_cpus)) { + cpumask_of_cpu_ptr_declare(new_mask); + for_each_cpu_mask_nr(j, covered_cpus) { - set_cpus_allowed_ptr(current, - &cpumask_of_cpu(j)); + cpumask_of_cpu_ptr_next(new_mask, j); + set_cpus_allowed_ptr(current, new_mask); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f7263c61d..2f3728dc24f6 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -244,7 +244,8 @@ static unsigned int _speedstep_get(const cpumask_t *cpus) static unsigned int speedstep_get(unsigned int cpu) { - return _speedstep_get(&cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr(newmask, cpu); + return _speedstep_get(newmask); } /** diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a7b0f8f1736b..e4b8d189d7ed 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -516,6 +516,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) unsigned long j; int retval; cpumask_t oldmask; + cpumask_of_cpu_ptr(newmask, cpu); if (num_cache_leaves == 0) return -ENOENT; @@ -526,7 +527,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) return -ENOMEM; oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + retval = set_cpus_allowed_ptr(current, newmask); if (retval) goto out; diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a04..58520169e35d 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -388,6 +388,7 @@ static int do_microcode_update (void) void *new_mc = NULL; int cpu; cpumask_t old; + cpumask_of_cpu_ptr_declare(newmask); old = current->cpus_allowed; @@ -404,7 +405,8 @@ static int do_microcode_update (void) if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + cpumask_of_cpu_ptr_next(newmask, cpu); + set_cpus_allowed_ptr(current, newmask); error = get_maching_microcode(new_mc, cpu); if (error < 0) goto out; @@ -574,6 +576,7 @@ static int apply_microcode_check_cpu(int cpu) struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); unsigned int val[2]; int err = 0; @@ -582,7 +585,7 @@ static int apply_microcode_check_cpu(int cpu) return 0; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); /* Check if the microcode we have in memory matches the CPU */ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -620,11 +623,12 @@ static int apply_microcode_check_cpu(int cpu) static void microcode_init_cpu(int cpu, int resume) { cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) @@ -656,11 +660,12 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return -EINVAL; if (val == 1) { cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); if (uci->valid) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f8a62160e151..214bbdfc851e 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -403,24 +403,28 @@ void native_machine_shutdown(void) { /* Stop the cpus and apics */ #ifdef CONFIG_SMP - int reboot_cpu_id; /* The boot cpu is always logical cpu 0 */ - reboot_cpu_id = 0; + int reboot_cpu_id = 0; + cpumask_of_cpu_ptr(newmask, reboot_cpu_id); #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_online(reboot_cpu)) + cpu_online(reboot_cpu)) { reboot_cpu_id = reboot_cpu; + cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); + } #endif /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) + if (!cpu_online(reboot_cpu_id)) { reboot_cpu_id = smp_processor_id(); + cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); + } /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); + set_cpus_allowed_ptr(current, newmask); /* O.K Now that I'm on the appropriate processor, * stop all of the others. -- cgit v1.2.2 From cb6d2be60dc3ec9ac788f45d8e24b82a9faacdd9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:31 -0700 Subject: cpumask: Optimize cpumask_of_cpu in arch/x86/kernel/io_apic_64.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index bf27114773d5..a85db790754b 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1372,12 +1372,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { struct irq_cfg *cfg = &irq_cfg[irq]; - cpumask_t mask; unsigned long flags; spin_lock_irqsave(&vector_lock, flags); - mask = cpumask_of_cpu(first_cpu(cfg->domain)); - send_IPI_mask(mask, cfg->vector); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); spin_unlock_irqrestore(&vector_lock, flags); return 1; -- cgit v1.2.2 From c42f4f4c6dab3b2b7768c36173ee7c7ecf79eddb Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:32 -0700 Subject: cpumask: Optimize cpumask_of_cpu in arch/x86/kernel/ldt.c * Optimize various places where a pointer to the cpumask_of_cpu value will result in reducing stack pressure. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a8449571858a..3fee2aa50f3f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -62,12 +62,12 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_t mask; + cpumask_of_cpu_ptr_declare(mask); preempt_disable(); load_LDT(pc); - mask = cpumask_of_cpu(smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, mask)) + cpumask_of_cpu_ptr_next(mask, smp_processor_id()); + if (!cpus_equal(current->mm->cpu_vm_mask, *mask)) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else -- cgit v1.2.2 From eb53fac5cafc4b2f8443ff064938b4494a28c54e Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Tue, 15 Jul 2008 14:14:37 -0700 Subject: cpumask: Use optimized CPUMASK_ALLOC macros in the centrino_target * Use the CPUMASK_ALLOC macros in the centrino_target() function. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 73 +++++++++++++++--------- 1 file changed, 45 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index fd561bb26c60..470c016cb254 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -442,6 +442,13 @@ static int centrino_verify (struct cpufreq_policy *policy) * * Sets a new CPUFreq policy. */ +struct allmasks { + cpumask_t online_policy_cpus; + cpumask_t saved_mask; + cpumask_t set_mask; + cpumask_t covered_cpus; +}; + static int centrino_target (struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) @@ -449,48 +456,55 @@ static int centrino_target (struct cpufreq_policy *policy, unsigned int newstate = 0; unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu; struct cpufreq_freqs freqs; - cpumask_t online_policy_cpus; - cpumask_t saved_mask; - cpumask_t set_mask; - cpumask_t covered_cpus; int retval = 0; unsigned int j, k, first_cpu, tmp; - - if (unlikely(centrino_model[cpu] == NULL)) - return -ENODEV; + CPUMASK_ALLOC(allmasks); + CPUMASK_VAR(online_policy_cpus, allmasks); + CPUMASK_VAR(saved_mask, allmasks); + CPUMASK_VAR(set_mask, allmasks); + CPUMASK_VAR(covered_cpus, allmasks); + + if (unlikely(allmasks == NULL)) + return -ENOMEM; + + if (unlikely(centrino_model[cpu] == NULL)) { + retval = -ENODEV; + goto out; + } if (unlikely(cpufreq_frequency_table_target(policy, centrino_model[cpu]->op_points, target_freq, relation, &newstate))) { - return -EINVAL; + retval = -EINVAL; + goto out; } #ifdef CONFIG_HOTPLUG_CPU /* cpufreq holds the hotplug lock, so we are safe from here on */ - cpus_and(online_policy_cpus, cpu_online_map, policy->cpus); + cpus_and(*online_policy_cpus, cpu_online_map, policy->cpus); #else - online_policy_cpus = policy->cpus; + *online_policy_cpus = policy->cpus; #endif - saved_mask = current->cpus_allowed; + *saved_mask = current->cpus_allowed; first_cpu = 1; - cpus_clear(covered_cpus); - for_each_cpu_mask_nr(j, online_policy_cpus) { + cpus_clear(*covered_cpus); + for_each_cpu_mask_nr(j, *online_policy_cpus) { /* * Support for SMP systems. * Make sure we are running on CPU that wants to change freq */ - cpus_clear(set_mask); + cpus_clear(*set_mask); if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) - cpus_or(set_mask, set_mask, online_policy_cpus); + cpus_or(*set_mask, *set_mask, *online_policy_cpus); else - cpu_set(j, set_mask); + cpu_set(j, *set_mask); - set_cpus_allowed_ptr(current, &set_mask); + set_cpus_allowed_ptr(current, set_mask); preempt_disable(); - if (unlikely(!cpu_isset(smp_processor_id(), set_mask))) { + if (unlikely(!cpu_isset(smp_processor_id(), *set_mask))) { dprintk("couldn't limit to CPUs in this domain\n"); retval = -EAGAIN; if (first_cpu) { @@ -518,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy, dprintk("target=%dkHz old=%d new=%d msr=%04x\n", target_freq, freqs.old, freqs.new, msr); - for_each_cpu_mask_nr(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); @@ -537,11 +551,11 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - cpu_set(j, covered_cpus); + cpu_set(j, *covered_cpus); preempt_enable(); } - for_each_cpu_mask_nr(k, online_policy_cpus) { + for_each_cpu_mask_nr(k, *online_policy_cpus) { freqs.cpu = k; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -554,10 +568,10 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(covered_cpus)) { + if (!cpus_empty(*covered_cpus)) { cpumask_of_cpu_ptr_declare(new_mask); - for_each_cpu_mask_nr(j, covered_cpus) { + for_each_cpu_mask_nr(j, *covered_cpus) { cpumask_of_cpu_ptr_next(new_mask, j); set_cpus_allowed_ptr(current, new_mask); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); @@ -567,19 +581,22 @@ static int centrino_target (struct cpufreq_policy *policy, tmp = freqs.new; freqs.new = freqs.old; freqs.old = tmp; - for_each_cpu_mask_nr(j, online_policy_cpus) { + for_each_cpu_mask_nr(j, *online_policy_cpus) { freqs.cpu = j; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } } - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); + retval = 0; + goto out; migrate_end: preempt_enable(); - set_cpus_allowed_ptr(current, &saved_mask); - return 0; + set_cpus_allowed_ptr(current, saved_mask); +out: + CPUMASK_FREE(allmasks); + return retval; } static struct freq_attr* centrino_attr[] = { -- cgit v1.2.2 From 812b121d55316333a3480b294523d4e52f9dd366 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 16 Jul 2008 19:21:31 -0700 Subject: x86_64: ia32_signal.c: remove signal number conversion This was old code that was needed for iBCS and x86-64 never supported that. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index dc9b9b9803f6..20af4c79579a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -520,7 +520,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, compat_sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - struct exec_domain *ed = current_thread_info()->exec_domain; void __user *restorer; int err = 0; @@ -543,8 +542,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - err |= __put_user((ed && ed->signal_invmap && sig < 32 - ? ed->signal_invmap[sig] : sig), &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); -- cgit v1.2.2 From f2ba93929fdb91fd806be20e959a50f7db82790e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:35:37 +0100 Subject: x86: check function status in EDD boot code Without checking the return value of get_edd_info() and adding the entry only in the success case, 6 devices show up under /sys/firmware/edd/, no matter how many devices are actually present. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 03399d64013b..d93cbc6464d0 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -167,9 +167,8 @@ void query_edd(void) * Scan the BIOS-supported hard disks and query EDD * information... */ - get_edd_info(devno, &ei); - - if (boot_params.eddbuf_entries < EDDMAXNR) { + if (!get_edd_info(devno, &ei) + && boot_params.eddbuf_entries < EDDMAXNR) { memcpy(edp, &ei, sizeof ei); edp++; boot_params.eddbuf_entries++; -- cgit v1.2.2 From 369c99205f633d1e4038b15f5dc4a5500a4359c3 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:37:53 +0100 Subject: x86: fix two modpost warnings Even though it's only the difference of the two __initdata symbols that's being calculated, modpost still doesn't like this. So rather calculate the size once in an __init function and store it for later use. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 19a6cfaf5db9..257ba4a10abf 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -21,7 +21,8 @@ unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; extern unsigned short vdso_sync_cpuid; -struct page **vdso_pages; +static struct page **vdso_pages; +static unsigned vdso_size; static inline void *var_ref(void *p, char *name) { @@ -38,6 +39,7 @@ static int __init init_vdso_vars(void) int i; char *vbase; + vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); if (!vdso_pages) goto oom; @@ -101,20 +103,19 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int exstack) struct mm_struct *mm = current->mm; unsigned long addr; int ret; - unsigned len = round_up(vdso_end - vdso_start, PAGE_SIZE); if (!vdso_enabled) return 0; down_write(&mm->mmap_sem); - addr = vdso_addr(mm->start_stack, len); - addr = get_unmapped_area(NULL, addr, len, 0, 0); + addr = vdso_addr(mm->start_stack, vdso_size); + addr = get_unmapped_area(NULL, addr, vdso_size, 0, 0); if (IS_ERR_VALUE(addr)) { ret = addr; goto up_fail; } - ret = install_special_mapping(mm, addr, len, + ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| VM_ALWAYSDUMP, -- cgit v1.2.2 From 91467bdf6e53058af13fd255375d6634ba0c70e0 Mon Sep 17 00:00:00 2001 From: Bernhard Walle Date: Fri, 18 Jul 2008 19:07:53 +0200 Subject: x86: move dma32_reserve_bootmem() after reserve_crashkernel() On a x86-64 machine (nothing special I could encounter) I had the problem that crashkernel reservation with the usual "64M@16M" failed. While debugging that, I encountered that dma32_reserve_bootmem() reserves a memory region which is in that area. Because dma32_reserve_bootmem() does not rely on a specific offset but crashkernel does, it makes sense to move the dma32_reserve_bootmem() reservation down a bit. I tested that patch and it works without problems. I don't see any negative effects of that move, but maybe I oversaw something ... While we strictly don't need that patch in 2.6.27 because we have the automatic, dynamic offset detection, it makes sense to also include it here because: - it's easier to get it in -stable then, - many people are still used to the 'crashkernel=...@16M' syntax, - not everybody may be using a reloatable kernel. Signed-off-by: Bernhard Walle Cc: kexec@lists.infradead.org Cc: vgoyal@redhat.com Cc: akpm@linux-foundation.org Cc: Bernhard Walle Cc: yhlu.kernel@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 531b55b8e81a..74d110ef2690 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -792,10 +792,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(0, max_pfn); -#ifdef CONFIG_X86_64 - dma32_reserve_bootmem(); -#endif - #ifdef CONFIG_ACPI_SLEEP /* * Reserve low memory region for sleep support. @@ -810,6 +806,15 @@ void __init setup_arch(char **cmdline_p) #endif reserve_crashkernel(); +#ifdef CONFIG_X86_64 + /* + * dma32_reserve_bootmem() allocates bootmem which may conflict + * with the crashkernel command line, so do that after + * reserve_crashkernel() + */ + dma32_reserve_bootmem(); +#endif + reserve_ibft_region(); #ifdef CONFIG_KVM_CLOCK -- cgit v1.2.2 From 08e1a13e7d14ba5d6a22bf4b8c6e11128d3bcdfe Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:44:16 +0100 Subject: x86: reduce forbid_dac's visibility It's not used anywhere outside its declaring file. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/pci-dma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8467ec2320f1..702714bd1511 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -9,8 +9,7 @@ #include #include -int forbid_dac __read_mostly; -EXPORT_SYMBOL(forbid_dac); +static int forbid_dac __read_mostly; const struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); -- cgit v1.2.2 From 08ad8afaa0f7343e9c64eec5dbbb178e390e03a2 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:45:20 +0100 Subject: x86: reduce force_mwait visibility It's not used anywhere outside its single referencing file. Signed-off-by: Jan Beulich Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/process.c | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 81a07ca65d44..cae9cabc3031 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,6 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); -int force_mwait __cpuinitdata; - static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { if (cpuid_eax(0x80000000) >= 0x80000007) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 158bd6a16f6a..9f94bb1c8117 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -199,6 +199,7 @@ static void poll_idle(void) * * idle=mwait overrides this decision and forces the usage of mwait. */ +static int __cpuinitdata force_mwait; #define MWAIT_INFO 0x05 #define MWAIT_ECX_EXTENDED_INFO 0x01 -- cgit v1.2.2 From 2ddf9b7b3e6660199269e34cfa27148440ddc3bf Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 18 Jul 2008 13:32:23 +0100 Subject: i386/xen: add proper unwind annotations to xen_sysenter_target Signed-off-by: Jan Beulich Cc: Jeremy Fitzhardinge Signed-off-by: H. Peter Anvin --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 6bc07f0f1202..ad5264c29e9b 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1024,6 +1024,7 @@ ENDPROC(kernel_thread_helper) ENTRY(xen_sysenter_target) RING0_INT_FRAME addl $5*4, %esp /* remove xen-provided frame */ + CFI_ADJUST_CFA_OFFSET -5*4 jmp sysenter_past_esp CFI_ENDPROC -- cgit v1.2.2 From ae79cdaacb5599781f8bb49f4bdd5723029669cf Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 18 Jul 2008 16:08:13 -0700 Subject: x86: Add a arch directory for x86 under debugfs Add a directory for x86 arch under debugfs. Can be used to accumulate all x86 specific debugfs files. Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/kernel/kdebugfs.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index c03205991718..f2d43bc75514 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -12,9 +12,13 @@ #include #include #include +#include #include +struct dentry *arch_debugfs_dir; +EXPORT_SYMBOL(arch_debugfs_dir); + #ifdef CONFIG_DEBUG_BOOT_PARAMS struct setup_data_node { u64 paddr; @@ -209,6 +213,10 @@ static int __init arch_kdebugfs_init(void) { int error = 0; + arch_debugfs_dir = debugfs_create_dir("x86", NULL); + if (!arch_debugfs_dir) + return -ENOMEM; + #ifdef CONFIG_DEBUG_BOOT_PARAMS error = boot_params_kdebugfs_init(); #endif -- cgit v1.2.2 From fec0962e0bed407927b9ff54bb0596a3ab7e4b61 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 18 Jul 2008 16:08:14 -0700 Subject: x86: Add a debugfs interface to dump PAT memtype Add a debugfs interface to list out all the PAT memtype reservations. Appears at debugfs x86/pat_memtype_list and output format is type @ - We do not hold the lock while printing the entire list. So, the list may not be a consistent copy in case where regions are getting added or deleted at the same time. Signed-off-by: Venkatesh Pallipadi Signed-off-by: H. Peter Anvin --- arch/x86/mm/pat.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index d4585077977a..0917a540a55e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include #include #include @@ -489,3 +491,89 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } + +#if defined(CONFIG_DEBUG_FS) + +/* get Nth element of the linked list */ +static struct memtype *memtype_get_idx(loff_t pos) +{ + struct memtype *list_node, *print_entry; + int i = 1; + + print_entry = kmalloc(sizeof(struct memtype), GFP_KERNEL); + if (!print_entry) + return NULL; + + spin_lock(&memtype_lock); + list_for_each_entry(list_node, &memtype_list, nd) { + if (pos == i) { + *print_entry = *list_node; + spin_unlock(&memtype_lock); + return print_entry; + } + ++i; + } + spin_unlock(&memtype_lock); + kfree(print_entry); + return NULL; +} + +static void *memtype_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (*pos == 0) { + ++*pos; + seq_printf(seq, "PAT memtype list:\n"); + } + + return memtype_get_idx(*pos); +} + +static void *memtype_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + ++*pos; + return memtype_get_idx(*pos); +} + +static void memtype_seq_stop(struct seq_file *seq, void *v) +{ +} + +static int memtype_seq_show(struct seq_file *seq, void *v) +{ + struct memtype *print_entry = (struct memtype *)v; + + seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), + print_entry->start, print_entry->end); + kfree(print_entry); + return 0; +} + +static struct seq_operations memtype_seq_ops = { + .start = memtype_seq_start, + .next = memtype_seq_next, + .stop = memtype_seq_stop, + .show = memtype_seq_show, +}; + +static int memtype_seq_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &memtype_seq_ops); +} + +static const struct file_operations memtype_fops = { + .open = memtype_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init pat_memtype_list_init(void) +{ + debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, + NULL, &memtype_fops); + return 0; +} + +late_initcall(pat_memtype_list_init); + +#endif /* CONFIG_DEBUG_FS */ -- cgit v1.2.2 From e5849e71adcbb774ce40f09c1bcb48acca3b6da7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 18 Jul 2008 17:28:40 -0700 Subject: x86: remove arch_get_ram_range no user now Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 21 --------------------- 1 file changed, 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 28c29180b380..df1b32fa88db 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1367,24 +1367,3 @@ void __init setup_memory_map(void) printk(KERN_INFO "BIOS-provided physical RAM map:\n"); e820_print_map(who); } - -#ifdef CONFIG_X86_64 -int __init arch_get_ram_range(int slot, u64 *addr, u64 *size) -{ - int i; - - if (slot < 0 || slot >= e820.nr_map) - return -1; - for (i = slot; i < e820.nr_map; i++) { - if (e820.map[i].type != E820_RAM) - continue; - break; - } - if (i == e820.nr_map || e820.map[i].addr > (max_pfn << PAGE_SHIFT)) - return -1; - *addr = e820.map[i].addr; - *size = min_t(u64, e820.map[i].size + e820.map[i].addr, - max_pfn << PAGE_SHIFT) - *addr; - return i + 1; -} -#endif -- cgit v1.2.2 From f586bf7df9acc26b68b0feefc0dd32fa63516d3a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 18 Jul 2008 15:58:35 -0700 Subject: x86: APIC: Remove apic_write_around(); use alternatives, merge fix On Fri, Jul 18, 2008 at 02:03:48PM -0700, Ingo Molnar wrote: > > * Yinghai Lu wrote: > > > > git merge tip/x86/x2apic > > CONFLICT (content): Merge conflict in arch/x86/kernel/Makefile > > CONFLICT (content): Merge conflict in arch/x86/kernel/paravirt.c > > CONFLICT (content): Merge conflict in arch/x86/kernel/smpboot.c > > CONFLICT (content): Merge conflict in arch/x86/kernel/vmi_32.c > > CONFLICT (content): Merge conflict in arch/x86/xen/enlighten.c > > CONFLICT (content): Merge conflict in include/asm-x86/apic.h > > CONFLICT (content): Merge conflict in include/asm-x86/paravirt.h > > that's due to the changes in tip/x86/apic and in tip/x86/uv. > > ok, i've just merged x86/apic into x86/x2apic and x86/uv as well, and > pushed out the result. > > Note: it's a first raw merge and completely untested. It will now merge > cleanly into tip/master. There are probably a few details missing. Ingo, thanks for doing this. While I was testing my merge changes, you posted yours... anyhow we need this piece, which is missing from your merge. Signed-off-by: Suresh Siddha Cc: Yinghai Lu Cc: "Maciej W. Rozycki" Cc: Jeremy Fitzhardinge Cc: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 5 ++--- arch/x86/kernel/apic_64.c | 2 -- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index dcb897f22aa2..8728f54a93d8 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -174,8 +174,8 @@ u32 safe_xapic_wait_icr_idle(void) void xapic_icr_write(u32 low, u32 id) { - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write_around(APIC_ICR, low); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); } u64 xapic_icr_read(void) @@ -191,7 +191,6 @@ u64 xapic_icr_read(void) static struct apic_ops xapic_ops = { .read = native_apic_mem_read, .write = native_apic_mem_write, - .write_atomic = native_apic_mem_write_atomic, .icr_read = xapic_icr_read, .icr_write = xapic_icr_write, .wait_icr_idle = xapic_wait_icr_idle, diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 46e612408aca..a850bc63fb1c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -167,7 +167,6 @@ u64 xapic_icr_read(void) static struct apic_ops xapic_ops = { .read = native_apic_mem_read, .write = native_apic_mem_write, - .write_atomic = native_apic_mem_write_atomic, .icr_read = xapic_icr_read, .icr_write = xapic_icr_write, .wait_icr_idle = xapic_wait_icr_idle, @@ -206,7 +205,6 @@ u64 x2apic_icr_read(void) static struct apic_ops x2apic_ops = { .read = native_apic_msr_read, .write = native_apic_msr_write, - .write_atomic = native_apic_msr_write, .icr_read = x2apic_icr_read, .icr_write = x2apic_icr_write, .wait_icr_idle = x2apic_wait_icr_idle, -- cgit v1.2.2 From d092633bff3b19faffc480fe9810805e7792a029 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 18 Jul 2008 00:26:59 +0200 Subject: Subject: devmem, x86: fix rename of CONFIG_NONPROMISC_DEVMEM From: Arjan van de Ven Date: Sat, 19 Jul 2008 15:47:17 -0700 CONFIG_NONPROMISC_DEVMEM was a rather confusing name - but renaming it to CONFIG_PROMISC_DEVMEM causes problems on architectures that do not support this feature; this patch renames it to CONFIG_STRICT_DEVMEM, so that architectures can opt-in into it. ( the polarity of the option is still the same as it was originally; it needs to be for now to not break architectures that don't have the infastructure yet to support this feature) Signed-off-by: Arjan van de Ven Cc: "V.Radhakrishnan" Signed-off-by: Ingo Molnar --- --- arch/x86/Kconfig.debug | 9 +++++---- arch/x86/configs/i386_defconfig | 2 +- arch/x86/configs/x86_64_defconfig | 2 +- arch/x86/mm/pat.c | 6 +++--- 4 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index f0cf5d990794..51c821477951 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -5,14 +5,15 @@ config TRACE_IRQFLAGS_SUPPORT source "lib/Kconfig.debug" -config PROMISC_DEVMEM - bool "Allow unlimited access to /dev/mem" - default y +config STRICT_DEVMEM + bool "Filter access to /dev/mem" help If this option is left on, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental access to this is obviously disastrous, but specific access can - be used by people debugging the kernel. + be used by people debugging the kernel. Note that with PAT support + enabled, even in this case there are restrictions on /dev/mem + use due to the cache aliasing requirements. If this option is switched on, the /dev/mem file only allows userspace access to PCI space and the BIOS code and data regions. diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9bc34e2033ec..4d73f53287b6 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -2047,7 +2047,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set # CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_NONPROMISC_DEVMEM is not set +# CONFIG_STRICT_DEVMEM is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ae5124e064d4..a40452429625 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -2012,7 +2012,7 @@ CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set # CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y -# CONFIG_NONPROMISC_DEVMEM is not set +# CONFIG_STRICT_DEVMEM is not set CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index c34dc483839c..6bb597f4d701 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -373,8 +373,8 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, return vma_prot; } -#ifndef CONFIG_PROMISC_DEVMEM -/* This check is done in drivers/char/mem.c in case of !PROMISC_DEVMEM*/ +#ifdef CONFIG_STRICT_DEVMEM +/* This check is done in drivers/char/mem.c in case of STRICT_DEVMEM*/ static inline int range_is_allowed(unsigned long pfn, unsigned long size) { return 1; @@ -398,7 +398,7 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) } return 1; } -#endif /* CONFIG_PROMISC_DEVMEM */ +#endif /* CONFIG_STRICT_DEVMEM */ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) -- cgit v1.2.2 From a83fe32fa668c0a17b3f99a1480b006f7d649924 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 18 Jul 2008 13:22:36 -0700 Subject: x86, pci: detect end_bus_number according to acpi/e820 reserved, v2 Jack Howarth reported that 2.6.26-rc9-git9 doesn't boot on MacBookPro2. the reason is a faulty BIOS update that reportes faulty resources. Nevertheless it's possible for Linux to be more resolent about this situation (and similar situations) and work around this bug, by cross-checking the mmconf range against the e820 table and ACPI resources. Change the mconf bus range from [0,0xff] to to [0, 0x3f] to match range [0xf0000000, 0xf4000000) in e820 tables. [ v2, yhlu.kernel@gmail.com: x86, pci: detect end_bus_number according to acpi/e820 reserved - fix ] Reported-by: Jack Howarth Signed-off-by: Yinghai Lu Cc: jbarnes@virtuousgeek.org Cc: Jack Howarth Signed-off-by: Ingo Molnar --- arch/x86/pci/mmconfig-shared.c | 65 +++++++++++++++++++++++++++++++----------- 1 file changed, 48 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 23faaa890ffc..429c7014febc 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -293,7 +293,7 @@ static acpi_status __init find_mboard_resource(acpi_handle handle, u32 lvl, return AE_OK; } -static int __init is_acpi_reserved(unsigned long start, unsigned long end) +static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used) { struct resource mcfg_res; @@ -310,6 +310,41 @@ static int __init is_acpi_reserved(unsigned long start, unsigned long end) return mcfg_res.flags; } +typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); + +static int __init is_mmconf_reserved(check_reserved_t is_reserved, + u64 addr, u64 size, int i, + typeof(pci_mmcfg_config[0]) *cfg, int with_e820) +{ + u64 old_size = size; + int valid = 0; + + while (!is_reserved(addr, addr + size - 1, E820_RESERVED)) { + size >>= 1; + if (size < (16UL<<20)) + break; + } + + if (size >= (16UL<<20) || size == old_size) { + printk(KERN_NOTICE + "PCI: MCFG area at %Lx reserved in %s\n", + addr, with_e820?"E820":"ACPI motherboard resources"); + valid = 1; + + if (old_size != size) { + /* update end_bus_number */ + cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); + printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " + "segment %hu buses %u - %u\n", + i, (unsigned long)cfg->address, cfg->pci_segment, + (unsigned int)cfg->start_bus_number, + (unsigned int)cfg->end_bus_number); + } + } + + return valid; +} + static void __init pci_mmcfg_reject_broken(int early) { typeof(pci_mmcfg_config[0]) *cfg; @@ -324,21 +359,22 @@ static void __init pci_mmcfg_reject_broken(int early) for (i = 0; i < pci_mmcfg_config_num; i++) { int valid = 0; - u32 size = (cfg->end_bus_number + 1) << 20; + u64 addr, size; + cfg = &pci_mmcfg_config[i]; + addr = cfg->start_bus_number; + addr <<= 20; + addr += cfg->address; + size = cfg->end_bus_number + 1 - cfg->start_bus_number; + size <<= 20; printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx " "segment %hu buses %u - %u\n", i, (unsigned long)cfg->address, cfg->pci_segment, (unsigned int)cfg->start_bus_number, (unsigned int)cfg->end_bus_number); - if (!early && - is_acpi_reserved(cfg->address, cfg->address + size - 1)) { - printk(KERN_NOTICE "PCI: MCFG area at %Lx reserved " - "in ACPI motherboard resources\n", - cfg->address); - valid = 1; - } + if (!early) + valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); if (valid) continue; @@ -347,16 +383,11 @@ static void __init pci_mmcfg_reject_broken(int early) printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" " reserved in ACPI motherboard resources\n", cfg->address); + /* Don't try to do this check unless configuration type 1 is available. how about type 2 ?*/ - if (raw_pci_ops && e820_all_mapped(cfg->address, - cfg->address + size - 1, - E820_RESERVED)) { - printk(KERN_NOTICE - "PCI: MCFG area at %Lx reserved in E820\n", - cfg->address); - valid = 1; - } + if (raw_pci_ops) + valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); if (!valid) goto reject; -- cgit v1.2.2 From 5f1f2b3d9dbaee82cd532f28da459adcbf611499 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 18 Jul 2008 16:16:23 -0700 Subject: x86: improve debug printout: add target bootmem range in early_res_to_bootmem() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index df1b32fa88db..6c60aeaac15f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -877,7 +877,8 @@ void __init early_res_to_bootmem(u64 start, u64 end) for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) count++; - printk(KERN_INFO "(%d early reservations) ==> bootmem\n", count); + printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count, start, end); for (i = 0; i < count; i++) { struct early_res *r = &early_res[i]; printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, -- cgit v1.2.2 From 3c9cb6de1e5ad37d1558fdb0d9d2bed5a7bac0d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 02:07:25 -0700 Subject: x86: introduce x86_quirks introduce x86_quirks array of boot-time quirk methods. No change in functionality intended. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 9 ++------- arch/x86/kernel/mpparse.c | 17 +++++------------ arch/x86/kernel/setup.c | 4 ++++ arch/x86/kernel/visws_quirks.c | 42 ++++++++++++++++++++---------------------- arch/x86/mach-default/setup.c | 24 ++++++++---------------- 5 files changed, 39 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 6c60aeaac15f..9af89078f7bb 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1299,11 +1299,6 @@ void __init e820_reserve_resources(void) } } -/* - * Non-standard memory setup can be specified via this quirk: - */ -char * (*arch_memory_setup_quirk)(void); - char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; @@ -1344,8 +1339,8 @@ char *__init default_machine_specific_memory_setup(void) char *__init __attribute__((weak)) machine_specific_memory_setup(void) { - if (arch_memory_setup_quirk) { - char *who = arch_memory_setup_quirk(); + if (x86_quirks->arch_memory_setup) { + char *who = x86_quirks->arch_memory_setup(); if (who) return who; diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3b25e49380c6..3cbd2df3abe4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_32 @@ -725,12 +726,6 @@ static inline void __init construct_default_ISA_mptable(int mpc_default_type) static struct intel_mp_floating *mpf_found; -/* - * Machine specific quirk for finding the SMP config before other setup - * activities destroy the table: - */ -int (*mach_get_smp_config_quirk)(unsigned int early); - /* * Scan the memory blocks for an SMP configuration block. */ @@ -738,8 +733,8 @@ static void __init __get_smp_config(unsigned int early) { struct intel_mp_floating *mpf = mpf_found; - if (mach_get_smp_config_quirk) { - if (mach_get_smp_config_quirk(early)) + if (x86_quirks->mach_get_smp_config) { + if (x86_quirks->mach_get_smp_config(early)) return; } if (acpi_lapic && early) @@ -899,14 +894,12 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, return 0; } -int (*mach_find_smp_config_quirk)(unsigned int reserve); - static void __init __find_smp_config(unsigned int reserve) { unsigned int address; - if (mach_find_smp_config_quirk) { - if (mach_find_smp_config_quirk(reserve)) + if (x86_quirks->mach_find_smp_config) { + if (x86_quirks->mach_find_smp_config(reserve)) return; } /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 4a2b8acc1d95..bbcc13d0b569 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -574,6 +574,10 @@ static int __init setup_elfcorehdr(char *arg) early_param("elfcorehdr", setup_elfcorehdr); #endif +static struct x86_quirks default_x86_quirks __initdata; + +struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index e94bdb6add1d..41e01b145c48 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -73,7 +73,7 @@ int is_visws_box(void) return visws_board_type >= 0; } -static int __init visws_time_init_quirk(void) +static int __init visws_time_init(void) { printk(KERN_INFO "Starting Cobalt Timer system clock\n"); @@ -93,7 +93,7 @@ static int __init visws_time_init_quirk(void) return 0; } -static int __init visws_pre_intr_init_quirk(void) +static int __init visws_pre_intr_init(void) { init_VISWS_APIC_irqs(); @@ -114,7 +114,7 @@ EXPORT_SYMBOL(sgivwfb_mem_size); long long mem_size __initdata = 0; -static char * __init visws_memory_setup_quirk(void) +static char * __init visws_memory_setup(void) { long long gfx_mem_size = 8 * MB; @@ -176,7 +176,7 @@ static void visws_machine_power_off(void) outl(PIIX_SPECIAL_STOP, 0xCFC); } -static int __init visws_get_smp_config_quirk(unsigned int early) +static int __init visws_get_smp_config(unsigned int early) { /* * Prevent MP-table parsing by the generic code: @@ -192,7 +192,7 @@ extern unsigned int __cpuinitdata maxcpus; * No problem for Linux. */ -static void __init MP_processor_info (struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int ver, logical_apicid; physid_mask_t apic_cpus; @@ -232,7 +232,7 @@ static void __init MP_processor_info (struct mpc_config_processor *m) apic_version[m->mpc_apicid] = ver; } -int __init visws_find_smp_config_quirk(unsigned int reserve) +static int __init visws_find_smp_config(unsigned int reserve) { struct mpc_config_processor *mp = phys_to_virt(CO_CPU_TAB_PHYS); unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); @@ -258,7 +258,17 @@ int __init visws_find_smp_config_quirk(unsigned int reserve) return 1; } -extern int visws_trap_init_quirk(void); +static int visws_trap_init(void); + +static struct x86_quirks visws_x86_quirks __initdata = { + .arch_time_init = visws_time_init, + .arch_pre_intr_init = visws_pre_intr_init, + .arch_memory_setup = visws_memory_setup, + .arch_intr_init = NULL, + .arch_trap_init = visws_trap_init, + .mach_get_smp_config = visws_get_smp_config, + .mach_find_smp_config = visws_find_smp_config, +}; void __init visws_early_detect(void) { @@ -272,16 +282,10 @@ void __init visws_early_detect(void) /* * Install special quirks for timer, interrupt and memory setup: - */ - arch_time_init_quirk = visws_time_init_quirk; - arch_pre_intr_init_quirk = visws_pre_intr_init_quirk; - arch_memory_setup_quirk = visws_memory_setup_quirk; - - /* * Fall back to generic behavior for traps: + * Override generic MP-table parsing: */ - arch_intr_init_quirk = NULL; - arch_trap_init_quirk = visws_trap_init_quirk; + x86_quirks = &visws_x86_quirks; /* * Install reboot quirks: @@ -294,12 +298,6 @@ void __init visws_early_detect(void) */ no_broadcast = 0; - /* - * Override generic MP-table parsing: - */ - mach_get_smp_config_quirk = visws_get_smp_config_quirk; - mach_find_smp_config_quirk = visws_find_smp_config_quirk; - #ifdef CONFIG_X86_IO_APIC /* * Turn off IO-APIC detection and initialization: @@ -426,7 +424,7 @@ static __init void cobalt_init(void) co_apic_read(CO_APIC_ID)); } -int __init visws_trap_init_quirk(void) +static int __init visws_trap_init(void) { lithium_init(); cobalt_init(); diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 48278fa7d3de..631dbed9fb9d 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,14 +10,6 @@ #include #include -/* - * Any quirks to be performed to initialize timers/irqs/etc? - */ -int (*arch_time_init_quirk)(void); -int (*arch_pre_intr_init_quirk)(void); -int (*arch_intr_init_quirk)(void); -int (*arch_trap_init_quirk)(void); - #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) #else @@ -37,8 +29,8 @@ int no_broadcast=DEFAULT_SEND_IPI; **/ void __init pre_intr_init_hook(void) { - if (arch_pre_intr_init_quirk) { - if (arch_pre_intr_init_quirk()) + if (x86_quirks->arch_pre_intr_init) { + if (x86_quirks->arch_pre_intr_init()) return; } init_ISA_irqs(); @@ -64,8 +56,8 @@ static struct irqaction irq2 = { **/ void __init intr_init_hook(void) { - if (arch_intr_init_quirk) { - if (arch_intr_init_quirk()) + if (x86_quirks->arch_intr_init) { + if (x86_quirks->arch_intr_init()) return; } #ifdef CONFIG_X86_LOCAL_APIC @@ -97,8 +89,8 @@ void __init pre_setup_arch_hook(void) **/ void __init trap_init_hook(void) { - if (arch_trap_init_quirk) { - if (arch_trap_init_quirk()) + if (x86_quirks->arch_trap_init) { + if (x86_quirks->arch_trap_init()) return; } } @@ -119,13 +111,13 @@ static struct irqaction irq0 = { **/ void __init time_init_hook(void) { - if (arch_time_init_quirk) { + if (x86_quirks->arch_time_init) { /* * A nonzero return code does not mean failure, it means * that the architecture quirk does not want any * generic (timer) setup to be performed after this: */ - if (arch_time_init_quirk()) + if (x86_quirks->arch_time_init()) return; } -- cgit v1.2.2 From 64898a8bad8c94ad7a4bd5cc86b66edfbb081f4a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 18:01:16 -0700 Subject: x86: extend and use x86_quirks to clean up NUMAQ code add these new x86_quirks methods: int *mpc_record; int (*mpc_apic_id)(struct mpc_config_processor *m); void (*mpc_oem_bus_info)(struct mpc_config_bus *m, char *name); void (*mpc_oem_pci_bus)(struct mpc_config_bus *m); void (*smp_read_mpc_oem)(struct mp_config_oemtable *oemtable, unsigned short oemsize); ... and move NUMAQ related mps table handling to numaq_32.c. also move the call to smp_read_mpc_oem() to smp_read_mpc() directly. Should not change functionality, albeit it would be nice to get it tested on real NUMAQ as well ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 191 +++++---------------------------------------- arch/x86/kernel/numaq_32.c | 190 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 200 insertions(+), 181 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 3cbd2df3abe4..6ae005ccaed8 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -49,76 +49,6 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -#ifdef CONFIG_X86_NUMAQ -int found_numaq; -/* - * Have to match translation table entries to main table entries by counter - * hence the mpc_record variable .... can't see a less disgusting way of - * doing this .... - */ -struct mpc_config_translation { - unsigned char mpc_type; - unsigned char trans_len; - unsigned char trans_type; - unsigned char trans_quad; - unsigned char trans_global; - unsigned char trans_local; - unsigned short trans_reserved; -}; - - -static int mpc_record; -static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] - __cpuinitdata; - -static inline int generate_logical_apicid(int quad, int phys_apicid) -{ - return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); -} - - -static inline int mpc_apic_id(struct mpc_config_processor *m, - struct mpc_config_translation *translation_record) -{ - int quad = translation_record->trans_quad; - int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); - - printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", - m->mpc_apicid, - (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, - (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, - m->mpc_apicver, quad, logical_apicid); - return logical_apicid; -} - -int mp_bus_id_to_node[MAX_MP_BUSSES]; - -int mp_bus_id_to_local[MAX_MP_BUSSES]; - -static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - mp_bus_id_to_node[m->mpc_busid] = quad; - mp_bus_id_to_local[m->mpc_busid] = local; - printk(KERN_INFO "Bus #%d is %s (node %d)\n", - m->mpc_busid, name, quad); -} - -int quad_local_to_mp_bus_id [NR_CPUS/4][4]; -static void mpc_oem_pci_bus(struct mpc_config_bus *m, - struct mpc_config_translation *translation) -{ - int quad = translation->trans_quad; - int local = translation->trans_local; - - quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; -} - -#endif - static void __cpuinit MP_processor_info(struct mpc_config_processor *m) { int apicid; @@ -128,14 +58,12 @@ static void __cpuinit MP_processor_info(struct mpc_config_processor *m) disabled_cpus++; return; } -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - apicid = mpc_apic_id(m, translation_table[mpc_record]); + + if (x86_quirks->mpc_apic_id) + apicid = x86_quirks->mpc_apic_id(m); else apicid = m->mpc_apicid; -#else - apicid = m->mpc_apicid; -#endif + if (m->mpc_cpuflag & CPU_BOOTPROCESSOR) { bootup_cpu = " (Bootup-CPU)"; boot_cpu_physical_apicid = m->mpc_apicid; @@ -152,12 +80,10 @@ static void __init MP_bus_info(struct mpc_config_bus *m) memcpy(str, m->mpc_bustype, 6); str[6] = 0; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_bus_info(m, str, translation_table[mpc_record]); -#else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); -#endif + if (x86_quirks->mpc_oem_bus_info) + x86_quirks->mpc_oem_bus_info(m, str); + else + printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -174,10 +100,9 @@ static void __init MP_bus_info(struct mpc_config_bus *m) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_ISA; #endif } else if (strncmp(str, BUSTYPE_PCI, sizeof(BUSTYPE_PCI) - 1) == 0) { -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) - mpc_oem_pci_bus(m, translation_table[mpc_record]); -#endif + if (x86_quirks->mpc_oem_pci_bus) + x86_quirks->mpc_oem_pci_bus(m); + clear_bit(m->mpc_busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined (CONFIG_MCA) mp_bus_id_to_type[m->mpc_busid] = MP_BUS_PCI; @@ -317,83 +242,6 @@ static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) m->mpc_srcbusirq, m->mpc_destapic, m->mpc_destapiclint); } -#ifdef CONFIG_X86_NUMAQ -static void __init MP_translation_info(struct mpc_config_translation *m) -{ - printk(KERN_INFO - "Translation: record %d, type %d, quad %d, global %d, local %d\n", - mpc_record, m->trans_type, m->trans_quad, m->trans_global, - m->trans_local); - - if (mpc_record >= MAX_MPC_ENTRY) - printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); - else - translation_table[mpc_record] = m; /* stash this for later */ - if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) - node_set_online(m->trans_quad); -} - -/* - * Read/parse the MPC oem tables - */ - -static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, - unsigned short oemsize) -{ - int count = sizeof(*oemtable); /* the header size */ - unsigned char *oemptr = ((unsigned char *)oemtable) + count; - - mpc_record = 0; - printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", - oemtable); - if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { - printk(KERN_WARNING - "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", - oemtable->oem_signature[0], oemtable->oem_signature[1], - oemtable->oem_signature[2], oemtable->oem_signature[3]); - return; - } - if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { - printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); - return; - } - while (count < oemtable->oem_length) { - switch (*oemptr) { - case MP_TRANSLATION: - { - struct mpc_config_translation *m = - (struct mpc_config_translation *)oemptr; - MP_translation_info(m); - oemptr += sizeof(*m); - count += sizeof(*m); - ++mpc_record; - break; - } - default: - { - printk(KERN_WARNING - "Unrecognised OEM table entry type! - %d\n", - (int)*oemptr); - return; - } - } - } -} - -void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, - char *productid) -{ - if (strncmp(oem, "IBM NUMA", 8)) - printk("Warning! Not a NUMA-Q system!\n"); - else - found_numaq = 1; - - if (mpc->mpc_oemptr) - smp_read_mpc_oem((struct mp_config_oemtable *)mpc->mpc_oemptr, - mpc->mpc_oemsize); -} -#endif /* CONFIG_X86_NUMAQ */ - /* * Read/parse the MPC */ @@ -458,7 +306,6 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) } else mps_oem_check(mpc, oem, str); #endif - /* save the local APIC address, it might be non-default */ if (!acpi_lapic) mp_lapic_addr = mpc->mpc_lapic; @@ -466,12 +313,17 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) if (early) return 1; + if (mpc->mpc_oemptr && x86_quirks->smp_read_mpc_oem) { + struct mp_config_oemtable *oem_table = (struct mp_config_oemtable *)(unsigned long)mpc->mpc_oemptr; + x86_quirks->smp_read_mpc_oem(oem_table, mpc->mpc_oemsize); + } + /* * Now process the configuration blocks. */ -#ifdef CONFIG_X86_NUMAQ - mpc_record = 0; -#endif + if (x86_quirks->mpc_record) + *x86_quirks->mpc_record = 0; + while (count < mpc->mpc_length) { switch (*mpt) { case MP_PROCESSOR: @@ -537,9 +389,8 @@ static int __init smp_read_mpc(struct mp_config_table *mpc, unsigned early) count = mpc->mpc_length; break; } -#ifdef CONFIG_X86_NUMAQ - ++mpc_record; -#endif + if (x86_quirks->mpc_record) + (*x86_quirks->mpc_record)++; } #ifdef CONFIG_X86_GENERICARCH diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index a23e8233b9ac..7f4e00d1d893 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -33,6 +33,7 @@ #include #include #include +#include #define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT)) @@ -71,6 +72,181 @@ static void __init smp_dump_qct(void) } } + +void __init numaq_tsc_disable(void) +{ + if (!found_numaq) + return; + + if (num_online_nodes() > 1) { + printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); + setup_clear_cpu_cap(X86_FEATURE_TSC); + } +} + +int found_numaq; +/* + * Have to match translation table entries to main table entries by counter + * hence the mpc_record variable .... can't see a less disgusting way of + * doing this .... + */ +struct mpc_config_translation { + unsigned char mpc_type; + unsigned char trans_len; + unsigned char trans_type; + unsigned char trans_quad; + unsigned char trans_global; + unsigned char trans_local; + unsigned short trans_reserved; +}; + +/* x86_quirks member */ +static int mpc_record; +static struct mpc_config_translation *translation_table[MAX_MPC_ENTRY] + __cpuinitdata; + +static inline int generate_logical_apicid(int quad, int phys_apicid) +{ + return (quad << 4) + (phys_apicid ? phys_apicid << 1 : 1); +} + +/* x86_quirks member */ +static int mpc_apic_id(struct mpc_config_processor *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int logical_apicid = generate_logical_apicid(quad, m->mpc_apicid); + + printk(KERN_DEBUG "Processor #%d %u:%u APIC version %d (quad %d, apic %d)\n", + m->mpc_apicid, + (m->mpc_cpufeature & CPU_FAMILY_MASK) >> 8, + (m->mpc_cpufeature & CPU_MODEL_MASK) >> 4, + m->mpc_apicver, quad, logical_apicid); + return logical_apicid; +} + +int mp_bus_id_to_node[MAX_MP_BUSSES]; + +int mp_bus_id_to_local[MAX_MP_BUSSES]; + +/* x86_quirks member */ +static void mpc_oem_bus_info(struct mpc_config_bus *m, char *name) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + mp_bus_id_to_node[m->mpc_busid] = quad; + mp_bus_id_to_local[m->mpc_busid] = local; + printk(KERN_INFO "Bus #%d is %s (node %d)\n", + m->mpc_busid, name, quad); +} + +int quad_local_to_mp_bus_id [NR_CPUS/4][4]; + +/* x86_quirks member */ +static void mpc_oem_pci_bus(struct mpc_config_bus *m) +{ + int quad = translation_table[mpc_record]->trans_quad; + int local = translation_table[mpc_record]->trans_local; + + quad_local_to_mp_bus_id[quad][local] = m->mpc_busid; +} + +static void __init MP_translation_info(struct mpc_config_translation *m) +{ + printk(KERN_INFO + "Translation: record %d, type %d, quad %d, global %d, local %d\n", + mpc_record, m->trans_type, m->trans_quad, m->trans_global, + m->trans_local); + + if (mpc_record >= MAX_MPC_ENTRY) + printk(KERN_ERR "MAX_MPC_ENTRY exceeded!\n"); + else + translation_table[mpc_record] = m; /* stash this for later */ + if (m->trans_quad < MAX_NUMNODES && !node_online(m->trans_quad)) + node_set_online(m->trans_quad); +} + +static int __init mpf_checksum(unsigned char *mp, int len) +{ + int sum = 0; + + while (len--) + sum += *mp++; + + return sum & 0xFF; +} + +/* + * Read/parse the MPC oem tables + */ + +static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, + unsigned short oemsize) +{ + int count = sizeof(*oemtable); /* the header size */ + unsigned char *oemptr = ((unsigned char *)oemtable) + count; + + mpc_record = 0; + printk(KERN_INFO "Found an OEM MPC table at %8p - parsing it ... \n", + oemtable); + if (memcmp(oemtable->oem_signature, MPC_OEM_SIGNATURE, 4)) { + printk(KERN_WARNING + "SMP mpc oemtable: bad signature [%c%c%c%c]!\n", + oemtable->oem_signature[0], oemtable->oem_signature[1], + oemtable->oem_signature[2], oemtable->oem_signature[3]); + return; + } + if (mpf_checksum((unsigned char *)oemtable, oemtable->oem_length)) { + printk(KERN_WARNING "SMP oem mptable: checksum error!\n"); + return; + } + while (count < oemtable->oem_length) { + switch (*oemptr) { + case MP_TRANSLATION: + { + struct mpc_config_translation *m = + (struct mpc_config_translation *)oemptr; + MP_translation_info(m); + oemptr += sizeof(*m); + count += sizeof(*m); + ++mpc_record; + break; + } + default: + { + printk(KERN_WARNING + "Unrecognised OEM table entry type! - %d\n", + (int)*oemptr); + return; + } + } + } +} + +static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_time_init = NULL, + .arch_pre_intr_init = NULL, + .arch_memory_setup = NULL, + .arch_intr_init = NULL, + .arch_trap_init = NULL, + .mach_get_smp_config = NULL, + .mach_find_smp_config = NULL, + .mpc_record = &mpc_record, + .mpc_apic_id = mpc_apic_id, + .mpc_oem_bus_info = mpc_oem_bus_info, + .mpc_oem_pci_bus = mpc_oem_pci_bus, + .smp_read_mpc_oem = smp_read_mpc_oem, +}; + +void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, + char *productid) +{ + if (strncmp(oem, "IBM NUMA", 8)) + printk("Warning! Not a NUMA-Q system!\n"); + else + found_numaq = 1; +} + static __init void early_check_numaq(void) { /* @@ -82,6 +258,9 @@ static __init void early_check_numaq(void) */ if (smp_found_config) early_get_smp_config(); + + if (found_numaq) + x86_quirks = &numaq_x86_quirks; } int __init get_memcfg_numaq(void) @@ -92,14 +271,3 @@ int __init get_memcfg_numaq(void) smp_dump_qct(); return 1; } - -void __init numaq_tsc_disable(void) -{ - if (!found_numaq) - return; - - if (num_online_nodes() > 1) { - printk(KERN_DEBUG "NUMAQ: disabling TSC\n"); - setup_clear_cpu_cap(X86_FEATURE_TSC); - } -} -- cgit v1.2.2 From 63b5d7af2556a7de6bf72c5dd0b85a32fb4c3767 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 19 Jul 2008 18:02:26 -0700 Subject: x86: add ->pre_time_init to x86_quirks so NUMAQ can use that to call numaq_pre_time_init() This allows us to remove a NUMAQ special from arch/x86/kernel/setup.c. (and paves the way to remove the NUMAQ subarch) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/numaq_32.c | 7 +++++++ arch/x86/kernel/setup.c | 8 -------- arch/x86/kernel/time_32.c | 1 + arch/x86/mach-default/setup.c | 10 ++++++++++ 4 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index 7f4e00d1d893..b8c45610b20a 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -84,6 +84,12 @@ void __init numaq_tsc_disable(void) } } +static int __init numaq_pre_time_init(void) +{ + numaq_tsc_disable(); + return 0; +} + int found_numaq; /* * Have to match translation table entries to main table entries by counter @@ -224,6 +230,7 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, } static struct x86_quirks numaq_x86_quirks __initdata = { + .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, .arch_pre_intr_init = NULL, .arch_memory_setup = NULL, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index bbcc13d0b569..4064616cfa85 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -853,14 +853,6 @@ void __init setup_arch(char **cmdline_p) init_cpu_to_node(); #endif -#ifdef CONFIG_X86_NUMAQ - /* - * need to check online nodes num, call it - * here before time_init/tsc_init - */ - numaq_tsc_disable(); -#endif - init_apic_mappings(); ioapic_init_mappings(); diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 059ca6ee59b4..ffe3c664afc0 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -129,6 +129,7 @@ void __init hpet_time_init(void) */ void __init time_init(void) { + pre_time_init_hook(); tsc_init(); late_time_init = choose_time_init(); } diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 631dbed9fb9d..3d317836be9e 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -102,6 +102,16 @@ static struct irqaction irq0 = { .name = "timer" }; +/** + * pre_time_init_hook - do any specific initialisations before. + * + **/ +void __init pre_time_init_hook(void) +{ + if (x86_quirks->arch_pre_time_init) + x86_quirks->arch_pre_time_init(); +} + /** * time_init_hook - do any specific initialisations for the system timer. * -- cgit v1.2.2 From e3a61b0a8c0e342e700a61cd554b01050f333a36 Mon Sep 17 00:00:00 2001 From: Simon Arlott Date: Sat, 19 Jul 2008 23:32:54 +0100 Subject: x86: add unknown_nmi_panic kernel parameter It's not possible to enable the unknown_nmi_panic sysctl option until init is run. It's useful to be able to panic the kernel during boot too, this adds a parameter to enable this option. Signed-off-by: Simon Arlott Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ec024b3baad0..e0b44b7b717a 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -448,6 +448,13 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) #ifdef CONFIG_SYSCTL +static int __init setup_unknown_nmi_panic(char *str) +{ + unknown_nmi_panic = 1; + return 1; +} +__setup("unknown_nmi_panic", setup_unknown_nmi_panic); + static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) { unsigned char reason = get_nmi_reason(); -- cgit v1.2.2 From 6bca67f951f80b9e61078f8cdf5fb7b3d9e51aa9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:27 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/cpu/mcheck/mce_64.c * nr_cpu_ids should be used to allocate arrays based on the number of cpu's present. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394c..2fe06ab5c547 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -580,7 +580,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, char __user *buf = ubuf; int i, err; - cpu_tsc = kmalloc(NR_CPUS * sizeof(long), GFP_KERNEL); + cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); if (!cpu_tsc) return -ENOMEM; -- cgit v1.2.2 From f2ad47ffeb1d292b7c7d1e2f6aedb37646c391db Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:28 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/cpu/proc.c * Use nr_cpu_ids instead of NR_CPUS to limit traversal of cpu online map. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 0d0d9057e7c0..a26c480b9491 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,7 +160,7 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < NR_CPUS && cpu_online(*pos)) + if ((*pos) < nr_cpu_ids && cpu_online(*pos)) return &cpu_data(*pos); return NULL; } -- cgit v1.2.2 From 247bc6ca0f691e4617e7bdb70cbaccc939f754ec Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:29 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/genx2apic_uv_x.c * Replace NR_CPUS loop with for_each_possible_cpu(). * nr_cpu_ids should be used to determine if a percpu area is available for a given cpu. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 711f11c30b06..3e5d7b8698f9 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -94,7 +94,7 @@ static void uv_send_IPI_mask(cpumask_t mask, int vector) { unsigned int cpu; - for (cpu = 0; cpu < NR_CPUS; ++cpu) + for_each_possible_cpu(cpu) if (cpu_isset(cpu, mask)) uv_send_IPI_one(cpu, vector); } @@ -128,7 +128,7 @@ static unsigned int uv_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; -- cgit v1.2.2 From 1bd9d6b64e1474f1a03f8660e8721d746cffae57 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:30 -0700 Subject: NR_CPUS: Replace NR_CPUS in arch/x86/kernel/genapic_flat_64.c * nr_cpu_ids should be used to determine if a percpu area is available for a given cpu. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 1a9c68845ee8..786548a62d38 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -168,7 +168,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) * May as well be the first. */ cpu = first_cpu(cpumask); - if ((unsigned)cpu < NR_CPUS) + if ((unsigned)cpu < nr_cpu_ids) return per_cpu(x86_cpu_to_apicid, cpu); else return BAD_APICID; -- cgit v1.2.2 From c4762aba0b1f72659aae9ce37b772ca8bd8f06f4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Fri, 18 Jul 2008 18:11:34 -0700 Subject: NR_CPUS: Replace NR_CPUS in speedstep-centrino.c Some cleanups in speedstep-centrino.c for NR_CPUS=4096. * Use new CPUMASK_PTR (instead of old CPUMASK_VAR). * Replace arrays sized by NR_CPUS with percpu variables. * Cleanup some formatting problems (>80 chars per line) and other checkpatch complaints. Signed-off-by: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 85 ++++++++++++++---------- 1 file changed, 51 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 470c016cb254..ca2ac13b7af2 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -28,7 +28,8 @@ #define PFX "speedstep-centrino: " #define MAINTAINER "cpufreq@lists.linux.org.uk" -#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) +#define dprintk(msg...) \ + cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) #define INTEL_MSR_RANGE (0xffff) @@ -66,11 +67,12 @@ struct cpu_model struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */ }; -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x); +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x); /* Operating points for current CPU */ -static struct cpu_model *centrino_model[NR_CPUS]; -static const struct cpu_id *centrino_cpu[NR_CPUS]; +static DEFINE_PER_CPU(struct cpu_model *, centrino_model); +static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu); static struct cpufreq_driver centrino_driver; @@ -255,7 +257,7 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) return -ENOENT; } - centrino_model[policy->cpu] = model; + per_cpu(centrino_model, policy->cpu) = model; dprintk("found \"%s\": max frequency: %dkHz\n", model->model_name, model->max_freq); @@ -264,10 +266,14 @@ static int centrino_cpu_init_table(struct cpufreq_policy *policy) } #else -static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) { return -ENODEV; } +static inline int centrino_cpu_init_table(struct cpufreq_policy *policy) +{ + return -ENODEV; +} #endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */ -static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, const struct cpu_id *x) +static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c, + const struct cpu_id *x) { if ((c->x86 == x->x86) && (c->x86_model == x->x86_model) && @@ -286,23 +292,28 @@ static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe) * for centrino, as some DSDTs are buggy. * Ideally, this can be done using the acpi_data structure. */ - if ((centrino_cpu[cpu] == &cpu_ids[CPU_BANIAS]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_A1]) || - (centrino_cpu[cpu] == &cpu_ids[CPU_DOTHAN_B0])) { + if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) || + (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) { msr = (msr >> 8) & 0xff; return msr * 100000; } - if ((!centrino_model[cpu]) || (!centrino_model[cpu]->op_points)) + if ((!per_cpu(centrino_model, cpu)) || + (!per_cpu(centrino_model, cpu)->op_points)) return 0; msr &= 0xffff; - for (i=0;centrino_model[cpu]->op_points[i].frequency != CPUFREQ_TABLE_END; i++) { - if (msr == centrino_model[cpu]->op_points[i].index) - return centrino_model[cpu]->op_points[i].frequency; + for (i = 0; + per_cpu(centrino_model, cpu)->op_points[i].frequency + != CPUFREQ_TABLE_END; + i++) { + if (msr == per_cpu(centrino_model, cpu)->op_points[i].index) + return per_cpu(centrino_model, cpu)-> + op_points[i].frequency; } if (failsafe) - return centrino_model[cpu]->op_points[i-1].frequency; + return per_cpu(centrino_model, cpu)->op_points[i-1].frequency; else return 0; } @@ -348,7 +359,8 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) int i; /* Only Intel makes Enhanced Speedstep-capable CPUs */ - if (cpu->x86_vendor != X86_VENDOR_INTEL || !cpu_has(cpu, X86_FEATURE_EST)) + if (cpu->x86_vendor != X86_VENDOR_INTEL || + !cpu_has(cpu, X86_FEATURE_EST)) return -ENODEV; if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC)) @@ -362,9 +374,9 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) break; if (i != N_IDS) - centrino_cpu[policy->cpu] = &cpu_ids[i]; + per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i]; - if (!centrino_cpu[policy->cpu]) { + if (!per_cpu(centrino_cpu, policy->cpu)) { dprintk("found unsupported CPU with " "Enhanced SpeedStep: send /proc/cpuinfo to " MAINTAINER "\n"); @@ -387,23 +399,26 @@ static int centrino_cpu_init(struct cpufreq_policy *policy) /* check to see if it stuck */ rdmsr(MSR_IA32_MISC_ENABLE, l, h); if (!(l & (1<<16))) { - printk(KERN_INFO PFX "couldn't enable Enhanced SpeedStep\n"); + printk(KERN_INFO PFX + "couldn't enable Enhanced SpeedStep\n"); return -ENODEV; } } freq = get_cur_freq(policy->cpu); - - policy->cpuinfo.transition_latency = 10000; /* 10uS transition latency */ + policy->cpuinfo.transition_latency = 10000; + /* 10uS transition latency */ policy->cur = freq; dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur); - ret = cpufreq_frequency_table_cpuinfo(policy, centrino_model[policy->cpu]->op_points); + ret = cpufreq_frequency_table_cpuinfo(policy, + per_cpu(centrino_model, policy->cpu)->op_points); if (ret) return (ret); - cpufreq_frequency_table_get_attr(centrino_model[policy->cpu]->op_points, policy->cpu); + cpufreq_frequency_table_get_attr( + per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu); return 0; } @@ -412,12 +427,12 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; - if (!centrino_model[cpu]) + if (!per_cpu(centrino_model, cpu)) return -ENODEV; cpufreq_frequency_table_put_attr(cpu); - centrino_model[cpu] = NULL; + per_cpu(centrino_model, cpu) = NULL; return 0; } @@ -431,14 +446,16 @@ static int centrino_cpu_exit(struct cpufreq_policy *policy) */ static int centrino_verify (struct cpufreq_policy *policy) { - return cpufreq_frequency_table_verify(policy, centrino_model[policy->cpu]->op_points); + return cpufreq_frequency_table_verify(policy, + per_cpu(centrino_model, policy->cpu)->op_points); } /** * centrino_setpolicy - set a new CPUFreq policy * @policy: new policy * @target_freq: the target frequency - * @relation: how that frequency relates to achieved frequency (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) + * @relation: how that frequency relates to achieved frequency + * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H) * * Sets a new CPUFreq policy. */ @@ -459,21 +476,21 @@ static int centrino_target (struct cpufreq_policy *policy, int retval = 0; unsigned int j, k, first_cpu, tmp; CPUMASK_ALLOC(allmasks); - CPUMASK_VAR(online_policy_cpus, allmasks); - CPUMASK_VAR(saved_mask, allmasks); - CPUMASK_VAR(set_mask, allmasks); - CPUMASK_VAR(covered_cpus, allmasks); + CPUMASK_PTR(online_policy_cpus, allmasks); + CPUMASK_PTR(saved_mask, allmasks); + CPUMASK_PTR(set_mask, allmasks); + CPUMASK_PTR(covered_cpus, allmasks); if (unlikely(allmasks == NULL)) return -ENOMEM; - if (unlikely(centrino_model[cpu] == NULL)) { + if (unlikely(per_cpu(centrino_model, cpu) == NULL)) { retval = -ENODEV; goto out; } if (unlikely(cpufreq_frequency_table_target(policy, - centrino_model[cpu]->op_points, + per_cpu(centrino_model, cpu)->op_points, target_freq, relation, &newstate))) { @@ -515,7 +532,7 @@ static int centrino_target (struct cpufreq_policy *policy, break; } - msr = centrino_model[cpu]->op_points[newstate].index; + msr = per_cpu(centrino_model, cpu)->op_points[newstate].index; if (first_cpu) { rdmsr(MSR_IA32_PERF_CTL, oldmsr, h); -- cgit v1.2.2 From 31656519e132f6612584815f128c83976a9aaaef Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 18 Jul 2008 18:01:23 +0200 Subject: sched, x86: clean up hrtick implementation random uvesafb failures were reported against Gentoo: http://bugs.gentoo.org/show_bug.cgi?id=222799 and Mihai Moldovan bisected it back to: > 8f4d37ec073c17e2d4aa8851df5837d798606d6f is first bad commit > commit 8f4d37ec073c17e2d4aa8851df5837d798606d6f > Author: Peter Zijlstra > Date: Fri Jan 25 21:08:29 2008 +0100 > > sched: high-res preemption tick Linus suspected it to be hrtick + vm86 interaction and observed: > Btw, Peter, Ingo: I think that commit is doing bad things. They aren't > _incorrect_ per se, but they are definitely bad. > > Why? > > Using random _TIF_WORK_MASK flags is really impolite for doing > "scheduling" work. There's a reason that arch/x86/kernel/entry_32.S > special-cases the _TIF_NEED_RESCHED flag: we don't want to exit out of > vm86 mode unnecessarily. > > See the "work_notifysig_v86" label, and how it does that > "save_v86_state()" thing etc etc. Right, I never liked having to fiddle with those TIF flags. Initially I needed it because the hrtimer base lock could not nest in the rq lock. That however is fixed these days. Currently the only reason left to fiddle with the TIF flags is remote wakeups. We cannot program a remote cpu's hrtimer. I've been thinking about using the new and improved IPI function call stuff to implement hrtimer_start_on(). However that does require that smp_call_function_single(.wait=0) works from interrupt context - /me looks at the latest series from Jens - Yes that does seem to be supported, good. Here's a stab at cleaning this stuff up ... Mihai reported test success as well. Signed-off-by: Peter Zijlstra Tested-by: Mihai Moldovan Cc: Michal Januszewski Cc: Antonino Daplas Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 3 --- arch/x86/kernel/signal_64.c | 3 --- 2 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d92373630963..e1fc7bd57bfe 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -667,8 +667,5 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); - clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index e53b267662e7..88023fcc7049 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -502,9 +502,6 @@ void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); - - if (thread_info_flags & _TIF_HRTICK_RESCHED) - hrtick_resched(); } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.2.2 From 8b2cf73cc11cf29a21c51c453a3205f23d888915 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sun, 27 Apr 2008 12:14:13 -0700 Subject: KVM: add statics were possible, function definition in lapic.h Noticed by sparse: arch/x86/kvm/vmx.c:1583:6: warning: symbol 'vmx_disable_intercept_for_msr' was not declared. Should it be static? arch/x86/kvm/x86.c:3406:5: warning: symbol 'kvm_task_switch_16' was not declared. Should it be static? arch/x86/kvm/x86.c:3429:5: warning: symbol 'kvm_task_switch_32' was not declared. Should it be static? arch/x86/kvm/mmu.c:1968:6: warning: symbol 'kvm_mmu_remove_one_alloc_mmu_page' was not declared. Should it be static? arch/x86/kvm/mmu.c:2014:6: warning: symbol 'mmu_destroy_caches' was not declared. Should it be static? arch/x86/kvm/lapic.c:862:5: warning: symbol 'kvm_lapic_get_base' was not declared. Should it be static? arch/x86/kvm/i8254.c:94:5: warning: symbol 'pit_get_gate' was not declared. Should it be static? arch/x86/kvm/i8254.c:196:5: warning: symbol '__pit_timer_fn' was not declared. Should it be static? arch/x86/kvm/i8254.c:561:6: warning: symbol '__inject_pit_timer_intr' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 6 +++--- arch/x86/kvm/lapic.h | 1 + arch/x86/kvm/mmu.c | 2 +- arch/x86/kvm/vmx.c | 2 +- arch/x86/kvm/x86.c | 4 ++-- 5 files changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 3829aa7b663f..735ec9a0b360 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -91,7 +91,7 @@ static void pit_set_gate(struct kvm *kvm, int channel, u32 val) c->gate = val; } -int pit_get_gate(struct kvm *kvm, int channel) +static int pit_get_gate(struct kvm *kvm, int channel) { WARN_ON(!mutex_is_locked(&kvm->arch.vpit->pit_state.lock)); @@ -193,7 +193,7 @@ static void pit_latch_status(struct kvm *kvm, int channel) } } -int __pit_timer_fn(struct kvm_kpit_state *ps) +static int __pit_timer_fn(struct kvm_kpit_state *ps) { struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; struct kvm_kpit_timer *pt = &ps->pit_timer; @@ -575,7 +575,7 @@ void kvm_free_pit(struct kvm *kvm) } } -void __inject_pit_timer_intr(struct kvm *kvm) +static void __inject_pit_timer_intr(struct kvm *kvm) { mutex_lock(&kvm->lock); kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 676c396c9cee..81858881287e 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -31,6 +31,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); +u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); int kvm_apic_match_physical_addr(struct kvm_lapic *apic, u16 dest); int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7e7c3969f7a2..8e449dbcc596 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1948,7 +1948,7 @@ void kvm_mmu_zap_all(struct kvm *kvm) kvm_flush_remote_tlbs(kvm); } -void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) +static void kvm_mmu_remove_one_alloc_mmu_page(struct kvm *kvm) { struct kvm_mmu_page *page; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 10ce6ee4c491..397393059800 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1821,7 +1821,7 @@ static void allocate_vpid(struct vcpu_vmx *vmx) spin_unlock(&vmx_vpid_lock); } -void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) +static void vmx_disable_intercept_for_msr(struct page *msr_bitmap, u32 msr) { void *va; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0faa2546b1cd..45dc2b6a9c82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3449,7 +3449,7 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, +static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, struct desc_struct *cseg_desc, struct desc_struct *nseg_desc) { @@ -3472,7 +3472,7 @@ out: return ret; } -int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, +static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, struct desc_struct *cseg_desc, struct desc_struct *nseg_desc) { -- cgit v1.2.2 From c7bf23babc959b186335d2640959a1b8633588de Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:55:59 +0200 Subject: KVM: VMX: move APIC_ACCESS trace entry to generic code This patch moves the trace entry for APIC accesses from the VMX code to the generic lapic code. This way APIC accesses from SVM will also be traced. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 4 ++++ arch/x86/kvm/vmx.c | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ebc03f5ae162..f9201fbc61d1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -572,6 +572,8 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) { u32 val = 0; + KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + if (offset >= LAPIC_MMIO_LENGTH) return 0; @@ -695,6 +697,8 @@ static void apic_mmio_write(struct kvm_io_device *this, offset &= 0xff0; + KVMTRACE_1D(APIC_ACCESS, apic->vcpu, (u32)offset, handler); + switch (offset) { case APIC_ID: /* Local APIC ID */ apic_set_reg(apic, APIC_ID, val); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 397393059800..8c951d3eab30 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2554,8 +2554,6 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_read64(EXIT_QUALIFICATION); offset = exit_qualification & 0xffful; - KVMTRACE_1D(APIC_ACCESS, vcpu, (u32)offset, handler); - er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); if (er != EMULATE_DONE) { -- cgit v1.2.2 From c47f098d69ed2bd7343e54095ff4aa2533253bee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:56:00 +0200 Subject: KVM: SVM: implement dedicated NMI exit handler With an exit handler for NMI intercepts its possible to account them using kvmtrace. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6b0d5fa5bab3..8a2118b09fd2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1081,6 +1081,11 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); } +static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + return 1; +} + static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { return 1; @@ -1365,7 +1370,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, [SVM_EXIT_INTR] = nop_on_interception, - [SVM_EXIT_NMI] = nop_on_interception, + [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, [SVM_EXIT_VINTR] = interrupt_window_interception, -- cgit v1.2.2 From a069805579a390f0fa91694f6963bcc4b2cecc6b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:56:01 +0200 Subject: KVM: SVM: implement dedicated INTR exit handler With an exit handler for INTR intercepts its possible to account them using kvmtrace. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8a2118b09fd2..0eac1a5060a6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1086,6 +1086,12 @@ static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + ++svm->vcpu.stat.irq_exits; + return 1; +} + static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { return 1; @@ -1369,7 +1375,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, - [SVM_EXIT_INTR] = nop_on_interception, + [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, [SVM_EXIT_INIT] = nop_on_interception, -- cgit v1.2.2 From 54e445ca8411ec892f986d9f8c11b8c1806ecde4 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:56:02 +0200 Subject: KVM: add missing kvmtrace bits This patch adds some kvmtrace bits to the generic x86 code where it is instrumented from SVM. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 45dc2b6a9c82..59084a3981c0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2020,6 +2020,7 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) int emulate_clts(struct kvm_vcpu *vcpu) { + KVMTRACE_0D(CLTS, vcpu, handler); kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); return X86EMUL_CONTINUE; } @@ -2600,27 +2601,41 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw, unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) { + unsigned long value; + kvm_x86_ops->decache_cr4_guest_bits(vcpu); switch (cr) { case 0: - return vcpu->arch.cr0; + value = vcpu->arch.cr0; + break; case 2: - return vcpu->arch.cr2; + value = vcpu->arch.cr2; + break; case 3: - return vcpu->arch.cr3; + value = vcpu->arch.cr3; + break; case 4: - return vcpu->arch.cr4; + value = vcpu->arch.cr4; + break; case 8: - return kvm_get_cr8(vcpu); + value = kvm_get_cr8(vcpu); + break; default: vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); return 0; } + KVMTRACE_3D(CR_READ, vcpu, (u32)cr, (u32)value, + (u32)((u64)value >> 32), handler); + + return value; } void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val, unsigned long *rflags) { + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)val, + (u32)((u64)val >> 32), handler); + switch (cr) { case 0: kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); -- cgit v1.2.2 From af9ca2d703f4cefbf6441bfe127c4191092ad394 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:56:03 +0200 Subject: KVM: SVM: add missing kvmtrace markers This patch adds the missing kvmtrace markers to the svm module of kvm. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 0eac1a5060a6..8953292acfd9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -949,7 +949,9 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) { - return to_svm(vcpu)->db_regs[dr]; + unsigned long val = to_svm(vcpu)->db_regs[dr]; + KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); + return val; } static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, @@ -1004,6 +1006,12 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; + + if (!npt_enabled) + KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, + (u32)fault_address, (u32)(fault_address >> 32), + handler); + return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } @@ -1083,12 +1091,14 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + KVMTRACE_0D(NMI, &svm->vcpu, handler); return 1; } static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { ++svm->vcpu.stat.irq_exits; + KVMTRACE_0D(INTR, &svm->vcpu, handler); return 1; } @@ -1230,6 +1240,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (svm_get_msr(&svm->vcpu, ecx, &data)) kvm_inject_gp(&svm->vcpu, 0); else { + KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, + (u32)(data >> 32), handler); + svm->vmcb->save.rax = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; svm->next_rip = svm->vmcb->save.rip + 2; @@ -1315,6 +1328,10 @@ static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; u64 data = (svm->vmcb->save.rax & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); + + KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), + handler); + svm->next_rip = svm->vmcb->save.rip + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); @@ -1334,6 +1351,8 @@ static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int interrupt_window_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { + KVMTRACE_0D(PEND_INTR, &svm->vcpu, handler); + svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; /* @@ -1408,6 +1427,9 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_svm *svm = to_svm(vcpu); u32 exit_code = svm->vmcb->control.exit_code; + KVMTRACE_3D(VMEXIT, vcpu, exit_code, (u32)svm->vmcb->save.rip, + (u32)((u64)svm->vmcb->save.rip >> 32), entryexit); + if (npt_enabled) { int mmu_reload = 0; if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) { @@ -1481,6 +1503,8 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) { struct vmcb_control_area *control; + KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; -- cgit v1.2.2 From d2ebb4103ff349af6dac14955bf93e57487a6694 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 30 Apr 2008 17:56:04 +0200 Subject: KVM: SVM: add tracing support for TDP page faults To distinguish between real page faults and nested page faults they should be traced as different events. This is implemented by this patch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8953292acfd9..218949cce1a0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1011,6 +1011,10 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(PAGE_FAULT, &svm->vcpu, error_code, (u32)fault_address, (u32)(fault_address >> 32), handler); + else + KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, + (u32)fault_address, (u32)(fault_address >> 32), + handler); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -- cgit v1.2.2 From f697554515b06e8d7264f316b25e6da943407142 Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Fri, 2 May 2008 17:02:23 +0200 Subject: KVM: PIT: support mode 3 The in-kernel PIT emulation ignores pending timers if operating under mode 3, which for example Hurd uses. This mode should output a square wave, high for (N+1)/2 counts and low for (N-1)/2 counts. As we only care about the resulting interrupts, the period is N, and mode 3 is the same as mode 2 with regard to interrupts. Signed-off-by: Aurelien Jarno Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 735ec9a0b360..60074dc66bd7 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -308,6 +308,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) create_pit_timer(&ps->pit_timer, val, 0); break; case 2: + case 3: create_pit_timer(&ps->pit_timer, val, 1); break; default: -- cgit v1.2.2 From 14ae51b6c068ef7ab52dc2d53fe226e6189f2ab2 Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Mon, 5 May 2008 13:05:16 -0400 Subject: KVM: SVM: Fake MSR_K7 performance counters Attached is a patch that fixes a guest crash when booting older Linux kernels. The problem stems from the fact that we are currently emulating MSR_K7_EVNTSEL[0-3], but not emulating MSR_K7_PERFCTR[0-3]. Because of this, setup_k7_watchdog() in the Linux kernel receives a GPF when it attempts to write into MSR_K7_PERFCTR, which causes an OOPs. The patch fixes it by just "fake" emulating the appropriate MSRs, throwing away the data in the process. This causes the NMI watchdog to not actually work, but it's not such a big deal in a virtualized environment. When we get a write to one of these counters, we printk_ratelimit() a warning. I decided to print it out for all writes, even if the data is 0; it doesn't seem to make sense to me to special case when data == 0. Tested by myself on a RHEL-4 guest, and Joerg Roedel on a Windows XP 64-bit guest. Signed-off-by: Chris Lalancette Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 218949cce1a0..992ab7115871 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1312,16 +1312,19 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) case MSR_K7_EVNTSEL1: case MSR_K7_EVNTSEL2: case MSR_K7_EVNTSEL3: + case MSR_K7_PERFCTR0: + case MSR_K7_PERFCTR1: + case MSR_K7_PERFCTR2: + case MSR_K7_PERFCTR3: /* - * only support writing 0 to the performance counters for now - * to make Windows happy. Should be replaced by a real - * performance counter emulation later. + * Just discard all writes to the performance counters; this + * should keep both older linux and windows 64-bit guests + * happy */ - if (data != 0) - goto unhandled; + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", ecx, data); + break; default: - unhandled: return kvm_set_msr_common(vcpu, ecx, data); } return 0; -- cgit v1.2.2 From 7682f2d0dd3ff5bd2756eac018a5b4e7e30ef16c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 12 May 2008 19:25:43 +0300 Subject: KVM: VMX: Trivial vmcs_write64() code simplification Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 8c951d3eab30..fff8e23433d6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -431,10 +431,8 @@ static void vmcs_write32(unsigned long field, u32 value) static void vmcs_write64(unsigned long field, u64 value) { -#ifdef CONFIG_X86_64 - vmcs_writel(field, value); -#else vmcs_writel(field, value); +#ifndef CONFIG_X86_64 asm volatile (""); vmcs_writel(field+1, value >> 32); #endif -- cgit v1.2.2 From 1b7fcd3263e5f12dba43d27b64e1578bec070c28 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 15 May 2008 13:51:35 +0300 Subject: KVM: MMU: Fix false flooding when a pte points to page table The KVM MMU tries to detect when a speculative pte update is not actually used by demand fault, by checking the accessed bit of the shadow pte. If the shadow pte has not been accessed, we deem that page table flooded and remove the shadow page table, allowing further pte updates to proceed without emulation. However, if the pte itself points at a page table and only used for write operations, the accessed bit will never be set since all access will happen through the emulator. This is exactly what happens with kscand on old (2.4.x) HIGHMEM kernels. The kernel points a kmap_atomic() pte at a page table, and then proceeds with read-modify-write operations to look at the dirty and accessed bits. We get a false flood trigger on the kmap ptes, which results in the mmu spending all its time setting up and tearing down shadows. Fix by setting the shadow accessed bit on emulated accesses. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 17 ++++++++++++++++- arch/x86/kvm/mmu.h | 3 ++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8e449dbcc596..53f1ed852ca2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1122,8 +1122,10 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, else kvm_release_pfn_clean(pfn); } - if (!ptwrite || !*ptwrite) + if (speculative) { vcpu->arch.last_pte_updated = shadow_pte; + vcpu->arch.last_pte_gfn = gfn; + } } static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) @@ -1671,6 +1673,18 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, vcpu->arch.update_pte.pfn = pfn; } +static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) +{ + u64 *spte = vcpu->arch.last_pte_updated; + + if (spte + && vcpu->arch.last_pte_gfn == gfn + && shadow_accessed_mask + && !(*spte & shadow_accessed_mask) + && is_shadow_present_pte(*spte)) + set_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); +} + void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes) { @@ -1694,6 +1708,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); mmu_guess_page_from_pte_write(vcpu, gpa, new, bytes); spin_lock(&vcpu->kvm->mmu_lock); + kvm_mmu_access_page(vcpu, gfn); kvm_mmu_free_some_pages(vcpu); ++vcpu->kvm->stat.mmu_pte_write; kvm_mmu_audit(vcpu, "pre pte write"); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 1730757bbc7a..258e5d56298e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -15,7 +15,8 @@ #define PT_USER_MASK (1ULL << 2) #define PT_PWT_MASK (1ULL << 3) #define PT_PCD_MASK (1ULL << 4) -#define PT_ACCESSED_MASK (1ULL << 5) +#define PT_ACCESSED_SHIFT 5 +#define PT_ACCESSED_MASK (1ULL << PT_ACCESSED_SHIFT) #define PT_DIRTY_MASK (1ULL << 6) #define PT_PAGE_SIZE_MASK (1ULL << 7) #define PT_PAT_MASK (1ULL << 7) -- cgit v1.2.2 From 4ecac3fd6dc2629ad76a658a486f081c44aef10e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 May 2008 13:23:38 +0300 Subject: KVM: Handle virtualization instruction #UD faults during reboot KVM turns off hardware virtualization extensions during reboot, in order to disassociate the memory used by the virtualization extensions from the processor, and in order to have the system in a consistent state. Unfortunately virtual machines may still be running while this goes on, and once virtualization extensions are turned off, any virtulization instruction will #UD on execution. Fix by adding an exception handler to virtualization instructions; if we get an exception during reboot, we simply spin waiting for the reset to complete. If it's a true exception, BUG() so we can have our stack trace. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 20 +++++++++++--------- arch/x86/kvm/vmx.c | 25 ++++++++++++++----------- 2 files changed, 25 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 992ab7115871..9390a31c06f4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -27,6 +27,8 @@ #include +#define __ex(x) __kvm_handle_fault_on_reboot(x) + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -129,17 +131,17 @@ static inline void push_irq(struct kvm_vcpu *vcpu, u8 irq) static inline void clgi(void) { - asm volatile (SVM_CLGI); + asm volatile (__ex(SVM_CLGI)); } static inline void stgi(void) { - asm volatile (SVM_STGI); + asm volatile (__ex(SVM_STGI)); } static inline void invlpga(unsigned long addr, u32 asid) { - asm volatile (SVM_INVLPGA :: "a"(addr), "c"(asid)); + asm volatile (__ex(SVM_INVLPGA) :: "a"(addr), "c"(asid)); } static inline unsigned long kvm_read_cr2(void) @@ -1758,17 +1760,17 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Enter guest mode */ "push %%rax \n\t" "mov %c[vmcb](%[svm]), %%rax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" + __ex(SVM_VMLOAD) "\n\t" + __ex(SVM_VMRUN) "\n\t" + __ex(SVM_VMSAVE) "\n\t" "pop %%rax \n\t" #else /* Enter guest mode */ "push %%eax \n\t" "mov %c[vmcb](%[svm]), %%eax \n\t" - SVM_VMLOAD "\n\t" - SVM_VMRUN "\n\t" - SVM_VMSAVE "\n\t" + __ex(SVM_VMLOAD) "\n\t" + __ex(SVM_VMRUN) "\n\t" + __ex(SVM_VMSAVE) "\n\t" "pop %%eax \n\t" #endif diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fff8e23433d6..b80b4d141637 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -30,6 +30,8 @@ #include #include +#define __ex(x) __kvm_handle_fault_on_reboot(x) + MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -278,7 +280,7 @@ static inline void __invvpid(int ext, u16 vpid, gva_t gva) u64 gva; } operand = { vpid, 0, gva }; - asm volatile (ASM_VMX_INVVPID + asm volatile (__ex(ASM_VMX_INVVPID) /* CF==1 or ZF==1 --> rc = -1 */ "; ja 1f ; ud2 ; 1:" : : "a"(&operand), "c"(ext) : "cc", "memory"); @@ -290,7 +292,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa) u64 eptp, gpa; } operand = {eptp, gpa}; - asm volatile (ASM_VMX_INVEPT + asm volatile (__ex(ASM_VMX_INVEPT) /* CF==1 or ZF==1 --> rc = -1 */ "; ja 1f ; ud2 ; 1:\n" : : "a" (&operand), "c" (ext) : "cc", "memory"); @@ -311,7 +313,7 @@ static void vmcs_clear(struct vmcs *vmcs) u64 phys_addr = __pa(vmcs); u8 error; - asm volatile (ASM_VMX_VMCLEAR_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc", "memory"); if (error) @@ -378,7 +380,7 @@ static unsigned long vmcs_readl(unsigned long field) { unsigned long value; - asm volatile (ASM_VMX_VMREAD_RDX_RAX + asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) : "=a"(value) : "d"(field) : "cc"); return value; } @@ -413,7 +415,7 @@ static void vmcs_writel(unsigned long field, unsigned long value) { u8 error; - asm volatile (ASM_VMX_VMWRITE_RAX_RDX "; setna %0" + asm volatile (__ex(ASM_VMX_VMWRITE_RAX_RDX) "; setna %0" : "=q"(error) : "a"(value), "d"(field) : "cc"); if (unlikely(error)) vmwrite_error(field, value); @@ -621,7 +623,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) u8 error; per_cpu(current_vmcs, cpu) = vmx->vmcs; - asm volatile (ASM_VMX_VMPTRLD_RAX "; setna %0" + asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) : "cc"); if (error) @@ -1030,13 +1032,14 @@ static void hardware_enable(void *garbage) MSR_IA32_FEATURE_CONTROL_LOCKED | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ - asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) + asm volatile (ASM_VMX_VMXON_RAX + : : "a"(&phys_addr), "m"(phys_addr) : "memory", "cc"); } static void hardware_disable(void *garbage) { - asm volatile (ASM_VMX_VMXOFF : : : "cc"); + asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -2834,7 +2837,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "push %%edx; push %%ebp;" "push %%ecx \n\t" #endif - ASM_VMX_VMWRITE_RSP_RDX "\n\t" + __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ @@ -2869,9 +2872,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif /* Enter guest mode */ "jne .Llaunched \n\t" - ASM_VMX_VMLAUNCH "\n\t" + __ex(ASM_VMX_VMLAUNCH) "\n\t" "jmp .Lkvm_vmx_return \n\t" - ".Llaunched: " ASM_VMX_VMRESUME "\n\t" + ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ #ifdef CONFIG_X86_64 -- cgit v1.2.2 From 543e42436643d68ad007d0bae2f485caac9c8a02 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 May 2008 16:22:47 +0300 Subject: KVM: VMX: Add list of potentially locally cached vcpus VMX hardware can cache the contents of a vcpu's vmcs. This cache needs to be flushed when migrating a vcpu to another cpu, or (which is the case that interests us here) when disabling hardware virtualization on a cpu. The current implementation of decaching iterates over the list of all vcpus, picks the ones that are potentially cached on the cpu that is being offlined, and flushes the cache. The problem is that it uses mutex_trylock() to gain exclusive access to the vcpu, which fires off a (benign) warning about using the mutex in an interrupt context. To avoid this, and to make things generally nicer, add a new per-cpu list of potentially cached vcus. This makes the decaching code much simpler. The list is vmx-specific since other hardware doesn't have this issue. [andrea: fix crash on suspend/resume] Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 24 ++++++++++++++++++++++-- arch/x86/kvm/x86.c | 27 --------------------------- 2 files changed, 22 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b80b4d141637..4d179d106376 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -55,6 +55,7 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; + struct list_head local_vcpus_link; int launched; u8 fail; u32 idt_vectoring_info; @@ -93,6 +94,7 @@ static int init_rmode(struct kvm *kvm); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); +static DEFINE_PER_CPU(struct list_head, vcpus_on_cpu); static struct page *vmx_io_bitmap_a; static struct page *vmx_io_bitmap_b; @@ -331,6 +333,9 @@ static void __vcpu_clear(void *arg) if (per_cpu(current_vmcs, cpu) == vmx->vmcs) per_cpu(current_vmcs, cpu) = NULL; rdtscll(vmx->vcpu.arch.host_tsc); + list_del(&vmx->local_vcpus_link); + vmx->vcpu.cpu = -1; + vmx->launched = 0; } static void vcpu_clear(struct vcpu_vmx *vmx) @@ -338,7 +343,6 @@ static void vcpu_clear(struct vcpu_vmx *vmx) if (vmx->vcpu.cpu == -1) return; smp_call_function_single(vmx->vcpu.cpu, __vcpu_clear, vmx, 1); - vmx->launched = 0; } static inline void vpid_sync_vcpu_all(struct vcpu_vmx *vmx) @@ -617,6 +621,10 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu_clear(vmx); kvm_migrate_timers(vcpu); vpid_sync_vcpu_all(vmx); + local_irq_disable(); + list_add(&vmx->local_vcpus_link, + &per_cpu(vcpus_on_cpu, cpu)); + local_irq_enable(); } if (per_cpu(current_vmcs, cpu) != vmx->vmcs) { @@ -1022,6 +1030,7 @@ static void hardware_enable(void *garbage) u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); u64 old; + INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) @@ -1037,8 +1046,19 @@ static void hardware_enable(void *garbage) : "memory", "cc"); } +static void vmclear_local_vcpus(void) +{ + int cpu = raw_smp_processor_id(); + struct vcpu_vmx *vmx, *n; + + list_for_each_entry_safe(vmx, n, &per_cpu(vcpus_on_cpu, cpu), + local_vcpus_link) + __vcpu_clear(vmx); +} + static void hardware_disable(void *garbage) { + vmclear_local_vcpus(); asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); write_cr4(read_cr4() & ~X86_CR4_VMXE); } @@ -2967,7 +2987,7 @@ static void vmx_free_vmcs(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->vmcs) { - on_each_cpu(__vcpu_clear, vmx, 1); + vcpu_clear(vmx); free_vmcs(vmx->vmcs); vmx->vmcs = NULL; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 59084a3981c0..8c14ddcaba70 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -823,33 +823,6 @@ out: */ void decache_vcpus_on_cpu(int cpu) { - struct kvm *vm; - struct kvm_vcpu *vcpu; - int i; - - spin_lock(&kvm_lock); - list_for_each_entry(vm, &vm_list, vm_list) - for (i = 0; i < KVM_MAX_VCPUS; ++i) { - vcpu = vm->vcpus[i]; - if (!vcpu) - continue; - /* - * If the vcpu is locked, then it is running on some - * other cpu and therefore it is not cached on the - * cpu in question. - * - * If it's not locked, check the last cpu it executed - * on. - */ - if (mutex_trylock(&vcpu->mutex)) { - if (vcpu->cpu == cpu) { - kvm_x86_ops->vcpu_decache(vcpu); - vcpu->cpu = -1; - } - mutex_unlock(&vcpu->mutex); - } - } - spin_unlock(&kvm_lock); } int kvm_dev_ioctl_check_extension(long ext) -- cgit v1.2.2 From 7cc8883074b040aa8c1ebd3a17463b0ea3a9ef16 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 13 May 2008 16:29:20 +0300 Subject: KVM: Remove decache_vcpus_on_cpu() and related callbacks Obsoleted by the vmx-specific per-cpu list. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 5 ----- arch/x86/kvm/vmx.c | 6 ------ arch/x86/kvm/x86.c | 8 -------- 3 files changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9390a31c06f4..238e8f3afaf4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -709,10 +709,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) rdtscll(vcpu->arch.host_tsc); } -static void svm_vcpu_decache(struct kvm_vcpu *vcpu) -{ -} - static void svm_cache_regs(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1933,7 +1929,6 @@ static struct kvm_x86_ops svm_x86_ops = { .prepare_guest_switch = svm_prepare_guest_switch, .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .vcpu_decache = svm_vcpu_decache, .set_guest_debug = svm_guest_debug, .get_msr = svm_get_msr, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4d179d106376..b99bb37e5dec 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -692,11 +692,6 @@ static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); } -static void vmx_vcpu_decache(struct kvm_vcpu *vcpu) -{ - vcpu_clear(to_vmx(vcpu)); -} - static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { return vmcs_readl(GUEST_RFLAGS); @@ -3114,7 +3109,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .prepare_guest_switch = vmx_save_host_state, .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .vcpu_decache = vmx_vcpu_decache, .set_guest_debug = set_guest_debug, .guest_debug_pre = kvm_guest_debug_pre, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 8c14ddcaba70..fd03b4465bcc 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -817,14 +817,6 @@ out: return r; } -/* - * Make sure that a cpu that is being hot-unplugged does not have any vcpus - * cached on it. - */ -void decache_vcpus_on_cpu(int cpu) -{ -} - int kvm_dev_ioctl_check_extension(long ext) { int r; -- cgit v1.2.2 From 50d40d7fb9b09e68a657c68837fcfa067b70cc42 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 25 May 2008 14:38:15 +0300 Subject: KVM: Remove unnecessary ->decache_regs() call Since we aren't modifying any register, there's no need to decache the register state. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fd03b4465bcc..5f00c60f0aff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2297,7 +2297,6 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_x86_ops->cache_regs(vcpu); memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); -- cgit v1.2.2 From 3419ffc8e45a5344abc87684cbca6cdc5c9c8a01 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 15 May 2008 09:52:48 +0800 Subject: KVM: IOAPIC/LAPIC: Enable NMI support [avi: fix ia64 build breakage] Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 3 ++- arch/x86/kvm/x86.c | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index f9201fbc61d1..e48d19394031 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -356,8 +356,9 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, case APIC_DM_SMI: printk(KERN_DEBUG "Ignoring guest SMI\n"); break; + case APIC_DM_NMI: - printk(KERN_DEBUG "Ignoring guest NMI\n"); + kvm_inject_nmi(vcpu); break; case APIC_DM_INIT: diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f00c60f0aff..19974dde6567 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -173,6 +173,12 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); } +void kvm_inject_nmi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.nmi_pending = 1; +} +EXPORT_SYMBOL_GPL(kvm_inject_nmi); + void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) { WARN_ON(vcpu->arch.exception.pending); -- cgit v1.2.2 From f08864b42a45581a64558aa5b6b673c77b97ee5d Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 15 May 2008 18:23:25 +0800 Subject: KVM: VMX: Enable NMI with in-kernel irqchip Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 124 +++++++++++++++++++++++++++++++++++++++++++++-------- arch/x86/kvm/vmx.h | 12 +++++- arch/x86/kvm/x86.c | 1 + 3 files changed, 118 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b99bb37e5dec..1bb994657208 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -264,6 +264,11 @@ static inline int cpu_has_vmx_vpid(void) SECONDARY_EXEC_ENABLE_VPID); } +static inline int cpu_has_virtual_nmis(void) +{ + return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; +} + static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr) { int i; @@ -1088,7 +1093,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) u32 _vmentry_control = 0; min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = 0; + opt = PIN_BASED_VIRTUAL_NMIS; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -2130,6 +2135,13 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) irq | INTR_TYPE_EXT_INTR | INTR_INFO_VALID_MASK); } +static void vmx_inject_nmi(struct kvm_vcpu *vcpu) +{ + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); + vcpu->arch.nmi_pending = 0; +} + static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) { int word_index = __ffs(vcpu->arch.irq_summary); @@ -2653,6 +2665,19 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u32 cpu_based_vm_exec_control; + + /* clear pending NMI */ + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); + ++vcpu->stat.nmi_window_exits; + + return 1; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs @@ -2663,6 +2688,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_EXCEPTION_NMI] = handle_exception, [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, + [EXIT_REASON_NMI_WINDOW] = handle_nmi_window, [EXIT_REASON_IO_INSTRUCTION] = handle_io, [EXIT_REASON_CR_ACCESS] = handle_cr, [EXIT_REASON_DR_ACCESS] = handle_dr, @@ -2750,17 +2776,52 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); } +static void enable_nmi_window(struct kvm_vcpu *vcpu) +{ + u32 cpu_based_vm_exec_control; + + if (!cpu_has_virtual_nmis()) + return; + + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); + cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; + vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); +} + +static int vmx_nmi_enabled(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + return !(guest_intr & (GUEST_INTR_STATE_NMI | + GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_STI)); +} + +static int vmx_irq_enabled(struct kvm_vcpu *vcpu) +{ + u32 guest_intr = vmcs_read32(GUEST_INTERRUPTIBILITY_INFO); + return (!(guest_intr & (GUEST_INTR_STATE_MOV_SS | + GUEST_INTR_STATE_STI)) && + (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF)); +} + +static void enable_intr_window(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.nmi_pending) + enable_nmi_window(vcpu); + else if (kvm_cpu_has_interrupt(vcpu)) + enable_irq_window(vcpu); +} + static void vmx_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int has_ext_irq, interrupt_window_open; + u32 idtv_info_field, intr_info_field, exit_intr_info_field; int vector; update_tpr_threshold(vcpu); - has_ext_irq = kvm_cpu_has_interrupt(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); + exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); idtv_info_field = vmx->idt_vectoring_info; if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { @@ -2768,8 +2829,7 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) if (printk_ratelimit()) printk(KERN_ERR "Fault when IDT_Vectoring\n"); } - if (has_ext_irq) - enable_irq_window(vcpu); + enable_intr_window(vcpu); return; } if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { @@ -2779,30 +2839,56 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; vmx_inject_irq(vcpu, vect); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); + enable_intr_window(vcpu); return; } KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field); + /* + * SDM 3: 25.7.1.2 + * Clear bit "block by NMI" before VM entry if a NMI delivery + * faulted. + */ + if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) + == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + ~GUEST_INTR_STATE_NMI); + + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field + & ~INTR_INFO_RESVD_BITS_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, vmcs_read32(IDT_VECTORING_ERROR_CODE)); - if (unlikely(has_ext_irq)) - enable_irq_window(vcpu); + enable_intr_window(vcpu); return; } - if (!has_ext_irq) + if (cpu_has_virtual_nmis()) { + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && + (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) + vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, + vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | + GUEST_INTR_STATE_NMI); + else if (vcpu->arch.nmi_pending) { + if (vmx_nmi_enabled(vcpu)) + vmx_inject_nmi(vcpu); + enable_intr_window(vcpu); + return; + } + + } + if (!kvm_cpu_has_interrupt(vcpu)) return; - interrupt_window_open = - ((vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) && - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); - if (interrupt_window_open) { + if (vmx_irq_enabled(vcpu)) { vector = kvm_cpu_get_interrupt(vcpu); vmx_inject_irq(vcpu, vector); kvm_timer_intr_post(vcpu, vector); @@ -2963,7 +3049,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) fixup_rmode_irq(vmx); vcpu->arch.interrupt_window_open = - (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0; + (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & + (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)) == 0; asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); vmx->launched = 1; @@ -2971,7 +3058,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) intr_info = vmcs_read32(VM_EXIT_INTR_INFO); /* We need to handle NMIs before interrupts are enabled */ - if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200) { /* nmi */ + if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == 0x200 && + (intr_info & INTR_INFO_VALID_MASK)) { KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); } diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 79d94c610dfe..425a13436b3f 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -40,6 +40,7 @@ #define CPU_BASED_CR8_LOAD_EXITING 0x00080000 #define CPU_BASED_CR8_STORE_EXITING 0x00100000 #define CPU_BASED_TPR_SHADOW 0x00200000 +#define CPU_BASED_VIRTUAL_NMI_PENDING 0x00400000 #define CPU_BASED_MOV_DR_EXITING 0x00800000 #define CPU_BASED_UNCOND_IO_EXITING 0x01000000 #define CPU_BASED_USE_IO_BITMAPS 0x02000000 @@ -216,7 +217,7 @@ enum vmcs_field { #define EXIT_REASON_TRIPLE_FAULT 2 #define EXIT_REASON_PENDING_INTERRUPT 7 - +#define EXIT_REASON_NMI_WINDOW 8 #define EXIT_REASON_TASK_SWITCH 9 #define EXIT_REASON_CPUID 10 #define EXIT_REASON_HLT 12 @@ -251,7 +252,9 @@ enum vmcs_field { #define INTR_INFO_VECTOR_MASK 0xff /* 7:0 */ #define INTR_INFO_INTR_TYPE_MASK 0x700 /* 10:8 */ #define INTR_INFO_DELIVER_CODE_MASK 0x800 /* 11 */ +#define INTR_INFO_UNBLOCK_NMI 0x1000 /* 12 */ #define INTR_INFO_VALID_MASK 0x80000000 /* 31 */ +#define INTR_INFO_RESVD_BITS_MASK 0x7ffff000 #define VECTORING_INFO_VECTOR_MASK INTR_INFO_VECTOR_MASK #define VECTORING_INFO_TYPE_MASK INTR_INFO_INTR_TYPE_MASK @@ -259,9 +262,16 @@ enum vmcs_field { #define VECTORING_INFO_VALID_MASK INTR_INFO_VALID_MASK #define INTR_TYPE_EXT_INTR (0 << 8) /* external interrupt */ +#define INTR_TYPE_NMI_INTR (2 << 8) /* NMI */ #define INTR_TYPE_EXCEPTION (3 << 8) /* processor exception */ #define INTR_TYPE_SOFT_INTR (4 << 8) /* software interrupt */ +/* GUEST_INTERRUPTIBILITY_INFO flags. */ +#define GUEST_INTR_STATE_STI 0x00000001 +#define GUEST_INTR_STATE_MOV_SS 0x00000002 +#define GUEST_INTR_STATE_SMI 0x00000004 +#define GUEST_INTR_STATE_NMI 0x00000008 + /* * Exit Qualifications for MOV for Control Register Access */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19974dde6567..05b54976c891 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -72,6 +72,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmio_exits", VCPU_STAT(mmio_exits) }, { "signal_exits", VCPU_STAT(signal_exits) }, { "irq_window", VCPU_STAT(irq_window_exits) }, + { "nmi_window", VCPU_STAT(nmi_window_exits) }, { "halt_exits", VCPU_STAT(halt_exits) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "hypercalls", VCPU_STAT(hypercalls) }, -- cgit v1.2.2 From 9ba075a664dff836fd6fb93f90fcc827f7683d91 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 26 May 2008 20:06:35 +0300 Subject: KVM: MTRR support Add emulation for the memory type range registers, needed by VMware esx 3.5, and by pci device assignment. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 48 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 05b54976c891..5f67a7c54e82 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -611,6 +611,38 @@ static void kvm_write_guest_time(struct kvm_vcpu *v) mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); } +static bool msr_mtrr_valid(unsigned msr) +{ + switch (msr) { + case 0x200 ... 0x200 + 2 * KVM_NR_VAR_MTRR - 1: + case MSR_MTRRfix64K_00000: + case MSR_MTRRfix16K_80000: + case MSR_MTRRfix16K_A0000: + case MSR_MTRRfix4K_C0000: + case MSR_MTRRfix4K_C8000: + case MSR_MTRRfix4K_D0000: + case MSR_MTRRfix4K_D8000: + case MSR_MTRRfix4K_E0000: + case MSR_MTRRfix4K_E8000: + case MSR_MTRRfix4K_F0000: + case MSR_MTRRfix4K_F8000: + case MSR_MTRRdefType: + case MSR_IA32_CR_PAT: + return true; + case 0x2f8: + return true; + } + return false; +} + +static int set_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 data) +{ + if (!msr_mtrr_valid(msr)) + return 1; + + vcpu->arch.mtrr[msr - 0x200] = data; + return 0; +} int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) { @@ -632,8 +664,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: - case 0x200 ... 0x2ff: /* MTRRs */ break; + case 0x200 ... 0x2ff: + return set_msr_mtrr(vcpu, msr, data); case MSR_IA32_APICBASE: kvm_set_apic_base(vcpu, data); break; @@ -691,6 +724,15 @@ int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) return kvm_x86_ops->get_msr(vcpu, msr_index, pdata); } +static int get_msr_mtrr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) +{ + if (!msr_mtrr_valid(msr)) + return 1; + + *pdata = vcpu->arch.mtrr[msr - 0x200]; + return 0; +} + int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) { u64 data; @@ -712,11 +754,13 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: - /* MTRR registers */ - case 0xfe: - case 0x200 ... 0x2ff: data = 0; break; + case MSR_MTRRcap: + data = 0x500 | KVM_NR_VAR_MTRR; + break; + case 0x200 ... 0x2ff: + return get_msr_mtrr(vcpu, msr, pdata); case 0xcd: /* fsb frequency */ data = 3; break; -- cgit v1.2.2 From 3e6e0aab1ba1e8b354ce01f5659336f9aee69437 Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 10:18:46 +0200 Subject: KVM: Prefixes segment functions that will be exported with "kvm_" Prefixes functions that will be exported with kvm_. We also prefixed set_segment() even if it still static to be coherent. signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 78 +++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5f67a7c54e82..4c94fad7f01e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3100,8 +3100,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) return 0; } -static void get_segment(struct kvm_vcpu *vcpu, - struct kvm_segment *var, int seg) +void kvm_get_segment(struct kvm_vcpu *vcpu, + struct kvm_segment *var, int seg) { kvm_x86_ops->get_segment(vcpu, var, seg); } @@ -3110,7 +3110,7 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { struct kvm_segment cs; - get_segment(vcpu, &cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &cs, VCPU_SREG_CS); *db = cs.db; *l = cs.l; } @@ -3124,15 +3124,15 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - get_segment(vcpu, &sregs->es, VCPU_SREG_ES); - get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); kvm_x86_ops->get_idt(vcpu, &dt); sregs->idt.limit = dt.limit; @@ -3184,7 +3184,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -static void set_segment(struct kvm_vcpu *vcpu, +static void kvm_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { kvm_x86_ops->set_segment(vcpu, var, seg); @@ -3221,7 +3221,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, if (selector & 1 << 2) { struct kvm_segment kvm_seg; - get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); + kvm_get_segment(vcpu, &kvm_seg, VCPU_SREG_LDTR); if (kvm_seg.unusable) dtable->limit = 0; @@ -3327,7 +3327,7 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment kvm_seg; - get_segment(vcpu, &kvm_seg, seg); + kvm_get_segment(vcpu, &kvm_seg, seg); return kvm_seg.selector; } @@ -3343,8 +3343,8 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } -static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, - int type_bits, int seg) +int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, + int type_bits, int seg) { struct kvm_segment kvm_seg; @@ -3357,7 +3357,7 @@ static int load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (!kvm_seg.s) kvm_seg.unusable = 1; - set_segment(vcpu, &kvm_seg, seg); + kvm_set_segment(vcpu, &kvm_seg, seg); return 0; } @@ -3403,25 +3403,25 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; - if (load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) + if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) return 1; - if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) return 1; - if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) return 1; - if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) return 1; - if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) return 1; - if (load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) + if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) return 1; - if (load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) + if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) return 1; return 0; } @@ -3462,19 +3462,19 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; - if (load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) + if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) return 1; - if (load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) + if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) return 1; - if (load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) + if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) return 1; - if (load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) + if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) return 1; - if (load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) + if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) return 1; return 0; } @@ -3532,7 +3532,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) struct desc_struct nseg_desc; int ret = 0; - get_segment(vcpu, &tr_seg, VCPU_SREG_TR); + kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR); if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) goto out; @@ -3591,7 +3591,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); tr_seg.type = 11; - set_segment(vcpu, &tr_seg, VCPU_SREG_TR); + kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); out: kvm_x86_ops->decache_regs(vcpu); return ret; @@ -3658,15 +3658,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, } } - set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); - set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); - set_segment(vcpu, &sregs->es, VCPU_SREG_ES); - set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); - set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); - set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); + kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); + kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS); + kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES); + kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS); + kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS); + kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS); - set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); - set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); + kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); vcpu_put(vcpu); -- cgit v1.2.2 From 89c696383d6eb493351a89d450d8ad7a55cbe1da Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 10:22:20 +0200 Subject: KVM: x86 emulator: Update c->dst.bytes in decode instruction Update c->dst.bytes in decode instruction instead of instruction itself. It's needed because if c->dst.bytes is equal to 0, the instruction is not emulated. Signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 932f216d890c..a928aa6cdad2 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1049,6 +1049,7 @@ done_prefixes: break; case DstMem: if ((c->d & ModRM) && c->modrm_mod == 3) { + c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.type = OP_REG; c->dst.val = c->dst.orig_val = c->modrm_val; c->dst.ptr = c->modrm_ptr; -- cgit v1.2.2 From 954cd36f7613ac6d084abe33114dd45a8e0dbe92 Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 10:19:08 +0200 Subject: KVM: x86 emulator: add support for jmp far 0xea Add support for jmp far (opcode 0xea) instruction. Signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index a928aa6cdad2..48b62cc3bd0c 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -168,7 +168,8 @@ static u16 opcode_table[256] = { /* 0xE0 - 0xE7 */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE8 - 0xEF */ - ImplicitOps | Stack, SrcImm|ImplicitOps, 0, SrcImmByte|ImplicitOps, + ImplicitOps | Stack, SrcImm | ImplicitOps, + ImplicitOps, SrcImmByte | ImplicitOps, 0, 0, 0, 0, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, @@ -1661,7 +1662,33 @@ special_insn: break; } case 0xe9: /* jmp rel */ - case 0xeb: /* jmp rel short */ + goto jmp; + case 0xea: /* jmp far */ { + uint32_t eip; + uint16_t sel; + + switch (c->op_bytes) { + case 2: + eip = insn_fetch(u16, 2, c->eip); + break; + case 4: + eip = insn_fetch(u32, 4, c->eip); + break; + default: + DPRINTF("jmp far: Invalid op_bytes\n"); + goto cannot_emulate; + } + sel = insn_fetch(u16, 2, c->eip); + if (kvm_load_segment_descriptor(ctxt->vcpu, sel, 9, VCPU_SREG_CS) < 0) { + DPRINTF("jmp far: Failed to load CS descriptor\n"); + goto cannot_emulate; + } + + c->eip = eip; + break; + } + case 0xeb: + jmp: /* jmp rel short */ jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; -- cgit v1.2.2 From 615ac125618dc7b40ecb418e8b353d31ccf0e518 Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 10:19:16 +0200 Subject: KVM: x86 emulator: adds support to mov r,imm (opcode 0xb8) instruction Add support to mov r, imm (0xb8) instruction. Signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 48b62cc3bd0c..21d7ff6a8ecd 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -152,7 +152,8 @@ static u16 opcode_table[256] = { ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | String, ImplicitOps | String, /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, @@ -1624,6 +1625,8 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; + case 0xb8: /* mov r, imm */ + goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); break; -- cgit v1.2.2 From 4257198ae2c36e030a0947fef661c8de973778be Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 14:49:15 +0200 Subject: KVM: x86 emulator: Add support for mov seg, r (0x8e) instruction Add support for mov r, sreg (0x8c) instruction. [avi: drop the sreg decoding table in favor of 1:1 encoding] Signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 21d7ff6a8ecd..b049b6bf9a71 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -138,7 +138,8 @@ static u16 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, 0, Group | Group1A, + 0, ModRM | DstReg, + DstReg | SrcMem | ModRM | Mov, Group | Group1A, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, @@ -1520,6 +1521,28 @@ special_insn: case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; + case 0x8e: { /* mov seg, r/m16 */ + uint16_t sel; + int type_bits; + int err; + + sel = c->src.val; + if (c->modrm_reg <= 5) { + type_bits = (c->modrm_reg == 1) ? 9 : 1; + err = kvm_load_segment_descriptor(ctxt->vcpu, sel, + type_bits, c->modrm_reg); + } else { + printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + + if (err < 0) + goto cannot_emulate; + + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + } case 0x8f: /* pop (sole member of Grp1a) */ rc = emulate_grp1a(ctxt, ops); if (rc != 0) -- cgit v1.2.2 From 38d5bc6d50a4368be08b39b02efb9cbbe1dd60d0 Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Tue, 27 May 2008 15:13:28 +0200 Subject: KVM: x86 emulator: Add support for mov r, sreg (0x8c) instruction Add support for mov r, sreg (0x8c) instruction Signed-off-by: Guillaume Thouvenin Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index b049b6bf9a71..2a9db4d90bac 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -138,7 +138,7 @@ static u16 opcode_table[256] = { /* 0x88 - 0x8F */ ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, - 0, ModRM | DstReg, + DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, DstReg | SrcMem | ModRM | Mov, Group | Group1A, /* 0x90 - 0x9F */ 0, 0, 0, 0, 0, 0, 0, 0, @@ -1518,6 +1518,19 @@ special_insn: break; case 0x88 ... 0x8b: /* mov */ goto mov; + case 0x8c: { /* mov r/m, sreg */ + struct kvm_segment segreg; + + if (c->modrm_reg <= 5) + kvm_get_segment(ctxt->vcpu, &segreg, c->modrm_reg); + else { + printk(KERN_INFO "0x8c: Invalid segreg in modrm byte 0x%02x\n", + c->modrm); + goto cannot_emulate; + } + c->dst.val = segreg.selector; + break; + } case 0x8d: /* lea r16/r32, m */ c->dst.val = c->modrm_ea; break; -- cgit v1.2.2 From eab9f71feb1851b5b700ca12ae614b6a0a441021 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 May 2008 14:20:16 +0300 Subject: KVM: MMU: Optimize prefetch_page() Instead of reading each pte individually, read 256 bytes worth of ptes and batch process them. Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 934c7b619396..4d918220baeb 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -460,8 +460,9 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { - int i, offset = 0, r = 0; - pt_element_t pt; + int i, j, offset, r; + pt_element_t pt[256 / sizeof(pt_element_t)]; + gpa_t pte_gpa; if (sp->role.metaphysical || (PTTYPE == 32 && sp->role.level > PT_PAGE_TABLE_LEVEL)) { @@ -469,19 +470,20 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, return; } - if (PTTYPE == 32) + pte_gpa = gfn_to_gpa(sp->gfn); + if (PTTYPE == 32) { offset = sp->role.quadrant << PT64_LEVEL_BITS; + pte_gpa += offset * sizeof(pt_element_t); + } - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { - gpa_t pte_gpa = gfn_to_gpa(sp->gfn); - pte_gpa += (i+offset) * sizeof(pt_element_t); - - r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &pt, - sizeof(pt_element_t)); - if (r || is_present_pte(pt)) - sp->spt[i] = shadow_trap_nonpresent_pte; - else - sp->spt[i] = shadow_notrap_nonpresent_pte; + for (i = 0; i < PT64_ENT_PER_PAGE; i += ARRAY_SIZE(pt)) { + r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa, pt, sizeof pt); + pte_gpa += ARRAY_SIZE(pt) * sizeof(pt_element_t); + for (j = 0; j < ARRAY_SIZE(pt); ++j) + if (r || is_present_pte(pt[j])) + sp->spt[i+j] = shadow_trap_nonpresent_pte; + else + sp->spt[i+j] = shadow_notrap_nonpresent_pte; } } -- cgit v1.2.2 From 19e43636b5af1c8b9cc8406af674835284abab0c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 May 2008 14:26:29 +0300 Subject: KVM: x86 emulator: simplify push imm8 emulation Instead of fetching the data explicitly, use SrcImmByte. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 2a9db4d90bac..4e037ea8fe64 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -121,7 +121,7 @@ static u16 opcode_table[256] = { 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, /* 0x68 - 0x6F */ - 0, 0, ImplicitOps | Mov | Stack, 0, + 0, 0, SrcImmByte | Mov | Stack, 0, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ @@ -1425,8 +1425,6 @@ special_insn: c->dst.val = (s32) c->src.val; break; case 0x6a: /* push imm8 */ - c->src.val = 0L; - c->src.val = insn_fetch(s8, 1, c->eip); emulate_push(ctxt); break; case 0x6c: /* insb */ -- cgit v1.2.2 From 91ed7a0e15c6f6ff57f5cf70feabdba56a999863 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 May 2008 14:38:38 +0300 Subject: KVM: x86 emulator: implement 'push imm' (opcode 0x68) Encountered in FC6 boot sequence, now that we don't force ss.rpl = 0 during the protected mode transition. Not really necessary, but nice to have. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 4e037ea8fe64..b90857c76569 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -121,7 +121,7 @@ static u16 opcode_table[256] = { 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 0, 0, 0, 0, /* 0x68 - 0x6F */ - 0, 0, SrcImmByte | Mov | Stack, 0, + SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* insb, insw/insd */ SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* outsb, outsw/outsd */ /* 0x70 - 0x77 */ @@ -1424,6 +1424,7 @@ special_insn: goto cannot_emulate; c->dst.val = (s32) c->src.val; break; + case 0x68: /* push imm */ case 0x6a: /* push imm8 */ emulate_push(ctxt); break; -- cgit v1.2.2 From d761a501cf9cd4fa08ff35d252ff08b8c31ce677 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 May 2008 14:55:03 +0300 Subject: KVM: MMU: Move nonpaging_prefetch_page() In preparation for next patch. No code change. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 53f1ed852ca2..62741b7c4223 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -776,6 +776,15 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } +static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + int i; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) + sp->spt[i] = shadow_trap_nonpresent_pte; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -1213,15 +1222,6 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } -static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp) -{ - int i; - - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) - sp->spt[i] = shadow_trap_nonpresent_pte; -} - static void mmu_free_roots(struct kvm_vcpu *vcpu) { int i; -- cgit v1.2.2 From 131d82791b628d4aeafd94ddc74a9b68f3d15a83 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 29 May 2008 14:56:28 +0300 Subject: KVM: MMU: Avoid page prefetch on SVM SVM cannot benefit from page prefetching since guest page fault bypass cannot by made to work there. Avoid accessing the guest page table in this case. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 62741b7c4223..5ebb2788bd73 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -850,7 +850,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, bucket); if (!metaphysical) rmap_write_protect(vcpu->kvm, gfn); - vcpu->arch.mmu.prefetch_page(vcpu, sp); + if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) + vcpu->arch.mmu.prefetch_page(vcpu, sp); + else + nonpaging_prefetch_page(vcpu, sp); return sp; } -- cgit v1.2.2 From 92760499d01ef91518119908eb9b8798b6c9bd3f Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Fri, 30 May 2008 16:05:53 +0200 Subject: KVM: kvm_io_device: extend in_range() to manage len and write attribute Modify member in_range() of structure kvm_io_device to pass length and the type of the I/O (write or read). This modification allows to use kvm_io_device with coalesced MMIO. Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 6 ++++-- arch/x86/kvm/i8259.c | 3 ++- arch/x86/kvm/lapic.c | 3 ++- arch/x86/kvm/x86.c | 28 +++++++++++++++++----------- 4 files changed, 25 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 60074dc66bd7..9e3391e9a1b7 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -460,7 +460,8 @@ static void pit_ioport_read(struct kvm_io_device *this, mutex_unlock(&pit_state->lock); } -static int pit_in_range(struct kvm_io_device *this, gpa_t addr) +static int pit_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { return ((addr >= KVM_PIT_BASE_ADDRESS) && (addr < KVM_PIT_BASE_ADDRESS + KVM_PIT_MEM_LENGTH)); @@ -501,7 +502,8 @@ static void speaker_ioport_read(struct kvm_io_device *this, mutex_unlock(&pit_state->lock); } -static int speaker_in_range(struct kvm_io_device *this, gpa_t addr) +static int speaker_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { return (addr == KVM_SPEAKER_BASE_ADDRESS); } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index ab29cf2def47..5857f59ad4aa 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -346,7 +346,8 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(struct kvm_io_device *this, gpa_t addr) +static int picdev_in_range(struct kvm_io_device *this, gpa_t addr, + int len, int is_write) { switch (addr) { case 0x20: diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index e48d19394031..180ba7316da5 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -785,7 +785,8 @@ static void apic_mmio_write(struct kvm_io_device *this, } -static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr) +static int apic_mmio_range(struct kvm_io_device *this, gpa_t addr, + int len, int size) { struct kvm_lapic *apic = (struct kvm_lapic *)this->private; int ret = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4c94fad7f01e..ab3f5552d694 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1797,13 +1797,14 @@ static void kvm_init_msr_list(void) * Only apic need an MMIO device hook, so shortcut now.. */ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { struct kvm_io_device *dev; if (vcpu->arch.apic) { dev = &vcpu->arch.apic->dev; - if (dev->in_range(dev, addr)) + if (dev->in_range(dev, addr, len, is_write)) return dev; } return NULL; @@ -1811,13 +1812,15 @@ static struct kvm_io_device *vcpu_find_pervcpu_dev(struct kvm_vcpu *vcpu, static struct kvm_io_device *vcpu_find_mmio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { struct kvm_io_device *dev; - dev = vcpu_find_pervcpu_dev(vcpu, addr); + dev = vcpu_find_pervcpu_dev(vcpu, addr, len, is_write); if (dev == NULL) - dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr); + dev = kvm_io_bus_find_dev(&vcpu->kvm->mmio_bus, addr, len, + is_write); return dev; } @@ -1885,7 +1888,7 @@ mmio: * Is this MMIO handled locally? */ mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 0); if (mmio_dev) { kvm_iodevice_read(mmio_dev, gpa, bytes, val); mutex_unlock(&vcpu->kvm->lock); @@ -1940,7 +1943,7 @@ mmio: * Is this MMIO handled locally? */ mutex_lock(&vcpu->kvm->lock); - mmio_dev = vcpu_find_mmio_dev(vcpu, gpa); + mmio_dev = vcpu_find_mmio_dev(vcpu, gpa, bytes, 1); if (mmio_dev) { kvm_iodevice_write(mmio_dev, gpa, bytes, val); mutex_unlock(&vcpu->kvm->lock); @@ -2317,9 +2320,10 @@ static void pio_string_write(struct kvm_io_device *pio_dev, } static struct kvm_io_device *vcpu_find_pio_dev(struct kvm_vcpu *vcpu, - gpa_t addr) + gpa_t addr, int len, + int is_write) { - return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr); + return kvm_io_bus_find_dev(&vcpu->kvm->pio_bus, addr, len, is_write); } int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, @@ -2351,7 +2355,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, kvm_x86_ops->skip_emulated_instruction(vcpu); - pio_dev = vcpu_find_pio_dev(vcpu, port); + pio_dev = vcpu_find_pio_dev(vcpu, port, size, !in); if (pio_dev) { kernel_pio(pio_dev, vcpu, vcpu->arch.pio_data); complete_pio(vcpu); @@ -2433,7 +2437,9 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, } } - pio_dev = vcpu_find_pio_dev(vcpu, port); + pio_dev = vcpu_find_pio_dev(vcpu, port, + vcpu->arch.pio.cur_count, + !vcpu->arch.pio.in); if (!vcpu->arch.pio.in) { /* string PIO write */ ret = pio_copy_data(vcpu); -- cgit v1.2.2 From 542472b53ea9e0add0ba23976018210191d84754 Mon Sep 17 00:00:00 2001 From: Laurent Vivier Date: Fri, 30 May 2008 16:05:55 +0200 Subject: KVM: Add coalesced MMIO support (x86 part) This patch enables coalesced MMIO for x86 architecture. It defines KVM_MMIO_PAGE_OFFSET and KVM_CAP_COALESCED_MMIO. It enables the compilation of coalesced_mmio.c. Signed-off-by: Laurent Vivier Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 3 ++- arch/x86/kvm/x86.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index c97d35c218db..d0e940bb6f40 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -2,7 +2,8 @@ # Makefile for Kernel-based Virtual Machine module # -common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o) +common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ + coalesced_mmio.o) ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ab3f5552d694..d731d4fff1ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -885,6 +885,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_MP_STATE: r = 1; break; + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; + break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; -- cgit v1.2.2 From 622395a9e63bf87a16faecf555ed02375cbae5b7 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 11 Jun 2008 19:52:53 -0300 Subject: KVM: only abort guest entry if timer count goes from 0->1 Only abort guest entry if the timer count went from 0->1, since for 1->2 or larger the bit will either be set already or a timer irq will have been injected. Using atomic_inc_and_test() for it also introduces an SMP barrier to the LAPIC version (thought it was unecessary because of timer migration, but guest can be scheduled to a different pCPU between exit and kvm_vcpu_block(), so there is the possibility for a race). Noticed by Avi. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 11 ++++------- arch/x86/kvm/lapic.c | 4 ++-- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 9e3391e9a1b7..c0f7872a9124 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -198,14 +198,11 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) struct kvm_vcpu *vcpu0 = ps->pit->kvm->vcpus[0]; struct kvm_kpit_timer *pt = &ps->pit_timer; - atomic_inc(&pt->pending); - smp_mb__after_atomic_inc(); - if (vcpu0) { + if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - if (waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; - wake_up_interruptible(&vcpu0->wq); - } + if (vcpu0 && waitqueue_active(&vcpu0->wq)) { + vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + wake_up_interruptible(&vcpu0->wq); } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 180ba7316da5..73f43de69f67 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -945,8 +945,8 @@ static int __apic_timer_fn(struct kvm_lapic *apic) int result = 0; wait_queue_head_t *q = &apic->vcpu->wq; - atomic_inc(&apic->timer.pending); - set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); + if(!atomic_inc_and_test(&apic->timer.pending)) + set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); if (waitqueue_active(q)) { apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; wake_up_interruptible(q); -- cgit v1.2.2 From 25be46080f1a446cb2bda3daadbd22a5682b955e Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 10 Jun 2008 10:46:53 -0300 Subject: KVM: Do not calculate linear rip in emulation failure report If we're not gonna do anything (case in which failure is already reported), we do not need to even bother with calculating the linear rip. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d731d4fff1ae..5d21bb69d88c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2081,11 +2081,11 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) unsigned long rip = vcpu->arch.rip; unsigned long rip_linear; - rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); - if (reported) return; + rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); + emulator_read_std(rip_linear, (void *)opcodes, 4, vcpu); printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", -- cgit v1.2.2 From f76c710d759250a43976bcfcab6af6ebb94b7dc2 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 13 Jun 2008 22:45:42 +0300 Subject: KVM: Use printk_rlimit() instead of reporting emulation failures just once Emulation failure reports are useful, so allow more than one per the lifetime of the module. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5d21bb69d88c..d1db5aa5c7f4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2076,12 +2076,11 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { - static int reported; u8 opcodes[4]; unsigned long rip = vcpu->arch.rip; unsigned long rip_linear; - if (reported) + if (!printk_ratelimit()) return; rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); @@ -2090,7 +2089,6 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); - reported = 1; } EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); -- cgit v1.2.2 From b13354f8f092884fa8d79472404de4907b25d579 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 15 Jun 2008 19:37:38 +0300 Subject: KVM: x86 emulator: emulate nop and xchg reg, acc (opcodes 0x90 - 0x97) Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index b90857c76569..28082913919e 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -140,8 +140,9 @@ static u16 opcode_table[256] = { ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, ModRM | DstReg, DstReg | SrcMem | ModRM | Mov, Group | Group1A, - /* 0x90 - 0x9F */ - 0, 0, 0, 0, 0, 0, 0, 0, + /* 0x90 - 0x97 */ + DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, + /* 0x98 - 0x9F */ 0, 0, 0, 0, ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, /* 0xA0 - 0xA7 */ ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, @@ -1493,6 +1494,7 @@ special_insn: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); break; case 0x86 ... 0x87: /* xchg */ + xchg: /* Write back the register source. */ switch (c->dst.bytes) { case 1: @@ -1560,6 +1562,17 @@ special_insn: if (rc != 0) goto done; break; + case 0x90: /* nop / xchg r8,rax */ + if (!(c->rex_prefix & 1)) { /* nop */ + c->dst.type = OP_NONE; + break; + } + case 0x91 ... 0x97: /* xchg reg,rax */ + c->src.type = c->dst.type = OP_REG; + c->src.bytes = c->dst.bytes = c->op_bytes; + c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX]; + c->src.val = *(c->src.ptr); + goto xchg; case 0x9c: /* pushf */ c->src.val = (unsigned long) ctxt->eflags; emulate_push(ctxt); -- cgit v1.2.2 From 8684c0af0b2bab770c257e2a04e1546eed35fa56 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Jun 2008 21:13:41 -0700 Subject: KVM: x86 emulator: handle undecoded rex.b with r/m = 5 in certain cases x86_64 does not decode rex.b in certain cases, where the r/m field = 5. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 28082913919e..3721cfddc973 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -750,6 +750,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (base_reg) { case 5: + case 13: if (c->modrm_mod != 0) c->modrm_ea += c->regs[base_reg]; else @@ -767,6 +768,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } break; case 5: + case 13: if (c->modrm_mod != 0) c->modrm_ea += c->regs[c->modrm_rm]; else if (ctxt->mode == X86EMUL_MODE_PROT64) -- cgit v1.2.2 From dc71d0f1620790ec8e54101ca37e7b31e31208a8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Jun 2008 21:23:17 -0700 Subject: KVM: x86 emulator: simplify sib decoding Instead of using sparse switches, use simpler if/else sequences. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 3721cfddc973..ca7ab2469a4a 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -748,24 +748,12 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, base_reg |= sib & 7; scale = sib >> 6; - switch (base_reg) { - case 5: - case 13: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[base_reg]; - else - c->modrm_ea += - insn_fetch(s32, 4, c->eip); - break; - default: + if ((base_reg & 7) == 5 && c->modrm_mod == 0) + c->modrm_ea += insn_fetch(s32, 4, c->eip); + else c->modrm_ea += c->regs[base_reg]; - } - switch (index_reg) { - case 4: - break; - default: + if (index_reg != 4) c->modrm_ea += c->regs[index_reg] << scale; - } break; case 5: case 13: -- cgit v1.2.2 From 84411d85dacdb6665578608c6a70fc8b819761a8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Jun 2008 21:53:26 -0700 Subject: KVM: x86 emulator: simplify r/m decoding Consolidate the duplicated code when not in any special case. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ca7ab2469a4a..c3a823174f3e 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -740,9 +740,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_ea = (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. */ - switch (c->modrm_rm) { - case 4: - case 12: + if ((c->modrm_rm & 7) == 4) { sib = insn_fetch(u8, 1, c->eip); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; @@ -754,18 +752,11 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_ea += c->regs[base_reg]; if (index_reg != 4) c->modrm_ea += c->regs[index_reg] << scale; - break; - case 5: - case 13: - if (c->modrm_mod != 0) - c->modrm_ea += c->regs[c->modrm_rm]; - else if (ctxt->mode == X86EMUL_MODE_PROT64) + } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { + if (ctxt->mode == X86EMUL_MODE_PROT64) rip_relative = 1; - break; - default: + } else c->modrm_ea += c->regs[c->modrm_rm]; - break; - } switch (c->modrm_mod) { case 0: if (c->modrm_rm == 5) -- cgit v1.2.2 From f5b4edcd52e78556800f90d08bfc9126416ac82f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Jun 2008 22:09:11 -0700 Subject: KVM: x86 emulator: simplify rip relative decoding rip relative decoding is relative to the instruction pointer of the next instruction; by moving address adjustment until after decoding is complete, we remove the need to determine the instruction size. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index c3a823174f3e..20b604489c3c 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -664,7 +664,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; u8 sib; - int index_reg = 0, base_reg = 0, scale, rip_relative = 0; + int index_reg = 0, base_reg = 0, scale; int rc = 0; if (c->rex_prefix) { @@ -754,7 +754,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, c->modrm_ea += c->regs[index_reg] << scale; } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { if (ctxt->mode == X86EMUL_MODE_PROT64) - rip_relative = 1; + c->rip_relative = 1; } else c->modrm_ea += c->regs[c->modrm_rm]; switch (c->modrm_mod) { @@ -770,22 +770,6 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, break; } } - if (rip_relative) { - c->modrm_ea += c->eip; - switch (c->d & SrcMask) { - case SrcImmByte: - c->modrm_ea += 1; - break; - case SrcImm: - if (c->d & ByteOp) - c->modrm_ea += 1; - else - if (c->op_bytes == 8) - c->modrm_ea += 4; - else - c->modrm_ea += c->op_bytes; - } - } done: return rc; } @@ -1044,6 +1028,9 @@ done_prefixes: break; } + if (c->rip_relative) + c->modrm_ea += c->eip; + done: return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; } -- cgit v1.2.2 From 0adc8675d645940139d12477e5e05b8a0a7a1117 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 15 Jun 2008 22:45:54 -0700 Subject: KVM: x86 emulator: avoid segment base adjust for lea Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 20b604489c3c..38926b7da64a 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -940,7 +940,7 @@ done_prefixes: c->override_base != &ctxt->gs_base) c->override_base = NULL; - if (c->override_base) + if (c->override_base && !(!c->twobyte && c->b == 0x8d)) c->modrm_ea += *c->override_base; if (c->ad_bytes != 8) -- cgit v1.2.2 From 7a5b56dfd3a682a51fc84682290d5147872a8e99 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Jun 2008 16:22:51 +0300 Subject: KVM: x86 emulator: lazily evaluate segment registers Instead of prefetching all segment bases before emulation, read them at the last moment. Since most of them are unneeded, we save some cycles on Intel machines where this is a bit expensive. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 ---------- arch/x86/kvm/x86_emulate.c | 96 +++++++++++++++++++++++++++------------------- 2 files changed, 57 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d1db5aa5c7f4..f726ba79fd3a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2126,27 +2126,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; - if (vcpu->arch.emulate_ctxt.mode == X86EMUL_MODE_PROT64) { - vcpu->arch.emulate_ctxt.cs_base = 0; - vcpu->arch.emulate_ctxt.ds_base = 0; - vcpu->arch.emulate_ctxt.es_base = 0; - vcpu->arch.emulate_ctxt.ss_base = 0; - } else { - vcpu->arch.emulate_ctxt.cs_base = - get_segment_base(vcpu, VCPU_SREG_CS); - vcpu->arch.emulate_ctxt.ds_base = - get_segment_base(vcpu, VCPU_SREG_DS); - vcpu->arch.emulate_ctxt.es_base = - get_segment_base(vcpu, VCPU_SREG_ES); - vcpu->arch.emulate_ctxt.ss_base = - get_segment_base(vcpu, VCPU_SREG_SS); - } - - vcpu->arch.emulate_ctxt.gs_base = - get_segment_base(vcpu, VCPU_SREG_GS); - vcpu->arch.emulate_ctxt.fs_base = - get_segment_base(vcpu, VCPU_SREG_FS); - r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); /* Reject the instructions other than VMCALL/VMMCALL when diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 38926b7da64a..18ca25c2d4a4 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -522,6 +522,39 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static void set_seg_override(struct decode_cache *c, int seg) +{ + c->has_seg_override = true; + c->seg_override = seg; +} + +static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, int seg) +{ + if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) + return 0; + + return kvm_x86_ops->get_segment_base(ctxt->vcpu, seg); +} + +static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, + struct decode_cache *c) +{ + if (!c->has_seg_override) + return 0; + + return seg_base(ctxt, c->seg_override); +} + +static unsigned long es_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_ES); +} + +static unsigned long ss_base(struct x86_emulate_ctxt *ctxt) +{ + return seg_base(ctxt, VCPU_SREG_SS); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long linear, u8 *dest) @@ -735,8 +768,8 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } if (c->modrm_rm == 2 || c->modrm_rm == 3 || (c->modrm_rm == 6 && c->modrm_mod != 0)) - if (!c->override_base) - c->override_base = &ctxt->ss_base; + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_SS); c->modrm_ea = (u16)c->modrm_ea; } else { /* 32/64-bit ModR/M decode. */ @@ -807,6 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) memset(c, 0, sizeof(struct decode_cache)); c->eip = ctxt->vcpu->arch.rip; + ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); switch (mode) { @@ -845,23 +879,15 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* switch between 2/4 bytes */ c->ad_bytes = def_ad_bytes ^ 6; break; + case 0x26: /* ES override */ case 0x2e: /* CS override */ - c->override_base = &ctxt->cs_base; - break; + case 0x36: /* SS override */ case 0x3e: /* DS override */ - c->override_base = &ctxt->ds_base; - break; - case 0x26: /* ES override */ - c->override_base = &ctxt->es_base; + set_seg_override(c, (c->b >> 3) & 3); break; case 0x64: /* FS override */ - c->override_base = &ctxt->fs_base; - break; case 0x65: /* GS override */ - c->override_base = &ctxt->gs_base; - break; - case 0x36: /* SS override */ - c->override_base = &ctxt->ss_base; + set_seg_override(c, c->b & 7); break; case 0x40 ... 0x4f: /* REX */ if (mode != X86EMUL_MODE_PROT64) @@ -933,15 +959,11 @@ done_prefixes: if (rc) goto done; - if (!c->override_base) - c->override_base = &ctxt->ds_base; - if (mode == X86EMUL_MODE_PROT64 && - c->override_base != &ctxt->fs_base && - c->override_base != &ctxt->gs_base) - c->override_base = NULL; + if (!c->has_seg_override) + set_seg_override(c, VCPU_SREG_DS); - if (c->override_base && !(!c->twobyte && c->b == 0x8d)) - c->modrm_ea += *c->override_base; + if (!(!c->twobyte && c->b == 0x8d)) + c->modrm_ea += seg_override_base(ctxt, c); if (c->ad_bytes != 8) c->modrm_ea = (u32)c->modrm_ea; @@ -1043,7 +1065,7 @@ static inline void emulate_push(struct x86_emulate_ctxt *ctxt) c->dst.bytes = c->op_bytes; c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.ptr = (void *) register_address(c, ctxt->ss_base, + c->dst.ptr = (void *) register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); } @@ -1053,7 +1075,7 @@ static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int rc; - rc = ops->read_std(register_address(c, ctxt->ss_base, + rc = ops->read_std(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), &c->dst.val, c->dst.bytes, ctxt->vcpu); if (rc != 0) @@ -1375,11 +1397,11 @@ special_insn: register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); c->dst.ptr = (void *) register_address( - c, ctxt->ss_base, c->regs[VCPU_REGS_RSP]); + c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]); break; case 0x58 ... 0x5f: /* pop reg */ pop_instruction: - if ((rc = ops->read_std(register_address(c, ctxt->ss_base, + if ((rc = ops->read_std(register_address(c, ss_base(ctxt), c->regs[VCPU_REGS_RSP]), c->dst.ptr, c->op_bytes, ctxt->vcpu)) != 0) goto done; @@ -1405,7 +1427,7 @@ special_insn: c->rep_prefix ? address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(c, ctxt->es_base, + register_address(c, es_base(ctxt), c->regs[VCPU_REGS_RDI]), c->rep_prefix, c->regs[VCPU_REGS_RDX]) == 0) { @@ -1421,9 +1443,8 @@ special_insn: c->rep_prefix ? address_mask(c, c->regs[VCPU_REGS_RCX]) : 1, (ctxt->eflags & EFLG_DF), - register_address(c, c->override_base ? - *c->override_base : - ctxt->ds_base, + register_address(c, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), c->rep_prefix, c->regs[VCPU_REGS_RDX]) == 0) { @@ -1559,11 +1580,10 @@ special_insn: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated(register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), &c->dst.val, c->dst.bytes, ctxt->vcpu)) != 0) @@ -1579,8 +1599,7 @@ special_insn: c->src.type = OP_NONE; /* Disable writeback. */ c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.ptr = (unsigned long *)register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]); if ((rc = ops->read_emulated((unsigned long)c->src.ptr, &c->src.val, @@ -1591,7 +1610,7 @@ special_insn: c->dst.type = OP_NONE; /* Disable writeback. */ c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, &c->dst.val, @@ -1615,7 +1634,7 @@ special_insn: c->dst.type = OP_MEM; c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)register_address(c, - ctxt->es_base, + es_base(ctxt), c->regs[VCPU_REGS_RDI]); c->dst.val = c->regs[VCPU_REGS_RAX]; register_address_increment(c, &c->regs[VCPU_REGS_RDI], @@ -1627,8 +1646,7 @@ special_insn: c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; if ((rc = ops->read_emulated(register_address(c, - c->override_base ? *c->override_base : - ctxt->ds_base, + seg_override_base(ctxt, c), c->regs[VCPU_REGS_RSI]), &c->dst.val, c->dst.bytes, -- cgit v1.2.2 From 6ada8cca79cb971f5da7d1756f4f9292e3ef1e03 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Jun 2008 16:45:24 +0300 Subject: KVM: MMU: When debug is enabled, make it a run-time parameter Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5ebb2788bd73..5994645dcee0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -66,7 +66,8 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {} #endif #if defined(MMU_DEBUG) || defined(AUDIT) -static int dbg = 1; +static int dbg = 0; +module_param(dbg, bool, 0644); #endif #ifndef MMU_DEBUG -- cgit v1.2.2 From db475c39eca0f2e44953d96e768d7ce808ab85bd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 22 Jun 2008 16:46:22 +0300 Subject: KVM: MMU: Fix printk format Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5994645dcee0..1fd8e3b58cc0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1116,7 +1116,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, mark_page_dirty(vcpu->kvm, gfn); pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %d (%llx) addr %llx\n", + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); set_shadow_pte(shadow_pte, spte); -- cgit v1.2.2 From 65267ea1b3e768dc54b63cd7fad520d89c27d350 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 18 Jun 2008 14:43:38 +0800 Subject: KVM: VMX: Fix a wrong usage of vmcs_config The function ept_update_paging_mode_cr0() write to CPU_BASED_VM_EXEC_CONTROL based on vmcs_config.cpu_based_exec_ctrl. That's wrong because the variable may not consistent with the content in the CPU_BASE_VM_EXEC_CONTROL MSR. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1bb994657208..6a3a4038f3b9 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1441,7 +1441,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, if (!(cr0 & X86_CR0_PG)) { /* From paging/starting to nonpaging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_config.cpu_based_exec_ctrl | + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) | (CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; @@ -1451,7 +1451,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, } else if (!is_paging(vcpu)) { /* From nonpaging to paging */ vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, - vmcs_config.cpu_based_exec_ctrl & + vmcs_read32(CPU_BASED_VM_EXEC_CONTROL) & ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING)); vcpu->arch.cr0 = cr0; -- cgit v1.2.2 From efa67e0d1f51842393606034051d805ab9948abd Mon Sep 17 00:00:00 2001 From: Chris Lalancette Date: Fri, 20 Jun 2008 09:51:30 +0200 Subject: KVM: VMX: Fake emulate Intel perfctr MSRs Older linux guests (in this case, 2.6.9) can attempt to access the performance counter MSRs without a fixup section, and injecting a GPF kills the guest. Work around by allowing the guest to write those MSRs. Tested by me on RHEL-4 i386 and x86_64 guests, as well as F-9 guests. Signed-off-by: Chris Lalancette Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6a3a4038f3b9..d493a97e7887 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -920,6 +920,18 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; case MSR_IA32_TIME_STAMP_COUNTER: guest_write_tsc(data); + break; + case MSR_P6_PERFCTR0: + case MSR_P6_PERFCTR1: + case MSR_P6_EVNTSEL0: + case MSR_P6_EVNTSEL1: + /* + * Just discard all writes to the performance counters; this + * should keep both older linux and windows 64-bit guests + * happy + */ + pr_unimpl(vcpu, "unimplemented perfctr wrmsr: 0x%x data 0x%llx\n", msr_index, data); + break; default: vmx_load_host_state(vmx); -- cgit v1.2.2 From f8b78fa3d406f3a2dc038e2b47749013a9295994 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 23 Jun 2008 12:04:25 -0300 Subject: KVM: move slots_lock acquision down to vapic_exit There is no need to grab slots_lock if the vapic_page will not be touched. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f726ba79fd3a..55906e4c4676 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2787,8 +2787,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; + down_read(&vcpu->kvm->slots_lock); kvm_release_page_dirty(apic->vapic_page); mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); + up_read(&vcpu->kvm->slots_lock); } static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -2944,9 +2946,7 @@ out: post_kvm_run_save(vcpu, kvm_run); - down_read(&vcpu->kvm->slots_lock); vapic_exit(vcpu); - up_read(&vcpu->kvm->slots_lock); return r; } -- cgit v1.2.2 From 0da1db75a2feca54564add30828bab658982481c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 2 Jul 2008 16:02:11 +0200 Subject: KVM: SVM: fix suspend/resume support On suspend the svm_hardware_disable function is called which frees all svm_data variables. On resume they are not re-allocated. This patch removes the deallocation of svm_data from the hardware_disable function to the hardware_unsetup function which is not called on suspend. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 238e8f3afaf4..858e29702232 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -272,19 +272,11 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { - struct svm_cpu_data *svm_data - = per_cpu(svm_data, raw_smp_processor_id()); - - if (svm_data) { - uint64_t efer; + uint64_t efer; - wrmsrl(MSR_VM_HSAVE_PA, 0); - rdmsrl(MSR_EFER, efer); - wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); - per_cpu(svm_data, raw_smp_processor_id()) = NULL; - __free_page(svm_data->save_area); - kfree(svm_data); - } + wrmsrl(MSR_VM_HSAVE_PA, 0); + rdmsrl(MSR_EFER, efer); + wrmsrl(MSR_EFER, efer & ~MSR_EFER_SVME_MASK); } static void svm_hardware_enable(void *garbage) @@ -323,6 +315,19 @@ static void svm_hardware_enable(void *garbage) page_to_pfn(svm_data->save_area) << PAGE_SHIFT); } +static void svm_cpu_uninit(int cpu) +{ + struct svm_cpu_data *svm_data + = per_cpu(svm_data, raw_smp_processor_id()); + + if (!svm_data) + return; + + per_cpu(svm_data, raw_smp_processor_id()) = NULL; + __free_page(svm_data->save_area); + kfree(svm_data); +} + static int svm_cpu_init(int cpu) { struct svm_cpu_data *svm_data; @@ -460,6 +465,11 @@ err: static __exit void svm_hardware_unsetup(void) { + int cpu; + + for_each_online_cpu(cpu) + svm_cpu_uninit(cpu); + __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); iopm_base = 0; } -- cgit v1.2.2 From 7e37c2998a5a0b00134f6227167694b710f57ac0 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Tue, 1 Jul 2008 01:19:19 +0300 Subject: x86: KVM guest: make kvm_smp_prepare_boot_cpu() static This patch makes the needlessly global kvm_smp_prepare_boot_cpu() static. Signed-off-by: Adrian Bunk Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 87edf1ceb1df..d02def06ca91 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -113,7 +113,7 @@ static void kvm_setup_secondary_clock(void) #endif #ifdef CONFIG_SMP -void __init kvm_smp_prepare_boot_cpu(void) +static void __init kvm_smp_prepare_boot_cpu(void) { WARN_ON(kvm_register_clock("primary cpu clock")); native_smp_prepare_boot_cpu(); -- cgit v1.2.2 From 5a4c92880493945678315a6df810f7a21f55b985 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 3 Jul 2008 18:33:02 -0300 Subject: KVM: mmu_shrink: kvm_mmu_zap_page requires slots_lock to be held kvm_mmu_zap_page() needs slots lock held (rmap_remove->gfn_to_memslot, for example). Since kvm_lock spinlock is held in mmu_shrink(), do a non-blocking down_read_trylock(). Untested. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1fd8e3b58cc0..ff7cf632175b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1987,6 +1987,8 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) list_for_each_entry(kvm, &vm_list, vm_list) { int npages; + if (!down_read_trylock(&kvm->slots_lock)) + continue; spin_lock(&kvm->mmu_lock); npages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages; @@ -1999,6 +2001,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask) nr_to_scan--; spin_unlock(&kvm->mmu_lock); + up_read(&kvm->slots_lock); } if (kvm_freed) list_move_tail(&kvm_freed->vm_list, &vm_list); -- cgit v1.2.2 From 4e1096d27f3d095735c1c69c7b0a26a06a0d454e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Sun, 6 Jul 2008 19:16:51 +0800 Subject: KVM: VMX: Add ept_sync_context in flush_tlb Fix a potention issue caused by kvm_mmu_slot_remove_write_access(). The old behavior don't sync EPT TLB with modified EPT entry, which result in inconsistent content of EPT TLB and EPT table. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d493a97e7887..fff3b490976e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -91,6 +91,7 @@ static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) } static int init_rmode(struct kvm *kvm); +static u64 construct_eptp(unsigned long root_hpa); static DEFINE_PER_CPU(struct vmcs *, vmxarea); static DEFINE_PER_CPU(struct vmcs *, current_vmcs); @@ -1422,6 +1423,8 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static void vmx_flush_tlb(struct kvm_vcpu *vcpu) { vpid_sync_vcpu_all(to_vmx(vcpu)); + if (vm_need_ept()) + ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); } static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From ac9f6dc0db0b5582ebf8bb720d7c41c3d2159013 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 6 Jul 2008 15:48:31 +0300 Subject: KVM: Apply the kernel sigmask to vcpus blocked due to being uninitialized Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 55906e4c4676..89fc8565edee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2958,15 +2958,15 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vcpu_load(vcpu); + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); - vcpu_put(vcpu); - return -EAGAIN; + r = -EAGAIN; + goto out; } - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - /* re-sync apic's tpr */ if (!irqchip_in_kernel(vcpu->kvm)) kvm_set_cr8(vcpu, kvm_run->cr8); -- cgit v1.2.2 From 19fdfa0d133ae216e9d1c69a8333fe63fcf8e584 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 6 Jul 2008 16:51:26 +0300 Subject: KVM: x86 emulator: Fix HLT instruction This patch fixes issue encountered with HLT instruction under FreeDOS's HIMEM XMS Driver. The HLT instruction jumped directly to the done label and skips updating the EIP value, therefore causing the guest to spin endlessly on the same instruction. The patch changes the instruction so that it writes back the updated EIP value. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 18ca25c2d4a4..8bc63f62fbbd 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -1731,7 +1731,7 @@ special_insn: break; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; - goto done; + break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; -- cgit v1.2.2 From c65bbfa1d693d375da51f9c8aa9fb26f09fa19ed Mon Sep 17 00:00:00 2001 From: Ben-Ami Yassour Date: Sun, 6 Jul 2008 17:15:07 +0300 Subject: KVM: check injected pic irq within valid pic irqs Check that an injected pic irq is between 0 and 15. Signed-off-by: Ben-Ami Yassour Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 6 ++++-- arch/x86/kvm/irq.h | 2 ++ 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 5857f59ad4aa..c31164e8aa46 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -130,8 +130,10 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) { struct kvm_pic *s = opaque; - pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); - pic_update_irq(s); + if (irq >= 0 && irq < PIC_NUM_PINS) { + pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); + pic_update_irq(s); + } } /* diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 2a15be2275c0..7ca47cbb48bb 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -30,6 +30,8 @@ #include "ioapic.h" #include "lapic.h" +#define PIC_NUM_PINS 16 + struct kvm; struct kvm_vcpu; -- cgit v1.2.2 From d6e88aec07aa8f6c7e4024f5734ec659fd7c5a40 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 10 Jul 2008 16:53:33 +0300 Subject: KVM: Prefix some x86 low level function with kvm_, to avoid namespace issues Fixes compilation with CONFIG_VMI enabled. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 12 ++++++------ arch/x86/kvm/vmx.c | 24 ++++++++++++------------ arch/x86/kvm/x86.c | 18 +++++++++--------- 3 files changed, 27 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 858e29702232..b756e876dce3 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1710,9 +1710,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) sync_lapic_to_cr8(vcpu); save_host_msrs(vcpu); - fs_selector = read_fs(); - gs_selector = read_gs(); - ldt_selector = read_ldt(); + fs_selector = kvm_read_fs(); + gs_selector = kvm_read_gs(); + ldt_selector = kvm_read_ldt(); svm->host_cr2 = kvm_read_cr2(); svm->host_dr6 = read_dr6(); svm->host_dr7 = read_dr7(); @@ -1845,9 +1845,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) write_dr7(svm->host_dr7); kvm_write_cr2(svm->host_cr2); - load_fs(fs_selector); - load_gs(gs_selector); - load_ldt(ldt_selector); + kvm_load_fs(fs_selector); + kvm_load_gs(gs_selector); + kvm_load_ldt(ldt_selector); load_host_msrs(vcpu); reload_tss(vcpu); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fff3b490976e..0cac63701719 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -484,7 +484,7 @@ static void reload_tss(void) struct descriptor_table gdt; struct desc_struct *descs; - get_gdt(&gdt); + kvm_get_gdt(&gdt); descs = (void *)gdt.base; descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ load_TR_desc(); @@ -540,9 +540,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) * Set host fs and gs selectors. Unfortunately, 22.2.3 does not * allow segment selectors with cpl > 0 or ti == 1. */ - vmx->host_state.ldt_sel = read_ldt(); + vmx->host_state.ldt_sel = kvm_read_ldt(); vmx->host_state.gs_ldt_reload_needed = vmx->host_state.ldt_sel; - vmx->host_state.fs_sel = read_fs(); + vmx->host_state.fs_sel = kvm_read_fs(); if (!(vmx->host_state.fs_sel & 7)) { vmcs_write16(HOST_FS_SELECTOR, vmx->host_state.fs_sel); vmx->host_state.fs_reload_needed = 0; @@ -550,7 +550,7 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu) vmcs_write16(HOST_FS_SELECTOR, 0); vmx->host_state.fs_reload_needed = 1; } - vmx->host_state.gs_sel = read_gs(); + vmx->host_state.gs_sel = kvm_read_gs(); if (!(vmx->host_state.gs_sel & 7)) vmcs_write16(HOST_GS_SELECTOR, vmx->host_state.gs_sel); else { @@ -586,15 +586,15 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) ++vmx->vcpu.stat.host_state_reload; vmx->host_state.loaded = 0; if (vmx->host_state.fs_reload_needed) - load_fs(vmx->host_state.fs_sel); + kvm_load_fs(vmx->host_state.fs_sel); if (vmx->host_state.gs_ldt_reload_needed) { - load_ldt(vmx->host_state.ldt_sel); + kvm_load_ldt(vmx->host_state.ldt_sel); /* * If we have to reload gs, we must take care to * preserve our gs base. */ local_irq_save(flags); - load_gs(vmx->host_state.gs_sel); + kvm_load_gs(vmx->host_state.gs_sel); #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, vmcs_readl(HOST_GS_BASE)); #endif @@ -654,8 +654,8 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) * Linux uses per-cpu TSS and GDT, so set these when switching * processors. */ - vmcs_writel(HOST_TR_BASE, read_tr_base()); /* 22.2.4 */ - get_gdt(&dt); + vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ + kvm_get_gdt(&dt); vmcs_writel(HOST_GDTR_BASE, dt.base); /* 22.2.4 */ rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); @@ -1943,8 +1943,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_CS_SELECTOR, __KERNEL_CS); /* 22.2.4 */ vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); /* 22.2.4 */ - vmcs_write16(HOST_FS_SELECTOR, read_fs()); /* 22.2.4 */ - vmcs_write16(HOST_GS_SELECTOR, read_gs()); /* 22.2.4 */ + vmcs_write16(HOST_FS_SELECTOR, kvm_read_fs()); /* 22.2.4 */ + vmcs_write16(HOST_GS_SELECTOR, kvm_read_gs()); /* 22.2.4 */ vmcs_write16(HOST_SS_SELECTOR, __KERNEL_DS); /* 22.2.4 */ #ifdef CONFIG_X86_64 rdmsrl(MSR_FS_BASE, a); @@ -1958,7 +1958,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) vmcs_write16(HOST_TR_SELECTOR, GDT_ENTRY_TSS*8); /* 22.2.4 */ - get_idt(&dt); + kvm_get_idt(&dt); vmcs_writel(HOST_IDTR_BASE, dt.base); /* 22.2.4 */ asm("mov $.Lkvm_vmx_return, %0" : "=r"(kvm_vmx_return)); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 89fc8565edee..b131f3c0cf64 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3767,14 +3767,14 @@ void fx_init(struct kvm_vcpu *vcpu) * allocate ram with GFP_KERNEL. */ if (!used_math()) - fx_save(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); /* Initialize guest FPU by resetting ours and saving into guest's */ preempt_disable(); - fx_save(&vcpu->arch.host_fx_image); - fx_finit(); - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_finit(); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); preempt_enable(); vcpu->arch.cr0 |= X86_CR0_ET; @@ -3791,8 +3791,8 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 1; - fx_save(&vcpu->arch.host_fx_image); - fx_restore(&vcpu->arch.guest_fx_image); + kvm_fx_save(&vcpu->arch.host_fx_image); + kvm_fx_restore(&vcpu->arch.guest_fx_image); } EXPORT_SYMBOL_GPL(kvm_load_guest_fpu); @@ -3802,8 +3802,8 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) return; vcpu->guest_fpu_loaded = 0; - fx_save(&vcpu->arch.guest_fx_image); - fx_restore(&vcpu->arch.host_fx_image); + kvm_fx_save(&vcpu->arch.guest_fx_image); + kvm_fx_restore(&vcpu->arch.host_fx_image); ++vcpu->stat.fpu_reload; } EXPORT_SYMBOL_GPL(kvm_put_guest_fpu); -- cgit v1.2.2 From 34d4cb8fca1f2a31be152b74797e6cd160ec9de6 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 10 Jul 2008 20:49:31 -0300 Subject: KVM: MMU: nuke shadowed pgtable pages and ptes on memslot destruction Flush the shadow mmu before removing regions to avoid stale entries. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b131f3c0cf64..9f1cdb011cff 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4032,6 +4032,11 @@ int kvm_arch_set_memory_region(struct kvm *kvm, return 0; } +void kvm_arch_flush_shadow(struct kvm *kvm) +{ + kvm_mmu_zap_all(kvm); +} + int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) { return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE -- cgit v1.2.2 From 376c53c2b30d4a1955240f59f4ecd959aa118f92 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 10 Jul 2008 20:54:29 -0300 Subject: KVM: MMU: improve invalid shadow root page handling Harden kvm_mmu_zap_page() against invalid root pages that had been shadowed from memslots that are gone. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ff7cf632175b..7f57da663826 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -930,14 +930,17 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) } kvm_mmu_page_unlink_children(kvm, sp); if (!sp->root_count) { - if (!sp->role.metaphysical) + if (!sp->role.metaphysical && !sp->role.invalid) unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { + int invalid = sp->role.invalid; list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; kvm_reload_remote_mmus(kvm); + if (!sp->role.metaphysical && !invalid) + unaccount_shadowed(kvm, sp->gfn); } kvm_mmu_reset_last_pte_updated(kvm); } -- cgit v1.2.2 From 2a7c5b8b550b1fb1db9eb490420132e637f5dcb4 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 10 Jul 2008 17:08:15 -0300 Subject: KVM: x86 emulator: emulate clflush If the guest issues a clflush in a mmio address, the instruction can trap into the hypervisor. Currently, we do not decode clflush properly, causing the guest to hang. This patch fixes this emulating clflush (opcode 0f ae). Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 8bc63f62fbbd..f2f90468f8b1 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -219,7 +219,7 @@ static u16 twobyte_table[256] = { /* 0xA0 - 0xA7 */ 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, /* 0xA8 - 0xAF */ - 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, 0, 0, + 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 0, 0, ModRM, 0, /* 0xB0 - 0xB7 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, DstMem | SrcReg | ModRM | BitOp, @@ -1947,6 +1947,8 @@ twobyte_insn: c->src.val &= (c->dst.bytes << 3) - 1; emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); break; + case 0xae: /* clflush */ + break; case 0xb0 ... 0xb1: /* cmpxchg */ /* * Save real source value, then compare EAX against -- cgit v1.2.2 From 722c05f2192070bac0208b2c16ce13929b32d92f Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 13 Jul 2008 11:33:54 +0300 Subject: KVM: MMU: Fix potential race setting upper shadow ptes on nonpae hosts The direct mapped shadow code (used for real mode and two dimensional paging) sets upper-level ptes using direct assignment rather than calling set_shadow_pte(). A nonpae host will split this into two writes, which opens up a race if another vcpu accesses the same memory area. Fix by calling set_shadow_pte() instead of assigning directly. Noticed by Izik Eidus. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7f57da663826..b0e4ddca6c18 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1189,9 +1189,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, return -ENOMEM; } - table[index] = __pa(new_table->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask; + set_shadow_pte(&table[index], + __pa(new_table->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } table_addr = table[index] & PT64_BASE_ADDR_MASK; } -- cgit v1.2.2 From caf43bf7c6a55e89b6df5179df434d67e24aa32e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 14:06:50 +0200 Subject: x86, xen: fix apic_ops build on UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/xen/enlighten.c:615: error: variable ‘xen_basic_apic_ops’ has initializer but incomplete type arch/x86/xen/enlighten.c:616: error: unknown field ‘read’ specified in initializer [...] Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 008b7b69581e..e4d1459a63df 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -35,6 +35,7 @@ #include #include +#include #include #include #include -- cgit v1.2.2 From 7be42004065ce4df193aeef5befd26805267d0d9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 20 Jul 2008 17:04:57 +0200 Subject: x86, lguest: fix apic_ops build on UP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/lguest/boot.c:816: error: variable ‘lguest_basic_apic_ops’ has initializer but incomplete type arch/x86/lguest/boot.c:817: error: unknown field ‘read’ specified in initializer [...] Signed-off-by: Ingo Molnar --- arch/x86/lguest/boot.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 35c4349cd668..756fc489652b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From 9175fc06aee79c349790672178d3fd7507d75c86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 01:38:14 -0700 Subject: x86: use setup_clear_cpu_cap() when disabling the lapic ... so don't need to call clear_cpu_cap again in early_identify_cpu, and could use cleared_cpu_caps like other places. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 5 +---- arch/x86/kernel/apic_64.c | 2 +- arch/x86/kernel/cpu/common_64.c | 4 ---- 3 files changed, 2 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a437d027f20b..e9a00e5074b2 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1214,9 +1214,6 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { - if (disable_apic) - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - if (!smp_found_config && !cpu_has_apic) return -1; @@ -1700,7 +1697,7 @@ early_param("lapic", parse_lapic); static int __init parse_nolapic(char *arg) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("nolapic", parse_nolapic); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e3d32e27c14..16e586cacbdc 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1337,7 +1337,7 @@ early_param("apic", apic_set_verbosity); static __init int setup_disableapic(char *str) { disable_apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } early_param("disableapic", setup_disableapic); diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7b8cc72feb40..0485cf644520 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -324,10 +324,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) cpu_devs[c->x86_vendor]->c_early_init(c); validate_pat_support(c); - - /* early_param could clear that, but recall get it set again */ - if (disable_apic) - clear_cpu_cap(c, X86_FEATURE_APIC); } /* -- cgit v1.2.2 From 7edf8891ad7aef5f4e97991fed6fb0e605e96ea0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 01:39:03 -0700 Subject: x86: remove extra calling to get ext cpuid level Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 0485cf644520..daee611f0140 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -305,7 +305,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } - c->extended_cpuid_level = cpuid_eax(0x80000000); if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); -- cgit v1.2.2 From cfc1b9a6a683c835a20d5b565ade55baf639f72f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Jul 2008 21:35:38 +0200 Subject: x86: convert Dprintk to pr_debug There are a couple of places where (P)Dprintk is used which is an old compile time enabled printk wrapper. Convert it to the generic pr_debug(). Signed-off-by: Thomas Gleixner --- arch/x86/kernel/acpi/boot.c | 6 ++-- arch/x86/kernel/cpu/perfctr-watchdog.c | 4 +-- arch/x86/kernel/setup_percpu.c | 6 ++-- arch/x86/kernel/smpboot.c | 52 +++++++++++++++++----------------- arch/x86/mm/numa_64.c | 4 --- arch/x86/pci/early.c | 16 +++++------ 6 files changed, 41 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f489d7a9be92..fa88a1d71290 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_bus_id_to_type[MP_ISA_BUS] = MP_BUS_ISA; #endif set_bit(MP_ISA_BUS, mp_bus_not_pci); - Dprintk("Bus #%d is ISA\n", MP_ISA_BUS); + pr_debug("Bus #%d is ISA\n", MP_ISA_BUS); #ifdef CONFIG_X86_ES7000 /* @@ -1127,8 +1127,8 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - Dprintk(KERN_DEBUG "Pin %d-%d already programmed\n", - mp_ioapic_routing[ioapic].apic_id, ioapic_pin); + pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n", + mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); #else diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6d4bdc02388a..de7439f82b92 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -250,7 +250,7 @@ static void write_watchdog_counter(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsrl(perfctr_msr, 0 - count); } @@ -261,7 +261,7 @@ static void write_watchdog_counter32(unsigned int perfctr_msr, do_div(count, nmi_hz); if(descr) - Dprintk("setting %s to -0x%08Lx\n", descr, count); + pr_debug("setting %s to -0x%08Lx\n", descr, count); wrmsr(perfctr_msr, (u32)(-count), 0); } diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index cac68430d31f..f7745f94c006 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -227,8 +227,8 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - Dprintk(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", - map, nr_node_ids); + pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + map, nr_node_ids); /* node_to_cpumask() will now work */ node_to_cpumask_map = map; @@ -248,7 +248,7 @@ void __cpuinit numa_set_node(int cpu, int node) per_cpu(x86_cpu_to_node_map, cpu) = node; else - Dprintk(KERN_INFO "Setting node for non-present cpu %d\n", cpu); + pr_debug("Setting node for non-present cpu %d\n", cpu); } void __cpuinit numa_clear_node(int cpu) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 27640196eb7c..4b53a647bc0a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -216,7 +216,7 @@ static void __cpuinit smp_callin(void) panic("%s: phys CPU#%d, CPU#%d already present??\n", __func__, phys_id, cpuid); } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + pr_debug("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); /* * STARTUP IPIs are fragile beasts as they might sometimes @@ -251,7 +251,7 @@ static void __cpuinit smp_callin(void) * boards) */ - Dprintk("CALLIN, before setup_local_APIC().\n"); + pr_debug("CALLIN, before setup_local_APIC().\n"); smp_callin_clear_local_apic(); setup_local_APIC(); end_local_APIC_setup(); @@ -266,7 +266,7 @@ static void __cpuinit smp_callin(void) local_irq_enable(); calibrate_delay(); local_irq_disable(); - Dprintk("Stack at about %p\n", &cpuid); + pr_debug("Stack at about %p\n", &cpuid); /* * Save our processor parameters @@ -513,7 +513,7 @@ static void impress_friends(void) /* * Allow the user to impress friends. */ - Dprintk("Before bogomips.\n"); + pr_debug("Before bogomips.\n"); for_each_possible_cpu(cpu) if (cpu_isset(cpu, cpu_callout_map)) bogosum += cpu_data(cpu).loops_per_jiffy; @@ -523,7 +523,7 @@ static void impress_friends(void) bogosum/(500000/HZ), (bogosum/(5000/HZ))%100); - Dprintk("Before bogocount - setting activated=1.\n"); + pr_debug("Before bogocount - setting activated=1.\n"); } static inline void __inquire_remote_apic(int apicid) @@ -585,7 +585,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) /* Kick the second */ apic_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* @@ -596,7 +596,7 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); accept_status = (apic_read(APIC_ESR) & 0xEF); - Dprintk("NMI sent.\n"); + pr_debug("NMI sent.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -631,7 +631,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_read(APIC_ESR); } - Dprintk("Asserting INIT.\n"); + pr_debug("Asserting INIT.\n"); /* * Turn INIT on target chip @@ -644,12 +644,12 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mdelay(10); - Dprintk("Deasserting INIT.\n"); + pr_debug("Deasserting INIT.\n"); /* Target chip */ apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); @@ -657,7 +657,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* Send IPI */ apic_write(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); mb(); @@ -684,14 +684,14 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) /* * Run STARTUP IPI loop. */ - Dprintk("#startup loops: %d.\n", num_starts); + pr_debug("#startup loops: %d.\n", num_starts); for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n", j); + pr_debug("Sending STARTUP #%d.\n", j); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); + pr_debug("After apic_write.\n"); /* * STARTUP IPI @@ -709,9 +709,9 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) */ udelay(300); - Dprintk("Startup point 1.\n"); + pr_debug("Startup point 1.\n"); - Dprintk("Waiting for send to finish...\n"); + pr_debug("Waiting for send to finish...\n"); send_status = safe_apic_wait_icr_idle(); /* @@ -724,7 +724,7 @@ wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) if (send_status || accept_status) break; } - Dprintk("After Startup.\n"); + pr_debug("After Startup.\n"); if (send_status) printk(KERN_ERR "APIC never delivered???\n"); @@ -875,7 +875,7 @@ do_rest: if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { - Dprintk("Setting warm reset code and vector.\n"); + pr_debug("Setting warm reset code and vector.\n"); store_NMI_vector(&nmi_high, &nmi_low); @@ -896,9 +896,9 @@ do_rest: /* * allow APs to start initializing. */ - Dprintk("Before Callout %d.\n", cpu); + pr_debug("Before Callout %d.\n", cpu); cpu_set(cpu, cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); + pr_debug("After Callout %d.\n", cpu); /* * Wait 5s total for a response @@ -911,10 +911,10 @@ do_rest: if (cpu_isset(cpu, cpu_callin_map)) { /* number CPUs logically, starting from 1 (BSP is 0) */ - Dprintk("OK.\n"); + pr_debug("OK.\n"); printk(KERN_INFO "CPU%d: ", cpu); print_cpu_info(&cpu_data(cpu)); - Dprintk("CPU has booted.\n"); + pr_debug("CPU has booted.\n"); } else { boot_error = 1; if (*((volatile unsigned char *)trampoline_base) @@ -959,7 +959,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) WARN_ON(irqs_disabled()); - Dprintk("++++++++++++++++++++=_---CPU UP %u\n", cpu); + pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || !physid_isset(apicid, phys_cpu_present_map)) { @@ -971,7 +971,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) * Already booted CPU? */ if (cpu_isset(cpu, cpu_callin_map)) { - Dprintk("do_boot_cpu %d Already started\n", cpu); + pr_debug("do_boot_cpu %d Already started\n", cpu); return -ENOSYS; } @@ -998,7 +998,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) err = do_boot_cpu(apicid, cpu); #endif if (err) { - Dprintk("do_boot_cpu failed %d\n", err); + pr_debug("do_boot_cpu failed %d\n", err); return -EIO; } @@ -1202,7 +1202,7 @@ void __init native_smp_prepare_boot_cpu(void) void __init native_smp_cpus_done(unsigned int max_cpus) { - Dprintk("Boot done.\n"); + pr_debug("Boot done.\n"); impress_friends(); smp_checks(); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index b432d5781773..9782f42dd319 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -20,10 +20,6 @@ #include #include -#ifndef Dprintk -#define Dprintk(x...) -#endif - struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c index 858dbe3399f9..86631ccbc25a 100644 --- a/arch/x86/pci/early.c +++ b/arch/x86/pci/early.c @@ -7,15 +7,13 @@ /* Direct PCI access. This is used for PCI accesses in early boot before the PCI subsystem works. */ -#define PDprintk(x...) - u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset) { u32 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inl(0xcfc); if (v != 0xffffffff) - PDprintk("%x reading 4 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 4 from %x: %x\n", slot, offset, v); return v; } @@ -24,7 +22,7 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset) u8 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inb(0xcfc + (offset&3)); - PDprintk("%x reading 1 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 1 from %x: %x\n", slot, offset, v); return v; } @@ -33,28 +31,28 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset) u16 v; outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); v = inw(0xcfc + (offset&2)); - PDprintk("%x reading 2 from %x: %x\n", slot, offset, v); + pr_debug("%x reading 2 from %x: %x\n", slot, offset, v); return v; } void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, u32 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outl(val, 0xcfc); } void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outb(val, 0xcfc + (offset&3)); } void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) { - PDprintk("%x writing to %x: %x\n", slot, offset, val); + pr_debug("%x writing to %x: %x\n", slot, offset, val); outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); outw(val, 0xcfc + (offset&2)); } @@ -71,7 +69,7 @@ void early_dump_pci_device(u8 bus, u8 slot, u8 func) int j; u32 val; - printk("PCI: %02x:%02x:%02x", bus, slot, func); + printk(KERN_INFO "PCI: %02x:%02x:%02x", bus, slot, func); for (i = 0; i < 256; i += 4) { if (!(i & 0x0f)) -- cgit v1.2.2 From 5171c3047df9d5b5183b2b179aa797a5aed8369b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 21 Jul 2008 21:58:34 +0200 Subject: x86: move the last Dprintk instance to pr_debug() Signed-off-by: Thomas Gleixner --- arch/x86/mach-es7000/es7000plat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c index 4354ce804889..50189af14b85 100644 --- a/arch/x86/mach-es7000/es7000plat.c +++ b/arch/x86/mach-es7000/es7000plat.c @@ -130,10 +130,10 @@ parse_unisys_oem (char *oemptr) mip_addr = val; mip = (struct mip_reg *)val; mip_reg = __va(mip); - Dprintk("es7000_mipcfg: host_reg = 0x%lx \n", - (unsigned long)host_reg); - Dprintk("es7000_mipcfg: mip_reg = 0x%lx \n", - (unsigned long)mip_reg); + pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + (unsigned long)host_reg); + pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); success++; break; case MIP_PSAI_REG: -- cgit v1.2.2 From f2d0f1dea41fd6c7a347e71b505a155096643517 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 21 Jul 2008 13:04:08 -0700 Subject: x86: Fix help message for STRICT_DEVMEM config option The message talked about "left on" when it meant to say disabled. Signed-off-by: Linus Torvalds --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 85a87d2ac0c0..092f019e033a 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -8,7 +8,7 @@ source "lib/Kconfig.debug" config STRICT_DEVMEM bool "Filter access to /dev/mem" help - If this option is left on, you allow userspace (root) access to all + If this option is disabled, you allow userspace (root) access to all of memory, including kernel and userspace memory. Accidental access to this is obviously disastrous, but specific access can be used by people debugging the kernel. Note that with PAT support -- cgit v1.2.2 From 3bfd49c8ab1859ae0f5fa1df2b3781c99115f442 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 21 May 2008 12:52:33 -0700 Subject: device create: x86: convert device_create to device_create_drvdata device_create() is race-prone, so use the race-free device_create_drvdata() instead as device_create() is going away. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 4 ++-- arch/x86/kernel/msr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2de5fa2bbf77..14b11b3be31c 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -141,8 +141,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - "cpu%d", cpu); + dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), + NULL, "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index a153b3905f60..9fd809552447 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -149,8 +149,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - "msr%d", cpu); + dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), + NULL, "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } -- cgit v1.2.2 From fc3a8828b139c24aade3f9d608775e36c248f8f5 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 2 May 2008 06:02:41 +0200 Subject: driver core: fix a lot of printk usages of bus_id We have the dev_printk() variants for this kind of thing, use them instead of directly trying to access the bus_id field of struct device. This is done in order to remove bus_id entirely. Cc: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/pci-dma.c | 6 ++---- arch/x86/kernel/pci-gart_64.c | 4 +--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index a4213c00dffc..cbecb05551bb 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -314,8 +314,7 @@ int dma_supported(struct device *dev, u64 mask) { #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { - printk(KERN_INFO "PCI: Disallowing DAC for device %s\n", - dev->bus_id); + dev_info(dev, "PCI: Disallowing DAC for device\n"); return 0; } #endif @@ -342,8 +341,7 @@ int dma_supported(struct device *dev, u64 mask) type. Normally this doesn't make any difference, but gives more gentle handling of IOMMU overflow. */ if (iommu_sac_force && (mask >= DMA_40BIT_MASK)) { - printk(KERN_INFO "%s: Force SAC with mask %Lx\n", - dev->bus_id, mask); + dev_info(dev, "Force SAC with mask %Lx\n", mask); return 0; } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index be60961f8695..df5f142657d2 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -198,9 +198,7 @@ static void iommu_full(struct device *dev, size_t size, int dir) * out. Hopefully no network devices use single mappings that big. */ - printk(KERN_ERR - "PCI-DMA: Out of IOMMU space for %lu bytes at device %s\n", - size, dev->bus_id); + dev_err(dev, "PCI-DMA: Out of IOMMU space for %lu bytes\n", size); if (size > PAGE_SIZE*EMERGENCY_PAGES) { if (dir == PCI_DMA_FROMDEVICE || dir == PCI_DMA_BIDIRECTIONAL) -- cgit v1.2.2 From 4a0b2b4dbe1335b8b9886ba3dc85a145d5d938ed Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:41 +0200 Subject: sysdev: Pass the attribute to the low level sysdev show/store function This allow to dynamically generate attributes and share show/store functions between attributes. Right now most attributes are generated by special macros and lots of duplicated code. With the attribute passed it's instead possible to attach some data to the attribute and then use that in shared low level functions to do different things. I need this for the dynamically generated bank attributes in the x86 machine check code, but it'll allow some further cleanups. I converted all users in tree to the new show/store prototype. It's a single huge patch to avoid unbisectable sections. Runtime tested: x86-32, x86-64 Compiled only: ia64, powerpc Not compile tested/only grep converted: sh, arm, avr32 Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_64.c | 14 ++++++++++---- arch/x86/kernel/cpu/mcheck/therm_throt.c | 1 + arch/x86/kernel/microcode.c | 10 +++++++--- 3 files changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index c4a7ec31394c..e6a4d5f67643 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -762,10 +762,14 @@ DEFINE_PER_CPU(struct sys_device, device_mce); /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ - static ssize_t show_ ## name(struct sys_device *s, char *buf) { \ + static ssize_t show_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + char *buf) { \ return sprintf(buf, "%lx\n", (unsigned long)var); \ } \ - static ssize_t set_ ## name(struct sys_device *s,const char *buf,size_t siz) { \ + static ssize_t set_ ## name(struct sys_device *s, \ + struct sysdev_attribute *attr, \ + const char *buf, size_t siz) { \ char *end; \ unsigned long new = simple_strtoul(buf, &end, 0); \ if (end == buf) return -EINVAL; \ @@ -786,14 +790,16 @@ ACCESSOR(bank3ctl,bank[3],mce_restart()) ACCESSOR(bank4ctl,bank[4],mce_restart()) ACCESSOR(bank5ctl,bank[5],mce_restart()) -static ssize_t show_trigger(struct sys_device *s, char *buf) +static ssize_t show_trigger(struct sys_device *s, struct sysdev_attribute *attr, + char *buf) { strcpy(buf, trigger); strcat(buf, "\n"); return strlen(trigger) + 1; } -static ssize_t set_trigger(struct sys_device *s,const char *buf,size_t siz) +static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, + const char *buf,size_t siz) { char *p; int len; diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 1f4cc48c14c6..d5ae2243f0b9 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -35,6 +35,7 @@ atomic_t therm_throt_en = ATOMIC_INIT(0); #define define_therm_throt_sysdev_show_func(name) \ static ssize_t therm_throt_sysdev_show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 56b933119a04..fc4790638b69 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -644,7 +644,9 @@ static void microcode_fini_cpu(int cpu) mutex_unlock(µcode_mutex); } -static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; char *end; @@ -674,14 +676,16 @@ static ssize_t reload_store(struct sys_device *dev, const char *buf, size_t sz) return sz; } -static ssize_t version_show(struct sys_device *dev, char *buf) +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->rev); } -static ssize_t pf_show(struct sys_device *dev, char *buf) +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; -- cgit v1.2.2 From d95d62c018209355c0dc998682ff792432aa870c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 1 Jul 2008 18:48:43 +0200 Subject: sysdev: Convert the x86 mce tolerant sysdev attribute to generic attribute Use the new generic int attribute accessors for the x86 mce tolerant attribute. Simple example to illustrate the new macros. There are much more places all over the tree that could be converted like this. Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/mcheck/mce_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index e6a4d5f67643..9ab65be82427 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -812,12 +812,12 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, } static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -ACCESSOR(tolerant,tolerant,) +static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); ACCESSOR(check_interval,check_interval,mce_restart()) static struct sysdev_attribute *mce_attributes[] = { &attr_bank0ctl, &attr_bank1ctl, &attr_bank2ctl, &attr_bank3ctl, &attr_bank4ctl, &attr_bank5ctl, - &attr_tolerant, &attr_check_interval, &attr_trigger, + &attr_tolerant.attr, &attr_check_interval, &attr_trigger, NULL }; -- cgit v1.2.2 From 988781dc3e1d9209192b04458d279815923f5e76 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 11:21:43 -0700 Subject: x86: use setup_clear_cpu_cap with disable_apic, fix beauty fix: /proc/cpuinfo will still show apic feature even if we booted up with it disabled. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ec952aa5394a..b4aacb9f52e3 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -680,7 +680,7 @@ void __init setup_arch(char **cmdline_p) #ifdef CONFIG_X86_LOCAL_APIC disable_apic = 1; #endif - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + setup_clear_cpu_cap(X86_FEATURE_APIC); } #ifdef CONFIG_PCI -- cgit v1.2.2 From 1b9b89e7f163336ad84200b66a17284dbf26aced Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 21 Jul 2008 22:08:21 -0700 Subject: x86: add apic probe for genapic 64bit, v2 introducing an APIC handling probing abstraction: static struct genapic *apic_probe[] __initdata = { &apic_x2apic_uv_x, &apic_x2apic_phys, &apic_x2apic_cluster, &apic_physflat, NULL, }; This way we can remove UV, x2apic specific code from genapic_64.c and move them to their specific genapic files. [ v2: fix compiling when CONFIG_ACPI is not set ] Signed-off-by: Yinghai Lu Cc: Jack Steiner Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_64.c | 85 ++++++++++++------------------------- arch/x86/kernel/genapic_flat_64.c | 26 ++++++++++++ arch/x86/kernel/genx2apic_cluster.c | 11 +++++ arch/x86/kernel/genx2apic_phys.c | 21 +++++++++ arch/x86/kernel/genx2apic_uv_x.c | 32 +++++++++++++- 5 files changed, 115 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 3940d8161f8b..b3ba969c50d2 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -16,62 +16,37 @@ #include #include #include -#include #include #include #include -#ifdef CONFIG_ACPI -#include -#endif - -DEFINE_PER_CPU(int, x2apic_extra_bits); +extern struct genapic apic_flat; +extern struct genapic apic_physflat; +extern struct genapic apic_x2xpic_uv_x; +extern struct genapic apic_x2apic_phys; +extern struct genapic apic_x2apic_cluster; struct genapic __read_mostly *genapic = &apic_flat; -static int x2apic_phys = 0; - -static int set_x2apic_phys_mode(char *arg) -{ - x2apic_phys = 1; - return 0; -} -early_param("x2apic_phys", set_x2apic_phys_mode); - -static enum uv_system_type uv_system_type; +static struct genapic *apic_probe[] __initdata = { + &apic_x2apic_uv_x, + &apic_x2apic_phys, + &apic_x2apic_cluster, + &apic_physflat, + NULL, +}; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. */ void __init setup_apic_routing(void) { - if (uv_system_type == UV_NON_UNIQUE_APIC) - genapic = &apic_x2apic_uv_x; - else if (cpu_has_x2apic && intr_remapping_enabled) { - if (x2apic_phys) - genapic = &apic_x2apic_phys; - else - genapic = &apic_x2apic_cluster; - } else -#ifdef CONFIG_ACPI - /* - * Quirk: some x86_64 machines can only use physical APIC mode - * regardless of how many processors are present (x86_64 ES7000 - * is an example). - */ - if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) - genapic = &apic_physflat; - else -#endif - - if (max_physical_apicid < 8) - genapic = &apic_flat; - else - genapic = &apic_physflat; - - printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); + if (genapic == &apic_flat) { + if (max_physical_apicid >= 8) + genapic = &apic_physflat; + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); + } } /* Same for both flat and physical. */ @@ -83,23 +58,15 @@ void apic_send_IPI_self(int vector) int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (!strcmp(oem_id, "SGI")) { - if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - else if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - else if (!strcmp(oem_table_id, "UVH")) - uv_system_type = UV_NON_UNIQUE_APIC; + int i; + + for (i = 0; apic_probe[i]; ++i) { + if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { + genapic = apic_probe[i]; + printk(KERN_INFO "Setting APIC routing to %s.\n", + genapic->name); + return 1; + } } return 0; } - -enum uv_system_type get_uv_system_type(void) -{ - return uv_system_type; -} - -int is_uv_system(void) -{ - return uv_system_type != UV_NONE; -} diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 2c973cbf054f..1740b83329f6 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -21,6 +21,15 @@ #include #include +#ifdef CONFIG_ACPI +#include +#endif + +static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + return 1; +} + static cpumask_t flat_target_cpus(void) { return cpu_online_map; @@ -138,6 +147,7 @@ static unsigned int phys_pkg_id(int index_msb) struct genapic apic_flat = { .name = "flat", + .acpi_madt_oem_check = flat_acpi_madt_oem_check, .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), .target_cpus = flat_target_cpus, @@ -160,6 +170,21 @@ struct genapic apic_flat = { * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ +static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ +#ifdef CONFIG_ACPI + /* + * Quirk: some x86_64 machines can only use physical APIC mode + * regardless of how many processors are present (x86_64 ES7000 + * is an example). + */ + if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + return 1; +#endif + + return 0; +} static cpumask_t physflat_target_cpus(void) { @@ -206,6 +231,7 @@ static unsigned int physflat_cpu_mask_to_apicid(cpumask_t cpumask) struct genapic apic_physflat = { .name = "physical flat", + .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = physflat_target_cpus, diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index 40bc0140d89f..ef3f3182d50a 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -4,12 +4,22 @@ #include #include #include +#include + #include #include #include DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic && intr_remapping_enabled) + return 1; + + return 0; +} + /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ static cpumask_t x2apic_target_cpus(void) @@ -135,6 +145,7 @@ static void init_x2apic_ldr(void) struct genapic apic_x2apic_cluster = { .name = "cluster x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .int_delivery_mode = dest_LowestPrio, .int_dest_mode = (APIC_DEST_LOGICAL != 0), .target_cpus = x2apic_target_cpus, diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 2f3c6ca19de9..f35a3bf3a9e5 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -4,10 +4,30 @@ #include #include #include +#include + #include #include #include +DEFINE_PER_CPU(int, x2apic_extra_bits); + +static int x2apic_phys; + +static int set_x2apic_phys_mode(char *arg) +{ + x2apic_phys = 1; + return 0; +} +early_param("x2apic_phys", set_x2apic_phys_mode); + +static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (cpu_has_x2apic && intr_remapping_enabled && x2apic_phys) + return 1; + + return 0; +} /* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */ @@ -122,6 +142,7 @@ void init_x2apic_ldr(void) struct genapic apic_x2apic_phys = { .name = "physical x2apic", + .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = x2apic_target_cpus, diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index a8e5cb4c4d43..a882bc3a2ce9 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -27,6 +27,33 @@ #include #include +static enum uv_system_type uv_system_type; + +static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strcmp(oem_id, "SGI")) { + if (!strcmp(oem_table_id, "UVL")) + uv_system_type = UV_LEGACY_APIC; + else if (!strcmp(oem_table_id, "UVX")) + uv_system_type = UV_X2APIC; + else if (!strcmp(oem_table_id, "UVH")) { + uv_system_type = UV_NON_UNIQUE_APIC; + return 1; + } + } + return 0; +} + +enum uv_system_type get_uv_system_type(void) +{ + return uv_system_type; +} + +int is_uv_system(void) +{ + return uv_system_type != UV_NONE; +} + DEFINE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); EXPORT_PER_CPU_SYMBOL_GPL(__uv_hub_info); @@ -153,7 +180,7 @@ static unsigned int get_apic_id(unsigned long x) return id; } -static long set_apic_id(unsigned int id) +static unsigned long set_apic_id(unsigned int id) { unsigned long x; @@ -182,6 +209,7 @@ static void uv_send_IPI_self(int vector) struct genapic apic_x2apic_uv_x = { .name = "UV large system", + .acpi_madt_oem_check = uv_acpi_madt_oem_check, .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = uv_target_cpus, @@ -433,3 +461,5 @@ void __cpuinit uv_cpu_init(void) if (get_uv_system_type() == UV_NON_UNIQUE_APIC) set_x2apic_extra_bits(uv_hub_info->pnode); } + + -- cgit v1.2.2 From c2e3277f875b83e5adc34e96989d6d87ec5f80f7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 22 Jul 2008 15:40:46 +1000 Subject: x86: fix pte_flags() to only return flags, fix lguest (updated) (Jeremy said: rusty: use PTE_MASK rusty: use PTE_MASK rusty: use PTE_MASK When I asked: jsgf: does that include the NX flag? He responded eloquently: rusty: use PTE_MASK rusty: use PTE_MASK yes, it's the official constant of masking flags out of ptes ) Change a15af1c9ea2750a9ff01e51615c45950bad8221b 'x86/paravirt: add pte_flags to just get pte flags' removed lguest's private pte_flags() in favor of a generic one. Unfortunately, the generic one doesn't filter out the non-flags bits: this results in lguest creating corrupt shadow page tables and blowing up host memory. Since noone is supposed to use the pfn part of pte_flags(), it seems safest to always do the filtering. Signed-off-by: Rusty Russell Acked-by: Jeremy Fitzhardinge Signed-off-and-morning-tea-spilled-by: Ingo Molnar --- arch/x86/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 097d8a6797fa..94da4d52d798 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -443,7 +443,7 @@ struct pv_mmu_ops pv_mmu_ops = { #endif /* PAGETABLE_LEVELS >= 3 */ .pte_val = native_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = native_pgd_val, .make_pte = native_make_pte, -- cgit v1.2.2 From 59438c9fc4f7a92c808c9049bc6b396f98bf954c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 22:59:42 -0700 Subject: x86: rename PTE_MASK to PTE_PFN_MASK Rusty, in his peevish way, complained that macros defining constants should have a name which somewhat accurately reflects the actual purpose of the constant. Aside from the fact that PTE_MASK gives no clue as to what's actually being masked, and is misleadingly similar to the functionally entirely different PMD_MASK, PUD_MASK and PGD_MASK, I don't really see what the problem is. But if this patch silences the incessent noise, then it will have achieved its goal (TODO: write test-case). Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 10 +++++----- arch/x86/xen/enlighten.c | 2 +- arch/x86/xen/mmu.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 0bb0caed8971..cc174fc412bc 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & ~(PTE_MASK); - cur = pgprot_val(st->current_prot) & ~(PTE_MASK); + prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); + cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); if (!st->level) { /* First entry */ @@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & ~PTE_MASK; + pgprotval_t prot = pmd_val(*start) & ~PTE_PFN_MASK; if (pmd_large(*start) || !pmd_present(*start)) note_page(m, st, __pgprot(prot), 3); @@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & ~PTE_MASK; + pgprotval_t prot = pud_val(*start) & ~PTE_PFN_MASK; if (pud_large(*start) || !pud_present(*start)) note_page(m, st, __pgprot(prot), 2); @@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m) for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & ~PTE_MASK; + pgprotval_t prot = pgd_val(*start) & ~PTE_PFN_MASK; if (pgd_large(*start) || !pgd_present(*start)) note_page(m, &st, __pgprot(prot), 1); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 194bbd6e3241..9ff6e3cbf08f 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1435,7 +1435,7 @@ static unsigned long m2p(phys_addr_t maddr) { phys_addr_t paddr; - maddr &= PTE_MASK; + maddr &= PTE_PFN_MASK; paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; return paddr; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index a44d56e38bd1..0db6912395ed 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -343,8 +343,8 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long mfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_PFN_MASK; val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } @@ -354,8 +354,8 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) static pteval_t pte_pfn_to_mfn(pteval_t val) { if (val & _PAGE_PRESENT) { - unsigned long pfn = (val & PTE_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_MASK; + unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; + pteval_t flags = val & ~PTE_PFN_MASK; val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } -- cgit v1.2.2 From 77be1fabd024b37423d12f832b1fbdb95dbdf494 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 22:59:56 -0700 Subject: x86: add PTE_FLAGS_MASK PTE_PFN_MASK was getting lonely, so I made it a friend. Signed-off-by: Jeremy Fitzhardinge Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 6 +++--- arch/x86/xen/mmu.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index cc174fc412bc..a20d1fa64b4e 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -221,7 +221,7 @@ static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr, for (i = 0; i < PTRS_PER_PMD; i++) { st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT); if (!pmd_none(*start)) { - pgprotval_t prot = pmd_val(*start) & ~PTE_PFN_MASK; + pgprotval_t prot = pmd_val(*start) & PTE_FLAGS_MASK; if (pmd_large(*start) || !pmd_present(*start)) note_page(m, st, __pgprot(prot), 3); @@ -253,7 +253,7 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, for (i = 0; i < PTRS_PER_PUD; i++) { st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT); if (!pud_none(*start)) { - pgprotval_t prot = pud_val(*start) & ~PTE_PFN_MASK; + pgprotval_t prot = pud_val(*start) & PTE_FLAGS_MASK; if (pud_large(*start) || !pud_present(*start)) note_page(m, st, __pgprot(prot), 2); @@ -288,7 +288,7 @@ static void walk_pgd_level(struct seq_file *m) for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); if (!pgd_none(*start)) { - pgprotval_t prot = pgd_val(*start) & ~PTE_PFN_MASK; + pgprotval_t prot = pgd_val(*start) & PTE_FLAGS_MASK; if (pgd_large(*start) || !pgd_present(*start)) note_page(m, &st, __pgprot(prot), 1); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0db6912395ed..aa37469da696 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -344,7 +344,7 @@ static pteval_t pte_mfn_to_pfn(pteval_t val) { if (val & _PAGE_PRESENT) { unsigned long mfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_PFN_MASK; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)mfn_to_pfn(mfn) << PAGE_SHIFT) | flags; } @@ -355,7 +355,7 @@ static pteval_t pte_pfn_to_mfn(pteval_t val) { if (val & _PAGE_PRESENT) { unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; - pteval_t flags = val & ~PTE_PFN_MASK; + pteval_t flags = val & PTE_FLAGS_MASK; val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; } -- cgit v1.2.2 From 026e2c05ef58ef413e2d52696f125d5ea1aa8bce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 22 Jul 2008 11:58:14 +0200 Subject: x86, cyrix: debug Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cyrix.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06a..db5868cd2443 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -116,7 +116,7 @@ static void __cpuinit set_cx86_reorder(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* Load/Store Serialize to mem access disable (=reorder it) */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) & ~0x80); + setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) & ~0x80); /* set load/store serialize from 1GB to 4GB */ ccr3 |= 0xe0; setCx86(CX86_CCR3, ccr3); @@ -127,11 +127,11 @@ static void __cpuinit set_cx86_memwb(void) printk(KERN_INFO "Enable Memory-Write-back mode on Cyrix/NSC processor.\n"); /* CCR2 bit 2: unlock NW bit */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) & ~0x04); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) & ~0x04); /* set 'Not Write-through' */ write_cr0(read_cr0() | X86_CR0_NW); /* CCR2 bit 2: lock NW bit and set WT1 */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x14); } static void __cpuinit set_cx86_inc(void) @@ -144,10 +144,10 @@ static void __cpuinit set_cx86_inc(void) setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* PCR1 -- Performance Control */ /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); + setCx86_old(CX86_PCR1, getCx86_old(CX86_PCR1) | 0x02); /* PCR0 -- Performance Control */ /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); + setCx86_old(CX86_PCR0, getCx86_old(CX86_PCR0) | 0x04); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ } @@ -162,14 +162,14 @@ static void __cpuinit geode_configure(void) local_irq_save(flags); /* Suspend on halt power saving and enable #SUSP pin */ - setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x88); + setCx86_old(CX86_CCR2, getCx86_old(CX86_CCR2) | 0x88); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ /* FPU fast, DTE cache, Mem bypass */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x38); + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x38); setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ set_cx86_memwb(); @@ -286,7 +286,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) /* GXm supports extended cpuid levels 'ala' AMD */ if (c->cpuid_level == 2) { /* Enable cxMMX extensions (GX1 Datasheet 54) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7) | 1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7) | 1); /* * GXm : 0x30 ... 0x5f GXm datasheet 51 @@ -309,7 +309,7 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) if (dir1 > 7) { dir0_msn++; /* M II */ /* Enable MMX extensions (App note 108) */ - setCx86(CX86_CCR7, getCx86(CX86_CCR7)|1); + setCx86_old(CX86_CCR7, getCx86_old(CX86_CCR7)|1); } else { c->coma_bug = 1; /* 6x86MX, it has the bug. */ } @@ -424,7 +424,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - setCx86(CX86_CCR4, getCx86(CX86_CCR4) | 0x80); /* enable cpuid */ + setCx86_old(CX86_CCR4, getCx86_old(CX86_CCR4) | 0x80); /* enable cpuid */ setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ local_irq_restore(flags); } -- cgit v1.2.2 From d536b1f86591fb081c7a56eab04e711eb4dab951 Mon Sep 17 00:00:00 2001 From: Jan Kratochvil Date: Tue, 22 Jul 2008 14:00:47 +0200 Subject: x86: fix crash due to missing debugctlmsr on AMD K6-3 currently if you use PTRACE_SINGLEBLOCK on AMD K6-3 (i586) it will crash. Kernel now wrongly assumes existing DEBUGCTLMSR MSR register there. Removed the assumption also for some other non-K6 CPUs but I am not sure there (but it can only bring small inefficiency there if my assumption is wrong). Based on info from Roland McGrath, Chuck Ebbert and Mikulas Patocka. More info at: https://bugzilla.redhat.com/show_bug.cgi?id=456175 Signed-off-by: Jan Kratochvil Cc: Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 54b8c02c71e6..2c518fbc52ec 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -414,4 +414,4 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(M586MMX || M586TSC || M586 || M486 || M386) + depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) -- cgit v1.2.2 From bbc1f698a508927d21324b57500e863f9bd562b9 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 21 Jul 2008 21:34:13 +0530 Subject: x86: Introducing asm/syscalls.h Declaring arch-dependent syscalls for x86 architecture Signed-off-by: Jaswinder Singh --- arch/x86/kernel/ioport.c | 1 + arch/x86/kernel/ldt.c | 1 + arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/signal_32.c | 1 + arch/x86/kernel/signal_64.c | 1 + arch/x86/kernel/sys_i386_32.c | 2 ++ arch/x86/kernel/sys_x86_64.c | 1 + arch/x86/kernel/tls.c | 1 + arch/x86/kernel/vm86_32.c | 1 + 10 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 50e5e4a31c85..191914302744 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -14,6 +14,7 @@ #include #include #include +#include /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ static void set_bitmap(unsigned long *bitmap, unsigned int base, diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index a8449571858a..c49ff2dc8f83 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include #include #include +#include #ifdef CONFIG_SMP static void flush_ldt(void *current_mm) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0c3927accb00..6490e4981261 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e8a8e1b99817..c78090dd0c52 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -51,6 +51,7 @@ #include #include #include +#include asmlinkage extern void ret_from_fork(void); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 07faaa5109cb..5cede1045ce7 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "sigframe.h" diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index bf87684474f1..b95a0a609053 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c index 7066cb855a60..1884a8d12bfa 100644 --- a/arch/x86/kernel/sys_i386_32.c +++ b/arch/x86/kernel/sys_i386_32.c @@ -22,6 +22,8 @@ #include #include +#include + asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long pgoff) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef33817..c9288c883e20 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -16,6 +16,7 @@ #include #include +#include asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long fd, unsigned long off) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index ab6bf375a307..6bb7b8579e70 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "tls.h" diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 38f566fa27d2..4eeb5cf9720d 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -46,6 +46,7 @@ #include #include #include +#include /* * Known problems: -- cgit v1.2.2 From fb26132b441e75d6ba9996efc29b42081aee0abd Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 21 Jul 2008 21:36:40 +0530 Subject: x86: process_32.c declare cpu_number before they get used Moved DECLARE_PER_CPU(int, cpu_number) from CONFIG_X86_32_SMP to CONFIG_X86_32 because cpu_number is required for both. And include asm/smp.h in process_32.c Signed-off-by: Jaswinder Singh --- arch/x86/kernel/process_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6490e4981261..7218bccd1766 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -56,6 +56,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -- cgit v1.2.2 From cc0384917bf69079088701a0725c5fc6b554bf35 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 21 Jul 2008 21:52:51 +0530 Subject: x86: time_XX.c declare functions before they get used Declare time_init() in asm-x86/time.h Also did cleanup in asm-x86/timer.h : timer_ack is only required for X86_32 int recalibrate_cpu_khz(void) is for X86_32 Signed-off-by: Jaswinder Singh --- arch/x86/kernel/time_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ffe3c664afc0..bbecf8b6bf96 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "do_timer.h" -- cgit v1.2.2 From 1e84911c6c37fd1080ef07039e19c346628b31db Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Mon, 21 Jul 2008 22:58:29 +0530 Subject: x86: mtrr/main.c declare range_state as static Signed-off-by: Jaswinder Singh --- arch/x86/kernel/cpu/mtrr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8faf..b5ade28ca8f8 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -729,7 +729,7 @@ struct var_mtrr_range_state { mtrr_type type; }; -struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; +static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; static int __initdata debug_print; static int __init -- cgit v1.2.2 From a31863168660c6b6f6c7ffe05bb6a38e97803326 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Tue, 22 Jul 2008 21:53:53 +0200 Subject: x86: consolidate header guards This patch consolidates the header guard names which are also used externally, i.e. in .c files. Signed-off-by: Vegard Nossum --- arch/x86/boot/compressed/misc.c | 2 +- arch/x86/kernel/asm-offsets_64.c | 2 +- arch/x86/kernel/syscall_64.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bc5553b496f7..3ee338c7d1d1 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -16,7 +16,7 @@ */ #undef CONFIG_PARAVIRT #ifdef CONFIG_X86_32 -#define _ASM_DESC_H_ 1 +#define ASM_X86__DESC_H 1 #endif #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index aa89387006fe..505543a75a56 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -22,7 +22,7 @@ #define __NO_STUBS 1 #undef __SYSCALL -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #define __SYSCALL(nr, sym) [nr] = 1, static char syscalls[] = { #include diff --git a/arch/x86/kernel/syscall_64.c b/arch/x86/kernel/syscall_64.c index 170d43c17487..3d1be4f0fac5 100644 --- a/arch/x86/kernel/syscall_64.c +++ b/arch/x86/kernel/syscall_64.c @@ -8,12 +8,12 @@ #define __NO_STUBS #define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H #include #undef __SYSCALL #define __SYSCALL(nr, sym) [nr] = sym, -#undef _ASM_X86_64_UNISTD_H_ +#undef ASM_X86__UNISTD_64_H typedef void (*sys_call_ptr_t)(void); -- cgit v1.2.2 From a80495ec927e8ec2b1ff085592bbe9bed77ffb3b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 23 Jul 2008 17:33:57 +0530 Subject: x86: mm/init_XX.c declare functions before they get used included in mm/init_32.c for zap_low_mappings() declared free_initmem() in asm-x86/page_XX.h Signed-off-by: Jaswinder Singh --- arch/x86/mm/init_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..4974e97dedfe 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -47,6 +47,7 @@ #include #include #include +#include unsigned int __VMALLOC_RESERVE = 128 << 20; -- cgit v1.2.2 From 70ef56414ec7e01d787c8e959bb259845df4ee4f Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 23 Jul 2008 17:36:37 +0530 Subject: x86: mm/fault.c declare do_page_fault before they get used declared do_page_fault() in asm-x86/trap.h for both X86_32 and X86_64 removed do_invalid_op declaration from mm/fault.c as it is already declared in asm-x86/trap.h Signed-off-by: Jaswinder Singh --- arch/x86/mm/fault.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..8f92cac4e6db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -35,6 +35,7 @@ #include #include #include +#include /* * Page fault error code bits @@ -357,8 +358,6 @@ static int is_errata100(struct pt_regs *regs, unsigned long address) return 0; } -void do_invalid_op(struct pt_regs *, unsigned long); - static int is_f00f_bug(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_F00F_BUG -- cgit v1.2.2 From 4b6e9f27d0034740e9cfa341b45c229ba30ec0c5 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 23 Jul 2008 17:39:16 +0530 Subject: x86: mm/ioremap.c declare early_ioremap_debug and early_ioremap_nested as static Signed-off-by: Jaswinder Singh --- arch/x86/mm/ioremap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24c1d3c30186..19fd9a3c5210 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -413,7 +413,7 @@ void unxlate_dev_mem_ptr(unsigned long phys, void *addr) return; } -int __initdata early_ioremap_debug; +static int __initdata early_ioremap_debug; static int __init early_ioremap_debug_setup(char *str) { @@ -539,7 +539,7 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) } -int __initdata early_ioremap_nested; +static int __initdata early_ioremap_nested; static int __init check_early_ioremap_leak(void) { -- cgit v1.2.2 From 71e3b818431957371c7f69fa1c576d4a403c1478 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Wed, 23 Jul 2008 17:44:00 +0530 Subject: x86: mach-default/setup.c declare no_broadcast before they get used included mach_ipi.h for no_broadcast declaration fixed minor spacing for no_broadcast Signed-off-by: Jaswinder Singh --- arch/x86/mach-default/setup.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 3d317836be9e..3f2cf11f201a 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -10,13 +10,15 @@ #include #include +#include + #ifdef CONFIG_HOTPLUG_CPU #define DEFAULT_SEND_IPI (1) #else #define DEFAULT_SEND_IPI (0) #endif -int no_broadcast=DEFAULT_SEND_IPI; +int no_broadcast = DEFAULT_SEND_IPI; /** * pre_intr_init_hook - initialisation prior to setting up interrupt vectors -- cgit v1.2.2 From 05d3ed0a1fe3ea05ab9f3b8d32576a0bc2e19660 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 21 Jul 2008 10:15:22 -0400 Subject: x86, pci: iommu fix potential overflow in alloc_iommu() It is possible that alloc_iommu()'s boundary_size overflows as dma_get_seg_boundary can return 0xffffffff. In that case, further usage of boundary_size triggers a BUG_ON() in the iommu code. Signed-off-by: Prarit Bhargava Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d2..1062dc1e6396 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -93,7 +93,7 @@ static unsigned long alloc_iommu(struct device *dev, int size) base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; - boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, + boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; spin_lock_irqsave(&iommu_bitmap_lock, flags); -- cgit v1.2.2 From fd60b39f845aa7462453af8aed4f059b162f181f Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 23 Jul 2008 22:45:01 +0800 Subject: arch/x86/kernel/genx2apic_uv_x.c: Removed duplicated include Removed duplicated include file in arch/x86/kernel/genx2apic_uv_x.c. Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index a882bc3a2ce9..14a9772778e5 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From 15e8f348db372dec21229fda5d52ae6ee7e64666 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 20:41:12 -0700 Subject: x86_64: remove bogus optimization in sysret_signal This short-circuit path in sysret_signal looks wrong to me. AFAICT, in practice the branch is never taken--and if it were, it would go wrong. To wit, try loading a module whose init function does set_thread_flag(TIF_IRET), and see insmod crash (presumably with a wrong user stack pointer). This is because the FIXUP_TOP_OF_STACK work hasn't been done yet when we jump around the call to ptregscall_common and get to int_with_check--where it expects the user RSP,SS,CS and EFLAGS to have been stored by FIXUP_TOP_OF_STACK. I don't think it's normally possible to get to sysret_signal with no _TIF_DO_NOTIFY_MASK bits set anyway, so these two instructions are already superfluous. If it ever did happen, it is harmless to call do_notify_resume with nothing for it to do. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_64.S | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8410e26f4183..a169225869cc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -402,16 +402,12 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - testl $_TIF_DO_NOTIFY_MASK,%edx - jz 1f - - /* Really a signal */ /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 xorl %esi,%esi # oldset -> arg2 call ptregscall_common -1: movl $_TIF_WORK_MASK,%edi + movl $_TIF_WORK_MASK,%edi /* Use IRET because user could have changed frame. This works because ptregscall_common has called FIXUP_TOP_OF_STACK. */ DISABLE_INTERRUPTS(CLBR_NONE) -- cgit v1.2.2 From 86a1c34a929f30fde8ad01ea8245df61ddcf58b7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Mon, 23 Jun 2008 15:37:04 -0700 Subject: x86_64 syscall audit fast-path This adds a fast path for 64-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. This path does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_64.S | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a169225869cc..db7d34a89d2e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -53,6 +53,12 @@ #include #include +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_64BIT 0x80000000 +#define __AUDIT_ARCH_LE 0x40000000 + .code64 #ifdef CONFIG_FTRACE @@ -351,6 +357,7 @@ ENTRY(system_call_after_swapgs) GET_THREAD_INFO(%rcx) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) jnz tracesys +system_call_fastpath: cmpq $__NR_syscall_max,%rax ja badsys movq %r10,%rcx @@ -402,6 +409,10 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) +#ifdef CONFIG_AUDITSYSCALL + bt $TIF_SYSCALL_AUDIT,%edx + jc sysret_audit +#endif /* edx: work flags (arg3) */ leaq do_notify_resume(%rip),%rax leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1 @@ -418,8 +429,45 @@ badsys: movq $-ENOSYS,RAX-ARGOFFSET(%rsp) jmp ret_from_sys_call +#ifdef CONFIG_AUDITSYSCALL + /* + * Fast path for syscall audit without full syscall trace. + * We just call audit_syscall_entry() directly, and then + * jump back to the normal fast path. + */ +auditsys: + movq %r10,%r9 /* 6th arg: 4th syscall arg */ + movq %rdx,%r8 /* 5th arg: 3rd syscall arg */ + movq %rsi,%rcx /* 4th arg: 2nd syscall arg */ + movq %rdi,%rdx /* 3rd arg: 1st syscall arg */ + movq %rax,%rsi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_X86_64,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + LOAD_ARGS 0 /* reload call-clobbered registers */ + jmp system_call_fastpath + + /* + * Return fast path for syscall audit. Call audit_syscall_exit() + * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT + * masked off. + */ +sysret_audit: + movq %rax,%rsi /* second arg, syscall return value */ + cmpq $0,%rax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + jmp sysret_check +#endif /* CONFIG_AUDITSYSCALL */ + /* Do syscall tracing */ tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + jz auditsys +#endif SAVE_REST movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ FIXUP_TOP_OF_STACK %rdi -- cgit v1.2.2 From 5cbf1565f29eb57a86a305b08836613508e294d7 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 24 Jun 2008 01:13:31 -0700 Subject: x86_64 ia32 syscall audit fast-path This adds fast paths for 32-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. These paths does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/ia32/ia32entry.S | 91 ++++++++++++++++++++++++++++++++++++++++++++-- arch/x86/kernel/entry_64.S | 1 + 2 files changed, 88 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 23d146ce676b..021d71bc69b5 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -15,6 +15,16 @@ #include #include +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysexit_audit int_ret_from_sys_call +#define sysretl_audit int_ret_from_sys_call +#endif + #define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) .macro IA32_ARG_FIXUP noebp=0 @@ -148,13 +158,15 @@ ENTRY(ia32_sysenter_target) ja ia32_badsys sysenter_do_call: IA32_ARG_FIXUP 1 +sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz int_ret_from_sys_call + jnz sysexit_audit +sysexit_from_sys_call: andl $~TS_COMPAT,TI_status(%r10) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) @@ -170,9 +182,63 @@ sysenter_do_call: TRACE_IRQS_ON ENABLE_INTERRUPTS_SYSEXIT32 -sysenter_tracesys: +#ifdef CONFIG_AUDITSYSCALL + .macro auditsys_entry_common + movl %esi,%r9d /* 6th arg: 4th syscall arg */ + movl %edx,%r8d /* 5th arg: 3rd syscall arg */ + /* (already in %ecx) 4th arg: 2nd syscall arg */ + movl %ebx,%edx /* 3rd arg: 1st syscall arg */ + movl %eax,%esi /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%edi /* 1st arg: audit arch */ + call audit_syscall_entry + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + cmpl $(IA32_NR_syscalls-1),%eax + ja ia32_badsys + movl %ebx,%edi /* reload 1st syscall arg */ + movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + .endm + + .macro auditsys_exit exit + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jnz int_ret_from_sys_call + TRACE_IRQS_ON + sti + movl %eax,%esi /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%edi /* zero-extend that into %edi */ + inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + GET_THREAD_INFO(%r10) + movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ + movl RBP-ARGOFFSET(%rsp),%ebp /* reload user register value */ + movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi + cli + TRACE_IRQS_OFF + testl %edi,TI_flags(%r10) + jnz int_with_check + jmp \exit + .endm + +sysenter_auditsys: CFI_RESTORE_STATE + auditsys_entry_common + movl %ebp,%r9d /* reload 6th syscall arg */ + jmp sysenter_dispatch + +sysexit_audit: + auditsys_exit sysexit_from_sys_call +#endif + +sysenter_tracesys: xchgl %r9d,%ebp +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz sysenter_auditsys +#endif SAVE_REST CLEAR_RREGS movq %r9,R9(%rsp) @@ -252,13 +318,15 @@ cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys IA32_ARG_FIXUP 1 +cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK,TI_flags(%r10) - jnz int_ret_from_sys_call + jnz sysretl_audit +sysretl_from_sys_call: andl $~TS_COMPAT,TI_status(%r10) RESTORE_ARGS 1,-ARG_SKIP,1,1,1 movl RIP-ARGOFFSET(%rsp),%ecx @@ -270,8 +338,23 @@ cstar_do_call: CFI_RESTORE rsp USERGS_SYSRET32 -cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL +cstar_auditsys: CFI_RESTORE_STATE + movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + auditsys_entry_common + movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + jmp cstar_dispatch + +sysretl_audit: + auditsys_exit sysretl_from_sys_call +#endif + +cstar_tracesys: +#ifdef CONFIG_AUDITSYSCALL + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + jz cstar_auditsys +#endif xchgl %r9d,%ebp SAVE_REST CLEAR_RREGS diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index db7d34a89d2e..89434d439605 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -492,6 +492,7 @@ tracesys: * Has correct top of stack, but partial stack frame. */ .globl int_ret_from_sys_call + .globl int_with_check int_ret_from_sys_call: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF -- cgit v1.2.2 From af0575bba0f46dd9054d46e0a88c57afad3bf4d2 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 24 Jun 2008 04:16:52 -0700 Subject: i386 syscall audit fast-path This adds fast paths for 32-bit syscall entry and exit when TIF_SYSCALL_AUDIT is set, but no other kind of syscall tracing. These paths does not need to save and restore all registers as the general case of tracing does. Avoiding the iret return path when syscall audit is enabled helps performance a lot. Signed-off-by: Roland McGrath --- arch/x86/kernel/entry_32.S | 55 ++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 53 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index cdfd94cc6b14..109792bc7cfa 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -54,6 +54,16 @@ #include #include +/* Avoid __ASSEMBLER__'ifying just for this. */ +#include +#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) +#define __AUDIT_ARCH_LE 0x40000000 + +#ifndef CONFIG_AUDITSYSCALL +#define sysenter_audit syscall_trace_entry +#define sysexit_audit syscall_exit_work +#endif + /* * We use macros for low-level operations which need to be overridden * for paravirtualization. The following will never clobber any registers: @@ -333,7 +343,8 @@ sysenter_past_esp: /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ testw $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%ebp) - jnz syscall_trace_entry + jnz sysenter_audit +sysenter_do_call: cmpl $(nr_syscalls), %eax jae syscall_badsys call *sys_call_table(,%eax,4) @@ -343,7 +354,8 @@ sysenter_past_esp: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testw $_TIF_ALLWORK_MASK, %cx - jne syscall_exit_work + jne sysexit_audit +sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx movl PT_OLDESP(%esp), %ecx @@ -351,6 +363,45 @@ sysenter_past_esp: TRACE_IRQS_ON 1: mov PT_FS(%esp), %fs ENABLE_INTERRUPTS_SYSEXIT + +#ifdef CONFIG_AUDITSYSCALL +sysenter_audit: + testw $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%ebp) + jnz syscall_trace_entry + addl $4,%esp + CFI_ADJUST_CFA_OFFSET -4 + /* %esi already in 8(%esp) 6th arg: 4th syscall arg */ + /* %edx already in 4(%esp) 5th arg: 3rd syscall arg */ + /* %ecx already in 0(%esp) 4th arg: 2nd syscall arg */ + movl %ebx,%ecx /* 3rd arg: 1st syscall arg */ + movl %eax,%edx /* 2nd arg: syscall number */ + movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ + call audit_syscall_entry + pushl %ebx + CFI_ADJUST_CFA_OFFSET 4 + movl PT_EAX(%esp),%eax /* reload syscall number */ + jmp sysenter_do_call + +sysexit_audit: + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_ANY) + movl %eax,%edx /* second arg, syscall return value */ + cmpl $0,%eax /* is it < 0? */ + setl %al /* 1 if so, 0 if not */ + movzbl %al,%eax /* zero-extend that */ + inc %eax /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ + call audit_syscall_exit + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %cx + jne syscall_exit_work + movl PT_EAX(%esp),%eax /* reload syscall return value */ + jmp sysenter_exit +#endif + CFI_ENDPROC .pushsection .fixup,"ax" 2: movl $0,PT_FS(%esp) -- cgit v1.2.2 From 9e882c9282512cc622752f29ec7c29ce338fc1eb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 16:49:54 -0700 Subject: x86: call early_cpu_init at the same point Call early_cpu_init() at the same (early) point in setup_arch(). The x86_64 code was calling it relatively late, after when other arch code need to do cpu-related setup which depends on it. Signed-off-by: Jeremy Fitzhardinge Cc: Mark McLoughlin Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4aacb9f52e3..b520dae02bf4 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -597,11 +597,11 @@ void __init setup_arch(char **cmdline_p) memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); pre_setup_arch_hook(); - early_cpu_init(); #else printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + early_cpu_init(); early_ioremap_init(); ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); @@ -665,9 +665,6 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; -#ifdef CONFIG_X86_64 - early_cpu_init(); -#endif strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -- cgit v1.2.2 From 2dc1697eb355c34f9f7bcbbb83f490de248c360a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 21 Jul 2008 16:49:58 -0700 Subject: xen: don't use sysret for sysexit32 When implementing sysexit32, don't let Xen use sysret to return to userspace. That results in usermode register state being trashed. Signed-off-by: Jeremy Fitzhardinge Cc: Mark McLoughlin Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/xen-asm_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 4038cbfe3331..7f58304fafb3 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -173,7 +173,7 @@ ENTRY(xen_sysexit) pushq $__USER32_CS pushq %rdx - pushq $VGCF_in_syscall + pushq $0 1: jmp hypercall_iret ENDPATCH(xen_sysexit) RELOC(xen_sysexit, 1b+1) -- cgit v1.2.2 From 38ffbe66d59051fd9cfcfc8545f164700e2fa3bc Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 14:21:18 -0700 Subject: x86/paravirt/xen: properly fill out the ldt ops LTP testing showed that Xen does not properly implement sys_modify_ldt(). This patch does the final little bits needed to make the ldt work properly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/ldt.c | 9 ++++++++- arch/x86/kernel/paravirt.c | 4 ++++ arch/x86/xen/enlighten.c | 23 +++++++++++++++++++++++ 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 3fee2aa50f3f..4895e0634d22 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -51,6 +51,8 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) memset(newldt + oldsize * LDT_ENTRY_SIZE, 0, (mincount - oldsize) * LDT_ENTRY_SIZE); + paravirt_alloc_ldt(newldt, mincount); + #ifdef CONFIG_X86_64 /* CHECKME: Do we really need this ? */ wmb(); @@ -75,6 +77,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) #endif } if (oldsize) { + paravirt_free_ldt(oldldt, oldsize); if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(oldldt); else @@ -86,10 +89,13 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) static inline int copy_ldt(mm_context_t *new, mm_context_t *old) { int err = alloc_ldt(new, old->size, 0); + int i; if (err < 0) return err; - memcpy(new->ldt, old->ldt, old->size * LDT_ENTRY_SIZE); + + for(i = 0; i < old->size; i++) + write_ldt_entry(new->ldt, i, old->ldt + i * LDT_ENTRY_SIZE); return 0; } @@ -126,6 +132,7 @@ void destroy_context(struct mm_struct *mm) if (mm == current->active_mm) clear_LDT(); #endif + paravirt_free_ldt(mm->context.ldt, mm->context.size); if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(mm->context.ldt); else diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..d8f2277be5a0 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -348,6 +348,10 @@ struct pv_cpu_ops pv_cpu_ops = { .write_ldt_entry = native_write_ldt_entry, .write_gdt_entry = native_write_gdt_entry, .write_idt_entry = native_write_idt_entry, + + .alloc_ldt = paravirt_nop, + .free_ldt = paravirt_nop, + .load_sp0 = native_load_sp0, #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..06219e60e9c8 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,6 +325,26 @@ static unsigned long xen_store_tr(void) return 0; } +static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readonly(v + i); +} + +static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) +{ + unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); + void *v = ldt; + int i; + + for(i = 0; i < pages; i += PAGE_SIZE) + make_lowmem_page_readwrite(v + i); +} + static void xen_set_ldt(const void *addr, unsigned entries) { struct mmuext_op *op; @@ -1220,6 +1240,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .load_gs_index = xen_load_gs_index, #endif + .alloc_ldt = xen_alloc_ldt, + .free_ldt = xen_free_ldt, + .store_gdt = native_store_gdt, .store_idt = native_store_idt, .store_tr = xen_store_tr, -- cgit v1.2.2 From d5de8841355a48f7f634a04507185eaf1f9755e3 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 23 Jul 2008 13:28:58 -0700 Subject: x86: split spinlock implementations out into their own files ftrace requires certain low-level code, like spinlocks and timestamps, to be compiled without -pg in order to avoid infinite recursion. This patch splits out the core paravirt spinlocks and the Xen spinlocks into separate files which can be compiled without -pg. Also do xen/time.c while we're about it. As a result, we can now use ftrace within a Xen domain. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/paravirt-spinlocks.c | 31 ++++++ arch/x86/kernel/paravirt.c | 23 ----- arch/x86/xen/Makefile | 8 +- arch/x86/xen/smp.c | 167 -------------------------------- arch/x86/xen/spinlock.c | 183 +++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-ops.h | 3 + 7 files changed, 226 insertions(+), 193 deletions(-) create mode 100644 arch/x86/kernel/paravirt-spinlocks.c create mode 100644 arch/x86/xen/spinlock.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec5..d679cb2c79b4 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -10,7 +10,7 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg -CFLAGS_REMOVE_paravirt.o = -pg +CFLAGS_REMOVE_paravirt-spinlocks.o = -pg endif # @@ -89,7 +89,7 @@ obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o obj-$(CONFIG_KVM_GUEST) += kvm.o obj-$(CONFIG_KVM_CLOCK) += kvmclock.o -obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o +obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o paravirt-spinlocks.o obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c new file mode 100644 index 000000000000..38d7f7f1dbc9 --- /dev/null +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -0,0 +1,31 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +struct pv_lock_ops pv_lock_ops = { +#ifdef CONFIG_SMP + .spin_is_locked = __ticket_spin_is_locked, + .spin_is_contended = __ticket_spin_is_contended, + + .spin_lock = __ticket_spin_lock, + .spin_trylock = __ticket_spin_trylock, + .spin_unlock = __ticket_spin_unlock, +#endif +}; +EXPORT_SYMBOL_GPL(pv_lock_ops); + +void __init paravirt_use_bytelocks(void) +{ +#ifdef CONFIG_SMP + pv_lock_ops.spin_is_locked = __byte_spin_is_locked; + pv_lock_ops.spin_is_contended = __byte_spin_is_contended; + pv_lock_ops.spin_lock = __byte_spin_lock; + pv_lock_ops.spin_trylock = __byte_spin_trylock; + pv_lock_ops.spin_unlock = __byte_spin_unlock; +#endif +} diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..0d71de9ff56d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -268,17 +268,6 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) return __get_cpu_var(paravirt_lazy_mode); } -void __init paravirt_use_bytelocks(void) -{ -#ifdef CONFIG_SMP - pv_lock_ops.spin_is_locked = __byte_spin_is_locked; - pv_lock_ops.spin_is_contended = __byte_spin_is_contended; - pv_lock_ops.spin_lock = __byte_spin_lock; - pv_lock_ops.spin_trylock = __byte_spin_trylock; - pv_lock_ops.spin_unlock = __byte_spin_unlock; -#endif -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, @@ -461,18 +450,6 @@ struct pv_mmu_ops pv_mmu_ops = { .set_fixmap = native_set_fixmap, }; -struct pv_lock_ops pv_lock_ops = { -#ifdef CONFIG_SMP - .spin_is_locked = __ticket_spin_is_locked, - .spin_is_contended = __ticket_spin_is_contended, - - .spin_lock = __ticket_spin_lock, - .spin_trylock = __ticket_spin_trylock, - .spin_unlock = __ticket_spin_unlock, -#endif -}; -EXPORT_SYMBOL_GPL(pv_lock_ops); - EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); EXPORT_SYMBOL (pv_mmu_ops); diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 59c1e539aed2..5bfee243cf9a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -1,4 +1,10 @@ +ifdef CONFIG_FTRACE +# Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_spinlock.o = -pg +CFLAGS_REMOVE_time.o = -pg +endif + obj-y := enlighten.o setup.o multicalls.o mmu.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index d8faf79a0a1d..baca7f2fbd8a 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -15,7 +15,6 @@ * This does not handle HOTPLUG_CPU yet. */ #include -#include #include #include @@ -36,8 +35,6 @@ #include "xen-ops.h" #include "mmu.h" -static void __cpuinit xen_init_lock_cpu(int cpu); - cpumask_t xen_cpu_initialized_map; static DEFINE_PER_CPU(int, resched_irq); @@ -419,170 +416,6 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -struct xen_spinlock { - unsigned char lock; /* 0 -> free; 1 -> locked */ - unsigned short spinners; /* count of waiting cpus */ -}; - -static int xen_spin_is_locked(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - return xl->lock != 0; -} - -static int xen_spin_is_contended(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - /* Not strictly true; this is only the count of contended - lock-takers entering the slow path. */ - return xl->spinners != 0; -} - -static int xen_spin_trylock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - u8 old = 1; - - asm("xchgb %b0,%1" - : "+q" (old), "+m" (xl->lock) : : "memory"); - - return old == 0; -} - -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); - -static inline void spinning_lock(struct xen_spinlock *xl) -{ - __get_cpu_var(lock_spinners) = xl; - wmb(); /* set lock of interest before count */ - asm(LOCK_PREFIX " incw %0" - : "+m" (xl->spinners) : : "memory"); -} - -static inline void unspinning_lock(struct xen_spinlock *xl) -{ - asm(LOCK_PREFIX " decw %0" - : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; -} - -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int irq = __get_cpu_var(lock_kicker_irq); - int ret; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return 0; - - /* announce we're spinning */ - spinning_lock(xl); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; - - /* block until irq becomes pending */ - xen_poll_irq(irq); - kstat_this_cpu.irqs[irq]++; - -out: - unspinning_lock(xl); - return ret; -} - -static void xen_spin_lock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; - u8 oldval; - - do { - timeout = 1 << 10; - - asm("1: xchgb %1,%0\n" - " testb %1,%1\n" - " jz 3f\n" - "2: rep;nop\n" - " cmpb $0,%0\n" - " je 1b\n" - " dec %2\n" - " jnz 2b\n" - "3:\n" - : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) - : "1" (1) - : "memory"); - - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); -} - -static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) -{ - int cpu; - - for_each_online_cpu(cpu) { - /* XXX should mix up next cpu selection */ - if (per_cpu(lock_spinners, cpu) == xl) { - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} - -static void xen_spin_unlock(struct raw_spinlock *lock) -{ - struct xen_spinlock *xl = (struct xen_spinlock *)lock; - - smp_wmb(); /* make sure no writes get moved after unlock */ - xl->lock = 0; /* release lock */ - - /* make sure unlock happens before kick */ - barrier(); - - if (unlikely(xl->spinners)) - xen_spin_unlock_slow(xl); -} - -static __cpuinit void xen_init_lock_cpu(int cpu) -{ - int irq; - const char *name; - - name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); - irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, - cpu, - xen_reschedule_interrupt, - IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, - name, - NULL); - - if (irq >= 0) { - disable_irq(irq); /* make sure it's never delivered */ - per_cpu(lock_kicker_irq, cpu) = irq; - } - - printk("cpu %d spinlock event irq %d\n", cpu, irq); -} - -static void __init xen_init_spinlocks(void) -{ - pv_lock_ops.spin_is_locked = xen_spin_is_locked; - pv_lock_ops.spin_is_contended = xen_spin_is_contended; - pv_lock_ops.spin_lock = xen_spin_lock; - pv_lock_ops.spin_trylock = xen_spin_trylock; - pv_lock_ops.spin_unlock = xen_spin_unlock; -} - static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c new file mode 100644 index 000000000000..8dc4d31da67f --- /dev/null +++ b/arch/x86/xen/spinlock.c @@ -0,0 +1,183 @@ +/* + * Split spinlock implementation out into its own file, so it can be + * compiled in a FTRACE-compatible way. + */ +#include +#include + +#include + +#include +#include + +#include "xen-ops.h" + +struct xen_spinlock { + unsigned char lock; /* 0 -> free; 1 -> locked */ + unsigned short spinners; /* count of waiting cpus */ +}; + +static int xen_spin_is_locked(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + return xl->lock != 0; +} + +static int xen_spin_is_contended(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + /* Not strictly true; this is only the count of contended + lock-takers entering the slow path. */ + return xl->spinners != 0; +} + +static int xen_spin_trylock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + u8 old = 1; + + asm("xchgb %b0,%1" + : "+q" (old), "+m" (xl->lock) : : "memory"); + + return old == 0; +} + +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); + +static inline void spinning_lock(struct xen_spinlock *xl) +{ + __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" + : "+m" (xl->spinners) : : "memory"); +} + +static inline void unspinning_lock(struct xen_spinlock *xl) +{ + asm(LOCK_PREFIX " decw %0" + : "+m" (xl->spinners) : : "memory"); + wmb(); /* decrement count before clearing lock */ + __get_cpu_var(lock_spinners) = NULL; +} + +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int irq = __get_cpu_var(lock_kicker_irq); + int ret; + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return 0; + + /* announce we're spinning */ + spinning_lock(xl); + + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) + goto out; + + /* block until irq becomes pending */ + xen_poll_irq(irq); + kstat_this_cpu.irqs[irq]++; + +out: + unspinning_lock(xl); + return ret; +} + +static void xen_spin_lock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + int timeout; + u8 oldval; + + do { + timeout = 1 << 10; + + asm("1: xchgb %1,%0\n" + " testb %1,%1\n" + " jz 3f\n" + "2: rep;nop\n" + " cmpb $0,%0\n" + " je 1b\n" + " dec %2\n" + " jnz 2b\n" + "3:\n" + : "+m" (xl->lock), "=q" (oldval), "+r" (timeout) + : "1" (1) + : "memory"); + + } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); +} + +static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) +{ + int cpu; + + for_each_online_cpu(cpu) { + /* XXX should mix up next cpu selection */ + if (per_cpu(lock_spinners, cpu) == xl) { + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); + break; + } + } +} + +static void xen_spin_unlock(struct raw_spinlock *lock) +{ + struct xen_spinlock *xl = (struct xen_spinlock *)lock; + + smp_wmb(); /* make sure no writes get moved after unlock */ + xl->lock = 0; /* release lock */ + + /* make sure unlock happens before kick */ + barrier(); + + if (unlikely(xl->spinners)) + xen_spin_unlock_slow(xl); +} + +static irqreturn_t dummy_handler(int irq, void *dev_id) +{ + BUG(); + return IRQ_HANDLED; +} + +void __cpuinit xen_init_lock_cpu(int cpu) +{ + int irq; + const char *name; + + name = kasprintf(GFP_KERNEL, "spinlock%d", cpu); + irq = bind_ipi_to_irqhandler(XEN_SPIN_UNLOCK_VECTOR, + cpu, + dummy_handler, + IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, + name, + NULL); + + if (irq >= 0) { + disable_irq(irq); /* make sure it's never delivered */ + per_cpu(lock_kicker_irq, cpu) = irq; + } + + printk("cpu %d spinlock event irq %d\n", cpu, irq); +} + +void __init xen_init_spinlocks(void) +{ + pv_lock_ops.spin_is_locked = xen_spin_is_locked; + pv_lock_ops.spin_is_contended = xen_spin_is_contended; + pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_trylock = xen_spin_trylock; + pv_lock_ops.spin_unlock = xen_spin_unlock; +} diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index dd3c23152a2e..8847fb34f17e 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -50,6 +50,9 @@ void __init xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP void xen_smp_init(void); +void __init xen_init_spinlocks(void); +__cpuinit void xen_init_lock_cpu(int cpu); + extern cpumask_t xen_cpu_initialized_map; #else static inline void xen_smp_init(void) {} -- cgit v1.2.2 From e0a5a5d9b006fd441e61685a051fa85d52fb172c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 22 Jul 2008 18:14:16 +0200 Subject: x86, 64-bit, dwarf2: push pushes 8 bytes and popf pops 8 The CFI_ADJUST_CFA_OFFSET dwarf2 annotation of a push/popf pair in ret_from_fork wrongly used a value of 4. It should have been 8. Fix that. Signed-off-by: Alexander van Heukelum Cc: Andi Kleen Cc: heukelum@fastmail.fm Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 89434d439605..cf3a0b2d0059 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -275,9 +275,9 @@ ENTRY(native_usergs_sysret64) ENTRY(ret_from_fork) CFI_DEFAULT_STACK push kernel_eflags(%rip) - CFI_ADJUST_CFA_OFFSET 4 + CFI_ADJUST_CFA_OFFSET 8 popf # reset kernel eflags - CFI_ADJUST_CFA_OFFSET -4 + CFI_ADJUST_CFA_OFFSET -8 call schedule_tail GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT),TI_flags(%rcx) -- cgit v1.2.2 From 32f71aff77b6470d272f80ac28f43f9601c4d140 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 21 Jul 2008 00:52:49 +0100 Subject: x86: PIC, L-APIC and I/O APIC debug information Dump all the PIC, local APIC and I/O APIC information at the fs_initcall() level, which is after ACPI (if used) has initialised PCI information, making the point of invocation consistent across MP-table and ACPI platforms. Remove explicit calls to print_IO_APIC() from elsewhere. Make the interface of all the functions involved consistent between 32-bit and 64-bit versions and make them all static by default by the means of a New-and-Improved(TM) __apicdebuginit() macro. Note that like print_IO_APIC() all these only output anything if "apic=debug" has been passed to the kernel through the command line. Signed-off-by: Maciej W. Rozycki Cc: Chuck Ebbert Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 29 +++++++++++++++++++---------- arch/x86/kernel/io_apic_64.c | 31 +++++++++++++++++++------------ arch/x86/pci/acpi.c | 5 ----- 3 files changed, 38 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index de9aa0e3a9c5..d54455ec9850 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -50,6 +50,8 @@ #include #include +#define __apicdebuginit(type) static type __init + int (*ioapic_renumber_irq)(int ioapic, int irq); atomic_t irq_mis_count; @@ -1345,7 +1347,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, ioapic_write_entry(apic, pin, entry); } -void __init print_IO_APIC(void) + +__apicdebuginit(void) print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; @@ -1460,9 +1463,7 @@ void __init print_IO_APIC(void) return; } -#if 0 - -static void print_APIC_bitfield(int base) +__apicdebuginit(void) print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1483,7 +1484,7 @@ static void print_APIC_bitfield(int base) } } -void /*__init*/ print_local_APIC(void *dummy) +__apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; @@ -1567,12 +1568,12 @@ void /*__init*/ print_local_APIC(void *dummy) printk("\n"); } -void print_all_local_APICs(void) +__apicdebuginit(void) print_all_local_APICs(void) { on_each_cpu(print_local_APIC, NULL, 1); } -void /*__init*/ print_PIC(void) +__apicdebuginit(void) print_PIC(void) { unsigned int v; unsigned long flags; @@ -1604,7 +1605,17 @@ void /*__init*/ print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -#endif /* 0 */ +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + static void __init enable_IO_APIC(void) { @@ -2333,8 +2344,6 @@ void __init setup_IO_APIC(void) setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); } /* diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8269434d1707..8cdcc4f287cc 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -53,6 +53,8 @@ #include #include +#define __apicdebuginit(type) static type __init + struct irq_cfg { cpumask_t domain; cpumask_t old_domain; @@ -87,8 +89,6 @@ int first_system_vector = 0xfe; char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; -#define __apicdebuginit __init - int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; @@ -965,7 +965,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, ioapic_write_entry(apic, pin, entry); } -void __apicdebuginit print_IO_APIC(void) + +__apicdebuginit(void) print_IO_APIC(void) { int apic, i; union IO_APIC_reg_00 reg_00; @@ -1059,9 +1060,7 @@ void __apicdebuginit print_IO_APIC(void) return; } -#if 0 - -static __apicdebuginit void print_APIC_bitfield (int base) +__apicdebuginit(void) print_APIC_bitfield(int base) { unsigned int v; int i, j; @@ -1082,7 +1081,7 @@ static __apicdebuginit void print_APIC_bitfield (int base) } } -void __apicdebuginit print_local_APIC(void * dummy) +__apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; @@ -1159,12 +1158,12 @@ void __apicdebuginit print_local_APIC(void * dummy) printk("\n"); } -void print_all_local_APICs (void) +__apicdebuginit(void) print_all_local_APICs(void) { on_each_cpu(print_local_APIC, NULL, 1); } -void __apicdebuginit print_PIC(void) +__apicdebuginit(void) print_PIC(void) { unsigned int v; unsigned long flags; @@ -1196,7 +1195,17 @@ void __apicdebuginit print_PIC(void) printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); } -#endif /* 0 */ +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + void __init enable_IO_APIC(void) { @@ -1849,8 +1858,6 @@ void __init setup_IO_APIC(void) setup_IO_APIC_irqs(); init_IO_APIC_traps(); check_timer(); - if (!acpi_ioapic) - print_IO_APIC(); } struct sysfs_ioapic_data { diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 19af06927fbc..1d88d2b39771 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -250,10 +250,5 @@ int __init pci_acpi_init(void) acpi_pci_irq_enable(dev); } -#ifdef CONFIG_X86_IO_APIC - if (acpi_ioapic) - print_IO_APIC(); -#endif - return 0; } -- cgit v1.2.2 From 9d25d4db81833029d30b7b03cc1000cbbe09e192 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Jul 2008 18:41:26 +0100 Subject: x86: BUILD_IRQ say .text to avoid .data.percpu When I edit the x86_64 Makefile to -fno-unit-at-a-time, bootup panics on 0xCCs in IRQ0x3e_interrupt(): IRQ0x20_interrupt etc. have got linked into .data.percpu. Perhaps there are other ways of triggering that: specify ".text" in the BUILD_IRQ() macro for safety. I've been using -fno-unit-at-a-time (to lessen inlining, for easier debugging) for a long time. Signed-off-by: Hugh Dickins Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0373e88de95a..9414125f19ce 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -43,7 +43,7 @@ #define BUILD_IRQ(nr) \ asmlinkage void IRQ_NAME(nr); \ - asm("\n.p2align\n" \ + asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ "jmp common_interrupt"); -- cgit v1.2.2 From 510b37258dfd61693ca6c039865c78bd996e3718 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 22 Jul 2008 13:20:22 -0700 Subject: x86: move declaring x2apic_extra_bits it is for uv only Signed-off-by: Yinghai Lu Cc: Jack Steiner Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_phys.c | 2 -- arch/x86/kernel/genx2apic_uv_x.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index f35a3bf3a9e5..3229c68aedd4 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -10,8 +10,6 @@ #include #include -DEFINE_PER_CPU(int, x2apic_extra_bits); - static int x2apic_phys; static int set_x2apic_phys_mode(char *arg) diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 14a9772778e5..169388c4cce9 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -26,6 +26,8 @@ #include #include +DEFINE_PER_CPU(int, x2apic_extra_bits); + static enum uv_system_type uv_system_type; static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) -- cgit v1.2.2 From 36a028de785c6e6f15ac84f8b3b087b89137ea26 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 24 Jul 2008 13:52:28 +0200 Subject: x86: apic unification - merge down lapic_get_maxlvt No code change on binary level. Signed-off-by: Cyrill Gorcunov Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 8 ++++++-- arch/x86/kernel/apic_64.c | 9 ++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d6c898358371..447dd8c5c0e9 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -193,9 +193,13 @@ int get_physical_broadcast(void) */ int lapic_get_maxlvt(void) { - unsigned int v = apic_read(APIC_LVR); + unsigned int v; - /* 82489DXs do not report # of LVT entries. */ + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 7f1f030da7ee..4fa2a8620c26 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -158,11 +158,14 @@ void __cpuinit enable_NMI_through_LVT0(void) */ int lapic_get_maxlvt(void) { - unsigned int v, maxlvt; + unsigned int v; v = apic_read(APIC_LVR); - maxlvt = GET_APIC_MAXLVT(v); - return maxlvt; + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } /* -- cgit v1.2.2 From d4c63ec060f3315653c0ae5bc3a7fe2419a2282f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 24 Jul 2008 13:52:29 +0200 Subject: x86: apic unification - merge down enable_NMI_through_LVT0 No code change on binary level. Signed-off-by: Cyrill Gorcunov Cc: macro@linux-mips.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 8 ++++++-- arch/x86/kernel/apic_64.c | 5 +++++ 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 447dd8c5c0e9..01708f128eee 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -172,11 +172,15 @@ u32 safe_apic_wait_icr_idle(void) */ void __cpuinit enable_NMI_through_LVT0(void) { - unsigned int v = APIC_DM_NMI; + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; - /* Level triggered for 82489DX */ + /* Level triggered for 82489DX (32bit mode) */ if (!lapic_is_integrated()) v |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT0, v); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4fa2a8620c26..7615b4b9c3f3 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -150,6 +150,11 @@ void __cpuinit enable_NMI_through_LVT0(void) /* unmask and set to NMI */ v = APIC_DM_NMI; + + /* Level triggered for 82489DX (32bit mode) */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT0, v); } -- cgit v1.2.2 From b61bfa3c462671c48a51fb5c31af337c5a996a04 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 23 Jul 2008 21:26:55 -0700 Subject: mm: move bootmem descriptors definition to a single place There are a lot of places that define either a single bootmem descriptor or an array of them. Use only one central array with MAX_NUMNODES items instead. Signed-off-by: Johannes Weiner Acked-by: Ralf Baechle Cc: Ingo Molnar Cc: Richard Henderson Cc: Russell King Cc: Tony Luck Cc: Hirokazu Takata Cc: Geert Uytterhoeven Cc: Kyle McMartin Cc: Paul Mackerras Cc: Paul Mundt Cc: David S. Miller Cc: Yinghai Lu Cc: Christoph Lameter Cc: Mel Gorman Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/discontig_32.c | 3 +-- arch/x86/mm/numa_64.c | 4 +--- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 5dfef9fa061a..62fa440678d8 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -42,7 +42,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t node0_bdata; /* * numa interface - we expect the numa architecture specific code to have @@ -385,7 +384,7 @@ void __init initmem_init(unsigned long start_pfn, for_each_online_node(nid) memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - NODE_DATA(0)->bdata = &node0_bdata; + NODE_DATA(0)->bdata = &bootmem_node_data[0]; setup_bootmem_allocator(); } diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index 9782f42dd319..a4dd793d6003 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -23,8 +23,6 @@ struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; EXPORT_SYMBOL(node_data); -static bootmem_data_t plat_node_bdata[MAX_NUMNODES]; - struct memnode memnode; s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = { @@ -198,7 +196,7 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, nodedata_phys + pgdat_size - 1); memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); - NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid]; + NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; NODE_DATA(nodeid)->node_start_pfn = start_pfn; NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; -- cgit v1.2.2 From 28b2ee20c7cba812b6f2ccf6d722cf86d00a84dc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 23 Jul 2008 21:27:05 -0700 Subject: access_process_vm device memory infrastructure In order to be able to debug things like the X server and programs using the PPC Cell SPUs, the debugger needs to be able to access device memory through ptrace and /proc/pid/mem. This patch: Add the generic_access_phys access function and put the hooks in place to allow access_process_vm to access device or PPC Cell SPU memory. [riel@redhat.com: Add documentation for the vm_ops->access function] Signed-off-by: Rik van Riel Signed-off-by: Benjamin Herrensmidt Cc: Dave Airlie Cc: Hugh Dickins Cc: Paul Mackerras Cc: Arnd Bergmann Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/ioremap.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 03980cb04291..b2ddfcf01728 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -21,6 +21,7 @@ config X86 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE + select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES select HAVE_DYNAMIC_FTRACE diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 24c1d3c30186..016f335bbeea 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -330,6 +330,14 @@ static void __iomem *ioremap_default(resource_size_t phys_addr, return (void __iomem *)ret; } +void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, + unsigned long prot_val) +{ + return __ioremap_caller(phys_addr, size, (prot_val & _PAGE_CACHE_MASK), + __builtin_return_address(0)); +} +EXPORT_SYMBOL(ioremap_prot); + /** * iounmap - Free a IO remapping * @addr: virtual address from ioremap_* -- cgit v1.2.2 From 7ae8ed5053a39082d224a3f48409e016baca9c16 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 23 Jul 2008 21:27:07 -0700 Subject: use generic_access_phys for /dev/mem mappings Use generic_access_phys as the access_process_vm access function for /dev/mem mappings. This makes it possible to debug the X server. [akpm@linux-foundation.org: repair all the architectures which broke] Signed-off-by: Rik van Riel Cc: Benjamin Herrensmidt Cc: Dave Airlie Cc: Hugh Dickins Cc: Paul Mackerras Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/pci/i386.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 2aafb67dc5f1..a09505806b82 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -280,6 +280,7 @@ static void pci_track_mmap_page_range(struct vm_area_struct *vma) static struct vm_operations_struct pci_mmap_ops = { .open = pci_track_mmap_page_range, .close = pci_unmap_page_range, + .access = generic_access_phys, }; int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, -- cgit v1.2.2 From a5516438959d90b071ff0a484ce4f3f523dc3152 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:41 -0700 Subject: hugetlb: modular state for hugetlb page size The goal of this patchset is to support multiple hugetlb page sizes. This is achieved by introducing a new struct hstate structure, which encapsulates the important hugetlb state and constants (eg. huge page size, number of huge pages currently allocated, etc). The hstate structure is then passed around the code which requires these fields, they will do the right thing regardless of the exact hstate they are operating on. This patch adds the hstate structure, with a single global instance of it (default_hstate), and does the basic work of converting hugetlb to use the hstate. Future patches will add more hstate structures to allow for different hugetlbfs mounts to have different page sizes. [akpm@linux-foundation.org: coding-style fixes] Acked-by: Adam Litke Acked-by: Nishanth Aravamudan Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 0b3d567e686d..52476fde8996 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -124,7 +124,8 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep) return 1; } -pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) +pte_t *huge_pte_alloc(struct mm_struct *mm, + unsigned long addr, unsigned long sz) { pgd_t *pgd; pud_t *pud; @@ -368,7 +369,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; if (flags & MAP_FIXED) { - if (prepare_hugepage_range(addr, len)) + if (prepare_hugepage_range(file, addr, len)) return -EINVAL; return addr; } -- cgit v1.2.2 From ceb868796181dc95ea01a110e123afd391639873 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:50 -0700 Subject: hugetlb: introduce pud_huge Straight forward extensions for huge pages located in the PUD instead of PMDs. Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 52476fde8996..a4789e87a315 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -189,6 +189,11 @@ int pmd_huge(pmd_t pmd) return 0; } +int pud_huge(pud_t pud) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -209,6 +214,11 @@ int pmd_huge(pmd_t pmd) return !!(pmd_val(pmd) & _PAGE_PSE); } +int pud_huge(pud_t pud) +{ + return 0; +} + struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) @@ -217,9 +227,22 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, page = pte_page(*(pte_t *)pmd); if (page) - page += ((address & ~HPAGE_MASK) >> PAGE_SHIFT); + page += ((address & ~PMD_MASK) >> PAGE_SHIFT); return page; } + +struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + struct page *page; + + page = pte_page(*(pte_t *)pud); + if (page) + page += ((address & ~PUD_MASK) >> PAGE_SHIFT); + return page; +} + #endif /* x86_64 also uses this file */ -- cgit v1.2.2 From 39c11e6c05b7fedbf7ed4df3908b25f622d56204 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:50 -0700 Subject: x86: support GB hugepages on 64-bit Acked-by: Adam Litke Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index a4789e87a315..b7a65a07af03 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -134,9 +134,14 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (pud) { - if (pud_none(*pud)) - huge_pmd_share(mm, addr, pud); - pte = (pte_t *) pmd_alloc(mm, pud, addr); + if (sz == PUD_SIZE) { + pte = (pte_t *)pud; + } else { + BUG_ON(sz != PMD_SIZE); + if (pud_none(*pud)) + huge_pmd_share(mm, addr, pud); + pte = (pte_t *) pmd_alloc(mm, pud, addr); + } } BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte)); @@ -152,8 +157,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) pgd = pgd_offset(mm, addr); if (pgd_present(*pgd)) { pud = pud_offset(pgd, addr); - if (pud_present(*pud)) + if (pud_present(*pud)) { + if (pud_large(*pud)) + return (pte_t *)pud; pmd = pmd_offset(pud, addr); + } } return (pte_t *) pmd; } @@ -216,7 +224,7 @@ int pmd_huge(pmd_t pmd) int pud_huge(pud_t pud) { - return 0; + return !!(pud_val(pud) & _PAGE_PSE); } struct page * @@ -252,6 +260,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long start_addr; @@ -264,7 +273,7 @@ static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, } full_search: - addr = ALIGN(start_addr, HPAGE_SIZE); + addr = ALIGN(start_addr, huge_page_size(h)); for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ @@ -286,7 +295,7 @@ full_search: } if (addr + mm->cached_hole_size < vma->vm_start) mm->cached_hole_size = vma->vm_start - addr; - addr = ALIGN(vma->vm_end, HPAGE_SIZE); + addr = ALIGN(vma->vm_end, huge_page_size(h)); } } @@ -294,6 +303,7 @@ static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr0, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev_vma; unsigned long base = mm->mmap_base, addr = addr0; @@ -314,7 +324,7 @@ try_again: goto fail; /* either no address requested or cant fit in requested address hole */ - addr = (mm->free_area_cache - len) & HPAGE_MASK; + addr = (mm->free_area_cache - len) & huge_page_mask(h); do { /* * Lookup failure means no vma is above this address, @@ -345,7 +355,7 @@ try_again: largest_hole = vma->vm_start - addr; /* try just below the current vma->vm_start */ - addr = (vma->vm_start - len) & HPAGE_MASK; + addr = (vma->vm_start - len) & huge_page_mask(h); } while (len <= vma->vm_start); fail: @@ -383,10 +393,11 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { + struct hstate *h = hstate_file(file); struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - if (len & ~HPAGE_MASK) + if (len & ~huge_page_mask(h)) return -EINVAL; if (len > TASK_SIZE) return -ENOMEM; @@ -398,7 +409,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } if (addr) { - addr = ALIGN(addr, HPAGE_SIZE); + addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) -- cgit v1.2.2 From b4718e628dbf68a2dee23b5709e2aa3190409c56 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 23 Jul 2008 21:27:51 -0700 Subject: x86: add hugepagesz option on 64-bit Add an hugepagesz=... option similar to IA64, PPC etc. to x86-64. This finally allows to select GB pages for hugetlbfs in x86 now that all the infrastructure is in place. Signed-off-by: Andi Kleen Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index b7a65a07af03..8f307d914c2e 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -425,3 +425,20 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, #endif /*HAVE_ARCH_HUGETLB_UNMAPPED_AREA*/ +#ifdef CONFIG_X86_64 +static __init int setup_hugepagesz(char *opt) +{ + unsigned long ps = memparse(opt, &opt); + if (ps == PMD_SIZE) { + hugetlb_add_hstate(PMD_SHIFT - PAGE_SHIFT); + } else if (ps == PUD_SIZE && cpu_has_gbpages) { + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + } else { + printk(KERN_ERR "hugepagesz: Unsupported page size %lu M\n", + ps >> 20); + return 0; + } + return 1; +} +__setup("hugepagesz=", setup_hugepagesz); +#endif -- cgit v1.2.2 From 27ac792ca0b0a1e7e65f20342260650516c95864 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 23 Jul 2008 21:28:13 -0700 Subject: PAGE_ALIGN(): correctly handle 64-bit values on 32-bit architectures On 32-bit architectures PAGE_ALIGN() truncates 64-bit values to the 32-bit boundary. For example: u64 val = PAGE_ALIGN(size); always returns a value < 4GB even if size is greater than 4GB. The problem resides in PAGE_MASK definition (from include/asm-x86/page.h for example): #define PAGE_SHIFT 12 #define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) #define PAGE_MASK (~(PAGE_SIZE-1)) ... #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) The "~" is performed on a 32-bit value, so everything in "and" with PAGE_MASK greater than 4GB will be truncated to the 32-bit boundary. Using the ALIGN() macro seems to be the right way, because it uses typeof(addr) for the mask. Also move the PAGE_ALIGN() definitions out of include/asm-*/page.h in include/linux/mm.h. See also lkml discussion: http://lkml.org/lkml/2008/6/11/237 [akpm@linux-foundation.org: fix drivers/media/video/uvc/uvc_queue.c] [akpm@linux-foundation.org: fix v850] [akpm@linux-foundation.org: fix powerpc] [akpm@linux-foundation.org: fix arm] [akpm@linux-foundation.org: fix mips] [akpm@linux-foundation.org: fix drivers/media/video/pvrusb2/pvrusb2-dvb.c] [akpm@linux-foundation.org: fix drivers/mtd/maps/uclinux.c] [akpm@linux-foundation.org: fix powerpc] Signed-off-by: Andrea Righi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/module_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/module_64.c b/arch/x86/kernel/module_64.c index 0e867676b5a5..6ba87830d4b1 100644 --- a/arch/x86/kernel/module_64.c +++ b/arch/x86/kernel/module_64.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include -- cgit v1.2.2 From d75f65fd247fe85d90a3880d143b1bb22fe13a48 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Wed, 23 Jul 2008 21:28:34 -0700 Subject: remove include/linux/pm_legacy.h Remove the obsolete and no longer used include/linux/pm_legacy.h Reviewed-by: Robert P. J. Day Signed-off-by: Adrian Bunk Cc: Pavel Machek Acked-by: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/apm_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index bf9b441331e9..9ee24e6bc4b0 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -219,7 +219,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From bdfe6b7c681669148dae4db27eb24ee5408ba371 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Wed, 23 Jul 2008 21:28:41 -0700 Subject: pm: acpi hibernation: utilize hardware signature ACPI defines a hardware signature. BIOS calculates the signature according to hardware configure and if hardware changes while hibernated, the signature will change. In that case, S4 resume should fail. Still, there may be systems on which this mechanism does not work correctly, so it is better to provide a workaround for them. For this reason, add a new switch to the acpi_sleep= command line argument allowing one to disable hardware signature checking. [shaohua.li@intel.com: build fix] Signed-off-by: Shaohua Li Signed-off-by: Rafael J. Wysocki Cc: Andi Kleen Cc: Len Brown Acked-by: Pavel Machek Cc: Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/sleep.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index a3ddad18aaa3..fa2161d5003b 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -150,6 +150,10 @@ static int __init acpi_sleep_setup(char *str) acpi_realmode_flags |= 2; if (strncmp(str, "s3_beep", 7) == 0) acpi_realmode_flags |= 4; +#ifdef CONFIG_HIBERNATION + if (strncmp(str, "s4_nohwsig", 10) == 0) + acpi_no_s4_hw_signature(); +#endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); str = strchr(str, ','); -- cgit v1.2.2 From 9deb27baedb79759c3ab9435a7d8b841842d56e9 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:24 -0700 Subject: flag parameters: signalfd This patch adds the new signalfd4 syscall. It extends the old signalfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is SFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name SFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_signalfd4 # ifdef __x86_64__ # define __NR_signalfd4 289 # elif defined __i386__ # define __NR_signalfd4 327 # else # error "need __NR_signalfd4" # endif #endif #define SFD_CLOEXEC O_CLOEXEC int main (void) { sigset_t ss; sigemptyset (&ss); sigaddset (&ss, SIGUSR1); int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0); if (fd == -1) { puts ("signalfd4(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("signalfd4(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC); if (fd == -1) { puts ("signalfd4(SFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 021d71bc69b5..c308128b9251 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -826,4 +826,5 @@ ia32_sys_call_table: .quad sys32_fallocate .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime + .quad compat_sys_signalfd4 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index adff5562f5fd..c12a36c9fd51 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -326,3 +326,4 @@ ENTRY(sys_call_table) .long sys_fallocate .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime + .long sys_signalfd4 -- cgit v1.2.2 From b087498eb5605673b0f260a7620d91818cd72304 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:25 -0700 Subject: flag parameters: eventfd This patch adds the new eventfd2 syscall. It extends the old eventfd syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EFD_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EFD_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_eventfd2 # ifdef __x86_64__ # define __NR_eventfd2 290 # elif defined __i386__ # define __NR_eventfd2 328 # else # error "need __NR_eventfd2" # endif #endif #define EFD_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_eventfd2, 1, 0); if (fd == -1) { puts ("eventfd2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("eventfd2(0) sets close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_eventfd2, 1, EFD_CLOEXEC); if (fd == -1) { puts ("eventfd2(EFD_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("eventfd2(EFD_CLOEXEC) does not set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index c308128b9251..cf0eb31745ca 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -827,4 +827,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd_settime /* 325 */ .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 + .quad sys_eventfd2 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index c12a36c9fd51..cf112cb11c37 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -327,3 +327,4 @@ ENTRY(sys_call_table) .long sys_timerfd_settime /* 325 */ .long sys_timerfd_gettime .long sys_signalfd4 + .long sys_eventfd2 -- cgit v1.2.2 From a0998b50c3f0b8fdd265c63e0032f86ebe377dbf Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:27 -0700 Subject: flag parameters: epoll_create This patch adds the new epoll_create2 syscall. It extends the old epoll_create syscall by one parameter which is meant to hold a flag value. In this patch the only flag support is EPOLL_CLOEXEC which causes the close-on-exec flag for the returned file descriptor to be set. A new name EPOLL_CLOEXEC is introduced which in this implementation must have the same value as O_CLOEXEC. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 1, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, 1, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index cf0eb31745ca..04366f08f424 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,4 +828,5 @@ ia32_sys_call_table: .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 .quad sys_eventfd2 + .quad sys_epoll_create2 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index cf112cb11c37..4d7007ca263d 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,3 +328,4 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 + .long sys_epoll_create2 -- cgit v1.2.2 From 336dd1f70ff62d7dd8655228caed4c5bfc818c56 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:29 -0700 Subject: flag parameters: dup2 This patch adds the new dup3 syscall. It extends the old dup2 syscall by one parameter which is meant to hold a flag value. Support for the O_CLOEXEC flag is added in this patch. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_dup3 # ifdef __x86_64__ # define __NR_dup3 292 # elif defined __i386__ # define __NR_dup3 330 # else # error "need __NR_dup3" # endif #endif int main (void) { int fd = syscall (__NR_dup3, 1, 4, 0); if (fd == -1) { puts ("dup3(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("dup3(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_dup3, 1, 4, O_CLOEXEC); if (fd == -1) { puts ("dup3(O_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("dup3(O_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 04366f08f424..5614a8f7bed4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -829,4 +829,5 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create2 + .quad sys_dup3 /* 330 */ ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 4d7007ca263d..24a3f1ea6a0e 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -329,3 +329,4 @@ ENTRY(sys_call_table) .long sys_signalfd4 .long sys_eventfd2 .long sys_epoll_create2 + .long sys_dup3 /* 330 */ -- cgit v1.2.2 From ed8cae8ba01348bfd83333f4648dd807b04d7f08 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:30 -0700 Subject: flag parameters: pipe This patch introduces the new syscall pipe2 which is like pipe but it also takes an additional parameter which takes a flag value. This patch implements the handling of O_CLOEXEC for the flag. I did not add support for the new syscall for the architectures which have a special sys_pipe implementation. I think the maintainers of those archs have the chance to go with the unified implementation but that's up to them. The implementation introduces do_pipe_flags. I did that instead of changing all callers of do_pipe because some of the callers are written in assembler. I would probably screw up changing the assembly code. To avoid breaking code do_pipe is now a small wrapper around do_pipe_flags. Once all callers are changed over to do_pipe_flags the old do_pipe function can be removed. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_pipe2 # ifdef __x86_64__ # define __NR_pipe2 293 # elif defined __i386__ # define __NR_pipe2 331 # else # error "need __NR_pipe2" # endif #endif int main (void) { int fd[2]; if (syscall (__NR_pipe2, fd, 0) != 0) { puts ("pipe2(0) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { printf ("pipe2(0) set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); if (syscall (__NR_pipe2, fd, O_CLOEXEC) != 0) { puts ("pipe2(O_CLOEXEC) failed"); return 1; } for (int i = 0; i < 2; ++i) { int coe = fcntl (fd[i], F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { printf ("pipe2(O_CLOEXEC) does not set close-on-exit for fd[%d]\n", i); return 1; } } close (fd[0]); close (fd[1]); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/ia32/sys_ia32.c | 2 +- arch/x86/kernel/syscall_table_32.S | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 5614a8f7bed4..18808b164570 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -830,4 +830,5 @@ ia32_sys_call_table: .quad sys_eventfd2 .quad sys_epoll_create2 .quad sys_dup3 /* 330 */ + .quad sys_pipe2 ia32_syscall_end: diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index f00afdf61e67..d3c64088b981 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -238,7 +238,7 @@ asmlinkage long sys32_pipe(int __user *fd) int retval; int fds[2]; - retval = do_pipe(fds); + retval = do_pipe_flags(fds, 0); if (retval) goto out; if (copy_to_user(fd, fds, sizeof(fds))) diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 24a3f1ea6a0e..66154769d52f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -330,3 +330,4 @@ ENTRY(sys_call_table) .long sys_eventfd2 .long sys_epoll_create2 .long sys_dup3 /* 330 */ + .long sys_pipe2 -- cgit v1.2.2 From 4006553b06306b34054529477b06b68a1c66249b Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:32 -0700 Subject: flag parameters: inotify_init This patch introduces the new syscall inotify_init1 (note: the 1 stands for the one parameter the syscall takes, as opposed to no parameter before). The values accepted for this parameter are function-specific and defined in the inotify.h header. Here the values must match the O_* flags, though. In this patch CLOEXEC support is introduced. The following test must be adjusted for architectures other than x86 and x86-64 and in case the syscall numbers changed. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #ifndef __NR_inotify_init1 # ifdef __x86_64__ # define __NR_inotify_init1 294 # elif defined __i386__ # define __NR_inotify_init1 332 # else # error "need __NR_inotify_init1" # endif #endif #define IN_CLOEXEC O_CLOEXEC int main (void) { int fd; fd = syscall (__NR_inotify_init1, 0); if (fd == -1) { puts ("inotify_init1(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("inotify_init1(0) set close-on-exit"); return 1; } close (fd); fd = syscall (__NR_inotify_init1, IN_CLOEXEC); if (fd == -1) { puts ("inotify_init1(IN_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("inotify_init1(O_CLOEXEC) does not set close-on-exit"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ [akpm@linux-foundation.org: add sys_ni stub] Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/kernel/syscall_table_32.S | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 18808b164570..4541073dd837 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -831,4 +831,5 @@ ia32_sys_call_table: .quad sys_epoll_create2 .quad sys_dup3 /* 330 */ .quad sys_pipe2 + .quad sys_inotify_init1 ia32_syscall_end: diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 66154769d52f..f59aba5ff0f0 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -331,3 +331,4 @@ ENTRY(sys_call_table) .long sys_epoll_create2 .long sys_dup3 /* 330 */ .long sys_pipe2 + .long sys_inotify_init1 -- cgit v1.2.2 From 9fe5ad9c8cef9ad5873d8ee55d1cf00d9b607df0 Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Wed, 23 Jul 2008 21:29:43 -0700 Subject: flag parameters add-on: remove epoll_create size param Remove the size parameter from the new epoll_create syscall and renames the syscall itself. The updated test program follows. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #include #include #include #include #include #ifndef __NR_epoll_create2 # ifdef __x86_64__ # define __NR_epoll_create2 291 # elif defined __i386__ # define __NR_epoll_create2 329 # else # error "need __NR_epoll_create2" # endif #endif #define EPOLL_CLOEXEC O_CLOEXEC int main (void) { int fd = syscall (__NR_epoll_create2, 0); if (fd == -1) { puts ("epoll_create2(0) failed"); return 1; } int coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if (coe & FD_CLOEXEC) { puts ("epoll_create2(0) set close-on-exec flag"); return 1; } close (fd); fd = syscall (__NR_epoll_create2, EPOLL_CLOEXEC); if (fd == -1) { puts ("epoll_create2(EPOLL_CLOEXEC) failed"); return 1; } coe = fcntl (fd, F_GETFD); if (coe == -1) { puts ("fcntl failed"); return 1; } if ((coe & FD_CLOEXEC) == 0) { puts ("epoll_create2(EPOLL_CLOEXEC) set close-on-exec flag"); return 1; } close (fd); puts ("OK"); return 0; } ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Signed-off-by: Ulrich Drepper Acked-by: Davide Libenzi Cc: Michael Kerrisk Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/kernel/syscall_table_32.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 4541073dd837..e4bd1793a5e4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -828,7 +828,7 @@ ia32_sys_call_table: .quad compat_sys_timerfd_gettime .quad compat_sys_signalfd4 .quad sys_eventfd2 - .quad sys_epoll_create2 + .quad sys_epoll_create1 .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index f59aba5ff0f0..d44395ff34c3 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -328,7 +328,7 @@ ENTRY(sys_call_table) .long sys_timerfd_gettime .long sys_signalfd4 .long sys_eventfd2 - .long sys_epoll_create2 + .long sys_epoll_create1 .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 -- cgit v1.2.2 From 7e2a31da854dcf8324012a83a31b40bc11e52589 Mon Sep 17 00:00:00 2001 From: David Brownell Date: Wed, 23 Jul 2008 21:30:47 -0700 Subject: rtc-cmos: avoid spurious irqs This fixes kernel http://bugzilla.kernel.org/show_bug.cgi?id=11112 (bogus RTC update IRQs reported) for rtc-cmos, in two ways: - When HPET is stealing the IRQs, use the first IRQ to grab the seconds counter which will be monitored (instead of using whatever was previously in that memory); - In sane IRQ handling modes, scrub out old IRQ status before enabling IRQs. That latter is done by tightening up IRQ handling for rtc-cmos everywhere, also ensuring that when HPET is used it's the only thing triggering IRQ reports to userspace; net object shrink. Also fix a bogus HPET message related to its RTC emulation. Signed-off-by: David Brownell Report-by: W Unruh Cc: Andrew Victor Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/hpet.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 0ea6a19bfdfe..ad2b15a1334d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -468,7 +468,7 @@ void hpet_disable(void) #define RTC_NUM_INTS 1 static unsigned long hpet_rtc_flags; -static unsigned long hpet_prev_update_sec; +static int hpet_prev_update_sec; static struct rtc_time hpet_alarm_time; static unsigned long hpet_pie_count; static unsigned long hpet_t1_cmp; @@ -575,6 +575,9 @@ int hpet_set_rtc_irq_bit(unsigned long bit_mask) hpet_rtc_flags |= bit_mask; + if ((bit_mask & RTC_UIE) && !(oldbits & RTC_UIE)) + hpet_prev_update_sec = -1; + if (!oldbits) hpet_rtc_timer_init(); @@ -652,7 +655,7 @@ static void hpet_rtc_timer_reinit(void) if (hpet_rtc_flags & RTC_PIE) hpet_pie_count += lost_ints; if (printk_ratelimit()) - printk(KERN_WARNING "rtc: lost %d interrupts\n", + printk(KERN_WARNING "hpet1: lost %d rtc interrupts\n", lost_ints); } } @@ -670,7 +673,8 @@ irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) if (hpet_rtc_flags & RTC_UIE && curr_time.tm_sec != hpet_prev_update_sec) { - rtc_int_flag = RTC_UF; + if (hpet_prev_update_sec >= 0) + rtc_int_flag = RTC_UF; hpet_prev_update_sec = curr_time.tm_sec; } -- cgit v1.2.2 From 6209ed9d8443b63c36d340908530fa470c4d4fff Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Jul 2008 12:49:26 -0700 Subject: x86-64: make BUILD_IRQ() also reset section back Commit 9d25d4db81833029d30b7b03cc1000cbbe09e192 ("x86: BUILD_IRQ say .text to avoid .data.percpu") added a ".text" specifier to make sure that BUILD_IRQ() builds the irq trampoline in the text segment rather than in some random left-over segment that the compiler happened to leave the asm in. However, we should also make sure that we switch back by adding a ".previous" at the end, so that there are no subtle issues with subsequent compiler-generated code. Signed-off-by: Linus Torvalds --- arch/x86/kernel/irqinit_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 9414125f19ce..1f26fd9ec4f4 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -46,7 +46,8 @@ asm("\n.text\n.p2align\n" \ "IRQ" #nr "_interrupt:\n\t" \ "push $~(" #nr ") ; " \ - "jmp common_interrupt"); + "jmp common_interrupt\n" \ + ".previous"); #define BI(x,y) \ BUILD_IRQ(x##y) -- cgit v1.2.2 From b30f3ae50cd03ef2ff433a5030fbf88dd8323528 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Jul 2008 15:43:44 -0700 Subject: x86-64: Clean up 'save/restore_i387()' usage Suresh Siddha wants to fix a possible FPU leakage in error conditions, but the fact that save/restore_i387() are inlines in a header file makes that harder to do than necessary. So start off with an obvious cleanup. This just moves the x86-64 version of save/restore_i387() out of the header file, and moves it to the only file that it is actually used in: arch/x86/kernel/signal_64.c. So exposing it in a header file was wrong to begin with. [ Side note: I'd like to fix up some of the games we play with the 32-bit version of these functions too, but that's a separate matter. The 32-bit versions are shared - under different names at that! - by both the native x86-32 code and the x86-64 32-bit compatibility code ] Acked-by: Suresh Siddha Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal_64.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 47c3d249e638..b45ef8ddd651 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -53,6 +53,59 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +/* + * Signal frame handlers. + */ + +static inline int save_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.xstate->fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *) + buf); + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + sizeof(struct i387_fxsave_struct))) + return -1; + } + return 1; +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +static inline int restore_i387(struct _fpstate __user *buf) +{ + struct task_struct *tsk = current; + int err; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + return restore_fpu_checking((__force struct i387_fxsave_struct *)buf); +} /* * Do a signal return; undo the signal stack. -- cgit v1.2.2 From 4b9f12a3779c548b68bc9af7d94030868ad3aa1b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 24 Jul 2008 17:29:00 -0700 Subject: x86/oprofile/nmi_int: add Nehalem to list of ppro cores ..otherwise oprofile will fall back on that poor timer interrupt. Also replace the unreadable chain of if-statements with a "switch()" statement instead. It generates better code, and is a lot clearer. Signed-off-by: Linus Torvalds --- arch/x86/oprofile/nmi_int.c | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 7f3329b55d2e..3f90289410e6 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -369,20 +369,34 @@ static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - if (cpu_model == 14) + switch (cpu_model) { + case 0 ... 2: + *cpu_type = "i386/ppro"; + break; + case 3 ... 5: + *cpu_type = "i386/pii"; + break; + case 6 ... 8: + *cpu_type = "i386/piii"; + break; + case 9: + *cpu_type = "i386/p6_mobile"; + break; + case 10 ... 13: + *cpu_type = "i386/p6"; + break; + case 14: *cpu_type = "i386/core"; - else if (cpu_model == 15 || cpu_model == 23) + break; + case 15: case 23: + *cpu_type = "i386/core_2"; + break; + case 26: *cpu_type = "i386/core_2"; - else if (cpu_model > 0xd) + break; + default: + /* Unknown */ return 0; - else if (cpu_model == 9) { - *cpu_type = "i386/p6_mobile"; - } else if (cpu_model > 5) { - *cpu_type = "i386/piii"; - } else if (cpu_model > 2) { - *cpu_type = "i386/pii"; - } else { - *cpu_type = "i386/ppro"; } model = &op_ppro_spec; -- cgit v1.2.2 From 4fe702c7e401f912c0edd294af6e37c02f451bbb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 25 Jul 2008 10:19:27 +0530 Subject: X86_32: declare pt_regs_access as unsigned long Fixed pt_regs_access to unsigned long as per X86_64 Signed-off-by: Jaswinder Singh --- arch/x86/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e37dccce85db..fc3e8dcd9da6 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -69,7 +69,7 @@ static inline bool invalid_selector(u16 value) #define FLAG_MASK FLAG_MASK_32 -static long *pt_regs_access(struct pt_regs *regs, unsigned long regno) +static unsigned long *pt_regs_access(struct pt_regs *regs, unsigned long regno) { BUILD_BUG_ON(offsetof(struct pt_regs, bx) != 0); regno >>= 2; -- cgit v1.2.2 From f86c99853b22576ee8dc4fa27ff6f3c0c7ce0ef8 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 25 Jul 2008 10:52:53 +0530 Subject: X86_SMP: smpboot.c declare idle_thread_array and smp_b_stepping as static Signed-off-by: Jaswinder Singh --- arch/x86/kernel/smpboot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4b53a647bc0a..a9331a42f76f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -88,7 +88,7 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array); #define get_idle_for_cpu(x) (per_cpu(idle_thread_array, x)) #define set_idle_for_cpu(x, p) (per_cpu(idle_thread_array, x) = (p)) #else -struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; +static struct task_struct *idle_thread_array[NR_CPUS] __cpuinitdata ; #define get_idle_for_cpu(x) (idle_thread_array[(x)]) #define set_idle_for_cpu(x, p) (idle_thread_array[(x)] = (p)) #endif @@ -129,7 +129,7 @@ static int boot_cpu_logical_apicid; static cpumask_t cpu_sibling_setup_map; /* Set if we find a B stepping CPU */ -int __cpuinitdata smp_b_stepping; +static int __cpuinitdata smp_b_stepping; #if defined(CONFIG_NUMA) && defined(CONFIG_X86_32) -- cgit v1.2.2 From 2907829cd0ffdef69083985ba28cf1cf3857c681 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 25 Jul 2008 11:05:56 +0530 Subject: X86_SMP: ipi.c declare functions before they get used move upwards for __send_IPI_shortcut and send_IPI_mask_bitmask Signed-off-by: Jaswinder Singh --- arch/x86/kernel/ipi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ipi.c b/arch/x86/kernel/ipi.c index 3f7537b669d3..f1c688e46f35 100644 --- a/arch/x86/kernel/ipi.c +++ b/arch/x86/kernel/ipi.c @@ -20,6 +20,8 @@ #ifdef CONFIG_X86_32 #include +#include + /* * the following functions deal with sending IPIs between CPUs. * @@ -147,7 +149,6 @@ void send_IPI_mask_sequence(cpumask_t mask, int vector) } /* must come after the send_IPI functions above for inlining */ -#include static int convert_apicid_to_cpu(int apic_id) { int i; -- cgit v1.2.2 From 3c9339049df5cc3a468c11de6c4101a1ea8c3d83 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 25 Jul 2008 11:32:36 +0200 Subject: x86: fix ds.c build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/ds.c: In function ‘ds_allocate_buffer': arch/x86/kernel/ds.c:339: error: implicit declaration of function ‘PAGE_ALIGN' Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 24a323c95997..ab21c270bfa4 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -29,6 +29,7 @@ #include #include #include +#include /* -- cgit v1.2.2 From 58340a07c194e0aed7bc58b61ff24330bb2a409f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 25 Jul 2008 01:45:33 -0700 Subject: introduce HAVE_EFFICIENT_UNALIGNED_ACCESS Kconfig symbol In many cases, especially in networking, it can be beneficial to know at compile time whether the architecture can do unaligned accesses efficiently. This patch introduces a new Kconfig symbol HAVE_EFFICIENT_UNALIGNED_ACCESS for that purpose and adds it to the powerpc and x86 architectures. Also add some documentation about alignment and networking, and especially one intended use of this symbol. Signed-off-by: Johannes Berg Acked-by: Sam Ravnborg Acked-by: Ingo Molnar [x86 architecture part] Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b2ddfcf01728..66f3ab05b18c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -28,6 +28,7 @@ config X86 select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER + select HAVE_EFFICIENT_UNALIGNED_ACCESS config ARCH_DEFCONFIG string -- cgit v1.2.2 From 2d6ffcca623a9a16df6cdfbe8250b7a5904a5f5e Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Fri, 25 Jul 2008 01:45:44 -0700 Subject: inflate: refactor inflate malloc code Inflate requires some dynamic memory allocation very early in the boot process and this is provided with a set of four functions: malloc/free/gzip_mark/gzip_release. The old inflate code used a mark/release strategy rather than implement free. This new version instead keeps a count on the number of outstanding allocations and when it hits zero, it resets the malloc arena. This allows removing all the mark and release implementations and unifying all the malloc/free implementations. The architecture-dependent code must define two addresses: - free_mem_ptr, the address of the beginning of the area in which allocations should be made - free_mem_end_ptr, the address of the end of the area in which allocations should be made. If set to 0, then no check is made on the number of allocations, it just grows as much as needed The architecture-dependent code can also provide an arch_decomp_wdog() function call. This function will be called several times during the decompression process, and allow to notify the watchdog that the system is still running. If an architecture provides such a call, then it must define ARCH_HAS_DECOMP_WDOG so that the generic inflate code calls arch_decomp_wdog(). Work initially done by Matt Mackall, updated to a recent version of the kernel and improved by me. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Thomas Petazzoni Cc: Matt Mackall Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Haavard Skinnemoen Cc: David Howells Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: "H. Peter Anvin" Acked-by: Paul Mundt Acked-by: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/compressed/misc.c | 39 --------------------------------------- 1 file changed, 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index bc5553b496f7..9fea73706479 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -182,8 +182,6 @@ static unsigned outcnt; static int fill_inbuf(void); static void flush_window(void); static void error(char *m); -static void gzip_mark(void **); -static void gzip_release(void **); /* * This is set up by the setup-routine at boot-time @@ -196,9 +194,6 @@ extern int input_len; static long bytes_out; -static void *malloc(int size); -static void free(void *where); - static void *memset(void *s, int c, unsigned n); static void *memcpy(void *dest, const void *src, unsigned n); @@ -220,40 +215,6 @@ static int lines, cols; #include "../../../../lib/inflate.c" -static void *malloc(int size) -{ - void *p; - - if (size < 0) - error("Malloc error"); - if (free_mem_ptr <= 0) - error("Memory error"); - - free_mem_ptr = (free_mem_ptr + 3) & ~3; /* Align */ - - p = (void *)free_mem_ptr; - free_mem_ptr += size; - - if (free_mem_ptr >= free_mem_end_ptr) - error("Out of memory"); - - return p; -} - -static void free(void *where) -{ /* Don't care */ -} - -static void gzip_mark(void **ptr) -{ - *ptr = (void *) free_mem_ptr; -} - -static void gzip_release(void **ptr) -{ - free_mem_ptr = (memptr) *ptr; -} - static void scroll(void) { int i; -- cgit v1.2.2 From ef53d9c5e4da147ecaa43c44c5e5945eb83970a2 Mon Sep 17 00:00:00 2001 From: Srinivasa D S Date: Fri, 25 Jul 2008 01:46:04 -0700 Subject: kprobes: improve kretprobe scalability with hashed locking Currently list of kretprobe instances are stored in kretprobe object (as used_instances,free_instances) and in kretprobe hash table. We have one global kretprobe lock to serialise the access to these lists. This causes only one kretprobe handler to execute at a time. Hence affects system performance, particularly on SMP systems and when return probe is set on lot of functions (like on all systemcalls). Solution proposed here gives fine-grain locks that performs better on SMP system compared to present kretprobe implementation. Solution: 1) Instead of having one global lock to protect kretprobe instances present in kretprobe object and kretprobe hash table. We will have two locks, one lock for protecting kretprobe hash table and another lock for kretporbe object. 2) We hold lock present in kretprobe object while we modify kretprobe instance in kretprobe object and we hold per-hash-list lock while modifying kretprobe instances present in that hash list. To prevent deadlock, we never grab a per-hash-list lock while holding a kretprobe lock. 3) We can remove used_instances from struct kretprobe, as we can track used instances of kretprobe instances using kretprobe hash table. Time duration for kernel compilation ("make -j 8") on a 8-way ppc64 system with return probes set on all systemcalls looks like this. cacheline non-cacheline Un-patched kernel aligned patch aligned patch =============================================================================== real 9m46.784s 9m54.412s 10m2.450s user 40m5.715s 40m7.142s 40m4.273s sys 2m57.754s 2m58.583s 3m17.430s =========================================================== Time duration for kernel compilation ("make -j 8) on the same system, when kernel is not probed. ========================= real 9m26.389s user 40m8.775s sys 2m7.283s ========================= Signed-off-by: Srinivasa DS Signed-off-by: Jim Keniston Acked-by: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S. Miller Cc: Masami Hiramatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/kprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 43c019f85f0d..6c27679ec6aa 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -431,7 +431,6 @@ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) regs->ip = (unsigned long)p->ainsn.insn; } -/* Called with kretprobe_lock held */ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) { @@ -682,8 +681,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; INIT_HLIST_HEAD(&empty_rp); - spin_lock_irqsave(&kretprobe_lock, flags); - head = kretprobe_inst_table_head(current); + kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; @@ -732,7 +730,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) kretprobe_assert(ri, orig_ret_address, trampoline_address); - spin_unlock_irqrestore(&kretprobe_lock, flags); + kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { hlist_del(&ri->hlist); -- cgit v1.2.2 From 7444a72effa632fcd8edc566f880d96fe213c73b Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Fri, 25 Jul 2008 01:46:11 -0700 Subject: gpiolib: allow user-selection This patch adds functionality to the gpio-lib subsystem to make it possible to enable the gpio-lib code even if the architecture code didn't request to get it built in. The archtitecture code does still need to implement the gpiolib accessor functions in its asm/gpio.h file. This patch adds the implementations for x86 and PPC. With these changes it is possible to run generic GPIO expansion cards on every architecture that implements the trivial wrapper functions. Support for more architectures can easily be added. Signed-off-by: Michael Buesch Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: David Brownell Cc: Russell King Cc: Haavard Skinnemoen Cc: Jesper Nilsson Cc: Ralf Baechle Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Jean Delvare Cc: Samuel Ortiz Cc: Kumar Gala Cc: Sam Ravnborg Cc: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 66f3ab05b18c..e3cba0b45600 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,6 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_IOREMAP_PROT select HAVE_KPROBES + select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X select HAVE_KRETPROBES select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE -- cgit v1.2.2 From 95b68dec0d52c7b8fea3698b3938cf3ab936436b Mon Sep 17 00:00:00 2001 From: Chandru Date: Fri, 25 Jul 2008 01:47:55 -0700 Subject: calgary iommu: use the first kernels TCE tables in kdump kdump kernel fails to boot with calgary iommu and aacraid driver on a x366 box. The ongoing dma's of aacraid from the first kernel continue to exist until the driver is loaded in the kdump kernel. Calgary is initialized prior to aacraid and creation of new tce tables causes wrong dma's to occur. Here we try to get the tce tables of the first kernel in kdump kernel and use them. While in the kdump kernel we do not allocate new tce tables but instead read the base address register contents of calgary iommu and use the tables that the registers point to. With these changes the kdump kernel and hence aacraid now boots normally. Signed-off-by: Chandru Siddalingappa Acked-by: Muli Ben-Yehuda Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 85 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 79 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 151f2d171f7c..19e7fc7c2c4f 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -167,6 +168,8 @@ static void calgary_dump_error_regs(struct iommu_table *tbl); static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); static void calioc2_tce_cache_blast(struct iommu_table *tbl); static void calioc2_dump_error_regs(struct iommu_table *tbl); +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl); +static void get_tce_space_from_tar(void); static struct cal_chipset_ops calgary_chip_ops = { .handle_quirks = calgary_handle_quirks, @@ -830,7 +833,11 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; - tce_free(tbl, 0, tbl->it_size); + + if (is_kdump_kernel()) + calgary_init_bitmap_from_tce_table(tbl); + else + tce_free(tbl, 0, tbl->it_size); if (is_calgary(dev->device)) tbl->chip_ops = &calgary_chip_ops; @@ -1209,6 +1216,10 @@ static int __init calgary_init(void) if (ret) return ret; + /* Purely for kdump kernel case */ + if (is_kdump_kernel()) + get_tce_space_from_tar(); + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) @@ -1339,6 +1350,61 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) return (val != 0xffffffff); } +/* + * calgary_init_bitmap_from_tce_table(): + * Funtion for kdump case. In the second/kdump kernel initialize + * the bitmap based on the tce table entries obtained from first kernel + */ +static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) +{ + u64 *tp; + unsigned int index; + tp = ((u64 *)tbl->it_base); + for (index = 0 ; index < tbl->it_size; index++) { + if (*tp != 0x0) + set_bit(index, tbl->it_map); + tp++; + } +} + +/* + * get_tce_space_from_tar(): + * Function for kdump case. Get the tce tables from first kernel + * by reading the contents of the base adress register of calgary iommu + */ +static void get_tce_space_from_tar() +{ + int bus; + void __iomem *target; + unsigned long tce_space; + + for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { + struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; + + if (!is_cal_pci_dev(pci_device)) + continue; + if (info->translation_disabled) + continue; + + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + target = calgary_reg(bus_info[bus].bbar, + tar_offset(bus)); + tce_space = be64_to_cpu(readq(target)); + tce_space = tce_space & TAR_SW_BITS; + + tce_space = tce_space & (~specified_table_size); + info->tce_space = (u64 *)__va(tce_space); + } + } + return; +} + void __init detect_calgary(void) { int bus; @@ -1394,7 +1460,8 @@ void __init detect_calgary(void) return; } - specified_table_size = determine_tce_table_size(max_pfn * PAGE_SIZE); + specified_table_size = determine_tce_table_size((is_kdump_kernel() ? + saved_max_pfn : max_pfn) * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { struct calgary_bus_info *info = &bus_info[bus]; @@ -1412,10 +1479,16 @@ void __init detect_calgary(void) if (calgary_bus_has_devices(bus, pci_device) || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; + /* + * If it is kdump kernel, find and use tce tables + * from first kernel, else allocate tce tables here + */ + if (!is_kdump_kernel()) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + } calgary_found = 1; } } -- cgit v1.2.2 From 024e8ac04453b3525448c31ef39848cf675ba6db Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 20:00:10 -0700 Subject: x86_64: fix ia32 AMD syscall audit fast-path The new code in commit 5cbf1565f29eb57a86a305b08836613508e294d7 has a bug in the version supporting the AMD 'syscall' instruction. It clobbers the user's %ecx register value (with the %ebp value). This change fixes it. Signed-off-by: Roland McGrath --- arch/x86/ia32/ia32entry.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e4bd1793a5e4..ffc1bb4fed7d 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -201,7 +201,7 @@ sysexit_from_sys_call: movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ .endm - .macro auditsys_exit exit + .macro auditsys_exit exit,ebpsave=RBP testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jnz int_ret_from_sys_call TRACE_IRQS_ON @@ -214,7 +214,7 @@ sysexit_from_sys_call: call audit_syscall_exit GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ - movl RBP-ARGOFFSET(%rsp),%ebp /* reload user register value */ + movl \ebpsave-ARGOFFSET(%rsp),%ebp /* reload user register value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF @@ -347,7 +347,7 @@ cstar_auditsys: jmp cstar_dispatch sysretl_audit: - auditsys_exit sysretl_from_sys_call + auditsys_exit sysretl_from_sys_call, RCX /* user %ebp in RCX slot */ #endif cstar_tracesys: -- cgit v1.2.2 From 286f571837ba9d67625afd015366d79345abb414 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:46 +0200 Subject: x86: apic_*.c: add description to AMD's extended LVT functions Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d6c898358371..fad94b0decc1 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -646,6 +646,9 @@ int setup_profiling_timer(unsigned int multiplier) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 7f1f030da7ee..42bf69f8bc8e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -205,6 +205,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. */ #define APIC_EILVT_LVTOFF_MCE 0 -- cgit v1.2.2 From 12f2b2610e812627acf338aaf043fef20bb726ca Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 22 Jul 2008 21:08:47 +0200 Subject: oprofile: Add support for AMD Family 11h This patch add support for AMD Family 11h CPUs. Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3f90289410e6..33db99ab90c0 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -436,6 +436,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) model = &op_athlon_spec; cpu_type = "x86-64/family10"; break; + case 0x11: + model = &op_athlon_spec; + cpu_type = "x86-64/family11h"; + break; } break; -- cgit v1.2.2 From adf5ec0bca553b763a6b9baed2677a4c7470025b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:48 +0200 Subject: x86/oprofile: introduce model specific init/exit functions This patch implements model specific OProfile init/exit functions for x86 CPUs. Though there is more rework needed at the initialization code, this new introduced functions allow it to keep model specific code in the corresponding op_model_*.c files. The function interface is the same as for oprofile_arch_init/exit(). Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 11 ++++++++++- arch/x86/oprofile/op_model_athlon.c | 18 +++++++++++++++--- arch/x86/oprofile/op_x86_model.h | 2 ++ 3 files changed, 27 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 33db99ab90c0..75e889156f23 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -1,10 +1,11 @@ /** * @file nmi_int.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon + * @author Robert Richter */ #include @@ -411,6 +412,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; char *cpu_type; + int ret = 0; if (!cpu_has_apic) return -ENODEV; @@ -466,6 +468,11 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } + if (model->init) + ret = model->init(ops); + if (ret) + return ret; + init_sysfs(); using_nmi = 1; ops->create_files = nmi_create_files; @@ -482,4 +489,6 @@ void op_nmi_exit(void) { if (using_nmi) exit_sysfs(); + if (model->exit) + model->exit(); } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 3d534879a9dc..dd8b1dcd163b 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -1,14 +1,15 @@ /* - * @file op_model_athlon.h + * @file op_model_athlon.c * athlon / K7 / K8 / Family 10h model-specific MSR operations * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2008 OProfile authors * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare - */ + * @author Robert Richter +*/ #include #include @@ -178,7 +179,18 @@ static void athlon_shutdown(struct op_msrs const * const msrs) } } +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) +{ +} + struct op_x86_model_spec const op_athlon_spec = { + .init = op_amd_init, + .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, .fill_in_addresses = &athlon_fill_in_addresses, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 45b605fa71d0..ee9ca96253f4 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -32,6 +32,8 @@ struct pt_regs; * various x86 CPU models' perfctr support. */ struct op_x86_model_spec { + int (*init)(struct oprofile_operations *ops); + void (*exit)(void); unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); -- cgit v1.2.2 From dfa154289701ed315909b4e371a0381c52c8bdc9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:49 +0200 Subject: x86/oprofile: Minor changes in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index dd8b1dcd163b..d25d7f195036 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -45,6 +45,8 @@ static unsigned long reset_value[NUM_COUNTERS]; +/* functions for op_athlon_spec */ + static void athlon_fill_in_addresses(struct op_msrs * const msrs) { int i; -- cgit v1.2.2 From 6657fe4f5650ff7174d418d4bfa50c4640e81a2b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:50 +0200 Subject: x86/oprofile: renaming athlon_*() into op_amd_*() These functions contain code for all AMD CPUs. The new names fit better. Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 8 ++++---- arch/x86/oprofile/op_model_athlon.c | 28 ++++++++++++++-------------- arch/x86/oprofile/op_x86_model.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 75e889156f23..b29819313f2b 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -425,21 +425,21 @@ int __init op_nmi_init(struct oprofile_operations *ops) default: return -ENODEV; case 6: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "i386/athlon"; break; case 0xf: - model = &op_athlon_spec; + model = &op_amd_spec; /* Actually it could be i386/hammer too, but give user space an consistent name. */ cpu_type = "x86-64/hammer"; break; case 0x10: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family10"; break; case 0x11: - model = &op_athlon_spec; + model = &op_amd_spec; cpu_type = "x86-64/family11h"; break; } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index d25d7f195036..40ecb020c7d2 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -45,9 +45,9 @@ static unsigned long reset_value[NUM_COUNTERS]; -/* functions for op_athlon_spec */ +/* functions for op_amd_spec */ -static void athlon_fill_in_addresses(struct op_msrs * const msrs) +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) { int i; @@ -67,7 +67,7 @@ static void athlon_fill_in_addresses(struct op_msrs * const msrs) } -static void athlon_setup_ctrs(struct op_msrs const * const msrs) +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -116,7 +116,7 @@ static void athlon_setup_ctrs(struct op_msrs const * const msrs) } -static int athlon_check_ctrs(struct pt_regs * const regs, +static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { unsigned int low, high; @@ -137,7 +137,7 @@ static int athlon_check_ctrs(struct pt_regs * const regs, } -static void athlon_start(struct op_msrs const * const msrs) +static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -151,7 +151,7 @@ static void athlon_start(struct op_msrs const * const msrs) } -static void athlon_stop(struct op_msrs const * const msrs) +static void op_amd_stop(struct op_msrs const * const msrs) { unsigned int low, high; int i; @@ -167,7 +167,7 @@ static void athlon_stop(struct op_msrs const * const msrs) } } -static void athlon_shutdown(struct op_msrs const * const msrs) +static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; @@ -190,15 +190,15 @@ static void op_amd_exit(void) { } -struct op_x86_model_spec const op_athlon_spec = { +struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .fill_in_addresses = &athlon_fill_in_addresses, - .setup_ctrs = &athlon_setup_ctrs, - .check_ctrs = &athlon_check_ctrs, - .start = &athlon_start, - .stop = &athlon_stop, - .shutdown = &athlon_shutdown + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown }; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index ee9ca96253f4..05a0261ba0c3 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -48,6 +48,6 @@ struct op_x86_model_spec { extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; -extern struct op_x86_model_spec const op_athlon_spec; +extern struct op_x86_model_spec const op_amd_spec; #endif /* OP_X86_MODEL_H */ -- cgit v1.2.2 From 56784f11df473b4c1d9d0e37777fd7c0b77b6bca Mon Sep 17 00:00:00 2001 From: Barry Kasindorf Date: Tue, 22 Jul 2008 21:08:55 +0200 Subject: x86/oprofile: add IBS support for AMD CPUs, model specific code This patchset supports the new profiling hardware available in the latest AMD CPUs in the oProfile driver. Signed-off-by: Barry Kasindorf Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 257 ++++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 40ecb020c7d2..229e0b4e21e3 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -9,9 +9,13 @@ * @author Philippe Elie * @author Graydon Hoare * @author Robert Richter + * @author Barry Kasindorf */ #include +#include +#include + #include #include #include @@ -43,7 +47,83 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) +#define IBS_FETCH_CTL_HIGH_MASK 0xFFFFFFFF +/* high dword bit IbsFetchCtl[bit 49] */ +#define IBS_FETCH_VALID_BIT (1UL << 17) +/* high dword bit IbsFetchCtl[bit 52] */ +#define IBS_FETCH_PHY_ADDR_VALID_BIT (1UL << 20) +/* high dword bit IbsFetchCtl[bit 48] */ +#define IBS_FETCH_ENABLE (1UL << 16) + +#define IBS_FETCH_CTL_CNT_MASK 0x00000000FFFF0000UL +#define IBS_FETCH_CTL_MAX_CNT_MASK 0x000000000000FFFFUL + +/*IbsOpCtl masks/bits */ +#define IBS_OP_VALID_BIT (1ULL<<18) /* IbsOpCtl[bit18] */ +#define IBS_OP_ENABLE (1ULL<<17) /* IBS_OP_ENABLE[bit17]*/ + +/* Codes used in cpu_buffer.c */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/*IbsOpData3 masks */ +#define IBS_CTL_LVT_OFFSET_VALID_BIT (1ULL<<8) + +/*PCI Extended Configuration Constants */ +/* MSR to set the IBS control register APIC LVT offset */ +#define IBS_LVT_OFFSET_PCI 0x1CC + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + static unsigned long reset_value[NUM_COUNTERS]; +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; /* functions for op_amd_spec */ @@ -121,6 +201,8 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, { unsigned int low, high; int i; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; for (i = 0 ; i < NUM_COUNTERS; ++i) { if (!reset_value[i]) @@ -132,6 +214,65 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } + /*If AMD and IBS is available */ + if (ibs_allowed && ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~(IBS_FETCH_VALID_BIT); + high |= IBS_FETCH_ENABLE; + low &= IBS_FETCH_CTL_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_allowed && ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + low &= ~(IBS_OP_VALID_BIT); + low |= IBS_OP_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + /* See op_model_ppro.c */ return 1; } @@ -148,6 +289,17 @@ static void op_amd_start(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } } + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = IBS_FETCH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } } @@ -165,6 +317,18 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_SET_INACTIVE(low); CTRL_WRITE(low, high, msrs, i); } + + if (ibs_allowed && ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } } static void op_amd_shutdown(struct op_msrs const * const msrs) @@ -181,6 +345,99 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +/* + * initialize the APIC for the IBS interrupts + * if needed on AMD Family10h rev B0 and later + */ +static void setup_ibs(void) +{ + struct pci_dev *gh_device = NULL; + u32 low, high; + u8 vector; + + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + /* This gets the APIC_EILVT_LVTOFF_IBS value */ + vector = setup_APIC_eilvt_ibs(0, 0, 1); + + /*see if the IBS control register is already set correctly*/ + /*remove this when we know for sure it is done + in the kernel init*/ + rdmsr(MSR_AMD64_IBSCTL, low, high); + if ((low & (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) != + (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) { + + /**** Be sure to run loop until NULL is returned to + decrement reference count on any pci_dev structures + returned ****/ + while ((gh_device = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, gh_device)) + != NULL) { + /* This code may change if we can find a proper + * way to get at the PCI extended config space */ + pci_write_config_dword( + gh_device, IBS_LVT_OFFSET_PCI, + (vector | IBS_CTL_LVT_OFFSET_VALID_BIT)); + } + } + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1, 1); +} + + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1, 1); +} + +static void setup_ibs_files(struct super_block *sb, struct dentry *root) +{ + char buf[12]; + struct dentry *dir; + + if (!ibs_allowed) + return; + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + snprintf(buf, sizeof(buf), "ibs_fetch"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + snprintf(buf, sizeof(buf), "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); +} + static int op_amd_init(struct oprofile_operations *ops) { return 0; -- cgit v1.2.2 From 7939d2bf7e30353d40d14f967b7fe2b2a7be5bd9 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:56 +0200 Subject: x86/oprofile: separating the IBS handler Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 45 +++++++++++++++++++++++-------------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 229e0b4e21e3..a2c8e2e372bb 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -195,27 +195,18 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } } - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) +static inline int +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) { unsigned int low, high; - int i; struct ibs_fetch_sample ibs_fetch; struct ibs_op_sample ibs_op; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } + if (!ibs_allowed) + return 1; - /*If AMD and IBS is available */ - if (ibs_allowed && ibs_config.fetch_enabled) { + if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_VALID_BIT) { ibs_fetch.ibs_fetch_ctl_high = high; @@ -240,7 +231,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } - if (ibs_allowed && ibs_config.op_enabled) { + if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_VALID_BIT) { rdmsr(MSR_AMD64_IBSOPRIP, low, high); @@ -273,10 +264,30 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } - /* See op_model_ppro.c */ return 1; } +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + + op_amd_handle_ibs(regs, msrs); + + /* See op_model_ppro.c */ + return 1; +} static void op_amd_start(struct op_msrs const * const msrs) { -- cgit v1.2.2 From 7d77f2dcae37cf232950cd0181fb0a2cddb18130 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:57 +0200 Subject: OProfile: change IBS interrupt initialization Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 84 ++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index a2c8e2e372bb..90193b1538a0 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -356,9 +356,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +static u8 ibs_eilvt_off; + static inline void apic_init_ibs_nmi_per_cpu(void *arg) { - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); } static inline void apic_clear_ibs_nmi_per_cpu(void *arg) @@ -366,45 +368,67 @@ static inline void apic_clear_ibs_nmi_per_cpu(void *arg) setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); } +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 0, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. */ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + /* * initialize the APIC for the IBS interrupts - * if needed on AMD Family10h rev B0 and later + * if available (AMD Family10h rev B0 and later) */ static void setup_ibs(void) { - struct pci_dev *gh_device = NULL; - u32 low, high; - u8 vector; - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); if (!ibs_allowed) return; - /* This gets the APIC_EILVT_LVTOFF_IBS value */ - vector = setup_APIC_eilvt_ibs(0, 0, 1); - - /*see if the IBS control register is already set correctly*/ - /*remove this when we know for sure it is done - in the kernel init*/ - rdmsr(MSR_AMD64_IBSCTL, low, high); - if ((low & (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) != - (IBS_CTL_LVT_OFFSET_VALID_BIT | vector)) { - - /**** Be sure to run loop until NULL is returned to - decrement reference count on any pci_dev structures - returned ****/ - while ((gh_device = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, gh_device)) - != NULL) { - /* This code may change if we can find a proper - * way to get at the PCI extended config space */ - pci_write_config_dword( - gh_device, IBS_LVT_OFFSET_PCI, - (vector | IBS_CTL_LVT_OFFSET_VALID_BIT)); - } - } - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1, 1); + if (pfm_amd64_setup_eilvt()) + ibs_allowed = 0; } -- cgit v1.2.2 From 90645700ef8393cd9cdc02d83da36726fc88f49e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:58 +0200 Subject: OProfile: Fix build error in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 90193b1538a0..284c45661ed7 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -73,6 +73,11 @@ /* MSR to set the IBS control register APIC LVT offset */ #define IBS_LVT_OFFSET_PCI 0x1CC +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. */ +extern void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code); + struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ unsigned int ibs_fetch_lin_addr_low; -- cgit v1.2.2 From ebb535de267386f659e0348185b1e361dbba3b59 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:08:59 +0200 Subject: OProfile: on_each_cpu(): kill unused retry parameter Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 284c45661ed7..ce732362dbb0 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -382,7 +382,7 @@ static int pfm_amd64_setup_eilvt(void) u32 value = 0; /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 0, 1); + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); nodes = 0; cpu_cfg = NULL; @@ -443,7 +443,7 @@ static void setup_ibs(void) static void clear_ibs_nmi(void) { if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1, 1); + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } static void setup_ibs_files(struct super_block *sb, struct dentry *root) -- cgit v1.2.2 From fc2bd7345b4e006a34c2ea3711d8c6b83cba50f7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:00 +0200 Subject: OProfile: fix setup_ibs_files() function interface Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index ce732362dbb0..2650b12a0c56 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -446,13 +446,13 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } -static void setup_ibs_files(struct super_block *sb, struct dentry *root) +static int setup_ibs_files(struct super_block * sb, struct dentry * root) { char buf[12]; struct dentry *dir; if (!ibs_allowed) - return; + return 0; /* setup some reasonable defaults */ ibs_config.max_cnt_fetch = 250000; @@ -476,6 +476,8 @@ static void setup_ibs_files(struct super_block *sb, struct dentry *root) &ibs_config.max_cnt_op); oprofilefs_create_ulong(sb, dir, "dispatched_ops", &ibs_config.dispatched_ops); + + return 0; } static int op_amd_init(struct oprofile_operations *ops) -- cgit v1.2.2 From 270d3e1a10e6ef85d5a085377e01d91dbcbe3726 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:01 +0200 Subject: OProfile: enable IBS for AMD CPUs Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 14 ++++++++------ arch/x86/oprofile/op_model_athlon.c | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index b29819313f2b..287513a09819 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -468,6 +468,14 @@ int __init op_nmi_init(struct oprofile_operations *ops) return -ENODEV; } + /* default values, can be overwritten by model */ + ops->create_files = nmi_create_files; + ops->setup = nmi_setup; + ops->shutdown = nmi_shutdown; + ops->start = nmi_start; + ops->stop = nmi_stop; + ops->cpu_type = cpu_type; + if (model->init) ret = model->init(ops); if (ret) @@ -475,12 +483,6 @@ int __init op_nmi_init(struct oprofile_operations *ops) init_sysfs(); using_nmi = 1; - ops->create_files = nmi_create_files; - ops->setup = nmi_setup; - ops->shutdown = nmi_shutdown; - ops->start = nmi_start; - ops->stop = nmi_stop; - ops->cpu_type = cpu_type; printk(KERN_INFO "oprofile: using NMI interrupt.\n"); return 0; } diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 2650b12a0c56..0d8390319797 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -446,13 +446,25 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } +static int (*create_arch_files)(struct super_block * sb, struct dentry * root); + static int setup_ibs_files(struct super_block * sb, struct dentry * root) { char buf[12]; struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; if (!ibs_allowed) - return 0; + return ret; + + /* model specific files */ /* setup some reasonable defaults */ ibs_config.max_cnt_fetch = 250000; @@ -482,11 +494,15 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) static int op_amd_init(struct oprofile_operations *ops) { + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; return 0; } static void op_amd_exit(void) { + clear_ibs_nmi(); } struct op_x86_model_spec const op_amd_spec = { -- cgit v1.2.2 From a4c408a41167949f820e2740e56a8f2f7bb6177c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:02 +0200 Subject: OProfile: fix IBS build error for UP Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 0d8390319797..1acb067bd343 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -361,6 +361,26 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } +#ifndef CONFIG_SMP + +/* no IBS support */ + +static void setup_ibs(void) +{ + ibs_allowed = 0; +} + +static void clear_ibs_nmi(void) {} + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + static u8 ibs_eilvt_off; static inline void apic_init_ibs_nmi_per_cpu(void *arg) @@ -505,6 +525,8 @@ static void op_amd_exit(void) clear_ibs_nmi(); } +#endif + struct op_x86_model_spec const op_amd_spec = { .init = op_amd_init, .exit = op_amd_exit, -- cgit v1.2.2 From 87f0baccc2e4f194c931186d3c8499314494a484 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:03 +0200 Subject: x86/oprofile: macro definition cleanup in op_model_athlon.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 46 ++++++++++++++----------------------- 1 file changed, 17 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 1acb067bd343..a3a2058c372c 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -47,32 +47,20 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) -#define IBS_FETCH_CTL_HIGH_MASK 0xFFFFFFFF -/* high dword bit IbsFetchCtl[bit 49] */ -#define IBS_FETCH_VALID_BIT (1UL << 17) -/* high dword bit IbsFetchCtl[bit 52] */ -#define IBS_FETCH_PHY_ADDR_VALID_BIT (1UL << 20) -/* high dword bit IbsFetchCtl[bit 48] */ -#define IBS_FETCH_ENABLE (1UL << 16) +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ -#define IBS_FETCH_CTL_CNT_MASK 0x00000000FFFF0000UL -#define IBS_FETCH_CTL_MAX_CNT_MASK 0x000000000000FFFFUL - -/*IbsOpCtl masks/bits */ -#define IBS_OP_VALID_BIT (1ULL<<18) /* IbsOpCtl[bit18] */ -#define IBS_OP_ENABLE (1ULL<<17) /* IBS_OP_ENABLE[bit17]*/ +/*IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ /* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ #define IBS_FETCH_BEGIN 3 #define IBS_OP_BEGIN 4 -/*IbsOpData3 masks */ -#define IBS_CTL_LVT_OFFSET_VALID_BIT (1ULL<<8) - -/*PCI Extended Configuration Constants */ -/* MSR to set the IBS control register APIC LVT offset */ -#define IBS_LVT_OFFSET_PCI 0x1CC - /* The function interface needs to be fixed, something like add data. Should then be added to linux/oprofile.h. */ extern void oprofile_add_ibs_sample(struct pt_regs *const regs, @@ -213,7 +201,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_VALID_BIT) { + if (high & IBS_FETCH_HIGH_VALID_BIT) { ibs_fetch.ibs_fetch_ctl_high = high; ibs_fetch.ibs_fetch_ctl_low = low; rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); @@ -229,16 +217,16 @@ op_amd_handle_ibs(struct pt_regs * const regs, /*reenable the IRQ */ rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - high &= ~(IBS_FETCH_VALID_BIT); - high |= IBS_FETCH_ENABLE; - low &= IBS_FETCH_CTL_MAX_CNT_MASK; + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } } if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_VALID_BIT) { + if (low & IBS_OP_LOW_VALID_BIT) { rdmsr(MSR_AMD64_IBSOPRIP, low, high); ibs_op.ibs_op_rip_low = low; ibs_op.ibs_op_rip_high = high; @@ -263,8 +251,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_op, IBS_OP_BEGIN); rdmsr(MSR_AMD64_IBSOPCTL, low, high); - low &= ~(IBS_OP_VALID_BIT); - low |= IBS_OP_ENABLE; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } } @@ -307,12 +295,12 @@ static void op_amd_start(struct op_msrs const * const msrs) } if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_ENABLE; + high = IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_ENABLE; + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } -- cgit v1.2.2 From 543a157bbdfae8eb997506031c3b2d4d17957098 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:04 +0200 Subject: x86/oprofile: op_model_athlon.c: fix counter reset when reenabling IBS OP Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index a3a2058c372c..9c8c8c583132 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -251,6 +251,7 @@ op_amd_handle_ibs(struct pt_regs * const regs, (unsigned int *)&ibs_op, IBS_OP_BEGIN); rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; low &= ~IBS_OP_LOW_VALID_BIT; low |= IBS_OP_LOW_ENABLE; wrmsr(MSR_AMD64_IBSOPCTL, low, high); -- cgit v1.2.2 From 09691616850b3614dfb44790e1e1419b6a7f5d13 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:05 +0200 Subject: x86: apic: export symbols for extended interrupt LVT functions This patch adds EXPORT_SYMBOLs to allow OProfile to be built as module. Cc: Arjan van de Ven Signed-off-by: Robert Richter Cc: oprofile-list Cc: Arjan van de Ven Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 1 + arch/x86/kernel/apic_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index fad94b0decc1..ed3a6d7e9de7 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -672,6 +672,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 42bf69f8bc8e..45ec90e37b93 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -232,6 +232,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } +EXPORT_SYMBOL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now -- cgit v1.2.2 From 6aa360e6c16c145edf1837690e0f7aaea6b86ef3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 23 Jul 2008 15:28:14 +0200 Subject: x86: apic: changing export symbols to *_GPL This fits better here. Signed-off-by: Robert Richter Cc: Arjan van de Ven Cc: Barry Kasindorf Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index ed3a6d7e9de7..0059e7a8a9e6 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -672,7 +672,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } -EXPORT_SYMBOL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Local APIC start and shutdown diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 45ec90e37b93..e571351f2a93 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -232,7 +232,7 @@ u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); return APIC_EILVT_LVTOFF_IBS; } -EXPORT_SYMBOL(setup_APIC_eilvt_ibs); +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); /* * Program the next event, relative to now -- cgit v1.2.2 From 852402cc27bfa1200164e9e8dc7f6e5f0a4fbd46 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:06 +0200 Subject: x86/oprofile: add CONFIG_OPROFILE_IBS option Signed-off-by: Robert Richter Cc: oprofile-list Cc: Robert Richter Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_athlon.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c index 9c8c8c583132..fb6015c1132b 100644 --- a/arch/x86/oprofile/op_model_athlon.c +++ b/arch/x86/oprofile/op_model_athlon.c @@ -47,6 +47,10 @@ #define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) +static unsigned long reset_value[NUM_COUNTERS]; + +#ifdef CONFIG_OPROFILE_IBS + /* IbsFetchCtl bits/masks */ #define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ #define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ @@ -104,7 +108,6 @@ struct ibs_op_sample { */ static void clear_ibs_nmi(void); -static unsigned long reset_value[NUM_COUNTERS]; static int ibs_allowed; /* AMD Family10h and later */ struct op_ibs_config { @@ -118,6 +121,8 @@ struct op_ibs_config { static struct op_ibs_config ibs_config; +#endif + /* functions for op_amd_spec */ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) @@ -188,6 +193,8 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } } +#ifdef CONFIG_OPROFILE_IBS + static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) @@ -261,6 +268,8 @@ op_amd_handle_ibs(struct pt_regs * const regs, return 1; } +#endif + static int op_amd_check_ctrs(struct pt_regs * const regs, struct op_msrs const * const msrs) { @@ -277,7 +286,9 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, } } +#ifdef CONFIG_OPROFILE_IBS op_amd_handle_ibs(regs, msrs); +#endif /* See op_model_ppro.c */ return 1; @@ -294,6 +305,8 @@ static void op_amd_start(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } } + +#ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; high = IBS_FETCH_HIGH_ENABLE; @@ -305,6 +318,7 @@ static void op_amd_start(struct op_msrs const * const msrs) high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } +#endif } @@ -323,6 +337,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) CTRL_WRITE(low, high, msrs, i); } +#ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = 0; /* clear max count and enable */ high = 0; @@ -334,6 +349,7 @@ static void op_amd_stop(struct op_msrs const * const msrs) high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } +#endif } static void op_amd_shutdown(struct op_msrs const * const msrs) @@ -350,17 +366,10 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) } } -#ifndef CONFIG_SMP +#ifndef CONFIG_OPROFILE_IBS /* no IBS support */ -static void setup_ibs(void) -{ - ibs_allowed = 0; -} - -static void clear_ibs_nmi(void) {} - static int op_amd_init(struct oprofile_operations *ops) { return 0; @@ -441,8 +450,12 @@ static void setup_ibs(void) if (!ibs_allowed) return; - if (pfm_amd64_setup_eilvt()) + if (pfm_amd64_setup_eilvt()) { ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); } -- cgit v1.2.2 From 6852fd9b86d05063c6ef49d2e12e061cc7f6a105 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 22 Jul 2008 21:09:08 +0200 Subject: x86/oprofile: reanaming op_model_athlon.c to op_model_amd.c Signed-off-by: Robert Richter Cc: oprofile-list Cc: Barry Kasindorf Signed-off-by: Ingo Molnar --- arch/x86/oprofile/Makefile | 2 +- arch/x86/oprofile/op_model_amd.c | 543 ++++++++++++++++++++++++++++++++++++ arch/x86/oprofile/op_model_athlon.c | 543 ------------------------------------ 3 files changed, 544 insertions(+), 544 deletions(-) create mode 100644 arch/x86/oprofile/op_model_amd.c delete mode 100644 arch/x86/oprofile/op_model_athlon.c (limited to 'arch/x86') diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 30f3eb366667..446902b2a6b6 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -7,6 +7,6 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o -oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_athlon.o \ +oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c new file mode 100644 index 000000000000..d9faf607b3a6 --- /dev/null +++ b/arch/x86/oprofile/op_model_amd.c @@ -0,0 +1,543 @@ +/* + * @file op_model_amd.c + * athlon / K7 / K8 / Family 10h model-specific MSR operations + * + * @remark Copyright 2002-2008 OProfile authors + * @remark Read the file COPYING + * + * @author John Levon + * @author Philippe Elie + * @author Graydon Hoare + * @author Robert Richter + * @author Barry Kasindorf +*/ + +#include +#include +#include + +#include +#include +#include + +#include "op_x86_model.h" +#include "op_counter.h" + +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 + +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) +#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) + +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) +#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) +#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) +#define CTRL_CLEAR_LO(x) (x &= (1<<21)) +#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) +#define CTRL_SET_ENABLE(val) (val |= 1<<20) +#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) +#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) +#define CTRL_SET_UM(val, m) (val |= (m << 8)) +#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) +#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) +#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) +#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) + +static unsigned long reset_value[NUM_COUNTERS]; + +#ifdef CONFIG_OPROFILE_IBS + +/* IbsFetchCtl bits/masks */ +#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ +#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ +#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ + +/*IbsOpCtl bits */ +#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ +#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ + +/* Codes used in cpu_buffer.c */ +/* This produces duplicate code, need to be fixed */ +#define IBS_FETCH_BEGIN 3 +#define IBS_OP_BEGIN 4 + +/* The function interface needs to be fixed, something like add + data. Should then be added to linux/oprofile.h. */ +extern void oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, u8 code); + +struct ibs_fetch_sample { + /* MSRC001_1031 IBS Fetch Linear Address Register */ + unsigned int ibs_fetch_lin_addr_low; + unsigned int ibs_fetch_lin_addr_high; + /* MSRC001_1030 IBS Fetch Control Register */ + unsigned int ibs_fetch_ctl_low; + unsigned int ibs_fetch_ctl_high; + /* MSRC001_1032 IBS Fetch Physical Address Register */ + unsigned int ibs_fetch_phys_addr_low; + unsigned int ibs_fetch_phys_addr_high; +}; + +struct ibs_op_sample { + /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ + unsigned int ibs_op_rip_low; + unsigned int ibs_op_rip_high; + /* MSRC001_1035 IBS Op Data Register */ + unsigned int ibs_op_data1_low; + unsigned int ibs_op_data1_high; + /* MSRC001_1036 IBS Op Data 2 Register */ + unsigned int ibs_op_data2_low; + unsigned int ibs_op_data2_high; + /* MSRC001_1037 IBS Op Data 3 Register */ + unsigned int ibs_op_data3_low; + unsigned int ibs_op_data3_high; + /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ + unsigned int ibs_dc_linear_low; + unsigned int ibs_dc_linear_high; + /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ + unsigned int ibs_dc_phys_low; + unsigned int ibs_dc_phys_high; +}; + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ +*/ +static void clear_ibs_nmi(void); + +static int ibs_allowed; /* AMD Family10h and later */ + +struct op_ibs_config { + unsigned long op_enabled; + unsigned long fetch_enabled; + unsigned long max_cnt_fetch; + unsigned long max_cnt_op; + unsigned long rand_en; + unsigned long dispatched_ops; +}; + +static struct op_ibs_config ibs_config; + +#endif + +/* functions for op_amd_spec */ + +static void op_amd_fill_in_addresses(struct op_msrs * const msrs) +{ + int i; + + for (i = 0; i < NUM_COUNTERS; i++) { + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + else + msrs->counters[i].addr = 0; + } + + for (i = 0; i < NUM_CONTROLS; i++) { + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + else + msrs->controls[i].addr = 0; + } +} + + +static void op_amd_setup_ctrs(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* clear all counters */ + for (i = 0 ; i < NUM_CONTROLS; ++i) { + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_WRITE(low, high, msrs, i); + } + + /* avoid a false detection of ctr overflows in NMI handler */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if (unlikely(!CTR_IS_RESERVED(msrs, i))) + continue; + CTR_WRITE(1, msrs, i); + } + + /* enable active counters */ + for (i = 0; i < NUM_COUNTERS; ++i) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); + + CTRL_READ(low, high, msrs, i); + CTRL_CLEAR_LO(low); + CTRL_CLEAR_HI(high); + CTRL_SET_ENABLE(low); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_HOST_ONLY(high, 0); + CTRL_SET_GUEST_ONLY(high, 0); + + CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; + } + } +} + +#ifdef CONFIG_OPROFILE_IBS + +static inline int +op_amd_handle_ibs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + struct ibs_fetch_sample ibs_fetch; + struct ibs_op_sample ibs_op; + + if (!ibs_allowed) + return 1; + + if (ibs_config.fetch_enabled) { + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + if (high & IBS_FETCH_HIGH_VALID_BIT) { + ibs_fetch.ibs_fetch_ctl_high = high; + ibs_fetch.ibs_fetch_ctl_low = low; + rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); + ibs_fetch.ibs_fetch_lin_addr_high = high; + ibs_fetch.ibs_fetch_lin_addr_low = low; + rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); + ibs_fetch.ibs_fetch_phys_addr_high = high; + ibs_fetch.ibs_fetch_phys_addr_low = low; + + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_fetch, + IBS_FETCH_BEGIN); + + /*reenable the IRQ */ + rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); + high &= ~IBS_FETCH_HIGH_VALID_BIT; + high |= IBS_FETCH_HIGH_ENABLE; + low &= IBS_FETCH_LOW_MAX_CNT_MASK; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + } + + if (ibs_config.op_enabled) { + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + if (low & IBS_OP_LOW_VALID_BIT) { + rdmsr(MSR_AMD64_IBSOPRIP, low, high); + ibs_op.ibs_op_rip_low = low; + ibs_op.ibs_op_rip_high = high; + rdmsr(MSR_AMD64_IBSOPDATA, low, high); + ibs_op.ibs_op_data1_low = low; + ibs_op.ibs_op_data1_high = high; + rdmsr(MSR_AMD64_IBSOPDATA2, low, high); + ibs_op.ibs_op_data2_low = low; + ibs_op.ibs_op_data2_high = high; + rdmsr(MSR_AMD64_IBSOPDATA3, low, high); + ibs_op.ibs_op_data3_low = low; + ibs_op.ibs_op_data3_high = high; + rdmsr(MSR_AMD64_IBSDCLINAD, low, high); + ibs_op.ibs_dc_linear_low = low; + ibs_op.ibs_dc_linear_high = high; + rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); + ibs_op.ibs_dc_phys_low = low; + ibs_op.ibs_dc_phys_high = high; + + /* reenable the IRQ */ + oprofile_add_ibs_sample(regs, + (unsigned int *)&ibs_op, + IBS_OP_BEGIN); + rdmsr(MSR_AMD64_IBSOPCTL, low, high); + high = 0; + low &= ~IBS_OP_LOW_VALID_BIT; + low |= IBS_OP_LOW_ENABLE; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } + } + + return 1; +} + +#endif + +static int op_amd_check_ctrs(struct pt_regs * const regs, + struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) + continue; + CTR_READ(low, high, msrs, i); + if (CTR_OVERFLOWED(low)) { + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + op_amd_handle_ibs(regs, msrs); +#endif + + /* See op_model_ppro.c */ + return 1; +} + +static void op_amd_start(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { + CTRL_READ(low, high, msrs, i); + CTRL_SET_ACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; + high = IBS_FETCH_HIGH_ENABLE; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + + +static void op_amd_stop(struct op_msrs const * const msrs) +{ + unsigned int low, high; + int i; + + /* Subtle: stop on all counters to avoid race with + * setting our pm callback */ + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) + continue; + CTRL_READ(low, high, msrs, i); + CTRL_SET_INACTIVE(low); + CTRL_WRITE(low, high, msrs, i); + } + +#ifdef CONFIG_OPROFILE_IBS + if (ibs_allowed && ibs_config.fetch_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); + } + + if (ibs_allowed && ibs_config.op_enabled) { + low = 0; /* clear max count and enable */ + high = 0; + wrmsr(MSR_AMD64_IBSOPCTL, low, high); + } +#endif +} + +static void op_amd_shutdown(struct op_msrs const * const msrs) +{ + int i; + + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (CTR_IS_RESERVED(msrs, i)) + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + } + for (i = 0 ; i < NUM_CONTROLS ; ++i) { + if (CTRL_IS_RESERVED(msrs, i)) + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } +} + +#ifndef CONFIG_OPROFILE_IBS + +/* no IBS support */ + +static int op_amd_init(struct oprofile_operations *ops) +{ + return 0; +} + +static void op_amd_exit(void) {} + +#else + +static u8 ibs_eilvt_off; + +static inline void apic_init_ibs_nmi_per_cpu(void *arg) +{ + ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); +} + +static inline void apic_clear_ibs_nmi_per_cpu(void *arg) +{ + setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); +} + +static int pfm_amd64_setup_eilvt(void) +{ +#define IBSCTL_LVTOFFSETVAL (1 << 8) +#define IBSCTL 0x1cc + struct pci_dev *cpu_cfg; + int nodes; + u32 value = 0; + + /* per CPU setup */ + on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); + + nodes = 0; + cpu_cfg = NULL; + do { + cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, + PCI_DEVICE_ID_AMD_10H_NB_MISC, + cpu_cfg); + if (!cpu_cfg) + break; + ++nodes; + pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off + | IBSCTL_LVTOFFSETVAL); + pci_read_config_dword(cpu_cfg, IBSCTL, &value); + if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + printk(KERN_DEBUG "Failed to setup IBS LVT offset, " + "IBSCTL = 0x%08x", value); + return 1; + } + } while (1); + + if (!nodes) { + printk(KERN_DEBUG "No CPU node configured for IBS"); + return 1; + } + +#ifdef CONFIG_NUMA + /* Sanity check */ + /* Works only for 64bit with proper numa implementation. */ + if (nodes != num_possible_nodes()) { + printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " + "found: %d, expected %d", + nodes, num_possible_nodes()); + return 1; + } +#endif + return 0; +} + +/* + * initialize the APIC for the IBS interrupts + * if available (AMD Family10h rev B0 and later) + */ +static void setup_ibs(void) +{ + ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + + if (!ibs_allowed) + return; + + if (pfm_amd64_setup_eilvt()) { + ibs_allowed = 0; + return; + } + + printk(KERN_INFO "oprofile: AMD IBS detected\n"); +} + + +/* + * unitialize the APIC for the IBS interrupts if needed on AMD Family10h + * rev B0 and later */ +static void clear_ibs_nmi(void) +{ + if (ibs_allowed) + on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); +} + +static int (*create_arch_files)(struct super_block * sb, struct dentry * root); + +static int setup_ibs_files(struct super_block * sb, struct dentry * root) +{ + char buf[12]; + struct dentry *dir; + int ret = 0; + + /* architecture specific files */ + if (create_arch_files) + ret = create_arch_files(sb, root); + + if (ret) + return ret; + + if (!ibs_allowed) + return ret; + + /* model specific files */ + + /* setup some reasonable defaults */ + ibs_config.max_cnt_fetch = 250000; + ibs_config.fetch_enabled = 0; + ibs_config.max_cnt_op = 250000; + ibs_config.op_enabled = 0; + ibs_config.dispatched_ops = 1; + snprintf(buf, sizeof(buf), "ibs_fetch"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.fetch_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_fetch); + snprintf(buf, sizeof(buf), "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, buf); + oprofilefs_create_ulong(sb, dir, "enable", + &ibs_config.op_enabled); + oprofilefs_create_ulong(sb, dir, "max_count", + &ibs_config.max_cnt_op); + oprofilefs_create_ulong(sb, dir, "dispatched_ops", + &ibs_config.dispatched_ops); + + return 0; +} + +static int op_amd_init(struct oprofile_operations *ops) +{ + setup_ibs(); + create_arch_files = ops->create_files; + ops->create_files = setup_ibs_files; + return 0; +} + +static void op_amd_exit(void) +{ + clear_ibs_nmi(); +} + +#endif + +struct op_x86_model_spec const op_amd_spec = { + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown +}; diff --git a/arch/x86/oprofile/op_model_athlon.c b/arch/x86/oprofile/op_model_athlon.c deleted file mode 100644 index fb6015c1132b..000000000000 --- a/arch/x86/oprofile/op_model_athlon.c +++ /dev/null @@ -1,543 +0,0 @@ -/* - * @file op_model_athlon.c - * athlon / K7 / K8 / Family 10h model-specific MSR operations - * - * @remark Copyright 2002-2008 OProfile authors - * @remark Read the file COPYING - * - * @author John Levon - * @author Philippe Elie - * @author Graydon Hoare - * @author Robert Richter - * @author Barry Kasindorf -*/ - -#include -#include -#include - -#include -#include -#include - -#include "op_x86_model.h" -#include "op_counter.h" - -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 - -#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_WRITE(l, msrs, c) do {wrmsr(msrs->counters[(c)].addr, -(unsigned int)(l), -1); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) - -#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTRL_READ(l, h, msrs, c) do {rdmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_WRITE(l, h, msrs, c) do {wrmsr(msrs->controls[(c)].addr, (l), (h)); } while (0) -#define CTRL_SET_ACTIVE(n) (n |= (1<<22)) -#define CTRL_SET_INACTIVE(n) (n &= ~(1<<22)) -#define CTRL_CLEAR_LO(x) (x &= (1<<21)) -#define CTRL_CLEAR_HI(x) (x &= 0xfffffcf0) -#define CTRL_SET_ENABLE(val) (val |= 1<<20) -#define CTRL_SET_USR(val, u) (val |= ((u & 1) << 16)) -#define CTRL_SET_KERN(val, k) (val |= ((k & 1) << 17)) -#define CTRL_SET_UM(val, m) (val |= (m << 8)) -#define CTRL_SET_EVENT_LOW(val, e) (val |= (e & 0xff)) -#define CTRL_SET_EVENT_HIGH(val, e) (val |= ((e >> 8) & 0xf)) -#define CTRL_SET_HOST_ONLY(val, h) (val |= ((h & 1) << 9)) -#define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) - -static unsigned long reset_value[NUM_COUNTERS]; - -#ifdef CONFIG_OPROFILE_IBS - -/* IbsFetchCtl bits/masks */ -#define IBS_FETCH_HIGH_VALID_BIT (1UL << 17) /* bit 49 */ -#define IBS_FETCH_HIGH_ENABLE (1UL << 16) /* bit 48 */ -#define IBS_FETCH_LOW_MAX_CNT_MASK 0x0000FFFFUL /* MaxCnt mask */ - -/*IbsOpCtl bits */ -#define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ -#define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ - -/* Codes used in cpu_buffer.c */ -/* This produces duplicate code, need to be fixed */ -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 - -/* The function interface needs to be fixed, something like add - data. Should then be added to linux/oprofile.h. */ -extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, u8 code); - -struct ibs_fetch_sample { - /* MSRC001_1031 IBS Fetch Linear Address Register */ - unsigned int ibs_fetch_lin_addr_low; - unsigned int ibs_fetch_lin_addr_high; - /* MSRC001_1030 IBS Fetch Control Register */ - unsigned int ibs_fetch_ctl_low; - unsigned int ibs_fetch_ctl_high; - /* MSRC001_1032 IBS Fetch Physical Address Register */ - unsigned int ibs_fetch_phys_addr_low; - unsigned int ibs_fetch_phys_addr_high; -}; - -struct ibs_op_sample { - /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ - unsigned int ibs_op_rip_low; - unsigned int ibs_op_rip_high; - /* MSRC001_1035 IBS Op Data Register */ - unsigned int ibs_op_data1_low; - unsigned int ibs_op_data1_high; - /* MSRC001_1036 IBS Op Data 2 Register */ - unsigned int ibs_op_data2_low; - unsigned int ibs_op_data2_high; - /* MSRC001_1037 IBS Op Data 3 Register */ - unsigned int ibs_op_data3_low; - unsigned int ibs_op_data3_high; - /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ - unsigned int ibs_dc_linear_low; - unsigned int ibs_dc_linear_high; - /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ - unsigned int ibs_dc_phys_low; - unsigned int ibs_dc_phys_high; -}; - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h+ -*/ -static void clear_ibs_nmi(void); - -static int ibs_allowed; /* AMD Family10h and later */ - -struct op_ibs_config { - unsigned long op_enabled; - unsigned long fetch_enabled; - unsigned long max_cnt_fetch; - unsigned long max_cnt_op; - unsigned long rand_en; - unsigned long dispatched_ops; -}; - -static struct op_ibs_config ibs_config; - -#endif - -/* functions for op_amd_spec */ - -static void op_amd_fill_in_addresses(struct op_msrs * const msrs) -{ - int i; - - for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; - else - msrs->counters[i].addr = 0; - } - - for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; - else - msrs->controls[i].addr = 0; - } -} - - -static void op_amd_setup_ctrs(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { - if (unlikely(!CTRL_IS_RESERVED(msrs, i))) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_WRITE(low, high, msrs, i); - } - - /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if (unlikely(!CTR_IS_RESERVED(msrs, i))) - continue; - CTR_WRITE(1, msrs, i); - } - - /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); - - CTRL_READ(low, high, msrs, i); - CTRL_CLEAR_LO(low); - CTRL_CLEAR_HI(high); - CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); - CTRL_SET_HOST_ONLY(high, 0); - CTRL_SET_GUEST_ONLY(high, 0); - - CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; - } - } -} - -#ifdef CONFIG_OPROFILE_IBS - -static inline int -op_amd_handle_ibs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - struct ibs_fetch_sample ibs_fetch; - struct ibs_op_sample ibs_op; - - if (!ibs_allowed) - return 1; - - if (ibs_config.fetch_enabled) { - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - if (high & IBS_FETCH_HIGH_VALID_BIT) { - ibs_fetch.ibs_fetch_ctl_high = high; - ibs_fetch.ibs_fetch_ctl_low = low; - rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); - ibs_fetch.ibs_fetch_lin_addr_high = high; - ibs_fetch.ibs_fetch_lin_addr_low = low; - rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); - ibs_fetch.ibs_fetch_phys_addr_high = high; - ibs_fetch.ibs_fetch_phys_addr_low = low; - - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_fetch, - IBS_FETCH_BEGIN); - - /*reenable the IRQ */ - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); - high &= ~IBS_FETCH_HIGH_VALID_BIT; - high |= IBS_FETCH_HIGH_ENABLE; - low &= IBS_FETCH_LOW_MAX_CNT_MASK; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - } - - if (ibs_config.op_enabled) { - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - if (low & IBS_OP_LOW_VALID_BIT) { - rdmsr(MSR_AMD64_IBSOPRIP, low, high); - ibs_op.ibs_op_rip_low = low; - ibs_op.ibs_op_rip_high = high; - rdmsr(MSR_AMD64_IBSOPDATA, low, high); - ibs_op.ibs_op_data1_low = low; - ibs_op.ibs_op_data1_high = high; - rdmsr(MSR_AMD64_IBSOPDATA2, low, high); - ibs_op.ibs_op_data2_low = low; - ibs_op.ibs_op_data2_high = high; - rdmsr(MSR_AMD64_IBSOPDATA3, low, high); - ibs_op.ibs_op_data3_low = low; - ibs_op.ibs_op_data3_high = high; - rdmsr(MSR_AMD64_IBSDCLINAD, low, high); - ibs_op.ibs_dc_linear_low = low; - ibs_op.ibs_dc_linear_high = high; - rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); - ibs_op.ibs_dc_phys_low = low; - ibs_op.ibs_dc_phys_high = high; - - /* reenable the IRQ */ - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_op, - IBS_OP_BEGIN); - rdmsr(MSR_AMD64_IBSOPCTL, low, high); - high = 0; - low &= ~IBS_OP_LOW_VALID_BIT; - low |= IBS_OP_LOW_ENABLE; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } - } - - return 1; -} - -#endif - -static int op_amd_check_ctrs(struct pt_regs * const regs, - struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) - continue; - CTR_READ(low, high, msrs, i); - if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); - } - } - -#ifdef CONFIG_OPROFILE_IBS - op_amd_handle_ibs(regs, msrs); -#endif - - /* See op_model_ppro.c */ - return 1; -} - -static void op_amd_start(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { - CTRL_READ(low, high, msrs, i); - CTRL_SET_ACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - } - -#ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { - low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_HIGH_ENABLE; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif -} - - -static void op_amd_stop(struct op_msrs const * const msrs) -{ - unsigned int low, high; - int i; - - /* Subtle: stop on all counters to avoid race with - * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) - continue; - CTRL_READ(low, high, msrs, i); - CTRL_SET_INACTIVE(low); - CTRL_WRITE(low, high, msrs, i); - } - -#ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { - low = 0; /* clear max count and enable */ - high = 0; - wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); - } - - if (ibs_allowed && ibs_config.op_enabled) { - low = 0; /* clear max count and enable */ - high = 0; - wrmsr(MSR_AMD64_IBSOPCTL, low, high); - } -#endif -} - -static void op_amd_shutdown(struct op_msrs const * const msrs) -{ - int i; - - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (CTR_IS_RESERVED(msrs, i)) - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { - if (CTRL_IS_RESERVED(msrs, i)) - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } -} - -#ifndef CONFIG_OPROFILE_IBS - -/* no IBS support */ - -static int op_amd_init(struct oprofile_operations *ops) -{ - return 0; -} - -static void op_amd_exit(void) {} - -#else - -static u8 ibs_eilvt_off; - -static inline void apic_init_ibs_nmi_per_cpu(void *arg) -{ - ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); -} - -static inline void apic_clear_ibs_nmi_per_cpu(void *arg) -{ - setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); -} - -static int pfm_amd64_setup_eilvt(void) -{ -#define IBSCTL_LVTOFFSETVAL (1 << 8) -#define IBSCTL 0x1cc - struct pci_dev *cpu_cfg; - int nodes; - u32 value = 0; - - /* per CPU setup */ - on_each_cpu(apic_init_ibs_nmi_per_cpu, NULL, 1); - - nodes = 0; - cpu_cfg = NULL; - do { - cpu_cfg = pci_get_device(PCI_VENDOR_ID_AMD, - PCI_DEVICE_ID_AMD_10H_NB_MISC, - cpu_cfg); - if (!cpu_cfg) - break; - ++nodes; - pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off - | IBSCTL_LVTOFFSETVAL); - pci_read_config_dword(cpu_cfg, IBSCTL, &value); - if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { - printk(KERN_DEBUG "Failed to setup IBS LVT offset, " - "IBSCTL = 0x%08x", value); - return 1; - } - } while (1); - - if (!nodes) { - printk(KERN_DEBUG "No CPU node configured for IBS"); - return 1; - } - -#ifdef CONFIG_NUMA - /* Sanity check */ - /* Works only for 64bit with proper numa implementation. */ - if (nodes != num_possible_nodes()) { - printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, " - "found: %d, expected %d", - nodes, num_possible_nodes()); - return 1; - } -#endif - return 0; -} - -/* - * initialize the APIC for the IBS interrupts - * if available (AMD Family10h rev B0 and later) - */ -static void setup_ibs(void) -{ - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); - - if (!ibs_allowed) - return; - - if (pfm_amd64_setup_eilvt()) { - ibs_allowed = 0; - return; - } - - printk(KERN_INFO "oprofile: AMD IBS detected\n"); -} - - -/* - * unitialize the APIC for the IBS interrupts if needed on AMD Family10h - * rev B0 and later */ -static void clear_ibs_nmi(void) -{ - if (ibs_allowed) - on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); -} - -static int (*create_arch_files)(struct super_block * sb, struct dentry * root); - -static int setup_ibs_files(struct super_block * sb, struct dentry * root) -{ - char buf[12]; - struct dentry *dir; - int ret = 0; - - /* architecture specific files */ - if (create_arch_files) - ret = create_arch_files(sb, root); - - if (ret) - return ret; - - if (!ibs_allowed) - return ret; - - /* model specific files */ - - /* setup some reasonable defaults */ - ibs_config.max_cnt_fetch = 250000; - ibs_config.fetch_enabled = 0; - ibs_config.max_cnt_op = 250000; - ibs_config.op_enabled = 0; - ibs_config.dispatched_ops = 1; - snprintf(buf, sizeof(buf), "ibs_fetch"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - snprintf(buf, sizeof(buf), "ibs_uops"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); - oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); - oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); - - return 0; -} - -static int op_amd_init(struct oprofile_operations *ops) -{ - setup_ibs(); - create_arch_files = ops->create_files; - ops->create_files = setup_ibs_files; - return 0; -} - -static void op_amd_exit(void) -{ - clear_ibs_nmi(); -} - -#endif - -struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown -}; -- cgit v1.2.2 From 1a960b402a51d80abf54e3f8e4972374ffe5f22d Mon Sep 17 00:00:00 2001 From: Jason Yeh Date: Wed, 23 Jul 2008 23:05:53 +0200 Subject: Oprofile Multiplexing Patch This patch introduces multiplexing support for the Oprofile kernel module. It basically adds a new function pointer in oprofile_operator allowing each architecture to supply its callback to switch between different sets of event when the timer expires. Userspace tools can modify the time slice through /dev/oprofile/time_slice. It also modifies the number of counters exposed to the userspace through /dev/oprofile. For example, the number of counters for AMD CPUs are changed to 32 and multiplexed in the sets of 4. Signed-off-by: Jason Yeh Signed-off-by: Robert Richter Cc: oprofile-list Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 100 +++++++++++++++++++++++++++++++++++--- arch/x86/oprofile/op_counter.h | 3 +- arch/x86/oprofile/op_model_amd.c | 76 +++++++++++++++++------------ arch/x86/oprofile/op_model_p4.c | 4 ++ arch/x86/oprofile/op_model_ppro.c | 2 + arch/x86/oprofile/op_x86_model.h | 3 ++ 6 files changed, 149 insertions(+), 39 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 287513a09819..2a65fe7680ab 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -23,12 +23,18 @@ #include "op_counter.h" #include "op_x86_model.h" +DEFINE_PER_CPU(int, switch_index); + static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs); +static void nmi_cpu_stop(void *dummy); +static void nmi_cpu_start(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; @@ -81,6 +87,47 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ +static void nmi_cpu_switch(void *dummy) +{ + int cpu = smp_processor_id(); + int si = per_cpu(switch_index, cpu); + struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + + nmi_cpu_stop(NULL); + nmi_cpu_save_mpx_registers(msrs); + + /* move to next set */ + si += model->num_hardware_counters; + if ((si > model->num_counters) || (counter_config[si].count == 0)) + per_cpu(switch_index, smp_processor_id()) = 0; + else + per_cpu(switch_index, smp_processor_id()) = si; + + nmi_cpu_restore_mpx_registers(msrs); + model->setup_ctrs(msrs); + nmi_cpu_start(NULL); +} + +/* + * Quick check to see if multiplexing is necessary. + * The check should be sufficient since counters are used + * in ordre. + */ +static int nmi_multiplex_on(void) +{ + return counter_config[model->num_hardware_counters].count ? 0 : -EINVAL; +} + +static int nmi_switch_event(void) +{ + if (nmi_multiplex_on() < 0) + return -EINVAL; + + on_each_cpu(nmi_cpu_switch, NULL, 0, 1); + + return 0; +} + static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -144,11 +191,10 @@ static void free_msrs(void) static int allocate_msrs(void) { - int success = 1; + int i, success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; - int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, GFP_KERNEL); @@ -156,8 +202,8 @@ static int allocate_msrs(void) success = 0; break; } - per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, - GFP_KERNEL); + per_cpu(cpu_msrs, i).controls = + kmalloc(controls_size, GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) { success = 0; break; @@ -201,7 +247,8 @@ static int nmi_setup(void) return err; } - /* We need to serialize save and setup for HT because the subset + /* + * We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ @@ -217,7 +264,6 @@ static int nmi_setup(void) per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); } - } on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); @@ -225,7 +271,41 @@ static int nmi_setup(void) return 0; } -static void nmi_restore_registers(struct op_msrs *msrs) +static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + rdmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) +{ + unsigned int si = __get_cpu_var(switch_index); + unsigned int const nr_ctrs = model->num_hardware_counters; + struct op_msr *counters = &msrs->counters[si]; + unsigned int i; + + for (i = 0; i < nr_ctrs; ++i) { + int offset = i + si; + if (counters[offset].addr) { + wrmsr(counters[offset].addr, + counters[offset].multiplex.low, + counters[offset].multiplex.high); + } + } +} + +static void nmi_cpu_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; @@ -265,7 +345,8 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_restore_registers(msrs); + nmi_cpu_restore_registers(msrs); + __get_cpu_var(switch_index) = 0; } static void nmi_shutdown(void) @@ -328,6 +409,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); + counter_config[i].save_count_low = 0; } return 0; @@ -469,12 +551,14 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ + __get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; + ops->switch_events = nmi_switch_event; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c4675..786d6e01cf7f 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,13 +10,14 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 8 +#define OP_MAX_COUNTER 32 /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { unsigned long count; + unsigned long save_count_low; unsigned long enabled; unsigned long event; unsigned long kernel; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index d9faf607b3a6..bbf2b68bcc5d 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -23,8 +24,10 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 4 -#define NUM_CONTROLS 4 +#define NUM_COUNTERS 32 +#define NUM_HARDWARE_COUNTERS 4 +#define NUM_CONTROLS 32 +#define NUM_HARDWARE_CONTROLS 4 #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) @@ -48,6 +51,7 @@ #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; +DECLARE_PER_CPU(int, switch_index); #ifdef CONFIG_OPROFILE_IBS @@ -130,15 +134,17 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_COUNTERS; i++) { - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; + int hw_counter = i % NUM_HARDWARE_COUNTERS; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter; else msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; + int hw_control = i % NUM_HARDWARE_CONTROLS; + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control; else msrs->controls[i].addr = 0; } @@ -150,8 +156,16 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; + for (i = 0; i < NUM_HARDWARE_CONTROLS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (counter_config[offset].enabled) + reset_value[offset] = counter_config[offset].count; + else + reset_value[offset] = 0; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -161,34 +175,31 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { - if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { - reset_value[i] = counter_config[i].count; - - CTR_WRITE(counter_config[i].count, msrs, i); + for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + int offset = i + __get_cpu_var(switch_index); + if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs, i))) { + CTR_WRITE(counter_config[offset].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[i].user); - CTRL_SET_KERN(low, counter_config[i].kernel); - CTRL_SET_UM(low, counter_config[i].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[i].event); - CTRL_SET_EVENT_HIGH(high, counter_config[i].event); + CTRL_SET_USR(low, counter_config[offset].user); + CTRL_SET_KERN(low, counter_config[offset].kernel); + CTRL_SET_UM(low, counter_config[offset].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[offset].event); + CTRL_SET_EVENT_HIGH(high, counter_config[offset].event); CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); CTRL_WRITE(low, high, msrs, i); - } else { - reset_value[i] = 0; } } } @@ -276,13 +287,14 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { - if (!reset_value[i]) + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (!reset_value[offset]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], msrs, i); + oprofile_add_sample(regs, offset); + CTR_WRITE(reset_value[offset], msrs, i); } } @@ -298,8 +310,10 @@ static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (reset_value[i]) { + + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + int offset = i + __get_cpu_var(switch_index); + if (reset_value[offset]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); CTRL_WRITE(low, high, msrs, i); @@ -329,8 +343,8 @@ static void op_amd_stop(struct op_msrs const * const msrs) /* Subtle: stop on all counters to avoid race with * setting our pm callback */ - for (i = 0 ; i < NUM_COUNTERS ; ++i) { - if (!reset_value[i]) + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) continue; CTRL_READ(low, high, msrs, i); CTRL_SET_INACTIVE(low); @@ -356,11 +370,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -534,6 +548,8 @@ struct op_x86_model_spec const op_amd_spec = { .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_HARDWARE_COUNTERS, + .num_hardware_controls = NUM_HARDWARE_CONTROLS, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 56b4757a1f47..e641545d4796 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -701,6 +701,8 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, + .num_hardware_counters = NUM_COUNTERS_HT2, + .num_hardware_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -713,6 +715,8 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, + .num_hardware_counters = NUM_COUNTERS_NON_HT, + .num_hardware_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57b..e5811aa480eb 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -183,6 +183,8 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, + .num_hardware_counters = NUM_COUNTERS, + .num_hardware_controls = NUM_CONTROLS, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c3..e07ba1076371 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -19,6 +19,7 @@ struct op_saved_msr { struct op_msr { unsigned long addr; struct op_saved_msr saved; + struct op_saved_msr multiplex; }; struct op_msrs { @@ -34,6 +35,8 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); + unsigned int const num_hardware_counters; + unsigned int const num_hardware_controls; unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); -- cgit v1.2.2 From 7e7b43892b87b6be259479ef4de14029dcb4012f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 24 Jul 2008 16:00:16 +0200 Subject: x86/oprofile: fix on_each_cpu build error Signed-off-by: Robert Richter Cc: oprofile-list Cc: Jason Yeh Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 2a65fe7680ab..fb4902bc6f14 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -123,7 +123,7 @@ static int nmi_switch_event(void) if (nmi_multiplex_on() < 0) return -EINVAL; - on_each_cpu(nmi_cpu_switch, NULL, 0, 1); + on_each_cpu(nmi_cpu_switch, NULL, 1); return 0; } -- cgit v1.2.2 From 1f972768a1df1518f45adb6b8ffbf04fa1c99737 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 13:52:50 +0200 Subject: x86, RDC321x: add to mach-default first step to add RDC321x support to the default PC architecture. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 26 +++++++++++--------------- arch/x86/Makefile | 5 ----- 2 files changed, 11 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e3cba0b45600..39ae67985950 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -23,7 +23,7 @@ config X86 select HAVE_OPROFILE select HAVE_IOREMAP_PROT select HAVE_KPROBES - select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X + select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE @@ -332,20 +332,6 @@ config X86_BIGSMP endif -config X86_RDC321X - bool "RDC R-321x SoC" - depends on X86_32 - select M486 - select X86_REBOOTFIXUPS - select GENERIC_GPIO - select LEDS_CLASS - select LEDS_GPIO - select NEW_LEDS - help - This option is needed for RDC R-321x system-on-chip, also known - as R-8610-(G). - If you don't have one of these chips, you should say N here. - config X86_VSMP bool "Support for ScaleMP vSMP" select PARAVIRT @@ -369,6 +355,16 @@ config X86_VISWS A kernel compiled for the Visual Workstation will run on general PCs as well. See for details. +config X86_RDC321X + bool "RDC R-321x SoC" + depends on X86_32 + select M486 + select X86_REBOOTFIXUPS + help + This option is needed for RDC R-321x system-on-chip, also known + as R-8610-(G). + If you don't have one of these chips, you should say N here. + config SCHED_NO_NO_OMIT_FRAME_POINTER def_bool y prompt "Single-depth WCHAN output" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 919ce21ea654..f5631da585b6 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -118,11 +118,6 @@ mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ -# RDC R-321x subarch support -mflags-$(CONFIG_X86_RDC321X) := -Iinclude/asm-x86/mach-rdc321x -mcore-$(CONFIG_X86_RDC321X) := arch/x86/mach-default/ -core-$(CONFIG_X86_RDC321X) += arch/x86/mach-rdc321x/ - # default subarch .h files mflags-y += -Iinclude/asm-x86/mach-default -- cgit v1.2.2 From 1ddb5518052e4e28ab489237443f7443b3fd69ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 16:48:55 +0200 Subject: x86: convert pci-dma.c from round_up to roundup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cbecb05551bb..88ddd04cfa98 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -82,7 +82,7 @@ void __init dma32_reserve_bootmem(void) * using 512M as goal */ align = 64ULL<<20; - size = round_up(dma32_bootmem_size, align); + size = roundup(dma32_bootmem_size, align); dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, 512ULL<<20); if (dma32_bootmem_ptr) -- cgit v1.2.2 From 15ae2d76ceb037a1e3fcd8fc9b4fe3177f9f3831 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 16:48:56 +0200 Subject: x86: convert pageattr.c from round_up to roundup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf059..0d254adcc82d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -78,7 +78,7 @@ static inline unsigned long highmap_start_pfn(void) static inline unsigned long highmap_end_pfn(void) { - return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa(roundup((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif -- cgit v1.2.2 From d86bb0dac792c6a9c92944b6db2687980c808094 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 16:48:57 +0200 Subject: x86: convert init_64.c from round_up to roundup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec37121f6709..e4805771b5be 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -258,7 +258,7 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size) void __init cleanup_highmap(void) { unsigned long vaddr = __START_KERNEL_map; - unsigned long end = round_up((unsigned long)_end, PMD_SIZE) - 1; + unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; pmd_t *pmd = level2_kernel_pgt; pmd_t *last_pmd = pmd + PTRS_PER_PMD; @@ -474,14 +474,14 @@ static void __init find_early_table_space(unsigned long end) unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; - tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); + tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); if (direct_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; } else pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; - tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); + tables += roundup(pmds * sizeof(pmd_t), PAGE_SIZE); if (cpu_has_pse) { unsigned long extra; @@ -489,7 +489,7 @@ static void __init find_early_table_space(unsigned long end) ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; } else ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT; - tables += round_up(ptes * sizeof(pte_t), PAGE_SIZE); + tables += roundup(ptes * sizeof(pte_t), PAGE_SIZE); /* * RED-PEN putting page tables only on node 0 could -- cgit v1.2.2 From be3e89ee6df8607356f705901dd90bcf3836c86e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 16:48:58 +0200 Subject: x86: convert numa_64.c from round_up to roundup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/mm/numa_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a4dd793d6003..cebcbf152d46 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -79,7 +79,7 @@ static int __init allocate_cachealigned_memnodemap(void) return 0; addr = 0x8000; - nodemap_size = round_up(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); + nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES); nodemap_addr = find_e820_area(addr, max_pfn< Date: Fri, 25 Jul 2008 16:48:59 +0200 Subject: x86: convert discontig_32.c from round_up to roundup Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/mm/discontig_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c index 62fa440678d8..847c164725f4 100644 --- a/arch/x86/mm/discontig_32.c +++ b/arch/x86/mm/discontig_32.c @@ -328,7 +328,7 @@ void __init initmem_init(unsigned long start_pfn, get_memcfg_numa(); - kva_pages = round_up(calculate_numa_remap_pages(), PTRS_PER_PTE); + kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); do { -- cgit v1.2.2 From a8132e5fe2c4f3f780b8bd3cce7851640f79f1c7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 14:57:59 +0200 Subject: x86, AMD IOMMU: replace to_pages macro with iommu_num_pages This patch removes the to_pages macro from AMD IOMMU code and calls the generic iommu_num_pages function instead. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c25210e6ac88..79eff735fde6 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -29,9 +29,6 @@ #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EXIT_LOOP_COUNT 10000000 static DEFINE_RWLOCK(amd_iommu_devtable_lock); @@ -185,7 +182,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = to_pages(address, size); + unsigned pages = iommu_num_pages(address, size); address &= PAGE_MASK; @@ -557,8 +554,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = to_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -767,7 +764,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned int pages; int i; - pages = to_pages(paddr, size); + pages = iommu_num_pages(paddr, size); paddr &= PAGE_MASK; address = dma_ops_alloc_addresses(dev, dma_dom, pages); @@ -802,7 +799,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = to_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; -- cgit v1.2.2 From 87e39ea5714dd59ba31e36c25833d2b20255a29d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 14:58:00 +0200 Subject: x86 gart: replace to_pages macro with iommu_num_pages This patch removes the to_pages macro from x86 GART code and calls the generic iommu_num_pages function instead. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d2..c98240431561 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -67,9 +67,6 @@ static u32 gart_unmapped_entry; (((x) & 0xfffff000) | (((x) >> 32) << 4) | GPTE_VALID | GPTE_COHERENT) #define GPTE_DECODE(x) (((x) & 0xfffff000) | (((u64)(x) & 0xff0) << 28)) -#define to_pages(addr, size) \ - (round_up(((addr) & ~PAGE_MASK) + (size), PAGE_SIZE) >> PAGE_SHIFT) - #define EMERGENCY_PAGES 32 /* = 128KB */ #ifdef CONFIG_AGP @@ -241,7 +238,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir) { - unsigned long npages = to_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages); int i; @@ -304,7 +301,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = to_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -387,7 +384,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = to_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -470,7 +467,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += to_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) -- cgit v1.2.2 From 3a61ec387c9092dfc91a5959145d36835a72fc4c Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 25 Jul 2008 13:07:50 +0200 Subject: x86, AMD IOMMU: include amd_iommu_last_bdf in device initialization All the values read while searching for amd_iommu_last_bdf are defined as inclusive. Let the code handle this value as such. Found by Wei Wang. Thanks Wei. Signed-off-by: Joerg Roedel Cc: iommu@lists.linux-foundation.org Cc: bhavna.sarathy@amd.com Cc: robert.richter@amd.com Cc: Wei Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/amd_iommu_init.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c25210e6ac88..74697408576f 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -667,7 +667,7 @@ static int get_device_resources(struct device *dev, _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ - if (_bdf >= amd_iommu_last_bdf) { + if (_bdf > amd_iommu_last_bdf) { *iommu = NULL; *domain = NULL; *bdf = 0xffff; @@ -1085,7 +1085,7 @@ void prealloc_protection_domains(void) while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { devid = (dev->bus->number << 8) | dev->devfn; - if (devid >= amd_iommu_last_bdf) + if (devid > amd_iommu_last_bdf) continue; devid = amd_iommu_alias_table[devid]; if (domain_for_device(devid)) diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index c9d8ff2eb130..d9a9da597e79 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -732,7 +732,7 @@ static int __init init_exclusion_range(struct ivmd_header *m) set_device_exclusion_range(m->devid, m); break; case ACPI_IVMD_TYPE_ALL: - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) set_device_exclusion_range(i, m); break; case ACPI_IVMD_TYPE_RANGE: @@ -934,7 +934,7 @@ int __init amd_iommu_init(void) /* * let all alias entries point to itself */ - for (i = 0; i < amd_iommu_last_bdf; ++i) + for (i = 0; i <= amd_iommu_last_bdf; ++i) amd_iommu_alias_table[i] = i; /* -- cgit v1.2.2 From 5c05917e7fe313a187ad6ebb94c1c6cf42862a0b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 24 Jul 2008 17:29:40 -0700 Subject: x86: usb debug port early console, v4 based on work from Eric, and add some timeout so don't dead loop when debug device is not installed v2: fix checkpatch warning v3: move ehci struct def to linux/usrb/ehci_def.h from host/ehci.h also add CONFIG_EARLY_PRINTK_DBGP to disable it by default v4: address comments from Ingo, seperate ehci reg def moving to another patch also add auto detect port that connect to debug device for Nvidia southbridge Signed-off-by: Yinghai Lu Cc: Andrew Morton Cc: Andi Kleen Cc: "Arjan van de Ven" Cc: "Eric W. Biederman" Cc: "Greg KH" Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 13 + arch/x86/kernel/early_printk.c | 762 ++++++++++++++++++++++++++++++++++++++++- 2 files changed, 772 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 092f019e033a..93422d2ecf61 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -43,6 +43,19 @@ config EARLY_PRINTK with klogd/syslogd or the X server. You should normally N here, unless you want to debug such a crash. +config EARLY_PRINTK_DBGP + bool "Early printk via EHCI debug port" + default n + depends on EARLY_PRINTK + help + Write kernel log output directly into the EHCI debug port. + + This is useful for kernel debugging when your machine crashes very + early before the console code is initialized. For normal operation + it is not recommended because it looks ugly and doesn't cooperate + with klogd/syslogd or the X server. You should normally N here, + unless you want to debug such a crash. You need usb debug device. + config DEBUG_STACKOVERFLOW bool "Check for stack overflows" depends on DEBUG_KERNEL diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index ff9e7350da54..28155acf706e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -3,11 +3,19 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include #include +#include +#include +#include +#include /* Simple VGA output */ #define VGABASE (__ISA_IO_base + 0xb8000) @@ -78,6 +86,7 @@ static int early_serial_base = 0x3f8; /* ttyS0 */ static int early_serial_putc(unsigned char ch) { unsigned timeout = 0xffff; + while ((inb(early_serial_base + LSR) & XMTRDY) == 0 && --timeout) cpu_relax(); outb(ch, early_serial_base + TXR); @@ -151,6 +160,721 @@ static struct console early_serial_console = { .index = -1, }; +#ifdef CONFIG_EARLY_PRINTK_DBGP + +static struct ehci_caps __iomem *ehci_caps; +static struct ehci_regs __iomem *ehci_regs; +static struct ehci_dbg_port __iomem *ehci_debug; +static unsigned int dbgp_endpoint_out; + +struct ehci_dev { + u32 bus; + u32 slot; + u32 func; +}; + +static struct ehci_dev ehci_dev; + +#define USB_DEBUG_DEVNUM 127 + +#define DBGP_DATA_TOGGLE 0x8800 + +static inline u32 dbgp_pid_update(u32 x, u32 tok) +{ + return ((x ^ DBGP_DATA_TOGGLE) & 0xffff00) | (tok & 0xff); +} + +static inline u32 dbgp_len_update(u32 x, u32 len) +{ + return (x & ~0x0f) | (len & 0x0f); +} + +/* + * USB Packet IDs (PIDs) + */ + +/* token */ +#define USB_PID_OUT 0xe1 +#define USB_PID_IN 0x69 +#define USB_PID_SOF 0xa5 +#define USB_PID_SETUP 0x2d +/* handshake */ +#define USB_PID_ACK 0xd2 +#define USB_PID_NAK 0x5a +#define USB_PID_STALL 0x1e +#define USB_PID_NYET 0x96 +/* data */ +#define USB_PID_DATA0 0xc3 +#define USB_PID_DATA1 0x4b +#define USB_PID_DATA2 0x87 +#define USB_PID_MDATA 0x0f +/* Special */ +#define USB_PID_PREAMBLE 0x3c +#define USB_PID_ERR 0x3c +#define USB_PID_SPLIT 0x78 +#define USB_PID_PING 0xb4 +#define USB_PID_UNDEF_0 0xf0 + +#define USB_PID_DATA_TOGGLE 0x88 +#define DBGP_CLAIM (DBGP_OWNER | DBGP_ENABLED | DBGP_INUSE) + +#define PCI_CAP_ID_EHCI_DEBUG 0xa + +#define HUB_ROOT_RESET_TIME 50 /* times are in msec */ +#define HUB_SHORT_RESET_TIME 10 +#define HUB_LONG_RESET_TIME 200 +#define HUB_RESET_TIMEOUT 500 + +#define DBGP_MAX_PACKET 8 + +static int dbgp_wait_until_complete(void) +{ + u32 ctrl; + int loop = 0x100000; + + do { + ctrl = readl(&ehci_debug->control); + /* Stop when the transaction is finished */ + if (ctrl & DBGP_DONE) + break; + } while (--loop > 0); + + if (!loop) + return -1; + + /* + * Now that we have observed the completed transaction, + * clear the done bit. + */ + writel(ctrl | DBGP_DONE, &ehci_debug->control); + return (ctrl & DBGP_ERROR) ? -DBGP_ERRCODE(ctrl) : DBGP_LEN(ctrl); +} + +static void dbgp_mdelay(int ms) +{ + int i; + + while (ms--) { + for (i = 0; i < 1000; i++) + outb(0x1, 0x80); + } +} + +static void dbgp_breath(void) +{ + /* Sleep to give the debug port a chance to breathe */ +} + +static int dbgp_wait_until_done(unsigned ctrl) +{ + u32 pids, lpid; + int ret; + int loop = 3; + +retry: + writel(ctrl | DBGP_GO, &ehci_debug->control); + ret = dbgp_wait_until_complete(); + pids = readl(&ehci_debug->pids); + lpid = DBGP_PID_GET(pids); + + if (ret < 0) + return ret; + + /* + * If the port is getting full or it has dropped data + * start pacing ourselves, not necessary but it's friendly. + */ + if ((lpid == USB_PID_NAK) || (lpid == USB_PID_NYET)) + dbgp_breath(); + + /* If I get a NACK reissue the transmission */ + if (lpid == USB_PID_NAK) { + if (--loop > 0) + goto retry; + } + + return ret; +} + +static void dbgp_set_data(const void *buf, int size) +{ + const unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = hi = 0; + for (i = 0; i < 4 && i < size; i++) + lo |= bytes[i] << (8*i); + for (; i < 8 && i < size; i++) + hi |= bytes[i] << (8*(i - 4)); + writel(lo, &ehci_debug->data03); + writel(hi, &ehci_debug->data47); +} + +static void dbgp_get_data(void *buf, int size) +{ + unsigned char *bytes = buf; + u32 lo, hi; + int i; + + lo = readl(&ehci_debug->data03); + hi = readl(&ehci_debug->data47); + for (i = 0; i < 4 && i < size; i++) + bytes[i] = (lo >> (8*i)) & 0xff; + for (; i < 8 && i < size; i++) + bytes[i] = (hi >> (8*(i - 4))) & 0xff; +} + +static int dbgp_bulk_write(unsigned devnum, unsigned endpoint, + const char *bytes, int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_OUT); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + dbgp_set_data(bytes, size); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + return ret; +} + +static int dbgp_bulk_read(unsigned devnum, unsigned endpoint, void *data, + int size) +{ + u32 pids, addr, ctrl; + int ret; + + if (size > DBGP_MAX_PACKET) + return -1; + + addr = DBGP_EPADDR(devnum, endpoint); + + pids = readl(&ehci_debug->pids); + pids = dbgp_pid_update(pids, USB_PID_IN); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, size); + ctrl &= ~DBGP_OUT; + ctrl |= DBGP_GO; + + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + if (size > ret) + size = ret; + dbgp_get_data(data, size); + return ret; +} + +static int dbgp_control_msg(unsigned devnum, int requesttype, int request, + int value, int index, void *data, int size) +{ + u32 pids, addr, ctrl; + struct usb_ctrlrequest req; + int read; + int ret; + + read = (requesttype & USB_DIR_IN) != 0; + if (size > (read ? DBGP_MAX_PACKET:0)) + return -1; + + /* Compute the control message */ + req.bRequestType = requesttype; + req.bRequest = request; + req.wValue = value; + req.wIndex = index; + req.wLength = size; + + pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); + addr = DBGP_EPADDR(devnum, 0); + + ctrl = readl(&ehci_debug->control); + ctrl = dbgp_len_update(ctrl, sizeof(req)); + ctrl |= DBGP_OUT; + ctrl |= DBGP_GO; + + /* Send the setup message */ + dbgp_set_data(&req, sizeof(req)); + writel(addr, &ehci_debug->address); + writel(pids, &ehci_debug->pids); + ret = dbgp_wait_until_done(ctrl); + if (ret < 0) + return ret; + + /* Read the result */ + return dbgp_bulk_read(devnum, 0, data, size); +} + + +/* Find a PCI capability */ +static u32 __init find_cap(u32 num, u32 slot, u32 func, int cap) +{ + u8 pos; + int bytes; + + if (!(read_pci_config_16(num, slot, func, PCI_STATUS) & + PCI_STATUS_CAP_LIST)) + return 0; + + pos = read_pci_config_byte(num, slot, func, PCI_CAPABILITY_LIST); + for (bytes = 0; bytes < 48 && pos >= 0x40; bytes++) { + u8 id; + + pos &= ~3; + id = read_pci_config_byte(num, slot, func, pos+PCI_CAP_LIST_ID); + if (id == 0xff) + break; + if (id == cap) + return pos; + + pos = read_pci_config_byte(num, slot, func, + pos+PCI_CAP_LIST_NEXT); + } + return 0; +} + +static u32 __init __find_dbgp(u32 bus, u32 slot, u32 func) +{ + u32 class; + + class = read_pci_config(bus, slot, func, PCI_CLASS_REVISION); + if ((class >> 8) != PCI_CLASS_SERIAL_USB_EHCI) + return 0; + + return find_cap(bus, slot, func, PCI_CAP_ID_EHCI_DEBUG); +} + +static u32 __init find_dbgp(int ehci_num, u32 *rbus, u32 *rslot, u32 *rfunc) +{ + u32 bus, slot, func; + + for (bus = 0; bus < 256; bus++) { + for (slot = 0; slot < 32; slot++) { + for (func = 0; func < 8; func++) { + unsigned cap; + + cap = __find_dbgp(bus, slot, func); + + if (!cap) + continue; + if (ehci_num-- != 0) + continue; + *rbus = bus; + *rslot = slot; + *rfunc = func; + return cap; + } + } + } + return 0; +} + +static int ehci_reset_port(int port) +{ + u32 portsc; + u32 delay_time, delay; + int loop; + + /* Reset the usb debug port */ + portsc = readl(&ehci_regs->port_status[port - 1]); + portsc &= ~PORT_PE; + portsc |= PORT_RESET; + writel(portsc, &ehci_regs->port_status[port - 1]); + + delay = HUB_ROOT_RESET_TIME; + for (delay_time = 0; delay_time < HUB_RESET_TIMEOUT; + delay_time += delay) { + dbgp_mdelay(delay); + + portsc = readl(&ehci_regs->port_status[port - 1]); + if (portsc & PORT_RESET) { + /* force reset to complete */ + loop = 2; + writel(portsc & ~(PORT_RWC_BITS | PORT_RESET), + &ehci_regs->port_status[port - 1]); + do { + portsc = readl(&ehci_regs->port_status[port-1]); + } while ((portsc & PORT_RESET) && (--loop > 0)); + } + + /* Device went away? */ + if (!(portsc & PORT_CONNECT)) + return -ENOTCONN; + + /* bomb out completely if something weird happend */ + if ((portsc & PORT_CSC)) + return -EINVAL; + + /* If we've finished resetting, then break out of the loop */ + if (!(portsc & PORT_RESET) && (portsc & PORT_PE)) + return 0; + } + return -EBUSY; +} + +static int ehci_wait_for_port(int port) +{ + u32 status; + int ret, reps; + + for (reps = 0; reps < 3; reps++) { + dbgp_mdelay(100); + status = readl(&ehci_regs->status); + if (status & STS_PCD) { + ret = ehci_reset_port(port); + if (ret == 0) + return 0; + } + } + return -ENOTCONN; +} + +#ifdef DBGP_DEBUG +# define dbgp_printk early_printk +#else +static inline void dbgp_printk(const char *fmt, ...) { } +#endif + +typedef void (*set_debug_port_t)(int port); + +static void default_set_debug_port(int port) +{ +} + +static set_debug_port_t set_debug_port = default_set_debug_port; + +static void nvidia_set_debug_port(int port) +{ + u32 dword; + dword = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x74); + dword &= ~(0x0f<<12); + dword |= ((port & 0x0f)<<12); + write_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, 0x74, + dword); + dbgp_printk("set debug port to %d\n", port); +} + +static void __init detect_set_debug_port(void) +{ + u32 vendorid; + + vendorid = read_pci_config(ehci_dev.bus, ehci_dev.slot, ehci_dev.func, + 0x00); + + if ((vendorid & 0xffff) == 0x10de) { + dbgp_printk("using nvidia set_debug_port\n"); + set_debug_port = nvidia_set_debug_port; + } +} + +static int __init ehci_setup(void) +{ + struct usb_debug_descriptor dbgp_desc; + u32 cmd, ctrl, status, portsc, hcs_params; + u32 debug_port, new_debug_port = 0, n_ports; + u32 devnum; + int ret, i; + int loop; + int port_map_tried; + int playtimes = 3; + +try_next_time: + port_map_tried = 0; + +try_next_port: + + hcs_params = readl(&ehci_caps->hcs_params); + debug_port = HCS_DEBUG_PORT(hcs_params); + n_ports = HCS_N_PORTS(hcs_params); + + dbgp_printk("debug_port: %d\n", debug_port); + dbgp_printk("n_ports: %d\n", n_ports); + + for (i = 1; i <= n_ports; i++) { + portsc = readl(&ehci_regs->port_status[i-1]); + dbgp_printk("portstatus%d: %08x\n", i, portsc); + } + + if (port_map_tried && (new_debug_port != debug_port)) { + if (--playtimes) { + set_debug_port(new_debug_port); + goto try_next_time; + } + return -1; + } + + loop = 10; + /* Reset the EHCI controller */ + cmd = readl(&ehci_regs->command); + cmd |= CMD_RESET; + writel(cmd, &ehci_regs->command); + do { + cmd = readl(&ehci_regs->command); + } while ((cmd & CMD_RESET) && (--loop > 0)); + + if (!loop) { + dbgp_printk("can not reset ehci\n"); + return -1; + } + dbgp_printk("ehci reset done\n"); + + /* Claim ownership, but do not enable yet */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_OWNER; + ctrl &= ~(DBGP_ENABLED | DBGP_INUSE); + writel(ctrl, &ehci_debug->control); + + /* Start the ehci running */ + cmd = readl(&ehci_regs->command); + cmd &= ~(CMD_LRESET | CMD_IAAD | CMD_PSE | CMD_ASE | CMD_RESET); + cmd |= CMD_RUN; + writel(cmd, &ehci_regs->command); + + /* Ensure everything is routed to the EHCI */ + writel(FLAG_CF, &ehci_regs->configured_flag); + + /* Wait until the controller is no longer halted */ + loop = 10; + do { + status = readl(&ehci_regs->status); + } while ((status & STS_HALT) && (--loop > 0)); + + if (!loop) { + dbgp_printk("ehci can be started\n"); + return -1; + } + dbgp_printk("ehci started\n"); + + /* Wait for a device to show up in the debug port */ + ret = ehci_wait_for_port(debug_port); + if (ret < 0) { + dbgp_printk("No device found in debug port\n"); + goto next_debug_port; + } + dbgp_printk("ehci wait for port done\n"); + + /* Enable the debug port */ + ctrl = readl(&ehci_debug->control); + ctrl |= DBGP_CLAIM; + writel(ctrl, &ehci_debug->control); + ctrl = readl(&ehci_debug->control); + if ((ctrl & DBGP_CLAIM) != DBGP_CLAIM) { + dbgp_printk("No device in debug port\n"); + writel(ctrl & ~DBGP_CLAIM, &ehci_debug->control); + goto err; + } + dbgp_printk("debug ported enabled\n"); + + /* Completely transfer the debug device to the debug controller */ + portsc = readl(&ehci_regs->port_status[debug_port - 1]); + portsc &= ~PORT_PE; + writel(portsc, &ehci_regs->port_status[debug_port - 1]); + + dbgp_mdelay(100); + + /* Find the debug device and make it device number 127 */ + for (devnum = 0; devnum <= 127; devnum++) { + ret = dbgp_control_msg(devnum, + USB_DIR_IN | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_GET_DESCRIPTOR, (USB_DT_DEBUG << 8), 0, + &dbgp_desc, sizeof(dbgp_desc)); + if (ret > 0) + break; + } + if (devnum > 127) { + dbgp_printk("Could not find attached debug device\n"); + goto err; + } + if (ret < 0) { + dbgp_printk("Attached device is not a debug device\n"); + goto err; + } + dbgp_endpoint_out = dbgp_desc.bDebugOutEndpoint; + + /* Move the device to 127 if it isn't already there */ + if (devnum != USB_DEBUG_DEVNUM) { + ret = dbgp_control_msg(devnum, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_ADDRESS, USB_DEBUG_DEVNUM, 0, NULL, 0); + if (ret < 0) { + dbgp_printk("Could not move attached device to %d\n", + USB_DEBUG_DEVNUM); + goto err; + } + devnum = USB_DEBUG_DEVNUM; + dbgp_printk("debug device renamed to 127\n"); + } + + /* Enable the debug interface */ + ret = dbgp_control_msg(USB_DEBUG_DEVNUM, + USB_DIR_OUT | USB_TYPE_STANDARD | USB_RECIP_DEVICE, + USB_REQ_SET_FEATURE, USB_DEVICE_DEBUG_MODE, 0, NULL, 0); + if (ret < 0) { + dbgp_printk(" Could not enable the debug device\n"); + goto err; + } + dbgp_printk("debug interface enabled\n"); + + /* Perform a small write to get the even/odd data state in sync + */ + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, dbgp_endpoint_out, " ", 1); + if (ret < 0) { + dbgp_printk("dbgp_bulk_write failed: %d\n", ret); + goto err; + } + dbgp_printk("small write doned\n"); + + return 0; +err: + /* Things didn't work so remove my claim */ + ctrl = readl(&ehci_debug->control); + ctrl &= ~(DBGP_CLAIM | DBGP_OUT); + writel(ctrl, &ehci_debug->control); + return -1; + +next_debug_port: + port_map_tried |= (1<<(debug_port - 1)); + new_debug_port = ((debug_port-1+1)%n_ports) + 1; + if (port_map_tried != ((1<> 29) & 0x7; + bar = (bar * 4) + 0xc; + offset = (debug_port >> 16) & 0xfff; + dbgp_printk("bar: %02x offset: %03x\n", bar, offset); + if (bar != PCI_BASE_ADDRESS_0) { + dbgp_printk("only debug ports on bar 1 handled.\n"); + + return -1; + } + + bar_val = read_pci_config(bus, slot, func, PCI_BASE_ADDRESS_0); + dbgp_printk("bar_val: %02x offset: %03x\n", bar_val, offset); + if (bar_val & ~PCI_BASE_ADDRESS_MEM_MASK) { + dbgp_printk("only simple 32bit mmio bars supported\n"); + + return -1; + } + + /* double check if the mem space is enabled */ + byte = read_pci_config_byte(bus, slot, func, 0x04); + if (!(byte & 0x2)) { + byte |= 0x02; + write_pci_config_byte(bus, slot, func, 0x04, byte); + dbgp_printk("mmio for ehci enabled\n"); + } + + /* + * FIXME I don't have the bar size so just guess PAGE_SIZE is more + * than enough. 1K is the biggest I have seen. + */ + set_fixmap_nocache(FIX_DBGP_BASE, bar_val & PAGE_MASK); + ehci_bar = (void __iomem *)__fix_to_virt(FIX_DBGP_BASE); + ehci_bar += bar_val & ~PAGE_MASK; + dbgp_printk("ehci_bar: %p\n", ehci_bar); + + ehci_caps = ehci_bar; + ehci_regs = ehci_bar + HC_LENGTH(readl(&ehci_caps->hc_capbase)); + ehci_debug = ehci_bar + offset; + ehci_dev.bus = bus; + ehci_dev.slot = slot; + ehci_dev.func = func; + + detect_set_debug_port(); + + ret = ehci_setup(); + if (ret < 0) { + dbgp_printk("ehci_setup failed\n"); + ehci_debug = 0; + + return -1; + } + + return 0; +} + +static void early_dbgp_write(struct console *con, const char *str, u32 n) +{ + int chunk, ret; + + if (!ehci_debug) + return; + while (n > 0) { + chunk = n; + if (chunk > DBGP_MAX_PACKET) + chunk = DBGP_MAX_PACKET; + ret = dbgp_bulk_write(USB_DEBUG_DEVNUM, + dbgp_endpoint_out, str, chunk); + str += chunk; + n -= chunk; + } +} + +static struct console early_dbgp_console = { + .name = "earlydbg", + .write = early_dbgp_write, + .flags = CON_PRINTBUFFER, + .index = -1, +}; +#endif + /* Console interface to a host file on AMD's SimNow! */ static int simnow_fd; @@ -165,6 +889,7 @@ enum { static noinline long simnow(long cmd, long a, long b, long c) { long ret; + asm volatile("cpuid" : "=a" (ret) : "b" (a), "c" (b), "d" (c), "0" (MAGIC1), "D" (cmd + MAGIC2)); @@ -174,6 +899,7 @@ static noinline long simnow(long cmd, long a, long b, long c) static void __init simnow_init(char *str) { char *fn = "klog"; + if (*str == '=') fn = ++str; /* error ignored */ @@ -208,10 +934,11 @@ asmlinkage void early_printk(const char *fmt, ...) va_end(ap); } -static int __initdata keep_early; static int __init setup_early_printk(char *buf) { + int keep_early; + if (!buf) return 0; @@ -219,8 +946,7 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - if (strstr(buf, "keep")) - keep_early = 1; + keep_early = (strstr(buf, "keep") != NULL); if (!strncmp(buf, "serial", 6)) { early_serial_init(buf + 6); @@ -238,6 +964,17 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_EARLY_PRINTK_DBGP + } else if (!strncmp(buf, "dbgp", 4)) { + if (early_dbgp_init(buf+4) < 0) + return 0; + early_console = &early_dbgp_console; + /* + * usb subsys will reset ehci controller, so don't keep + * that early console + */ + keep_early = 0; +#endif #ifdef CONFIG_HVC_XEN } else if (!strncmp(buf, "xen", 3)) { early_console = &xenboot_console; @@ -251,4 +988,23 @@ static int __init setup_early_printk(char *buf) register_console(early_console); return 0; } + +void __init enable_debug_console(char *buf) +{ +#ifdef DBGP_DEBUG + struct console *old_early_console = NULL; + + if (early_console_initialized && early_console) { + old_early_console = early_console; + unregister_console(early_console); + early_console_initialized = 0; + } + + setup_early_printk(buf); + + if (early_console == old_early_console && old_early_console) + register_console(old_early_console); +#endif +} + early_param("earlyprintk", setup_early_printk); -- cgit v1.2.2 From a4dbc34d181e87a0d724dee365921e9251f831d4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 02:14:28 -0700 Subject: x86: add setup_ioapic_ids for numaq in x86_quirks Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 5 ++--- arch/x86/kernel/numaq_32.c | 7 +++++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 98e4db5373f3..72ba06314c7b 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -1728,10 +1729,8 @@ static void __init setup_ioapic_ids_from_mpc(void) unsigned char old_id; unsigned long flags; -#ifdef CONFIG_X86_NUMAQ - if (found_numaq) + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) return; -#endif /* * Don't check I/O APIC IDs for xAPIC systems. They have diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index b8c45610b20a..2434467ddf72 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -229,6 +229,12 @@ static void __init smp_read_mpc_oem(struct mp_config_oemtable *oemtable, } } +static int __init numaq_setup_ioapic_ids(void) +{ + /* so can skip it */ + return 1; +} + static struct x86_quirks numaq_x86_quirks __initdata = { .arch_pre_time_init = numaq_pre_time_init, .arch_time_init = NULL, @@ -243,6 +249,7 @@ static struct x86_quirks numaq_x86_quirks __initdata = { .mpc_oem_bus_info = mpc_oem_bus_info, .mpc_oem_pci_bus = mpc_oem_pci_bus, .smp_read_mpc_oem = smp_read_mpc_oem, + .setup_ioapic_ids = numaq_setup_ioapic_ids, }; void numaq_mps_oem_check(struct mp_config_table *mpc, char *oem, -- cgit v1.2.2 From 1176fa919292887aa738d317d61fe2bba4d764f2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 02:17:21 -0700 Subject: x86: mach-bigsmp to bigsmp Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mach-generic/bigsmp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index b31f2800638e..df37fc9d6a26 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -12,10 +12,10 @@ #include #include #include -#include +#include #include -#include -#include +#include +#include #include static int dmi_bigsmp; /* can be set by dmi scanners */ -- cgit v1.2.2 From c7e7964c9828083aeb41c2664cbf5a32acbfa9d1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 02:17:33 -0700 Subject: x86: mach_es7000 to es7000 Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/es7000/Makefile | 5 + arch/x86/es7000/es7000.h | 114 +++++++++++++++++ arch/x86/es7000/es7000plat.c | 262 ++++++++++++++++++++++++++++++++++++++ arch/x86/mach-es7000/Makefile | 5 - arch/x86/mach-es7000/es7000.h | 114 ----------------- arch/x86/mach-es7000/es7000plat.c | 262 -------------------------------------- arch/x86/mach-generic/Makefile | 2 +- arch/x86/mach-generic/es7000.c | 10 +- 8 files changed, 387 insertions(+), 387 deletions(-) create mode 100644 arch/x86/es7000/Makefile create mode 100644 arch/x86/es7000/es7000.h create mode 100644 arch/x86/es7000/es7000plat.c delete mode 100644 arch/x86/mach-es7000/Makefile delete mode 100644 arch/x86/mach-es7000/es7000.h delete mode 100644 arch/x86/mach-es7000/es7000plat.c (limited to 'arch/x86') diff --git a/arch/x86/es7000/Makefile b/arch/x86/es7000/Makefile new file mode 100644 index 000000000000..3ef8b43b62fc --- /dev/null +++ b/arch/x86/es7000/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for the linux kernel. +# + +obj-$(CONFIG_X86_ES7000) := es7000plat.o diff --git a/arch/x86/es7000/es7000.h b/arch/x86/es7000/es7000.h new file mode 100644 index 000000000000..4e62f6fa95b8 --- /dev/null +++ b/arch/x86/es7000/es7000.h @@ -0,0 +1,114 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +/* + * ES7000 chipsets + */ + +#define NON_UNISYS 0 +#define ES7000_CLASSIC 1 +#define ES7000_ZORRO 2 + + +#define MIP_REG 1 +#define MIP_PSAI_REG 4 + +#define MIP_BUSY 1 +#define MIP_SPIN 0xf0000 +#define MIP_VALID 0x0100000000000000ULL +#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) + +#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) + +struct mip_reg_info { + unsigned long long mip_info; + unsigned long long delivery_info; + unsigned long long host_reg; + unsigned long long mip_reg; +}; + +struct part_info { + unsigned char type; + unsigned char length; + unsigned char part_id; + unsigned char apic_mode; + unsigned long snum; + char ptype[16]; + char sname[64]; + char pname[64]; +}; + +struct psai { + unsigned long long entry_type; + unsigned long long addr; + unsigned long long bep_addr; +}; + +struct es7000_mem_info { + unsigned char type; + unsigned char length; + unsigned char resv[6]; + unsigned long long start; + unsigned long long size; +}; + +struct es7000_oem_table { + unsigned long long hdr; + struct mip_reg_info mip; + struct part_info pif; + struct es7000_mem_info shm; + struct psai psai; +}; + +#ifdef CONFIG_ACPI + +struct oem_table { + struct acpi_table_header Header; + u32 OEMTableAddr; + u32 OEMTableSize; +}; + +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +#endif + +struct mip_reg { + unsigned long long off_0; + unsigned long long off_8; + unsigned long long off_10; + unsigned long long off_18; + unsigned long long off_20; + unsigned long long off_28; + unsigned long long off_30; + unsigned long long off_38; +}; + +#define MIP_SW_APIC 0x1020b +#define MIP_FUNC(VALUE) (VALUE & 0xff) + +extern int parse_unisys_oem (char *oemptr); +extern void setup_unisys(void); +extern int es7000_start_cpu(int cpu, unsigned long eip); +extern void es7000_sw_apic(void); diff --git a/arch/x86/es7000/es7000plat.c b/arch/x86/es7000/es7000plat.c new file mode 100644 index 000000000000..7789fde13c3f --- /dev/null +++ b/arch/x86/es7000/es7000plat.c @@ -0,0 +1,262 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "es7000.h" +#include + +/* + * ES7000 Globals + */ + +static volatile unsigned long *psai = NULL; +static struct mip_reg *mip_reg; +static struct mip_reg *host_reg; +static int mip_port; +static unsigned long mip_addr, host_addr; + +int es7000_plat; + +/* + * GSI override for ES7000 platforms. + */ + +static unsigned int base; + +static int +es7000_rename_gsi(int ioapic, int gsi) +{ + if (es7000_plat == ES7000_ZORRO) + return gsi; + + if (!base) { + int i; + for (i = 0; i < nr_ioapics; i++) + base += nr_ioapic_registers[i]; + } + + if (!ioapic && (gsi < 16)) + gsi += base; + return gsi; +} + +void __init +setup_unisys(void) +{ + /* + * Determine the generation of the ES7000 currently running. + * + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = ES7000_ZORRO; + else + es7000_plat = ES7000_CLASSIC; + ioapic_renumber_irq = es7000_rename_gsi; +} + +/* + * Parse the OEM Table + */ + +int __init +parse_unisys_oem (char *oemptr) +{ + int i; + int success = 0; + unsigned char type, size; + unsigned long val; + char *tp = NULL; + struct psai *psaip = NULL; + struct mip_reg_info *mi; + struct mip_reg *host, *mip; + + tp = oemptr; + + tp += 8; + + for (i=0; i <= 6; i++) { + type = *tp++; + size = *tp++; + tp -= 2; + switch (type) { + case MIP_REG: + mi = (struct mip_reg_info *)tp; + val = MIP_RD_LO(mi->host_reg); + host_addr = val; + host = (struct mip_reg *)val; + host_reg = __va(host); + val = MIP_RD_LO(mi->mip_reg); + mip_port = MIP_PORT(mi->mip_info); + mip_addr = val; + mip = (struct mip_reg *)val; + mip_reg = __va(mip); + pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + (unsigned long)host_reg); + pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); + success++; + break; + case MIP_PSAI_REG: + psaip = (struct psai *)tp; + if (tp != NULL) { + if (psaip->addr) + psai = __va(psaip->addr); + else + psai = NULL; + success++; + } + break; + default: + break; + } + tp += size; + } + + if (success < 2) { + es7000_plat = NON_UNISYS; + } else + setup_unisys(); + return es7000_plat; +} + +#ifdef CONFIG_ACPI +int __init +find_unisys_acpi_oem_table(unsigned long *oem_addr) +{ + struct acpi_table_header *header = NULL; + int i = 0; + while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { + if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { + struct oem_table *t = (struct oem_table *)header; + *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, + t->OEMTableSize); + return 0; + } + } + return -1; +} +#endif + +static void +es7000_spin(int n) +{ + int i = 0; + + while (i++ < n) + rep_nop(); +} + +static int __init +es7000_mip_write(struct mip_reg *mip_reg) +{ + int status = 0; + int spin; + + spin = MIP_SPIN; + while (((unsigned long long)host_reg->off_38 & + (unsigned long long)MIP_VALID) != 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); + outb(1, mip_port); + + spin = MIP_SPIN; + + while (((unsigned long long)mip_reg->off_38 & + (unsigned long long)MIP_VALID) == 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + status = ((unsigned long long)mip_reg->off_0 & + (unsigned long long)0xffff0000000000ULL) >> 48; + mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & + (unsigned long long)~MIP_VALID); + return status; +} + +int +es7000_start_cpu(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; + +} + +void __init +es7000_sw_apic(void) +{ + if (es7000_plat) { + int mip_status; + struct mip_reg es7000_mip_reg; + + printk("ES7000: Enabling APIC mode.\n"); + memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); + es7000_mip_reg.off_0 = MIP_SW_APIC; + es7000_mip_reg.off_38 = (MIP_VALID); + while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) + printk("es7000_sw_apic: command failed, status = %x\n", + mip_status); + return; + } +} diff --git a/arch/x86/mach-es7000/Makefile b/arch/x86/mach-es7000/Makefile deleted file mode 100644 index 3ef8b43b62fc..000000000000 --- a/arch/x86/mach-es7000/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-$(CONFIG_X86_ES7000) := es7000plat.o diff --git a/arch/x86/mach-es7000/es7000.h b/arch/x86/mach-es7000/es7000.h deleted file mode 100644 index c8d5aa132fa0..000000000000 --- a/arch/x86/mach-es7000/es7000.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS 0 -#define ES7000_CLASSIC 1 -#define ES7000_ZORRO 2 - - -#define MIP_REG 1 -#define MIP_PSAI_REG 4 - -#define MIP_BUSY 1 -#define MIP_SPIN 0xf0000 -#define MIP_VALID 0x0100000000000000ULL -#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) - -#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) - -struct mip_reg_info { - unsigned long long mip_info; - unsigned long long delivery_info; - unsigned long long host_reg; - unsigned long long mip_reg; -}; - -struct part_info { - unsigned char type; - unsigned char length; - unsigned char part_id; - unsigned char apic_mode; - unsigned long snum; - char ptype[16]; - char sname[64]; - char pname[64]; -}; - -struct psai { - unsigned long long entry_type; - unsigned long long addr; - unsigned long long bep_addr; -}; - -struct es7000_mem_info { - unsigned char type; - unsigned char length; - unsigned char resv[6]; - unsigned long long start; - unsigned long long size; -}; - -struct es7000_oem_table { - unsigned long long hdr; - struct mip_reg_info mip; - struct part_info pif; - struct es7000_mem_info shm; - struct psai psai; -}; - -#ifdef CONFIG_ACPI - -struct oem_table { - struct acpi_table_header Header; - u32 OEMTableAddr; - u32 OEMTableSize; -}; - -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); -#endif - -struct mip_reg { - unsigned long long off_0; - unsigned long long off_8; - unsigned long long off_10; - unsigned long long off_18; - unsigned long long off_20; - unsigned long long off_28; - unsigned long long off_30; - unsigned long long off_38; -}; - -#define MIP_SW_APIC 0x1020b -#define MIP_FUNC(VALUE) (VALUE & 0xff) - -extern int parse_unisys_oem (char *oemptr); -extern void setup_unisys(void); -extern int es7000_start_cpu(int cpu, unsigned long eip); -extern void es7000_sw_apic(void); diff --git a/arch/x86/mach-es7000/es7000plat.c b/arch/x86/mach-es7000/es7000plat.c deleted file mode 100644 index 50189af14b85..000000000000 --- a/arch/x86/mach-es7000/es7000plat.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "es7000.h" -#include - -/* - * ES7000 Globals - */ - -static volatile unsigned long *psai = NULL; -static struct mip_reg *mip_reg; -static struct mip_reg *host_reg; -static int mip_port; -static unsigned long mip_addr, host_addr; - -int es7000_plat; - -/* - * GSI override for ES7000 platforms. - */ - -static unsigned int base; - -static int -es7000_rename_gsi(int ioapic, int gsi) -{ - if (es7000_plat == ES7000_ZORRO) - return gsi; - - if (!base) { - int i; - for (i = 0; i < nr_ioapics; i++) - base += nr_ioapic_registers[i]; - } - - if (!ioapic && (gsi < 16)) - gsi += base; - return gsi; -} - -void __init -setup_unisys(void) -{ - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = ES7000_ZORRO; - else - es7000_plat = ES7000_CLASSIC; - ioapic_renumber_irq = es7000_rename_gsi; -} - -/* - * Parse the OEM Table - */ - -int __init -parse_unisys_oem (char *oemptr) -{ - int i; - int success = 0; - unsigned char type, size; - unsigned long val; - char *tp = NULL; - struct psai *psaip = NULL; - struct mip_reg_info *mi; - struct mip_reg *host, *mip; - - tp = oemptr; - - tp += 8; - - for (i=0; i <= 6; i++) { - type = *tp++; - size = *tp++; - tp -= 2; - switch (type) { - case MIP_REG: - mi = (struct mip_reg_info *)tp; - val = MIP_RD_LO(mi->host_reg); - host_addr = val; - host = (struct mip_reg *)val; - host_reg = __va(host); - val = MIP_RD_LO(mi->mip_reg); - mip_port = MIP_PORT(mi->mip_info); - mip_addr = val; - mip = (struct mip_reg *)val; - mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", - (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", - (unsigned long)mip_reg); - success++; - break; - case MIP_PSAI_REG: - psaip = (struct psai *)tp; - if (tp != NULL) { - if (psaip->addr) - psai = __va(psaip->addr); - else - psai = NULL; - success++; - } - break; - default: - break; - } - tp += size; - } - - if (success < 2) { - es7000_plat = NON_UNISYS; - } else - setup_unisys(); - return es7000_plat; -} - -#ifdef CONFIG_ACPI -int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ - struct acpi_table_header *header = NULL; - int i = 0; - while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { - if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { - struct oem_table *t = (struct oem_table *)header; - *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, - t->OEMTableSize); - return 0; - } - } - return -1; -} -#endif - -static void -es7000_spin(int n) -{ - int i = 0; - - while (i++ < n) - rep_nop(); -} - -static int __init -es7000_mip_write(struct mip_reg *mip_reg) -{ - int status = 0; - int spin; - - spin = MIP_SPIN; - while (((unsigned long long)host_reg->off_38 & - (unsigned long long)MIP_VALID) != 0) { - if (--spin <= 0) { - printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); - outb(1, mip_port); - - spin = MIP_SPIN; - - while (((unsigned long long)mip_reg->off_38 & - (unsigned long long)MIP_VALID) == 0) { - if (--spin <= 0) { - printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - status = ((unsigned long long)mip_reg->off_0 & - (unsigned long long)0xffff0000000000ULL) >> 48; - mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & - (unsigned long long)~MIP_VALID); - return status; -} - -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - -void __init -es7000_sw_apic(void) -{ - if (es7000_plat) { - int mip_status; - struct mip_reg es7000_mip_reg; - - printk("ES7000: Enabling APIC mode.\n"); - memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); - es7000_mip_reg.off_0 = MIP_SW_APIC; - es7000_mip_reg.off_38 = (MIP_VALID); - while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) - printk("es7000_sw_apic: command failed, status = %x\n", - mip_status); - return; - } -} diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile index 0dbd7803a1d5..4706de7676b1 100644 --- a/arch/x86/mach-generic/Makefile +++ b/arch/x86/mach-generic/Makefile @@ -9,4 +9,4 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_X86_BIGSMP) += bigsmp.o obj-$(CONFIG_X86_ES7000) += es7000.o -obj-$(CONFIG_X86_ES7000) += ../../x86/mach-es7000/ +obj-$(CONFIG_X86_ES7000) += ../../x86/es7000/ diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 9b30547d746e..520cca0ee04e 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -11,12 +11,12 @@ #include #include #include -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include static int probe_es7000(void) { -- cgit v1.2.2 From e8c48efdb971cfb1b043076cf68be535d461a20b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 02:17:44 -0700 Subject: x86: mach_summit to summit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/summit_32.c | 2 +- arch/x86/mach-generic/summit.c | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/summit_32.c b/arch/x86/kernel/summit_32.c index d67ce5f044ba..7b987852e876 100644 --- a/arch/x86/kernel/summit_32.c +++ b/arch/x86/kernel/summit_32.c @@ -30,7 +30,7 @@ #include #include #include -#include +#include static struct rio_table_hdr *rio_table_hdr __initdata; static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 752edd96b1bf..6ad6b67a723d 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -11,11 +11,11 @@ #include #include #include -#include +#include #include -#include -#include -#include +#include +#include +#include static int probe_summit(void) { -- cgit v1.2.2 From edb181ac4b0c7c1240503da46349a3fb69af9ef6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 02:17:55 -0700 Subject: x86: mach-numaq to numaq Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mach-generic/numaq.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 95c07efff6b7..8cf58394975e 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -11,12 +11,12 @@ #include #include #include -#include +#include #include -#include -#include -#include -#include +#include +#include +#include +#include #include static int mps_oem_check(struct mp_config_table *mpc, char *oem, -- cgit v1.2.2 From 39eacc20f93614f7bab63eb1d45060503afc46d0 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Fri, 25 Jul 2008 23:30:13 +0800 Subject: arch/x86/kernel/visws_quirks.c: Removed duplicated #include Removed duplicated #include in arch/x86/kernel/visws_quirks.c. asm/apic.h asm/arch_hooks.h asm/io.h asm/visws/cobalt.h asm/visws/lithium.h asm/visws/piix4.h linux/init.h linux/interrupt.h linux/smp.h Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/visws_quirks.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 41e01b145c48..0c75691bcf5f 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -29,41 +29,26 @@ #include #include #include -#include #include #include #include "mach_apic.h" -#include -#include - #include -#include -#include -#include -#include #include #include -#include #include -#include #include #include -#include #include #include extern int no_broadcast; -#include #include -#include -#include -#include char visws_board_type = -1; char visws_board_rev = -1; -- cgit v1.2.2 From 6ffac1e90a17ea0aded5c581204397421eec91b6 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 24 Jul 2008 18:07:56 -0700 Subject: x64, fpu: fix possible FPU leakage in error conditions On Thu, Jul 24, 2008 at 03:43:44PM -0700, Linus Torvalds wrote: > So how about this patch as a starting point? This is the RightThing(tm) to > do regardless, and if it then makes it easier to do some other cleanups, > we should do it first. What do you think? restore_fpu_checking() calls init_fpu() in error conditions. While this is wrong(as our main intention is to clear the fpu state of the thread), this was benign before commit 92d140e21f1 ("x86: fix taking DNA during 64bit sigreturn"). Post commit 92d140e21f1, live FPU registers may not belong to this process at this error scenario. In the error condition for restore_fpu_checking() (especially during the 64bit signal return), we are doing init_fpu(), which saves the live FPU register state (possibly belonging to some other process context) into the thread struct (through unlazy_fpu() in init_fpu()). This is wrong and can leak the FPU data. For the signal handler restore error condition in restore_i387(), clear the fpu state present in the thread struct(before ultimately sending a SIGSEGV for badframe). For the paranoid error condition check in math_state_restore(), send a SIGSEGV, if we fail to restore the state. Signed-off-by: Suresh Siddha Cc: Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 11 ++++++++++- arch/x86/kernel/traps_64.c | 9 ++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index b45ef8ddd651..ca316b5b742c 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -104,7 +104,16 @@ static inline int restore_i387(struct _fpstate __user *buf) clts(); task_thread_info(current)->status |= TS_USEDFPU; } - return restore_fpu_checking((__force struct i387_fxsave_struct *)buf); + err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf); + if (unlikely(err)) { + /* + * Encountered an error while doing the restore from the + * user buffer, clear the fpu state. + */ + clear_fpu(tsk); + clear_used_math(); + } + return err; } /* diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 3f18d73f420c..513caaca7115 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -1131,7 +1131,14 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ - restore_fpu_checking(&me->thread.xstate->fxsave); + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { + stts(); + force_sig(SIGSEGV, me); + return; + } task_thread_info(me)->status |= TS_USEDFPU; me->fpu_counter++; } -- cgit v1.2.2 From 6524d938b3360504b43a1278b5a8403e85383d1a Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:30 -0700 Subject: cpumask: put cpumask_of_cpu_map in the initdata section * Create the cpumask_of_cpu_map statically in the init data section using NR_CPUS but replace it during boot up with one sized by nr_cpu_ids (num possible cpus). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c006..1cd53dfcd309 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -81,10 +81,12 @@ static void __init setup_per_cpu_maps(void) } #ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -cpumask_t *cpumask_of_cpu_map __read_mostly; -EXPORT_SYMBOL(cpumask_of_cpu_map); - -/* requires nr_cpu_ids to be initialized */ +/* + * Replace static cpumask_of_cpu_map in the initdata section, + * with one that's allocated sized by the possible number of cpus. + * + * (requires nr_cpu_ids to be initialized) + */ static void __init setup_cpumask_of_cpu(void) { int i; -- cgit v1.2.2 From 0bc3cc03fa6e1c20aecb5a33356bcaae410640b9 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Thu, 24 Jul 2008 18:21:31 -0700 Subject: cpumask: change cpumask_of_cpu_ptr to use new cpumask_of_cpu * Replace previous instances of the cpumask_of_cpu_ptr* macros with a the new (lvalue capable) generic cpumask_of_cpu(). Signed-off-by: Mike Travis Cc: Andrew Morton Cc: Jack Steiner Cc: Rusty Russell Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/cstate.c | 3 +-- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 10 +++------- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 15 +++++---------- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 12 ++++-------- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3 +-- arch/x86/kernel/cpu/intel_cacheinfo.c | 3 +-- arch/x86/kernel/ldt.c | 6 ++---- arch/x86/kernel/microcode.c | 17 +++++------------ arch/x86/kernel/reboot.c | 11 +++-------- 9 files changed, 25 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 9220cf46aa10..c2502eb9aa83 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -73,7 +73,6 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, struct cpuinfo_x86 *c = &cpu_data(cpu); cpumask_t saved_mask; - cpumask_of_cpu_ptr(new_mask, cpu); int retval; unsigned int eax, ebx, ecx, edx; unsigned int edx_part; @@ -92,7 +91,7 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, /* Make sure we are running on right CPU */ saved_mask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, new_mask); + retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (retval) return -1; diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index ff2fff56f0a8..dd097b835839 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -200,12 +200,10 @@ static void drv_read(struct drv_cmd *cmd) static void drv_write(struct drv_cmd *cmd) { cpumask_t saved_mask = current->cpus_allowed; - cpumask_of_cpu_ptr_declare(cpu_mask); unsigned int i; for_each_cpu_mask_nr(i, cmd->mask) { - cpumask_of_cpu_ptr_next(cpu_mask, i); - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(i)); do_drv_write(cmd); } @@ -269,12 +267,11 @@ static unsigned int get_measured_perf(unsigned int cpu) } aperf_cur, mperf_cur; cpumask_t saved_mask; - cpumask_of_cpu_ptr(cpu_mask, cpu); unsigned int perf_percent; unsigned int retval; saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (get_cpu() != cpu) { /* We were not able to run on requested processor */ put_cpu(); @@ -340,7 +337,6 @@ static unsigned int get_measured_perf(unsigned int cpu) static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { - cpumask_of_cpu_ptr(cpu_mask, cpu); struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); unsigned int freq; unsigned int cached_freq; @@ -353,7 +349,7 @@ static unsigned int get_cur_freq_on_cpu(unsigned int cpu) } cached_freq = data->freq_table[data->acpi_data->state].frequency; - freq = extract_freq(get_cur_val(cpu_mask), data); + freq = extract_freq(get_cur_val(&cpumask_of_cpu(cpu)), data); if (freq != cached_freq) { /* * The dreaded BIOS frequency change behind our back. diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 53c7b6936973..c45ca6d4dce1 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -479,12 +479,11 @@ static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvi static int check_supported_cpu(unsigned int cpu) { cpumask_t oldmask; - cpumask_of_cpu_ptr(cpu_mask, cpu); u32 eax, ebx, ecx, edx; unsigned int rc = 0; oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", cpu); @@ -1017,7 +1016,6 @@ static int transition_frequency_pstate(struct powernow_k8_data *data, unsigned i static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsigned relation) { cpumask_t oldmask; - cpumask_of_cpu_ptr(cpu_mask, pol->cpu); struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); u32 checkfid; u32 checkvid; @@ -1032,7 +1030,7 @@ static int powernowk8_target(struct cpufreq_policy *pol, unsigned targfreq, unsi /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - set_cpus_allowed_ptr(current, cpu_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1107,7 +1105,6 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; cpumask_t oldmask; - cpumask_of_cpu_ptr_declare(newmask); int rc; if (!cpu_online(pol->cpu)) @@ -1159,8 +1156,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* only run on specific CPU from here on */ oldmask = current->cpus_allowed; - cpumask_of_cpu_ptr_next(newmask, pol->cpu); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); if (smp_processor_id() != pol->cpu) { printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); @@ -1182,7 +1178,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) set_cpus_allowed_ptr(current, &oldmask); if (cpu_family == CPU_HW_PSTATE) - pol->cpus = *newmask; + pol->cpus = cpumask_of_cpu(pol->cpu); else pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); @@ -1248,7 +1244,6 @@ static unsigned int powernowk8_get (unsigned int cpu) { struct powernow_k8_data *data; cpumask_t oldmask = current->cpus_allowed; - cpumask_of_cpu_ptr(newmask, cpu); unsigned int khz = 0; unsigned int first; @@ -1258,7 +1253,7 @@ static unsigned int powernowk8_get (unsigned int cpu) if (!data) return -EINVAL; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) { printk(KERN_ERR PFX "limiting to CPU %d failed in powernowk8_get\n", cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index ca2ac13b7af2..15e13c01cc36 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -324,10 +324,9 @@ static unsigned int get_cur_freq(unsigned int cpu) unsigned l, h; unsigned clock_freq; cpumask_t saved_mask; - cpumask_of_cpu_ptr(new_mask, cpu); saved_mask = current->cpus_allowed; - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (smp_processor_id() != cpu) return 0; @@ -585,15 +584,12 @@ static int centrino_target (struct cpufreq_policy *policy, * Best effort undo.. */ - if (!cpus_empty(*covered_cpus)) { - cpumask_of_cpu_ptr_declare(new_mask); - + if (!cpus_empty(*covered_cpus)) for_each_cpu_mask_nr(j, *covered_cpus) { - cpumask_of_cpu_ptr_next(new_mask, j); - set_cpus_allowed_ptr(current, new_mask); + set_cpus_allowed_ptr(current, + &cpumask_of_cpu(j)); wrmsr(MSR_IA32_PERF_CTL, oldmsr, h); } - } tmp = freqs.new; freqs.new = freqs.old; diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 2f3728dc24f6..191f7263c61d 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -244,8 +244,7 @@ static unsigned int _speedstep_get(const cpumask_t *cpus) static unsigned int speedstep_get(unsigned int cpu) { - cpumask_of_cpu_ptr(newmask, cpu); - return _speedstep_get(newmask); + return _speedstep_get(&cpumask_of_cpu(cpu)); } /** diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 650d40f7912b..6b0a10b002f1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -516,7 +516,6 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) unsigned long j; int retval; cpumask_t oldmask; - cpumask_of_cpu_ptr(newmask, cpu); if (num_cache_leaves == 0) return -ENOENT; @@ -527,7 +526,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) return -ENOMEM; oldmask = current->cpus_allowed; - retval = set_cpus_allowed_ptr(current, newmask); + retval = set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); if (retval) goto out; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 3fee2aa50f3f..b68e21f06f4f 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -62,12 +62,10 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload) if (reload) { #ifdef CONFIG_SMP - cpumask_of_cpu_ptr_declare(mask); - preempt_disable(); load_LDT(pc); - cpumask_of_cpu_ptr_next(mask, smp_processor_id()); - if (!cpus_equal(current->mm->cpu_vm_mask, *mask)) + if (!cpus_equal(current->mm->cpu_vm_mask, + cpumask_of_cpu(smp_processor_id()))) smp_call_function(flush_ldt, current->mm, 1); preempt_enable(); #else diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 6994c751590e..652fa5c38ebe 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -388,7 +388,6 @@ static int do_microcode_update (void) void *new_mc = NULL; int cpu; cpumask_t old; - cpumask_of_cpu_ptr_declare(newmask); old = current->cpus_allowed; @@ -405,8 +404,7 @@ static int do_microcode_update (void) if (!uci->valid) continue; - cpumask_of_cpu_ptr_next(newmask, cpu); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); error = get_maching_microcode(new_mc, cpu); if (error < 0) goto out; @@ -576,7 +574,6 @@ static int apply_microcode_check_cpu(int cpu) struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); unsigned int val[2]; int err = 0; @@ -585,7 +582,7 @@ static int apply_microcode_check_cpu(int cpu) return 0; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); /* Check if the microcode we have in memory matches the CPU */ if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -623,12 +620,11 @@ static int apply_microcode_check_cpu(int cpu) static void microcode_init_cpu(int cpu, int resume) { cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; old = current->cpus_allowed; - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) @@ -661,13 +657,10 @@ static ssize_t reload_store(struct sys_device *dev, if (end == buf) return -EINVAL; if (val == 1) { - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - - old = current->cpus_allowed; + cpumask_t old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); if (uci->valid) diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 06a9f643817e..724adfc63cb9 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -414,25 +414,20 @@ void native_machine_shutdown(void) /* The boot cpu is always logical cpu 0 */ int reboot_cpu_id = 0; - cpumask_of_cpu_ptr(newmask, reboot_cpu_id); #ifdef CONFIG_X86_32 /* See if there has been given a command line override */ if ((reboot_cpu != -1) && (reboot_cpu < NR_CPUS) && - cpu_online(reboot_cpu)) { + cpu_online(reboot_cpu)) reboot_cpu_id = reboot_cpu; - cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); - } #endif /* Make certain the cpu I'm about to reboot on is online */ - if (!cpu_online(reboot_cpu_id)) { + if (!cpu_online(reboot_cpu_id)) reboot_cpu_id = smp_processor_id(); - cpumask_of_cpu_ptr_next(newmask, reboot_cpu_id); - } /* Make certain I only run on the appropriate processor */ - set_cpus_allowed_ptr(current, newmask); + set_cpus_allowed_ptr(current, &cpumask_of_cpu(reboot_cpu_id)); /* O.K Now that I'm on the appropriate processor, * stop all of the others. -- cgit v1.2.2 From 9749986a878e91182ff027ff0010ab8e3211031a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 17:28:11 +0200 Subject: x86: usb debug port early console, fix fix: arch/x86/kernel/built-in.o: In function `nvidia_set_debug_port': early_printk.c:(.text+0xf8b1): undefined reference to `read_pci_config' early_printk.c:(.text+0xf8dc): undefined reference to `write_pci_config' arch/x86/kernel/built-in.o: In function `setup_early_printk': early_printk.c:(.init.text+0x5487): undefined reference to `early_pci_allowed' early_printk.c:(.init.text+0x54cb): undefined reference to `read_pci_config' early_printk.c:(.init.text+0x54ec): undefined reference to `read_pci_config_16' [...] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 93422d2ecf61..2a3dfbd5e677 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -46,7 +46,7 @@ config EARLY_PRINTK config EARLY_PRINTK_DBGP bool "Early printk via EHCI debug port" default n - depends on EARLY_PRINTK + depends on EARLY_PRINTK && PCI help Write kernel log output directly into the EHCI debug port. -- cgit v1.2.2 From b56afe1d41653fb07ab1b5af5ccc12001c4dd5a0 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 06219e60e9c8..e2767c28dac7 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1347,7 +1347,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.2.2 From 3964cd3a6721f18ef1dd67b9a0a89dc5b36683b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 26 Jul 2008 19:35:20 +0200 Subject: x86: visws_quirks, fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/visws_quirks.c: In function ‘visws_early_detect’: arch/x86/kernel/visws_quirks.c:290: error: ‘skip_ioapic_setup’ undeclared (first use in this function) arch/x86/kernel/visws_quirks.c:290: error: (Each undeclared identifier is reported only once arch/x86/kernel/visws_quirks.c:290: error: for each function it appears in.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/visws_quirks.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 0c75691bcf5f..3059eb45a915 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.2 From 8d8bb39b9eba32dd70e87fd5ad5c5dd4ba118e06 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 25 Jul 2008 19:44:49 -0700 Subject: dma-mapping: add the device argument to dma_mapping_error() Add per-device dma_mapping_ops support for CONFIG_X86_64 as POWER architecture does: This enables us to cleanly fix the Calgary IOMMU issue that some devices are not behind the IOMMU (http://lkml.org/lkml/2008/5/8/423). I think that per-device dma_mapping_ops support would be also helpful for KVM people to support PCI passthrough but Andi thinks that this makes it difficult to support the PCI passthrough (see the above thread). So I CC'ed this to KVM camp. Comments are appreciated. A pointer to dma_mapping_ops to struct dev_archdata is added. If the pointer is non NULL, DMA operations in asm/dma-mapping.h use it. If it's NULL, the system-wide dma_ops pointer is used as before. If it's useful for KVM people, I plan to implement a mechanism to register a hook called when a new pci (or dma capable) device is created (it works with hot plugging). It enables IOMMUs to set up an appropriate dma_mapping_ops per device. The major obstacle is that dma_mapping_error doesn't take a pointer to the device unlike other DMA operations. So x86 can't have dma_mapping_ops per device. Note all the POWER IOMMUs use the same dma_mapping_error function so this is not a problem for POWER but x86 IOMMUs use different dma_mapping_error functions. The first patch adds the device argument to dma_mapping_error. The patch is trivial but large since it touches lots of drivers and dma-mapping.h in all the architecture. This patch: dma_mapping_error() doesn't take a pointer to the device unlike other DMA operations. So we can't have dma_mapping_ops per device. Note that POWER already has dma_mapping_ops per device but all the POWER IOMMUs use the same dma_mapping_error function. x86 IOMMUs use device argument. [akpm@linux-foundation.org: fix sge] [akpm@linux-foundation.org: fix svc_rdma] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix bnx2x] [akpm@linux-foundation.org: fix s2io] [akpm@linux-foundation.org: fix pasemi_mac] [akpm@linux-foundation.org: fix sdhci] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: fix sparc] [akpm@linux-foundation.org: fix ibmvscsi] Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-dma.c | 27 ++++++++++++++++----------- arch/x86/kernel/pci-gart_64.c | 3 +-- arch/x86/kernel/pci-nommu.c | 14 +------------- arch/x86/kernel/pci-swiotlb_64.c | 2 +- 5 files changed, 20 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 19e7fc7c2c4f..1eb86be93d7a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -544,7 +544,7 @@ error: return ret; } -static const struct dma_mapping_ops calgary_dma_ops = { +static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index cbecb05551bb..37544123896d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -11,7 +11,7 @@ static int forbid_dac __read_mostly; -const struct dma_mapping_ops *dma_ops; +struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; @@ -312,6 +312,8 @@ static int dma_release_coherent(struct device *dev, int order, void *vaddr) int dma_supported(struct device *dev, u64 mask) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + #ifdef CONFIG_PCI if (mask > 0xffffffff && forbid_dac > 0) { dev_info(dev, "PCI: Disallowing DAC for device\n"); @@ -319,8 +321,8 @@ int dma_supported(struct device *dev, u64 mask) } #endif - if (dma_ops->dma_supported) - return dma_ops->dma_supported(dev, mask); + if (ops->dma_supported) + return ops->dma_supported(dev, mask); /* Copied from i386. Doesn't make much sense, because it will only work for pci_alloc_coherent. @@ -367,6 +369,7 @@ void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { + struct dma_mapping_ops *ops = get_dma_ops(dev); void *memory = NULL; struct page *page; unsigned long dma_mask = 0; @@ -435,8 +438,8 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, /* Let low level make its own zone decisions */ gfp &= ~(GFP_DMA32|GFP_DMA); - if (dma_ops->alloc_coherent) - return dma_ops->alloc_coherent(dev, size, + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, dma_handle, gfp); return NULL; } @@ -448,14 +451,14 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, } } - if (dma_ops->alloc_coherent) { + if (ops->alloc_coherent) { free_pages((unsigned long)memory, get_order(size)); gfp &= ~(GFP_DMA|GFP_DMA32); - return dma_ops->alloc_coherent(dev, size, dma_handle, gfp); + return ops->alloc_coherent(dev, size, dma_handle, gfp); } - if (dma_ops->map_simple) { - *dma_handle = dma_ops->map_simple(dev, virt_to_phys(memory), + if (ops->map_simple) { + *dma_handle = ops->map_simple(dev, virt_to_phys(memory), size, PCI_DMA_BIDIRECTIONAL); if (*dma_handle != bad_dma_address) @@ -477,12 +480,14 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t bus) { + struct dma_mapping_ops *ops = get_dma_ops(dev); + int order = get_order(size); WARN_ON(irqs_disabled()); /* for portability */ if (dma_release_coherent(dev, order, vaddr)) return; - if (dma_ops->unmap_single) - dma_ops->unmap_single(dev, bus, size, 0); + if (ops->unmap_single) + ops->unmap_single(dev, bus, size, 0); free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL(dma_free_coherent); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index df5f142657d2..744126e64950 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -692,8 +692,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info) extern int agp_amd64_init(void); -static const struct dma_mapping_ops gart_dma_ops = { - .mapping_error = NULL, +static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 792b9179eff3..3f91f71cdc3e 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,21 +72,9 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -/* Make sure we keep the same behaviour */ -static int nommu_mapping_error(dma_addr_t dma_addr) -{ -#ifdef CONFIG_X86_32 - return 0; -#else - return (dma_addr == bad_dma_address); -#endif -} - - -const struct dma_mapping_ops nommu_dma_ops = { +struct dma_mapping_ops nommu_dma_ops = { .map_single = nommu_map_single, .map_sg = nommu_map_sg, - .mapping_error = nommu_mapping_error, .is_phys = 1, }; diff --git a/arch/x86/kernel/pci-swiotlb_64.c b/arch/x86/kernel/pci-swiotlb_64.c index 20df839b9c20..c4ce0332759e 100644 --- a/arch/x86/kernel/pci-swiotlb_64.c +++ b/arch/x86/kernel/pci-swiotlb_64.c @@ -18,7 +18,7 @@ swiotlb_map_single_phys(struct device *hwdev, phys_addr_t paddr, size_t size, return swiotlb_map_single(hwdev, phys_to_virt(paddr), size, direction); } -const struct dma_mapping_ops swiotlb_dma_ops = { +struct dma_mapping_ops swiotlb_dma_ops = { .mapping_error = swiotlb_dma_mapping_error, .alloc_coherent = swiotlb_alloc_coherent, .free_coherent = swiotlb_free_coherent, -- cgit v1.2.2 From 1956a96de488feb05e95c08c9d5e80f63a4be2b1 Mon Sep 17 00:00:00 2001 From: Alexis Bruemmer Date: Fri, 25 Jul 2008 19:44:51 -0700 Subject: x86 calgary: fix handling of devices that aren't behind the Calgary The calgary code can give drivers addresses above 4GB which is very bad for hardware that is only 32bit DMA addressable. With this patch, the calgary code sets the global dma_ops to swiotlb or nommu properly, and the dma_ops of devices behind the Calgary/CalIOC2 to calgary_dma_ops. So the calgary code can handle devices safely that aren't behind the Calgary/CalIOC2. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Alexis Bruemmer Signed-off-by: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 71 +++++++++++++++------------------------- 1 file changed, 26 insertions(+), 45 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 1eb86be93d7a..b67a4b1d4eae 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -37,6 +37,7 @@ #include #include #include + #include #include #include @@ -413,22 +414,6 @@ static void calgary_unmap_sg(struct device *dev, } } -static int calgary_nontranslate_map_sg(struct device* dev, - struct scatterlist *sg, int nelems, int direction) -{ - struct scatterlist *s; - int i; - - for_each_sg(sg, s, nelems, i) { - struct page *p = sg_page(s); - - BUG_ON(!p); - s->dma_address = virt_to_bus(sg_virt(s)); - s->dma_length = s->length; - } - return nelems; -} - static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { @@ -439,9 +424,6 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, unsigned long entry; int i; - if (!translation_enabled(tbl)) - return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - for_each_sg(sg, s, nelems, i) { BUG_ON(!sg_page(s)); @@ -477,7 +459,6 @@ error: static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, size_t size, int direction) { - dma_addr_t dma_handle = bad_dma_address; void *vaddr = phys_to_virt(paddr); unsigned long uaddr; unsigned int npages; @@ -486,12 +467,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); - if (translation_enabled(tbl)) - dma_handle = iommu_alloc(dev, tbl, vaddr, npages, direction); - else - dma_handle = virt_to_bus(vaddr); - - return dma_handle; + return iommu_alloc(dev, tbl, vaddr, npages, direction); } static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, @@ -500,9 +476,6 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - if (!translation_enabled(tbl)) - return; - npages = num_dma_pages(dma_handle, size); iommu_free(tbl, dma_handle, npages); } @@ -525,18 +498,12 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, goto error; memset(ret, 0, size); - if (translation_enabled(tbl)) { - /* set up tces to cover the allocated range */ - mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); - if (mapping == bad_dma_address) - goto free; - - *dma_handle = mapping; - } else /* non translated slot */ - *dma_handle = virt_to_bus(ret); - + /* set up tces to cover the allocated range */ + mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); + if (mapping == bad_dma_address) + goto free; + *dma_handle = mapping; return ret; - free: free_pages((unsigned long)ret, get_order(size)); ret = NULL; @@ -1241,6 +1208,16 @@ static int __init calgary_init(void) goto error; } while (1); + dev = NULL; + for_each_pci_dev(dev) { + struct iommu_table *tbl; + + tbl = find_iommu_table(&dev->dev); + + if (translation_enabled(tbl)) + dev->dev.archdata.dma_ops = &calgary_dma_ops; + } + return ret; error: @@ -1262,6 +1239,7 @@ error: calgary_disable_translation(dev); calgary_free_bus(dev); pci_dev_put(dev); /* Undo calgary_init_one()'s pci_dev_get() */ + dev->dev.archdata.dma_ops = NULL; } while (1); return ret; @@ -1503,6 +1481,10 @@ void __init detect_calgary(void) printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, debugging ? "enabled" : "disabled"); + + /* swiotlb for devices that aren't behind the Calgary. */ + if (max_pfn > MAX_DMA32_PFN) + swiotlb = 1; } return; @@ -1519,7 +1501,7 @@ int __init calgary_iommu_init(void) { int ret; - if (no_iommu || swiotlb) + if (no_iommu || (swiotlb && !calgary_detected)) return -ENODEV; if (!calgary_detected) @@ -1532,15 +1514,14 @@ int __init calgary_iommu_init(void) if (ret) { printk(KERN_ERR "PCI-DMA: Calgary init failed %d, " "falling back to no_iommu\n", ret); - if (max_pfn > MAX_DMA32_PFN) - printk(KERN_ERR "WARNING more than 4GB of memory, " - "32bit PCI may malfunction.\n"); return ret; } force_iommu = 1; bad_dma_address = 0x0; - dma_ops = &calgary_dma_ops; + /* dma_ops is set to swiotlb or nommu */ + if (!dma_ops) + dma_ops = &nommu_dma_ops; return 0; } -- cgit v1.2.2 From 3ab83521378268044a448113c6aa9a9e245f4d2f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:07 -0700 Subject: kexec jump This patch provides an enhancement to kexec/kdump. It implements the following features: - Backup/restore memory used by the original kernel before/after kexec. - Save/restore CPU state before/after kexec. The features of this patch can be used as a general method to call program in physical mode (paging turning off). This can be used to call BIOS code under Linux. kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 Usage example of calling some physical mode code and return: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_KEXEC=y CONFIG_PM=y CONFIG_KEXEC_JUMP=y 2. Build patched kexec-tool or download the pre-built one. 3. Build some physical mode executable named such as "phy_mode" 4. Boot kernel compiled in step 1. 5. Load physical mode executable with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context --args-none phy_mode 6. Call physical mode executable with following shell command line: /sbin/kexec -e Implementation point: To support jumping without reserving memory. One shadow backup page (source page) is allocated for each page used by kexeced code image (destination page). When do kexec_load, the image of kexeced code is loaded into source pages, and before executing, the destination pages and the source pages are swapped, so the contents of destination pages are backupped. Before jumping to the kexeced code image and after jumping back to the original kernel, the destination pages and the source pages are swapped too. C ABI (calling convention) is used as communication protocol between kernel and called code. A flag named KEXEC_PRESERVE_CONTEXT for sys_kexec_load is added to indicate that the loaded kernel image is used for jumping back. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 7 ++ arch/x86/kernel/machine_kexec_32.c | 27 ++++-- arch/x86/kernel/machine_kexec_64.c | 2 +- arch/x86/kernel/relocate_kernel_32.S | 174 ++++++++++++++++++++++++++++++----- 4 files changed, 179 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e3cba0b45600..7ecb679f0130 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1279,6 +1279,13 @@ config CRASH_DUMP (CONFIG_RELOCATABLE=y). For more details see Documentation/kdump/kdump.txt +config KEXEC_JUMP + bool "kexec jump (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on KEXEC && PM_SLEEP && X86_32 + help + Invoke code in physical address mode via KEXEC + config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) default "0x1000000" if X86_NUMAQ diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 8864230d55af..2b67609d0a1c 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -22,6 +22,7 @@ #include #include #include +#include #define PAGE_ALIGNED __attribute__ ((__aligned__(PAGE_SIZE))) static u32 kexec_pgd[1024] PAGE_ALIGNED; @@ -85,10 +86,12 @@ static void load_segments(void) * reboot code buffer to allow us to avoid allocations * later. * - * Currently nothing. + * Make control page executable. */ int machine_kexec_prepare(struct kimage *image) { + if (nx_enabled) + set_pages_x(image->control_code_page, 1); return 0; } @@ -98,16 +101,24 @@ int machine_kexec_prepare(struct kimage *image) */ void machine_kexec_cleanup(struct kimage *image) { + if (nx_enabled) + set_pages_nx(image->control_code_page, 1); } /* * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + asmlinkage unsigned long + (*relocate_kernel_ptr)(unsigned long indirection_page, + unsigned long control_page, + unsigned long start_address, + unsigned int has_pae, + unsigned int preserve_context); tracer_disable(); @@ -115,10 +126,11 @@ NORET_TYPE void machine_kexec(struct kimage *image) local_irq_disable(); control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE); + memcpy(control_page, relocate_kernel, PAGE_SIZE/2); + relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); - page_list[VA_CONTROL_PAGE] = (unsigned long)relocate_kernel; + page_list[VA_CONTROL_PAGE] = (unsigned long)control_page; page_list[PA_PGD] = __pa(kexec_pgd); page_list[VA_PGD] = (unsigned long)kexec_pgd; #ifdef CONFIG_X86_PAE @@ -131,6 +143,7 @@ NORET_TYPE void machine_kexec(struct kimage *image) page_list[VA_PTE_0] = (unsigned long)kexec_pte0; page_list[PA_PTE_1] = __pa(kexec_pte1); page_list[VA_PTE_1] = (unsigned long)kexec_pte1; + page_list[PA_SWAP_PAGE] = (page_to_pfn(image->swap_page) << PAGE_SHIFT); /* The segment registers are funny things, they have both a * visible and an invisible part. Whenever the visible part is @@ -149,8 +162,10 @@ NORET_TYPE void machine_kexec(struct kimage *image) set_idt(phys_to_virt(0),0); /* now call it */ - relocate_kernel((unsigned long)image->head, (unsigned long)page_list, - image->start, cpu_has_pae); + image->start = relocate_kernel_ptr((unsigned long)image->head, + (unsigned long)page_list, + image->start, cpu_has_pae, + image->preserve_context); } void arch_crash_save_vmcoreinfo(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 9dd9262693a3..c43caa3a91f3 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -181,7 +181,7 @@ void machine_kexec_cleanup(struct kimage *image) * Do not allocate memory (or fail in any way) in machine_kexec(). * We are past the point of no return, committed to rebooting now. */ -NORET_TYPE void machine_kexec(struct kimage *image) +void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c30fe25d470d..703310a99023 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,11 +20,44 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) +/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are + * used to save some data for jumping back + */ +#define DATA(offset) (PAGE_SIZE/2+(offset)) + +/* Minimal CPU state */ +#define ESP DATA(0x0) +#define CR0 DATA(0x4) +#define CR3 DATA(0x8) +#define CR4 DATA(0xc) + +/* other data */ +#define CP_VA_CONTROL_PAGE DATA(0x10) +#define CP_PA_PGD DATA(0x14) +#define CP_PA_SWAP_PAGE DATA(0x18) +#define CP_PA_BACKUP_PAGES_MAP DATA(0x1c) + .text .align PAGE_SIZE .globl relocate_kernel relocate_kernel: - movl 8(%esp), %ebp /* list of pages */ + /* Save the CPU context, used for jumping back */ + + pushl %ebx + pushl %esi + pushl %edi + pushl %ebp + pushf + + movl 20+8(%esp), %ebp /* list of pages */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %esp, ESP(%edi) + movl %cr0, %eax + movl %eax, CR0(%edi) + movl %cr3, %eax + movl %eax, CR3(%edi) + movl %cr4, %eax + movl %eax, CR4(%edi) #ifdef CONFIG_X86_PAE /* map the control page at its virtual address */ @@ -138,15 +171,25 @@ relocate_kernel: relocate_new_kernel: /* read the arguments and say goodbye to the stack */ - movl 4(%esp), %ebx /* page_list */ - movl 8(%esp), %ebp /* list of pages */ - movl 12(%esp), %edx /* start address */ - movl 16(%esp), %ecx /* cpu_has_pae */ + movl 20+4(%esp), %ebx /* page_list */ + movl 20+8(%esp), %ebp /* list of pages */ + movl 20+12(%esp), %edx /* start address */ + movl 20+16(%esp), %ecx /* cpu_has_pae */ + movl 20+20(%esp), %esi /* preserve_context */ /* zero out flags, and disable interrupts */ pushl $0 popfl + /* save some information for jumping back */ + movl PTR(VA_CONTROL_PAGE)(%ebp), %edi + movl %edi, CP_VA_CONTROL_PAGE(%edi) + movl PTR(PA_PGD)(%ebp), %eax + movl %eax, CP_PA_PGD(%edi) + movl PTR(PA_SWAP_PAGE)(%ebp), %eax + movl %eax, CP_PA_SWAP_PAGE(%edi) + movl %ebx, CP_PA_BACKUP_PAGES_MAP(%edi) + /* get physical address of control page now */ /* this is impossible after page table switch */ movl PTR(PA_CONTROL_PAGE)(%ebp), %edi @@ -197,8 +240,90 @@ identity_mapped: xorl %eax, %eax movl %eax, %cr3 + movl CP_PA_SWAP_PAGE(%edi), %eax + pushl %eax + pushl %ebx + call swap_pages + addl $8, %esp + + /* To be certain of avoiding problems with self-modifying code + * I need to execute a serializing instruction here. + * So I flush the TLB, it's handy, and not processor dependent. + */ + xorl %eax, %eax + movl %eax, %cr3 + + /* set all of the registers to known values */ + /* leave %esp alone */ + + testl %esi, %esi + jnz 1f + xorl %edi, %edi + xorl %eax, %eax + xorl %ebx, %ebx + xorl %ecx, %ecx + xorl %edx, %edx + xorl %esi, %esi + xorl %ebp, %ebp + ret +1: + popl %edx + movl CP_PA_SWAP_PAGE(%edi), %esp + addl $PAGE_SIZE, %esp +2: + call *%edx + + /* get the re-entry point of the peer system */ + movl 0(%esp), %ebp + call 1f +1: + popl %ebx + subl $(1b - relocate_kernel), %ebx + movl CP_VA_CONTROL_PAGE(%ebx), %edi + lea PAGE_SIZE(%ebx), %esp + movl CP_PA_SWAP_PAGE(%ebx), %eax + movl CP_PA_BACKUP_PAGES_MAP(%ebx), %edx + pushl %eax + pushl %edx + call swap_pages + addl $8, %esp + movl CP_PA_PGD(%ebx), %eax + movl %eax, %cr3 + movl %cr0, %eax + orl $(1<<31), %eax + movl %eax, %cr0 + lea PAGE_SIZE(%edi), %esp + movl %edi, %eax + addl $(virtual_mapped - relocate_kernel), %eax + pushl %eax + ret + +virtual_mapped: + movl CR4(%edi), %eax + movl %eax, %cr4 + movl CR3(%edi), %eax + movl %eax, %cr3 + movl CR0(%edi), %eax + movl %eax, %cr0 + movl ESP(%edi), %esp + movl %ebp, %eax + + popf + popl %ebp + popl %edi + popl %esi + popl %ebx + ret + /* Do the copies */ - movl %ebx, %ecx +swap_pages: + movl 8(%esp), %edx + movl 4(%esp), %ecx + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + movl %ecx, %ebx jmp 1f 0: /* top, read another word from the indirection page */ @@ -226,27 +351,28 @@ identity_mapped: movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi + movl %edi, %eax + movl %esi, %ebp + + movl %edx, %edi movl $1024, %ecx rep ; movsl - jmp 0b -3: - - /* To be certain of avoiding problems with self-modifying code - * I need to execute a serializing instruction here. - * So I flush the TLB, it's handy, and not processor dependent. - */ - xorl %eax, %eax - movl %eax, %cr3 + movl %ebp, %edi + movl %eax, %esi + movl $1024, %ecx + rep ; movsl - /* set all of the registers to known values */ - /* leave %esp alone */ + movl %eax, %edi + movl %edx, %esi + movl $1024, %ecx + rep ; movsl - xorl %eax, %eax - xorl %ebx, %ebx - xorl %ecx, %ecx - xorl %edx, %edx - xorl %esi, %esi - xorl %edi, %edi - xorl %ebp, %ebp + lea PAGE_SIZE(%ebp), %esi + jmp 0b +3: + popl %esi + popl %edi + popl %ebx + popl %ebp ret -- cgit v1.2.2 From 89081d17f7bb81d89fa1aa9b70f821c5cf4d39e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 25 Jul 2008 19:45:10 -0700 Subject: kexec jump: save/restore device state This patch implements devices state save/restore before after kexec. This patch together with features in kexec_jump patch can be used for following: - A simple hibernation implementation without ACPI support. You can kexec a hibernating kernel, save the memory image of original system and shutdown the system. When resuming, you restore the memory image of original system via ordinary kexec load then jump back. - Kernel/system debug through making system snapshot. You can make system snapshot, jump back, do some thing and make another system snapshot. - Cooperative multi-kernel/system. With kexec jump, you can switch between several kernels/systems quickly without boot process except the first time. This appears like swap a whole kernel/system out/in. - A general method to call program in physical mode (paging turning off). This can be used to invoke BIOS code under Linux. The following user-space tools can be used with kexec jump: - kexec-tools needs to be patched to support kexec jump. The patches and the precompiled kexec can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-src_git_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec-tools-patches_git_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/kexec-tools/kexec_git_kh10 - makedumpfile with patches are used as memory image saving tool, it can exclude free pages from original kernel memory image file. The patches and the precompiled makedumpfile can be download from the following URL: source: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-src_cvs_kh10.tar.bz2 patches: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile-patches_cvs_kh10.tar.bz2 binary: http://khibernation.sourceforge.net/download/release_v10/makedumpfile/makedumpfile_cvs_kh10 - An initramfs image can be used as the root file system of kexeced kernel. An initramfs image built with "BuildRoot" can be downloaded from the following URL: initramfs image: http://khibernation.sourceforge.net/download/release_v10/initramfs/rootfs_cvs_kh10.gz All user space tools above are included in the initramfs image. Usage example of simple hibernation: 1. Compile and install patched kernel with following options selected: CONFIG_X86_32=y CONFIG_RELOCATABLE=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y CONFIG_PM=y CONFIG_HIBERNATION=y CONFIG_KEXEC_JUMP=y 2. Build an initramfs image contains kexec-tool and makedumpfile, or download the pre-built initramfs image, called rootfs.gz in following text. 3. Prepare a partition to save memory image of original kernel, called hibernating partition in following text. 4. Boot kernel compiled in step 1 (kernel A). 5. In the kernel A, load kernel compiled in step 1 (kernel B) with /sbin/kexec. The shell command line can be as follow: /sbin/kexec --load-preserve-context /boot/bzImage --mem-min=0x100000 --mem-max=0xffffff --initrd=rootfs.gz 6. Boot the kernel B with following shell command line: /sbin/kexec -e 7. The kernel B will boot as normal kexec. In kernel B the memory image of kernel A can be saved into hibernating partition as follow: jump_back_entry=`cat /proc/cmdline | tr ' ' '\n' | grep kexec_jump_back_entry | cut -d '='` echo $jump_back_entry > kexec_jump_back_entry cp /proc/vmcore dump.elf Then you can shutdown the machine as normal. 8. Boot kernel compiled in step 1 (kernel C). Use the rootfs.gz as root file system. 9. In kernel C, load the memory image of kernel A as follow: /sbin/kexec -l --args-none --entry=`cat kexec_jump_back_entry` dump.elf 10. Jump back to the kernel A as follow: /sbin/kexec -e Then, kernel A is resumed. Implementation point: To support jumping between two kernels, before jumping to (executing) the new kernel and jumping back to the original kernel, the devices are put into quiescent state, and the state of devices and CPU is saved. After jumping back from kexeced kernel and jumping to the new kernel, the state of devices and CPU are restored accordingly. The devices/CPU state save/restore code of software suspend is called to implement corresponding function. Known issues: - Because the segment number supported by sys_kexec_load is limited, hibernation image with many segments may not be load. This is planned to be eliminated by adding a new flag to sys_kexec_load to make a image can be loaded with multiple sys_kexec_load invoking. Now, only the i386 architecture is supported. Signed-off-by: Huang Ying Acked-by: Vivek Goyal Cc: "Eric W. Biederman" Cc: Pavel Machek Cc: Nigel Cunningham Cc: "Rafael J. Wysocki" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 5 +++-- arch/x86/kernel/machine_kexec_32.c | 12 ++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7ecb679f0130..6b2debfabddc 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1282,9 +1282,10 @@ config CRASH_DUMP config KEXEC_JUMP bool "kexec jump (EXPERIMENTAL)" depends on EXPERIMENTAL - depends on KEXEC && PM_SLEEP && X86_32 + depends on KEXEC && HIBERNATION && X86_32 help - Invoke code in physical address mode via KEXEC + Jump between original kernel and kexeced kernel and invoke + code in physical address mode via KEXEC config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 2b67609d0a1c..9fe478d98406 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -125,6 +125,18 @@ void machine_kexec(struct kimage *image) /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); + if (image->preserve_context) { +#ifdef CONFIG_X86_IO_APIC + /* We need to put APICs in legacy mode so that we can + * get timer interrupts in second kernel. kexec/kdump + * paths already have calls to disable_IO_APIC() in + * one form or other. kexec jump path also need + * one. + */ + disable_IO_APIC(); +#endif + } + control_page = page_address(image->control_code_page); memcpy(control_page, relocate_kernel, PAGE_SIZE/2); -- cgit v1.2.2 From 8174c430e445a93016ef18f717fe570214fa38bf Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:24 -0700 Subject: x86: lockless get_user_pages_fast() Implement get_user_pages_fast without locking in the fastpath on x86. Do an optimistic lockless pagetable walk, without taking mmap_sem or any page table locks or even mmap_sem. Page table existence is guaranteed by turning interrupts off (combined with the fact that we're always looking up the current mm, means we can do the lockless page table walk within the constraints of the TLB shootdown design). Basically we can do this lockless pagetable walk in a similar manner to the way the CPU's pagetable walker does not have to take any locks to find present ptes. This patch (combined with the subsequent ones to convert direct IO to use it) was found to give about 10% performance improvement on a 2 socket 8 core Intel Xeon system running an OLTP workload on DB2 v9.5 "To test the effects of the patch, an OLTP workload was run on an IBM x3850 M2 server with 2 processors (quad-core Intel Xeon processors at 2.93 GHz) using IBM DB2 v9.5 running Linux 2.6.24rc7 kernel. Comparing runs with and without the patch resulted in an overall performance benefit of ~9.8%. Correspondingly, oprofiles showed that samples from __up_read and __down_read routines that is seen during thread contention for system resources was reduced from 2.8% down to .05%. Monitoring the /proc/vmstat output from the patched run showed that the counter for fast_gup contained a very high number while the fast_gup_slow value was zero." (fast_gup is the old name for get_user_pages_fast, fast_gup_slow is a counter we had for the number of times the slowpath was invoked). The main reason for the improvement is that DB2 has multiple threads each issuing direct-IO. Direct-IO uses get_user_pages, and thus the threads contend the mmap_sem cacheline, and can also contend on page table locks. I would anticipate larger performance gains on larger systems, however I think DB2 uses an adaptive mix of threads and processes, so it could be that thread contention remains pretty constant as machine size increases. In which case, we stuck with "only" a 10% gain. The downside of using get_user_pages_fast is that if there is not a pte with the correct permissions for the access, we end up falling back to get_user_pages and so the get_user_pages_fast is a bit of extra work. However this should not be the common case in most performance critical code. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: build fix] [akpm@linux-foundation.org: Kconfig fix] [akpm@linux-foundation.org: Makefile fix/cleanup] [akpm@linux-foundation.org: warning fix] Signed-off-by: Nick Piggin Cc: Dave Kleikamp Cc: Andy Whitcroft Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Cc: Dave Kleikamp Cc: Badari Pulavarty Cc: Zach Brown Cc: Jens Axboe Reviewed-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/mm/Makefile | 1 + arch/x86/mm/gup.c | 258 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 260 insertions(+) create mode 100644 arch/x86/mm/gup.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6b2debfabddc..6bdde845818e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,6 +22,7 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_IOREMAP_PROT + select HAVE_GET_USER_PAGES_FAST select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB if !X86_RDC321X select HAVE_KRETPROBES diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 1fbb844c3d7a..2977ea37791f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,6 +1,7 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ pat.o pgtable.o +obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o obj-$(CONFIG_X86_32) += pgtable_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c new file mode 100644 index 000000000000..6f733121f32e --- /dev/null +++ b/arch/x86/mm/gup.c @@ -0,0 +1,258 @@ +/* + * Lockless get_user_pages_fast for x86 + * + * Copyright (C) 2008 Nick Piggin + * Copyright (C) 2008 Novell Inc. + */ +#include +#include +#include +#include + +#include + +static inline pte_t gup_get_pte(pte_t *ptep) +{ +#ifndef CONFIG_X86_PAE + return *ptep; +#else + /* + * With get_user_pages_fast, we walk down the pagetables without taking + * any locks. For this we would like to load the pointers atoimcally, + * but that is not possible (without expensive cmpxchg8b) on PAE. What + * we do have is the guarantee that a pte will only either go from not + * present to present, or present to not present or both -- it will not + * switch to a completely different present page without a TLB flush in + * between; something that we are blocking by holding interrupts off. + * + * Setting ptes from not present to present goes: + * ptep->pte_high = h; + * smp_wmb(); + * ptep->pte_low = l; + * + * And present to not present goes: + * ptep->pte_low = 0; + * smp_wmb(); + * ptep->pte_high = 0; + * + * We must ensure here that the load of pte_low sees l iff pte_high + * sees h. We load pte_high *after* loading pte_low, which ensures we + * don't see an older value of pte_high. *Then* we recheck pte_low, + * which ensures that we haven't picked up a changed pte high. We might + * have got rubbish values from pte_low and pte_high, but we are + * guaranteed that pte_low will not have the present bit set *unless* + * it is 'l'. And get_user_pages_fast only operates on present ptes, so + * we're safe. + * + * gup_get_pte should not be used or copied outside gup.c without being + * very careful -- it does not atomically load the pte or anything that + * is likely to be useful for you. + */ + pte_t pte; + +retry: + pte.pte_low = ptep->pte_low; + smp_rmb(); + pte.pte_high = ptep->pte_high; + smp_rmb(); + if (unlikely(pte.pte_low != ptep->pte_low)) + goto retry; + + return pte; +#endif +} + +/* + * The performance critical leaf functions are made noinline otherwise gcc + * inlines everything into a single function which results in too much + * register pressure. + */ +static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t *ptep; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + + ptep = pte_offset_map(&pmd, addr); + do { + pte_t pte = gup_get_pte(ptep); + struct page *page; + + if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { + pte_unmap(ptep); + return 0; + } + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + page = pte_page(pte); + get_page(page); + pages[*nr] = page; + (*nr)++; + + } while (ptep++, addr += PAGE_SIZE, addr != end); + pte_unmap(ptep - 1); + + return 1; +} + +static inline void get_head_page_multiple(struct page *page, int nr) +{ + VM_BUG_ON(page != compound_head(page)); + VM_BUG_ON(page_count(page) == 0); + atomic_add(nr, &page->_count); +} + +static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pmd; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + +static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pmd_t *pmdp; + + pmdp = pmd_offset(&pud, addr); + do { + pmd_t pmd = *pmdp; + + next = pmd_addr_end(addr, end); + if (pmd_none(pmd)) + return 0; + if (unlikely(pmd_large(pmd))) { + if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pte_range(pmd, addr, next, write, pages, nr)) + return 0; + } + } while (pmdp++, addr = next, addr != end); + + return 1; +} + +static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, + int write, struct page **pages, int *nr) +{ + unsigned long next; + pud_t *pudp; + + pudp = pud_offset(&pgd, addr); + do { + pud_t pud = *pudp; + + next = pud_addr_end(addr, end); + if (pud_none(pud)) + return 0; + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } while (pudp++, addr = next, addr != end); + + return 1; +} + +int get_user_pages_fast(unsigned long start, int nr_pages, int write, + struct page **pages) +{ + struct mm_struct *mm = current->mm; + unsigned long end = start + (nr_pages << PAGE_SHIFT); + unsigned long addr = start; + unsigned long next; + pgd_t *pgdp; + int nr = 0; + + if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, + start, nr_pages*PAGE_SIZE))) + goto slow_irqon; + + /* + * XXX: batch / limit 'nr', to avoid large irq off latency + * needs some instrumenting to determine the common sizes used by + * important workloads (eg. DB2), and whether limiting the batch size + * will decrease performance. + * + * It seems like we're in the clear for the moment. Direct-IO is + * the main guy that batches up lots of get_user_pages, and even + * they are limited to 64-at-a-time which is not so many. + */ + /* + * This doesn't prevent pagetable teardown, but does prevent + * the pagetables and pages from being freed on x86. + * + * So long as we atomically load page table pointers versus teardown + * (which we do on x86, with the above PAE exception), we can follow the + * address down to the the page and take a ref on it. + */ + local_irq_disable(); + pgdp = pgd_offset(mm, addr); + do { + pgd_t pgd = *pgdp; + + next = pgd_addr_end(addr, end); + if (pgd_none(pgd)) + goto slow; + if (!gup_pud_range(pgd, addr, next, write, pages, &nr)) + goto slow; + } while (pgdp++, addr = next, addr != end); + local_irq_enable(); + + VM_BUG_ON(nr != (end - start) >> PAGE_SHIFT); + return nr; + + { + int ret; + +slow: + local_irq_enable(); +slow_irqon: + /* Try to get the remaining pages with get_user_pages */ + start += nr << PAGE_SHIFT; + pages += nr; + + down_read(&mm->mmap_sem); + ret = get_user_pages(current, mm, start, + (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); + up_read(&mm->mmap_sem); + + /* Have to be a bit careful with return values */ + if (nr > 0) { + if (ret < 0) + ret = nr; + else + ret += nr; + } + + return ret; + } +} -- cgit v1.2.2 From 652ea695364142b2464744746beac206d050ef19 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 25 Jul 2008 19:45:27 -0700 Subject: x86: support 1GB hugepages with get_user_pages_lockless() Signed-off-by: Nick Piggin Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 43 ++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 6f733121f32e..3085f25b4355 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -124,7 +124,7 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, refs = 0; head = pte_page(pte); - page = head + ((addr & ~HPAGE_MASK) >> PAGE_SHIFT); + page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; @@ -162,6 +162,38 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } +static noinline int gup_huge_pud(pud_t pud, unsigned long addr, + unsigned long end, int write, struct page **pages, int *nr) +{ + unsigned long mask; + pte_t pte = *(pte_t *)&pud; + struct page *head, *page; + int refs; + + mask = _PAGE_PRESENT|_PAGE_USER; + if (write) + mask |= _PAGE_RW; + if ((pte_val(pte) & mask) != mask) + return 0; + /* hugepages are never "special" */ + VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(!pfn_valid(pte_pfn(pte))); + + refs = 0; + head = pte_page(pte); + page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT); + do { + VM_BUG_ON(compound_head(page) != head); + pages[*nr] = page; + (*nr)++; + page++; + refs++; + } while (addr += PAGE_SIZE, addr != end); + get_head_page_multiple(head, refs); + + return 1; +} + static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -175,8 +207,13 @@ static int gup_pud_range(pgd_t pgd, unsigned long addr, unsigned long end, next = pud_addr_end(addr, end); if (pud_none(pud)) return 0; - if (!gup_pmd_range(pud, addr, next, write, pages, nr)) - return 0; + if (unlikely(pud_large(pud))) { + if (!gup_huge_pud(pud, addr, next, write, pages, nr)) + return 0; + } else { + if (!gup_pmd_range(pud, addr, next, write, pages, nr)) + return 0; + } } while (pudp++, addr = next, addr != end); return 1; -- cgit v1.2.2 From 6341c393fcc37d58727865f1ee2f65e632e9d4f0 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 25 Jul 2008 19:45:44 -0700 Subject: tracehook: exec This moves all the ptrace hooks related to exec into tracehook.h inlines. This also lifts the calls for tracing out of the binfmt load_binary hooks into search_binary_handler() after it calls into the binfmt module. This change has no effect, since all the binfmt modules' load_binary functions did the call at the end on success, and now search_binary_handler() does it immediately after return if successful. We consolidate the repeated code, and binfmt modules no longer need to import ptrace_notify(). Signed-off-by: Roland McGrath Cc: Oleg Nesterov Reviewed-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32_aout.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 58cccb6483b0..a0e1dbe67dc1 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -441,12 +441,6 @@ beyond_if: regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; set_fs(USER_DS); - if (unlikely(current->ptrace & PT_PTRACED)) { - if (current->ptrace & PT_TRACE_EXEC) - ptrace_notify((PTRACE_EVENT_EXEC << 8) | SIGTRAP); - else - send_sig(SIGTRAP, current, 0); - } return 0; } -- cgit v1.2.2 From 8dad322f5449010c14990dd6934878f576b2ee60 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 25 Jul 2008 19:46:11 -0700 Subject: x86: use generic show_mem() Remove arch-specific show_mem() in favor of the generic version. This also removes the following redundant information display: - pages in swapcache, printed by show_swap_cache_info() - dirty pages, writeback pages, mapped pages, slab pages, pagetable pages, printed by show_free_areas() where show_mem() calls show_free_areas(), which calls show_swap_cache_info(). Signed-off-by: Johannes Weiner Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 37 ------------------------------------- arch/x86/mm/pgtable_32.c | 47 ----------------------------------------------- 2 files changed, 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec37121f6709..129618ca0ea2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -86,43 +86,6 @@ early_param("gbpages", parse_direct_gbpages_on); * around without checking the pgd every time. */ -void show_mem(void) -{ - long i, total = 0, reserved = 0; - long shared = 0, cached = 0; - struct page *page; - pg_data_t *pgdat; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - /* - * This loop can take a while with 256 GB and - * 4k pages so defer the NMI watchdog: - */ - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - - if (!pfn_valid(pgdat->node_start_pfn + i)) - continue; - - page = pfn_to_page(pgdat->node_start_pfn + i); - total++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - } - printk(KERN_INFO "%lu pages of RAM\n", total); - printk(KERN_INFO "%lu reserved pages\n", reserved); - printk(KERN_INFO "%lu pages shared\n", shared); - printk(KERN_INFO "%lu pages swap cached\n", cached); -} - int after_bootmem; static __init void *spp_getpage(void) diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index b4becbf8c570..cab0abbd1ebe 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -20,53 +20,6 @@ #include #include -void show_mem(void) -{ - int total = 0, reserved = 0; - int shared = 0, cached = 0; - int highmem = 0; - struct page *page; - pg_data_t *pgdat; - unsigned long i; - unsigned long flags; - - printk(KERN_INFO "Mem-info:\n"); - show_free_areas(); - for_each_online_pgdat(pgdat) { - pgdat_resize_lock(pgdat, &flags); - for (i = 0; i < pgdat->node_spanned_pages; ++i) { - if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) - touch_nmi_watchdog(); - page = pgdat_page_nr(pgdat, i); - total++; - if (PageHighMem(page)) - highmem++; - if (PageReserved(page)) - reserved++; - else if (PageSwapCache(page)) - cached++; - else if (page_count(page)) - shared += page_count(page) - 1; - } - pgdat_resize_unlock(pgdat, &flags); - } - printk(KERN_INFO "%d pages of RAM\n", total); - printk(KERN_INFO "%d pages of HIGHMEM\n", highmem); - printk(KERN_INFO "%d reserved pages\n", reserved); - printk(KERN_INFO "%d pages shared\n", shared); - printk(KERN_INFO "%d pages swap cached\n", cached); - - printk(KERN_INFO "%lu pages dirty\n", global_page_state(NR_FILE_DIRTY)); - printk(KERN_INFO "%lu pages writeback\n", - global_page_state(NR_WRITEBACK)); - printk(KERN_INFO "%lu pages mapped\n", global_page_state(NR_FILE_MAPPED)); - printk(KERN_INFO "%lu pages slab\n", - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE)); - printk(KERN_INFO "%lu pages pagetables\n", - global_page_state(NR_PAGETABLE)); -} - /* * Associate a virtual page frame with a given physical page frame * and protection flags for that frame. -- cgit v1.2.2 From 36a033082b5243d45d508c5ccd47a754edbc6821 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 14 Mar 2008 17:46:38 -0700 Subject: x86: tracehook_signal_handler This makes the x86 signal handling code use tracehook_signal_handler() in place of calling into ptrace guts. The call is moved after the sa_mask processing, but there is no other change. This cleanup doesn't matter to existing debuggers, but is the sensible thing: have all facets of the handler setup complete before the debugger inspects the task again. Signed-off-by: Roland McGrath --- arch/x86/kernel/signal_32.c | 6 ++++-- arch/x86/kernel/signal_64.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcdd8933..22aae1683c14 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -558,8 +559,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * handler too. */ regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); @@ -568,6 +567,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + return 0; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index b45ef8ddd651..3beb2db88c5a 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -444,8 +445,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * handler too. */ regs->flags &= ~X86_EFLAGS_TF; - if (test_thread_flag(TIF_SINGLESTEP)) - ptrace_notify(SIGTRAP); spin_lock_irq(¤t->sighand->siglock); sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); @@ -453,6 +452,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigaddset(¤t->blocked,sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); + + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); } return ret; -- cgit v1.2.2 From eeea3c3ff8af7f6960a0515d46dff6479bdb91f9 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 16 Mar 2008 23:36:28 -0700 Subject: x86: tracehook syscall This changes x86 syscall tracing to use the new tracehook.h entry points. There is no change, only cleanup. Signed-off-by: Roland McGrath --- arch/x86/kernel/ptrace.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e37dccce85db..19a7d2c40560 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1375,30 +1376,6 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) force_sig_info(SIGTRAP, &info, tsk); } -static void syscall_trace(struct pt_regs *regs) -{ - if (!(current->ptrace & PT_PTRACED)) - return; - -#if 0 - printk("trace %s ip %lx sp %lx ax %d origrax %d caller %lx tiflags %x ptrace %x\n", - current->comm, - regs->ip, regs->sp, regs->ax, regs->orig_ax, __builtin_return_address(0), - current_thread_info()->flags, current->ptrace); -#endif - - ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD) - ? 0x80 : 0)); - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} #ifdef CONFIG_X86_32 # define IS_IA32 1 @@ -1432,8 +1409,9 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) if (unlikely(test_thread_flag(TIF_SYSCALL_EMU))) ret = -1L; - if (ret || test_thread_flag(TIF_SYSCALL_TRACE)) - syscall_trace(regs); + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; if (unlikely(current->audit_context)) { if (IS_IA32) @@ -1459,7 +1437,7 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); if (test_thread_flag(TIF_SYSCALL_TRACE)) - syscall_trace(regs); + tracehook_report_syscall_exit(regs, 0); /* * If TIF_SYSCALL_EMU is set, we only get here because of @@ -1475,6 +1453,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) * system call instruction. */ if (test_thread_flag(TIF_SINGLESTEP) && - (current->ptrace & PT_PTRACED)) + tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) send_sigtrap(current, regs, 0); } -- cgit v1.2.2 From 4dfcbb997aa9f3a6a3ed8c192f0dac28b027e08f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sat, 19 Apr 2008 15:37:09 -0700 Subject: x86 signals: use asm/syscall.h Replace local inlines with the asm/syscall.h interfaces that do the same things. Signed-off-by: Roland McGrath --- arch/x86/kernel/signal_64.c | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3beb2db88c5a..cb7cf0216ab7 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "sigframe.h" #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP))) @@ -346,35 +347,6 @@ give_sigsegv: return -EFAULT; } -/* - * Return -1L or the syscall number that @regs is executing. - */ -static long current_syscall(struct pt_regs *regs) -{ - /* - * We always sign-extend a -1 value being set here, - * so this is always either -1L or a syscall number. - */ - return regs->orig_ax; -} - -/* - * Return a value that is -EFOO if the system call in @regs->orig_ax - * returned an error. This only works for @regs from @current. - */ -static long current_syscall_ret(struct pt_regs *regs) -{ -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - /* - * Sign-extend the value so (int)-EFOO becomes (long)-EFOO - * and will match correctly in comparisons. - */ - return (int) regs->ax; -#endif - return regs->ax; -} - /* * OK, we're invoking a handler */ @@ -386,9 +358,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -511,9 +483,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if (current_syscall(regs) >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (current_syscall_ret(regs)) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: -- cgit v1.2.2 From 59e52130f04537d2c80ea44bb007cadd1ad29543 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sat, 19 Apr 2008 19:10:57 -0700 Subject: x86: tracehook: TIF_NOTIFY_RESUME This adds TIF_NOTIFY_RESUME support for x86, both 64-bit and 32-bit. When set, we call tracehook_notify_resume() on the way to user mode. Signed-off-by: Roland McGrath --- arch/x86/kernel/signal_32.c | 5 +++++ arch/x86/kernel/signal_64.c | 5 +++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 22aae1683c14..4445d26efd47 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -663,5 +663,10 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + clear_thread_flag(TIF_IRET); } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index cb7cf0216ab7..d01e3f6ef26d 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -523,6 +523,11 @@ void do_notify_resume(struct pt_regs *regs, void *unused, /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.2.2 From 99bbc4b1e677ac695431e8d9c8e710ef391c567f Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Sun, 20 Apr 2008 14:35:12 -0700 Subject: x86: tracehook: CONFIG_HAVE_ARCH_TRACEHOOK The x86 arch code has all the prerequisites, so set HAVE_ARCH_TRACEHOOK. Signed-off-by: Roland McGrath --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b6fa2877b173..f463a8a3b21f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -30,6 +30,7 @@ config X86 select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) select HAVE_ARCH_KGDB if !X86_VOYAGER + select HAVE_ARCH_TRACEHOOK select HAVE_EFFICIENT_UNALIGNED_ACCESS config ARCH_DEFCONFIG -- cgit v1.2.2 From d25ae38b7e005af03843833bbd811ffe8c5f8cb4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 25 Jul 2008 19:39:03 -0700 Subject: x86: add apic probe for genapic 64bit - fix intr_remapping_enabled get assigned later, so need to check that in setup_apic_routing Signed-off-by: Yinghai Lu Cc: Jack Steiner Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_64.c | 6 ++++++ arch/x86/kernel/genx2apic_cluster.c | 2 +- arch/x86/kernel/genx2apic_phys.c | 2 +- 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index b3ba969c50d2..6c9bfc9e1e95 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -42,6 +43,11 @@ static struct genapic *apic_probe[] __initdata = { */ void __init setup_apic_routing(void) { + if (genapic == &apic_x2apic_phys || genapic == &apic_x2apic_cluster) { + if (!intr_remapping_enabled) + genapic = &apic_flat; + } + if (genapic == &apic_flat) { if (max_physical_apicid >= 8) genapic = &apic_physflat; diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index ef3f3182d50a..fed9f68efd66 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -14,7 +14,7 @@ DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (cpu_has_x2apic && intr_remapping_enabled) + if (cpu_has_x2apic) return 1; return 0; diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 3229c68aedd4..958d537b4cc9 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -21,7 +21,7 @@ early_param("x2apic_phys", set_x2apic_phys_mode); static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - if (cpu_has_x2apic && intr_remapping_enabled && x2apic_phys) + if (cpu_has_x2apic && x2apic_phys) return 1; return 0; -- cgit v1.2.2 From 5f4cb662a0a2533b45656607471571460310a5ca Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 14 Jul 2008 20:36:36 +0200 Subject: KVM: SVM: allow enabling/disabling NPT by reloading only the architecture module If NPT is enabled after loading both KVM modules on AMD and it should be disabled, both KVM modules must be reloaded. If only the architecture module is reloaded the behavior is undefined. With this patch it is possible to disable NPT only by reloading the kvm_amd module. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 6 ++++++ arch/x86/kvm/svm.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b0e4ddca6c18..d087d9c4f2d9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1870,6 +1870,12 @@ void kvm_enable_tdp(void) } EXPORT_SYMBOL_GPL(kvm_enable_tdp); +void kvm_disable_tdp(void) +{ + tdp_enabled = false; +} +EXPORT_SYMBOL_GPL(kvm_disable_tdp); + static void free_mmu_pages(struct kvm_vcpu *vcpu) { struct kvm_mmu_page *sp; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b756e876dce3..951b789cc913 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -453,7 +453,8 @@ static __init int svm_hardware_setup(void) if (npt_enabled) { printk(KERN_INFO "kvm: Nested Paging enabled\n"); kvm_enable_tdp(); - } + } else + kvm_disable_tdp(); return 0; -- cgit v1.2.2 From 98899aa0e0bf5de05850082be0eb837058c09ea5 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 16 Jul 2008 19:07:10 -0300 Subject: KVM: task switch: segment base is linear address The segment base is always a linear address, so translate before accessing guest memory. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9f1cdb011cff..cd687395e4e7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3223,6 +3223,7 @@ static void get_segment_descritptor_dtable(struct kvm_vcpu *vcpu, static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3232,13 +3233,16 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); return 1; } - return kvm_read_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_read_guest(vcpu->kvm, gpa, seg_desc, 8); } /* allowed just for 8 bytes segments */ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, struct desc_struct *seg_desc) { + gpa_t gpa; struct descriptor_table dtable; u16 index = selector >> 3; @@ -3246,7 +3250,9 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, if (dtable.limit < index * 8 + 7) return 1; - return kvm_write_guest(vcpu->kvm, dtable.base + index * 8, seg_desc, 8); + gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, dtable.base); + gpa += index * 8; + return kvm_write_guest(vcpu->kvm, gpa, seg_desc, 8); } static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, @@ -3258,7 +3264,7 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, base_addr |= (seg_desc->base1 << 16); base_addr |= (seg_desc->base2 << 24); - return base_addr; + return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } static int load_tss_segment32(struct kvm_vcpu *vcpu, -- cgit v1.2.2 From 34198bf8426276a2ce1e97056a0f02d43637e5ae Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 16 Jul 2008 19:07:11 -0300 Subject: KVM: task switch: use seg regs provided by subarch instead of reading from GDT There is no guarantee that the old TSS descriptor in the GDT contains the proper base address. This is the case for Windows installation's reboot-via-triplefault. Use guest registers instead. Also translate the address properly. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 93 ++++++++++++++++++------------------------------------ 1 file changed, 30 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index cd687395e4e7..27c6ece91da6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3267,54 +3267,6 @@ static u32 get_tss_base_addr(struct kvm_vcpu *vcpu, return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); } -static int load_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int save_tss_segment32(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_32 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_32)); -} - -static int load_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_read_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); -} - -static int save_tss_segment16(struct kvm_vcpu *vcpu, - struct desc_struct *seg_desc, - struct tss_segment_16 *tss) -{ - u32 base_addr; - - base_addr = get_tss_base_addr(vcpu, seg_desc); - - return kvm_write_guest(vcpu->kvm, base_addr, tss, - sizeof(struct tss_segment_16)); -} - static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) { struct kvm_segment kvm_seg; @@ -3472,20 +3424,26 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu, } static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_16 tss_segment_16; int ret = 0; - if (load_tss_segment16(vcpu, cseg_desc, &tss_segment_16)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; save_state_to_tss16(vcpu, &tss_segment_16); - save_tss_segment16(vcpu, cseg_desc, &tss_segment_16); - if (load_tss_segment16(vcpu, nseg_desc, &tss_segment_16)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_16, + sizeof tss_segment_16)) goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_16, sizeof tss_segment_16)) + goto out; + if (load_state_from_tss16(vcpu, &tss_segment_16)) goto out; @@ -3495,20 +3453,26 @@ out: } static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector, - struct desc_struct *cseg_desc, + u32 old_tss_base, struct desc_struct *nseg_desc) { struct tss_segment_32 tss_segment_32; int ret = 0; - if (load_tss_segment32(vcpu, cseg_desc, &tss_segment_32)) + if (kvm_read_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) goto out; save_state_to_tss32(vcpu, &tss_segment_32); - save_tss_segment32(vcpu, cseg_desc, &tss_segment_32); - if (load_tss_segment32(vcpu, nseg_desc, &tss_segment_32)) + if (kvm_write_guest(vcpu->kvm, old_tss_base, &tss_segment_32, + sizeof tss_segment_32)) + goto out; + + if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), + &tss_segment_32, sizeof tss_segment_32)) goto out; + if (load_state_from_tss32(vcpu, &tss_segment_32)) goto out; @@ -3523,16 +3487,20 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) struct desc_struct cseg_desc; struct desc_struct nseg_desc; int ret = 0; + u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); + u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); - kvm_get_segment(vcpu, &tr_seg, VCPU_SREG_TR); + old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); + /* FIXME: Handle errors. Failure to read either TSS or their + * descriptors should generate a pagefault. + */ if (load_guest_segment_descriptor(vcpu, tss_selector, &nseg_desc)) goto out; - if (load_guest_segment_descriptor(vcpu, tr_seg.selector, &cseg_desc)) + if (load_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc)) goto out; - if (reason != TASK_SWITCH_IRET) { int cpl; @@ -3550,8 +3518,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) if (reason == TASK_SWITCH_IRET || reason == TASK_SWITCH_JMP) { cseg_desc.type &= ~(1 << 1); //clear the B flag - save_guest_segment_descriptor(vcpu, tr_seg.selector, - &cseg_desc); + save_guest_segment_descriptor(vcpu, old_tss_sel, &cseg_desc); } if (reason == TASK_SWITCH_IRET) { @@ -3563,10 +3530,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) - ret = kvm_task_switch_32(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, &nseg_desc); else - ret = kvm_task_switch_16(vcpu, tss_selector, &cseg_desc, + ret = kvm_task_switch_16(vcpu, tss_selector, old_tss_base, &nseg_desc); if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { -- cgit v1.2.2 From 577bdc496614ced56d999bbb425e85adf2386490 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 19 Jul 2008 08:57:05 +0300 Subject: KVM: Avoid instruction emulation when event delivery is pending When an event (such as an interrupt) is injected, and the stack is shadowed (and therefore write protected), the guest will exit. The current code will see that the stack is shadowed and emulate a few instructions, each time postponing the injection. Eventually the injection may succeed, but at that time the guest may be unwilling to accept the interrupt (for example, the TPR may have changed). This occurs every once in a while during a Windows 2008 boot. Fix by unshadowing the fault address if the fault was due to an event injection. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + arch/x86/kvm/svm.c | 7 ++++++- arch/x86/kvm/vmx.c | 2 ++ 3 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d087d9c4f2d9..2fa231923cf7 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1814,6 +1814,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) spin_unlock(&vcpu->kvm->mmu_lock); return r; } +EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt); void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 951b789cc913..e2ee264740c7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1008,10 +1008,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) struct kvm *kvm = svm->vcpu.kvm; u64 fault_address; u32 error_code; + bool event_injection = false; if (!irqchip_in_kernel(kvm) && - is_external_interrupt(exit_int_info)) + is_external_interrupt(exit_int_info)) { + event_injection = true; push_irq(&svm->vcpu, exit_int_info & SVM_EVTINJ_VEC_MASK); + } fault_address = svm->vmcb->control.exit_info_2; error_code = svm->vmcb->control.exit_info_1; @@ -1025,6 +1028,8 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) (u32)fault_address, (u32)(fault_address >> 32), handler); + if (event_injection) + kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 0cac63701719..b918fc83435c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2298,6 +2298,8 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); + if (vect_info & VECTORING_INFO_VALID_MASK) + kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } -- cgit v1.2.2 From c93cd3a58845012df2d658fecd0ac99f7008d753 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 19 Jul 2008 19:08:07 -0300 Subject: KVM: task switch: translate guest segment limit to virt-extension byte granular field If 'g' is one then limit is 4kb granular. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 27c6ece91da6..5916191420c7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3184,6 +3184,10 @@ static void seg_desct_to_kvm_desct(struct desc_struct *seg_desc, u16 selector, kvm_desct->base |= seg_desc->base2 << 24; kvm_desct->limit = seg_desc->limit0; kvm_desct->limit |= seg_desc->limit << 16; + if (seg_desc->g) { + kvm_desct->limit <<= 12; + kvm_desct->limit |= 0xfff; + } kvm_desct->selector = selector; kvm_desct->type = seg_desc->type; kvm_desct->present = seg_desc->p; -- cgit v1.2.2 From 5ec5726a16245138f5d5305b00a752acb5730076 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 16 Jul 2008 09:21:22 +0800 Subject: KVM: VMX: Fix bypass_guest_pf enabling when disable EPT in module parameter Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b918fc83435c..f71151d999e4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3305,7 +3305,7 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); - if (cpu_has_vmx_ept()) + if (vm_need_ept()) bypass_guest_pf = 0; if (bypass_guest_pf) -- cgit v1.2.2 From 5fdbcb9dd16f1e89ead127d3ee1a38e3a00cf1ea Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 16 Jul 2008 09:25:40 +0800 Subject: KVM: VMX: Fix undefined beaviour of EPT after reload kvm-intel.ko As well as move set base/mask ptes to vmx_init(). Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f71151d999e4..2a69773e3b26 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3118,15 +3118,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) return ERR_PTR(-ENOMEM); allocate_vpid(vmx); - if (id == 0 && vm_need_ept()) { - kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | - VMX_EPT_WRITABLE_MASK | - VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); - kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, - VMX_EPT_FAKE_DIRTY_MASK, 0ull, - VMX_EPT_EXECUTABLE_MASK); - kvm_enable_tdp(); - } err = kvm_vcpu_init(&vmx->vcpu, kvm, id); if (err) @@ -3305,8 +3296,17 @@ static int __init vmx_init(void) vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_ESP); vmx_disable_intercept_for_msr(vmx_msr_bitmap, MSR_IA32_SYSENTER_EIP); - if (vm_need_ept()) + if (vm_need_ept()) { bypass_guest_pf = 0; + kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | + VMX_EPT_WRITABLE_MASK | + VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); + kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, + VMX_EPT_FAKE_DIRTY_MASK, 0ull, + VMX_EPT_EXECUTABLE_MASK); + kvm_enable_tdp(); + } else + kvm_disable_tdp(); if (bypass_guest_pf) kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); -- cgit v1.2.2 From 583323b9d2f624884a8c9563fb5a4d795f39ab07 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 27 Jul 2008 21:43:11 +0200 Subject: x86: fix cpu hotplug on 32bit commit 3e9704739daf46a8ba6593d749c67b5f7cd633d2 ("x86: boot secondary cpus through initial_code") causes the kernel to crash when a CPU is brought online after the read only sections have been write protected. The write to initial_code in do_boot_cpu() fails. Move inital_code to .cpuinit.data section. Signed-off-by: Thomas Gleixner Acked-by: H. Peter Anvin --- arch/x86/kernel/head_32.S | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f67e93441caf..a7010c3a377a 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -456,9 +456,6 @@ is386: movl $2,%ecx # set MP 1: #endif /* CONFIG_SMP */ jmp *(initial_code) -.align 4 -ENTRY(initial_code) - .long i386_start_kernel /* * We depend on ET to be correct. This checks for 287/387. @@ -601,6 +598,11 @@ ignore_int: #endif iret +.section .cpuinit.data,"wa" +.align 4 +ENTRY(initial_code) + .long i386_start_kernel + .section .text /* * Real beginning of normal "text" segment -- cgit v1.2.2 From a05d2ebab28011c2f3f520833f4bfdd2fd1b9c02 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 27 Jul 2008 08:45:02 -0700 Subject: xen: fix allocation and use of large ldts When the ldt gets to more than 1 page in size, the kernel uses vmalloc to allocate it. This means that: - when making the ldt RO, we must update the pages in both the vmalloc mapping and the linear mapping to make sure there are no RW aliases. - we need to use arbitrary_virt_to_machine to compute the machine addr for each update Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 51 ++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e2767c28dac7..b011e4a5dbbe 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -325,24 +325,55 @@ static unsigned long xen_store_tr(void) return 0; } +/* + * If 'v' is a vmalloc mapping, then find the linear mapping of the + * page (if any) and also set its protections to match: + */ +static void set_aliased_prot(void *v, pgprot_t prot) +{ + int level; + pte_t *ptep; + pte_t pte; + unsigned long pfn; + struct page *page; + + ptep = lookup_address((unsigned long)v, &level); + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + page = pfn_to_page(pfn); + + pte = pfn_pte(pfn, prot); + + if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) + BUG(); + + if (!PageHighMem(page)) { + void *av = __va(PFN_PHYS(pfn)); + + if (av != v) + if (HYPERVISOR_update_va_mapping((unsigned long)av, pte, 0)) + BUG(); + } else + kmap_flush_unused(); +} + static void xen_alloc_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readonly(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL_RO); } static void xen_free_ldt(struct desc_struct *ldt, unsigned entries) { - unsigned pages = roundup(entries * LDT_ENTRY_SIZE, PAGE_SIZE); - void *v = ldt; + const unsigned entries_per_page = PAGE_SIZE / LDT_ENTRY_SIZE; int i; - for(i = 0; i < pages; i += PAGE_SIZE) - make_lowmem_page_readwrite(v + i); + for(i = 0; i < entries; i += entries_per_page) + set_aliased_prot(ldt + i, PAGE_KERNEL); } static void xen_set_ldt(const void *addr, unsigned entries) @@ -446,7 +477,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); u64 entry = *(u64 *)ptr; preempt_disable(); @@ -579,7 +610,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry, } static void xen_load_sp0(struct tss_struct *tss, - struct thread_struct *thread) + struct thread_struct *thread) { struct multicall_space mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); -- cgit v1.2.2 From 7225e75144b9718cbbe1820d9c011c809d5773fd Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sat, 26 Jul 2008 17:54:22 -0700 Subject: documentation: move mtrr.txt to Doc/x86/ subdir Move mtrr.txt to the Documentation/x86/ subdirectory. Add 00-INDEX to the Documentation/x86/ subdirectory. Signed-off-by: Randy Dunlap Cc: Adrian Bunk Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 03980cb04291..06f935469d0b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1131,7 +1131,7 @@ config MTRR You can safely say Y even if your machine doesn't have MTRRs, you'll just add about 9 KB to your kernel. - See for more information. + See for more information. config MTRR_SANITIZER bool -- cgit v1.2.2 From 8cb22bcb1f3ef70d4d48092e9b057175ad9ec78d Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 18 Jul 2008 16:03:52 -0500 Subject: x86: L3 cache index disable for 2.6.26 New versions of AMD processors have support to disable parts of their L3 caches if too many MCEs are generated by the L3 cache. This patch provides a /sysfs interface under the cache hierarchy to display which caches indices are disabled (if any) and to monitoring applications to disable a cache index. This patch does not set an automatic policy to disable the L3 cache. Policy decisions would need to be made by a RAS handler. This patch merely makes it easier to see what indices are currently disabled. Signed-off-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 87 ++++++++++++++++++++++++++++++++--- 1 file changed, 81 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index ff517f0b8cc4..d6ea50e270e0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -16,6 +16,7 @@ #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -130,6 +131,7 @@ struct _cpuid4_info { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; + unsigned long can_disable; cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ }; @@ -251,6 +253,13 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } +static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) +{ + if (index < 3) + return; + this_leaf->can_disable = 1; +} + static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { union _cpuid4_leaf_eax eax; @@ -258,9 +267,12 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le union _cpuid4_leaf_ecx ecx; unsigned edx; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - else + if (boot_cpu_data.x86 >= 0x10) + amd_check_l3_disable(index, this_leaf); + + } else cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ @@ -637,6 +649,61 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { } } +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +{ + struct pci_dev *dev; + if (this_leaf->can_disable) { + int i; + ssize_t ret = 0; + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + dev = k8_northbridges[node]; + + for (i = 0; i < 2; i++) { + unsigned int reg; + pci_read_config_dword(dev, 0x1BC + i * 4, ®); + ret += sprintf(buf, "%sEntry: %d\n", buf, i); + ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", + buf, + reg & 0x80000000 ? "Disabled" : "Allowed", + reg & 0x40000000 ? "Disabled" : "Allowed"); + ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", buf, + (reg & 0x30000) >> 16, reg & 0xfff); + + } + return ret; + } + return sprintf(buf, "Feature not enabled\n"); +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count) +{ + struct pci_dev *dev; + if (this_leaf->can_disable) { + /* write the MSR value */ + unsigned int ret; + unsigned int index, val; + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + dev = k8_northbridges[node]; + + if (strlen(buf) > 15) + return -EINVAL; + ret = sscanf(buf, "%x %x", &index, &val); + if (ret != 2) + return -EINVAL; + if (index > 1) + return -EINVAL; + val |= 0xc0000000; + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); + wbinvd(); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + return 1; + } + return 0; +} + struct _cache_attr { struct attribute attr; ssize_t (*show)(struct _cpuid4_info *, char *); @@ -657,6 +724,8 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); +static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); + static struct attribute * default_attrs[] = { &type.attr, &level.attr, @@ -667,12 +736,10 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, + &cache_disable.attr, NULL }; -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) - static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) { struct _cache_attr *fattr = to_attr(attr); @@ -689,7 +756,15 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) static ssize_t store(struct kobject * kobj, struct attribute * attr, const char * buf, size_t count) { - return 0; + struct _cache_attr *fattr = to_attr(attr); + struct _index_kobject *this_leaf = to_object(kobj); + ssize_t ret; + + ret = fattr->store ? + fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, count) : + 0; + return ret; } static struct sysfs_ops sysfs_ops = { -- cgit v1.2.2 From 7a4983bb5f94f6521aa3236fe5c035cf9bef543f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 21 Jul 2008 13:34:21 +0200 Subject: x86: L3 cache index disable for 2.6.26, cleanups No change in functionality. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 115 ++++++++++++++++++---------------- 1 file changed, 61 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d6ea50e270e0..a61c9e399ba9 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -253,14 +253,16 @@ static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } -static void __cpuinit amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) +static void __cpuinit +amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) { if (index < 3) return; this_leaf->can_disable = 1; } -static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) +static int +__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; @@ -271,19 +273,20 @@ static int __cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_le amd_cpuid4(index, &eax, &ebx, &ecx); if (boot_cpu_data.x86 >= 0x10) amd_check_l3_disable(index, this_leaf); - - } else - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } else { + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); + } + if (eax.split.type == CACHE_TYPE_NULL) return -EIO; /* better error ? */ this_leaf->eax = eax; this_leaf->ebx = ebx; this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); + this_leaf->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } @@ -649,59 +652,63 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { } } -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) +#define to_object(k) container_of(k, struct _index_kobject, kobj) +#define to_attr(a) container_of(a, struct _cache_attr, attr) static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { - struct pci_dev *dev; - if (this_leaf->can_disable) { - int i; - ssize_t ret = 0; - int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); - dev = k8_northbridges[node]; - - for (i = 0; i < 2; i++) { - unsigned int reg; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", buf, - (reg & 0x30000) >> 16, reg & 0xfff); + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = k8_northbridges[node]; + ssize_t ret = 0; + int i; - } - return ret; + if (!this_leaf->can_disable) + return sprintf(buf, "Feature not enabled\n"); + + for (i = 0; i < 2; i++) { + unsigned int reg; + + pci_read_config_dword(dev, 0x1BC + i * 4, ®); + + ret += sprintf(buf, "%sEntry: %d\n", buf, i); + ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", + buf, + reg & 0x80000000 ? "Disabled" : "Allowed", + reg & 0x40000000 ? "Disabled" : "Allowed"); + ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", + buf, (reg & 0x30000) >> 16, reg & 0xfff); } - return sprintf(buf, "Feature not enabled\n"); + return ret; } -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count) +static ssize_t +store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, + size_t count) { - struct pci_dev *dev; - if (this_leaf->can_disable) { - /* write the MSR value */ - unsigned int ret; - unsigned int index, val; - int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); - dev = k8_northbridges[node]; - - if (strlen(buf) > 15) - return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) - return -EINVAL; - if (index > 1) - return -EINVAL; - val |= 0xc0000000; - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); - wbinvd(); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - return 1; - } - return 0; + int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); + struct pci_dev *dev = k8_northbridges[node]; + unsigned int ret, index, val; + + if (!this_leaf->can_disable) + return 0; + + /* write the MSR value */ + + if (strlen(buf) > 15) + return -EINVAL; + + ret = sscanf(buf, "%x %x", &index, &val); + if (ret != 2) + return -EINVAL; + if (index > 1) + return -EINVAL; + + val |= 0xc0000000; + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); + wbinvd(); + pci_write_config_dword(dev, 0x1BC + index * 4, val); + + return 1; } struct _cache_attr { -- cgit v1.2.2 From a24e8d36f5fc047dac9af6200322ed393f2e3175 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Tue, 22 Jul 2008 13:06:02 -0500 Subject: x86: L3 cache index disable for 2.6.26 On Monday 21 July 2008, Ingo Molnar wrote: > > applied to tip/x86/cpu, thanks Mark. > > > > I've done some coding style fixes for the new functions you've > > introduced, see that commit below. > > -tip testing found the following build failure: > > arch/x86/kernel/built-in.o: In function `show_cache_disable': > intel_cacheinfo.c:(.text+0xbbf2): undefined reference to `k8_northbridges' > arch/x86/kernel/built-in.o: In function `store_cache_disable': > intel_cacheinfo.c:(.text+0xbd91): undefined reference to `k8_northbridges' > > please send a delta fix patch against the tip/x86/cpu branch: > > http://people.redhat.com/mingo/tip.git/README > > which has your patch plus the cleanup applied. delta fix patch follows. It removes the dependency on k8_northbridges. -Mark Langsdorf Operating System Research Center AMD Signed-off-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 43 +++++++++++++++++++++++++++++++---- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a61c9e399ba9..a0c6c6ffed46 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -13,10 +13,10 @@ #include #include #include +#include #include #include -#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -135,6 +135,12 @@ struct _cpuid4_info { cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ }; +static struct pci_device_id k8_nb_id[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} +}; + unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -655,16 +661,39 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) +static struct pci_dev *get_k8_northbridge(int node) +{ + struct pci_dev *dev = NULL; + int i; + + for (i = 0; i <= node; i++) { + do { + dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); + if (!dev) + break; + } while (!pci_match_id(&k8_nb_id[0], dev)); + if (!dev) + break; + } + return dev; +} + static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); - struct pci_dev *dev = k8_northbridges[node]; + struct pci_dev *dev = NULL; ssize_t ret = 0; int i; if (!this_leaf->can_disable) return sprintf(buf, "Feature not enabled\n"); + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + for (i = 0; i < 2; i++) { unsigned int reg; @@ -686,14 +715,12 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, size_t count) { int node = cpu_to_node(first_cpu(this_leaf->shared_cpu_map)); - struct pci_dev *dev = k8_northbridges[node]; + struct pci_dev *dev = NULL; unsigned int ret, index, val; if (!this_leaf->can_disable) return 0; - /* write the MSR value */ - if (strlen(buf) > 15) return -EINVAL; @@ -704,6 +731,12 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, return -EINVAL; val |= 0xc0000000; + dev = get_k8_northbridge(node); + if (!dev) { + printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); + return -EINVAL; + } + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); -- cgit v1.2.2 From cdcf772ed163651cacac8098b4974aba7f9e1c73 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 28 Jul 2008 16:20:08 +0200 Subject: x86 l3 cache index disable for 2 6 26 fix Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 39 ++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a0c6c6ffed46..535d662716de 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -1,8 +1,8 @@ /* - * Routines to indentify caches on Intel CPU. + * Routines to indentify caches on Intel CPU. * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) + * Changes: + * Venkatesh Pallipadi : Adding cache identification through cpuid(4) * Ashok Raj : Work with CPU hotplug infrastructure. * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ @@ -136,9 +136,9 @@ struct _cpuid4_info { }; static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + {} }; unsigned short num_cache_leaves; @@ -190,9 +190,10 @@ static unsigned short assocs[] __cpuinitdata = { static unsigned char levels[] __cpuinitdata = { 1, 1, 2, 3 }; static unsigned char types[] __cpuinitdata = { 1, 2, 3, 3 }; -static void __cpuinit amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void __cpuinit +amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, + union _cpuid4_leaf_ecx *ecx) { unsigned dummy; unsigned line_size, lines_per_tag, assoc, size_in_kb; @@ -264,7 +265,7 @@ amd_check_l3_disable(int index, struct _cpuid4_info *this_leaf) { if (index < 3) return; - this_leaf->can_disable = 1; + this_leaf->can_disable = 1; } static int @@ -474,7 +475,7 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) /* pointer to _cpuid4_info array (for each cache leaf) */ static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) +#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) @@ -511,7 +512,7 @@ static void __cpuinit cache_remove_shared_cpu_map(unsigned int cpu, int index) this_leaf = CPUID4_INFO_IDX(cpu, index); for_each_cpu_mask(sibling, this_leaf->shared_cpu_map) { - sibling_leaf = CPUID4_INFO_IDX(sibling, index); + sibling_leaf = CPUID4_INFO_IDX(sibling, index); cpu_clear(cpu, sibling_leaf->shared_cpu_map); } } @@ -593,7 +594,7 @@ struct _index_kobject { /* pointer to array of kobjects for cpuX/cache/indexY */ static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) +#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) #define show_one_plus(file_name, object, val) \ static ssize_t show_##file_name \ @@ -675,7 +676,7 @@ static struct pci_dev *get_k8_northbridge(int node) if (!dev) break; } - return dev; + return dev; } static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) @@ -736,7 +737,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; } - + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); @@ -789,7 +790,7 @@ static ssize_t show(struct kobject * kobj, struct attribute * attr, char * buf) ret = fattr->show ? fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), buf) : - 0; + 0; return ret; } @@ -800,9 +801,9 @@ static ssize_t store(struct kobject * kobj, struct attribute * attr, struct _index_kobject *this_leaf = to_object(kobj); ssize_t ret; - ret = fattr->store ? - fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count) : + ret = fattr->store ? + fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), + buf, count) : 0; return ret; } -- cgit v1.2.2 From d89961e2dc87b6e30b8e3f60bd2af5cd92cf4643 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 24 Jul 2008 13:48:58 -0700 Subject: xen: suppress known wrmsrs In general, Xen doesn't support wrmsr from an unprivileged domain; it just ends up ignoring the instruction and printing a message on the console. Given that there are sets of MSRs we know the kernel will try to write to, but we don't care, just eat them in xen_write_msr to cut down on console noise. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b011e4a5dbbe..b795470ec069 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -854,6 +854,19 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) ret = -EFAULT; break; #endif + + case MSR_STAR: + case MSR_CSTAR: + case MSR_LSTAR: + case MSR_SYSCALL_MASK: + case MSR_IA32_SYSENTER_CS: + case MSR_IA32_SYSENTER_ESP: + case MSR_IA32_SYSENTER_EIP: + /* Fast syscall setup is all done in hypercalls, so + these are all ignored. Stub them out here to stop + Xen console noise. */ + break; + default: ret = native_write_msr_safe(msr, low, high); } -- cgit v1.2.2 From 239bd83104ec6bcba90221d8b0973d2565142ef8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 28 Jul 2008 16:45:49 +0200 Subject: x86: L3 cache index disable for 2.6.26, fix #2 fix !PCI build failure: arch/x86/kernel/cpu/intel_cacheinfo.c: In function 'get_k8_northbridge': arch/x86/kernel/cpu/intel_cacheinfo.c:675: error: implicit declaration of function 'pci_match_id' Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d763d24187c2..1677b55371a5 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -135,11 +135,13 @@ struct _cpuid4_info { cpumask_t shared_cpu_map; /* future?: only cpus/node is needed */ }; +#ifdef CONFIG_PCI static struct pci_device_id k8_nb_id[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, {} }; +#endif unsigned short num_cache_leaves; @@ -663,6 +665,7 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) { #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) +#ifdef CONFIG_PCI static struct pci_dev *get_k8_northbridge(int node) { struct pci_dev *dev = NULL; @@ -679,6 +682,12 @@ static struct pci_dev *get_k8_northbridge(int node) } return dev; } +#else +static struct pci_dev *get_k8_northbridge(int node) +{ + return NULL; +} +#endif static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { -- cgit v1.2.2 From 9a56a0f80b52cb41c5e0add47c7ce0bb2ef25eb0 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:13 +0200 Subject: x86: moved Intel microcode patch loader declarations to seperate header file Intel specific microcode declarations have been moved to a seperate header file. There are no code changes to the code itself and no side effects to other parts. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 6994c751590e..0d654bd32928 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -93,6 +93,7 @@ #include #include #include +#include MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); -- cgit v1.2.2 From 8e61028dfdc6b8ca996abfe8f9baef6792ea2904 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:14 +0200 Subject: x86: typedef removal Removed typedefs. No functional changes to the code. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 0d654bd32928..74e6a77bf190 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -102,17 +102,17 @@ MODULE_LICENSE("GPL"); #define MICROCODE_VERSION "1.14a" #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (microcode_header_t)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ #define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ #define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ #define DWSIZE (sizeof (u32)) #define get_totalsize(mc) \ - (((microcode_t *)mc)->hdr.totalsize ? \ - ((microcode_t *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) + (((struct microcode *)mc)->hdr.totalsize ? \ + ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) #define get_datasize(mc) \ - (((microcode_t *)mc)->hdr.datasize ? \ - ((microcode_t *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + (((struct microcode *)mc)->hdr.datasize ? \ + ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) #define sigmatch(s1, s2, p1, p2) \ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) @@ -130,7 +130,7 @@ static struct ucode_cpu_info { unsigned int sig; unsigned int pf; unsigned int rev; - microcode_t *mc; + struct microcode *mc; } ucode_cpu_info[NR_CPUS]; static void collect_cpu_info(int cpu_num) @@ -171,7 +171,7 @@ static void collect_cpu_info(int cpu_num) } static inline int microcode_update_match(int cpu_num, - microcode_header_t *mc_header, int sig, int pf) + struct microcode_header *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -183,7 +183,7 @@ static inline int microcode_update_match(int cpu_num, static int microcode_sanity_check(void *mc) { - microcode_header_t *mc_header = mc; + struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header = NULL; struct extended_signature *ext_sig; unsigned long total_size, data_size, ext_table_size; @@ -268,7 +268,7 @@ static int microcode_sanity_check(void *mc) static int get_maching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - microcode_header_t *mc_header = mc; + struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; @@ -355,7 +355,7 @@ static unsigned int user_buffer_size; /* it's size */ static long get_next_ucode(void **mc, long offset) { - microcode_header_t mc_header; + struct microcode_header mc_header; unsigned long total_size; /* No more data */ @@ -497,13 +497,13 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { - microcode_header_t *mc_header; + struct microcode_header *mc_header; unsigned long total_size; /* No more data */ if (offset >= size) return 0; - mc_header = (microcode_header_t *)(buf + offset); + mc_header = (struct microcode_header *)(buf + offset); total_size = get_totalsize(mc_header); if (offset + total_size > size) { -- cgit v1.2.2 From c3b71bcec0380836caac9b524fa1585b469b7456 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:15 +0200 Subject: x86: move per CPU microcode structure declaration to header file This structure will be later used by other modules as well and needs therfore to be moved out to a header file. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 74e6a77bf190..4e7b2f65fed6 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -125,13 +125,7 @@ static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ static DEFINE_MUTEX(microcode_mutex); -static struct ucode_cpu_info { - int valid; - unsigned int sig; - unsigned int pf; - unsigned int rev; - struct microcode *mc; -} ucode_cpu_info[NR_CPUS]; +static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; static void collect_cpu_info(int cpu_num) { -- cgit v1.2.2 From 1abae31096007cc993f67ae2576fe8f812270ad6 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:16 +0200 Subject: x86: move microcode.c to microcode_intel.c Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/microcode.c | 855 -------------------------------------- arch/x86/kernel/microcode_intel.c | 855 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 856 insertions(+), 856 deletions(-) delete mode 100644 arch/x86/kernel/microcode.c create mode 100644 arch/x86/kernel/microcode_intel.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 3db651fc8ec5..a9be2a0dde80 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,7 +51,7 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_MICROCODE) += microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c deleted file mode 100644 index 4e7b2f65fed6..000000000000 --- a/arch/x86/kernel/microcode.c +++ /dev/null @@ -1,855 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/253668.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to speculative - * nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ - -//#define DEBUG /* pr_debug */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "1.14a" - -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) -#define get_totalsize(mc) \ - (((struct microcode *)mc)->hdr.totalsize ? \ - ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) -#define get_datasize(mc) \ - (((struct microcode *)mc)->hdr.datasize ? \ - ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) - -#define sigmatch(s1, s2, p1, p2) \ - (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) - -#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) - -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); - -static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info(int cpu_num) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu_num); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - unsigned int val[2]; - - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu_num); - uci->pf = uci->rev = 0; - uci->mc = NULL; - uci->valid = 1; - - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { - printk(KERN_ERR "microcode: CPU%d not a capable Intel " - "processor\n", cpu_num); - uci->valid = 0; - return; - } - - uci->sig = cpuid_eax(0x00000001); - - if ((c->x86_model >= 5) || (c->x86 > 6)) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); - } - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); -} - -static inline int microcode_update_match(int cpu_num, - struct microcode_header *mc_header, int sig, int pf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - if (!sigmatch(sig, uci->sig, pf, uci->pf) - || mc_header->rev <= uci->rev) - return 0; - return 1; -} - -static int microcode_sanity_check(void *mc) -{ - struct microcode_header *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - struct extended_signature *ext_sig; - unsigned long total_size, data_size, ext_table_size; - int sum, orig_sum, ext_sigcount = 0, i; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - if (data_size + MC_HEADER_SIZE > total_size) { - printk(KERN_ERR "microcode: error! " - "Bad data size in microcode data file\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - printk(KERN_ERR "microcode: error! " - "Unknown microcode update format\n"); - return -EINVAL; - } - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - printk(KERN_ERR "microcode: error! " - "Small exttable size in microcode data file\n"); - return -EINVAL; - } - ext_header = mc + MC_HEADER_SIZE + data_size; - if (ext_table_size != exttable_size(ext_header)) { - printk(KERN_ERR "microcode: error! " - "Bad exttable size in microcode data file\n"); - return -EFAULT; - } - ext_sigcount = ext_header->count; - } - - /* check extended table checksum */ - if (ext_table_size) { - int ext_table_sum = 0; - int *ext_tablep = (int *)ext_header; - - i = ext_table_size / DWSIZE; - while (i--) - ext_table_sum += ext_tablep[i]; - if (ext_table_sum) { - printk(KERN_WARNING "microcode: aborting, " - "bad extended signature table checksum\n"); - return -EINVAL; - } - } - - /* calculate the checksum */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / DWSIZE; - while (i--) - orig_sum += ((int *)mc)[i]; - if (orig_sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - if (!ext_table_size) - return 0; - /* check extended signature checksum */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - sum = orig_sum - - (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - printk(KERN_ERR "microcode: aborting, bad checksum\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * return 0 - no update found - * return 1 - found update - * return < 0 - error - */ -static int get_maching_microcode(void *mc, int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_header *mc_header = mc; - struct extended_sigtable *ext_header; - unsigned long total_size = get_totalsize(mc_header); - int ext_sigcount, i; - struct extended_signature *ext_sig; - void *new_mc; - - if (microcode_update_match(cpu, mc_header, - mc_header->sig, mc_header->pf)) - goto find; - - if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) - return 0; - - ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; - ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; - for (i = 0; i < ext_sigcount; i++) { - if (microcode_update_match(cpu, mc_header, - ext_sig->sig, ext_sig->pf)) - goto find; - ext_sig++; - } - return 0; -find: - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); - new_mc = vmalloc(total_size); - if (!new_mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - - /* free previous update file */ - vfree(uci->mc); - - memcpy(new_mc, mc, total_size); - uci->mc = new_mc; - return 1; -} - -static void apply_microcode(int cpu) -{ - unsigned long flags; - unsigned int val[2]; - int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - - /* We should bind the task to the CPU */ - BUG_ON(cpu_num != cpu); - - if (uci->mc == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - - /* write microcode via MSR 0x79 */ - wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - - spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); - return; - } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); - uci->rev = val[1]; -} - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode(void **mc, long offset) -{ - struct microcode_header mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(total_size); - if (!*mc) - return -ENOMEM; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} - -static int do_microcode_update (void) -{ - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; - cpumask_t old; - cpumask_of_cpu_ptr_declare(newmask); - - old = current->cpus_allowed; - - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - cpumask_of_cpu_ptr_next(newmask, cpu); - set_cpus_allowed_ptr(current, newmask); - error = get_maching_microcode(new_mc, cpu); - if (error < 0) - goto out; - if (error == 1) - apply_microcode(cpu); - } - vfree(new_mc); - } -out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) -#endif - -static long get_next_ucode_from_buffer(void **mc, const u8 *buf, - unsigned long size, long offset) -{ - struct microcode_header *mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= size) - return 0; - mc_header = (struct microcode_header *)(buf + offset); - total_size = get_totalsize(mc_header); - - if (offset + total_size > size) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; - } - - *mc = vmalloc(total_size); - if (!*mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - memcpy(*mc, buf + offset, total_size); - return offset + total_size; -} - -/* fake device for request_firmware */ -static struct platform_device *microcode_pdev; - -static int cpu_request_microcode(int cpu) -{ - char name[30]; - struct cpuinfo_x86 *c = &cpu_data(cpu); - const struct firmware *firmware; - const u8 *buf; - unsigned long size; - long offset = 0; - int error; - void *mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - sprintf(name,"intel-ucode/%02x-%02x-%02x", - c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, µcode_pdev->dev); - if (error) { - pr_debug("microcode: data file %s load failed\n", name); - return error; - } - buf = firmware->data; - size = firmware->size; - while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) - > 0) { - error = microcode_sanity_check(mc); - if (error) - break; - error = get_maching_microcode(mc, cpu); - if (error < 0) - break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode(cpu); - error = 0; - } - vfree(mc); - } - if (offset > 0) - vfree(mc); - if (offset < 0) - error = offset; - release_firmware(firmware); - - return error; -} - -static int apply_microcode_check_cpu(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - unsigned int val[2]; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, newmask); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) - err = -EINVAL; - - if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - if (uci->pf != (1 << ((val[1] >> 18) & 7))) - err = -EINVAL; - } - - if (!err) { - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (uci->rev != val[1]) - err = -EINVAL; - } - - if (!err) - apply_microcode(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " sig=0x%x, pf=0x%x, rev=0x%x\n", - cpu, uci->sig, uci->pf, uci->rev); - - set_cpus_allowed_ptr(current, &old); - return err; -} - -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, newmask); - mutex_lock(µcode_mutex); - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - mutex_lock(µcode_mutex); - uci->valid = 0; - vfree(uci->mc); - uci->mc = NULL; - mutex_unlock(µcode_mutex); -} - -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - - old = current->cpus_allowed; - - get_online_cpus(); - set_cpus_allowed_ptr(current, newmask); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - put_online_cpus(); - set_cpus_allowed_ptr(current, &old); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->rev); -} - -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu, resume); - - return 0; -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); - /* only CPU 0 will apply ucode here */ - apply_microcode(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; - case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } - case CPU_DOWN_FAILED_FROZEN: - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c new file mode 100644 index 000000000000..4e7b2f65fed6 --- /dev/null +++ b/arch/x86/kernel/microcode_intel.c @@ -0,0 +1,855 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to speculative + * nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +//#define DEBUG /* pr_debug */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "1.14a" + +#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ +#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ +#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ +#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ +#define DWSIZE (sizeof (u32)) +#define get_totalsize(mc) \ + (((struct microcode *)mc)->hdr.totalsize ? \ + ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) +#define get_datasize(mc) \ + (((struct microcode *)mc)->hdr.datasize ? \ + ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + +#define sigmatch(s1, s2, p1, p2) \ + (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) + +#define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) + +/* serialize access to the physical write to MSR 0x79 */ +static DEFINE_SPINLOCK(microcode_update_lock); + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +static void collect_cpu_info(int cpu_num) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu_num); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + unsigned int val[2]; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu_num); + uci->pf = uci->rev = 0; + uci->mc = NULL; + uci->valid = 1; + + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64)) { + printk(KERN_ERR "microcode: CPU%d not a capable Intel " + "processor\n", cpu_num); + uci->valid = 0; + return; + } + + uci->sig = cpuid_eax(0x00000001); + + if ((c->x86_model >= 5) || (c->x86 > 6)) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + uci->pf = 1 << ((val[1] >> 18) & 7); + } + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); + pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", + uci->sig, uci->pf, uci->rev); +} + +static inline int microcode_update_match(int cpu_num, + struct microcode_header *mc_header, int sig, int pf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + if (!sigmatch(sig, uci->sig, pf, uci->pf) + || mc_header->rev <= uci->rev) + return 0; + return 1; +} + +static int microcode_sanity_check(void *mc) +{ + struct microcode_header *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + struct extended_signature *ext_sig; + unsigned long total_size, data_size, ext_table_size; + int sum, orig_sum, ext_sigcount = 0, i; + + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); + if (data_size + MC_HEADER_SIZE > total_size) { + printk(KERN_ERR "microcode: error! " + "Bad data size in microcode data file\n"); + return -EINVAL; + } + + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + printk(KERN_ERR "microcode: error! " + "Unknown microcode update format\n"); + return -EINVAL; + } + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + printk(KERN_ERR "microcode: error! " + "Small exttable size in microcode data file\n"); + return -EINVAL; + } + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + printk(KERN_ERR "microcode: error! " + "Bad exttable size in microcode data file\n"); + return -EFAULT; + } + ext_sigcount = ext_header->count; + } + + /* check extended table checksum */ + if (ext_table_size) { + int ext_table_sum = 0; + int *ext_tablep = (int *)ext_header; + + i = ext_table_size / DWSIZE; + while (i--) + ext_table_sum += ext_tablep[i]; + if (ext_table_sum) { + printk(KERN_WARNING "microcode: aborting, " + "bad extended signature table checksum\n"); + return -EINVAL; + } + } + + /* calculate the checksum */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / DWSIZE; + while (i--) + orig_sum += ((int *)mc)[i]; + if (orig_sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + if (!ext_table_size) + return 0; + /* check extended signature checksum */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + sum = orig_sum + - (mc_header->sig + mc_header->pf + mc_header->cksum) + + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + printk(KERN_ERR "microcode: aborting, bad checksum\n"); + return -EINVAL; + } + } + return 0; +} + +/* + * return 0 - no update found + * return 1 - found update + * return < 0 - error + */ +static int get_maching_microcode(void *mc, int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header *mc_header = mc; + struct extended_sigtable *ext_header; + unsigned long total_size = get_totalsize(mc_header); + int ext_sigcount, i; + struct extended_signature *ext_sig; + void *new_mc; + + if (microcode_update_match(cpu, mc_header, + mc_header->sig, mc_header->pf)) + goto find; + + if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) + return 0; + + ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; + ext_sigcount = ext_header->count; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + for (i = 0; i < ext_sigcount; i++) { + if (microcode_update_match(cpu, mc_header, + ext_sig->sig, ext_sig->pf)) + goto find; + ext_sig++; + } + return 0; +find: + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); + new_mc = vmalloc(total_size); + if (!new_mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } + + /* free previous update file */ + vfree(uci->mc); + + memcpy(new_mc, mc, total_size); + uci->mc = new_mc; + return 1; +} + +static void apply_microcode(int cpu) +{ + unsigned long flags; + unsigned int val[2]; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (uci->mc == NULL) + return; + + /* serialize access to the physical write to MSR 0x79 */ + spin_lock_irqsave(µcode_update_lock, flags); + + /* write microcode via MSR 0x79 */ + wrmsr(MSR_IA32_UCODE_WRITE, + (unsigned long) uci->mc->bits, + (unsigned long) uci->mc->bits >> 16 >> 16); + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + + spin_unlock_irqrestore(µcode_update_lock, flags); + if (val[1] != uci->mc->hdr.rev) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); + return; + } + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x, date = %08x \n", + cpu_num, uci->rev, val[1], uci->mc->hdr.date); + uci->rev = val[1]; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static void __user *user_buffer; /* user area microcode data buffer */ +static unsigned int user_buffer_size; /* it's size */ + +static long get_next_ucode(void **mc, long offset) +{ + struct microcode_header mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= user_buffer_size) + return 0; + if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + return -EFAULT; + } + total_size = get_totalsize(&mc_header); + if (offset + total_size > user_buffer_size) { + printk(KERN_ERR "microcode: error! Bad total size in microcode " + "data file\n"); + return -EINVAL; + } + *mc = vmalloc(total_size); + if (!*mc) + return -ENOMEM; + if (copy_from_user(*mc, user_buffer + offset, total_size)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + vfree(*mc); + return -EFAULT; + } + return offset + total_size; +} + +static int do_microcode_update (void) +{ + long cursor = 0; + int error = 0; + void *new_mc = NULL; + int cpu; + cpumask_t old; + cpumask_of_cpu_ptr_declare(newmask); + + old = current->cpus_allowed; + + while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_sanity_check(new_mc); + if (error) + goto out; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + cpumask_of_cpu_ptr_next(newmask, cpu); + set_cpus_allowed_ptr(current, newmask); + error = get_maching_microcode(new_mc, cpu); + if (error < 0) + goto out; + if (error == 1) + apply_microcode(cpu); + } + vfree(new_mc); + } +out: + if (cursor > 0) + vfree(new_mc); + if (cursor < 0) + error = cursor; + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + user_buffer = (void __user *) buf; + user_buffer_size = (int) len; + + ret = do_microcode_update(); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init (void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +static long get_next_ucode_from_buffer(void **mc, const u8 *buf, + unsigned long size, long offset) +{ + struct microcode_header *mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= size) + return 0; + mc_header = (struct microcode_header *)(buf + offset); + total_size = get_totalsize(mc_header); + + if (offset + total_size > size) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return -EINVAL; + } + + *mc = vmalloc(total_size); + if (!*mc) { + printk(KERN_ERR "microcode: error! Can not allocate memory\n"); + return -ENOMEM; + } + memcpy(*mc, buf + offset, total_size); + return offset + total_size; +} + +/* fake device for request_firmware */ +static struct platform_device *microcode_pdev; + +static int cpu_request_microcode(int cpu) +{ + char name[30]; + struct cpuinfo_x86 *c = &cpu_data(cpu); + const struct firmware *firmware; + const u8 *buf; + unsigned long size; + long offset = 0; + int error; + void *mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + sprintf(name,"intel-ucode/%02x-%02x-%02x", + c->x86, c->x86_model, c->x86_mask); + error = request_firmware(&firmware, name, µcode_pdev->dev); + if (error) { + pr_debug("microcode: data file %s load failed\n", name); + return error; + } + buf = firmware->data; + size = firmware->size; + while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) + > 0) { + error = microcode_sanity_check(mc); + if (error) + break; + error = get_maching_microcode(mc, cpu); + if (error < 0) + break; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + if (error == 1) { + apply_microcode(cpu); + error = 0; + } + vfree(mc); + } + if (offset > 0) + vfree(mc); + if (offset < 0) + error = offset; + release_firmware(firmware); + + return error; +} + +static int apply_microcode_check_cpu(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + unsigned int val[2]; + int err = 0; + + /* Check if the microcode is available */ + if (!uci->mc) + return 0; + + old = current->cpus_allowed; + set_cpus_allowed_ptr(current, newmask); + + /* Check if the microcode we have in memory matches the CPU */ + if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || + cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) + err = -EINVAL; + + if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { + /* get processor flags from MSR 0x17 */ + rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); + if (uci->pf != (1 << ((val[1] >> 18) & 7))) + err = -EINVAL; + } + + if (!err) { + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* see notes above for revision 1.07. Apparent chip bug */ + sync_core(); + /* get the current revision from MSR 0x8B */ + rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); + if (uci->rev != val[1]) + err = -EINVAL; + } + + if (!err) + apply_microcode(cpu); + else + printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" + " sig=0x%x, pf=0x%x, rev=0x%x\n", + cpu, uci->sig, uci->pf, uci->rev); + + set_cpus_allowed_ptr(current, &old); + return err; +} + +static void microcode_init_cpu(int cpu, int resume) +{ + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, newmask); + mutex_lock(µcode_mutex); + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING && !resume) + cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); +} + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + uci->valid = 0; + vfree(uci->mc); + uci->mc = NULL; + mutex_unlock(µcode_mutex); +} + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + + old = current->cpus_allowed; + + get_online_cpus(); + set_cpus_allowed_ptr(current, newmask); + + mutex_lock(µcode_mutex); + if (uci->valid) + err = cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + put_online_cpus(); + set_cpus_allowed_ptr(current, &old); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu, resume); + + return 0; +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + return __mc_sysdev_add(sys_dev, 0); +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("microcode: CPU%d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ + apply_microcode(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + mc_sysdev_add(sys_dev); + break; + case CPU_ONLINE_FROZEN: + /* System-wide resume is in progress, try to apply microcode */ + if (apply_microcode_check_cpu(cpu)) { + /* The application of microcode failed */ + microcode_fini_cpu(cpu); + __mc_sysdev_add(sys_dev, 1); + break; + } + case CPU_DOWN_FAILED_FROZEN: + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + mc_sysdev_remove(sys_dev); + break; + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init (void) +{ + int error; + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + return 0; +} + +static void __exit microcode_exit (void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); +} + +module_init(microcode_init) +module_exit(microcode_exit) -- cgit v1.2.2 From 3e135d887c973b525d43fbb67dfc5972694882f6 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:17 +0200 Subject: x86: code split to two parts Split off existing code into two seperate files. One file holds general code, the other file vendor specific parts. No functional changes, only refactoring. Temporarily Introduced a new module name 'ucode' for result, due to already taken name 'microcode'. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 3 +- arch/x86/kernel/microcode.c | 463 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/microcode_intel.c | 387 ++----------------------------- 3 files changed, 488 insertions(+), 365 deletions(-) create mode 100644 arch/x86/kernel/microcode.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a9be2a0dde80..abb32aed7f4b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,7 +51,8 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode_intel.o +obj-$(CONFIG_MICROCODE) += ucode.o +ucode-objs := microcode.o microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c new file mode 100644 index 000000000000..c1047d7f7ede --- /dev/null +++ b/arch/x86/kernel/microcode.c @@ -0,0 +1,463 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to speculative + * nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ + +//#define DEBUG /* pr_debug */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "1.14a" + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +extern long get_next_ucode(void **mc, long offset); +extern int microcode_sanity_check(void *mc); +extern int get_matching_microcode(void *mc, int cpu); +extern void collect_cpu_info(int cpu_num); +extern int cpu_request_microcode(int cpu); +extern void microcode_fini_cpu(int cpu); +extern void apply_microcode(int cpu); +extern int apply_microcode_check_cpu(int cpu); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +void __user *user_buffer; /* user area microcode data buffer */ +unsigned int user_buffer_size; /* it's size */ + +static int do_microcode_update (void) +{ + long cursor = 0; + int error = 0; + void *new_mc = NULL; + int cpu; + cpumask_t old; + cpumask_of_cpu_ptr_declare(newmask); + + old = current->cpus_allowed; + + while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_sanity_check(new_mc); + if (error) + goto out; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + cpumask_of_cpu_ptr_next(newmask, cpu); + set_cpus_allowed_ptr(current, newmask); + error = get_maching_microcode(new_mc, cpu); + if (error < 0) + goto out; + if (error == 1) + apply_microcode(cpu); + } + vfree(new_mc); + } +out: + if (cursor > 0) + vfree(new_mc); + if (cursor < 0) + error = cursor; + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open (struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + user_buffer = (void __user *) buf; + user_buffer_size = (int) len; + + ret = do_microcode_update(); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init (void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit (void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while(0) +#endif + +/* fake device for request_firmware */ +struct platform_device *microcode_pdev; + +static void microcode_init_cpu(int cpu, int resume) +{ + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, newmask); + mutex_lock(µcode_mutex); + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING && !resume) + cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); +} + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + + old = current->cpus_allowed; + + get_online_cpus(); + set_cpus_allowed_ptr(current, newmask); + + mutex_lock(µcode_mutex); + if (uci->valid) + err = cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + put_online_cpus(); + set_cpus_allowed_ptr(current, &old); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu, resume); + + return 0; +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + return __mc_sysdev_add(sys_dev, 0); +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + pr_debug("microcode: CPU%d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ + apply_microcode(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + case CPU_ONLINE: + case CPU_DOWN_FAILED: + mc_sysdev_add(sys_dev); + break; + case CPU_ONLINE_FROZEN: + /* System-wide resume is in progress, try to apply microcode */ + if (apply_microcode_check_cpu(cpu)) { + /* The application of microcode failed */ + microcode_fini_cpu(cpu); + __mc_sysdev_add(sys_dev, 1); + break; + } + case CPU_DOWN_FAILED_FROZEN: + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + mc_sysdev_remove(sys_dev); + break; + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init (void) +{ + int error; + + printk(KERN_INFO + "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + return 0; +} + +static void __exit microcode_exit (void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); +} + +module_init(microcode_init) +module_exit(microcode_exit) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 4e7b2f65fed6..eded0a154ea8 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -95,18 +95,16 @@ #include #include -MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); +MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "1.14a" - #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof (struct microcode_header)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof (struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof (struct extended_signature)) /* 12 bytes */ -#define DWSIZE (sizeof (u32)) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ +#define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode *)mc)->hdr.totalsize ? \ ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) @@ -123,11 +121,11 @@ MODULE_LICENSE("GPL"); static DEFINE_SPINLOCK(microcode_update_lock); /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); +extern struct mutex microcode_mutex; -static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -static void collect_cpu_info(int cpu_num) +void collect_cpu_info(int cpu_num) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -140,7 +138,7 @@ static void collect_cpu_info(int cpu_num) uci->valid = 1; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64)) { + cpu_has(c, X86_FEATURE_IA64)) { printk(KERN_ERR "microcode: CPU%d not a capable Intel " "processor\n", cpu_num); uci->valid = 0; @@ -175,7 +173,7 @@ static inline int microcode_update_match(int cpu_num, return 1; } -static int microcode_sanity_check(void *mc) +int microcode_sanity_check(void *mc) { struct microcode_header *mc_header = mc; struct extended_sigtable *ext_header = NULL; @@ -259,7 +257,7 @@ static int microcode_sanity_check(void *mc) * return 1 - found update * return < 0 - error */ -static int get_maching_microcode(void *mc, int cpu) +int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header *mc_header = mc; @@ -288,7 +286,7 @@ static int get_maching_microcode(void *mc, int cpu) return 0; find: pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev,uci->rev); + " version 0x%x (current=0x%x)\n", cpu, mc_header->rev, uci->rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -303,7 +301,7 @@ find: return 1; } -static void apply_microcode(int cpu) +void apply_microcode(int cpu) { unsigned long flags; unsigned int val[2]; @@ -344,10 +342,10 @@ static void apply_microcode(int cpu) } #ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ -static unsigned int user_buffer_size; /* it's size */ +extern void __user *user_buffer; /* user area microcode data buffer */ +extern unsigned int user_buffer_size; /* it's size */ -static long get_next_ucode(void **mc, long offset) +long get_next_ucode(void **mc, long offset) { struct microcode_header mc_header; unsigned long total_size; @@ -375,117 +373,6 @@ static long get_next_ucode(void **mc, long offset) } return offset + total_size; } - -static int do_microcode_update (void) -{ - long cursor = 0; - int error = 0; - void *new_mc = NULL; - int cpu; - cpumask_t old; - cpumask_of_cpu_ptr_declare(newmask); - - old = current->cpus_allowed; - - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); - if (error) - goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - cpumask_of_cpu_ptr_next(newmask, cpu); - set_cpus_allowed_ptr(current, newmask); - error = get_maching_microcode(new_mc, cpu); - if (error < 0) - goto out; - if (error == 1) - apply_microcode(cpu); - } - vfree(new_mc); - } -out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open (struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init (void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit (void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) #endif static long get_next_ucode_from_buffer(void **mc, const u8 *buf, @@ -515,9 +402,9 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf, } /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +extern struct platform_device *microcode_pdev; -static int cpu_request_microcode(int cpu) +int cpu_request_microcode(int cpu) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -530,7 +417,7 @@ static int cpu_request_microcode(int cpu) /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); - sprintf(name,"intel-ucode/%02x-%02x-%02x", + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); error = request_firmware(&firmware, name, µcode_pdev->dev); if (error) { @@ -544,7 +431,7 @@ static int cpu_request_microcode(int cpu) error = microcode_sanity_check(mc); if (error) break; - error = get_maching_microcode(mc, cpu); + error = get_matching_microcode(mc, cpu); if (error < 0) break; /* @@ -566,7 +453,7 @@ static int cpu_request_microcode(int cpu) return error; } -static int apply_microcode_check_cpu(int cpu) +int apply_microcode_check_cpu(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -615,241 +502,13 @@ static int apply_microcode_check_cpu(int cpu) return err; } -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, newmask); - mutex_lock(µcode_mutex); - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - -static void microcode_fini_cpu(int cpu) +void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; mutex_lock(µcode_mutex); uci->valid = 0; - vfree(uci->mc); + kfree(uci->mc); uci->mc = NULL; mutex_unlock(µcode_mutex); } - -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old; - cpumask_of_cpu_ptr(newmask, cpu); - - old = current->cpus_allowed; - - get_online_cpus(); - set_cpus_allowed_ptr(current, newmask); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - put_online_cpus(); - set_cpus_allowed_ptr(current, &old); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->rev); -} - -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu, resume); - - return 0; -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); - /* only CPU 0 will apply ucode here */ - apply_microcode(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; - case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } - case CPU_DOWN_FAILED_FROZEN: - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -static int __init microcode_init (void) -{ - int error; - - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - return 0; -} - -static void __exit microcode_exit (void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); -} - -module_init(microcode_init) -module_exit(microcode_exit) -- cgit v1.2.2 From d4ee36686853d5714437c4409f17ad42bfaf4211 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:18 +0200 Subject: x86: structure declaration renaming Renamed common structures to vendor specific naming scheme so other vendors will be able to use the same naming convention. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 46 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index eded0a154ea8..ca9861bf067e 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -100,17 +100,19 @@ MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); #define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header)) /* 48 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) /* 48 bytes */ #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ #define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ #define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ - (((struct microcode *)mc)->hdr.totalsize ? \ - ((struct microcode *)mc)->hdr.totalsize : DEFAULT_UCODE_TOTALSIZE) + (((struct microcode_intel *)mc)->hdr.totalsize ? \ + ((struct microcode_intel *)mc)->hdr.totalsize : \ + DEFAULT_UCODE_TOTALSIZE) + #define get_datasize(mc) \ - (((struct microcode *)mc)->hdr.datasize ? \ - ((struct microcode *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) + (((struct microcode_intel *)mc)->hdr.datasize ? \ + ((struct microcode_intel *)mc)->hdr.datasize : DEFAULT_UCODE_DATASIZE) #define sigmatch(s1, s2, p1, p2) \ (((s1) == (s2)) && (((p1) & (p2)) || (((p1) == 0) && ((p2) == 0)))) @@ -134,7 +136,7 @@ void collect_cpu_info(int cpu_num) /* We should bind the task to the CPU */ BUG_ON(raw_smp_processor_id() != cpu_num); uci->pf = uci->rev = 0; - uci->mc = NULL; + uci->mc.mc_intel = NULL; uci->valid = 1; if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || @@ -163,7 +165,7 @@ void collect_cpu_info(int cpu_num) } static inline int microcode_update_match(int cpu_num, - struct microcode_header *mc_header, int sig, int pf) + struct microcode_header_intel *mc_header, int sig, int pf) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -175,7 +177,7 @@ static inline int microcode_update_match(int cpu_num, int microcode_sanity_check(void *mc) { - struct microcode_header *mc_header = mc; + struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; struct extended_signature *ext_sig; unsigned long total_size, data_size, ext_table_size; @@ -260,7 +262,7 @@ int microcode_sanity_check(void *mc) int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct microcode_header *mc_header = mc; + struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; @@ -294,10 +296,10 @@ find: } /* free previous update file */ - vfree(uci->mc); + vfree(uci->mc.mc_intel); memcpy(new_mc, mc, total_size); - uci->mc = new_mc; + uci->mc.mc_intel = new_mc; return 1; } @@ -311,7 +313,7 @@ void apply_microcode(int cpu) /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc == NULL) + if (uci->mc.mc_intel == NULL) return; /* serialize access to the physical write to MSR 0x79 */ @@ -319,8 +321,8 @@ void apply_microcode(int cpu) /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc->bits, - (unsigned long) uci->mc->bits >> 16 >> 16); + (unsigned long) uci->mc.mc_intel->bits, + (unsigned long) uci->mc.mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ @@ -330,14 +332,14 @@ void apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc->hdr.rev) { + if (val[1] != uci->mc.mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc->hdr.date); + cpu_num, uci->rev, val[1], uci->mc.mc_intel->hdr.date); uci->rev = val[1]; } @@ -347,7 +349,7 @@ extern unsigned int user_buffer_size; /* it's size */ long get_next_ucode(void **mc, long offset) { - struct microcode_header mc_header; + struct microcode_header_intel mc_header; unsigned long total_size; /* No more data */ @@ -378,13 +380,13 @@ long get_next_ucode(void **mc, long offset) static long get_next_ucode_from_buffer(void **mc, const u8 *buf, unsigned long size, long offset) { - struct microcode_header *mc_header; + struct microcode_header_intel *mc_header; unsigned long total_size; /* No more data */ if (offset >= size) return 0; - mc_header = (struct microcode_header *)(buf + offset); + mc_header = (struct microcode_header_intel *)(buf + offset); total_size = get_totalsize(mc_header); if (offset + total_size > size) { @@ -463,7 +465,7 @@ int apply_microcode_check_cpu(int cpu) int err = 0; /* Check if the microcode is available */ - if (!uci->mc) + if (!uci->mc.mc_intel) return 0; old = current->cpus_allowed; @@ -508,7 +510,7 @@ void microcode_fini_cpu(int cpu) mutex_lock(µcode_mutex); uci->valid = 0; - kfree(uci->mc); - uci->mc = NULL; + kfree(uci->mc.mc_intel); + uci->mc.mc_intel = NULL; mutex_unlock(µcode_mutex); } -- cgit v1.2.2 From 8d86f390d9bb5b39f0a315838d1616de6363e1b9 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:21 +0200 Subject: x86: major refactoring Refactored code by introducing a two-module solution. There is one general module in which vendor specific modules can hook into. However, that is exclusive, there is only one vendor specific module allowed at a time. A CPU vendor check makes sure only the correct module for the underlying system gets called. Functinally in terms of patch loading itself there are no changes. This refactoring provides a basis for future implementations of other vendors' patch loaders. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 25 +++++++++--- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/microcode.c | 80 +++++++++++++++++++++++---------------- arch/x86/kernel/microcode_intel.c | 50 +++++++++++++++++++----- 4 files changed, 109 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b6fa2877b173..6b0b885cf47a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -782,7 +782,7 @@ config X86_REBOOTFIXUPS Say N otherwise. config MICROCODE - tristate "/dev/cpu/microcode - Intel IA32 CPU microcode support" + tristate "/dev/cpu/microcode - microcode support" select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on @@ -791,14 +791,29 @@ config MICROCODE actual microcode binary data itself which is not shipped with the Linux kernel. - For latest news and information on obtaining all the required - ingredients for this driver, check: - . + This option selects the general module only, you need to select + at least one vendor specific module as well. To compile this driver as a module, choose M here: the module will be called microcode. -config MICROCODE_OLD_INTERFACE +config MICROCODE_INTEL + tristate "Intel microcode patch loading support" + depends on MICROCODE + default MICROCODE + select FW_LOADER + --help--- + This options enables microcode patch loading support for Intel + processors. + + For latest news and information on obtaining all the required + Intel ingredients for this driver, check: + . + + This driver is only available as a module: the module + will be called microcode_intel. + + config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index abb32aed7f4b..f2f9f6da8c18 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,8 +51,8 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += ucode.o -ucode-objs := microcode.o microcode_intel.o +obj-$(CONFIG_MICROCODE) += microcode.o +obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index c1047d7f7ede..1e42e79ca694 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -99,25 +99,22 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define MICROCODE_VERSION "1.14a" +#define MICROCODE_VERSION "2.00" -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -DEFINE_MUTEX(microcode_mutex); +struct microcode_ops *microcode_ops; -struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); +EXPORT_SYMBOL_GPL(microcode_mutex); -extern long get_next_ucode(void **mc, long offset); -extern int microcode_sanity_check(void *mc); -extern int get_matching_microcode(void *mc, int cpu); -extern void collect_cpu_info(int cpu_num); -extern int cpu_request_microcode(int cpu); -extern void microcode_fini_cpu(int cpu); -extern void apply_microcode(int cpu); -extern int apply_microcode_check_cpu(int cpu); +static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -void __user *user_buffer; /* user area microcode data buffer */ -unsigned int user_buffer_size; /* it's size */ +static void __user *user_buffer; /* user area microcode data buffer */ +EXPORT_SYMBOL_GPL(user_buffer); +static unsigned int user_buffer_size; /* it's size */ +EXPORT_SYMBOL_GPL(user_buffer_size); static int do_microcode_update (void) { @@ -130,8 +127,8 @@ static int do_microcode_update (void) old = current->cpus_allowed; - while ((cursor = get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_sanity_check(new_mc); + while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { + error = microcode_ops->microcode_sanity_check(new_mc); if (error) goto out; /* @@ -145,11 +142,12 @@ static int do_microcode_update (void) continue; cpumask_of_cpu_ptr_next(newmask, cpu); set_cpus_allowed_ptr(current, newmask); - error = get_maching_microcode(new_mc, cpu); + error = microcode_ops->get_matching_microcode(new_mc, + cpu); if (error < 0) goto out; if (error == 1) - apply_microcode(cpu); + microcode_ops->apply_microcode(cpu); } vfree(new_mc); } @@ -232,7 +230,8 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -struct platform_device *microcode_pdev; +static struct platform_device *microcode_pdev; +EXPORT_SYMBOL_GPL(microcode_pdev); static void microcode_init_cpu(int cpu, int resume) { @@ -244,9 +243,9 @@ static void microcode_init_cpu(int cpu, int resume) set_cpus_allowed_ptr(current, newmask); mutex_lock(µcode_mutex); - collect_cpu_info(cpu); + microcode_ops->collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - cpu_request_microcode(cpu); + microcode_ops->cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); set_cpus_allowed_ptr(current, &old); } @@ -274,7 +273,7 @@ static ssize_t reload_store(struct sys_device *dev, mutex_lock(µcode_mutex); if (uci->valid) - err = cpu_request_microcode(cpu); + err = microcode_ops->cpu_request_microcode(cpu); mutex_unlock(µcode_mutex); put_online_cpus(); set_cpus_allowed_ptr(current, &old); @@ -349,7 +348,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; } @@ -362,7 +361,7 @@ static int mc_sysdev_resume(struct sys_device *dev) return 0; pr_debug("microcode: CPU%d resumed\n", cpu); /* only CPU 0 will apply ucode here */ - apply_microcode(0); + microcode_ops->apply_microcode(0); return 0; } @@ -382,7 +381,7 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_UP_CANCELED_FROZEN: /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); break; case CPU_ONLINE: case CPU_DOWN_FAILED: @@ -390,9 +389,9 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) break; case CPU_ONLINE_FROZEN: /* System-wide resume is in progress, try to apply microcode */ - if (apply_microcode_check_cpu(cpu)) { + if (microcode_ops->apply_microcode_check_cpu(cpu)) { /* The application of microcode failed */ - microcode_fini_cpu(cpu); + microcode_ops->microcode_fini_cpu(cpu); __mc_sysdev_add(sys_dev, 1); break; } @@ -416,12 +415,17 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -static int __init microcode_init (void) +static int microcode_init(void *opaque, struct module *module) { + struct microcode_ops *ops = (struct microcode_ops *)opaque; int error; - printk(KERN_INFO - "IA-32 Microcode Update Driver: v" MICROCODE_VERSION " \n"); + if (microcode_ops) { + printk(KERN_ERR "microcode: already loaded the other module\n"); + return -EEXIST; + } + + microcode_ops = ops; error = microcode_dev_init(); if (error) @@ -443,8 +447,15 @@ static int __init microcode_init (void) } register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " " + " \n"); + return 0; } +EXPORT_SYMBOL_GPL(microcode_init); static void __exit microcode_exit (void) { @@ -457,7 +468,10 @@ static void __exit microcode_exit (void) put_online_cpus(); platform_device_unregister(microcode_pdev); -} -module_init(microcode_init) -module_exit(microcode_exit) + microcode_ops = NULL; + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} +EXPORT_SYMBOL_GPL(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index ca9861bf067e..831db5363dba 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -127,7 +127,7 @@ extern struct mutex microcode_mutex; extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -void collect_cpu_info(int cpu_num) +static void collect_cpu_info(int cpu_num) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -175,7 +175,7 @@ static inline int microcode_update_match(int cpu_num, return 1; } -int microcode_sanity_check(void *mc) +static int microcode_sanity_check(void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header = NULL; @@ -259,7 +259,7 @@ int microcode_sanity_check(void *mc) * return 1 - found update * return < 0 - error */ -int get_matching_microcode(void *mc, int cpu) +static int get_matching_microcode(void *mc, int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_intel *mc_header = mc; @@ -288,7 +288,8 @@ int get_matching_microcode(void *mc, int cpu) return 0; find: pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", cpu, mc_header->rev, uci->rev); + " version 0x%x (current=0x%x)\n", + cpu, mc_header->rev, uci->rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -303,7 +304,7 @@ find: return 1; } -void apply_microcode(int cpu) +static void apply_microcode(int cpu) { unsigned long flags; unsigned int val[2]; @@ -347,7 +348,7 @@ void apply_microcode(int cpu) extern void __user *user_buffer; /* user area microcode data buffer */ extern unsigned int user_buffer_size; /* it's size */ -long get_next_ucode(void **mc, long offset) +static long get_next_ucode(void **mc, long offset) { struct microcode_header_intel mc_header; unsigned long total_size; @@ -406,7 +407,7 @@ static long get_next_ucode_from_buffer(void **mc, const u8 *buf, /* fake device for request_firmware */ extern struct platform_device *microcode_pdev; -int cpu_request_microcode(int cpu) +static int cpu_request_microcode(int cpu) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -455,7 +456,7 @@ int cpu_request_microcode(int cpu) return error; } -int apply_microcode_check_cpu(int cpu) +static int apply_microcode_check_cpu(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -504,13 +505,42 @@ int apply_microcode_check_cpu(int cpu) return err; } -void microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; mutex_lock(µcode_mutex); uci->valid = 0; - kfree(uci->mc.mc_intel); + vfree(uci->mc.mc_intel); uci->mc.mc_intel = NULL; mutex_unlock(µcode_mutex); } + +static struct microcode_ops microcode_intel_ops = { + .get_next_ucode = get_next_ucode, + .get_matching_microcode = get_matching_microcode, + .microcode_sanity_check = microcode_sanity_check, + .apply_microcode_check_cpu = apply_microcode_check_cpu, + .cpu_request_microcode = cpu_request_microcode, + .collect_cpu_info = collect_cpu_info, + .apply_microcode = apply_microcode, + .microcode_fini_cpu = microcode_fini_cpu, +}; + +static int __init microcode_intel_module_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + + if (c->x86_vendor == X86_VENDOR_INTEL) + return microcode_init(µcode_intel_ops, THIS_MODULE); + else + return -ENODEV; +} + +static void __exit microcode_intel_module_exit(void) +{ + microcode_exit(); +} + +module_init(microcode_intel_module_init) +module_exit(microcode_intel_module_exit) -- cgit v1.2.2 From 80cc9f1020f49c9e5b50898c102fd444de70a0a3 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Mon, 28 Jul 2008 18:44:22 +0200 Subject: x86: AMD microcode patch loading support This patch introduces microcode patch loading for AMD processors. It is based on previous corresponding work for Intel processors. It hooks into the general patch loading module. Main difference is that a container file format is used to hold all patch data for multiple processors as well as an equivalent CPU table, which comes seperately, as opposed to Intel's microcode patching solution. Kconfig and Makefile have been changed provice config and build option for new source file. Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 21 +- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/microcode_amd.c | 521 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 539 insertions(+), 4 deletions(-) create mode 100644 arch/x86/kernel/microcode_amd.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6b0b885cf47a..7bb461758d31 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -786,10 +786,12 @@ config MICROCODE select FW_LOADER ---help--- If you say Y here, you will be able to update the microcode on - Intel processors in the IA32 family, e.g. Pentium Pro, Pentium II, - Pentium III, Pentium 4, Xeon etc. You will obviously need the - actual microcode binary data itself which is not shipped with the - Linux kernel. + certain Intel and AMD processors. The Intel support is for the + IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, + Pentium 4, Xeon etc. The AMD support is for family 0x10 and + 0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. + You will obviously need the actual microcode binary data itself + which is not shipped with the Linux kernel. This option selects the general module only, you need to select at least one vendor specific module as well. @@ -813,6 +815,17 @@ config MICROCODE_INTEL This driver is only available as a module: the module will be called microcode_intel. +config MICROCODE_AMD + tristate "AMD microcode patch loading support" + depends on MICROCODE + select FW_LOADER + --help--- + If you select this option, microcode patch loading support for AMD + processors will be enabled. + + This driver is only available as a module: the module + will be called microcode_amd. + config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f2f9f6da8c18..be454f348c3b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -53,6 +53,7 @@ obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o obj-$(CONFIG_MICROCODE) += microcode.o obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +obj-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c new file mode 100644 index 000000000000..db199e3fdd1f --- /dev/null +++ b/arch/x86/kernel/microcode_amd.c @@ -0,0 +1,521 @@ +/* + * AMD CPU Microcode Update Driver for Linux + * Copyright (C) 2008 Advanced Micro Devices Inc. + * + * Author: Peter Oruba + * + * Based on work by: + * Tigran Aivazian + * + * This driver allows to upgrade microcode on AMD + * family 0x10 and 0x11 processors. + * + * Licensed unter the terms of the GNU General Public + * License version 2. See file COPYING for details. +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("AMD Microcode Update Driver"); +MODULE_AUTHOR("Peter Oruba "); +MODULE_LICENSE("GPLv2"); + +#define UCODE_MAGIC 0x00414d44 +#define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 +#define UCODE_UCODE_TYPE 0x00000001 + +#define UCODE_MAX_SIZE (2048) +#define DEFAULT_UCODE_DATASIZE (896) /* 896 bytes */ +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) /* 64 bytes */ +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 960 bytes */ +#define DWSIZE (sizeof(u32)) +/* For now we support a fixed ucode total size only */ +#define get_totalsize(mc) \ + ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + + MC_HEADER_SIZE) + +extern int microcode_init(void *opaque, struct module *module); +extern void microcode_exit(void); + +/* serialize access to the physical write */ +static DEFINE_SPINLOCK(microcode_update_lock); + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +extern struct mutex (microcode_mutex); + +struct equiv_cpu_entry *equiv_cpu_table; + +extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; + +static void collect_cpu_info_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + uci->rev = 0; + uci->pf = 0; + uci->mc.mc_amd = NULL; + uci->valid = 1; + + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { + printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", + cpu); + uci->valid = 0; + return; + } + + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (uci->rev) + : "i" (0x0000008B) : "ecx"); + + printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", + uci->rev); +} + +static int get_matching_microcode_amd(void *mc, int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_header_amd *mc_header = mc; + unsigned long total_size = get_totalsize(mc_header); + void *new_mc; + struct pci_dev *nb_pci_dev, *sb_pci_dev; + unsigned int current_cpu_id; + unsigned int equiv_cpu_id = 0x00; + unsigned int i = 0; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + /* This is a tricky part. We might be called from a write operation */ + /* to the device file instead of the usual process of firmware */ + /* loading. This routine needs to be able to distinguish both */ +/* cases. This is done by checking if there alread is a equivalent */ + /* CPU table installed. If not, we're written through */ + /* /dev/cpu/microcode. */ +/* Since we ignore all checks. The error case in which going through */ +/* firmware loading and that table is not loaded has already been */ + /* checked earlier. */ + if (equiv_cpu_table == NULL) { + printk(KERN_INFO "microcode: CPU%d microcode update with " + "version 0x%x (current=0x%x)\n", + cpu, mc_header->patch_id, uci->rev); + goto out; + } + + current_cpu_id = cpuid_eax(0x00000001); + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { + equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; + break; + } + i++; + } + + if (!equiv_cpu_id) { + printk(KERN_ERR "microcode: CPU%d cpu_id " + "not found in equivalent cpu table \n", cpu); + return 0; + } + + if ((mc_header->processor_rev_id[0]) != (equiv_cpu_id & 0xff)) { + printk(KERN_ERR + "microcode: CPU%d patch does not match " + "(patch is %x, cpu extended is %x) \n", + cpu, mc_header->processor_rev_id[0], + (equiv_cpu_id & 0xff)); + return 0; + } + + if ((mc_header->processor_rev_id[1]) != ((equiv_cpu_id >> 16) & 0xff)) { + printk(KERN_ERR "microcode: CPU%d patch does not match " + "(patch is %x, cpu base id is %x) \n", + cpu, mc_header->processor_rev_id[1], + ((equiv_cpu_id >> 16) & 0xff)); + + return 0; + } + + /* ucode may be northbridge specific */ + if (mc_header->nb_dev_id) { + nb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->nb_dev_id & 0xff), + NULL); + if ((!nb_pci_dev) || + (mc_header->nb_rev_id != nb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d NB mismatch \n", cpu); + pci_dev_put(nb_pci_dev); + return 0; + } + pci_dev_put(nb_pci_dev); + } + + /* ucode may be southbridge specific */ + if (mc_header->sb_dev_id) { + sb_pci_dev = pci_get_device(PCI_VENDOR_ID_AMD, + (mc_header->sb_dev_id & 0xff), + NULL); + if ((!sb_pci_dev) || + (mc_header->sb_rev_id != sb_pci_dev->revision)) { + printk(KERN_ERR "microcode: CPU%d SB mismatch \n", cpu); + pci_dev_put(sb_pci_dev); + return 0; + } + pci_dev_put(sb_pci_dev); + } + + if (mc_header->patch_id <= uci->rev) + return 0; + + printk(KERN_INFO "microcode: CPU%d found a matching microcode " + "update with version 0x%x (current=0x%x)\n", + cpu, mc_header->patch_id, uci->rev); + +out: + new_mc = vmalloc(UCODE_MAX_SIZE); + if (!new_mc) { + printk(KERN_ERR "microcode: error, can't allocate memory\n"); + return -ENOMEM; + } + memset(new_mc, 0, UCODE_MAX_SIZE); + + /* free previous update file */ + vfree(uci->mc.mc_amd); + + memcpy(new_mc, mc, total_size); + + uci->mc.mc_amd = new_mc; + return 1; +} + +static void apply_microcode_amd(int cpu) +{ + unsigned long flags; + unsigned int eax, edx; + unsigned int rev; + int cpu_num = raw_smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + + /* We should bind the task to the CPU */ + BUG_ON(cpu_num != cpu); + + if (uci->mc.mc_amd == NULL) + return; + + spin_lock_irqsave(µcode_update_lock, flags); + + edx = (unsigned int)(((unsigned long) + &(uci->mc.mc_amd->hdr.data_code)) >> 32); + eax = (unsigned int)(((unsigned long) + &(uci->mc.mc_amd->hdr.data_code)) & 0xffffffffL); + + asm volatile("movl %0, %%ecx; wrmsr" : + : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); + + /* get patch id after patching */ + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + spin_unlock_irqrestore(µcode_update_lock, flags); + + /* check current patch id and patch's id for match */ + if (rev != uci->mc.mc_amd->hdr.patch_id) { + printk(KERN_ERR "microcode: CPU%d update from revision " + "0x%x to 0x%x failed\n", cpu_num, + uci->mc.mc_amd->hdr.patch_id, rev); + return; + } + + printk(KERN_INFO "microcode: CPU%d updated from revision " + "0x%x to 0x%x \n", + cpu_num, uci->rev, uci->mc.mc_amd->hdr.patch_id); + + uci->rev = rev; +} + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +extern void __user *user_buffer; /* user area microcode data buffer */ +extern unsigned int user_buffer_size; /* it's size */ + +static long get_next_ucode_amd(void **mc, long offset) +{ + struct microcode_header_amd mc_header; + unsigned long total_size; + + /* No more data */ + if (offset >= user_buffer_size) + return 0; + if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + return -EFAULT; + } + total_size = get_totalsize(&mc_header); + if (offset + total_size > user_buffer_size) { + printk(KERN_ERR "microcode: error! Bad total size in microcode " + "data file\n"); + return -EINVAL; + } + *mc = vmalloc(UCODE_MAX_SIZE); + if (!*mc) + return -ENOMEM; + memset(*mc, 0, UCODE_MAX_SIZE); + + if (copy_from_user(*mc, user_buffer + offset, total_size)) { + printk(KERN_ERR "microcode: error! Can not read user data\n"); + vfree(*mc); + return -EFAULT; + } + return offset + total_size; +} +#else +#define get_next_ucode_amd() NULL +#endif + +static long get_next_ucode_from_buffer_amd(void **mc, void *buf, + unsigned long size, long offset) +{ + struct microcode_header_amd *mc_header; + unsigned long total_size; + unsigned char *buf_pos = buf; + + /* No more data */ + if (offset >= size) + return 0; + + if (buf_pos[offset] != UCODE_UCODE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode payload type field\n"); + return -EINVAL; + } + + mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]); + + total_size = (unsigned long) (buf_pos[offset+4] + + (buf_pos[offset+5] << 8)); + + printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n", + size, total_size, offset); + + if (offset + total_size > size) { + printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); + return -EINVAL; + } + + *mc = vmalloc(UCODE_MAX_SIZE); + if (!*mc) { + printk(KERN_ERR "microcode: error! " + "Can not allocate memory for microcode patch\n"); + return -ENOMEM; + } + + memset(*mc, 0, UCODE_MAX_SIZE); + memcpy(*mc, buf + offset + 8, total_size); + + return offset + total_size + 8; +} + +static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) +{ + unsigned int *buf_pos = buf; + + /* No more data */ + if (offset >= size) + return 0; + + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalnet cpu table type field\n"); + return 0; + } + + if (size == 0) { + printk(KERN_ERR "microcode: error! " + "Wrong microcode equivalnet cpu table length\n"); + return 0; + } + + equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); + if (!equiv_cpu_table) { + printk(KERN_ERR "microcode: error, can't allocate memory for equiv CPU table\n"); + return 0; + } + + memset(equiv_cpu_table, 0, size); + memcpy(equiv_cpu_table, &buf_pos[3], size); + + return size + 12; /* add header length */ +} + +/* fake device for request_firmware */ +extern struct platform_device *microcode_pdev; + +static int cpu_request_microcode_amd(int cpu) +{ + char name[30]; + const struct firmware *firmware; + void *buf; + unsigned int *buf_pos; + unsigned long size; + long offset = 0; + int error; + void *mc; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + sprintf(name, "amd-ucode/microcode_amd.bin"); + error = request_firmware(&firmware, "amd-ucode/microcode_amd.bin", + µcode_pdev->dev); + if (error) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", + name); + return error; + } + + buf_pos = buf = firmware->data; + size = firmware->size; + + if (buf_pos[0] != UCODE_MAGIC) { + printk(KERN_ERR "microcode: error! Wrong microcode patch file magic\n"); + return -EINVAL; + } + + offset = install_equiv_cpu_table(buf, buf_pos[2], offset); + + if (!offset) { + printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); + return -EINVAL; + } + + while ((offset = + get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0) { + error = get_matching_microcode_amd(mc, cpu); + if (error < 0) + break; + /* + * It's possible the data file has multiple matching ucode, + * lets keep searching till the latest version + */ + if (error == 1) { + apply_microcode_amd(cpu); + error = 0; + } + vfree(mc); + } + if (offset > 0) { + vfree(mc); + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; + } + if (offset < 0) + error = offset; + release_firmware(firmware); + + return error; +} + +static int apply_microcode_check_cpu_amd(int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + unsigned int rev; + cpumask_t old; + cpumask_of_cpu_ptr(newmask, cpu); + int err = 0; + + /* Check if the microcode is available */ + if (!uci->mc.mc_amd) + return 0; + + old = current->cpus_allowed; + set_cpus_allowed(current, newmask); + + /* Check if the microcode we have in memory matches the CPU */ + if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 16) + err = -EINVAL; + + if (!err) { + asm volatile("movl %1, %%ecx; rdmsr" + : "=a" (rev) + : "i" (0x0000008B) : "ecx"); + + if (uci->rev != rev) + err = -EINVAL; + } + + if (!err) + apply_microcode_amd(cpu); + else + printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" + " rev=0x%x\n", + cpu, uci->rev); + + set_cpus_allowed(current, old); + return err; +} + +static void microcode_fini_cpu_amd(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + uci->valid = 0; + vfree(uci->mc.mc_amd); + uci->mc.mc_amd = NULL; + mutex_unlock(µcode_mutex); +} + +static struct microcode_ops microcode_amd_ops = { + .get_next_ucode = get_next_ucode_amd, + .get_matching_microcode = get_matching_microcode_amd, + .microcode_sanity_check = NULL, + .apply_microcode_check_cpu = apply_microcode_check_cpu_amd, + .cpu_request_microcode = cpu_request_microcode_amd, + .collect_cpu_info = collect_cpu_info_amd, + .apply_microcode = apply_microcode_amd, + .microcode_fini_cpu = microcode_fini_cpu_amd, +}; + +static int __init microcode_amd_module_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + + equiv_cpu_table = NULL; + if (c->x86_vendor == X86_VENDOR_AMD) + return microcode_init(µcode_amd_ops, THIS_MODULE); + else + return -ENODEV; +} + +static void __exit microcode_amd_module_exit(void) +{ + microcode_exit(); +} + +module_init(microcode_amd_module_init) +module_exit(microcode_amd_module_exit) -- cgit v1.2.2 From e56b3bc7942982ac2589c942fb345e38bc7a341a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 11:32:33 -0700 Subject: cpu masks: optimize and clean up cpumask_of_cpu() Clean up and optimize cpumask_of_cpu(), by sharing all the zero words. Instead of stupidly generating all possible i=0...NR_CPUS 2^i patterns creating a huge array of constant bitmasks, realize that the zero words can be shared. In other words, on a 64-bit architecture, we only ever need 64 of these arrays - with a different bit set in one single world (with enough zero words around it so that we can create any bitmask by just offsetting in that big array). And then we just put enough zeroes around it that we can point every single cpumask to be one of those things. So when we have 4k CPU's, instead of having 4k arrays (of 4k bits each, with one bit set in each array - 2MB memory total), we have exactly 64 arrays instead, each 8k bits in size (64kB total). And then we just point cpumask(n) to the right position (which we can calculate dynamically). Once we have the right arrays, getting "cpumask(n)" ends up being: static inline const cpumask_t *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return (const cpumask_t *)p; } This brings other advantages and simplifications as well: - we are not wasting memory that is just filled with a single bit in various different places - we don't need all those games to re-create the arrays in some dense format, because they're already going to be dense enough. if we compile a kernel for up to 4k CPU's, "wasting" that 64kB of memory is a non-issue (especially since by doing this "overlapping" trick we probably get better cache behaviour anyway). [ mingo@elte.hu: Converted Linus's mails into a commit. See: http://lkml.org/lkml/2008/7/27/156 http://lkml.org/lkml/2008/7/28/320 Also applied a family filter - which also has the side-effect of leaving out the bits where Linus calls me an idio... Oh, never mind ;-) ] Signed-off-by: Ingo Molnar Cc: Rusty Russell Cc: Andrew Morton Cc: Al Viro Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 1cd53dfcd309..76e305e064f9 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -80,26 +80,6 @@ static void __init setup_per_cpu_maps(void) #endif } -#ifdef CONFIG_HAVE_CPUMASK_OF_CPU_MAP -/* - * Replace static cpumask_of_cpu_map in the initdata section, - * with one that's allocated sized by the possible number of cpus. - * - * (requires nr_cpu_ids to be initialized) - */ -static void __init setup_cpumask_of_cpu(void) -{ - int i; - - /* alloc_bootmem zeroes memory */ - cpumask_of_cpu_map = alloc_bootmem_low(sizeof(cpumask_t) * nr_cpu_ids); - for (i = 0; i < nr_cpu_ids; i++) - cpu_set(i, cpumask_of_cpu_map[i]); -} -#else -static inline void setup_cpumask_of_cpu(void) { } -#endif - #ifdef CONFIG_X86_32 /* * Great future not-so-futuristic plan: make i386 and x86_64 do it @@ -199,9 +179,6 @@ void __init setup_per_cpu_areas(void) /* Setup node to cpumask map */ setup_node_to_cpumask_map(); - - /* Setup cpumask_of_cpu map */ - setup_cpumask_of_cpu(); } #endif -- cgit v1.2.2 From 12c0b20fa4afb5c8a377d6987fb2dcf353e1dce1 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 23 Jul 2008 17:00:13 -0600 Subject: x86/PCI: use dev_printk when possible Convert printks to use dev_printk(). I converted DBG() to dev_dbg(). This DBG() is from arch/x86/pci/pci.h and requires source-code modification to enable, so dev_dbg() seems roughly equivalent. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/fixup.c | 3 +- arch/x86/pci/i386.c | 26 +++++------- arch/x86/pci/irq.c | 106 ++++++++++++++++++++++-------------------------- arch/x86/pci/numaq_32.c | 5 ++- 4 files changed, 64 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index ff3a6a336342..4bdaa590375d 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -23,7 +23,8 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); - DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, + suba, subb); if (busno) pci_scan_bus_with_sysdata(busno); /* Bus A */ if (suba < subb) diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index a09505806b82..5807d1bc73f7 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -128,10 +128,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate " - "resource region %d " - "of bridge %s\n", - idx, pci_name(dev)); + dev_err(&dev->dev, "BAR %d: can't " + "allocate resource\n", idx); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -166,15 +164,15 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - DBG("PCI: Resource %08lx-%08lx " - "(f=%lx, d=%d, p=%d)\n", - r->start, r->end, r->flags, disabled, pass); + dev_dbg(&dev->dev, "resource %#08llx-%#08llx " + "(f=%lx, d=%d, p=%d)\n", + (unsigned long long) r->start, + (unsigned long long) r->end, + r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - printk(KERN_ERR "PCI: Cannot allocate " - "resource region %d " - "of device %s\n", - idx, pci_name(dev)); + dev_err(&dev->dev, "BAR %d: can't " + "allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; @@ -187,8 +185,7 @@ static void __init pcibios_allocate_resources(int pass) /* Turn the ROM off, leave the resource region, * but keep it unregistered. */ u32 reg; - DBG("PCI: Switching off ROM of %s\n", - pci_name(dev)); + dev_dbg(&dev->dev, "disabling ROM\n"); r->flags &= ~IORESOURCE_ROM_ENABLE; pci_read_config_dword(dev, dev->rom_base_reg, ®); @@ -257,8 +254,7 @@ void pcibios_set_master(struct pci_dev *dev) lat = pcibios_max_latency; else return; - printk(KERN_DEBUG "PCI: Setting latency timer of device %s to %d\n", - pci_name(dev), lat); + dev_printk(KERN_DEBUG, &dev->dev, "setting latency timer to %d\n", lat); pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 6a06a2eb0597..fec0123b33a9 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -436,7 +436,7 @@ static int pirq_vlsi_get(struct pci_dev *router, struct pci_dev *dev, int pirq) { WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq); return 0; } return read_config_nybble(router, 0x74, pirq-1); @@ -446,7 +446,7 @@ static int pirq_vlsi_set(struct pci_dev *router, struct pci_dev *dev, int pirq, { WARN_ON_ONCE(pirq >= 9); if (pirq > 8) { - printk(KERN_INFO "VLSI router pirq escape (%d)\n", pirq); + dev_info(&dev->dev, "VLSI router PIRQ escape (%d)\n", pirq); return 0; } write_config_nybble(router, 0x74, pirq-1, irq); @@ -492,15 +492,17 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq irq = 0; if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d get irq : %2d\n", - dev->vendor, dev->device, pirq, irq); + dev_info(&dev->dev, + "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", + dev->vendor, dev->device, pirq, irq); return irq; } static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { - printk(KERN_INFO "AMD756: dev %04x:%04x, router pirq : %d SET irq : %2d\n", - dev->vendor, dev->device, pirq, irq); + dev_info(&dev->dev, + "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", + dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); return 1; @@ -730,7 +732,6 @@ static __init int ali_router_probe(struct irq_router *r, struct pci_dev *router, switch (device) { case PCI_DEVICE_ID_AL_M1533: case PCI_DEVICE_ID_AL_M1563: - printk(KERN_DEBUG "PCI: Using ALI IRQ Router\n"); r->name = "ALI"; r->get = pirq_ali_get; r->set = pirq_ali_set; @@ -840,11 +841,9 @@ static void __init pirq_find_router(struct irq_router *r) h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - printk(KERN_INFO "PCI: Using IRQ router %s [%04x/%04x] at %s\n", - pirq_router.name, - pirq_router_dev->vendor, - pirq_router_dev->device, - pci_name(pirq_router_dev)); + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", + pirq_router.name, + pirq_router_dev->vendor, pirq_router_dev->device); /* The device remains referenced for the kernel lifetime */ } @@ -877,7 +876,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) /* Find IRQ pin */ pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin); if (!pin) { - DBG(KERN_DEBUG " -> no interrupt pin\n"); + dev_dbg(&dev->dev, "no interrupt pin\n"); return 0; } pin = pin - 1; @@ -887,20 +886,20 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (!pirq_table) return 0; - DBG(KERN_DEBUG "IRQ for %s[%c]", pci_name(dev), 'A' + pin); info = pirq_get_info(dev); if (!info) { - DBG(" -> not found in routing table\n" KERN_DEBUG); + dev_dbg(&dev->dev, "PCI INT %c not found in routing table\n", + 'A' + pin); return 0; } pirq = info->irq[pin].link; mask = info->irq[pin].bitmap; if (!pirq) { - DBG(" -> not routed\n" KERN_DEBUG); + dev_dbg(&dev->dev, "PCI INT %c not routed\n", 'A' + pin); return 0; } - DBG(" -> PIRQ %02x, mask %04x, excl %04x", pirq, mask, - pirq_table->exclusive_irqs); + dev_dbg(&dev->dev, "PCI INT %c -> PIRQ %02x, mask %04x, excl %04x", + 'A' + pin, pirq, mask, pirq_table->exclusive_irqs); mask &= pcibios_irq_mask; /* Work around broken HP Pavilion Notebooks which assign USB to @@ -930,10 +929,8 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) if (pci_probe & PCI_USE_PIRQ_MASK) newirq = 0; else - printk("\n" KERN_WARNING - "PCI: IRQ %i for device %s doesn't match PIRQ mask - try pci=usepirqmask\n" - KERN_DEBUG, newirq, - pci_name(dev)); + dev_warn(&dev->dev, "IRQ %d doesn't match PIRQ mask " + "%#x; try pci=usepirqmask\n", newirq, mask); } if (!newirq && assign) { for (i = 0; i < 16; i++) { @@ -944,39 +941,35 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) newirq = i; } } - DBG(" -> newirq=%d", newirq); + dev_dbg(&dev->dev, "PCI INT %c -> newirq %d", 'A' + pin, newirq); /* Check if it is hardcoded */ if ((pirq & 0xf0) == 0xf0) { irq = pirq & 0xf; - DBG(" -> hardcoded IRQ %d\n", irq); - msg = "Hardcoded"; + msg = "hardcoded"; } else if (r->get && (irq = r->get(pirq_router_dev, dev, pirq)) && \ ((!(pci_probe & PCI_USE_PIRQ_MASK)) || ((1 << irq) & mask))) { - DBG(" -> got IRQ %d\n", irq); - msg = "Found"; + msg = "found"; eisa_set_level_irq(irq); } else if (newirq && r->set && (dev->class >> 8) != PCI_CLASS_DISPLAY_VGA) { - DBG(" -> assigning IRQ %d", newirq); if (r->set(pirq_router_dev, dev, pirq, newirq)) { eisa_set_level_irq(newirq); - DBG(" ... OK\n"); - msg = "Assigned"; + msg = "assigned"; irq = newirq; } } if (!irq) { - DBG(" ... failed\n"); if (newirq && mask == (1 << newirq)) { - msg = "Guessed"; + msg = "guessed"; irq = newirq; - } else + } else { + dev_dbg(&dev->dev, "can't route interrupt\n"); return 0; + } } - printk(KERN_INFO "PCI: %s IRQ %d for device %s\n", msg, irq, - pci_name(dev)); + dev_info(&dev->dev, "%s PCI INT %c -> IRQ %d\n", msg, 'A' + pin, irq); /* Update IRQ for all devices with the same pirq value */ while ((dev2 = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev2)) != NULL) { @@ -996,17 +989,17 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign) (!(pci_probe & PCI_USE_PIRQ_MASK) || \ ((1 << dev2->irq) & mask))) { #ifndef CONFIG_PCI_MSI - printk(KERN_INFO "IRQ routing conflict for %s, have irq %d, want irq %d\n", - pci_name(dev2), dev2->irq, irq); + dev_info(&dev2->dev, "IRQ routing conflict: " + "have IRQ %d, want IRQ %d\n", + dev2->irq, irq); #endif continue; } dev2->irq = irq; pirq_penalty[irq]++; if (dev != dev2) - printk(KERN_INFO - "PCI: Sharing IRQ %d with %s\n", - irq, pci_name(dev2)); + dev_info(&dev->dev, "sharing IRQ %d with %s\n", + irq, pci_name(dev2)); } } return 1; @@ -1025,8 +1018,7 @@ static void __init pcibios_fixup_irqs(void) * already in use. */ if (dev->irq >= 16) { - DBG(KERN_DEBUG "%s: ignoring bogus IRQ %d\n", - pci_name(dev), dev->irq); + dev_dbg(&dev->dev, "ignoring bogus IRQ %d\n", dev->irq); dev->irq = 0; } /* @@ -1070,12 +1062,12 @@ static void __init pcibios_fixup_irqs(void) irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) - printk(KERN_WARNING "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), 'A' + pin, irq); + dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n", + pci_name(bridge), + 'A' + pin, irq); } if (irq >= 0) { - printk(KERN_INFO "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); + dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq); dev->irq = irq; } } @@ -1231,25 +1223,24 @@ static int pirq_enable_irq(struct pci_dev *dev) irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, PCI_SLOT(bridge->devfn), pin); if (irq >= 0) - printk(KERN_WARNING - "PCI: using PPB %s[%c] to get irq %d\n", - pci_name(bridge), - 'A' + pin, irq); + dev_warn(&dev->dev, "using bridge %s " + "INT %c to get IRQ %d\n", + pci_name(bridge), 'A' + pin, + irq); dev = bridge; } dev = temp_dev; if (irq >= 0) { - printk(KERN_INFO - "PCI->APIC IRQ transform: %s[%c] -> IRQ %d\n", - pci_name(dev), 'A' + pin, irq); + dev_info(&dev->dev, "PCI->APIC IRQ transform: " + "INT %c -> IRQ %d\n", 'A' + pin, irq); dev->irq = irq; return 0; } else - msg = " Probably buggy MP table."; + msg = "; probably buggy MP table"; } else if (pci_probe & PCI_BIOS_IRQ_SCAN) msg = ""; else - msg = " Please try using pci=biosirq."; + msg = "; please try using pci=biosirq"; /* * With IDE legacy devices the IRQ lookup failure is not @@ -1259,9 +1250,8 @@ static int pirq_enable_irq(struct pci_dev *dev) !(dev->class & 0x5)) return 0; - printk(KERN_WARNING - "PCI: No IRQ known for interrupt pin %c of device %s.%s\n", - 'A' + pin, pci_name(dev), msg); + dev_warn(&dev->dev, "can't find IRQ for PCI INT %c%s\n", + 'A' + pin, msg); } return 0; } diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index f4b16dc11dad..1177845d3186 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -131,13 +131,14 @@ static void __devinit pci_fixup_i450nx(struct pci_dev *d) u8 busno, suba, subb; int quad = BUS2QUAD(d->bus->number); - printk("PCI: Searching for i450NX host bridges on %s\n", pci_name(d)); + dev_info(&d->dev, "searching for i450NX host bridges\n"); reg = 0xd0; for(pxb=0; pxb<2; pxb++) { pci_read_config_byte(d, reg++, &busno); pci_read_config_byte(d, reg++, &suba); pci_read_config_byte(d, reg++, &subb); - DBG("i450NX PXB %d: %02x/%02x/%02x\n", pxb, busno, suba, subb); + dev_dbg(&d->dev, "i450NX PXB %d: %02x/%02x/%02x\n", + pxb, busno, suba, subb); if (busno) { /* Bus A */ pci_scan_bus_with_sysdata(QUADLOCAL2BUS(quad, busno)); -- cgit v1.2.2 From cddb8a5c14aa89810b40495d94d3d2a0faee6619 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Mon, 28 Jul 2008 15:46:29 -0700 Subject: mmu-notifiers: core With KVM/GFP/XPMEM there isn't just the primary CPU MMU pointing to pages. There are secondary MMUs (with secondary sptes and secondary tlbs) too. sptes in the kvm case are shadow pagetables, but when I say spte in mmu-notifier context, I mean "secondary pte". In GRU case there's no actual secondary pte and there's only a secondary tlb because the GRU secondary MMU has no knowledge about sptes and every secondary tlb miss event in the MMU always generates a page fault that has to be resolved by the CPU (this is not the case of KVM where the a secondary tlb miss will walk sptes in hardware and it will refill the secondary tlb transparently to software if the corresponding spte is present). The same way zap_page_range has to invalidate the pte before freeing the page, the spte (and secondary tlb) must also be invalidated before any page is freed and reused. Currently we take a page_count pin on every page mapped by sptes, but that means the pages can't be swapped whenever they're mapped by any spte because they're part of the guest working set. Furthermore a spte unmap event can immediately lead to a page to be freed when the pin is released (so requiring the same complex and relatively slow tlb_gather smp safe logic we have in zap_page_range and that can be avoided completely if the spte unmap event doesn't require an unpin of the page previously mapped in the secondary MMU). The mmu notifiers allow kvm/GRU/XPMEM to attach to the tsk->mm and know when the VM is swapping or freeing or doing anything on the primary MMU so that the secondary MMU code can drop sptes before the pages are freed, avoiding all page pinning and allowing 100% reliable swapping of guest physical address space. Furthermore it avoids the code that teardown the mappings of the secondary MMU, to implement a logic like tlb_gather in zap_page_range that would require many IPI to flush other cpu tlbs, for each fixed number of spte unmapped. To make an example: if what happens on the primary MMU is a protection downgrade (from writeable to wrprotect) the secondary MMU mappings will be invalidated, and the next secondary-mmu-page-fault will call get_user_pages and trigger a do_wp_page through get_user_pages if it called get_user_pages with write=1, and it'll re-establishing an updated spte or secondary-tlb-mapping on the copied page. Or it will setup a readonly spte or readonly tlb mapping if it's a guest-read, if it calls get_user_pages with write=0. This is just an example. This allows to map any page pointed by any pte (and in turn visible in the primary CPU MMU), into a secondary MMU (be it a pure tlb like GRU, or an full MMU with both sptes and secondary-tlb like the shadow-pagetable layer with kvm), or a remote DMA in software like XPMEM (hence needing of schedule in XPMEM code to send the invalidate to the remote node, while no need to schedule in kvm/gru as it's an immediate event like invalidating primary-mmu pte). At least for KVM without this patch it's impossible to swap guests reliably. And having this feature and removing the page pin allows several other optimizations that simplify life considerably. Dependencies: 1) mm_take_all_locks() to register the mmu notifier when the whole VM isn't doing anything with "mm". This allows mmu notifier users to keep track if the VM is in the middle of the invalidate_range_begin/end critical section with an atomic counter incraese in range_begin and decreased in range_end. No secondary MMU page fault is allowed to map any spte or secondary tlb reference, while the VM is in the middle of range_begin/end as any page returned by get_user_pages in that critical section could later immediately be freed without any further ->invalidate_page notification (invalidate_range_begin/end works on ranges and ->invalidate_page isn't called immediately before freeing the page). To stop all page freeing and pagetable overwrites the mmap_sem must be taken in write mode and all other anon_vma/i_mmap locks must be taken too. 2) It'd be a waste to add branches in the VM if nobody could possibly run KVM/GRU/XPMEM on the kernel, so mmu notifiers will only enabled if CONFIG_KVM=m/y. In the current kernel kvm won't yet take advantage of mmu notifiers, but this already allows to compile a KVM external module against a kernel with mmu notifiers enabled and from the next pull from kvm.git we'll start using them. And GRU/XPMEM will also be able to continue the development by enabling KVM=m in their config, until they submit all GRU/XPMEM GPLv2 code to the mainline kernel. Then they can also enable MMU_NOTIFIERS in the same way KVM does it (even if KVM=n). This guarantees nobody selects MMU_NOTIFIER=y if KVM and GRU and XPMEM are all =n. The mmu_notifier_register call can fail because mm_take_all_locks may be interrupted by a signal and return -EINTR. Because mmu_notifier_reigster is used when a driver startup, a failure can be gracefully handled. Here an example of the change applied to kvm to register the mmu notifiers. Usually when a driver startups other allocations are required anyway and -ENOMEM failure paths exists already. struct kvm *kvm_arch_create_vm(void) { struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL); + int err; if (!kvm) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + kvm->arch.mmu_notifier.ops = &kvm_mmu_notifier_ops; + err = mmu_notifier_register(&kvm->arch.mmu_notifier, current->mm); + if (err) { + kfree(kvm); + return ERR_PTR(err); + } + return kvm; } mmu_notifier_unregister returns void and it's reliable. The patch also adds a few needed but missing includes that would prevent kernel to compile after these changes on non-x86 archs (x86 didn't need them by luck). [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix mm/filemap_xip.c build] [akpm@linux-foundation.org: fix mm/mmu_notifier.c build] Signed-off-by: Andrea Arcangeli Signed-off-by: Nick Piggin Signed-off-by: Christoph Lameter Cc: Jack Steiner Cc: Robin Holt Cc: Nick Piggin Cc: Peter Zijlstra Cc: Kanoj Sarcar Cc: Roland Dreier Cc: Steve Wise Cc: Avi Kivity Cc: Hugh Dickins Cc: Rusty Russell Cc: Anthony Liguori Cc: Chris Wright Cc: Marcelo Tosatti Cc: Eric Dumazet Cc: "Paul E. McKenney" Cc: Izik Eidus Cc: Anthony Liguori Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 8d45fabc5f3b..ce3251ce5504 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -21,6 +21,7 @@ config KVM tristate "Kernel-based Virtual Machine (KVM) support" depends on HAVE_KVM select PREEMPT_NOTIFIERS + select MMU_NOTIFIER select ANON_INODES ---help--- Support hosting fully virtualized guest machines using hardware -- cgit v1.2.2 From 5d006d8d09e82f086ca0baf79a2907f2c1e25af7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 29 Jul 2008 09:58:29 -0500 Subject: lguest: set max_pfn_mapped, growl loudly at Yinghai Lu 6af61a7614a306fe882a0c2b4ddc63b65aa66efc 'x86: clean up max_pfn_mapped usage - 32-bit' makes the following comment: XEN PV and lguest may need to assign max_pfn_mapped too. But no CC. Yinghai, wasting fellow developers' time is a VERY bad habit. If you do it again, I will hunt you down and try to extract the three hours of my life I just lost :) Signed-off-by: Rusty Russell Cc: Yinghai Lu --- arch/x86/lguest/boot.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 0313a5eec412..d9249a882aa5 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1014,6 +1014,9 @@ __init void lguest_init(void) init_pg_tables_start = __pa(pg0); init_pg_tables_end = __pa(pg0); + /* As described in head_32.S, we map the first 128M of memory. */ + max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; + /* Load the %fs segment register (the per-cpu segment register) with * the normal data segment to get through booting. */ asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_DS) : "memory"); -- cgit v1.2.2 From 9b79022ca909b66e2cd0cfd9248f832fc165f77f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 28 Jul 2008 17:54:21 -0700 Subject: Fix 'get_user_pages_fast()' with non-page-aligned start address Alexey Dobriyan reported trouble with LTP with the new fast-gup code, and Johannes Weiner debugged it to non-page-aligned addresses, where the new get_user_pages_fast() code would do all the wrong things, including just traversing past the end of the requested area due to 'addr' never matching 'end' exactly. This is not a pretty fix, and we may actually want to move the alignment into generic code, leaving just the core code per-arch, but Alexey verified that the vmsplice01 LTP test doesn't crash with this. Reported-and-tested-by: Alexey Dobriyan Debugged-by: Johannes Weiner Cc: Nick Piggin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 3085f25b4355..007bb06c7504 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -223,14 +223,17 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages) { struct mm_struct *mm = current->mm; - unsigned long end = start + (nr_pages << PAGE_SHIFT); - unsigned long addr = start; + unsigned long addr, len, end; unsigned long next; pgd_t *pgdp; int nr = 0; + start &= PAGE_MASK; + addr = start; + len = (unsigned long) nr_pages << PAGE_SHIFT; + end = start + len; if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ, - start, nr_pages*PAGE_SIZE))) + start, len))) goto slow_irqon; /* -- cgit v1.2.2 From 45b1e23eca1c53fa79a611a2bc8c93697ede6c97 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 09:42:17 +0200 Subject: x86, microcode support: fix build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/microcode.c:412: error: static declaration of ‘microcode_init’ follows non-static declaration include/asm/microcode.h:1: error: previous declaration of ‘microcode_init’ was here arch/x86/kernel/microcode.c:454: error: static declaration of ‘microcode_exit’ follows non-static declaration include/asm/microcode.h:2: error: previous declaration of ‘microcode_exit’ was here Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 1e42e79ca694..9a881845b35b 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -415,7 +415,7 @@ static struct notifier_block __refdata mc_cpu_notifier = { .notifier_call = mc_cpu_callback, }; -static int microcode_init(void *opaque, struct module *module) +int microcode_init(void *opaque, struct module *module) { struct microcode_ops *ops = (struct microcode_ops *)opaque; int error; @@ -457,7 +457,7 @@ static int microcode_init(void *opaque, struct module *module) } EXPORT_SYMBOL_GPL(microcode_init); -static void __exit microcode_exit (void) +void __exit microcode_exit (void) { microcode_dev_exit(); -- cgit v1.2.2 From 224e946b81166d732f3e9b414cb69612072fb500 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 09:52:55 +0200 Subject: x86, microcode: fix symbol exports fix tons of build errors: arch/x86/kernel/built-in.o: In function `microcode_fini_cpu': microcode_intel.c:(.text+0x11598): undefined reference to `microcode_mutex' microcode_intel.c:(.text+0x115a4): undefined reference to `ucode_cpu_info' microcode_intel.c:(.text+0x115ae): undefined reference to `ucode_cpu_info' microcode_intel.c:(.text+0x115bc): undefined reference to `microcode_mutex' [...] Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 9a881845b35b..758b958a6637 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -104,16 +104,16 @@ MODULE_LICENSE("GPL"); struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); +DEFINE_MUTEX(microcode_mutex); EXPORT_SYMBOL_GPL(microcode_mutex); -static struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -static void __user *user_buffer; /* user area microcode data buffer */ +void __user *user_buffer; /* user area microcode data buffer */ EXPORT_SYMBOL_GPL(user_buffer); -static unsigned int user_buffer_size; /* it's size */ +unsigned int user_buffer_size; /* it's size */ EXPORT_SYMBOL_GPL(user_buffer_size); static int do_microcode_update (void) @@ -230,7 +230,7 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #endif /* fake device for request_firmware */ -static struct platform_device *microcode_pdev; +struct platform_device *microcode_pdev; EXPORT_SYMBOL_GPL(microcode_pdev); static void microcode_init_cpu(int cpu, int resume) -- cgit v1.2.2 From 5d7b605245b1aa1a9cd6549b1f57d69273eb0c37 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 10:07:36 +0200 Subject: x86, microcode: fix module license string fix: FATAL: modpost: GPL-incompatible module microcode_amd.ko uses GPL-only symbol 'set_cpus_allowed_ptr' Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index db199e3fdd1f..fd9e68e88899 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -40,7 +40,7 @@ MODULE_DESCRIPTION("AMD Microcode Update Driver"); MODULE_AUTHOR("Peter Oruba "); -MODULE_LICENSE("GPLv2"); +MODULE_LICENSE("GPL v2"); #define UCODE_MAGIC 0x00414d44 #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 -- cgit v1.2.2 From a1708ce8a362c4999f1201237ae7b77c4d13af82 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 25 Jul 2008 16:26:39 +0200 Subject: KVM: Allow reading aliases with mmu_lock This allows the mmu notifier code to run unalias_gfn with only the mmu_lock held. Only alias writes need the mmu_lock held. Readers will either take the slots_lock in read mode or the mmu_lock. Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5916191420c7..9870ce422920 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1495,6 +1495,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, goto out; down_write(&kvm->slots_lock); + spin_lock(&kvm->mmu_lock); p = &kvm->arch.aliases[alias->slot]; p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; @@ -1506,6 +1507,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm, break; kvm->arch.naliases = n; + spin_unlock(&kvm->mmu_lock); kvm_mmu_zap_all(kvm); up_write(&kvm->slots_lock); -- cgit v1.2.2 From 604b38ac0369bd50fcbb33344aa5553c071009f7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 25 Jul 2008 16:32:03 +0200 Subject: KVM: Allow browsing memslots with mmu_lock This allows reading memslots with only the mmu_lock hold for mmu notifiers that runs in atomic context and with mmu_lock held. Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9870ce422920..c7b01efe0646 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3974,16 +3974,23 @@ int kvm_arch_set_memory_region(struct kvm *kvm, */ if (!user_alloc) { if (npages && !old.rmap) { + unsigned long userspace_addr; + down_write(¤t->mm->mmap_sem); - memslot->userspace_addr = do_mmap(NULL, 0, - npages * PAGE_SIZE, - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, - 0); + userspace_addr = do_mmap(NULL, 0, + npages * PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + 0); up_write(¤t->mm->mmap_sem); - if (IS_ERR((void *)memslot->userspace_addr)) - return PTR_ERR((void *)memslot->userspace_addr); + if (IS_ERR((void *)userspace_addr)) + return PTR_ERR((void *)userspace_addr); + + /* set userspace_addr atomically for kvm_hva_to_rmapp */ + spin_lock(&kvm->mmu_lock); + memslot->userspace_addr = userspace_addr; + spin_unlock(&kvm->mmu_lock); } else { if (!old.user_alloc && old.rmap) { int ret; -- cgit v1.2.2 From e930bffe95e1e886a1ede80726ea38df5838d067 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 25 Jul 2008 16:24:52 +0200 Subject: KVM: Synchronize guest physical memory map to host virtual memory map Synchronize changes to host virtual addresses which are part of a KVM memory slot to the KVM shadow mmu. This allows pte operations like swapping, page migration, and madvise() to transparently work with KVM. Signed-off-by: Andrea Arcangeli Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 100 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 12 ++++++ 2 files changed, 112 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 2fa231923cf7..0bfe2bd305eb 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -653,6 +653,84 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) account_shadowed(kvm, gfn); } +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int need_tlb_flush = 0; + + while ((spte = rmap_next(kvm, rmapp, NULL))) { + BUG_ON(!(*spte & PT_PRESENT_MASK)); + rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); + rmap_remove(kvm, spte); + set_shadow_pte(spte, shadow_trap_nonpresent_pte); + need_tlb_flush = 1; + } + return need_tlb_flush; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp)) +{ + int i; + int retval = 0; + + /* + * If mmap_sem isn't taken, we can look the memslots with only + * the mmu_lock by skipping over the slots with userspace_addr == 0. + */ + for (i = 0; i < kvm->nmemslots; i++) { + struct kvm_memory_slot *memslot = &kvm->memslots[i]; + unsigned long start = memslot->userspace_addr; + unsigned long end; + + /* mmu_lock protects userspace_addr */ + if (!start) + continue; + + end = start + (memslot->npages << PAGE_SHIFT); + if (hva >= start && hva < end) { + gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; + retval |= handler(kvm, &memslot->rmap[gfn_offset]); + retval |= handler(kvm, + &memslot->lpage_info[ + gfn_offset / + KVM_PAGES_PER_HPAGE].rmap_pde); + } + } + + return retval; +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) +{ + u64 *spte; + int young = 0; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + int _young; + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + _young = _spte & PT_ACCESSED_MASK; + if (_young) { + young = 1; + clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); + } + spte = rmap_next(kvm, rmapp, spte); + } + return young; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { @@ -1203,6 +1281,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) int r; int largepage = 0; pfn_t pfn; + unsigned long mmu_seq; down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { @@ -1210,6 +1289,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); @@ -1220,6 +1301,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, v, write, largepage, gfn, pfn, PT32E_ROOT_LEVEL); @@ -1227,6 +1310,11 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } @@ -1345,6 +1433,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, int r; int largepage = 0; gfn_t gfn = gpa >> PAGE_SHIFT; + unsigned long mmu_seq; ASSERT(vcpu); ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); @@ -1358,6 +1447,8 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { @@ -1365,12 +1456,19 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, return 1; } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); spin_unlock(&vcpu->kvm->mmu_lock); return r; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static void nonpaging_free(struct kvm_vcpu *vcpu) @@ -1670,6 +1768,8 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } + vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, gfn); up_read(¤t->mm->mmap_sem); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4d918220baeb..f72ac1fa35f0 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -263,6 +263,8 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, pfn = vcpu->arch.update_pte.pfn; if (is_error_pfn(pfn)) return; + if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq)) + return; kvm_get_pfn(pfn); mmu_set_spte(vcpu, spte, page->role.access, pte_access, 0, 0, gpte & PT_DIRTY_MASK, NULL, largepage, gpte_to_gfn(gpte), @@ -380,6 +382,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, int r; pfn_t pfn; int largepage = 0; + unsigned long mmu_seq; pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); kvm_mmu_audit(vcpu, "pre page fault"); @@ -413,6 +416,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, largepage = 1; } } + mmu_seq = vcpu->kvm->mmu_notifier_seq; + /* implicit mb(), we'll read before PT lock is unlocked */ pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); up_read(¤t->mm->mmap_sem); @@ -424,6 +429,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } spin_lock(&vcpu->kvm->mmu_lock); + if (mmu_notifier_retry(vcpu, mmu_seq)) + goto out_unlock; kvm_mmu_free_some_pages(vcpu); shadow_pte = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, largepage, &write_pt, pfn); @@ -439,6 +446,11 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, spin_unlock(&vcpu->kvm->mmu_lock); return write_pt; + +out_unlock: + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + return 0; } static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) -- cgit v1.2.2 From ed8486243379ef3e6c61363df915882945c0eaec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 29 Jul 2008 11:30:57 +0300 Subject: KVM: Advertise synchronized mmu support to userspace Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c7b01efe0646..0d682fc6aeb3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -883,6 +883,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PIT: case KVM_CAP_NOP_IO_DELAY: case KVM_CAP_MP_STATE: + case KVM_CAP_SYNC_MMU: r = 1; break; case KVM_CAP_COALESCED_MMIO: -- cgit v1.2.2 From 8978b74253280d59e97cf49a3ec2c0cbccd5b801 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 29 Jul 2008 13:38:53 +0900 Subject: generic, x86: fix add iommu_num_pages helper function This IOMMU helper function doesn't work for some architectures: http://marc.info/?l=linux-kernel&m=121699304403202&w=2 It also breaks POWER and SPARC builds: http://marc.info/?l=linux-kernel&m=121730388001890&w=2 Currently, only x86 IOMMUs use this so let's move it to x86 for now. Reported-by: Stephen Rothwell Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 8dbffb846de9..87d4d6964ec2 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -123,6 +123,14 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } + +unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +{ + unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); + + return size >> PAGE_SHIFT; +} +EXPORT_SYMBOL(iommu_num_pages); #endif /* -- cgit v1.2.2 From 0d39741a27d86d305cc75ba626392be410dcbab9 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 29 Jul 2008 22:34:01 -0700 Subject: GRU Driver: export is_uv_system(), zap_page_range() & follow_page() Exports needed by the GRU driver. Signed-off-by: Jack Steiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/genapic_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_64.c b/arch/x86/kernel/genapic_64.c index 1fa8be5bd217..eaff0bbb1444 100644 --- a/arch/x86/kernel/genapic_64.c +++ b/arch/x86/kernel/genapic_64.c @@ -99,3 +99,4 @@ int is_uv_system(void) { return uv_system_type != UV_NONE; } +EXPORT_SYMBOL_GPL(is_uv_system); -- cgit v1.2.2 From afd962a9e8708c571c5c0c4a6d098f931742c229 Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Wed, 30 Jul 2008 13:30:14 +0200 Subject: x86: wrong register was used in align macro New ALIGN_DESTINATION macro has sad typo: r8d register was used instead of ecx in fixup section. This can be considered as a regression. Register ecx was also wrongly loaded with value in r8d in copy_user_nocache routine. Signed-off-by: Vitaly Mayatskikh Signed-off-by: Linus Torvalds --- arch/x86/lib/copy_user_64.S | 2 +- arch/x86/lib/copy_user_nocache_64.S | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dfdf428975c0..f118c110af32 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -52,7 +52,7 @@ jnz 100b 102: .section .fixup,"ax" -103: addl %r8d,%edx /* ecx is zerorest also */ +103: addl %ecx,%edx /* ecx is zerorest also */ jmp copy_user_handle_tail .previous diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index 40e0e309d27e..cb0c112386fb 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -32,7 +32,7 @@ jnz 100b 102: .section .fixup,"ax" -103: addl %r8d,%edx /* ecx is zerorest also */ +103: addl %ecx,%edx /* ecx is zerorest also */ jmp copy_user_handle_tail .previous @@ -108,7 +108,6 @@ ENTRY(__copy_user_nocache) jmp 60f 50: movl %ecx,%edx 60: sfence - movl %r8d,%ecx jmp copy_user_handle_tail .previous -- cgit v1.2.2 From a648bf4632628c787abb0514277f2a231fca39ca Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:18 -0700 Subject: x86, xsave: xsave cpuid feature bits Add xsave CPU feature bits. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/feature_names.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index 0bf4d37a0483..741547225659 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -46,7 +46,7 @@ const char * const x86_cap_flags[NCAPINTS*32] = { "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, NULL, NULL, "dca", "sse4_1", "sse4_2", "x2apic", NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, "xsave", NULL, NULL, NULL, NULL, NULL, /* VIA/Cyrix/Centaur-defined */ NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", -- cgit v1.2.2 From dc1e35c6e95e8923cf1d3510438b63c600fee1e2 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:19 -0700 Subject: x86, xsave: enable xsave/xrstor on cpus with xsave support Enables xsave/xrstor by turning on cr4.osxsave on cpu's which have the xsave support. For now, features that OS supports/enabled are FP and SSE. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/cpu/common.c | 8 ++++ arch/x86/kernel/i387.c | 12 ++++++ arch/x86/kernel/traps_32.c | 1 - arch/x86/kernel/traps_64.c | 4 -- arch/x86/kernel/xsave.c | 87 ++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 6 deletions(-) create mode 100644 arch/x86/kernel/xsave.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a07ec14f3312..d6ea91abaebc 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -38,7 +38,7 @@ obj-y += tsc.o io_delay.o rtc.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o -obj-y += i387.o +obj-y += i387.o xsave.o obj-y += ptrace.o obj-y += ds.o obj-$(CONFIG_X86_32) += tls.o diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..fabbcb7020fb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -712,6 +712,14 @@ void __cpuinit cpu_init(void) current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); + + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + + xsave_init(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index eb9ddd8efb82..e22a9a9dce8a 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -61,6 +61,11 @@ void __init init_thread_xstate(void) return; } + if (cpu_has_xsave) { + xsave_cntxt_init(); + return; + } + if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -83,6 +88,13 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + xsave_init(); + mxcsr_feature_mask_init(); /* clean state in init */ current_thread_info()->status = 0; diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 03df8e45e5a1..da5a5964fccb 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1228,7 +1228,6 @@ void __init trap_init(void) set_bit(SYSCALL_VECTOR, used_vectors); - init_thread_xstate(); /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 513caaca7115..3580a7938a2e 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -1172,10 +1172,6 @@ void __init trap_init(void) #ifdef CONFIG_IA32_EMULATION set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif - /* - * initialize the per thread extended state: - */ - init_thread_xstate(); /* * Should be a barrier for any external CPU state: */ diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c new file mode 100644 index 000000000000..c68b7c4ca249 --- /dev/null +++ b/arch/x86/kernel/xsave.c @@ -0,0 +1,87 @@ +/* + * xsave/xrstor support. + * + * Author: Suresh Siddha + */ +#include +#include +#include + +/* + * Supported feature mask by the CPU and the kernel. + */ +unsigned int pcntxt_hmask, pcntxt_lmask; + +/* + * Represents init state for the supported extended state. + */ +struct xsave_struct *init_xstate_buf; + +/* + * Enable the extended processor state save/restore feature + */ +void __cpuinit xsave_init(void) +{ + if (!cpu_has_xsave) + return; + + set_in_cr4(X86_CR4_OSXSAVE); + + /* + * Enable all the features that the HW is capable of + * and the Linux kernel is aware of. + * + * xsetbv(); + */ + asm volatile(".byte 0x0f,0x01,0xd1" : : "c" (0), + "a" (pcntxt_lmask), "d" (pcntxt_hmask)); +} + +/* + * setup the xstate image representing the init state + */ +void setup_xstate_init(void) +{ + init_xstate_buf = alloc_bootmem(xstate_size); + init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; +} + +/* + * Enable and initialize the xsave feature. + */ +void __init xsave_cntxt_init(void) +{ + unsigned int eax, ebx, ecx, edx; + + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + + pcntxt_lmask = eax; + pcntxt_hmask = edx; + + if ((pcntxt_lmask & XSTATE_FPSSE) != XSTATE_FPSSE) { + printk(KERN_ERR "FP/SSE not shown under xsave features %x\n", + pcntxt_lmask); + BUG(); + } + + /* + * for now OS knows only about FP/SSE + */ + pcntxt_lmask = pcntxt_lmask & XCNTXT_LMASK; + pcntxt_hmask = pcntxt_hmask & XCNTXT_HMASK; + + xsave_init(); + + /* + * Recompute the context size for enabled features + */ + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + + xstate_size = ebx; + + setup_xstate_init(); + + printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%Lx, " + "cntxt size 0x%x\n", + (pcntxt_lmask | ((u64) pcntxt_hmask << 32)), xstate_size); +} -- cgit v1.2.2 From b359e8a434cc3d09847010fc4aeccf48d69740e4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:20 -0700 Subject: x86, xsave: context switch support using xsave/xrstor Uses xsave/xrstor (instead of traditional fxsave/fxrstor) in context switch when available. Introduces TS_XSAVE flag, which determine the need to use xsave/xrstor instructions during context switch instead of the legacy fxsave/fxrstor instructions. Thread-synchronous status word is already in L1 cache during this code patch and thus minimizes the performance penality compared to (cpu_has_xsave) checks. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 5 ++++- arch/x86/kernel/i387.c | 5 ++++- arch/x86/kernel/traps_64.c | 2 +- 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fabbcb7020fb..6c2b9e756db2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -709,7 +709,10 @@ void __cpuinit cpu_init(void) /* * Force FPU initialization: */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e22a9a9dce8a..b778e17e4b01 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -97,7 +97,10 @@ void __cpuinit fpu_init(void) mxcsr_feature_mask_init(); /* clean state in init */ - current_thread_info()->status = 0; + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; clear_used_math(); } #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 3580a7938a2e..38eb76156a47 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -1134,7 +1134,7 @@ asmlinkage void math_state_restore(void) /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ - if (unlikely(restore_fpu_checking(&me->thread.xstate->fxsave))) { + if (unlikely(restore_fpu_checking(me))) { stts(); force_sig(SIGSEGV, me); return; -- cgit v1.2.2 From 3c1c7f101426cb2ecc79d817a8a65928965fc860 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:21 -0700 Subject: x86, xsave: dynamically allocate sigframes fpstate instead of static allocation dynamically allocate fpstate on the stack, instead of static allocation in the current sigframe layout on the user stack. This will allow the fpstate structure to grow in the future, which includes extended state information supporting xsave/xrstor. signal handlers will be able to access the fpstate pointer from the sigcontext structure asusual, with no change. For the non RT sigframe's (which are supported only for 32bit apps), current static fpstate layout in the sigframe will be unused(so that we don't change the extramask[] offset in the sigframe and thus prevent breaking app's which modify extramask[]). Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 24 ++++++++++++++++-------- arch/x86/kernel/i387.c | 2 ++ arch/x86/kernel/sigframe.h | 14 ++++++++++++-- arch/x86/kernel/signal_32.c | 18 +++++++++++++----- arch/x86/kernel/signal_64.c | 2 +- arch/x86/kernel/xsave.c | 4 ++++ 6 files changed, 48 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 20af4c79579a..a05bf0fb7415 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -179,9 +179,10 @@ struct sigframe u32 pretcode; int sig; struct sigcontext_ia32 sc; - struct _fpstate_ia32 fpstate; + struct _fpstate_ia32 fpstate_unused; /* look at kernel/sigframe.h */ unsigned int extramask[_COMPAT_NSIG_WORDS-1]; char retcode[8]; + /* fp state follows here */ }; struct rt_sigframe @@ -192,8 +193,8 @@ struct rt_sigframe u32 puc; compat_siginfo_t info; struct ucontext_ia32 uc; - struct _fpstate_ia32 fpstate; char retcode[8]; + /* fp state follows here */ }; #define COPY(x) { \ @@ -402,7 +403,8 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, * Determine which stack to use.. */ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, - size_t frame_size) + size_t frame_size, + struct _fpstate_ia32 **fpstate) { unsigned long sp; @@ -421,6 +423,11 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, ka->sa.sa_restorer) sp = (unsigned long) ka->sa.sa_restorer; + if (used_math()) { + sp = sp - sig_xstate_ia32_size; + *fpstate = (struct _fpstate_ia32 *) sp; + } + sp -= frame_size; /* Align the stack pointer according to the i386 ABI, * i.e. so that on function entry ((sp + 4) & 15) == 0. */ @@ -434,6 +441,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, struct sigframe __user *frame; void __user *restorer; int err = 0; + struct _fpstate_ia32 __user *fpstate = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -448,7 +456,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, 0, }; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -457,8 +465,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (err) goto give_sigsegv; - err |= ia32_setup_sigcontext(&frame->sc, &frame->fpstate, regs, - set->sig[0]); + err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) goto give_sigsegv; @@ -522,6 +529,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct rt_sigframe __user *frame; void __user *restorer; int err = 0; + struct _fpstate_ia32 __user *fpstate = NULL; /* __copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -537,7 +545,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, 0, }; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -556,7 +564,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index b778e17e4b01..51fb288a2c97 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -24,6 +24,7 @@ # define save_i387_ia32 save_i387 # define restore_i387_ia32 restore_i387 # define _fpstate_ia32 _fpstate +# define sig_xstate_ia32_size sig_xstate_size # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct #endif @@ -36,6 +37,7 @@ static unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; unsigned int xstate_size; +unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32); static struct i387_fxsave_struct fx_scratch __cpuinitdata; void __cpuinit mxcsr_feature_mask_init(void) diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 72bbb519d2dc..6dd7e2b70a4b 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -3,9 +3,18 @@ struct sigframe { char __user *pretcode; int sig; struct sigcontext sc; - struct _fpstate fpstate; + /* + * fpstate is unused. fpstate is moved/allocated after + * retcode[] below. This movement allows to have the FP state and the + * future state extensions (xsave) stay together. + * And at the same time retaining the unused fpstate, prevents changing + * the offset of extramask[] in the sigframe and thus prevent any + * legacy application accessing/modifying it. + */ + struct _fpstate fpstate_unused; unsigned long extramask[_NSIG_WORDS-1]; char retcode[8]; + /* fp state follows here */ }; struct rt_sigframe { @@ -15,13 +24,14 @@ struct rt_sigframe { void __user *puc; struct siginfo info; struct ucontext uc; - struct _fpstate fpstate; char retcode[8]; + /* fp state follows here */ }; #else struct rt_sigframe { char __user *pretcode; struct ucontext uc; struct siginfo info; + /* fp state follows here */ }; #endif diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 6fb5bcdd8933..19a7a5669b5b 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -306,7 +306,8 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, * Determine which stack to use.. */ static inline void __user * -get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) +get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, + struct _fpstate **fpstate) { unsigned long sp; @@ -332,6 +333,11 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size) sp = (unsigned long) ka->sa.sa_restorer; } + if (used_math()) { + sp = sp - sig_xstate_size; + *fpstate = (struct _fpstate *) sp; + } + sp -= frame_size; /* * Align the stack pointer according to the i386 ABI, @@ -350,8 +356,9 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, void __user *restorer; int err = 0; int usig; + struct _fpstate __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -366,7 +373,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, if (err) goto give_sigsegv; - err = setup_sigcontext(&frame->sc, &frame->fpstate, regs, set->sig[0]); + err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) goto give_sigsegv; @@ -427,8 +434,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, void __user *restorer; int err = 0; int usig; + struct _fpstate __user *fpstate = NULL; - frame = get_sigframe(ka, regs, sizeof(*frame)); + frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; @@ -453,7 +461,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(sas_ss_flags(regs->sp), &frame->uc.uc_stack.ss_flags); err |= __put_user(current->sas_ss_size, &frame->uc.uc_stack.ss_size); - err |= setup_sigcontext(&frame->uc.uc_mcontext, &frame->fpstate, + err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ca316b5b742c..0deab8eff33f 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -281,7 +281,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct task_struct *me = current; if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); + fp = get_stack(ka, regs, sig_xstate_size); frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index c68b7c4ca249..7ad169e33528 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -17,6 +17,10 @@ unsigned int pcntxt_hmask, pcntxt_lmask; */ struct xsave_struct *init_xstate_buf; +#ifdef CONFIG_X86_64 +unsigned int sig_xstate_size = sizeof(struct _fpstate); +#endif + /* * Enable the extended processor state save/restore feature */ -- cgit v1.2.2 From ab5137015fed9b948fe835a2d99a4cfbd50a0c40 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:22 -0700 Subject: x86, xsave: reorganization of signal save/restore fpstate code layout move 64bit routines that saves/restores fpstate in/from user stack from signal_64.c to xsave.c restore_i387_xstate() now handles the condition when user passes NULL fpstate. Other misc changes for prepartion of xsave/xrstor sigcontext support. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 28 ++++----------- arch/x86/kernel/i387.c | 44 +++++++++++++++++------- arch/x86/kernel/signal_32.c | 28 ++++----------- arch/x86/kernel/signal_64.c | 83 ++------------------------------------------- arch/x86/kernel/xsave.c | 79 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 127 insertions(+), 135 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a05bf0fb7415..c596eabbe98b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -216,7 +216,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, unsigned int *peax) { unsigned int tmpflags, gs, oldgs, err = 0; - struct _fpstate_ia32 __user *buf; + void __user *buf; u32 tmp; /* Always make any pending restarted system calls return -EINTR */ @@ -260,26 +260,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, err |= __get_user(tmp, &sc->fpstate); buf = compat_ptr(tmp); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387_ia32(buf); - } else { - struct task_struct *me = current; - - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } + err |= restore_i387_xstate_ia32(buf); err |= __get_user(tmp, &sc->ax); *peax = tmp; return err; - -badframe: - return 1; } asmlinkage long sys32_sigreturn(struct pt_regs *regs) @@ -351,7 +337,7 @@ badframe: */ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, - struct _fpstate_ia32 __user *fpstate, + void __user *fpstate, struct pt_regs *regs, unsigned int mask) { int tmp, err = 0; @@ -382,7 +368,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, err |= __put_user((u32)regs->flags, &sc->flags); err |= __put_user((u32)regs->sp, &sc->sp_at_signal); - tmp = save_i387_ia32(fpstate); + tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) err = -EFAULT; else { @@ -404,7 +390,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, */ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - struct _fpstate_ia32 **fpstate) + void **fpstate) { unsigned long sp; @@ -441,7 +427,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, struct sigframe __user *frame; void __user *restorer; int err = 0; - struct _fpstate_ia32 __user *fpstate = NULL; + void __user *fpstate = NULL; /* copy_to_user optimizes that into a single 8 byte store */ static const struct { @@ -529,7 +515,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - struct _fpstate_ia32 __user *fpstate = NULL; + void __user *fpstate = NULL; /* __copy_to_user optimizes that into a single 8 byte store */ static const struct { diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 51fb288a2c97..7daf3a011dd9 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -21,9 +21,10 @@ # include # include #else -# define save_i387_ia32 save_i387 -# define restore_i387_ia32 restore_i387 +# define save_i387_xstate_ia32 save_i387_xstate +# define restore_i387_xstate_ia32 restore_i387_xstate # define _fpstate_ia32 _fpstate +# define _xstate_ia32 _xstate # define sig_xstate_ia32_size sig_xstate_size # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct @@ -424,7 +425,6 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) struct task_struct *tsk = current; struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; - unlazy_fpu(tsk); fp->status = fp->swd; if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) return -1; @@ -438,8 +438,6 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) struct user_i387_ia32_struct env; int err = 0; - unlazy_fpu(tsk); - convert_from_fxsr(&env, tsk); if (__copy_to_user(buf, &env, sizeof(env))) return -1; @@ -455,10 +453,16 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) return 1; } -int save_i387_ia32(struct _fpstate_ia32 __user *buf) +int save_i387_xstate_ia32(void __user *buf) { + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; + struct task_struct *tsk = current; + if (!used_math()) return 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) + return -EACCES; /* * This will cause a "finit" to be triggered by the next * attempted FPU operation by the 'current' process. @@ -468,13 +472,15 @@ int save_i387_ia32(struct _fpstate_ia32 __user *buf) if (!HAVE_HWFP) { return fpregs_soft_get(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) ? -1 : 1; + NULL, fp) ? -1 : 1; } + unlazy_fpu(tsk); + if (cpu_has_fxsr) - return save_i387_fxsave(buf); + return save_i387_fxsave(fp); else - return save_i387_fsave(buf); + return save_i387_fsave(fp); } static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) @@ -502,14 +508,26 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) return 0; } -int restore_i387_ia32(struct _fpstate_ia32 __user *buf) +int restore_i387_xstate_ia32(void __user *buf) { int err; struct task_struct *tsk = current; + struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; if (HAVE_HWFP) clear_fpu(tsk); + if (!buf) { + if (used_math()) { + clear_fpu(tsk); + clear_used_math(); + } + + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) + return -EACCES; + if (!used_math()) { err = init_fpu(tsk); if (err) @@ -518,13 +536,13 @@ int restore_i387_ia32(struct _fpstate_ia32 __user *buf) if (HAVE_HWFP) { if (cpu_has_fxsr) - err = restore_i387_fxsave(buf); + err = restore_i387_fxsave(fp); else - err = restore_i387_fsave(buf); + err = restore_i387_fsave(fp); } else { err = fpregs_soft_set(current, NULL, 0, sizeof(struct user_i387_ia32_struct), - NULL, buf) != 0; + NULL, fp) != 0; } set_used_math(); diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 19a7a5669b5b..690cc616ac07 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -159,28 +159,14 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, } { - struct _fpstate __user *buf; + void __user *buf; err |= __get_user(buf, &sc->fpstate); - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } + err |= restore_i387_xstate(buf); } err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } asmlinkage unsigned long sys_sigreturn(unsigned long __unused) @@ -262,7 +248,7 @@ badframe: * Set up a signal frame. */ static int -setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, +setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask) { int tmp, err = 0; @@ -289,7 +275,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, err |= __put_user(regs->sp, &sc->sp_at_signal); err |= __put_user(regs->ss, (unsigned int __user *)&sc->ss); - tmp = save_i387(fpstate); + tmp = save_i387_xstate(fpstate); if (tmp < 0) err = 1; else @@ -307,7 +293,7 @@ setup_sigcontext(struct sigcontext __user *sc, struct _fpstate __user *fpstate, */ static inline void __user * get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, - struct _fpstate **fpstate) + void **fpstate) { unsigned long sp; @@ -356,7 +342,7 @@ setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, void __user *restorer; int err = 0; int usig; - struct _fpstate __user *fpstate = NULL; + void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -434,7 +420,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, void __user *restorer; int err = 0; int usig; - struct _fpstate __user *fpstate = NULL; + void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 0deab8eff33f..ddf6123a55c8 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -53,69 +53,6 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } -/* - * Signal frame handlers. - */ - -static inline int save_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err = 0; - - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.xstate->fxsave)); - - if ((unsigned long)buf % 16) - printk("save_i387: bad fpstate %p\n", buf); - - if (!used_math()) - return 0; - clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *) - buf); - if (err) - return err; - task_thread_info(tsk)->status &= ~TS_USEDFPU; - stts(); - } else { - if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, - sizeof(struct i387_fxsave_struct))) - return -1; - } - return 1; -} - -/* - * This restores directly out of user space. Exceptions are handled. - */ -static inline int restore_i387(struct _fpstate __user *buf) -{ - struct task_struct *tsk = current; - int err; - - if (!used_math()) { - err = init_fpu(tsk); - if (err) - return err; - } - - if (!(task_thread_info(current)->status & TS_USEDFPU)) { - clts(); - task_thread_info(current)->status |= TS_USEDFPU; - } - err = restore_fpu_checking((__force struct i387_fxsave_struct *)buf); - if (unlikely(err)) { - /* - * Encountered an error while doing the restore from the - * user buffer, clear the fpu state. - */ - clear_fpu(tsk); - clear_used_math(); - } - return err; -} - /* * Do a signal return; undo the signal stack. */ @@ -160,25 +97,11 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, { struct _fpstate __user * buf; err |= __get_user(buf, &sc->fpstate); - - if (buf) { - if (!access_ok(VERIFY_READ, buf, sizeof(*buf))) - goto badframe; - err |= restore_i387(buf); - } else { - struct task_struct *me = current; - if (used_math()) { - clear_fpu(me); - clear_used_math(); - } - } + err |= restore_i387_xstate(buf); } err |= __get_user(*pax, &sc->ax); return err; - -badframe: - return 1; } asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) @@ -276,7 +199,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs * regs) { struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; + void __user *fp = NULL; int err = 0; struct task_struct *me = current; @@ -288,7 +211,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) goto give_sigsegv; - if (save_i387(fp) < 0) + if (save_i387_xstate(fp) < 0) err |= -1; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 7ad169e33528..608e72d7ca64 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -12,6 +12,85 @@ */ unsigned int pcntxt_hmask, pcntxt_lmask; +#ifdef CONFIG_X86_64 +/* + * Signal frame handlers. + */ + +int save_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err = 0; + + if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) + return -EACCES; + + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.xstate->fxsave)); + + if ((unsigned long)buf % 16) + printk("save_i387_xstate: bad fpstate %p\n", buf); + + if (!used_math()) + return 0; + clear_used_math(); /* trigger finit */ + if (task_thread_info(tsk)->status & TS_USEDFPU) { + err = save_i387_checking((struct i387_fxsave_struct __user *) + buf); + if (err) + return err; + task_thread_info(tsk)->status &= ~TS_USEDFPU; + stts(); + } else { + if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + xstate_size)) + return -1; + } + return 1; +} + +/* + * This restores directly out of user space. Exceptions are handled. + */ +int restore_i387_xstate(void __user *buf) +{ + struct task_struct *tsk = current; + int err; + + if (!buf) { + if (used_math()) { + clear_fpu(tsk); + clear_used_math(); + } + + return 0; + } else + if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) + return -EACCES; + + if (!used_math()) { + err = init_fpu(tsk); + if (err) + return err; + } + + if (!(task_thread_info(current)->status & TS_USEDFPU)) { + clts(); + task_thread_info(current)->status |= TS_USEDFPU; + } + err = fxrstor_checking((__force struct i387_fxsave_struct *)buf); + if (unlikely(err)) { + /* + * Encountered an error while doing the restore from the + * user buffer, clear the fpu state. + */ + clear_fpu(tsk); + clear_used_math(); + } + return err; +} +#endif + /* * Represents init state for the supported extended state. */ -- cgit v1.2.2 From c37b5efea43f9e500363f9973dd00e3d2cdcc685 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:25 -0700 Subject: x86, xsave: save/restore the extended state context in sigframe On cpu's supporting xsave/xrstor, fpstate pointer in the sigcontext, will include the extended state information along with fpstate information. Presence of extended state information is indicated by the presence of FP_XSTATE_MAGIC1 at fpstate.sw_reserved.magic1 and FP_XSTATE_MAGIC2 at fpstate + (fpstate.sw_reserved.extended_size - FP_XSTATE_MAGIC2_SIZE). Extended feature bit mask that is saved in the memory layout is represented by the fpstate.sw_reserved.xstate_bv For RT signal frames, UC_FP_XSTATE in the uc_flags also indicate the presence of extended state information in the sigcontext's fpstate pointer. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 5 +- arch/x86/kernel/i387.c | 82 +++++++++++++++++++-- arch/x86/kernel/signal_32.c | 5 +- arch/x86/kernel/signal_64.c | 7 +- arch/x86/kernel/xsave.c | 172 +++++++++++++++++++++++++++++++++++++++++--- 5 files changed, 251 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index c596eabbe98b..f25a10124005 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -544,7 +544,10 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 7daf3a011dd9..cbb9dc474a21 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -26,6 +26,7 @@ # define _fpstate_ia32 _fpstate # define _xstate_ia32 _xstate # define sig_xstate_ia32_size sig_xstate_size +# define fx_sw_reserved_ia32 fx_sw_reserved # define user_i387_ia32_struct user_i387_struct # define user32_fxsr_struct user_fxsr_struct #endif @@ -447,12 +448,30 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) if (err) return -1; - if (__copy_to_user(&buf->_fxsr_env[0], fx, - sizeof(struct i387_fxsave_struct))) + if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) return -1; return 1; } +static int save_i387_xsave(void __user *buf) +{ + struct _fpstate_ia32 __user *fx = buf; + int err = 0; + + if (save_i387_fxsave(fx) < 0) + return -1; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, + sizeof(struct _fpx_sw_bytes)); + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_ia32_size + - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return -1; + + return 1; +} + int save_i387_xstate_ia32(void __user *buf) { struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; @@ -477,6 +496,8 @@ int save_i387_xstate_ia32(void __user *buf) unlazy_fpu(tsk); + if (cpu_has_xsave) + return save_i387_xsave(fp); if (cpu_has_fxsr) return save_i387_fxsave(fp); else @@ -491,14 +512,15 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) sizeof(struct i387_fsave_struct)); } -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) +static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, + unsigned int size) { struct task_struct *tsk = current; struct user_i387_ia32_struct env; int err; err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], - sizeof(struct i387_fxsave_struct)); + size); /* mxcsr reserved bits must be masked to zero for security reasons */ tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) @@ -508,6 +530,51 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf) return 0; } +static int restore_i387_xsave(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + struct _fpstate_ia32 __user *fx_user = + ((struct _fpstate_ia32 __user *) buf); + struct i387_fxsave_struct __user *fx = + (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; + struct xsave_hdr_struct *xsave_hdr = + ¤t->thread.xstate->xsave.xsave_hdr; + unsigned int lmask, hmask; + int err; + + if (check_for_xstate(fx, buf, &fx_sw_user)) + goto fx_only; + + lmask = fx_sw_user.xstate_bv; + hmask = fx_sw_user.xstate_bv >> 32; + + err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); + + xsave_hdr->xstate_bv &= (pcntxt_lmask | (((u64) pcntxt_hmask) << 32)); + /* + * These bits must be zero. + */ + xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + + /* + * Init the state that is not present in the memory layout + * and enabled by the OS. + */ + lmask = ~(pcntxt_lmask & ~lmask); + hmask = ~(pcntxt_hmask & ~hmask); + xsave_hdr->xstate_bv &= (lmask | (((u64) hmask) << 32)); + + return err; +fx_only: + /* + * Couldn't find the extended state information in the memory + * layout. Restore the FP/SSE and init the other extended state + * enabled by the OS. + */ + xsave_hdr->xstate_bv = XSTATE_FPSSE; + return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); +} + int restore_i387_xstate_ia32(void __user *buf) { int err; @@ -535,8 +602,11 @@ int restore_i387_xstate_ia32(void __user *buf) } if (HAVE_HWFP) { - if (cpu_has_fxsr) - err = restore_i387_fxsave(fp); + if (cpu_has_xsave) + err = restore_i387_xsave(buf); + else if (cpu_has_fxsr) + err = restore_i387_fxsave(fp, sizeof(struct + i387_fxsave_struct)); else err = restore_i387_fsave(fp); } else { diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 690cc616ac07..0f98d69fbdb0 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -441,7 +441,10 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, goto give_sigsegv; /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index ddf6123a55c8..2621b98f5bf6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -192,7 +192,7 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) sp = current->sas_ss_sp + current->sas_ss_size; } - return (void __user *)round_down(sp - size, 16); + return (void __user *)round_down(sp - size, 64); } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -226,7 +226,10 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, } /* Create the ucontext. */ - err |= __put_user(0, &frame->uc.uc_flags); + if (cpu_has_xsave) + err |= __put_user(UC_FP_XSTATE, &frame->uc.uc_flags); + else + err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); err |= __put_user(sas_ss_flags(regs->sp), diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 608e72d7ca64..dd66d0714c18 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -6,12 +6,68 @@ #include #include #include +#ifdef CONFIG_IA32_EMULATION +#include +#endif /* * Supported feature mask by the CPU and the kernel. */ unsigned int pcntxt_hmask, pcntxt_lmask; +struct _fpx_sw_bytes fx_sw_reserved; +#ifdef CONFIG_IA32_EMULATION +struct _fpx_sw_bytes fx_sw_reserved_ia32; +#endif + +/* + * Check for the presence of extended state information in the + * user fpstate pointer in the sigcontext. + */ +int check_for_xstate(struct i387_fxsave_struct __user *buf, + void __user *fpstate, + struct _fpx_sw_bytes *fx_sw_user) +{ + int min_xstate_size = sizeof(struct i387_fxsave_struct) + + sizeof(struct xsave_hdr_struct); + unsigned int magic2; + int err; + + err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], + sizeof(struct _fpx_sw_bytes)); + + if (err) + return err; + + /* + * First Magic check failed. + */ + if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) + return -1; + + /* + * Check for error scenarios. + */ + if (fx_sw_user->xstate_size < min_xstate_size || + fx_sw_user->xstate_size > xstate_size || + fx_sw_user->xstate_size > fx_sw_user->extended_size) + return -1; + + err = __get_user(magic2, (__u32 *) (((void *)fpstate) + + fx_sw_user->extended_size - + FP_XSTATE_MAGIC2_SIZE)); + /* + * Check for the presence of second magic word at the end of memory + * layout. This detects the case where the user just copied the legacy + * fpstate layout with out copying the extended state information + * in the memory layout. + */ + if (err || magic2 != FP_XSTATE_MAGIC2) + return -1; + + return 0; +} + #ifdef CONFIG_X86_64 /* * Signal frame handlers. @@ -28,15 +84,18 @@ int save_i387_xstate(void __user *buf) BUILD_BUG_ON(sizeof(struct user_i387_struct) != sizeof(tsk->thread.xstate->fxsave)); - if ((unsigned long)buf % 16) + if ((unsigned long)buf % 64) printk("save_i387_xstate: bad fpstate %p\n", buf); if (!used_math()) return 0; clear_used_math(); /* trigger finit */ if (task_thread_info(tsk)->status & TS_USEDFPU) { - err = save_i387_checking((struct i387_fxsave_struct __user *) - buf); + if (task_thread_info(tsk)->status & TS_XSAVE) + err = xsave_user(buf); + else + err = fxsave_user(buf); + if (err) return err; task_thread_info(tsk)->status &= ~TS_USEDFPU; @@ -46,23 +105,77 @@ int save_i387_xstate(void __user *buf) xstate_size)) return -1; } + + if (task_thread_info(tsk)->status & TS_XSAVE) { + struct _fpstate __user *fx = buf; + + err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + + err |= __put_user(FP_XSTATE_MAGIC2, + (__u32 __user *) (buf + sig_xstate_size + - FP_XSTATE_MAGIC2_SIZE)); + } + return 1; } +/* + * Restore the extended state if present. Otherwise, restore the FP/SSE + * state. + */ +int restore_user_xstate(void __user *buf) +{ + struct _fpx_sw_bytes fx_sw_user; + unsigned int lmask, hmask; + int err; + + if (((unsigned long)buf % 64) || + check_for_xstate(buf, buf, &fx_sw_user)) + goto fx_only; + + lmask = fx_sw_user.xstate_bv; + hmask = fx_sw_user.xstate_bv >> 32; + + /* + * restore the state passed by the user. + */ + err = xrestore_user(buf, lmask, hmask); + if (err) + return err; + + /* + * init the state skipped by the user. + */ + lmask = pcntxt_lmask & ~lmask; + hmask = pcntxt_hmask & ~hmask; + + xrstor_state(init_xstate_buf, lmask, hmask); + + return 0; + +fx_only: + /* + * couldn't find the extended state information in the + * memory layout. Restore just the FP/SSE and init all + * the other extended state. + */ + xrstor_state(init_xstate_buf, pcntxt_lmask & ~XSTATE_FPSSE, + pcntxt_hmask); + return fxrstor_checking((__force struct i387_fxsave_struct *)buf); +} + /* * This restores directly out of user space. Exceptions are handled. */ int restore_i387_xstate(void __user *buf) { struct task_struct *tsk = current; - int err; + int err = 0; if (!buf) { - if (used_math()) { - clear_fpu(tsk); - clear_used_math(); - } - + if (used_math()) + goto clear; return 0; } else if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) @@ -78,12 +191,17 @@ int restore_i387_xstate(void __user *buf) clts(); task_thread_info(current)->status |= TS_USEDFPU; } - err = fxrstor_checking((__force struct i387_fxsave_struct *)buf); + if (task_thread_info(tsk)->status & TS_XSAVE) + err = restore_user_xstate(buf); + else + err = fxrstor_checking((__force struct i387_fxsave_struct *) + buf); if (unlikely(err)) { /* * Encountered an error while doing the restore from the * user buffer, clear the fpu state. */ +clear: clear_fpu(tsk); clear_used_math(); } @@ -91,6 +209,38 @@ int restore_i387_xstate(void __user *buf) } #endif +/* + * Prepare the SW reserved portion of the fxsave memory layout, indicating + * the presence of the extended state information in the memory layout + * pointed by the fpstate pointer in the sigcontext. + * This will be saved when ever the FP and extended state context is + * saved on the user stack during the signal handler delivery to the user. + */ +void prepare_fx_sw_frame(void) +{ + int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + + FP_XSTATE_MAGIC2_SIZE; + + sig_xstate_size = sizeof(struct _fpstate) + size_extended; + +#ifdef CONFIG_IA32_EMULATION + sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; +#endif + + memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); + + fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; + fx_sw_reserved.extended_size = sig_xstate_size; + fx_sw_reserved.xstate_bv = pcntxt_lmask | + (((u64) (pcntxt_hmask)) << 32); + fx_sw_reserved.xstate_size = xstate_size; +#ifdef CONFIG_IA32_EMULATION + memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, + sizeof(struct _fpx_sw_bytes)); + fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; +#endif +} + /* * Represents init state for the supported extended state. */ @@ -162,6 +312,8 @@ void __init xsave_cntxt_init(void) xstate_size = ebx; + prepare_fx_sw_frame(); + setup_xstate_init(); printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%Lx, " -- cgit v1.2.2 From 42deec6f2c3688fdaf986225ac901b817cd91568 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 29 Jul 2008 10:29:26 -0700 Subject: x86, xsave: update xsave header bits during ptrace fpregs set FP/SSE bits may be zero in the xsave header(representing the init state). Update these bits during the ptrace fpregs set operation, to indicate the non-init state. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index cbb9dc474a21..e0ed59f5c19f 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -214,6 +214,13 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, */ target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + /* + * update the header bits in the xsave header, indicating the + * presence of FP and SSE state. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + return ret; } @@ -414,6 +421,12 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (!ret) convert_to_fxsr(target, &env); + /* + * update the header bit in the xsave header, indicating the + * presence of FP. + */ + if (cpu_has_xsave) + target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; return ret; } -- cgit v1.2.2 From 6152e4b1c99a3689fc318d092cd144597f7dbd14 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 29 Jul 2008 17:23:16 -0700 Subject: x86, xsave: keep the XSAVE feature mask as an u64 The XSAVE feature mask is a 64-bit number; keep it that way, in order to avoid the mistake done with rdmsr/wrmsr. Use the xsetbv() function provided in the previous patch. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/i387.c | 12 +++++------- arch/x86/kernel/xsave.c | 45 +++++++++++++++++---------------------------- 2 files changed, 22 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index e0ed59f5c19f..45723f1fe198 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -552,18 +552,17 @@ static int restore_i387_xsave(void __user *buf) (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; struct xsave_hdr_struct *xsave_hdr = ¤t->thread.xstate->xsave.xsave_hdr; - unsigned int lmask, hmask; + u64 mask; int err; if (check_for_xstate(fx, buf, &fx_sw_user)) goto fx_only; - lmask = fx_sw_user.xstate_bv; - hmask = fx_sw_user.xstate_bv >> 32; + mask = fx_sw_user.xstate_bv; err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); - xsave_hdr->xstate_bv &= (pcntxt_lmask | (((u64) pcntxt_hmask) << 32)); + xsave_hdr->xstate_bv &= pcntxt_mask; /* * These bits must be zero. */ @@ -573,9 +572,8 @@ static int restore_i387_xsave(void __user *buf) * Init the state that is not present in the memory layout * and enabled by the OS. */ - lmask = ~(pcntxt_lmask & ~lmask); - hmask = ~(pcntxt_hmask & ~hmask); - xsave_hdr->xstate_bv &= (lmask | (((u64) hmask) << 32)); + mask = ~(pcntxt_mask & ~mask); + xsave_hdr->xstate_bv &= mask; return err; fx_only: diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index dd66d0714c18..7415f3e38a51 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -9,11 +9,12 @@ #ifdef CONFIG_IA32_EMULATION #include #endif +#include /* * Supported feature mask by the CPU and the kernel. */ -unsigned int pcntxt_hmask, pcntxt_lmask; +u64 pcntxt_mask; struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION @@ -127,30 +128,28 @@ int save_i387_xstate(void __user *buf) int restore_user_xstate(void __user *buf) { struct _fpx_sw_bytes fx_sw_user; - unsigned int lmask, hmask; + u64 mask; int err; if (((unsigned long)buf % 64) || check_for_xstate(buf, buf, &fx_sw_user)) goto fx_only; - lmask = fx_sw_user.xstate_bv; - hmask = fx_sw_user.xstate_bv >> 32; + mask = fx_sw_user.xstate_bv; /* * restore the state passed by the user. */ - err = xrestore_user(buf, lmask, hmask); + err = xrestore_user(buf, mask); if (err) return err; /* * init the state skipped by the user. */ - lmask = pcntxt_lmask & ~lmask; - hmask = pcntxt_hmask & ~hmask; + mask = pcntxt_mask & ~mask; - xrstor_state(init_xstate_buf, lmask, hmask); + xrstor_state(init_xstate_buf, mask); return 0; @@ -160,8 +159,7 @@ fx_only: * memory layout. Restore just the FP/SSE and init all * the other extended state. */ - xrstor_state(init_xstate_buf, pcntxt_lmask & ~XSTATE_FPSSE, - pcntxt_hmask); + xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); return fxrstor_checking((__force struct i387_fxsave_struct *)buf); } @@ -231,8 +229,7 @@ void prepare_fx_sw_frame(void) fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = sig_xstate_size; - fx_sw_reserved.xstate_bv = pcntxt_lmask | - (((u64) (pcntxt_hmask)) << 32); + fx_sw_reserved.xstate_bv = pcntxt_mask; fx_sw_reserved.xstate_size = xstate_size; #ifdef CONFIG_IA32_EMULATION memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, @@ -263,11 +260,8 @@ void __cpuinit xsave_init(void) /* * Enable all the features that the HW is capable of * and the Linux kernel is aware of. - * - * xsetbv(); */ - asm volatile(".byte 0x0f,0x01,0xd1" : : "c" (0), - "a" (pcntxt_lmask), "d" (pcntxt_hmask)); + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* @@ -287,36 +281,31 @@ void __init xsave_cntxt_init(void) unsigned int eax, ebx, ecx, edx; cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + pcntxt_mask = eax + ((u64)edx << 32); - pcntxt_lmask = eax; - pcntxt_hmask = edx; - - if ((pcntxt_lmask & XSTATE_FPSSE) != XSTATE_FPSSE) { - printk(KERN_ERR "FP/SSE not shown under xsave features %x\n", - pcntxt_lmask); + if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { + printk(KERN_ERR "FP/SSE not shown under xsave features 0x%llx\n", + pcntxt_mask); BUG(); } /* * for now OS knows only about FP/SSE */ - pcntxt_lmask = pcntxt_lmask & XCNTXT_LMASK; - pcntxt_hmask = pcntxt_hmask & XCNTXT_HMASK; - + pcntxt_mask = pcntxt_mask & XCNTXT_MASK; xsave_init(); /* * Recompute the context size for enabled features */ cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); - xstate_size = ebx; prepare_fx_sw_frame(); setup_xstate_init(); - printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%Lx, " + printk(KERN_INFO "xsave/xrstor: enabled xstate_bv 0x%llx, " "cntxt size 0x%x\n", - (pcntxt_lmask | ((u64) pcntxt_hmask << 32)), xstate_size); + pcntxt_mask, xstate_size); } -- cgit v1.2.2 From 0d1edf46ba229b46efacf75c0544b88c05a7b266 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 11:53:57 -0700 Subject: xen: compile irq functions without -pg for ftrace For some reason I managed to miss a bunch of irq-related functions which also need to be compiled without -pg when using ftrace. This patch moves them into their own file, and starts a cleanup process I've been meaning to do anyway. Signed-off-by: Jeremy Fitzhardinge Cc: Sam Ravnborg Cc: "Alex Nixon (Intern)" Cc: Eduardo Habkost Signed-off-by: Ingo Molnar --- arch/x86/xen/Makefile | 3 +- arch/x86/xen/enlighten.c | 122 +-------------------------------------- arch/x86/xen/irq.c | 143 ++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/xen/xen-asm_32.S | 2 +- arch/x86/xen/xen-asm_64.S | 2 +- arch/x86/xen/xen-ops.h | 1 + 6 files changed, 150 insertions(+), 123 deletions(-) create mode 100644 arch/x86/xen/irq.c (limited to 'arch/x86') diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 5bfee243cf9a..9ee745fa5527 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -2,9 +2,10 @@ ifdef CONFIG_FTRACE # Do not profile debug and lowlevel utilities CFLAGS_REMOVE_spinlock.o = -pg CFLAGS_REMOVE_time.o = -pg +CFLAGS_REMOVE_irq.o = -pg endif -obj-y := enlighten.o setup.o multicalls.o mmu.o \ +obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o obj-$(CONFIG_SMP) += smp.o spinlock.o diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b795470ec069..cf8b3a93122b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -226,94 +225,6 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static unsigned long xen_save_fl(void) -{ - struct vcpu_info *vcpu; - unsigned long flags; - - vcpu = x86_read_percpu(xen_vcpu); - - /* flag has opposite sense of mask */ - flags = !vcpu->evtchn_upcall_mask; - - /* convert to IF type flag - -0 -> 0x00000000 - -1 -> 0xffffffff - */ - return (-flags) & X86_EFLAGS_IF; -} - -static void xen_restore_fl(unsigned long flags) -{ - struct vcpu_info *vcpu; - - /* convert from IF type flag */ - flags = !(flags & X86_EFLAGS_IF); - - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = flags; - preempt_enable_no_resched(); - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - if (flags == 0) { - preempt_check_resched(); - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); - } -} - -static void xen_irq_disable(void) -{ - /* There's a one instruction preempt window here. We need to - make sure we're don't switch CPUs between getting the vcpu - pointer and updating the mask. */ - preempt_disable(); - x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; - preempt_enable_no_resched(); -} - -static void xen_irq_enable(void) -{ - struct vcpu_info *vcpu; - - /* We don't need to worry about being preempted here, since - either a) interrupts are disabled, so no preemption, or b) - the caller is confused and is trying to re-enable interrupts - on an indeterminate processor. */ - - vcpu = x86_read_percpu(xen_vcpu); - vcpu->evtchn_upcall_mask = 0; - - /* Doesn't matter if we get preempted here, because any - pending event will get dealt with anyway. */ - - barrier(); /* unmask then check (avoid races) */ - if (unlikely(vcpu->evtchn_upcall_pending)) - force_evtchn_callback(); -} - -static void xen_safe_halt(void) -{ - /* Blocking includes an implicit local_irq_enable(). */ - if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) - BUG(); -} - -static void xen_halt(void) -{ - if (irqs_disabled()) - HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); - else - xen_safe_halt(); -} - static void xen_leave_lazy(void) { paravirt_leave_lazy(paravirt_get_lazy_mode()); @@ -1308,36 +1219,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { }, }; -static void __init __xen_init_IRQ(void) -{ -#ifdef CONFIG_X86_64 - int i; - - /* Create identity vector->irq map */ - for(i = 0; i < NR_VECTORS; i++) { - int cpu; - - for_each_possible_cpu(cpu) - per_cpu(vector_irq, cpu)[i] = i; - } -#endif /* CONFIG_X86_64 */ - - xen_init_IRQ(); -} - -static const struct pv_irq_ops xen_irq_ops __initdata = { - .init_IRQ = __xen_init_IRQ, - .save_fl = xen_save_fl, - .restore_fl = xen_restore_fl, - .irq_disable = xen_irq_disable, - .irq_enable = xen_irq_enable, - .safe_halt = xen_safe_halt, - .halt = xen_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = xen_adjust_exception_frame, -#endif -}; - static const struct pv_apic_ops xen_apic_ops __initdata = { #ifdef CONFIG_X86_LOCAL_APIC .apic_write = xen_apic_write, @@ -1740,10 +1621,11 @@ asmlinkage void __init xen_start_kernel(void) pv_init_ops = xen_init_ops; pv_time_ops = xen_time_ops; pv_cpu_ops = xen_cpu_ops; - pv_irq_ops = xen_irq_ops; pv_apic_ops = xen_apic_ops; pv_mmu_ops = xen_mmu_ops; + xen_init_irq_ops(); + if (xen_feature(XENFEAT_mmu_pt_update_preserve_ad)) { pv_mmu_ops.ptep_modify_prot_start = xen_ptep_modify_prot_start; pv_mmu_ops.ptep_modify_prot_commit = xen_ptep_modify_prot_commit; diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c new file mode 100644 index 000000000000..28b85ab8422e --- /dev/null +++ b/arch/x86/xen/irq.c @@ -0,0 +1,143 @@ +#include + +#include +#include +#include + +#include +#include + +#include "xen-ops.h" + +/* + * Force a proper event-channel callback from Xen after clearing the + * callback mask. We do this in a very simple manner, by making a call + * down into Xen. The pending flag will be checked by Xen on return. + */ +void xen_force_evtchn_callback(void) +{ + (void)HYPERVISOR_xen_version(0, NULL); +} + +static void __init __xen_init_IRQ(void) +{ +#ifdef CONFIG_X86_64 + int i; + + /* Create identity vector->irq map */ + for(i = 0; i < NR_VECTORS; i++) { + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[i] = i; + } +#endif /* CONFIG_X86_64 */ + + xen_init_IRQ(); +} + +static unsigned long xen_save_fl(void) +{ + struct vcpu_info *vcpu; + unsigned long flags; + + vcpu = x86_read_percpu(xen_vcpu); + + /* flag has opposite sense of mask */ + flags = !vcpu->evtchn_upcall_mask; + + /* convert to IF type flag + -0 -> 0x00000000 + -1 -> 0xffffffff + */ + return (-flags) & X86_EFLAGS_IF; +} + +static void xen_restore_fl(unsigned long flags) +{ + struct vcpu_info *vcpu; + + /* convert from IF type flag */ + flags = !(flags & X86_EFLAGS_IF); + + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = flags; + preempt_enable_no_resched(); + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + if (flags == 0) { + preempt_check_resched(); + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); + } +} + +static void xen_irq_disable(void) +{ + /* There's a one instruction preempt window here. We need to + make sure we're don't switch CPUs between getting the vcpu + pointer and updating the mask. */ + preempt_disable(); + x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1; + preempt_enable_no_resched(); +} + +static void xen_irq_enable(void) +{ + struct vcpu_info *vcpu; + + /* We don't need to worry about being preempted here, since + either a) interrupts are disabled, so no preemption, or b) + the caller is confused and is trying to re-enable interrupts + on an indeterminate processor. */ + + vcpu = x86_read_percpu(xen_vcpu); + vcpu->evtchn_upcall_mask = 0; + + /* Doesn't matter if we get preempted here, because any + pending event will get dealt with anyway. */ + + barrier(); /* unmask then check (avoid races) */ + if (unlikely(vcpu->evtchn_upcall_pending)) + xen_force_evtchn_callback(); +} + +static void xen_safe_halt(void) +{ + /* Blocking includes an implicit local_irq_enable(). */ + if (HYPERVISOR_sched_op(SCHEDOP_block, NULL) != 0) + BUG(); +} + +static void xen_halt(void) +{ + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + else + xen_safe_halt(); +} + +static const struct pv_irq_ops xen_irq_ops __initdata = { + .init_IRQ = __xen_init_IRQ, + .save_fl = xen_save_fl, + .restore_fl = xen_restore_fl, + .irq_disable = xen_irq_disable, + .irq_enable = xen_irq_enable, + .safe_halt = xen_safe_halt, + .halt = xen_halt, +#ifdef CONFIG_X86_64 + .adjust_exception_frame = xen_adjust_exception_frame, +#endif +}; + +void __init xen_init_irq_ops() +{ + pv_irq_ops = xen_irq_ops; +} diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index 2497a30f41de..42786f59d9c0 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -298,7 +298,7 @@ check_events: push %eax push %ecx push %edx - call force_evtchn_callback + call xen_force_evtchn_callback pop %edx pop %ecx pop %eax diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 7f58304fafb3..3b9bda46487a 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -122,7 +122,7 @@ check_events: push %r9 push %r10 push %r11 - call force_evtchn_callback + call xen_force_evtchn_callback pop %r11 pop %r10 pop %r9 diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8847fb34f17e..3c70ebc50b1b 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -31,6 +31,7 @@ void xen_vcpu_restore(void); void __init xen_build_dynamic_phys_to_machine(void); +void xen_init_irq_ops(void); void xen_setup_timer(int cpu); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); -- cgit v1.2.2 From cef43bf6b3afd819f7cdcba356af0e8220fb3789 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 28 Jul 2008 13:33:44 -0700 Subject: xen: fix allocation and use of large ldts, cleanup Add a proper comment for set_aliased_prot() and fix an unsigned long/void * warning. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index cf8b3a93122b..04ec69e4d02e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -237,8 +237,10 @@ static unsigned long xen_store_tr(void) } /* - * If 'v' is a vmalloc mapping, then find the linear mapping of the - * page (if any) and also set its protections to match: + * Set the page permissions for a particular virtual address. If the + * address is a vmalloc mapping (or other non-linear mapping), then + * find the linear mapping of the page and also set its protections to + * match. */ static void set_aliased_prot(void *v, pgprot_t prot) { @@ -387,8 +389,7 @@ static void xen_load_gs_index(unsigned int idx) static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, const void *ptr) { - unsigned long lp = (unsigned long)&dt[entrynum]; - xmaddr_t mach_lp = arbitrary_virt_to_machine(lp); + xmaddr_t mach_lp = arbitrary_virt_to_machine(&dt[entrynum]); u64 entry = *(u64 *)ptr; preempt_disable(); -- cgit v1.2.2 From 169ad16bb87c10a3f7c108bb7008ebc0270f617a Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Mon, 28 Jul 2008 18:32:09 -0300 Subject: xen_alloc_ptpage: cast PFN_PHYS() argument to unsigned long Currently paravirt_ops alloc_p*() uses u32 for the pfn args. We should change that later, but while the pfn parameter is still u32, we need to cast the PFN_PHYS() argument at xen_alloc_ptpage() to unsigned long, otherwise it will lose bits on the shift. I think PFN_PHYS() should behave better when fed with smaller integers, but a cast to unsigned long won't be enough for all cases on 32-bit PAE, and a cast to u64 would be overkill for most users of PFN_PHYS(). We could have two different flavors of PFN_PHYS: one for low pages only (unsigned long) and another that works for any page (u64)), but while we don't have it, we will need the cast to unsigned long on xen_alloc_ptpage(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 04ec69e4d02e..53afa14eb314 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -822,7 +822,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) SetPagePinned(page); if (!PageHighMem(page)) { - make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); if (level == PT_PTE) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else -- cgit v1.2.2 From 7de08b4e1ed8d80e6086f71b7e99fc4b397aae39 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:51 -0300 Subject: x86: coding styles fixes to arch/x86/kernel/process_64.c Fix about 50 errors and many warnings without change process_64.o arch/x86/kernel/process_64.o: text data bss dec hex filename 5236 8 24 5268 1494 process_64.o.after 5236 8 24 5268 1494 process_64.o.before md5: 9c35e9debdea4e471288c6e8ca267a75 process_64.o.after 9c35e9debdea4e471288c6e8ca267a75 process_64.o.before Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 101 +++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3fb62a7d9a16..4da8514dd25c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,11 +37,11 @@ #include #include #include +#include +#include -#include #include #include -#include #include #include #include @@ -88,7 +88,7 @@ void exit_idle(void) #ifdef CONFIG_HOTPLUG_CPU DECLARE_PER_CPU(int, cpu_state); -#include +#include /* We halt the CPU with physical CPU hotplug */ static inline void play_dead(void) { @@ -152,7 +152,7 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs * regs) +void __show_regs(struct pt_regs *regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -177,28 +177,28 @@ void __show_regs(struct pt_regs * regs) printk("RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); printk("R10: %016lx R11: %016lx R12: %016lx\n", - regs->r10, regs->r11, regs->r12); + regs->r10, regs->r11, regs->r12); printk("R13: %016lx R14: %016lx R15: %016lx\n", - regs->r13, regs->r14, regs->r15); + regs->r13, regs->r14, regs->r15); - asm("movl %%ds,%0" : "=r" (ds)); - asm("movl %%cs,%0" : "=r" (cs)); - asm("movl %%es,%0" : "=r" (es)); + asm("movl %%ds,%0" : "=r" (ds)); + asm("movl %%cs,%0" : "=r" (cs)); + asm("movl %%es,%0" : "=r" (es)); asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrl(MSR_GS_BASE, gs); + rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); cr4 = read_cr4(); - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", - fs,fsindex,gs,gsindex,shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); + printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + fs, fsindex, gs, gsindex, shadowgs); + printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); get_debugreg(d0, 0); @@ -314,10 +314,10 @@ void prepare_to_copy(struct task_struct *tsk) int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, unsigned long unused, - struct task_struct * p, struct pt_regs * regs) + struct task_struct *p, struct pt_regs *regs) { int err; - struct pt_regs * childregs; + struct pt_regs *childregs; struct task_struct *me = current; childregs = ((struct pt_regs *) @@ -362,10 +362,10 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp, if (test_thread_flag(TIF_IA32)) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); - else -#endif - err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); - if (err) + else +#endif + err = do_arch_prctl(p, ARCH_SET_FS, childregs->r8); + if (err) goto out; } err = 0; @@ -544,7 +544,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; /* we're going to use this soon, after a few expensive things */ - if (next_p->fpu_counter>5) + if (next_p->fpu_counter > 5) prefetch(next->xstate); /* @@ -552,13 +552,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ load_sp0(tss, next); - /* + /* * Switch DS and ES. * This won't pick up thread selector changes, but I guess that is ok. */ savesegment(es, prev->es); if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); + loadsegment(es, next->es); savesegment(ds, prev->ds); if (unlikely(next->ds | prev->ds)) @@ -584,7 +584,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_leave_lazy_cpu_mode(); - /* + /* * Switch FS and GS. * * Segment register != 0 always requires a reload. Also @@ -593,13 +593,13 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); - /* + /* * Check if the user used a selector != 0; if yes * clear 64bit base, since overloaded base is always * mapped to the Null selector */ if (fsindex) - prev->fs = 0; + prev->fs = 0; } /* when next process has a 64bit base use it */ if (next->fs) @@ -609,7 +609,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); if (gsindex) - prev->gs = 0; + prev->gs = 0; } if (next->gs) wrmsrl(MSR_KERNEL_GS_BASE, next->gs); @@ -618,12 +618,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* Must be after DS reload */ unlazy_fpu(prev_p); - /* + /* * Switch the PDA and FPU contexts. */ prev->usersp = read_pda(oldrsp); write_pda(oldrsp, next->usersp); - write_pda(pcurrent, next_p); + write_pda(pcurrent, next_p); write_pda(kernelstack, (unsigned long)task_stack_page(next_p) + @@ -664,7 +664,7 @@ long sys_execve(char __user *name, char __user * __user *argv, char __user * __user *envp, struct pt_regs *regs) { long error; - char * filename; + char *filename; filename = getname(name); error = PTR_ERR(filename); @@ -722,55 +722,55 @@ asmlinkage long sys_vfork(struct pt_regs *regs) unsigned long get_wchan(struct task_struct *p) { unsigned long stack; - u64 fp,ip; + u64 fp, ip; int count = 0; - if (!p || p == current || p->state==TASK_RUNNING) - return 0; + if (!p || p == current || p->state == TASK_RUNNING) + return 0; stack = (unsigned long)task_stack_page(p); if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); - do { + do { if (fp < (unsigned long)stack || fp > (unsigned long)stack+THREAD_SIZE) - return 0; + return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) return ip; - fp = *(u64 *)fp; - } while (count++ < 16); + fp = *(u64 *)fp; + } while (count++ < 16); return 0; } long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) -{ - int ret = 0; +{ + int ret = 0; int doit = task == current; int cpu; - switch (code) { + switch (code) { case ARCH_SET_GS: if (addr >= TASK_SIZE_OF(task)) - return -EPERM; + return -EPERM; cpu = get_cpu(); - /* handle small bases via the GDT because that's faster to + /* handle small bases via the GDT because that's faster to switch. */ - if (addr <= 0xffffffff) { - set_32bit_tls(task, GS_TLS, addr); - if (doit) { + if (addr <= 0xffffffff) { + set_32bit_tls(task, GS_TLS, addr); + if (doit) { load_TLS(&task->thread, cpu); - load_gs_index(GS_TLS_SEL); + load_gs_index(GS_TLS_SEL); } - task->thread.gsindex = GS_TLS_SEL; + task->thread.gsindex = GS_TLS_SEL; task->thread.gs = 0; - } else { + } else { task->thread.gsindex = 0; task->thread.gs = addr; if (doit) { load_gs_index(0); ret = checking_wrmsrl(MSR_KERNEL_GS_BASE, addr); - } + } } put_cpu(); break; @@ -824,8 +824,7 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr) rdmsrl(MSR_KERNEL_GS_BASE, base); else base = task->thread.gs; - } - else + } else base = task->thread.gs; ret = put_user(base, (unsigned long __user *)addr); break; -- cgit v1.2.2 From 8092c654de9a964c14d89da56834f73a80548a58 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:52 -0300 Subject: x86: add KERN_INFO to printks on process_64.c Fix many coding style warnings. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 4da8514dd25c..3560d7f4d74e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,24 +161,24 @@ void __show_regs(struct pt_regs *regs) printk("\n"); print_modules(); - printk("Pid: %d, comm: %.20s %s %s %.*s\n", + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s\n", current->pid, current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - printk("RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); + printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); printk_address(regs->ip, 1); - printk("RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, regs->sp, - regs->flags); - printk("RAX: %016lx RBX: %016lx RCX: %016lx\n", + printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, + regs->sp, regs->flags); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); - printk("RDX: %016lx RSI: %016lx RDI: %016lx\n", + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", regs->dx, regs->si, regs->di); - printk("RBP: %016lx R08: %016lx R09: %016lx\n", + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", regs->bp, regs->r8, regs->r9); - printk("R10: %016lx R11: %016lx R12: %016lx\n", + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", regs->r10, regs->r11, regs->r12); - printk("R13: %016lx R14: %016lx R15: %016lx\n", + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); asm("movl %%ds,%0" : "=r" (ds)); @@ -196,24 +196,26 @@ void __show_regs(struct pt_regs *regs) cr3 = read_cr3(); cr4 = read_cr4(); - printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", + printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs, fsindex, gs, gsindex, shadowgs); - printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); - printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, + es, cr0); + printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, + cr4); get_debugreg(d0, 0); get_debugreg(d1, 1); get_debugreg(d2, 2); - printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); get_debugreg(d3, 3); get_debugreg(d6, 6); get_debugreg(d7, 7); - printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); + printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) { - printk("CPU %d:", smp_processor_id()); + printk(KERN_INFO "CPU %d:", smp_processor_id()); __show_regs(regs); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } -- cgit v1.2.2 From 08aadf069d0482ade033badefa8f03eb2fcddd9c Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:53 -0300 Subject: x86: coding style fixes to arch/x86/kernel/crash_dump_64.c Fix conding style without change crash_dump_64.o arch/x86/kernel/crash_dump_64.o text data bss dec hex filename 129 0 0 129 81 crash_dump_64.o.after 129 0 0 129 81 crash_dump_64.o.before md5: 885b52c1b92737e6b12e5107e90fc1f1 crash_dump_64.o.after 885b52c1b92737e6b12e5107e90fc1f1 crash_dump_64.o.before Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_64.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 15e6c6bc4a46..d3e524c84527 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -7,9 +7,8 @@ #include #include - -#include -#include +#include +#include /** * copy_oldmem_page - copy one page from "oldmem" @@ -25,7 +24,7 @@ * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) + size_t csize, unsigned long offset, int userbuf) { void *vaddr; -- cgit v1.2.2 From caa007dd3687d38a0252484d9d0a8f9d929ba932 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:54 -0300 Subject: x86: coding style fixes to arch/x86/kernel/signal_64.c Fix all errors and many warnings reported by checkpatch.pl without change signal_64.o arch/x86/kernel/signal_64.o text data bss dec hex filename 5143 0 8 5151 141f signal_64.o.after 5143 0 8 5151 141f signal_64.o.before md5: e68718092b3641cb27e79e55ce57e3ad signal_64.o.after e68718092b3641cb27e79e55ce57e3ad signal_64.o.before Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/sigframe.h | 5 ++++ arch/x86/kernel/signal_64.c | 62 ++++++++++++++++++++++----------------------- 2 files changed, 35 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sigframe.h b/arch/x86/kernel/sigframe.h index 72bbb519d2dc..8b4956e800ac 100644 --- a/arch/x86/kernel/sigframe.h +++ b/arch/x86/kernel/sigframe.h @@ -24,4 +24,9 @@ struct rt_sigframe { struct ucontext uc; struct siginfo info; }; + +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, + sigset_t *set, struct pt_regs *regs); #endif diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index b45ef8ddd651..87a9c2f28d99 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -19,9 +19,10 @@ #include #include #include +#include + #include #include -#include #include #include #include @@ -41,11 +42,6 @@ # define FIX_EFLAGS __FIX_EFLAGS #endif -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, - sigset_t *set, struct pt_regs * regs); - asmlinkage long sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, struct pt_regs *regs) @@ -119,7 +115,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) +#define COPY(x) (err |= __get_user(regs->x, &sc->x)) COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); @@ -149,7 +145,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, } { - struct _fpstate __user * buf; + struct _fpstate __user *buf; err |= __get_user(buf, &sc->fpstate); if (buf) { @@ -189,7 +185,7 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) current->blocked = set; recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - + if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -199,16 +195,17 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs,frame,"sigreturn"); + signal_fault(regs, frame, "sigreturn"); return 0; -} +} /* * Set up a signal frame. */ static inline int -setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, unsigned long mask, struct task_struct *me) +setup_sigcontext(struct sigcontext __user *sc, struct pt_regs *regs, + unsigned long mask, struct task_struct *me) { int err = 0; @@ -264,35 +261,35 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) } static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs * regs) + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; - struct _fpstate __user *fp = NULL; + struct _fpstate __user *fp = NULL; int err = 0; struct task_struct *me = current; if (used_math()) { - fp = get_stack(ka, regs, sizeof(struct _fpstate)); + fp = get_stack(ka, regs, sizeof(struct _fpstate)); frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) goto give_sigsegv; - if (save_i387(fp) < 0) - err |= -1; + if (save_i387(fp) < 0) + err |= -1; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - if (ka->sa.sa_flags & SA_SIGINFO) { + if (ka->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); if (err) goto give_sigsegv; } - + /* Create the ucontext. */ err |= __put_user(0, &frame->uc.uc_flags); err |= __put_user(0, &frame->uc.uc_link); @@ -302,9 +299,9 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); err |= setup_sigcontext(&frame->uc.uc_mcontext, regs, set->sig[0], me); err |= __put_user(fp, &frame->uc.uc_mcontext.fpstate); - if (sizeof(*set) == 16) { + if (sizeof(*set) == 16) { __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); - __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); } else err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); @@ -315,7 +312,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + goto give_sigsegv; } if (err) @@ -323,7 +320,7 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->di = sig; - /* In case the signal handler was declared without prototypes */ + /* In case the signal handler was declared without prototypes */ regs->ax = 0; /* This also works for non SA_SIGINFO handlers because they expect the @@ -376,7 +373,7 @@ static long current_syscall_ret(struct pt_regs *regs) /* * OK, we're invoking a handler - */ + */ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -420,7 +417,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); else ret = ia32_setup_frame(sig, ka, oldset, regs); - } else + } else #endif ret = setup_rt_frame(sig, ka, info, oldset, regs); @@ -448,9 +445,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ptrace_notify(SIGTRAP); spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked,¤t->blocked,&ka->sa.sa_mask); + sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked,sig); + sigaddset(¤t->blocked, sig); recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); } @@ -552,14 +549,15 @@ void do_notify_resume(struct pt_regs *regs, void *unused, } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -{ - struct task_struct *me = current; +{ + struct task_struct *me = current; if (show_unhandled_signals && printk_ratelimit()) { printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm,me->pid,where,frame,regs->ip,regs->sp,regs->orig_ax); + me->comm, me->pid, where, frame, regs->ip, + regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); printk("\n"); } - force_sig(SIGSEGV, me); -} + force_sig(SIGSEGV, me); +} -- cgit v1.2.2 From 4df9e510a9fda29aca71d8acac853b98aa6884d1 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:55 -0300 Subject: x86: coding style fixes to arch/x86/kernel/traps_64.c Fix all errors and many warnings reported by checkpath.pl. Except the change of include to the traps.o before and after changes are the same. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 59 +++++++++++++++++++++++++--------------------- 1 file changed, 32 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 3f18d73f420c..fe36d96ba70b 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #if defined(CONFIG_EDAC) #include @@ -45,9 +47,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -85,7 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) void printk_address(unsigned long address, int reliable) { - printk(" [<%016lx>] %s%pS\n", address, reliable ? "": "? ", (void *) address); + printk(" [<%016lx>] %s%pS\n", address, reliable ? + "" : "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -98,7 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, [STACKFAULT_STACK - 1] = "#SS", [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" + [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / + EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; unsigned k; @@ -163,7 +164,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, } /* - * x86-64 can have up to three kernel stacks: + * x86-64 can have up to three kernel stacks: * process stack * interrupt stack * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack @@ -219,7 +220,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, const struct stacktrace_ops *ops, void *data) { const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; unsigned used = 0; struct thread_info *tinfo; @@ -237,7 +238,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) :); + asm("movq %%rbp, %0" : "=r" (bp) : ); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -357,11 +358,13 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; const int cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + unsigned long *irqstack_end = + (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = + (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - // debugging aid: "show_stack(NULL, NULL);" prints the - // back trace for this cpu. + /* debugging aid: "show_stack(NULL, NULL);" prints the + back trace for this cpu. */ if (sp == NULL) { if (task) @@ -404,7 +407,7 @@ void dump_stack(void) #ifdef CONFIG_FRAME_POINTER if (!bp) - asm("movq %%rbp, %0" : "=r" (bp):); + asm("movq %%rbp, %0" : "=r" (bp) : ); #endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", @@ -414,7 +417,6 @@ void dump_stack(void) init_utsname()->version); show_trace(NULL, NULL, &stack, bp); } - EXPORT_SYMBOL(dump_stack); void show_registers(struct pt_regs *regs) @@ -493,7 +495,7 @@ unsigned __kprobes long oops_begin(void) raw_local_irq_save(flags); cpu = smp_processor_id(); if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) + if (cpu == die_owner) /* nested oops. should stop eventually */; else __raw_spin_lock(&die_lock); @@ -638,7 +640,7 @@ kernel_trap: } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -648,7 +650,7 @@ asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs * regs, long error_code) \ +asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -683,7 +685,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); } -asmlinkage void do_double_fault(struct pt_regs * regs, long error_code) +asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; @@ -778,9 +780,10 @@ io_check_error(unsigned char reason, struct pt_regs *regs) } static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs * regs) +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) return; printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); @@ -882,7 +885,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) else if (user_mode(eregs)) regs = task_pt_regs(current); /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ + kernel process stack. */ else if (eregs->flags & X86_EFLAGS_IF) regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); if (eregs != regs) @@ -891,7 +894,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) } /* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs * regs, +asmlinkage void __kprobes do_debug(struct pt_regs *regs, unsigned long error_code) { struct task_struct *tsk = current; @@ -1035,7 +1038,7 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) asmlinkage void bad_intr(void) { - printk("bad interrupt"); + printk("bad interrupt"); } asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) @@ -1047,7 +1050,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) conditional_sti(regs); if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) + kernel_math_error(regs, "kernel simd math error", 19)) return; /* @@ -1092,7 +1095,7 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs * regs) +asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs) { } @@ -1142,8 +1145,10 @@ void __init trap_init(void) set_intr_gate(0, ÷_error); set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); - set_system_gate_ist(3, &int3, DEBUG_STACK); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ + /* int3 can be called from all */ + set_system_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_gate(4, &overflow); set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); -- cgit v1.2.2 From e9c8abb66cc37801bdb5d4360bb78d180c3bbb73 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Tue, 29 Jul 2008 02:48:56 -0300 Subject: x86: coding style fixes to arch/x86/kernel/sys_x86_64.c Fix all errors and many warnings reported by checkpatch.pl without change sys_x86_64.o arch/x86/kernel/sys_x86_64.o: text data bss dec hex filename 1567 0 0 1567 61f sys_x86_64.o.after 1567 0 0 1567 61f sys_x86_64.o.before md5: de28ffedcb5851dfd7ec87a03afec1fd sys_x86_64.o.after de28ffedcb5851dfd7ec87a03afec1fd sys_x86_64.o.before Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/sys_x86_64.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 3b360ef33817..56eb8f916e9f 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -13,15 +13,16 @@ #include #include #include +#include -#include #include -asmlinkage long sys_mmap(unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long off) +asmlinkage long sys_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long off) { long error; - struct file * file; + struct file *file; error = -EINVAL; if (off & ~PAGE_MASK) @@ -56,9 +57,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, unmapped base down for this case. This can give conflicts with the heap, but we assume that glibc malloc knows how to fall back to mmap. Give it 1GB - of playground for now. -AK */ - *begin = 0x40000000; - *end = 0x80000000; + of playground for now. -AK */ + *begin = 0x40000000; + *end = 0x80000000; if (current->flags & PF_RANDOMIZE) { new_begin = randomize_range(*begin, *begin + 0x02000000, 0); if (new_begin) @@ -66,9 +67,9 @@ static void find_start_end(unsigned long flags, unsigned long *begin, } } else { *begin = TASK_UNMAPPED_BASE; - *end = TASK_SIZE; + *end = TASK_SIZE; } -} +} unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, @@ -78,11 +79,11 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; unsigned long start_addr; unsigned long begin, end; - + if (flags & MAP_FIXED) return addr; - find_start_end(flags, &begin, &end); + find_start_end(flags, &begin, &end); if (len > end) return -ENOMEM; @@ -96,12 +97,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } if (((flags & MAP_32BIT) || test_thread_flag(TIF_IA32)) && len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; + mm->cached_hole_size = 0; mm->free_area_cache = begin; } addr = mm->free_area_cache; - if (addr < begin) - addr = begin; + if (addr < begin) + addr = begin; start_addr = addr; full_search: @@ -127,7 +128,7 @@ full_search: return addr; } if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; + mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; } @@ -177,7 +178,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr-len); if (!vma || addr <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr-len); + return mm->free_area_cache = addr-len; } if (mm->mmap_base < len) @@ -194,7 +195,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, vma = find_vma(mm, addr); if (!vma || addr+len <= vma->vm_start) /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); + return mm->free_area_cache = addr; /* remember the largest hole we saw so far */ if (addr + mm->cached_hole_size < vma->vm_start) @@ -224,13 +225,13 @@ bottomup: } -asmlinkage long sys_uname(struct new_utsname __user * name) +asmlinkage long sys_uname(struct new_utsname __user *name) { int err; down_read(&uts_sem); - err = copy_to_user(name, utsname(), sizeof (*name)); + err = copy_to_user(name, utsname(), sizeof(*name)); up_read(&uts_sem); - if (personality(current->personality) == PER_LINUX32) - err |= copy_to_user(&name->machine, "i686", 5); + if (personality(current->personality) == PER_LINUX32) + err |= copy_to_user(&name->machine, "i686", 5); return err ? -EFAULT : 0; } -- cgit v1.2.2 From 64a76f667d987a559ad0726b4692c987800b22bc Mon Sep 17 00:00:00 2001 From: David Brownell Date: Tue, 29 Jul 2008 12:47:38 -0700 Subject: hpet: /dev/hpet - fixes and cleanup Minor /dev/hpet updates and bugfixes: * Remove dead code, mostly remnants of an incomplete/unusable kernel interface ... noted when addressing "sparse" warnings: + hpet_unregister() and a routine it calls + hpet_task and all references, including hpet_task_lock + hpet_data.hd_flags (and HPET_DATA_PLATFORM) * Correct and improve boot message: + displays *counter* (shared between comparators) bit width, not *timer* bit widths (which are often mixed) + relabel "timers" as "comparators"; this is less confusing, they are not independent like normal timers are (sigh) + display MHz not Hz; it's never less than 10 MHz. * Tighten and correct the userspace interface code + don't accidentally program comparators in 64-bit mode using 32-bit values ... always force comparators into 32-bit mode + provide the correct bit definition flagging comparators with periodic capability ... the ABI is unchanged * Update Documentation/hpet.txt + be more correct and current + expand description a bit + don't mention that now-gone kernel interface Plus, add a FIXME comment for something that could cause big trouble on systems with more capable HPETs than at least Intel seems to ship. It seems that few folk use this userspace interface; it's not very usable given the general lack of HPET IRQ routing. I'm told that the only real point of it any more is to mmap for fast timestamps; IMO that's handled better through the gettimeofday() vsyscall. Signed-off-by: David Brownell Acked-by: Clemens Ladisch Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad2b15a1334d..82d459186fd8 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -115,13 +115,17 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_phys_address = hpet_address; hd.hd_address = hpet; hd.hd_nirqs = nrtimers; - hd.hd_flags = HPET_DATA_PLATFORM; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC hpet_reserve_timer(&hd, 1); #endif + /* + * NOTE that hd_irq[] reflects IOAPIC input pins (LEGACY_8254 + * is wrong for i8259!) not the output IRQ. Many BIOS writers + * don't bother configuring *any* comparator interrupts. + */ hd.hd_irq[0] = HPET_LEGACY_8254; hd.hd_irq[1] = HPET_LEGACY_RTC; -- cgit v1.2.2 From a677f58a8c8c541bf7d02c658545084040f3708d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 29 Jul 2008 00:37:10 -0700 Subject: x86: print per_cpu data address to make sure per_cpu data on correct node. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index f7745f94c006..61f3966632a8 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -180,9 +180,16 @@ void __init setup_per_cpu_areas(void) printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); + if (ptr) + printk(KERN_DEBUG "per cpu data for cpu%d at %016lx\n", + cpu, __pa(ptr)); } - else + else { ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + if (ptr) + printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", + cpu, node, __pa(ptr)); + } #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); -- cgit v1.2.2 From a0ac87d61ba20932e54f08464d3f3439d78fa878 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:04 +0200 Subject: x86: AMD microcode patch loader style corrections Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 07e52beacb44..6789530ef33b 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -47,9 +47,9 @@ MODULE_LICENSE("GPL v2"); #define UCODE_UCODE_TYPE 0x00000001 #define UCODE_MAX_SIZE (2048) -#define DEFAULT_UCODE_DATASIZE (896) /* 896 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) /* 64 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 960 bytes */ +#define DEFAULT_UCODE_DATASIZE (896) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) #define DWSIZE (sizeof(u32)) /* For now we support a fixed ucode total size only */ #define get_totalsize(mc) \ -- cgit v1.2.2 From f516526febb75500f280def2108688d388df4e1e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:05 +0200 Subject: x86: Intel microcode patch loader style corrections Signed-off-by: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 6da4a85ff465..a61986e8b691 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -55,8 +55,8 @@ * in a single CPU package. * 1.10 28 Feb 2002 Asit K Mallick and * Tigran Aivazian , - * Serialize updates as required on HT processors due to speculative - * nature of implementation. + * Serialize updates as required on HT processors due to + * speculative nature of implementation. * 1.11 22 Mar 2002 Tigran Aivazian * Fix the panic when writing zero-length microcode chunk. * 1.12 29 Sep 2003 Nitin Kamble , @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -//#define DEBUG /* pr_debug */ +/* #define DEBUG */ /* pr_debug */ #include #include #include @@ -99,11 +99,11 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); -#define DEFAULT_UCODE_DATASIZE (2000) /* 2000 bytes */ -#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) /* 48 bytes */ -#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) /* 2048 bytes */ -#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) /* 20 bytes */ -#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) /* 12 bytes */ +#define DEFAULT_UCODE_DATASIZE (2000) +#define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) +#define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) +#define EXT_HEADER_SIZE (sizeof(struct extended_sigtable)) +#define EXT_SIGNATURE_SIZE (sizeof(struct extended_signature)) #define DWSIZE (sizeof(u32)) #define get_totalsize(mc) \ (((struct microcode_intel *)mc)->hdr.totalsize ? \ -- cgit v1.2.2 From 831f9bd3151ebd22959a0cb2a20b44a06b26d4ed Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:06 +0200 Subject: x86: moved function declarations out from AMD microcode patch loader to heade file Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 6789530ef33b..2b98e6ab1bf9 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -56,9 +56,6 @@ MODULE_LICENSE("GPL v2"); ((((struct microcode_amd *)mc)->hdr.mc_patch_data_len * 28) \ + MC_HEADER_SIZE) -extern int microcode_init(void *opaque, struct module *module); -extern void microcode_exit(void); - /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -- cgit v1.2.2 From daa9c0fee1cbe1d49fbe29af3d4bb16312043b1e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Tue, 29 Jul 2008 17:41:07 +0200 Subject: x86: minor pointer type cast in AMD microcode patch loader Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 2b98e6ab1bf9..33b2a217a8c5 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -394,7 +394,8 @@ static int cpu_request_microcode_amd(int cpu) return error; } - buf_pos = buf = firmware->data; + buf_pos = (unsigned int *)firmware->data; + buf = (void *)firmware->data; size = firmware->size; if (buf_pos[0] != UCODE_MAGIC) { -- cgit v1.2.2 From 7ab6af7ab69df8c9c5fbc380004fb81187742096 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 30 Jul 2008 17:36:48 -0700 Subject: x86_32: use apic_ops at print_local_APIC() Use apic_icr_read at print_local_APIC() in io_apic_32.c Signed-off-by: Hiroshi Shimamoto Cc: Suresh Siddha Cc: Yinghai Lu Signed-off-by: Ingo Molnar Cc: Suresh Siddha Cc: Yinghai Lu --- arch/x86/kernel/io_apic_32.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 98e4db5373f3..39e063a9a28a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1486,6 +1486,7 @@ static void print_APIC_bitfield(int base) void /*__init*/ print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1536,10 +1537,9 @@ void /*__init*/ print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC ESR: %08x\n", v); } - v = apic_read(APIC_ICR); - printk(KERN_DEBUG "... APIC ICR: %08x\n", v); - v = apic_read(APIC_ICR2); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", v); + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -- cgit v1.2.2 From e0d22d03c06c4e2c194d7010bc1e4a972199f156 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Thu, 31 Jul 2008 23:43:44 +0200 Subject: x86: fdiv bug detection fix The fdiv detection code writes s32 integer into the boot_cpu_data.fdiv_bug. However, the boot_cpu_data.fdiv_bug is only char (s8) field so the detection overwrites already set fields for other bugs, e.g. the f00f bug field. Use local s32 variable to receive result. This is a partial fix to Bugzilla #9928 - fixes wrong information about the f00f bug (tested) and probably for coma bug (I have no cpu to test this). Signed-off-by: Krzysztof Helt Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/bugs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c9b58a806e85..c8e315f1aa83 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -50,6 +50,8 @@ static double __initdata y = 3145727.0; */ static void __init check_fpu(void) { + s32 fdiv_bug; + if (!boot_cpu_data.hard_math) { #ifndef CONFIG_MATH_EMULATION printk(KERN_EMERG "No coprocessor found and no math emulation present.\n"); @@ -74,8 +76,10 @@ static void __init check_fpu(void) "fistpl %0\n\t" "fwait\n\t" "fninit" - : "=m" (*&boot_cpu_data.fdiv_bug) + : "=m" (*&fdiv_bug) : "m" (*&x), "m" (*&y)); + + boot_cpu_data.fdiv_bug = fdiv_bug; if (boot_cpu_data.fdiv_bug) printk("Hmm, FPU with FDIV bug.\n"); } -- cgit v1.2.2 From ec983f7060cd73e14cdd3edd910339127a8a4e96 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 30 Jul 2008 12:05:03 -0400 Subject: [CPUFREQ] Remove EXPERIMENTAL annotation from VIA C7 powersaver kconfig. This has been pretty solid, and doesn't see much change at all. Noticed by Harald Welte. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig index cb7a5715596d..efae3b22a0ff 100644 --- a/arch/x86/kernel/cpu/cpufreq/Kconfig +++ b/arch/x86/kernel/cpu/cpufreq/Kconfig @@ -235,9 +235,9 @@ config X86_LONGHAUL If in doubt, say N. config X86_E_POWERSAVER - tristate "VIA C7 Enhanced PowerSaver (EXPERIMENTAL)" + tristate "VIA C7 Enhanced PowerSaver" select CPU_FREQ_TABLE - depends on X86_32 && EXPERIMENTAL + depends on X86_32 help This adds the CPUFreq driver for VIA C7 processors. -- cgit v1.2.2 From 460f5ef2835dbc33825f611f408eb09c29be4b85 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 30 Jul 2008 13:01:42 -0400 Subject: [CPUFREQ] Fix warning in elanfreq arch/x86/kernel/cpu/cpufreq/elanfreq.c:47:26: warning: symbol 'elan_multiplier' was not declared. Should it be static? Yes, yes it should. Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index 94619c22f563..e4a4bf870e94 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -44,7 +44,7 @@ struct s_elan_multiplier { * It is important that the frequencies * are listed in ascending order here! */ -struct s_elan_multiplier elan_multiplier[] = { +static struct s_elan_multiplier elan_multiplier[] = { {1000, 0x02, 0x18}, {2000, 0x02, 0x10}, {4000, 0x02, 0x08}, -- cgit v1.2.2 From 23431b495fcbbde6d09aba30019b6a224d543cf9 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 31 Jul 2008 12:39:05 -0500 Subject: [CPUFREQ][1/2] whitespace fix for powernow-k8 Trivial whitespace fix for powernow-k8. Signed-off-by: Mark Langsdorf Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index c45ca6d4dce1..84bb395038d8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -66,7 +66,6 @@ static u32 find_freq_from_fid(u32 fid) return 800 + (fid * 100); } - /* Return a frequency in KHz, given an input fid */ static u32 find_khz_freq_from_fid(u32 fid) { @@ -78,7 +77,6 @@ static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data, u32 p return data[pstate].frequency; } - /* Return the vco fid for an input fid * * Each "low" fid has corresponding "high" fid, and you can get to "low" fids @@ -166,7 +164,6 @@ static void fidvid_msr_init(void) wrmsr(MSR_FIDVID_CTL, lo, hi); } - /* write the new fid value along with the other control fields to the msr */ static int write_new_fid(struct powernow_k8_data *data, u32 fid) { -- cgit v1.2.2 From 34ae7f35a21694aa5cb8829dc5142c39d73d6ba0 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 31 Jul 2008 12:39:12 -0500 Subject: [CPUFREQ][2/2] preregister support for powernow-k8 This patch provides support for the _PSD ACPI object in the Powernow-k8 driver. Although it looks like an invasive patch, most of it is simply the consequence of turning the static acpi_performance_data structure into a pointer. AMD has tested it on several machines over the past few days without issue. [trivial checkpatch warnings fixed up by davej] [X86_POWERNOW_K8_ACPI=n buildfix from Randy Dunlap] Signed-off-by: Mark Langsdorf Tested-by: Frank Arnold Signed-off-by: Randy Dunlap Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 109 ++++++++++++++++++++---------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 +- 2 files changed, 75 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d8..4e7271999a74 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -737,44 +737,63 @@ static int find_psb_table(struct powernow_k8_data *data) #ifdef CONFIG_X86_POWERNOW_K8_ACPI static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) + if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) return; - data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; + data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; +} + + +static struct acpi_processor_performance *acpi_perf_data; +static int preregister_valid; + +static int powernow_k8_cpu_preinit_acpi(void) +{ + acpi_perf_data = alloc_percpu(struct acpi_processor_performance); + if (!acpi_perf_data) + return -ENODEV; + + if (acpi_processor_preregister_performance(acpi_perf_data)) + return -ENODEV; + else + preregister_valid = 1; + return 0; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val; + int cpu = 0; - if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { + data->acpi_data = percpu_ptr(acpi_perf_data, cpu); + if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ - if (data->acpi_data.state_count <= 1) { + if (data->acpi_data->state_count <= 1) { dprintk("No ACPI P-States\n"); goto err_out; } - if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data.control_register.space_id, - data->acpi_data.status_register.space_id); + data->acpi_data->control_register.space_id, + data->acpi_data->status_register.space_id); goto err_out; } /* fill in data->powernow_table */ powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data.state_count + 1)), GFP_KERNEL); + * (data->acpi_data->state_count + 1)), GFP_KERNEL); if (!powernow_table) { dprintk("powernow_table memory alloc failure\n"); goto err_out; @@ -787,12 +806,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if (ret_val) goto err_out_mem; - powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; - powernow_table[data->acpi_data.state_count].index = 0; + powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data->state_count].index = 0; data->powernow_table = powernow_table; /* fill in data */ - data->numps = data->acpi_data.state_count; + data->numps = data->acpi_data->state_count; if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -800,16 +819,31 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); + /* determine affinity, from ACPI if available */ + if (preregister_valid) { + if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || + (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) + data->starting_core_affinity = data->acpi_data->shared_cpu_map; + else + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + } else { + /* best guess from family if not */ + if (cpu_family == CPU_HW_PSTATE) + data->starting_core_affinity = cpumask_of_cpu(data->cpu); + else + data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu); + } + return 0; err_out_mem: kfree(powernow_table); err_out: - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + acpi_processor_unregister_performance(data->acpi_data, data->cpu); /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ - data->acpi_data.state_count = 0; + data->acpi_data->state_count = 0; return -ENODEV; } @@ -821,10 +855,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 index; - index = data->acpi_data.states[i].control & HW_PSTATE_MASK; + index = data->acpi_data->states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); @@ -840,7 +874,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf powernow_table[i].index = index; - powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; + powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; } return 0; } @@ -849,16 +883,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf { int i; int cntlofreq = 0; - for (i = 0; i < data->acpi_data.state_count; i++) { + for (i = 0; i < data->acpi_data->state_count; i++) { u32 fid; u32 vid; if (data->exttype) { - fid = data->acpi_data.states[i].status & EXT_FID_MASK; - vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; + fid = data->acpi_data->states[i].status & EXT_FID_MASK; + vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; } else { - fid = data->acpi_data.states[i].control & FID_MASK; - vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; + fid = data->acpi_data->states[i].control & FID_MASK; + vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -899,10 +933,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf cntlofreq = i; } - if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -912,11 +946,12 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { - if (data->acpi_data.state_count) - acpi_processor_unregister_performance(&data->acpi_data, data->cpu); + if (data->acpi_data->state_count) + acpi_processor_unregister_performance(data->acpi_data, data->cpu); } #else +static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } @@ -1101,7 +1136,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol) static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; - cpumask_t oldmask; + cpumask_t oldmask = CPU_MASK_ALL; int rc; if (!cpu_online(pol->cpu)) @@ -1174,10 +1209,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* run on any CPU again */ set_cpus_allowed_ptr(current, &oldmask); - if (cpu_family == CPU_HW_PSTATE) - pol->cpus = cpumask_of_cpu(pol->cpu); - else - pol->cpus = per_cpu(cpu_core_map, pol->cpu); + pol->cpus = data->starting_core_affinity; data->available_cores = &(pol->cpus); /* Take a crude guess here. @@ -1300,6 +1332,7 @@ static int __cpuinit powernowk8_init(void) } if (supported_cpus == num_online_cpus()) { + powernow_k8_cpu_preinit_acpi(); printk(KERN_INFO PFX "Found %d %s " "processors (%d cpu cores) (" VERSION ")\n", num_online_nodes(), @@ -1316,6 +1349,10 @@ static void __exit powernowk8_exit(void) dprintk("exit\n"); cpufreq_unregister_driver(&cpufreq_amd64_driver); + +#ifdef CONFIG_X86_POWERNOW_K8_ACPI + free_percpu(acpi_perf_data); +#endif } MODULE_AUTHOR("Paul Devriendt and Mark Langsdorf "); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index ab48cfed4d96..a62612cd4be8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -33,12 +33,13 @@ struct powernow_k8_data { #ifdef CONFIG_X86_POWERNOW_K8_ACPI /* the acpi table needs to be kept. it's only available if ACPI was * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance acpi_data; + struct acpi_processor_performance *acpi_data; #endif /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ cpumask_t *available_cores; + cpumask_t starting_core_affinity; }; -- cgit v1.2.2 From 31343d8a5079cda57ffd539fcf4f00cea344fe98 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Fri, 8 Aug 2008 12:15:57 -0700 Subject: x86: Fix broken VMI in 2.6.27-rc.. The lowmem mapping table created by VMI need not depend on max_low_pfn at all. Instead we now create an extra large mapping which covers all possible lowmem instead of the physical ram that is actually available. This allows the vmi initialization to be done before max_low_pfn could be computed. We also move the vmi_init code very early in the boot process so that nobody accidentally breaks the fixmap dependancy. Signed-off-by: Alok N Kataria Acked-by: Zachary Amsden Signed-off-by: H. Peter Anvin --- arch/x86/kernel/setup.c | 16 ++++++++-------- arch/x86/kernel/vmi_32.c | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2d888586385d..6e5823b9e765 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -604,6 +604,14 @@ void __init setup_arch(char **cmdline_p) early_cpu_init(); early_ioremap_init(); +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; @@ -817,14 +825,6 @@ void __init setup_arch(char **cmdline_p) kvmclock_init(); #endif -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be after max_low_pfn is determined, and before kernel - * pagetables are setup. - */ - vmi_init(); -#endif - paravirt_pagetable_setup_start(swapper_pg_dir); paging_init(); paravirt_pagetable_setup_done(swapper_pg_dir); diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 0a1b1a9d922d..6ca515d6db54 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -37,6 +37,7 @@ #include #include #include +#include /* Convenient for calling VMI functions indirectly in the ROM */ typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void); @@ -683,7 +684,7 @@ void vmi_bringup(void) { /* We must establish the lowmem mapping for MMU ops to work */ if (vmi_ops.set_linear_mapping) - vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, max_low_pfn, 0); + vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0); } /* -- cgit v1.2.2 From 90936cfe6c8f7e90a6f8b0c5cb44d3a012dfd313 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:07:44 +0200 Subject: x86, tsc: fix section mismatch warning WARNING: vmlinux.o(.text+0x7950): Section mismatch in reference from the function native_calibrate_tsc() to the function .init.text:tsc_read_refs() The function native_calibrate_tsc() references the function __init tsc_read_refs(). This is often because native_calibrate_tsc lacks a __init annotation or the annotation of tsc_read_refs is wrong. tsc_read_refs is called from native_calibrate_tsc which is not __init and native_calibrate_tsc cannot be marked __init Signed-off-by: Marcin Slusarz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7603c0553909..46af71676738 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read TSC and the reference counters. Take care of SMI disturbance */ -static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *pm, u64 *hpet) { u64 t1, t2; int i; -- cgit v1.2.2 From 85a14437ed24244c78f9a70d58b8299753b03c92 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:12:37 +0200 Subject: x86: fix MP_processor_info section mismatch warning WARNING: arch/x86/kernel/built-in.o(.cpuinit.text+0x1fe7): Section mismatch in reference from the function MP_processor_info() to the variable .init.data:x86_quirks The function __cpuinit MP_processor_info() references a variable __initdata x86_quirks. If x86_quirks is only used by MP_processor_info then annotate x86_quirks with a matching annotation. MP_processor_info uses x86_quirks which is __init and is used only from smp_read_mpc and construct_default_ISA_mptable which are __init Signed-off-by: Marcin Slusarz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6ae005ccaed8..78509ee8cc74 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -49,7 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int apicid; char *bootup_cpu = ""; -- cgit v1.2.2 From bafc1dae8215c862c2e6ae913ddadc20581e59b9 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:11:13 +0200 Subject: x86: mmconf: fix section mismatch warning WARNING: arch/x86/kernel/built-in.o(.cpuinit.text+0x1591): Section mismatch in reference from the function init_amd() to the function .init.text:check_enable_amd_mmconf_dmi() The function __cpuinit init_amd() references a function __init check_enable_amd_mmconf_dmi(). If check_enable_amd_mmconf_dmi is only used by init_amd then annotate check_enable_amd_mmconf_dmi with a matching annotation. check_enable_amd_mmconf_dmi is only called from init_amd which is __cpuinit Signed-off-by: Marcin Slusarz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index fdfdc550b366..efc2f361fe85 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { {} }; -void __init check_enable_amd_mmconf_dmi(void) +void __cpuinit check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } -- cgit v1.2.2 From d406d21d90dce2e66c7eb4a44605aac947fe55fb Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:09:38 +0200 Subject: x86: mpparse.c: fix section mismatch warning WARNING: vmlinux.o(.text+0x118f7): Section mismatch in reference from the function construct_ioapic_table() to the function .init.text:MP_bus_info() The function construct_ioapic_table() references the function __init MP_bus_info(). This is often because construct_ioapic_table lacks a __init annotation or the annotation of MP_bus_info is wrong. construct_ioapic_table is called only from construct_default_ISA_mptable which is __init Signed-off-by: Marcin Slusarz Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Signed-off-by: H. Peter Anvin --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 78509ee8cc74..2c1963b39621 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -484,7 +484,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) } -static void construct_ioapic_table(int mpc_default_type) +static void __init construct_ioapic_table(int mpc_default_type) { struct mpc_config_ioapic ioapic; struct mpc_config_bus bus; @@ -529,7 +529,7 @@ static void construct_ioapic_table(int mpc_default_type) construct_default_ioirq_mptable(mpc_default_type); } #else -static inline void construct_ioapic_table(int mpc_default_type) { } +static inline void __init construct_ioapic_table(int mpc_default_type) { } #endif static inline void __init construct_default_ISA_mptable(int mpc_default_type) -- cgit v1.2.2 From d388e5fdc461344d04307a3fa83862b9ed429647 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 9 Aug 2008 15:09:02 -0700 Subject: x86: Restore proper vector locking during cpu hotplug Having cpu_online_map change during assign_irq_vector can result in some really nasty and weird things happening. The one that bit me last time was accessing non existent per cpu memory for non existent cpus. This locking was removed in a sloppy x86_64 and x86_32 merge patch. Guys can we please try and avoid subtly breaking x86 when we are merging files together? Signed-off-by: Eric W. Biederman Signed-off-by: H. Peter Anvin --- arch/x86/kernel/io_apic_32.c | 6 +----- arch/x86/kernel/io_apic_64.c | 25 +++++++++++++++---------- arch/x86/kernel/smpboot.c | 12 +++++++++--- 3 files changed, 25 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index de9aa0e3a9c5..09cddb57bec4 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -57,7 +57,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); +DEFINE_SPINLOCK(vector_lock); int timer_through_8259 __initdata; @@ -1209,10 +1209,6 @@ static int assign_irq_vector(int irq) return vector; } -void setup_vector_irq(int cpu) -{ -} - static struct irq_chip ioapic_chip; #define IOAPIC_AUTO -1 diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8269434d1707..61a83b70c18f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -101,7 +101,7 @@ int timer_through_8259 __initdata; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); /* * # of IRQ routing registers @@ -697,6 +697,19 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + static int __assign_irq_vector(int irq, cpumask_t mask) { /* @@ -802,7 +815,7 @@ static void __clear_irq_vector(int irq) cpus_clear(cfg->domain); } -static void __setup_vector_irq(int cpu) +void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ @@ -825,14 +838,6 @@ static void __setup_vector_irq(int cpu) } } -void setup_vector_irq(int cpu) -{ - spin_lock(&vector_lock); - __setup_vector_irq(smp_processor_id()); - spin_unlock(&vector_lock); -} - - static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 332512767f4f..da10f07fc59c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -326,12 +326,16 @@ static void __cpuinit start_secondary(void *unused) * for which cpus receive the IPI. Holding this * lock helps us to not include this cpu in a currently in progress * smp_call_function(). + * + * We need to hold vector_lock so there the set of online cpus + * does not change while we are assigning vectors to cpus. Holding + * this lock ensures we don't half assign or remove an irq from a cpu. */ ipi_call_lock_irq(); -#ifdef CONFIG_X86_IO_APIC - setup_vector_irq(smp_processor_id()); -#endif + lock_vector_lock(); + __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); + unlock_vector_lock(); ipi_call_unlock_irq(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; @@ -1336,7 +1340,9 @@ int __cpu_disable(void) remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ + lock_vector_lock(); remove_cpu_from_maps(cpu); + unlock_vector_lock(); fixup_irqs(cpu_online_map); return 0; } -- cgit v1.2.2 From 4c942654a4514d7d0a9b592a7d1b198a212e8a03 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sun, 10 Aug 2008 20:57:45 +0800 Subject: arch/x86/kernel/acpi/boot.c: removed duplicated #include Removed duplicated include file in arch/x86/kernel/acpi/boot.c. Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fa88a1d71290..d8d118935b08 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -58,7 +58,6 @@ EXPORT_SYMBOL(acpi_disabled); #ifdef CONFIG_X86_64 #include -#include #else /* X86 */ -- cgit v1.2.2 From 8aeb4022633f7d0eca5e13a9622bd73df92bbf2a Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sun, 10 Aug 2008 21:09:22 +0800 Subject: arch/x86/kernel/cpuid.c: removed duplicated #include Removed duplicated include file in arch/x86/kernel/cpuid.c. Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpuid.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 14b11b3be31c..3fa4e926b510 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.2 From 2ae111cdd8d83ebf9de72e36e68a8c84b6ebbeea Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 11 Aug 2008 18:34:08 +0400 Subject: x86: apic interrupts - move assignments to irqinit_32.c, v2 64bit mode APIC interrupt handlers are set within irqinit_64.c. Lets do tha same for 32bit mode which would help in furter code merging. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 48 ------------------------------------------ arch/x86/kernel/irqinit_32.c | 49 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/mach-default/setup.c | 15 ------------- 3 files changed, 49 insertions(+), 63 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f93c18f5b79d..9e341c9d9414 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1354,54 +1354,6 @@ void smp_error_interrupt(struct pt_regs *regs) irq_exit(); } -#ifdef CONFIG_SMP -void __init smp_intr_init(void) -{ - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); - - /* - * The reschedule interrupt is a CPU-to-CPU reschedule-helper - * IPI, driven by wakeup. - */ - alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); - - /* IPI for invalidation */ - alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); - - /* IPI for generic function call */ - alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); - - /* IPI for single call function */ - set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, - call_function_single_interrupt); -} -#endif - -/* - * Initialize APIC interrupts - */ -void __init apic_intr_init(void) -{ -#ifdef CONFIG_SMP - smp_intr_init(); -#endif - /* self generated IPI for local APIC timer */ - alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); - - /* IPI vectors for APIC spurious and error interrupts */ - alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); - alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); - - /* thermal monitor LVT interrupt */ -#ifdef CONFIG_X86_MCE_P4THERMAL - alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -} - /** * connect_bsp_APIC - attach the APIC to the interrupt system */ diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index d66914287ee1..9200a1e2752d 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -74,6 +74,15 @@ void __init init_ISA_irqs (void) } } +/* + * IRQ2 is cascade interrupt to second interrupt controller + */ +static struct irqaction irq2 = { + .handler = no_action, + .mask = CPU_MASK_NONE, + .name = "cascade", +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -98,6 +107,46 @@ void __init native_init_IRQ(void) set_intr_gate(vector, interrupt[i]); } +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) + /* + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ + set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + alloc_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); + + /* IPI for single call function */ + set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); +#endif + +#ifdef CONFIG_X86_LOCAL_APIC + /* self generated IPI for local APIC timer */ + alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); + + /* IPI vectors for APIC spurious and error interrupts */ + alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); + alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +#endif + +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) + /* thermal monitor LVT interrupt */ + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif + + if (!acpi_ioapic) + setup_irq(2, &irq2); + /* setup after call gates are initialised (usually add in * the architecture specific gates) */ diff --git a/arch/x86/mach-default/setup.c b/arch/x86/mach-default/setup.c index 3d317836be9e..b00f5ad10ca1 100644 --- a/arch/x86/mach-default/setup.c +++ b/arch/x86/mach-default/setup.c @@ -36,15 +36,6 @@ void __init pre_intr_init_hook(void) init_ISA_irqs(); } -/* - * IRQ2 is cascade interrupt to second interrupt controller - */ -static struct irqaction irq2 = { - .handler = no_action, - .mask = CPU_MASK_NONE, - .name = "cascade", -}; - /** * intr_init_hook - post gate setup interrupt initialisation * @@ -60,12 +51,6 @@ void __init intr_init_hook(void) if (x86_quirks->arch_intr_init()) return; } -#ifdef CONFIG_X86_LOCAL_APIC - apic_intr_init(); -#endif - - if (!acpi_ioapic) - setup_irq(2, &irq2); } /** -- cgit v1.2.2 From 7c13e6a3d15a4ebcc3f40df5f4d19665479f8ca3 Mon Sep 17 00:00:00 2001 From: Dimitri Sivanich Date: Mon, 11 Aug 2008 10:46:46 -0500 Subject: x86: remove EXPERIMENTAL restriction from CONFIG_HOTPLUG_CPU This removes the EXPERIMENTAL restriction from CONFIG_HOTPLUG_CPU on the x86 architecture. Signed-off-by: Dimitri Sivanich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3d0f2b6a5a16..7917962ab7ff 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1371,14 +1371,14 @@ config PHYSICAL_ALIGN Don't change this unless you know what you are doing. config HOTPLUG_CPU - bool "Support for suspend on SMP and hot-pluggable CPUs (EXPERIMENTAL)" - depends on SMP && HOTPLUG && EXPERIMENTAL && !X86_VOYAGER + bool "Support for hot-pluggable CPUs" + depends on SMP && HOTPLUG && !X86_VOYAGER ---help--- - Say Y here to experiment with turning CPUs off and on, and to - enable suspend on SMP systems. CPUs can be controlled through - /sys/devices/system/cpu. - Say N if you want to disable CPU hotplug and don't need to - suspend. + Say Y here to allow turning CPUs off and on. CPUs can be + controlled through /sys/devices/system/cpu. + ( Note: power management support will enable this option + automatically on SMP systems. ) + Say N if you want to disable CPU hotplug. config COMPAT_VDSO def_bool y -- cgit v1.2.2 From eeb0d7d113895556db473ff1e638803d7d49bff9 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Mon, 11 Aug 2008 17:44:57 +0200 Subject: x86, debug: tone down arch/x86/kernel/mpparse.c debugging printk commit 11a62a056093a7f25f1595fbd8bd5f93559572b6 turns some formerly nopped debugging printks in arch/x86/kernel/mppparse.c into regular ones. The one at the top of smp_scan_config() in particular also prints on !CONFIG_SMP/CONFIG_X86_LOCAL_APIC kernels and UP machines without anything resembling MP tables which makes their lowly UP owners wonder... Turn the former Dprintk()s into apic_printk()s instead meaning that their printing is dependent on passing the apic=verbose (or =debug) command line param. On 32-bit, "apic" is a __setup() param which isn't early enough for this code and therefore needs a followup changing it into an early_param(). On 64-bit, it already is. Signed-off-by: Rene Herman Cc: Andrew Morton Cc: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6ae005ccaed8..678090508a62 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -83,7 +83,7 @@ static void __init MP_bus_info(struct mpc_config_bus *m) if (x86_quirks->mpc_oem_bus_info) x86_quirks->mpc_oem_bus_info(m, str); else - printk(KERN_INFO "Bus #%d is %s\n", m->mpc_busid, str); + apic_printk(APIC_VERBOSE, "Bus #%d is %s\n", m->mpc_busid, str); #if MAX_MP_BUSSES < 256 if (m->mpc_busid >= MAX_MP_BUSSES) { @@ -154,7 +154,7 @@ static void __init MP_ioapic_info(struct mpc_config_ioapic *m) static void print_MP_intsrc_info(struct mpc_config_intsrc *m) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbus, @@ -163,7 +163,7 @@ static void print_MP_intsrc_info(struct mpc_config_intsrc *m) static void __init print_mp_irq_info(struct mp_config_intsrc *mp_irq) { - printk(KERN_CONT "Int: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC INT %02x\n", mp_irq->mp_irqtype, mp_irq->mp_irqflag & 3, (mp_irq->mp_irqflag >> 2) & 3, mp_irq->mp_srcbus, @@ -235,7 +235,7 @@ static void __init MP_intsrc_info(struct mpc_config_intsrc *m) static void __init MP_lintsrc_info(struct mpc_config_lintsrc *m) { - printk(KERN_INFO "Lint: type %d, pol %d, trig %d, bus %02x," + apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," " IRQ %02x, APIC ID %x, APIC LINT %02x\n", m->mpc_irqtype, m->mpc_irqflag & 3, (m->mpc_irqflag >> 2) & 3, m->mpc_srcbusid, @@ -695,7 +695,8 @@ static int __init smp_scan_config(unsigned long base, unsigned long length, unsigned int *bp = phys_to_virt(base); struct intel_mp_floating *mpf; - printk(KERN_DEBUG "Scan SMP from %p for %ld bytes.\n", bp, length); + apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", + bp, length); BUILD_BUG_ON(sizeof(*mpf) != 16); while (length > 0) { -- cgit v1.2.2 From fb6bef8002d54a9c3062abc281893ec7c896d3ce Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Mon, 11 Aug 2008 17:45:53 +0200 Subject: x86: make "apic" an early_param() on 32-bit On 32-bit, "apic" is a __setup() param meaning it is parsed rather late in the game. Make it an early_param() for apic_printk() use by arch/x86/kernel/mpparse.c. On 64-bit, it already is an early_param(). Signed-off-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d6c898358371..f432d4833a12 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1726,9 +1726,9 @@ static int __init apic_set_verbosity(char *str) apic_verbosity = APIC_DEBUG; else if (strcmp("verbose", str) == 0) apic_verbosity = APIC_VERBOSE; - return 1; + return 0; } -__setup("apic=", apic_set_verbosity); +early_param("apic", apic_set_verbosity); static int __init lapic_insert_resource(void) { -- cgit v1.2.2 From cf3e50501259f9a7cb108a69c3e1b912135628f6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 8 Aug 2008 13:46:07 -0700 Subject: x86: work around gcc 3.4.x bug Simon Horman reported that gcc-3.4.x crashes when compiling pgd_prepopulate_pmd() when PREALLOCATED_PMDS == 0 and CONFIG_DEBUG_INFO is enabled. Adding an extra check for PREALLOCATED_PMDS == 0 [which is compiled out by gcc] seems to avoid the problem. Reported-by: Simon Horman Signed-off-by: Jeremy Fitzhardinge Acked-by: Simon Horman Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 557b2abceef8..d50302774fe2 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -207,6 +207,9 @@ static void pgd_prepopulate_pmd(struct mm_struct *mm, pgd_t *pgd, pmd_t *pmds[]) unsigned long addr; int i; + if (PREALLOCATED_PMDS == 0) /* Work around gcc-3.4.x bug */ + return; + pud = pud_offset(pgd, 0); for (addr = i = 0; i < PREALLOCATED_PMDS; -- cgit v1.2.2 From 9b0094f7f289d752868aae1a3984517d712cbb22 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Aug 2008 15:14:55 -0700 Subject: x86, pci-calgary: fix function declaration Fix function declaration: linux-next-20080807/arch/x86/kernel/pci-calgary_64.c:1353:36: warning: non-ANSI function declaration of function 'get_tce_space_from_tar' Signed-off-by: Randy Dunlap Acked-by: Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index b67a4b1d4eae..02d19328525d 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1350,7 +1350,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) * Function for kdump case. Get the tce tables from first kernel * by reading the contents of the base adress register of calgary iommu */ -static void get_tce_space_from_tar() +static void get_tce_space_from_tar(void) { int bus; void __iomem *target; -- cgit v1.2.2 From 48d97cb65e62a5f1122ac2cf1149800d4f4693e8 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Mon, 11 Aug 2008 19:20:17 +0200 Subject: x86: make "apic" an early_param() on 32-bit, NULL check Cyrill Gorcunov observed: > you turned it into early_param so now it's NULL injecting vulnerabled. > Could you please add checking for NULL str param? fix that. Also, change the name of 'str' into 'arg', to make it more apparent that this is an optional argument that can be NULL, not a string parameter that is empty when unset. Reported-by: Cyrill Gorcunov Signed-off-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f432d4833a12..039a8d4aaf62 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1720,12 +1720,16 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static int __init apic_set_verbosity(char *str) +static int __init apic_set_verbosity(char *arg) { - if (strcmp("debug", str) == 0) + if (!arg) + return -EINVAL; + + if (strcmp(arg, "debug") == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) + else if (strcmp(arg, "verbose") == 0) apic_verbosity = APIC_VERBOSE; + return 0; } early_param("apic", apic_set_verbosity); -- cgit v1.2.2 From 012f09e7942716d5f4339f1fd9a831a485bb1d4a Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 16:23:08 +0200 Subject: x86: compile pat debugfs interface only if CONFIG_X86_PAT is set Recently I've run a kernel w/o PAT support and wondered why there was a file "x86/pat_memtype_list" in debugfs. Of course it's empty if PAT is disabled ... so just get rid of this interface if PAT is disabled. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2fe30916d4b6..647b1c4de719 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -492,7 +492,7 @@ void unmap_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) free_memtype(addr, addr + size); } -#if defined(CONFIG_DEBUG_FS) +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_X86_PAT) /* get Nth element of the linked list */ static struct memtype *memtype_get_idx(loff_t pos) @@ -576,4 +576,4 @@ static int __init pat_memtype_list_init(void) late_initcall(pat_memtype_list_init); -#endif /* CONFIG_DEBUG_FS */ +#endif /* CONFIG_DEBUG_FS && CONFIG_X86_PAT */ -- cgit v1.2.2 From 59f09ba2b62e6f89beeb4c8fc2c83fe14321dda9 Mon Sep 17 00:00:00 2001 From: Philipp Kohlbecher Date: Wed, 6 Aug 2008 15:25:26 +0200 Subject: x86: fix comment in protected mode header Comments in arch/x86/boot/compressed/head_32.S erroneously refer to the real mode pointer as the second and the heap area as the third argument to decompress_kernel(). In fact, these have been the first and second argument, respectively, since v2.6.20. This patch corrects the comments. It introduces no code changes. Signed-off-by: Philipp Kohlbecher Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_32.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index ba7736cf2ec7..29c5fbf08392 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -137,14 +137,15 @@ relocated: */ movl output_len(%ebx), %eax pushl %eax + # push arguments for decompress_kernel: pushl %ebp # output address movl input_len(%ebx), %eax pushl %eax # input_len leal input_data(%ebx), %eax pushl %eax # input_data leal boot_heap(%ebx), %eax - pushl %eax # heap area as third argument - pushl %esi # real mode pointer as second arg + pushl %eax # heap area + pushl %esi # real mode pointer call decompress_kernel addl $20, %esp popl %ecx -- cgit v1.2.2 From 49800efcb17afdf973f33e8aa8807b7f83993cc6 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:27:30 +0200 Subject: x86: pda_init(): fix memory leak when using CPU hotplug pda->irqstackptr is allocated whenever a CPU is set online. But it is never freed. This results in a memory leak of 16K for each CPU offline/online cycle. Fix is to allocate pda->irqstackptr only once. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 6f9b8924bdc0..8396fd7628a9 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -493,17 +493,20 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + -- cgit v1.2.2 From b55793f7528ce1b73c25b3ac8a86a6cda2a0f9a4 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:29:37 +0200 Subject: x86: cpu_init(): fix memory leak when using CPU hotplug Exception stacks are allocated each time a CPU is set online. But the allocated space is never freed. Thus with one CPU hotplug offline/online cycle there is a memory leak of 24K (6 pages) for a CPU. Fix is to allocate exception stacks only once -- when the CPU is set online for the first time. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 8396fd7628a9..cc6efe86249d 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -606,19 +606,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -- cgit v1.2.2 From 99809963c99e1ed868d9ebeb4a5e7ee1cbe0309f Mon Sep 17 00:00:00 2001 From: Jeff Chua Date: Wed, 6 Aug 2008 19:09:53 +0800 Subject: x86: make sparsemem more available With CONFIG_X86_PC, I can set CONFIG_SPARSEMEM=y. With CONFIG_X86_GENERICARCH, CONFIG_SPARSEMEM depends on CONFIG_NUMA. I'm using the patch below to enable sparsemem instead of flatmem. System booted and is running. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7917962ab7ff..e2305c7d881b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1035,7 +1035,7 @@ config HAVE_ARCH_ALLOC_REMAP config ARCH_FLATMEM_ENABLE def_bool y - depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && X86_PC && !NUMA + depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA config ARCH_DISCONTIGMEM_ENABLE def_bool y @@ -1051,7 +1051,7 @@ config ARCH_SPARSEMEM_DEFAULT config ARCH_SPARSEMEM_ENABLE def_bool y - depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) + depends on X86_64 || NUMA || (EXPERIMENTAL && X86_PC) || X86_GENERICARCH select SPARSEMEM_STATIC if X86_32 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 -- cgit v1.2.2 From b74548e76a0eab1f29546e7c5a589429c069a680 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 11 Aug 2008 13:36:04 -0700 Subject: x86: fix 2.6.27rc1 cannot boot more than 8CPUs Jeff Chua reported that booting a !bigsmp kernel on a 16-way box hangs silently. this is a long-standing issue, smp start AP cpu could check the apic id >=8 etc before trying to start it. achieve this by moving the def_to_bigsmp check later and skip the apicid id > 8 [ mingo@elte.hu: clean up the message that is printed. ] Reported-by: "Jeff Chua" Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar arch/x86/kernel/setup.c | 6 ------ arch/x86/kernel/smpboot.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) --- arch/x86/kernel/setup.c | 6 ------ arch/x86/kernel/smpboot.c | 10 ++++++++++ 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 6e5823b9e765..68b48e3fbcbd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -861,12 +861,6 @@ void __init setup_arch(char **cmdline_p) init_apic_mappings(); ioapic_init_mappings(); -#if defined(CONFIG_SMP) && defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) - if (def_to_bigsmp) - printk(KERN_WARNING "More than 8 CPUs detected and " - "CONFIG_X86_PC cannot handle it.\nUse " - "CONFIG_X86_GENERICARCH or CONFIG_X86_BIGSMP.\n"); -#endif kvm_guest_init(); e820_reserve_resources(); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index da10f07fc59c..91055d7fc1b0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -994,7 +994,17 @@ int __cpuinit native_cpu_up(unsigned int cpu) flush_tlb_all(); low_mappings = 1; +#ifdef CONFIG_X86_PC + if (def_to_bigsmp && apicid > 8) { + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" + "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); + err = -1; + } else + err = do_boot_cpu(apicid, cpu); +#else err = do_boot_cpu(apicid, cpu); +#endif zap_low_mappings(); low_mappings = 0; -- cgit v1.2.2 From 9dd1e9eb5cd6f79d4efda57db4e26dfa31ff9ae5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Aug 2008 23:11:05 +0200 Subject: x86/PCI: allow scanning of 255 PCI busses Fix an old off by one error in the legacy PCI bus check. 0xff is a valid bus. Signed-off-by: Andi Kleen Signed-off-by: Jesse Barnes --- arch/x86/pci/legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index ec9ce35e44d6..b722dd481b39 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -14,7 +14,7 @@ static void __devinit pcibios_fixup_peer_bridges(void) int n, devfn; long node; - if (pcibios_last_bus <= 0 || pcibios_last_bus >= 0xff) + if (pcibios_last_bus <= 0 || pcibios_last_bus > 0xff) return; DBG("PCI: Peer bridge fixup\n"); -- cgit v1.2.2 From 912985dce45ef18fcdd9f5439fef054e0e22302a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 12 Aug 2008 17:52:52 -0500 Subject: mm: Make generic weak get_user_pages_fast and EXPORT_GPL it Out of line get_user_pages_fast fallback implementation, make it a weak symbol, get rid of CONFIG_HAVE_GET_USER_PAGES_FAST. Export the symbol to modules so lguest can use it. Signed-off-by: Nick Piggin Signed-off-by: Rusty Russell --- arch/x86/Kconfig | 1 - arch/x86/mm/Makefile | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3d0f2b6a5a16..ac2fb0641a04 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -22,7 +22,6 @@ config X86 select HAVE_IDE select HAVE_OPROFILE select HAVE_IOREMAP_PROT - select HAVE_GET_USER_PAGES_FAST select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 2977ea37791f..dfb932dcf136 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,7 +1,6 @@ obj-y := init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o + pat.o pgtable.o gup.o -obj-$(CONFIG_HAVE_GET_USER_PAGES_FAST) += gup.o obj-$(CONFIG_X86_32) += pgtable_32.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -- cgit v1.2.2 From 7b27718bdb1b70166383dec91391df5534d449ee Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 13 Aug 2008 10:07:05 +0200 Subject: x86: fix setup code crashes on my old 486 box yesterday I tried to reactivate my old 486 box and wanted to install a current Linux with latest kernel on it. But it turned out that the latest kernel does not boot because the machine crashes early in the setup code. After some debugging it turned out that the problem is the query_ist() function. If this interrupt with that function is called the machine simply locks up. It looks like a BIOS bug. Looking for a workaround for this problem I wrote the attached patch. It checks for the CPUID instruction and if it is not implemented it does not call the speedstep BIOS function. As far as I know speedstep should be available since some Pentium earliest. Alan Cox observed that it's available since the Pentium II, so cpuid levels 4 and 5 can be excluded altogether. H. Peter Anvin cleaned up the code some more: > Right in concept, but I dislike the implementation (duplication of the > CPU detect code we already have). Could you try this patch and see if > it works for you? which, with a small modification to fix a build error with it the resulting kernel boots on my machine. Signed-off-by: Joerg Roedel Signed-off-by: "H. Peter Anvin" Cc: Signed-off-by: Ingo Molnar --- arch/x86/boot/boot.h | 8 ++++++++ arch/x86/boot/cpucheck.c | 8 +------- arch/x86/boot/main.c | 4 ++++ 3 files changed, 13 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index a34b9982c7cb..9d4b4b43d97a 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -25,6 +25,8 @@ #include #include +#define NCAPINTS 8 + /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -242,6 +244,12 @@ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); /* cpu.c, cpucheck.c */ +struct cpu_features { + int level; /* Family, or 64 for x86-64 */ + int model; + u32 flags[NCAPINTS]; +}; +extern struct cpu_features cpu; int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); int validate_cpu(void); diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 7804389ee005..c1ce0303d994 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -30,13 +30,7 @@ #include #include -struct cpu_features { - int level; /* Family, or 64 for x86-64 */ - int model; - u32 flags[NCAPINTS]; -}; - -static struct cpu_features cpu; +struct cpu_features cpu; static u32 cpu_vendor[3]; static u32 err_flags[NCAPINTS]; diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 2296164b54d2..01aa64b5575b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -73,6 +73,10 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + /* Some 486 BIOSes apparently crash on this call */ + if (cpu.level < 6) + return; + asm("int $0x15" : "=a" (boot_params.ist_info.signature), "=b" (boot_params.ist_info.command), -- cgit v1.2.2 From c9d08f0860d47ed6a3fe91d0f19335086179be7b Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 12 Aug 2008 23:23:03 +0200 Subject: x86: fix 2 section mismatch warnings - map_high() WARNING: vmlinux.o(.text+0x14cf8): Section mismatch in reference from the function map_high() to the function .init.text:init_extra_mapping_uc() The function map_high() references the function __init init_extra_mapping_uc(). This is often because map_high lacks a __init annotation or the annotation of init_extra_mapping_uc is wrong. WARNING: vmlinux.o(.text+0x14d05): Section mismatch in reference from the function map_high() to the function .init.text:init_extra_mapping_wb() The function map_high() references the function __init init_extra_mapping_wb(). This is often because map_high lacks a __init annotation or the annotation of init_extra_mapping_wb is wrong. map_high is called only from __init functions (map_*_high) and calls 2 __init_functions (init_extra_mapping_*) Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2cfcbded888a..2d7e307c7779 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -222,7 +222,7 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) { unsigned long bytes, paddr; -- cgit v1.2.2 From 6b3560229d3b6be7443fa9f9c6502e660bcfef5f Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Tue, 12 Aug 2008 23:23:05 +0200 Subject: x86: fix 2 section mismatch warnings - find_and_reserve_crashkernel WARNING: vmlinux.o(.text+0xcd1f): Section mismatch in reference from the function find_and_reserve_crashkernel() to the function .init.text:find_e820_area() The function find_and_reserve_crashkernel() references the function __init find_e820_area(). This is often because find_and_reserve_crashkernel lacks a __init annotation or the annotation of find_e820_area is wrong. WARNING: vmlinux.o(.text+0xcd38): Section mismatch in reference from the function find_and_reserve_crashkernel() to the function .init.text:reserve_bootmem_generic() The function find_and_reserve_crashkernel() references the function __init reserve_bootmem_generic(). This is often because find_and_reserve_crashkernel lacks a __init annotation or the annotation of reserve_bootmem_generic is wrong. find_and_reserve_crashkernel is called from __init function (reserve_crashkernel) and calls 2 __init functions (find_e820_area, reserve_bootmem_generic), so mark it __init Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 68b48e3fbcbd..a4656adab53b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -445,7 +445,7 @@ static void __init reserve_early_setup_data(void) * @size: Size of the crashkernel memory to reserve. * Returns the base address on success, and -1ULL on failure. */ -unsigned long long find_and_reserve_crashkernel(unsigned long long size) +unsigned long long __init find_and_reserve_crashkernel(unsigned long long size) { const unsigned long long alignment = 16<<20; /* 16M */ unsigned long long start = 0LL; -- cgit v1.2.2 From a726c6009e6eba4acfccf8b683854866eeabb184 Mon Sep 17 00:00:00 2001 From: John Keller Date: Tue, 29 Jul 2008 14:34:16 -0500 Subject: x86: allow MMCONFIG above 4GB on x86_64 SGI UV will have MMCFG base addresses that are greater than 4GB (32 bits). v2: Use CONFIG_RESOURCES_64BIT instead of CONFIG_X86_64. v3: Create a flag, that is set by platform specific code, to disable the > 4GB check. Signed-off-by: John Keller Cc: jpk@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fa88a1d71290..bfd10fd211cd 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -97,6 +97,8 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -158,6 +160,14 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; +static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) +{ + if (!strcmp(mcfg->header.oem_id, "SGI")) + acpi_mcfg_64bit_base_addr = TRUE; + + return 0; +} + int __init acpi_parse_mcfg(struct acpi_table_header *header) { struct acpi_table_mcfg *mcfg; @@ -190,8 +200,12 @@ int __init acpi_parse_mcfg(struct acpi_table_header *header) } memcpy(pci_mmcfg_config, &mcfg[1], config_size); + + acpi_mcfg_oem_check(mcfg); + for (i = 0; i < pci_mmcfg_config_num; ++i) { - if (pci_mmcfg_config[i].address > 0xFFFFFFFF) { + if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) && + !acpi_mcfg_64bit_base_addr) { printk(KERN_ERR PREFIX "MMCONFIG not in low 4GB of memory\n"); kfree(pci_mmcfg_config); -- cgit v1.2.2 From 34a1b9fc4995102ecbb3a980b348aebf168d8196 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 12 Aug 2008 13:25:44 +0100 Subject: Fix date output in x86 microcode driver. The microcode stores its date in a uint32_t in some weird order approximating pdp-endian. Rather than printing it like that, print it properly in ISO standard form. Signed-off-by: David Woodhouse Cc: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_intel.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index a61986e8b691..d2d9d74f4cbb 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -339,8 +339,11 @@ static void apply_microcode(int cpu) return; } printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %08x \n", - cpu_num, uci->rev, val[1], uci->mc.mc_intel->hdr.date); + "0x%x to 0x%x, date = %04x-%02x-%02x \n", + cpu_num, uci->rev, val[1], + uci->mc.mc_intel->hdr.date & 0xffff, + uci->mc.mc_intel->hdr.date >> 24, + (uci->mc.mc_intel->hdr.date >> 16) & 0xff); uci->rev = val[1]; } -- cgit v1.2.2 From 875e40b97571e1f06d1184ad6cbb2acf9cb31a23 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 30 Jul 2008 12:26:26 -0700 Subject: x86: use WARN() in arch/x86/mm/pageattr.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Cc: akpm@linux-foundation.org Cc: arjan@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 3 +-- arch/x86/mm/pageattr.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 0dcd42eb94e6..d4aa503caaa2 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -221,8 +221,7 @@ static int pageattr_test(void) failed += print_split(&sc); if (failed) { - printk(KERN_ERR "NOT PASSED. Please report.\n"); - WARN_ON(1); + WARN(1, KERN_ERR "NOT PASSED. Please report.\n"); return -EINVAL; } else { if (print) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf059..ba24537d69dd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -592,10 +592,9 @@ repeat: if (!pte_val(old_pte)) { if (!primary) return 0; - printk(KERN_WARNING "CPA: called for zero pte. " + WARN(1, KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, cpa->vaddr); - WARN_ON(1); return -EINVAL; } -- cgit v1.2.2 From c2dcfde8274883e1f6050784dcbd34b01e824b91 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 13 Aug 2008 13:14:22 -0700 Subject: x86: cleanup for setup code crashes during IST probe Clean up the code for crashes during SpeedStep probing on older machines. Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/boot/boot.h | 4 ++-- arch/x86/boot/cpu.c | 3 --- arch/x86/boot/cpucheck.c | 2 -- arch/x86/boot/main.c | 3 ++- 4 files changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 9d4b4b43d97a..616b804a2295 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -24,8 +24,8 @@ #include #include #include - -#define NCAPINTS 8 +#include "bitops.h" +#include /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 92d6fd73dc7d..75298fe2edca 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -16,9 +16,6 @@ */ #include "boot.h" -#include "bitops.h" -#include - #include "cpustr.h" static char *cpu_name(int level) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index c1ce0303d994..4b9ae7c56748 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -22,10 +22,8 @@ #ifdef _SETUP # include "boot.h" -# include "bitops.h" #endif #include -#include #include #include #include diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 01aa64b5575b..197421db1af1 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -73,7 +73,8 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { - /* Some 486 BIOSes apparently crash on this call */ + /* Some older BIOSes apparently crash on this call, so filter + it from machines too old to have SpeedStep at all. */ if (cpu.level < 6) return; -- cgit v1.2.2 From ee2b92a8201a40021ecd1aee6f0625dc03bacc54 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Aug 2008 11:38:13 -0700 Subject: x86, xsave: remove the redundant access_ok() in setup_rt_frame() save_i387_xstate() is already doing the required access_ok(). Remove the redundant access_ok() before it. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 2621b98f5bf6..6c581698ab56 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -208,9 +208,6 @@ static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = (void __user *)round_down( (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; - if (!access_ok(VERIFY_WRITE, fp, sizeof(struct _fpstate))) - goto give_sigsegv; - if (save_i387_xstate(fp) < 0) err |= -1; } else -- cgit v1.2.2 From ed405958057ca6a8c4c9178a7a3b1167fabb45f5 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Aug 2008 11:38:14 -0700 Subject: x86, xsave: clear the user buffer before doing fxsave/xsave fxsave/xsave instructions will not touch all the bytes in the fxsave/xsave frame. Clear the user buffer before doing fxsave/xsave directly to user buffer during the sigcontext setup. This is essentially needed in the context of xsave(for example, some of the fields in the xsave header are not touched by the xsave and defined as must be zero). This will also present uniform and clean context to the user (from which user can safely do fxrstor/xrstor). Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 7415f3e38a51..bb097b1644d8 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -92,6 +92,12 @@ int save_i387_xstate(void __user *buf) return 0; clear_used_math(); /* trigger finit */ if (task_thread_info(tsk)->status & TS_USEDFPU) { + /* + * Start with clearing the user buffer. This will present a + * clean context for the bytes not touched by the fxsave/xsave. + */ + __clear_user(buf, sig_xstate_size); + if (task_thread_info(tsk)->status & TS_XSAVE) err = xsave_user(buf); else -- cgit v1.2.2 From f65bc214e042916135256620f900e9599d65e0cb Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 13 Aug 2008 11:38:15 -0700 Subject: x86, xsave: use BUG_ON() instead of BUILD_BUG_ON() All these structure sizes are runtime determined. So use a runtime bug check. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index bb097b1644d8..07713d64debe 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -82,8 +82,7 @@ int save_i387_xstate(void __user *buf) if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) return -EACCES; - BUILD_BUG_ON(sizeof(struct user_i387_struct) != - sizeof(tsk->thread.xstate->fxsave)); + BUG_ON(sig_xstate_size < xstate_size); if ((unsigned long)buf % 64) printk("save_i387_xstate: bad fpstate %p\n", buf); -- cgit v1.2.2 From 23b49c19f6946cc33392a1fc75dd788dd4a90fb7 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 11 Aug 2008 14:55:31 -0700 Subject: x86: resurrect proper handling of maxcpus= kernel option (v2) For some reason we had two parsers registered for maxcpus=. One in init/main.c and another in arch/x86/smpboot.c. So I nuked the one in arch/x86. Also 64-bit kernels used to handle maxcpus= as documented in Documentation/cpu-hotplug.txt. CPUs with 'id > maxcpus' are initialized but not booted. 32-bit version for some reason ignored them even though all the infrastructure for booting them later is there. In the current mainline both 64 and 32 bit versions are broken. This patch restores the correct behaviour. I've tested x86_64 version on 4- and 8- way Core2 and 2-way Opteron based machines. Various config combinations SMP, !SMP, CPU_HOTPLUG, !CPU_HOTPLUG. Booted with maxcpus=1 and maxcpus=4, etc. Everything is working as expected. So far we've received two reports from different people confirming that 32-bit version also works fine, both on dual core laptops and 16way server machines. [v2: This version fixes visws breakage pointed out by Ingo.] Signed-off-by: Max Krasnyansky Cc: lizf@cn.fujitsu.com Cc: jeff.chua.linux@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 8 -------- arch/x86/kernel/apic_64.c | 7 ------- arch/x86/kernel/smpboot.c | 14 -------------- arch/x86/kernel/visws_quirks.c | 6 ++---- 4 files changed, 2 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 039a8d4aaf62..f88bd0d982b0 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1454,8 +1454,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) } } -unsigned int __cpuinitdata maxcpus = NR_CPUS; - void __cpuinit generic_processor_info(int apicid, int version) { int cpu; @@ -1482,12 +1480,6 @@ void __cpuinit generic_processor_info(int apicid, int version) return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 7f1f030da7ee..446c062e831c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -90,7 +90,6 @@ static unsigned long apic_phys; unsigned long mp_lapic_addr; -unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version */ @@ -1062,12 +1061,6 @@ void __cpuinit generic_processor_info(int apicid, int version) return; } - if (num_processors >= maxcpus) { - printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); - return; - } - num_processors++; cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 91055d7fc1b0..e25287e4a85f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1386,17 +1386,3 @@ void __cpu_die(unsigned int cpu) BUG(); } #endif - -/* - * If the BIOS enumerates physical processors before logical, - * maxcpus=N at enumeration-time can be used to disable HT. - */ -static int __init parse_maxcpus(char *arg) -{ - extern unsigned int maxcpus; - - if (arg) - maxcpus = simple_strtoul(arg, NULL, 0); - return 0; -} -early_param("maxcpus", parse_maxcpus); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 41e01b145c48..594ef47f0a63 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -184,8 +184,6 @@ static int __init visws_get_smp_config(unsigned int early) return 1; } -extern unsigned int __cpuinitdata maxcpus; - /* * The Visual Workstation is Intel MP compliant in the hardware * sense, but it doesn't have a BIOS(-configuration table). @@ -244,8 +242,8 @@ static int __init visws_find_smp_config(unsigned int reserve) ncpus = CO_CPU_MAX; } - if (ncpus > maxcpus) - ncpus = maxcpus; + if (ncpus > setup_max_cpus) + ncpus = setup_max_cpus; #ifdef CONFIG_X86_LOCAL_APIC smp_found_config = 1; -- cgit v1.2.2 From 858f774733b72609acb28104475f131abb912c08 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 14 Aug 2008 02:16:29 -0700 Subject: x86: don't call e820_regiter_active_regions if out of range on node so we don't get warning on 32bit system with 64g RAM or more Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_32.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c index 1eb2973a301c..16ae70fc57e7 100644 --- a/arch/x86/mm/srat_32.c +++ b/arch/x86/mm/srat_32.c @@ -178,7 +178,7 @@ void acpi_numa_arch_fixup(void) * start of the node, and that the current "end" address is after * the previous one. */ -static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) +static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk) { /* * Only add present memory as told by the e820. @@ -189,10 +189,10 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (memory_chunk->start_pfn >= max_pfn) { printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n", memory_chunk->start_pfn, memory_chunk->end_pfn); - return; + return -1; } if (memory_chunk->nid != nid) - return; + return -1; if (!node_has_online_mem(nid)) node_start_pfn[nid] = memory_chunk->start_pfn; @@ -202,6 +202,8 @@ static __init void node_read_chunk(int nid, struct node_memory_chunk_s *memory_c if (node_end_pfn[nid] < memory_chunk->end_pfn) node_end_pfn[nid] = memory_chunk->end_pfn; + + return 0; } int __init get_memcfg_from_srat(void) @@ -259,7 +261,9 @@ int __init get_memcfg_from_srat(void) printk(KERN_DEBUG "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n", j, chunk->nid, chunk->start_pfn, chunk->end_pfn); - node_read_chunk(chunk->nid, chunk); + if (node_read_chunk(chunk->nid, chunk)) + continue; + e820_register_active_regions(chunk->nid, chunk->start_pfn, min(chunk->end_pfn, max_pfn)); } -- cgit v1.2.2 From a58f03b07539f6575adaa011712fa139c9343742 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 14 Aug 2008 02:16:30 -0700 Subject: x86: check bigsmp in smp_sanity_check instead of cpu_up clear bits for cpu nr > 8. This allows us to boot the full range of possible CPUs that the supported APIC model will allow. Previously we'd hang or boot up with less than 8 CPUs. Signed-off-by: Yinghai Lu Tested-by: Jeff Chua Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e25287e4a85f..a8fb8a980fae 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -994,17 +994,7 @@ int __cpuinit native_cpu_up(unsigned int cpu) flush_tlb_all(); low_mappings = 1; -#ifdef CONFIG_X86_PC - if (def_to_bigsmp && apicid > 8) { - printk(KERN_WARNING - "More than 8 CPUs detected - skipping them.\n" - "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); - err = -1; - } else - err = do_boot_cpu(apicid, cpu); -#else err = do_boot_cpu(apicid, cpu); -#endif zap_low_mappings(); low_mappings = 0; @@ -1058,6 +1048,34 @@ static __init void disable_smp(void) static int __init smp_sanity_check(unsigned max_cpus) { preempt_disable(); + +#if defined(CONFIG_X86_PC) && defined(CONFIG_X86_32) + if (def_to_bigsmp && nr_cpu_ids > 8) { + unsigned int cpu; + unsigned nr; + + printk(KERN_WARNING + "More than 8 CPUs detected - skipping them.\n" + "Use CONFIG_X86_GENERICARCH and CONFIG_X86_BIGSMP.\n"); + + nr = 0; + for_each_present_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_present_map); + nr++; + } + + nr = 0; + for_each_possible_cpu(cpu) { + if (nr >= 8) + cpu_clear(cpu, cpu_possible_map); + nr++; + } + + nr_cpu_ids = 8; + } +#endif + if (!physid_isset(hard_smp_processor_id(), phys_cpu_present_map)) { printk(KERN_WARNING "weird, boot CPU (#%d) not listed" "by the BIOS.\n", hard_smp_processor_id()); -- cgit v1.2.2 From a6825f1c1fa83b1e92b6715ee5771a4d6524d3b9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 14 Aug 2008 12:17:06 +0200 Subject: x86: hpet: workaround SB700 BIOS AMD SB700 based systems with spread spectrum enabled use a SMM based HPET emulation to provide proper frequency setting. The SMM code is initialized with the first HPET register access and takes some time to complete. During this time the config register reads 0xffffffff. We check for max. 1000 loops whether the config register reads a non 0xffffffff value to make sure that HPET is up and running before we go further. A counting loop is safe, as the HPET access takes thousands of CPU cycles. On non SB700 based machines this check is only done once and has no side effects. Based on a quirk patch from: crane cai Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index ad2b15a1334d..59fd3b6b1303 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -359,6 +359,7 @@ static int hpet_clocksource_register(void) int __init hpet_enable(void) { unsigned long id; + int i; if (!is_hpet_capable()) return 0; @@ -369,6 +370,29 @@ int __init hpet_enable(void) * Read the period and check for a sane value: */ hpet_period = hpet_readl(HPET_PERIOD); + + /* + * AMD SB700 based systems with spread spectrum enabled use a + * SMM based HPET emulation to provide proper frequency + * setting. The SMM code is initialized with the first HPET + * register access and takes some time to complete. During + * this time the config register reads 0xffffffff. We check + * for max. 1000 loops whether the config register reads a non + * 0xffffffff value to make sure that HPET is up and running + * before we go further. A counting loop is safe, as the HPET + * access takes thousands of CPU cycles. On non SB700 based + * machines this check is only done once and has no side + * effects. + */ + for (i = 0; hpet_readl(HPET_CFG) == 0xFFFFFFFF; i++) { + if (i == 1000) { + printk(KERN_WARNING + "HPET config register value = 0xFFFFFFFF. " + "Disabling HPET\n"); + goto out_nohpet; + } + } + if (hpet_period < HPET_MIN_PERIOD || hpet_period > HPET_MAX_PERIOD) goto out_nohpet; -- cgit v1.2.2 From 967060d00d7ab8e992963a966cd3d18156c02d55 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 14 Aug 2008 15:43:33 -0700 Subject: x86, msr: fix NULL pointer deref due to msr_open on nonexistent CPUs msr_open tests for someone trying to open a device for a nonexistent CPU. However, the function always returns 0, not ret like it should, hence userspace can BUG the kernel trivially. This bug was introduced by the cdev lock_kernel pushdown patch last May. The BUG can be reproduced with these commands: # mknod fubar c 202 8 <-- pick a number less than NR_CPUS that is not the number of an online CPU # cat fubar Signed-off-by: Darrick J. Wong Signed-off-by: Ingo Molnar --- arch/x86/kernel/msr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 9fd809552447..e43938086885 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -131,7 +131,7 @@ static int msr_open(struct inode *inode, struct file *file) ret = -EIO; /* MSR not supported */ out: unlock_kernel(); - return 0; + return ret; } /* -- cgit v1.2.2 From ed4e5ec177d20504c51aebb93db12d57716cde9c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:20 +0200 Subject: x86: apic - use SET_APIC_DEST_FIELD instead of hardcoded shift Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 69a876be506f..77c5e5eb820e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -150,7 +150,7 @@ u32 safe_xapic_wait_icr_idle(void) void xapic_icr_write(u32 low, u32 id) { - apic_write(APIC_ICR2, id << 24); + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); apic_write(APIC_ICR, low); } -- cgit v1.2.2 From 36fef0949c14445d231a68663e8a01860f7caca3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:20 +0200 Subject: x86: apic - unify disable_apic_timer Get rid of local_apic_timer_disabled and use disable_apic_timer instead. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 17 ++++++++++++----- arch/x86/kernel/apic_64.c | 16 +++++++++++----- 2 files changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 12b154822bce..6af20dd12c96 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -63,7 +63,7 @@ int disable_apic; /* Local APIC timer verification ok */ static int local_apic_timer_verify_ok; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int local_apic_timer_disabled; +static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -574,7 +574,7 @@ void __init setup_boot_APIC_clock(void) * timer as a dummy clock event source on SMP systems, so the * broadcast mechanism is used. On UP systems simply ignore it. */ - if (local_apic_timer_disabled) { + if (disable_apic_timer) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; @@ -1699,12 +1699,19 @@ static int __init parse_nolapic(char *arg) } early_param("nolapic", parse_nolapic); -static int __init parse_disable_lapic_timer(char *arg) +static int __init parse_disable_apic_timer(char *arg) { - local_apic_timer_disabled = 1; + disable_apic_timer = 1; return 0; } -early_param("nolapic_timer", parse_disable_lapic_timer); +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); static int __init parse_lapic_timer_c2_ok(char *arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 77c5e5eb820e..1e925be861e4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -45,6 +45,7 @@ #include #include +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; int disable_apic; @@ -1583,14 +1584,19 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) +static int __init parse_disable_apic_timer(char *arg) { - if (str[0] != ' ' && str[0] != 0) - return 0; disable_apic_timer = 1; - return 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; } -__setup("noapictimer", setup_noapictimer); +early_param("nolapic_timer", parse_nolapic_timer); static __init int setup_apicpmtimer(char *s) { -- cgit v1.2.2 From f07f4f9046121ac803bc2f0ded3d77b7c2ab481b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:21 +0200 Subject: x86: apic - unify __setup_APIC_LVTT To be able to unify this function we RE-introduce APIC_DIVISOR for 64bit mode. This snipped was eliminated in some time ago in a sake of clenup but now we need it again since it allow up to get rid of #ifdef(s). And lapic_is_integrated call is added in apic_64.c but since we always have APIC integrated on 64bit cpu compiler will ignore this call. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 1e925be861e4..ac399d0f65c0 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -244,6 +244,9 @@ int lapic_get_maxlvt(void) return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } +/* Clock divisor is set to 1 */ +#define APIC_DIVISOR 1 + /* * This function sets up the local APIC timer, with a timeout of * 'clocks' APIC bus clock. During calibration we actually call @@ -262,6 +265,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) lvtt_value = LOCAL_TIMER_VECTOR; if (!oneshot) lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + if (!irqen) lvtt_value |= APIC_LVT_MASKED; @@ -276,7 +282,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) | APIC_TDR_DIV_16); if (!oneshot) - apic_write(APIC_TMICT, clocks); + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } /* @@ -446,7 +452,7 @@ static int __init calibrate_APIC_clock(void) lapic_clockevent.min_delta_ns = clockevent_delta2ns(0xF, &lapic_clockevent); - calibration_result = result / HZ; + calibration_result = (result * APIC_DIVISOR) / HZ; /* * Do a sanity check on the APIC calibration result -- cgit v1.2.2 From 9ce122c6e55c44ae9a4c4c777579b87d83e7f898 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:21 +0200 Subject: x86: apic - do not clear APIC twice in lapic_shutdown There is no need to clear APIC twice since disable_local_APIC will clear it anyway. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6af20dd12c96..a151d66f948c 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -830,10 +830,11 @@ void lapic_shutdown(void) return; local_irq_save(flags); - clear_local_APIC(); if (enabled_via_apicbase) disable_local_APIC(); + else + clear_local_APIC(); local_irq_restore(flags); } -- cgit v1.2.2 From 64e474d168e3cf31e44890b4947644d361f84e43 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:22 +0200 Subject: x86: apic - get rid of local_apic_timer_verify_ok We are able to use clock_event_device as it's done in 64bit apic code so lets get rid of local_apic_timer_verify_ok variable. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a151d66f948c..7783c113be24 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -60,8 +60,6 @@ unsigned long mp_lapic_addr; static int force_enable_local_apic; int disable_apic; -/* Local APIC timer verification ok */ -static int local_apic_timer_verify_ok; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ @@ -301,7 +299,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, unsigned int v; /* Lapic used for broadcast ? */ - if (!local_apic_timer_verify_ok) + if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; local_irq_save(flags); @@ -514,7 +512,7 @@ static int __init calibrate_APIC_clock(void) return -1; } - local_apic_timer_verify_ok = 1; + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; /* We trust the pm timer based calibration */ if (!pm_referenced) { @@ -548,11 +546,11 @@ static int __init calibrate_APIC_clock(void) if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); else - local_apic_timer_verify_ok = 0; + levt->features |= CLOCK_EVT_FEAT_DUMMY; } else local_irq_enable(); - if (!local_apic_timer_verify_ok) { + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { printk(KERN_WARNING "APIC timer disabled due to verification failure.\n"); return -1; -- cgit v1.2.2 From c93baa1ae51cdba25a5f5ad37b2348e700e75daf Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:22 +0200 Subject: x86: apic - unify verify_local_APIC Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 7783c113be24..7ef7c782a42d 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -878,6 +878,12 @@ int __init verify_local_APIC(void) */ reg0 = apic_read(APIC_ID); apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; /* * The next two are just to see if we have sane values. -- cgit v1.2.2 From 296cb9511dcc3895fda84d0cd5b411bd926e4bb3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 13:51:23 +0200 Subject: x86: apic - unify sync_Arb_IDs Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 +-- arch/x86/kernel/apic_64.c | 10 ++++++---- 2 files changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 7ef7c782a42d..d07488993ee7 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -915,8 +915,7 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, - APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ac399d0f65c0..41aff3460635 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -742,8 +742,11 @@ int __init verify_local_APIC(void) */ void __init sync_Arb_IDs(void) { - /* Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 */ - if (modern_apic()) + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; /* @@ -752,8 +755,7 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG - | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* -- cgit v1.2.2 From ef31023743e66de7184e9aad432291c842a6384b Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 14 Aug 2008 15:07:03 -0400 Subject: x86: silence mmconfig printk There's so much broken mmconfig hardware/bios'es out there, that classing this as an error seems a little extreme. Lower its priority to KERN_INFO so that it isn't so noisy when booting with 'quiet' Signed-off-by: Dave Jones Signed-off-by: Ingo Molnar --- arch/x86/pci/mmconfig-shared.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 23faaa890ffc..2bd5c53f6386 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -365,7 +365,7 @@ static void __init pci_mmcfg_reject_broken(int early) return; reject: - printk(KERN_ERR "PCI: Not using MMCONFIG.\n"); + printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); pci_mmcfg_arch_free(); kfree(pci_mmcfg_config); pci_mmcfg_config = NULL; -- cgit v1.2.2 From 519c31bacf78a969efa8d2e55ed8862848f28590 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 Aug 2008 19:55:15 +0200 Subject: x86, AMD IOMMU: use status bit instead of memory write-back for completion wait Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 22d7d050905d..028e945c68ad 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -101,16 +101,13 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret; + int ret, ready = 0; + unsigned status = 0; struct iommu_cmd cmd; - volatile u64 ready = 0; - unsigned long ready_phys = virt_to_phys(&ready); unsigned long i = 0; memset(&cmd, 0, sizeof(cmd)); - cmd.data[0] = LOW_U32(ready_phys) | CMD_COMPL_WAIT_STORE_MASK; - cmd.data[1] = upper_32_bits(ready_phys); - cmd.data[2] = 1; /* value written to 'ready' */ + cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT); iommu->need_sync = 0; @@ -122,9 +119,15 @@ static int iommu_completion_wait(struct amd_iommu *iommu) while (!ready && (i < EXIT_LOOP_COUNT)) { ++i; - cpu_relax(); + /* wait for the bit to become one */ + status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); + ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; } + /* set bit back to zero */ + status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; + writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); + if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); -- cgit v1.2.2 From 9f5f5fb35d2934fe7dc0cb019854a030efd10cd7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 Aug 2008 19:55:16 +0200 Subject: x86, AMD IOMMU: initialize device table properly This patch adds device table initializations which forbids memory accesses for devices per default and disables all page faults. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index d9a9da597e79..ceba33811537 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -800,6 +800,21 @@ static int __init init_memory_definitions(struct acpi_table_header *table) return 0; } +/* + * Init the device table to not allow DMA access for devices and + * suppress all page faults + */ +static void init_device_table(void) +{ + u16 devid; + + for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { + set_dev_entry_bit(devid, DEV_ENTRY_VALID); + set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); + set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); + } +} + /* * This function finally enables all IOMMUs found in the system after * they have been initialized @@ -931,6 +946,9 @@ int __init amd_iommu_init(void) if (amd_iommu_pd_alloc_bitmap == NULL) goto free; + /* init the device table */ + init_device_table(); + /* * let all alias entries point to itself */ -- cgit v1.2.2 From 8a456695c5020d6317f9c7af190999e9414b0d3e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 Aug 2008 19:55:17 +0200 Subject: x86m AMD IOMMU: cleanup: replace LOW_U32 macro with generic lower_32_bits Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 028e945c68ad..de39e1f2ede5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -164,7 +164,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, address &= PAGE_MASK; CMD_SET_TYPE(&cmd, CMD_INV_IOMMU_PAGES); cmd.data[1] |= domid; - cmd.data[2] = LOW_U32(address); + cmd.data[2] = lower_32_bits(address); cmd.data[3] = upper_32_bits(address); if (s) /* size bit - we flush more than one 4kb page */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK; -- cgit v1.2.2 From 129d6aba444d1e99d4cbfb9866a4652912426b65 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 14 Aug 2008 19:55:18 +0200 Subject: x86, AMD IOMMU: initialize dma_ops after sysfs registration If sysfs registration fails all memory used by IOMMU is freed. This happens after dma_ops initialization and the functions will access the freed memory then. Fix this by initializing dma_ops after the sysfs registration. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index ceba33811537..a69cc0f52042 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -972,15 +972,15 @@ int __init amd_iommu_init(void) if (acpi_table_parse("IVRS", init_memory_definitions) != 0) goto free; - ret = amd_iommu_init_dma_ops(); + ret = sysdev_class_register(&amd_iommu_sysdev_class); if (ret) goto free; - ret = sysdev_class_register(&amd_iommu_sysdev_class); + ret = sysdev_register(&device_amd_iommu); if (ret) goto free; - ret = sysdev_register(&device_amd_iommu); + ret = amd_iommu_init_dma_ops(); if (ret) goto free; -- cgit v1.2.2 From dcc984166870150709f0c645b521a47becd9a047 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 14 Aug 2008 16:32:15 -0400 Subject: x86, perfctr: don't use CCCR_OVF_PMI1 on Pentium 4Ds Currently, setup_p4_watchdog() use CCCR_OVF_PMI1 to enable the counter overflow interrupts to the second logical core. But this bit doesn't work on Pentium 4 Ds (model 4, stepping 4) and this patch avoids its use on these processors. Tested on 4 different machines that have this specific model with success. Signed-off-by: Aristeu Rozanski Cc: jvillalovos@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index de7439f82b92..05cc22dbd4ff 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -478,7 +478,13 @@ static int setup_p4_watchdog(unsigned nmi_hz) perfctr_msr = MSR_P4_IQ_PERFCTR1; evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR1; - cccr_val = P4_CCCR_OVF_PMI1 | P4_CCCR_ESCR_SELECT(4); + + /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */ + if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4) + cccr_val = P4_CCCR_OVF_PMI0; + else + cccr_val = P4_CCCR_OVF_PMI1; + cccr_val |= P4_CCCR_ESCR_SELECT(4); } evntsel = P4_ESCR_EVENT_SELECT(0x3F) -- cgit v1.2.2 From 394a15051c33f2b18e72f42283b36a9388fa414b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 14 Aug 2008 09:11:26 -0500 Subject: x86: invalidate caches before going into suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a CPU core is shut down, all of its caches need to be flushed to prevent stale data from causing errors if the core is resumed. Current Linux suspend code performs an assignment after the flush, which can add dirty data back to the cache.  On some AMD platforms, additional speculative reads have caused crashes on resume because of this dirty data. Relocate the cache flush to be the very last thing done before halting.  Tie into an assembly line so the compile will not reorder it.  Add some documentation explaining what is going on and why we're doing this. Signed-off-by: Mark Langsdorf Acked-by: Mark Borden Acked-by: Michael Hohmuth Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 5 ++--- arch/x86/kernel/process_64.c | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 53bc653ed5ca..3b7a1ddcc0bc 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -95,7 +95,6 @@ static inline void play_dead(void) { /* This must be done before dead CPU ack */ cpu_exit_clear(); - wbinvd(); mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; @@ -104,8 +103,8 @@ static inline void play_dead(void) * With physical CPU hotplug, we should halt the cpu */ local_irq_disable(); - while (1) - halt(); + /* mask all interrupts, flush any and all caches, and halt */ + wbinvd_halt(); } #else static inline void play_dead(void) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3fb62a7d9a16..71553b664e2a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,14 +93,13 @@ DECLARE_PER_CPU(int, cpu_state); static inline void play_dead(void) { idle_task_exit(); - wbinvd(); mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; local_irq_disable(); - while (1) - halt(); + /* mask all interrupts, flush any and all caches, and halt */ + wbinvd_halt(); } #else static inline void play_dead(void) -- cgit v1.2.2 From 04b69447f79eade34e92f3117a39e8fa6ecb519b Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Thu, 14 Aug 2008 17:16:50 +0200 Subject: arch/x86/Kconfig: clean up, experimental adjustement Adjust experimental tags in Kconfig, update config to notice that i386/x86_64 is now single architecture. Signed-off-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac2fb0641a04..68d91c8233f4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -951,9 +951,9 @@ config NUMA local memory controller of the CPU and add some more NUMA awareness to the kernel. - For i386 this is currently highly experimental and should be only + For 32-bit this is currently highly experimental and should be only used for kernel development. It might also cause boot failures. - For x86_64 this is recommended on all multiprocessor Opteron systems. + For 64-bit this is recommended on all multiprocessor Opteron systems. If the system is EM64T, you should say N unless your system is EM64T NUMA. @@ -1263,7 +1263,7 @@ config KEXEC strongly in flux, so no good recommendation can be made. config CRASH_DUMP - bool "kernel crash dumps (EXPERIMENTAL)" + bool "kernel crash dumps" depends on X86_64 || (X86_32 && HIGHMEM) help Generate crash dump after being started by kexec. -- cgit v1.2.2 From a06de63000b95e1ed1c6373a72376876c952608e Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 15 Aug 2008 13:58:32 +0100 Subject: x86: fix /proc/meminfo DirectMap Do we actually want these DirectMap lines in the x86 /proc/meminfo? I can see they're interesting to CPA developers and TLB optimizers, but they don't fit its usual "where has all my memory gone?" usage. If they are to stay, here are some fixes. 1. On x86_32 without PAE, they're not 2M but 4M pages: no need to mess with the internal enum, but show the right name to users. 2. Many machines can never show anything but 0 for DirectMap1G, so suppress that line unless direct_gbpages are really enabled. 3. The unit in /proc/meminfo is kB not number of pages: HugePages messed that up, but they're an example to regret not to follow. 4. Once we use kB, it's easy to see that 1GB has gone missing (which explains why CONFIG_CPA_DEBUG=y soon wraps DirectMap2M negative): because head_64.S's level2_ident_pgt entries were not counted. My fix is not ideal, but works for more and for less than 1G, and avoids interfering with early bootup pagetable contortions. Signed-off-by: Hugh Dickins Cc: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++++- arch/x86/mm/pageattr.c | 18 ++++++++++++------ 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 129618ca0ea2..b3e6c3075acc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -60,7 +60,7 @@ static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); -int direct_gbpages __meminitdata +int direct_gbpages #ifdef CONFIG_DIRECT_GBPAGES = 1 #endif @@ -314,6 +314,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, { unsigned long pages = 0; unsigned long last_map_addr = end; + unsigned long start = address; int i = pmd_index(address); @@ -334,6 +335,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (!pmd_large(*pmd)) last_map_addr = phys_pte_update(pmd, address, end); + /* Count entries we're using from level2_ident_pgt */ + if (start == 0) + pages++; continue; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ba24537d69dd..f5f5154ea11e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -55,13 +55,19 @@ static void split_page_count(int level) int arch_report_meminfo(char *page) { - int n = sprintf(page, "DirectMap4k: %8lu\n" - "DirectMap2M: %8lu\n", - direct_pages_count[PG_LEVEL_4K], - direct_pages_count[PG_LEVEL_2M]); + int n = sprintf(page, "DirectMap4k: %8lu kB\n", + direct_pages_count[PG_LEVEL_4K] << 2); +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) + n += sprintf(page + n, "DirectMap2M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 11); +#else + n += sprintf(page + n, "DirectMap4M: %8lu kB\n", + direct_pages_count[PG_LEVEL_2M] << 12); +#endif #ifdef CONFIG_X86_64 - n += sprintf(page + n, "DirectMap1G: %8lu\n", - direct_pages_count[PG_LEVEL_1G]); + if (direct_gbpages) + n += sprintf(page + n, "DirectMap1G: %8lu kB\n", + direct_pages_count[PG_LEVEL_1G] << 20); #endif return n; } -- cgit v1.2.2 From 15636668449d4135ac77a79715ba430a81aed16d Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Fri, 15 Aug 2008 08:36:14 -0400 Subject: x86, NMI: fix watchdog failure message > it just won't work at boot time - the second logic unit will be stuck: > > Booting processor 1/2 APIC 0x1 > Initializing CPU#1 > Calibrating delay using timer specific routine.. 5586.12 BogoMIPS (lpj=2793063) > CPU: Trace cache: 12K uops, L1 D cache: 16K > CPU: L2 cache: 1024K > CPU: Physical Processor ID: 0 > CPU: Processor Core ID: 1 > CPU1: Thermal monitoring enabled (TM1) > Intel(R) Pentium(R) D CPU 2.80GHz stepping 04 > Brought up 2 CPUs > testing NMI watchdog ... <4>WARNING: CPU#1: NMI appears to be stuck (0->0)! while at it... - fix that newline Signed-off-by: Aristeu Rozanski Cc: jvillalo@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ac6d51222e7d..919473ad4a29 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -142,11 +142,15 @@ int __init check_nmi_watchdog(void) if (!per_cpu(wd_enabled, cpu)) continue; if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { + printk("\n"); printk(KERN_WARNING "WARNING: CPU#%d: NMI " "appears to be stuck (%d->%d)!\n", cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); + printk(KERN_WARNING "Please report this to " + "linux-kernel@vger.kernel.org and attach " + "the output of 'dmesg' command.\n"); per_cpu(wd_enabled, cpu) = 0; atomic_dec(&nmi_active); } -- cgit v1.2.2 From 8bb851900f5d0a79d3fddac808cc670d9894ef67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Aug 2008 15:34:32 +0200 Subject: x86, nmi: clean UP NMI watchdog failure message clean up the failure message - and redirect people to bugzilla instead of lkml. Signed-off-by: Ingo Molnar --- arch/x86/kernel/nmi.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 919473ad4a29..abb78a2cc4ad 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -114,6 +114,23 @@ static __init void nmi_cpu_busy(void *data) } #endif +static void report_broken_nmi(int cpu, int *prev_nmi_count) +{ + printk(KERN_CONT "\n"); + + printk(KERN_WARNING + "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n", + cpu, prev_nmi_count[cpu], get_nmi_count(cpu)); + + printk(KERN_WARNING + "Please report this to bugzilla.kernel.org,\n"); + printk(KERN_WARNING + "and attach the output of the 'dmesg' command.\n"); + + per_cpu(wd_enabled, cpu) = 0; + atomic_dec(&nmi_active); +} + int __init check_nmi_watchdog(void) { unsigned int *prev_nmi_count; @@ -141,19 +158,8 @@ int __init check_nmi_watchdog(void) for_each_online_cpu(cpu) { if (!per_cpu(wd_enabled, cpu)) continue; - if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) { - printk("\n"); - printk(KERN_WARNING "WARNING: CPU#%d: NMI " - "appears to be stuck (%d->%d)!\n", - cpu, - prev_nmi_count[cpu], - get_nmi_count(cpu)); - printk(KERN_WARNING "Please report this to " - "linux-kernel@vger.kernel.org and attach " - "the output of 'dmesg' command.\n"); - per_cpu(wd_enabled, cpu) = 0; - atomic_dec(&nmi_active); - } + if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5) + report_broken_nmi(cpu, prev_nmi_count); } endflag = 1; if (!atomic_read(&nmi_active)) { -- cgit v1.2.2 From 516cbf3730c49739629d66313b20bdc50c98aa2c Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Tue, 12 Aug 2008 12:52:36 -0700 Subject: x86, bootup: add built-in kernel command line for x86 (v2) Allow x86 to support a built-in kernel command line. The built-in command line can override the one provided by the boot loader, for those cases where the boot loader is broken or it is difficult to change the command line in the the boot loader. H. Peter Anvin wrote: > Ingo Molnar wrote: >> Best would be to make it really apparent in the code that nothing >> changes if this config option is not set. Preferably there should be >> no extra code at all in that case. >> > > I would like to see this: [...Nested ifdefs...] OK. This version changes absolutely nothing if CONFIG_CMDLINE_BOOL is not set (the default). Also, no space is appended even when CONFIG_CMDLINE_BOOL is set, but the builtin string is empty. This is less sloppy all the way around, IMHO. Note that I use the same option names as on other arches for this feature. [ mingo@elte.hu: build fix ] Signed-off-by: Tim Bird Cc: Matt Mackall Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 45 +++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/setup.c | 16 ++++++++++++++++ 2 files changed, 61 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ac2fb0641a04..fbcb79bbafd2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1392,6 +1392,51 @@ config COMPAT_VDSO If unsure, say Y. +config CMDLINE_BOOL + bool "Built-in kernel command line" + default n + help + Allow for specifying boot arguments to the kernel at + build time. On some systems (e.g. embedded ones), it is + necessary or convenient to provide some or all of the + kernel boot arguments with the kernel itself (that is, + to not rely on the boot loader to provide them.) + + To compile command line arguments into the kernel, + set this option to 'Y', then fill in the + the boot arguments in CONFIG_CMDLINE. + + Systems with fully functional boot loaders (i.e. non-embedded) + should leave this option set to 'N'. + +config CMDLINE + string "Built-in kernel command string" + depends on CMDLINE_BOOL + default "" + help + Enter arguments here that should be compiled into the kernel + image and used at boot time. If the boot loader provides a + command line at boot time, it is appended to this string to + form the full kernel command line, when the system boots. + + However, you can use the CONFIG_CMDLINE_OVERRIDE option to + change this behavior. + + In most cases, the command line (whether built-in or provided + by the boot loader) should specify the device for the root + file system. + +config CMDLINE_OVERRIDE + bool "Built-in command line overrides boot loader arguments" + default n + depends on CMDLINE_BOOL + help + Set this option to 'Y' to have the kernel ignore the boot loader + command line, and use ONLY the built-in command line. + + This is used to work around broken boot loaders. This should + be set to 'N' under normal conditions. + endmenu config ARCH_ENABLE_MEMORY_HOTPLUG diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 68b48e3fbcbd..2f31cddd27b7 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -223,6 +223,9 @@ unsigned long saved_video_mode; #define RAMDISK_LOAD_FLAG 0x4000 static char __initdata command_line[COMMAND_LINE_SIZE]; +#ifdef CONFIG_CMDLINE_BOOL +static char __initdata builtin_cmdline[COMMAND_LINE_SIZE] = CONFIG_CMDLINE; +#endif #if defined(CONFIG_EDD) || defined(CONFIG_EDD_MODULE) struct edd edd; @@ -673,6 +676,19 @@ void __init setup_arch(char **cmdline_p) bss_resource.start = virt_to_phys(&__bss_start); bss_resource.end = virt_to_phys(&__bss_stop)-1; +#ifdef CONFIG_CMDLINE_BOOL +#ifdef CONFIG_CMDLINE_OVERRIDE + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); +#else + if (builtin_cmdline[0]) { + /* append boot loader cmdline to builtin */ + strlcat(builtin_cmdline, " ", COMMAND_LINE_SIZE); + strlcat(builtin_cmdline, boot_command_line, COMMAND_LINE_SIZE); + strlcpy(boot_command_line, builtin_cmdline, COMMAND_LINE_SIZE); + } +#endif +#endif + strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; -- cgit v1.2.2 From 1ac2f7d55b7ee1613c90631e87fea22ec06781e5 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 4 Aug 2008 14:51:24 +0800 Subject: introduce two APIs for page attribute Introduce two APIs for page attribute. flushing tlb/cache in every page attribute is expensive. AGP gart usually will do a lot of operations to change a page to uc, new APIs can reduce flush. Signed-off-by: Shaohua Li Cc: airlied@linux.ie Cc: Andrew Morton Cc: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 58 +++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 65c6e46bf059..2c5c18c2464d 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -752,12 +752,12 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int do_change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int *tlb_flush) { struct cpa_data cpa; - int ret, cache, checkalias; + int ret, checkalias; /* * Check, if we are requested to change a not supported @@ -792,9 +792,22 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - if (!cpa.flushtlb) - goto out; + *tlb_flush = cpa.flushtlb; + cpa_fill_pool(NULL); + + return ret; +} + +static int change_page_attr_set_clr(unsigned long addr, int numpages, + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) +{ + int cache, flush_cache = 0, ret; + ret = do_change_page_attr_set_clr(addr, numpages, mask_set, mask_clr, + force_split, &flush_cache); + if (!flush_cache) + goto out; /* * No need to flush, when we did not set any of the caching * attributes: @@ -811,10 +824,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_range(addr, numpages, cache); else cpa_flush_all(cache); - out: - cpa_fill_pool(NULL); - return ret; } @@ -852,6 +862,30 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int set_memory_uc_noflush(unsigned long addr, int numpages) +{ + int flush; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + return -EINVAL; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + return do_change_page_attr_set_clr(addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), + __pgprot(0), 0, &flush); +} +EXPORT_SYMBOL(set_memory_uc_noflush); + +void set_memory_flush_all(void) +{ + cpa_flush_all(1); +} +EXPORT_SYMBOL(set_memory_flush_all); + int _set_memory_wc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, @@ -926,6 +960,14 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); +int set_pages_uc_noflush(struct page *page, int numpages) +{ + unsigned long addr = (unsigned long)page_address(page); + + return set_memory_uc_noflush(addr, numpages); +} +EXPORT_SYMBOL(set_pages_uc_noflush); + int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.2 From 2bd455dbfebfd632a8dcf1d3d1612737986fde0a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 4 Aug 2008 11:26:38 +0800 Subject: x86: remove nesting CONFIG_HOTPLUG_CPU prefill_possible_map() is defined inside CONFIG_HOTPLUG_CPU, so the nesting CONFIG_HOTPLUG_CPU is just redundant. Signed-off-by: Li Zefan Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 332512767f4f..d5e19c362046 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1271,16 +1271,13 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; -#ifdef CONFIG_HOTPLUG_CPU if (additional_cpus == -1) { if (disabled_cpus > 0) additional_cpus = disabled_cpus; else additional_cpus = 0; } -#else - additional_cpus = 0; -#endif + possible = num_processors + additional_cpus; if (possible > NR_CPUS) possible = NR_CPUS; -- cgit v1.2.2 From 9744f5a32853642f8ed0749a1c9ed8cf9c9c9dc4 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 3 Aug 2008 19:25:48 +0200 Subject: x86, acpi: cleanup, temp_stack is used only when CONFIG_SMP is set fix: arch/x86/kernel/acpi/sleep.c:24: warning: 'temp_stack' defined but not used [ Sven Wegener : fix build bug ] Signed-off-by: Marcin Slusarz Acked-by: Pavel Machek Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index fa2161d5003b..81e5ab6542d8 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -20,7 +20,7 @@ unsigned long acpi_realmode_flags; /* address in low memory of the wakeup routine. */ static unsigned long acpi_realmode; -#ifdef CONFIG_64BIT +#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) static char temp_stack[10240]; #endif -- cgit v1.2.2 From 020878ac427aa053414602cef975c2b5a2e33bf8 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 2 Aug 2008 21:23:36 +0200 Subject: x86: coding style fixes to arch/x86/boot/compressed/misc.c Before: total: 4 errors, 6 warnings, 439 lines checked After: total: 1 errors, 5 warnings, 441 lines checked Before -#include +#include paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/misc.o.* 8b2394e1fe519a9542e9a7e3e7b69c39 /tmp/misc.o.after 8b2394e1fe519a9542e9a7e3e7b69c39 /tmp/misc.o.before After -#include +#include paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/misc.o.* 59a2d264284be5e72b5af4f3a8ccfb47 /tmp/misc.o.after 8b2394e1fe519a9542e9a7e3e7b69c39 /tmp/misc.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/misc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 9fea73706479..2a3f77e52e2d 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include @@ -251,7 +251,7 @@ static void __putstr(int error, const char *s) y--; } } else { - vidmem [(x + cols * y) * 2] = c; + vidmem[(x + cols * y) * 2] = c; if (++x >= cols) { x = 0; if (++y >= lines) { @@ -277,7 +277,8 @@ static void *memset(void *s, int c, unsigned n) int i; char *ss = s; - for (i = 0; i < n; i++) ss[i] = c; + for (i = 0; i < n; i++) + ss[i] = c; return s; } @@ -287,7 +288,8 @@ static void *memcpy(void *dest, const void *src, unsigned n) const char *s = src; char *d = dest; - for (i = 0; i < n; i++) d[i] = s[i]; + for (i = 0; i < n; i++) + d[i] = s[i]; return dest; } -- cgit v1.2.2 From 2070dae10f50ec244f58292436ace9a3f9dc1d71 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 2 Aug 2008 21:24:06 +0200 Subject: x86: coding style fixes to arch/x86/kernel/bios_uv.c paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/bios_uv.o.* 9afe794594831166704744184e192ed8 /tmp/bios_uv.o.after 9afe794594831166704744184e192ed8 /tmp/bios_uv.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index c639bd55391c..fdd585f9c53d 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -25,11 +25,11 @@ x86_bios_strerror(long status) { const char *str; switch (status) { - case 0: str = "Call completed without error"; break; - case -1: str = "Not implemented"; break; - case -2: str = "Invalid argument"; break; - case -3: str = "Call completed with error"; break; - default: str = "Unknown BIOS status code"; break; + case 0: str = "Call completed without error"; break; + case -1: str = "Not implemented"; break; + case -2: str = "Invalid argument"; break; + case -3: str = "Call completed with error"; break; + default: str = "Unknown BIOS status code"; break; } return str; } -- cgit v1.2.2 From 209b580fd8c3a42b69550c98de434671d41a4ebb Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 2 Aug 2008 21:24:45 +0200 Subject: x86: coding style fixes to arch/x86/lib/strstr_32.c Before: total: 3 errors, 0 warnings, 31 lines checked After: total: 0 errors, 0 warnings, 31 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/strstr_32.o.* c96006ec3387862e5bacb139207a3098 /tmp/strstr_32.o.after c96006ec3387862e5bacb139207a3098 /tmp/strstr_32.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/lib/strstr_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/strstr_32.c b/arch/x86/lib/strstr_32.c index 42e8a50303f3..8e2d55f754bf 100644 --- a/arch/x86/lib/strstr_32.c +++ b/arch/x86/lib/strstr_32.c @@ -23,9 +23,9 @@ __asm__ __volatile__( "jne 1b\n\t" "xorl %%eax,%%eax\n\t" "2:" - :"=a" (__res), "=&c" (d0), "=&S" (d1) - :"0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) - :"dx", "di"); + : "=a" (__res), "=&c" (d0), "=&S" (d1) + : "0" (0), "1" (0xffffffff), "2" (cs), "g" (ct) + : "dx", "di"); return __res; } -- cgit v1.2.2 From 3492cdf0176bde5e35223a1388d59676bc67c145 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 2 Aug 2008 21:25:13 +0200 Subject: x86: coding style fixes to arch/x86/lib/string_32.c Before: total: 21 errors, 0 warnings, 237 lines checked After: total: 0 errors, 0 warnings, 237 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/string_32.o.* c55d059ef1612b32a8bb2771a72ae0d5 /tmp/string_32.o.after c55d059ef1612b32a8bb2771a72ae0d5 /tmp/string_32.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/lib/string_32.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 94972e7c094d..82004d2bf05e 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -22,7 +22,7 @@ char *strcpy(char *dest, const char *src) "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2) - :"0" (src), "1" (dest) : "memory"); + : "0" (src), "1" (dest) : "memory"); return dest; } EXPORT_SYMBOL(strcpy); @@ -42,7 +42,7 @@ char *strncpy(char *dest, const char *src, size_t count) "stosb\n" "2:" : "=&S" (d0), "=&D" (d1), "=&c" (d2), "=&a" (d3) - :"0" (src), "1" (dest), "2" (count) : "memory"); + : "0" (src), "1" (dest), "2" (count) : "memory"); return dest; } EXPORT_SYMBOL(strncpy); @@ -60,7 +60,7 @@ char *strcat(char *dest, const char *src) "testb %%al,%%al\n\t" "jne 1b" : "=&S" (d0), "=&D" (d1), "=&a" (d2), "=&c" (d3) - : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu): "memory"); + : "0" (src), "1" (dest), "2" (0), "3" (0xffffffffu) : "memory"); return dest; } EXPORT_SYMBOL(strcat); @@ -105,9 +105,9 @@ int strcmp(const char *cs, const char *ct) "2:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "3:" - :"=a" (res), "=&S" (d0), "=&D" (d1) - :"1" (cs), "2" (ct) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1) + : "1" (cs), "2" (ct) + : "memory"); return res; } EXPORT_SYMBOL(strcmp); @@ -130,9 +130,9 @@ int strncmp(const char *cs, const char *ct, size_t count) "3:\tsbbl %%eax,%%eax\n\t" "orb $1,%%al\n" "4:" - :"=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) - :"1" (cs), "2" (ct), "3" (count) - :"memory"); + : "=a" (res), "=&S" (d0), "=&D" (d1), "=&c" (d2) + : "1" (cs), "2" (ct), "3" (count) + : "memory"); return res; } EXPORT_SYMBOL(strncmp); @@ -152,9 +152,9 @@ char *strchr(const char *s, int c) "movl $1,%1\n" "2:\tmovl %1,%0\n\t" "decl %0" - :"=a" (res), "=&S" (d0) - :"1" (s), "0" (c) - :"memory"); + : "=a" (res), "=&S" (d0) + : "1" (s), "0" (c) + : "memory"); return res; } EXPORT_SYMBOL(strchr); @@ -169,9 +169,9 @@ size_t strlen(const char *s) "scasb\n\t" "notl %0\n\t" "decl %0" - :"=c" (res), "=&D" (d0) - :"1" (s), "a" (0), "0" (0xffffffffu) - :"memory"); + : "=c" (res), "=&D" (d0) + : "1" (s), "a" (0), "0" (0xffffffffu) + : "memory"); return res; } EXPORT_SYMBOL(strlen); @@ -189,9 +189,9 @@ void *memchr(const void *cs, int c, size_t count) "je 1f\n\t" "movl $1,%0\n" "1:\tdecl %0" - :"=D" (res), "=&c" (d0) - :"a" (c), "0" (cs), "1" (count) - :"memory"); + : "=D" (res), "=&c" (d0) + : "a" (c), "0" (cs), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(memchr); @@ -228,9 +228,9 @@ size_t strnlen(const char *s, size_t count) "cmpl $-1,%1\n\t" "jne 1b\n" "3:\tsubl %2,%0" - :"=a" (res), "=&d" (d0) - :"c" (s), "1" (count) - :"memory"); + : "=a" (res), "=&d" (d0) + : "c" (s), "1" (count) + : "memory"); return res; } EXPORT_SYMBOL(strnlen); -- cgit v1.2.2 From d9336a9b47d57db835453968efbd0d5cedfe0260 Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Sat, 2 Aug 2008 21:25:43 +0200 Subject: x86: coding style fixes to arch/x86/kernel/paravirt_patch_32.c Before: total: 3 errors, 1 warnings, 49 lines checked After: total: 2 errors, 1 warnings, 49 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/paravirt_patch_32.o.* a78eea4264723e18c49dcfbe0ee0aae7 /tmp/paravirt_patch_32.o.after a78eea4264723e18c49dcfbe0ee0aae7 /tmp/paravirt_patch_32.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt_patch_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 58262218781b..9fe644f4861d 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -23,7 +23,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, start = start_##ops##_##x; \ end = end_##ops##_##x; \ goto patch_site - switch(type) { + switch (type) { PATCH_SITE(pv_irq_ops, irq_disable); PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, restore_fl); -- cgit v1.2.2 From 0d5cdc97e242a5589e5dca23277675f4b4482490 Mon Sep 17 00:00:00 2001 From: Jens Rottmann Date: Mon, 4 Aug 2008 14:40:16 +0200 Subject: x86, geode-mfgpt: check IRQ before using MFGPT as clocksource Adds a simple IRQ autodetection to the AMD Geode MFGPT driver, and more importantly, adds some checks, if IRQs can actually be received on the chosen line. This fixes cases where MFGPT is selected as clocksource though not producing any ticks, so the kernel simply starves during boot. Signed-off-by: Jens Rottmann Cc: Andres Salomon Cc: linux-geode@bombadil.infradead.org Cc: Jordan Crouse Signed-off-by: Ingo Molnar --- arch/x86/kernel/mfgpt_32.c | 52 +++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c index 07c0f828f488..3b599518c322 100644 --- a/arch/x86/kernel/mfgpt_32.c +++ b/arch/x86/kernel/mfgpt_32.c @@ -33,6 +33,8 @@ #include #include +#define MFGPT_DEFAULT_IRQ 7 + static struct mfgpt_timer_t { unsigned int avail:1; } mfgpt_timers[MFGPT_MAX_TIMERS]; @@ -157,29 +159,48 @@ int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable) } EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event); -int geode_mfgpt_set_irq(int timer, int cmp, int irq, int enable) +int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable) { - u32 val, dummy; - int offset; + u32 zsel, lpc, dummy; + int shift; if (timer < 0 || timer >= MFGPT_MAX_TIMERS) return -EIO; - if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + /* + * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA + * is using the same CMP of the timer's Siamese twin, the IRQ is set to + * 2, and we mustn't use nor change it. + * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the + * IRQ of the 1st. This can only happen if forcing an IRQ, calling this + * with *irq==0 is safe. Currently there _are_ no 2 drivers. + */ + rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); + shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4; + if (((zsel >> shift) & 0xF) == 2) return -EIO; - rdmsr(MSR_PIC_ZSEL_LOW, val, dummy); + /* Choose IRQ: if none supplied, keep IRQ already set or use default */ + if (!*irq) + *irq = (zsel >> shift) & 0xF; + if (!*irq) + *irq = MFGPT_DEFAULT_IRQ; - offset = (timer % 4) * 4; - - val &= ~((0xF << offset) | (0xF << (offset + 16))); + /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */ + if (*irq < 1 || *irq == 2 || *irq > 15) + return -EIO; + rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy); + if (lpc & (1 << *irq)) + return -EIO; + /* All chosen and checked - go for it */ + if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable)) + return -EIO; if (enable) { - val |= (irq & 0x0F) << (offset); - val |= (irq & 0x0F) << (offset + 16); + zsel = (zsel & ~(0xF << shift)) | (*irq << shift); + wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy); } - wrmsr(MSR_PIC_ZSEL_LOW, val, dummy); return 0; } @@ -242,7 +263,7 @@ EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer); static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN; static u16 mfgpt_event_clock; -static int irq = 7; +static int irq; static int __init mfgpt_setup(char *str) { get_option(&str, &irq); @@ -346,7 +367,7 @@ int __init mfgpt_timer_setup(void) mfgpt_event_clock = timer; /* Set up the IRQ on the MFGPT side */ - if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, irq)) { + if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) { printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq); return -EIO; } @@ -374,13 +395,14 @@ int __init mfgpt_timer_setup(void) &mfgpt_clockevent); printk(KERN_INFO - "mfgpt-timer: registering the MFGPT timer as a clock event.\n"); + "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n", + timer, irq); clockevents_register_device(&mfgpt_clockevent); return 0; err: - geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, irq); + geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq); printk(KERN_ERR "mfgpt-timer: Unable to set up the MFGPT clock source\n"); return -EIO; -- cgit v1.2.2 From d33dcb9e7d272cc3171dcc3a21049902185ab03d Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Fri, 1 Aug 2008 12:46:45 +0200 Subject: x86: Microcode patch loader style corrections Style corrections to main microcode module. Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 28dba1a6e4aa..39961bb83293 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -55,8 +55,8 @@ * in a single CPU package. * 1.10 28 Feb 2002 Asit K Mallick and * Tigran Aivazian , - * Serialize updates as required on HT processors due to speculative - * nature of implementation. + * Serialize updates as required on HT processors due to + * speculative nature of implementation. * 1.11 22 Mar 2002 Tigran Aivazian * Fix the panic when writing zero-length microcode chunk. * 1.12 29 Sep 2003 Nitin Kamble , @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -//#define DEBUG /* pr_debug */ +/*#define DEBUG pr_debug */ #include #include #include @@ -116,7 +116,7 @@ EXPORT_SYMBOL_GPL(user_buffer); unsigned int user_buffer_size; /* it's size */ EXPORT_SYMBOL_GPL(user_buffer_size); -static int do_microcode_update (void) +static int do_microcode_update(void) { long cursor = 0; int error = 0; @@ -158,18 +158,20 @@ out: return error; } -static int microcode_open (struct inode *unused1, struct file *unused2) +static int microcode_open(struct inode *unused1, struct file *unused2) { cycle_kernel_lock(); return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; } -static ssize_t microcode_write (struct file *file, const char __user *buf, size_t len, loff_t *ppos) +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) { ssize_t ret; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", num_physpages); + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); return -EINVAL; } @@ -201,7 +203,7 @@ static struct miscdevice microcode_dev = { .fops = µcode_fops, }; -static int __init microcode_dev_init (void) +static int __init microcode_dev_init(void) { int error; @@ -216,7 +218,7 @@ static int __init microcode_dev_init (void) return 0; } -static void microcode_dev_exit (void) +static void microcode_dev_exit(void) { misc_deregister(µcode_dev); } @@ -224,7 +226,7 @@ static void microcode_dev_exit (void) MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); #else #define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while(0) +#define microcode_dev_exit() do { } while (0) #endif /* fake device for request_firmware */ @@ -451,7 +453,7 @@ int microcode_init(void *opaque, struct module *module) } EXPORT_SYMBOL_GPL(microcode_init); -void __exit microcode_exit (void) +void __exit microcode_exit(void) { microcode_dev_exit(); -- cgit v1.2.2 From 636a31781684d0f49208aa163f1a5a3a74210eb4 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Fri, 1 Aug 2008 12:46:46 +0200 Subject: x86: Fixed NULL function pointer dereference in AMD microcode patch loader. Dereference took place in code part responsible for manual installation of microcode patches through /dev/cpu/microcode. Signed-off-by: Peter Oruba Cc: Peter Oruba Cc: Tigran Aivazian Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 39961bb83293..ad136ad99cb3 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -127,7 +127,8 @@ static int do_microcode_update(void) old = current->cpus_allowed; while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { - error = microcode_ops->microcode_sanity_check(new_mc); + if (microcode_ops->microcode_sanity_check != NULL) + error = microcode_ops->microcode_sanity_check(new_mc); if (error) goto out; /* -- cgit v1.2.2 From 5843d9a4d0ba89719916c8f07fc9c57b7126be6d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 1 Aug 2008 03:15:21 +0200 Subject: x86, pat: avoid highmem cache attribute aliasing Highmem code can leave ptes and tlb entries around for a given page even after kunmap, and after it has been freed. >From what I can gather, the PAT code may change the cache attributes of arbitrary physical addresses (ie. including highmem pages), which would result in aliases in the case that it operates on one of these lazy tlb highmem pages. Flushing kmaps should solve the problem. I've also just added code for conditional flushing if we haven't got any dangling highmem aliases -- this should help performance if we change page attributes frequently or systems that aren't using much highmem pages (eg. if < 4G RAM). Should be turned into 2 patches, but just for RFC... Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2c5c18c2464d..4adb33628dec 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -777,6 +777,9 @@ static int do_change_page_attr_set_clr(unsigned long addr, int numpages, WARN_ON_ONCE(1); } + /* Must avoid aliasing mappings in the highmem code */ + kmap_flush_unused(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; -- cgit v1.2.2 From c9c3dddd8f9a05b25d4ce53e8e80cc0ea1759d18 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Fri, 1 Aug 2008 03:51:38 +0400 Subject: x86_64: remove empty lines from stack traces/oopses Signed-off-by: Alexey Dobriyan Cc: ak@suse.de Cc: akpm@osdl.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 3f18d73f420c..8b5b3b81d437 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -339,9 +339,8 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) { - printk("\nCall Trace:\n"); + printk("Call Trace:\n"); dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("\n"); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -386,6 +385,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, printk(" %016lx", *stack++); touch_nmi_watchdog(); } + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } @@ -443,7 +443,6 @@ void show_registers(struct pt_regs *regs) printk("Stack: "); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, regs->bp, ""); - printk("\n"); printk(KERN_EMERG "Code: "); -- cgit v1.2.2 From 66d4bdf22b8652cda215e2653c8bbec7a767ed57 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 31 Jul 2008 16:48:31 +0100 Subject: x86-64: fix overlap of modules and fixmap areas Plus add a build time check so this doesn't go unnoticed again. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 1b318e903bf6..9bfc4d72fb2e 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -88,6 +88,7 @@ void __init x86_64_start_kernel(char * real_mode_data) BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); + BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); -- cgit v1.2.2 From 163f6876f5c3ff8215e900b93779e960a56b3694 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:22 -0700 Subject: kexec jump: rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE Rename KEXEC_CONTROL_CODE_SIZE to KEXEC_CONTROL_PAGE_SIZE, because control page is used for not only code on some platform. For example in kexec jump, it is used for data and stack too. [akpm@linux-foundation.org: unbreak powerpc and arm, finish conversion] Signed-off-by: Huang Ying Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Cc: Russell King Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 9fe478d98406..466450167dea 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -78,7 +78,7 @@ static void load_segments(void) /* * A architecture hook called to validate the * proposed image and prepare the control pages - * as needed. The pages for KEXEC_CONTROL_CODE_SIZE + * as needed. The pages for KEXEC_CONTROL_PAGE_SIZE * have been allocated, but the segments have yet * been copied into the kernel. * -- cgit v1.2.2 From fb45daa69d287b394eca1619b3fadff7c0215c71 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:23 -0700 Subject: kexec jump: check code size in control page Kexec/Kexec-jump require code size in control page is less than PAGE_SIZE/2. This patch add link-time checking for this. ASSERT() of ld link script is used as the link-time checking mechanism. [akpm@linux-foundation.org: build fix] Signed-off-by: Huang Ying Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Acked-by: Vivek Goyal Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 2 +- arch/x86/kernel/relocate_kernel_32.S | 10 +++++++--- arch/x86/kernel/vmlinux_32.lds.S | 8 ++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 466450167dea..5c8e7735c896 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -138,7 +138,7 @@ void machine_kexec(struct kimage *image) } control_page = page_address(image->control_code_page); - memcpy(control_page, relocate_kernel, PAGE_SIZE/2); + memcpy(control_page, relocate_kernel, KEXEC_CONTROL_CODE_MAX_SIZE); relocate_kernel_ptr = control_page; page_list[PA_CONTROL_PAGE] = __pa(control_page); diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index 703310a99023..6f50664b2ba5 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -20,10 +20,11 @@ #define PAGE_ATTR (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY) #define PAE_PGD_ATTR (_PAGE_PRESENT) -/* control_page + PAGE_SIZE/2 ~ control_page + PAGE_SIZE * 3/4 are - * used to save some data for jumping back +/* control_page + KEXEC_CONTROL_CODE_MAX_SIZE + * ~ control_page + PAGE_SIZE are used as data storage and stack for + * jumping back */ -#define DATA(offset) (PAGE_SIZE/2+(offset)) +#define DATA(offset) (KEXEC_CONTROL_CODE_MAX_SIZE+(offset)) /* Minimal CPU state */ #define ESP DATA(0x0) @@ -376,3 +377,6 @@ swap_pages: popl %ebx popl %ebp ret + + .globl kexec_control_code_size +.set kexec_control_code_size, . - relocate_kernel diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index cdb2363697d2..af5bdad84604 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -209,3 +209,11 @@ SECTIONS DWARF_DEBUG } + +#ifdef CONFIG_KEXEC +/* Link time checks */ +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif -- cgit v1.2.2 From 3122c331190e9d1622bf1c8cf6ce3b17cca67c9e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Fri, 15 Aug 2008 00:40:26 -0700 Subject: kexec jump: fix for ftrace Ftrace depends on some processor state that we destroyed during kexec and restored by restore_processor_state(). So save_processor_state() and restore_processor_state() are moved into machine_kexec() and ftrace is restored after restore_processor_state(). Signed-off-by: Huang Ying Cc: Pavel Machek Cc: "Rafael J. Wysocki" Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_32.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 5c8e7735c896..0732adba05ca 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -113,6 +114,7 @@ void machine_kexec(struct kimage *image) { unsigned long page_list[PAGES_NR]; void *control_page; + int save_ftrace_enabled; asmlinkage unsigned long (*relocate_kernel_ptr)(unsigned long indirection_page, unsigned long control_page, @@ -120,7 +122,12 @@ void machine_kexec(struct kimage *image) unsigned int has_pae, unsigned int preserve_context); - tracer_disable(); +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + save_processor_state(); +#endif + + save_ftrace_enabled = __ftrace_enabled_save(); /* Interrupts aren't acceptable while we reboot */ local_irq_disable(); @@ -178,6 +185,13 @@ void machine_kexec(struct kimage *image) (unsigned long)page_list, image->start, cpu_has_pae, image->preserve_context); + +#ifdef CONFIG_KEXEC_JUMP + if (kexec_image->preserve_context) + restore_processor_state(); +#endif + + __ftrace_enabled_restore(save_ftrace_enabled); } void arch_crash_save_vmcoreinfo(void) -- cgit v1.2.2 From 48e2bd56b1d1ae4b95fb21be778927b64d5c4235 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Sat, 2 Aug 2008 12:50:37 -0300 Subject: x86: coding style fixes to arch/x86/kernel/traps_64.c Fix coding style of traps_64.c with improvements suggested by Ingo. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index fe36d96ba70b..5df5d2e97ece 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -84,8 +84,8 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) void printk_address(unsigned long address, int reliable) { - printk(" [<%016lx>] %s%pS\n", address, reliable ? - "" : "? ", (void *) address); + printk(" [<%016lx>] %s%pS\n", + address, reliable ? "" : "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, @@ -98,8 +98,8 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, [STACKFAULT_STACK - 1] = "#SS", [MCE_STACK - 1] = "#MC", #if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... N_EXCEPTION_STACKS + DEBUG_STKSZ / - EXCEPTION_STKSZ - 2] = "#DB[?]" + [N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" #endif }; unsigned k; @@ -363,8 +363,10 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *irqstack = (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - /* debugging aid: "show_stack(NULL, NULL);" prints the - back trace for this cpu. */ + /* + * debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu. + */ if (sp == NULL) { if (task) -- cgit v1.2.2 From 89499759dc0dd300528510f465b0bf532fc79a2a Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Mon, 11 Aug 2008 17:01:50 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak PCHs This patch adds the Intel Ibex Peak (PCH) LPC and SMBus Controller DeviceIDs. Signed-off-by: Seth Heasley Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index fec0123b33a9..8e077185e185 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -590,6 +590,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: + case PCI_DEVICE_ID_INTEL_PCH_0: + case PCI_DEVICE_ID_INTEL_PCH_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; -- cgit v1.2.2 From e213e87785559eaf3107897226817aea9291b06f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 15 Aug 2008 18:12:47 +0200 Subject: x86: Fix ioremap off by one BUG Jean Delvare's machine triggered this BUG acpi_os_map_memory phys ffff0000 size 65535 ------------[ cut here ]------------ kernel BUG at arch/x86/mm/pat.c:233! with ACPI in the backtrace. Adding some debugging output showed that ACPI calls acpi_os_map_memory phys ffff0000 size 65535 And ioremap/PAT does this check in 32bit, so addr+size wraps and the BUG in reserve_memtype() triggers incorrectly. BUG_ON(start >= end); /* end is exclusive */ But reserve_memtype already uses u64: int reserve_memtype(u64 start, u64 end, so the 32bit truncation must happen in the caller. Presumably in ioremap when it passes this information to reserve_memtype(). This patch does this computation in 64bit. http://bugzilla.kernel.org/show_bug.cgi?id=11346 Signed-off-by: Andi Kleen --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 016f335bbeea..6ba6f889c79d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -170,7 +170,7 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, phys_addr &= PAGE_MASK; size = PAGE_ALIGN(last_addr+1) - phys_addr; - retval = reserve_memtype(phys_addr, phys_addr + size, + retval = reserve_memtype(phys_addr, (u64)phys_addr + size, prot_val, &new_prot_val); if (retval) { pr_debug("Warning: reserve_memtype returned %d\n", retval); -- cgit v1.2.2 From fc0091b3c86396afc8e6c273aff21671cf882ee1 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 15 Aug 2008 17:21:14 +0100 Subject: x86: change init_gdt to update the gdt via write_gdt, rather than a direct write. By writing directly, a memory access violation can occur whilst hotplugging a CPU if the entry was previously marked read-only. Signed-off-by: Alex Nixon Cc: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpcommon.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpcommon.c b/arch/x86/kernel/smpcommon.c index 99941b37eca0..397e309839dd 100644 --- a/arch/x86/kernel/smpcommon.c +++ b/arch/x86/kernel/smpcommon.c @@ -8,18 +8,21 @@ DEFINE_PER_CPU(unsigned long, this_cpu_off); EXPORT_PER_CPU_SYMBOL(this_cpu_off); -/* Initialize the CPU's GDT. This is either the boot CPU doing itself - (still using the master per-cpu area), or a CPU doing it for a - secondary which will soon come up. */ +/* + * Initialize the CPU's GDT. This is either the boot CPU doing itself + * (still using the master per-cpu area), or a CPU doing it for a + * secondary which will soon come up. + */ __cpuinit void init_gdt(int cpu) { - struct desc_struct *gdt = get_cpu_gdt_table(cpu); + struct desc_struct gdt; - pack_descriptor(&gdt[GDT_ENTRY_PERCPU], - __per_cpu_offset[cpu], 0xFFFFF, + pack_descriptor(&gdt, __per_cpu_offset[cpu], 0xFFFFF, 0x2 | DESCTYPE_S, 0x8); + gdt.s = 1; - gdt[GDT_ENTRY_PERCPU].s = 1; + write_gdt_entry(get_cpu_gdt_table(cpu), + GDT_ENTRY_PERCPU, &gdt, DESCTYPE_S); per_cpu(this_cpu_off, cpu) = __per_cpu_offset[cpu]; per_cpu(cpu_number, cpu) = cpu; -- cgit v1.2.2 From 8d6ea9674cb12b90c800dc572214bf06f6ce8340 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Fri, 15 Aug 2008 18:32:24 +0200 Subject: x86: fix section mismatch warning - spp_getpage() WARNING: vmlinux.o(.text+0x17a3e): Section mismatch in reference from the function set_pte_vaddr_pud() to the function .init.text:spp_getpage() The function set_pte_vaddr_pud() references the function __init spp_getpage(). This is often because set_pte_vaddr_pud lacks a __init annotation or the annotation of spp_getpage is wrong. spp_getpage is called from __init (__init_extra_mapping) and non __init (set_pte_vaddr_pud) functions, so it can't be __init. Unfortunately it calls alloc_bootmem_pages which is __init, but does it only when bootmem allocator is available (after_bootmem == 0). So annotate it accordingly. Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar Cc: H. Peter Anvin --- arch/x86/mm/init_64.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index b3e6c3075acc..a87ea0e4b3dc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -88,7 +88,11 @@ early_param("gbpages", parse_direct_gbpages_on); int after_bootmem; -static __init void *spp_getpage(void) +/* + * NOTE: This function is marked __ref because it calls __init function + * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. + */ +static __ref void *spp_getpage(void) { void *ptr; -- cgit v1.2.2 From f88f07e0f0fd6376e081b10930d272a08fbf082f Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Thu, 14 Aug 2008 16:58:15 -0400 Subject: x86: alternatives : fix LOCK_PREFIX race with preemptible kernel and CPU hotplug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a kernel thread is preempted in single-cpu mode right after the NOP (nop about to be turned into a lock prefix), then we CPU hotplug a CPU, and then the thread is scheduled back again, a SMP-unsafe atomic operation will be used on shared SMP variables, leading to corruption. No corruption would happen in the reverse case : going from SMP to UP is ok because we split a bit instruction into tiny pieces, which does not present this condition. Changing the 0x90 (single-byte nop) currently used into a 0x3E DS segment override prefix should fix this issue. Since the default of the atomic instructions is to use the DS segment anyway, it should not affect the behavior. The exception to this are references that use ESP/RSP and EBP/RBP as the base register (they will use the SS segment), however, in Linux (a) DS == SS at all times, and (b) we do not distinguish between segment violations reported as #SS as opposed to #GP, so there is no need to disassemble the instruction to figure out the suitable segment. This patch assumes that the 0x3E prefix will leave atomic operations as-is (thus assuming they normally touch data in the DS segment). Since there seem to be no obvious ill-use of other segment override prefixes for atomic operations, it should be safe. It can be verified with a quick grep -r LOCK_PREFIX include/asm-x86/ grep -A 1 -r LOCK_PREFIX arch/x86/ Taken from This source : AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions States "Instructions that Reference a Non-Stack Segment—If an instruction encoding references any base register other than rBP or rSP, or if an instruction contains an immediate offset, the default segment is the data segment (DS). These instructions can use the segment-override prefix to select one of the non-default segments, as shown in Table 1-5." Therefore, forcing the DS segment on the atomic operations, which already use the DS segment, should not change. This source : http://wiki.osdev.org/X86_Instruction_Encoding States "In 64-bit the CS, SS, DS and ES segment overrides are ignored." Confirmed by "AMD 64-Bit Technology" A.7 http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/x86-64_overview.pdf "In 64-bit mode, the DS, ES, SS and CS segment-override prefixes have no effect. These four prefixes are no longer treated as segment-override prefixes in the context of multipleprefix rules. Instead, they are treated as null prefixes." This patch applies to 2.6.27-rc2, but would also have to be applied to earlier kernels (2.6.26, 2.6.25, ...). Performance impact of the fix : tests done on "xaddq" and "xaddl" shows it actually improves performances on Intel Xeon, AMD64, Pentium M. It does not change the performance on Pentium II, Pentium 3 and Pentium 4. Xeon E5405 2.0GHz : NR_TESTS 10000000 test empty cycles : 162207948 test test 1-byte nop xadd cycles : 170755422 test test DS override prefix xadd cycles : 170000118 * test test LOCK xadd cycles : 472012134 AMD64 2.0GHz : NR_TESTS 10000000 test empty cycles : 146674549 test test 1-byte nop xadd cycles : 150273860 test test DS override prefix xadd cycles : 149982382 * test test LOCK xadd cycles : 270000690 Pentium 4 3.0GHz NR_TESTS 10000000 test empty cycles : 290001195 test test 1-byte nop xadd cycles : 310000560 test test DS override prefix xadd cycles : 310000575 * test test LOCK xadd cycles : 1050103740 Pentium M 2.0GHz NR_TESTS 10000000 test empty cycles : 180000523 test test 1-byte nop xadd cycles : 320000345 test test DS override prefix xadd cycles : 310000374 * test test LOCK xadd cycles : 480000357 Pentium 3 550MHz NR_TESTS 10000000 test empty cycles : 510000231 test test 1-byte nop xadd cycles : 620000128 test test DS override prefix xadd cycles : 620000110 * test test LOCK xadd cycles : 800000088 Pentium II 350MHz NR_TESTS 10000000 test empty cycles : 200833494 test test 1-byte nop xadd cycles : 340000130 test test DS override prefix xadd cycles : 340000126 * test test LOCK xadd cycles : 530000078 Speed test modules can be found at http://ltt.polymtl.ca/svn/trunk/tests/kernel/test-prefix-speed-32.c http://ltt.polymtl.ca/svn/trunk/tests/kernel/test-prefix-speed.c Macro-benchmarks 2.0GHz E5405 Core 2 dual Quad-Core Xeon Summary * replace smp lock prefixes with DS segment selector prefixes no lock prefix (s) with lock prefix (s) Speedup make -j1 kernel/ 33.94 +/- 0.07 34.91 +/- 0.27 2.8 % hackbench 50 2.99 +/- 0.01 3.74 +/- 0.01 25.1 % * replace smp lock prefixes with 0x90 nops no lock prefix (s) with lock prefix (s) Speedup make -j1 kernel/ 34.16 +/- 0.32 34.91 +/- 0.27 2.2 % hackbench 50 3.00 +/- 0.01 3.74 +/- 0.01 24.7 % Detail : 1 CPU, replace smp lock prefixes with DS segment selector prefixes make -j1 kernel/ real 0m34.067s user 0m30.630s sys 0m2.980s real 0m33.867s user 0m30.582s sys 0m3.024s real 0m33.939s user 0m30.738s sys 0m2.876s real 0m33.913s user 0m30.806s sys 0m2.808s avg : 33.94s std. dev. : 0.07s hackbench 50 Time: 2.978 Time: 2.982 Time: 3.010 Time: 2.984 Time: 2.982 avg : 2.99 std. dev. : 0.01 1 CPU, noreplace-smp make -j1 kernel/ real 0m35.326s user 0m30.630s sys 0m3.260s real 0m34.325s user 0m30.802s sys 0m3.084s real 0m35.568s user 0m30.722s sys 0m3.168s real 0m34.435s user 0m30.886s sys 0m2.996s avg.: 34.91s std. dev. : 0.27s hackbench 50 Time: 3.733 Time: 3.750 Time: 3.761 Time: 3.737 Time: 3.741 avg : 3.74 std. dev. : 0.01 1 CPU, replace smp lock prefixes with 0x90 nops make -j1 kernel/ real 0m34.139s user 0m30.782s sys 0m2.820s real 0m34.010s user 0m30.630s sys 0m2.976s real 0m34.777s user 0m30.658s sys 0m2.916s real 0m33.924s user 0m30.634s sys 0m2.924s real 0m33.962s user 0m30.774s sys 0m2.800s real 0m34.141s user 0m30.770s sys 0m2.828s avg : 34.16 std. dev. : 0.32 hackbench 50 Time: 2.999 Time: 2.994 Time: 3.004 Time: 2.991 Time: 2.988 avg : 3.00 std. dev. : 0.01 I did more runs (20 runs of each) to compare the nop case to the DS prefix case. Results in seconds. They actually does not seems to show a significant difference. NOP 34.155 33.955 34.012 35.299 35.679 34.141 33.995 35.016 34.254 33.957 33.957 34.008 35.013 34.494 33.893 34.295 34.314 34.854 33.991 34.132 DS 34.080 34.304 34.374 35.095 34.291 34.135 33.940 34.208 35.276 34.288 33.861 33.898 34.610 34.709 33.851 34.256 35.161 34.283 33.865 35.078 Used http://www.graphpad.com/quickcalcs/ttest1.cfm?Format=C to do the T-test (yeah, I'm lazy) : Group Group One (DS prefix) Group Two (nops) Mean 34.37815 34.37070 SD 0.46108 0.51905 SEM 0.10310 0.11606 N 20 20 P value and statistical significance: The two-tailed P value equals 0.9620 By conventional criteria, this difference is considered to be not statistically significant. Confidence interval: The mean of Group One minus Group Two equals 0.00745 95% confidence interval of this difference: From -0.30682 to 0.32172 Intermediate values used in calculations: t = 0.0480 df = 38 standard error of difference = 0.155 So, unless these calculus are completely bogus, the difference between the nop and the DS case seems not to be statistically significant. Signed-off-by: Mathieu Desnoyers Acked-by: H. Peter Anvin CC: Linus Torvalds CC: Jeremy Fitzhardinge CC: Roland McGrath CC: Ingo Molnar Cc: Steven Rostedt CC: Steven Rostedt CC: Thomas Gleixner CC: Peter Zijlstra CC: Andrew Morton CC: David Miller CC: Ulrich Drepper CC: Rusty Russell CC: Gregory Haskins CC: Arnaldo Carvalho de Melo CC: "Luis Claudio R. Goncalves" CC: Clark Williams CC: Christoph Lameter CC: Andi Kleen CC: Harvey Harrison Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..7ead11f3732d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -241,25 +241,25 @@ static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) continue; if (*ptr > text_end) continue; - text_poke(*ptr, ((unsigned char []){0xf0}), 1); /* add lock prefix */ + /* turn DS segment override prefix into lock prefix */ + text_poke(*ptr, ((unsigned char []){0xf0}), 1); }; } static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) { u8 **ptr; - char insn[1]; if (noreplace_smp) return; - add_nops(insn, 1); for (ptr = start; ptr < end; ptr++) { if (*ptr < text) continue; if (*ptr > text_end) continue; - text_poke(*ptr, insn, 1); + /* turn lock prefix into DS segment override prefix */ + text_poke(*ptr, ((unsigned char []){0x3E}), 1); }; } -- cgit v1.2.2 From 6f6da97faf29f87b3980a6992fb8cab44b4c444d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 23:05:19 +0400 Subject: x86: apic - sync_Arb_IDs style fixup No changes on binary level Signed-off-by: Cyrill Gorcunov Acked-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 +++- arch/x86/kernel/apic_64.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d07488993ee7..60575c05151a 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -909,13 +909,15 @@ void __init sync_Arb_IDs(void) */ if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return; + /* * Wait for idle. */ apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 41aff3460635..72e94ab0e364 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -755,7 +755,8 @@ void __init sync_Arb_IDs(void) apic_wait_icr_idle(); apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | APIC_INT_LEVELTRIG | APIC_DM_INIT); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); } /* -- cgit v1.2.2 From 638c0411922540deaf8797cacf73513b17618405 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 15 Aug 2008 23:05:18 +0400 Subject: x86: apic - unify init_bsp_APIC - remove redundant read of APIC_LVR register in 64bit mode - APIC is always integrated for 64bit mode so gcc will eliminate lapic_is_integrated call Signed-off-by: Cyrill Gorcunov Acked-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 +++- arch/x86/kernel/apic_64.c | 14 +++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 60575c05151a..16d9721788c1 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -925,7 +925,7 @@ void __init sync_Arb_IDs(void) */ void __init init_bsp_APIC(void) { - unsigned long value; + unsigned int value; /* * Don't do the setup now if we have a SMP BIOS as the @@ -946,11 +946,13 @@ void __init init_bsp_APIC(void) value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* This bit is reserved on P4/Xeon and should be cleared */ if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 == 15)) value &= ~APIC_SPIV_FOCUS_DISABLED; else +#endif value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 72e94ab0e364..99d18b8976a5 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -773,8 +773,6 @@ void __init init_bsp_APIC(void) if (smp_found_config || !cpu_has_apic) return; - value = apic_read(APIC_LVR); - /* * Do not trust the local APIC being empty at bootup. */ @@ -786,7 +784,15 @@ void __init init_bsp_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_VECTOR_MASK; value |= APIC_SPIV_APIC_ENABLED; - value |= APIC_SPIV_FOCUS_DISABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; value |= SPURIOUS_APIC_VECTOR; apic_write(APIC_SPIV, value); @@ -795,6 +801,8 @@ void __init init_bsp_APIC(void) */ apic_write(APIC_LVT0, APIC_DM_EXTINT); value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); } -- cgit v1.2.2 From 6764014bc8bb4849f6a4f336477e873ad5861ed2 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:50 +0400 Subject: x86: apic - unify clear_local_APIC - Remove redundant masking of APIC_LVTTHMR register in apic_32.c - Add masking of APIC_LVTTHMR register to apic_64.c. We use a bit complicated #ifdef here: CONFIG_X86_MCE_P4THERMAL is 32bit specific and X86_MCE_INTEL is 64bit specific so the appropriate config variable will be set by Kconfig. - the APIC_ESR register clearing in apic_64.c now uses not straightforward way but this is allowed tradeoff. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 +----- arch/x86/kernel/apic_64.c | 17 +++++++++++++++-- 2 files changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 16d9721788c1..3131603a4d6a 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -754,7 +754,7 @@ void clear_local_APIC(void) } /* lets not touch this if we didn't frob it */ -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) if (maxlvt >= 5) { v = apic_read(APIC_LVTTHMR); apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); @@ -771,10 +771,6 @@ void clear_local_APIC(void) if (maxlvt >= 4) apic_write(APIC_LVTPC, APIC_LVT_MASKED); -#ifdef CONFIG_X86_MCE_P4THERMAL - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, APIC_LVT_MASKED); -#endif /* Integrated APIC (!82489DX) ? */ if (lapic_is_integrated()) { if (maxlvt > 3) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 99d18b8976a5..d834b7583624 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -630,6 +630,13 @@ void clear_local_APIC(void) apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); } + /* lets not touch this if we didn't frob it */ +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif /* * Clean APIC state for other OSs: */ @@ -640,8 +647,14 @@ void clear_local_APIC(void) apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) apic_write(APIC_LVTPC, APIC_LVT_MASKED); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } } /** -- cgit v1.2.2 From 92206c909ad1d4f9c355b4842722d5a355d6b76b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:51 +0400 Subject: x86: apic - unify lapic_resume Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 29 ++++++++++++++++++----------- arch/x86/kernel/apic_64.c | 19 +++++++++++++++---- 2 files changed, 33 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3131603a4d6a..3d40213d3af2 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1606,16 +1606,21 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); +#ifdef CONFIG_X86_64 + if (x2apic) + enable_x2apic(); + else +#endif + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); @@ -1625,7 +1630,7 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -1639,7 +1644,9 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d834b7583624..e542a2d08de5 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1412,13 +1412,22 @@ static int lapic_resume(struct sys_device *dev) maxlvt = lapic_get_maxlvt(); local_irq_save(flags); - if (!x2apic) { + +#ifdef CONFIG_X86_64 + if (x2apic) + enable_x2apic(); + else +#endif + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; wrmsr(MSR_IA32_APICBASE, l, h); - } else - enable_x2apic(); apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); @@ -1428,7 +1437,7 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#ifdef CONFIG_X86_MCE_INTEL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); #endif @@ -1442,7 +1451,9 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); + local_irq_restore(flags); + return 0; } -- cgit v1.2.2 From 24968cfdd4c3cf61338495dd0f9bb8a6907d2087 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:52 +0400 Subject: x86: apic - unify lapic_suspend Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 +- arch/x86/kernel/apic_64.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3d40213d3af2..6cb8aaaf10f5 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1582,7 +1582,7 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_P4THERMAL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index e542a2d08de5..13dea935b4eb 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1390,10 +1390,11 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state) apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#ifdef CONFIG_X86_MCE_INTEL +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) if (maxlvt >= 5) apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); #endif + local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); -- cgit v1.2.2 From 274cfe5912eebe9d4e1a4c451fe617289a181fcf Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:53 +0400 Subject: x86: apic - rearrange functions and comments Rearrange functions and comments to find differences easier. Also use apic_printk in setup_boot_APIC_clock for 64bit mode. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 69 ++++++++++++++++++++++++++--------------------- arch/x86/kernel/apic_64.c | 48 ++++++++++++++++++++++++++------- 2 files changed, 77 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6cb8aaaf10f5..9e8702eebd46 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -251,6 +251,9 @@ int lapic_get_maxlvt(void) * this function twice on the boot CPU, once with a bogus timeout * value, second time for real. The other (noncalibrating) CPUs * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. */ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { @@ -279,6 +282,36 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_TMICT, clocks / APIC_DIVISOR); } +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} + /* * Program the next event, relative to now */ @@ -298,7 +331,7 @@ static void lapic_timer_setup(enum clock_event_mode mode, unsigned long flags; unsigned int v; - /* Lapic used for broadcast ? */ + /* Lapic used as dummy for broadcast ? */ if (evt->features & CLOCK_EVT_FEAT_DUMMY) return; @@ -680,35 +713,6 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} - /* * Local APIC start and shutdown */ @@ -1542,6 +1546,11 @@ void __cpuinit generic_processor_info(int apicid, int version) #ifdef CONFIG_PM static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ int active; /* r/w apic fields */ unsigned int apic_id; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 13dea935b4eb..46523c0cc6f6 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -81,6 +81,9 @@ static void lapic_timer_setup(enum clock_event_mode mode, static void lapic_timer_broadcast(cpumask_t mask); static void apic_pm_activate(void); +/* + * The local apic timer can be used for any function which is CPU local. + */ static struct clock_event_device lapic_clockevent = { .name = "lapic", .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT @@ -127,6 +130,11 @@ static int modern_apic(void) return lapic_get_version() >= 0x14; } +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ void xapic_wait_icr_idle(void) { while (apic_read(APIC_ICR) & APIC_ICR_BUSY) @@ -175,7 +183,6 @@ static struct apic_ops xapic_ops = { }; struct apic_ops __read_mostly *apic_ops = &xapic_ops; - EXPORT_SYMBOL_GPL(apic_ops); static void x2apic_wait_icr_idle(void) @@ -244,6 +251,10 @@ int lapic_get_maxlvt(void) return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; } +/* + * Local APIC timer + */ + /* Clock divisor is set to 1 */ #define APIC_DIVISOR 1 @@ -257,7 +268,6 @@ int lapic_get_maxlvt(void) * We do reads before writes even if unnecessary, to get around the * P5 APIC double write bug. */ - static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) { unsigned int lvtt_value, tmp_value; @@ -474,10 +484,10 @@ static int __init calibrate_APIC_clock(void) void __init setup_boot_APIC_clock(void) { /* - * The local apic timer can be disabled via the kernel commandline. - * Register the lapic timer as a dummy clock event source on SMP - * systems, so the broadcast mechanism is used. On UP systems simply - * ignore it. + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { printk(KERN_INFO "Disabling APIC timer\n"); @@ -489,7 +499,9 @@ void __init setup_boot_APIC_clock(void) return; } - printk(KERN_INFO "Using local APIC timer interrupts.\n"); + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + if (calibrate_APIC_clock()) { /* No broadcast on UP ! */ if (num_possible_cpus() > 1) @@ -508,6 +520,7 @@ void __init setup_boot_APIC_clock(void) printk(KERN_WARNING "APIC timer registered as dummy," " due to nmi_watchdog=%d!\n", nmi_watchdog); + /* Setup the lapic or request the broadcast */ setup_APIC_timer(); } @@ -577,6 +590,7 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) irq_enter(); local_apic_timer_interrupt(); irq_exit(); + set_irq_regs(old_regs); } @@ -1248,6 +1262,13 @@ void __init connect_bsp_APIC(void) enable_apic_mode(); } +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ void disconnect_bsp_APIC(int virt_wire_setup) { /* Go back to Virtual Wire compatibility mode */ @@ -1347,9 +1368,11 @@ int hard_smp_processor_id(void) #ifdef CONFIG_PM static struct { - /* 'active' is true if the local APIC was enabled by us and - not the BIOS; this signifies that we are also responsible - for disabling it before entering apm/acpi suspend */ + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ int active; /* r/w apic fields */ unsigned int apic_id; @@ -1458,6 +1481,11 @@ static int lapic_resume(struct sys_device *dev) return 0; } +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + static struct sysdev_class lapic_sysclass = { .name = "lapic", .resume = lapic_resume, -- cgit v1.2.2 From 9c803869f5f5e3d7ad94b2926474b2882e30073b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:54 +0400 Subject: x86: apic - unify lapic_is_integrated Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 ++++ arch/x86/kernel/apic_64.c | 6 +++++- 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 9e8702eebd46..41134d4e6a66 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -128,7 +128,11 @@ static inline int lapic_get_version(void) */ static inline int lapic_is_integrated(void) { +#ifdef CONFIG_X86_64 + return 1; +#else return APIC_INTEGRATED(lapic_get_version()); +#endif } /* diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 46523c0cc6f6..18c0b8cd2376 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -111,11 +111,15 @@ static inline int lapic_get_version(void) } /* - * Check, if the APIC is integrated or a seperate chip + * Check, if the APIC is integrated or a separate chip */ static inline int lapic_is_integrated(void) { +#ifdef CONFIG_X86_64 return 1; +#else + return APIC_INTEGRATED(lapic_get_version()); +#endif } /* -- cgit v1.2.2 From cf9768d7514d59b3179a886c4a47908d72e6cf76 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 16 Aug 2008 23:21:55 +0400 Subject: x86: apic - unify xapic_icr_read Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 18c0b8cd2376..5e0738131e58 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -174,7 +174,7 @@ u64 xapic_icr_read(void) icr2 = apic_read(APIC_ICR2); icr1 = apic_read(APIC_ICR); - return (icr1 | ((u64)icr2 << 32)); + return icr1 | ((u64)icr2 << 32); } static struct apic_ops xapic_ops = { -- cgit v1.2.2 From 7b22ff5344fda666e0938e5261ea7b9a3dfce497 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 18 Aug 2008 00:36:18 +0900 Subject: x86 gart: allocate size-aligned address for alloc_coherent, v2 This patch changes GART IOMMU to return a size aligned address wrt dma_alloc_coherent, as DMA-mapping.txt defines: The cpu return address and the DMA bus master address are both guaranteed to be aligned to the smallest PAGE_SIZE order which is greater than or equal to the requested size. This invariant exists (for example) to guarantee that if you allocate a chunk which is smaller than or equal to 64 kilobytes, the extent of the buffer you receive will not cross a 64K boundary. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index cdab67849074..4d8efb05428d 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -82,7 +82,8 @@ AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ static int need_flush; /* global flush state. set for each gart wrap */ -static unsigned long alloc_iommu(struct device *dev, int size) +static unsigned long alloc_iommu(struct device *dev, int size, + unsigned long align_mask) { unsigned long offset, flags; unsigned long boundary_size; @@ -95,11 +96,12 @@ static unsigned long alloc_iommu(struct device *dev, int size) spin_lock_irqsave(&iommu_bitmap_lock, flags); offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, align_mask); if (offset == -1) { need_flush = 1; offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, - size, base_index, boundary_size, 0); + size, base_index, boundary_size, + align_mask); } if (offset != -1) { next_bit = offset+size; @@ -236,10 +238,10 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) * Caller needs to check if the iommu is needed and flush. */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir) + size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(dev, npages); + unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; if (iommu_page == -1) { @@ -262,7 +264,11 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, static dma_addr_t gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) { - dma_addr_t map = dma_map_area(dev, paddr, size, dir); + dma_addr_t map; + unsigned long align_mask; + + align_mask = (1UL << get_order(size)) - 1; + map = dma_map_area(dev, paddr, size, dir, align_mask); flush_gart(); @@ -281,7 +287,8 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) if (!need_iommu(dev, paddr, size)) return paddr; - bus = gart_map_simple(dev, paddr, size, dir); + bus = dma_map_area(dev, paddr, size, dir, 0); + flush_gart(); return bus; } @@ -340,7 +347,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir); + addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_address) { if (i > 0) gart_unmap_sg(dev, sg, i, dir); @@ -362,7 +369,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { - unsigned long iommu_start = alloc_iommu(dev, pages); + unsigned long iommu_start = alloc_iommu(dev, pages, 0); unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; -- cgit v1.2.2 From d554d9a4295dd0595d12eeccbc55d1f495b15176 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:07:44 +0200 Subject: x86, tsc: fix section mismatch warning WARNING: vmlinux.o(.text+0x7950): Section mismatch in reference from the function native_calibrate_tsc() to the function .init.text:tsc_read_refs() The function native_calibrate_tsc() references the function __init tsc_read_refs(). This is often because native_calibrate_tsc lacks a __init annotation or the annotation of tsc_read_refs is wrong. tsc_read_refs is called from native_calibrate_tsc which is not __init and native_calibrate_tsc cannot be marked __init Signed-off-by: Marcin Slusarz Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7603c0553909..46af71676738 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read TSC and the reference counters. Take care of SMI disturbance */ -static u64 __init tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *pm, u64 *hpet) { u64 t1, t2; int i; -- cgit v1.2.2 From 67d0c9ebdc9f5f356657146b4e862b2d745e9e78 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:12:37 +0200 Subject: x86: fix MP_processor_info section mismatch warning WARNING: arch/x86/kernel/built-in.o(.cpuinit.text+0x1fe7): Section mismatch in reference from the function MP_processor_info() to the variable .init.data:x86_quirks The function __cpuinit MP_processor_info() references a variable __initdata x86_quirks. If x86_quirks is only used by MP_processor_info then annotate x86_quirks with a matching annotation. MP_processor_info uses x86_quirks which is __init and is used only from smp_read_mpc and construct_default_ISA_mptable which are __init Signed-off-by: Marcin Slusarz Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 678090508a62..9942fc72b63c 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -49,7 +49,7 @@ static int __init mpf_checksum(unsigned char *mp, int len) return sum & 0xFF; } -static void __cpuinit MP_processor_info(struct mpc_config_processor *m) +static void __init MP_processor_info(struct mpc_config_processor *m) { int apicid; char *bootup_cpu = ""; -- cgit v1.2.2 From c72a5efec1193faa2ef34c0bd48d7251a70ec934 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:11:13 +0200 Subject: x86: mmconf: fix section mismatch warning WARNING: arch/x86/kernel/built-in.o(.cpuinit.text+0x1591): Section mismatch in reference from the function init_amd() to the function .init.text:check_enable_amd_mmconf_dmi() The function __cpuinit init_amd() references a function __init check_enable_amd_mmconf_dmi(). If check_enable_amd_mmconf_dmi is only used by init_amd then annotate check_enable_amd_mmconf_dmi with a matching annotation. check_enable_amd_mmconf_dmi is only called from init_amd which is __cpuinit Signed-off-by: Marcin Slusarz Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/mmconf-fam10h_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index fdfdc550b366..efc2f361fe85 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -238,7 +238,7 @@ static struct dmi_system_id __devinitdata mmconf_dmi_table[] = { {} }; -void __init check_enable_amd_mmconf_dmi(void) +void __cpuinit check_enable_amd_mmconf_dmi(void) { dmi_check_system(mmconf_dmi_table); } -- cgit v1.2.2 From 39e00fe20aaad4326ed5e0e3221451732bc7f679 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Mon, 11 Aug 2008 00:09:38 +0200 Subject: x86: mpparse.c: fix section mismatch warning WARNING: vmlinux.o(.text+0x118f7): Section mismatch in reference from the function construct_ioapic_table() to the function .init.text:MP_bus_info() The function construct_ioapic_table() references the function __init MP_bus_info(). This is often because construct_ioapic_table lacks a __init annotation or the annotation of MP_bus_info is wrong. construct_ioapic_table is called only from construct_default_ISA_mptable which is __init Signed-off-by: Marcin Slusarz Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9942fc72b63c..b3fb430725cb 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -484,7 +484,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) } -static void construct_ioapic_table(int mpc_default_type) +static void __init construct_ioapic_table(int mpc_default_type) { struct mpc_config_ioapic ioapic; struct mpc_config_bus bus; @@ -529,7 +529,7 @@ static void construct_ioapic_table(int mpc_default_type) construct_default_ioirq_mptable(mpc_default_type); } #else -static inline void construct_ioapic_table(int mpc_default_type) { } +static inline void __init construct_ioapic_table(int mpc_default_type) { } #endif static inline void __init construct_default_ISA_mptable(int mpc_default_type) -- cgit v1.2.2 From e532c06f2a835b5cc4f4166f467437d9b09c1d0e Mon Sep 17 00:00:00 2001 From: David Fries Date: Sun, 17 Aug 2008 23:03:40 -0500 Subject: x86: fix i486 suspend to disk CR4 oops arch/x86/power/cpu_32.c __save_processor_state calls read_cr4() only a i486 CPU doesn't have the CR4 register. Trying to read it produces an invalid opcode oops during suspend to disk. Use the safe rc4 reading op instead. If the value to be written is zero the write is skipped. arch/x86/power/hibernate_asm_32.S done: swapped the use of %eax and %ecx to use jecxz for the zero test and jump over store to %cr4. restore_image: s/%ecx/%eax/ to be consistent with done: In addition to __save_processor_state, acpi_save_state_mem, efi_call_phys_prelog, and efi_call_phys_epilog had checks added (acpi restore was in assembly and already had a check for non-zero). There were other reads and writes of CR4, but MCE and virtualization shouldn't be executed on a i486 anyway. Signed-off-by: David Fries Acked-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/sleep.c | 2 +- arch/x86/kernel/efi_32.c | 4 ++-- arch/x86/power/cpu_32.c | 6 ++++-- arch/x86/power/hibernate_asm_32.S | 26 +++++++++++++++----------- 4 files changed, 22 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 81e5ab6542d8..426e5d91b63a 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -86,7 +86,7 @@ int acpi_save_state_mem(void) #endif /* !CONFIG_64BIT */ header->pmode_cr0 = read_cr0(); - header->pmode_cr4 = read_cr4(); + header->pmode_cr4 = read_cr4_safe(); header->realmode_flags = acpi_realmode_flags; header->real_magic = 0x12345678; diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/kernel/efi_32.c index 4b63c8e1f13b..5cab48ee61a4 100644 --- a/arch/x86/kernel/efi_32.c +++ b/arch/x86/kernel/efi_32.c @@ -53,7 +53,7 @@ void efi_call_phys_prelog(void) * directory. If I have PAE, I just need to duplicate one entry in * page directory. */ - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { efi_bak_pg_dir_pointer[0].pgd = @@ -91,7 +91,7 @@ void efi_call_phys_epilog(void) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); - cr4 = read_cr4(); + cr4 = read_cr4_safe(); if (cr4 & X86_CR4_PAE) { swapper_pg_dir[pgd_index(0)].pgd = diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 7dc5d5cf50a2..d3e083dea720 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -45,7 +45,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); - ctxt->cr4 = read_cr4(); + ctxt->cr4 = read_cr4_safe(); } /* Needed by apm.c */ @@ -98,7 +98,9 @@ static void __restore_processor_state(struct saved_context *ctxt) /* * control registers */ - write_cr4(ctxt->cr4); + /* cr4 was introduced in the Pentium CPU */ + if (ctxt->cr4) + write_cr4(ctxt->cr4); write_cr3(ctxt->cr3); write_cr2(ctxt->cr2); write_cr0(ctxt->cr0); diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index b95aa6cfe3cb..4fc7e872c85e 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -28,9 +28,9 @@ ENTRY(swsusp_arch_suspend) ret ENTRY(restore_image) - movl resume_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 + movl resume_pg_dir, %eax + subl $__PAGE_OFFSET, %eax + movl %eax, %cr3 movl restore_pblist, %edx .p2align 4,,7 @@ -52,17 +52,21 @@ copy_loop: done: /* go back to the original page tables */ - movl $swapper_pg_dir, %ecx - subl $__PAGE_OFFSET, %ecx - movl %ecx, %cr3 + movl $swapper_pg_dir, %eax + subl $__PAGE_OFFSET, %eax + movl %eax, %cr3 /* Flush TLB, including "global" things (vmalloc) */ - movl mmu_cr4_features, %eax - movl %eax, %edx + movl mmu_cr4_features, %ecx + jecxz 1f # cr4 Pentium and higher, skip if zero + movl %ecx, %edx andl $~(1<<7), %edx; # PGE movl %edx, %cr4; # turn off PGE - movl %cr3, %ecx; # flush TLB - movl %ecx, %cr3 - movl %eax, %cr4; # turn PGE back on +1: + movl %cr3, %eax; # flush TLB + movl %eax, %cr3 + jecxz 1f # cr4 Pentium and higher, skip if zero + movl %ecx, %cr4; # turn PGE back on +1: movl saved_context_esp, %esp movl saved_context_ebp, %ebp -- cgit v1.2.2 From d19fbfdfe6a7034c8b6a7062365780485ab5aeaa Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 17 Aug 2008 17:50:51 +0200 Subject: x86: silence section mismatch warning - get_local_pda Take out part of get_local_pda referencing __init function (free_bootmem) to new (static) function marked as __ref. It's safe to do because free_bootmem is called before __init sections are dropped. WARNING: vmlinux.o(.cpuinit.text+0x3cd7): Section mismatch in reference from the function get_local_pda() to the function .init.text:free_bootmem() The function __cpuinit get_local_pda() references a function __init free_bootmem(). If free_bootmem is only used by get_local_pda then annotate free_bootmem with a matching annotation. Signed-off-by: Marcin Slusarz Cc: Mike Travis Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a8fb8a980fae..e139e617f422 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -756,6 +756,14 @@ static void __cpuinit do_fork_idle(struct work_struct *work) } #ifdef CONFIG_X86_64 + +/* __ref because it's safe to call free_bootmem when after_bootmem == 0. */ +static void __ref free_bootmem_pda(struct x8664_pda *oldpda) +{ + if (!after_bootmem) + free_bootmem((unsigned long)oldpda, sizeof(*oldpda)); +} + /* * Allocate node local memory for the AP pda. * @@ -784,8 +792,7 @@ int __cpuinit get_local_pda(int cpu) if (oldpda) { memcpy(newpda, oldpda, size); - if (!after_bootmem) - free_bootmem((unsigned long)oldpda, size); + free_bootmem_pda(oldpda); } newpda->in_bootmem = 0; -- cgit v1.2.2 From f71066624d5d91bf179a1ea25da0800b72e20c60 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 17 Aug 2008 17:50:52 +0200 Subject: x86, calgary: fix section mismatch warning - get_tce_space_from_tar WARNING: vmlinux.o(.text+0x27032): Section mismatch in reference from the function get_tce_space_from_tar() to the function .init.text:calgary_bus_has_devices() The function get_tce_space_from_tar() references the function __init calgary_bus_has_devices(). This is often because get_tce_space_from_tar lacks a __init annotation or the annotation of calgary_bus_has_devices is wrong. get_tce_space_from_tar is called only from __init function (calgary_init) and calls __init function (calgary_bus_has_devices). So annotate it properly. Signed-off-by: Marcin Slusarz Cc: Chandru Siddalingappa Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 02d19328525d..218d783ed7a8 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1350,7 +1350,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) * Function for kdump case. Get the tce tables from first kernel * by reading the contents of the base adress register of calgary iommu */ -static void get_tce_space_from_tar(void) +static void __init get_tce_space_from_tar(void) { int bus; void __iomem *target; -- cgit v1.2.2 From 1b72691ce35812ff865d778f303779e774d2b098 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 18 Aug 2008 09:06:01 +0200 Subject: x86: fix build warnings in real mode code This recent patch commit c3965bd15118742d72b4bc1a290d37b3f081eb98 Author: Paul Jackson Date: Wed May 14 08:15:34 2008 -0700 x86 boot: proper use of ARRAY_SIZE instead of repeated E820MAX constant caused these new warnings during a normal build: In file included from linux-2.6/arch/x86/boot/memory.c:17: linux-2.6/include/linux/log2.h: In function '__ilog2_u32': linux-2.6/include/linux/log2.h:34: warning: implicit declaration of function 'fls' linux-2.6/include/linux/log2.h: In function '__ilog2_u64': linux-2.6/include/linux/log2.h:42: warning: implicit declaration of function 'fls64' linux-2.6/include/linux/log2.h: In function '__roundup_pow_of_two ': linux-2.6/include/linux/log2.h:63: warning: implicit declaration of function 'fls_long' I tried to fix them in log2.h, but it's difficult because the real mode environment is completely different from a normal kernel environment. Instead define an own ARRAY_SIZE macro in boot.h, similar to the other private macros there. Signed-off-by: Andi Kleen Signed-off-by: Ingo Molnar --- arch/x86/boot/boot.h | 2 ++ arch/x86/boot/memory.c | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 616b804a2295..cc0ef13fba7a 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -30,6 +30,8 @@ /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) +#define ARRAY_SIZE(x) (sizeof(x) / sizeof(*(x))) + extern struct setup_header hdr; extern struct boot_params boot_params; diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 53165c97336b..8c3c25f35578 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -13,7 +13,6 @@ */ #include "boot.h" -#include #define SMAP 0x534d4150 /* ASCII "SMAP" */ -- cgit v1.2.2 From d5e629a6f88137fb77c4cc857be5ea7c3f27110d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 17 Aug 2008 21:12:27 -0700 Subject: x86: apic - unify lapic_resume - fix Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5e0738131e58..ad532dccd11e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1446,6 +1446,7 @@ static int lapic_resume(struct sys_device *dev) enable_x2apic(); else #endif + { /* * Make sure the APICBASE points to the right address * @@ -1456,6 +1457,7 @@ static int lapic_resume(struct sys_device *dev) l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; wrmsr(MSR_IA32_APICBASE, l, h); + } apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); -- cgit v1.2.2 From 8bfcb3960fde049b863266dab8c3617bb5a541aa Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Mon, 18 Aug 2008 12:33:20 +0200 Subject: x86: make movsl_mask definition non-CPU specific movsl_mask is currently defined in arch/x86/kernel/cpu/intel.c, which contains code specific to Intel CPUs. However, movsl_mask is used in the non-CPU specific code in arch/x86/lib/usercopy_32.c, which breaks the compilation when support for Intel CPUs is compiled out. This patch solves this problem by moving movsl_mask's definition close to its users in arch/x86/lib/usercopy_32.c. Signed-off-by: Thomas Petazzoni Cc: michael@free-electrons.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 7 ------- arch/x86/lib/usercopy_32.c | 7 +++++++ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b75f2569b8f8..5c8959b8a42e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -23,13 +23,6 @@ #include #endif -#ifdef CONFIG_X86_INTEL_USERCOPY -/* - * Alignment at which movsl is preferred for bulk memory copies. - */ -struct movsl_mask movsl_mask __read_mostly; -#endif - static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 24e60944971a..9e68075544f6 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -14,6 +14,13 @@ #include #include +#ifdef CONFIG_X86_INTEL_USERCOPY +/* + * Alignment at which movsl is preferred for bulk memory copies. + */ +struct movsl_mask movsl_mask __read_mostly; +#endif + static inline int __movsl_is_ok(unsigned long a1, unsigned long a2, unsigned long n) { #ifdef CONFIG_X86_INTEL_USERCOPY -- cgit v1.2.2 From 774400a3ba23b63f4de39e67ce6c4e48935809dc Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Mon, 18 Aug 2008 12:33:21 +0200 Subject: x86: move cmpxchg fallbacks to a generic place arch/x86/kernel/cpu/intel.c defines a few fallback functions (cmpxchg_*()) that are used when the CPU doesn't support cmpxchg and/or cmpxchg64 natively. However, while defined in an Intel-specific file, these functions are also used for CPUs from other vendors when they don't support cmpxchg and/or cmpxchg64. This breaks the compilation when support for Intel CPUs is disabled. This patch moves these functions to a new arch/x86/kernel/cpu/cmpxchg.c file, unconditionally compiled when X86_32 is enabled. Signed-off-by: Thomas Petazzoni Cc: michael@free-electrons.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 2 +- arch/x86/kernel/cpu/cmpxchg.c | 72 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/intel.c | 64 -------------------------------------- 3 files changed, 73 insertions(+), 65 deletions(-) create mode 100644 arch/x86/kernel/cpu/cmpxchg.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index ee76eaad3001..25b4c063fbf6 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -5,7 +5,7 @@ obj-y := intel_cacheinfo.o addon_cpuid_features.o obj-y += proc.o feature_names.o -obj-$(CONFIG_X86_32) += common.o bugs.o +obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += common_64.o bugs_64.o obj-$(CONFIG_X86_32) += amd.o obj-$(CONFIG_X86_64) += amd_64.o diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c new file mode 100644 index 000000000000..2056ccf572cc --- /dev/null +++ b/arch/x86/kernel/cpu/cmpxchg.c @@ -0,0 +1,72 @@ +/* + * cmpxchg*() fallbacks for CPU not supporting these instructions + */ + +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG +unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) +{ + u8 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u8 *)ptr; + if (prev == old) + *(u8 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u8); + +unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) +{ + u16 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u16 *)ptr; + if (prev == old) + *(u16 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u16); + +unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) +{ + u32 prev; + unsigned long flags; + + /* Poor man's cmpxchg for 386. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u32 *)ptr; + if (prev == old) + *(u32 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_386_u32); +#endif + +#ifndef CONFIG_X86_CMPXCHG64 +unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) +{ + u64 prev; + unsigned long flags; + + /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ + local_irq_save(flags); + prev = *(u64 *)ptr; + if (prev == old) + *(u64 *)ptr = new; + local_irq_restore(flags); + return prev; +} +EXPORT_SYMBOL(cmpxchg_486_u64); +#endif + diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5c8959b8a42e..77618c717d76 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -307,69 +307,5 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - /* arch_initcall(intel_cpu_init); */ -- cgit v1.2.2 From 8d02c2110b3fb8e2700b31596a582a2989fd72ba Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Tue, 5 Aug 2008 11:45:19 +0200 Subject: x86: configuration options to compile out x86 CPU support code This patch adds some configuration options that allow to compile out CPU vendor-specific code in x86 kernels (in arch/x86/kernel/cpu). The new configuration options are only visible when CONFIG_EMBEDDED is selected, as they are mostly interesting for space savings reasons. An example of size saving, on x86 with only Intel CPU support: text data bss dec hex filename 1125479 118760 212992 1457231 163c4f vmlinux.old 1121355 116536 212992 1450883 162383 vmlinux -4124 -2224 0 -6348 -18CC +/- However, I'm not exactly sure that the Kconfig wording is correct with regard to !64BIT / 64BIT. [ mingo@elte.hu: convert macro to inline ] Signed-off-by: Thomas Petazzoni Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 70 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/Makefile | 19 ++++++------ 2 files changed, 80 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52ec..6156ac25ff8c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -415,3 +415,73 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + +menuconfig PROCESSOR_SELECT + default y + bool "Supported processor vendors" if EMBEDDED + help + This lets you choose what x86 vendor support code your kernel + will include. + +config CPU_SUP_INTEL_32 + default y + bool "Support Intel processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for Intel processors + +config CPU_SUP_INTEL_64 + default y + bool "Support Intel processors" if PROCESSOR_SELECT + depends on 64BIT + help + This enables extended support for Intel processors + +config CPU_SUP_CYRIX_32 + default y + bool "Support Cyrix processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for Cyrix processors + +config CPU_SUP_AMD_32 + default y + bool "Support AMD processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for AMD processors + +config CPU_SUP_AMD_64 + default y + bool "Support AMD processors" if PROCESSOR_SELECT + depends on 64BIT + help + This enables extended support for AMD processors + +config CPU_SUP_CENTAUR_32 + default y + bool "Support Centaur processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for Centaur processors + +config CPU_SUP_CENTAUR_64 + default y + bool "Support Centaur processors" if PROCESSOR_SELECT + depends on 64BIT + help + This enables extended support for Centaur processors + +config CPU_SUP_TRANSMETA_32 + default y + bool "Support Transmeta processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for Transmeta processors + +config CPU_SUP_UMC_32 + default y + bool "Support UMC processors" if PROCESSOR_SELECT + depends on !64BIT + help + This enables extended support for UMC processors diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 25b4c063fbf6..a0fc6c144384 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -7,15 +7,16 @@ obj-y += proc.o feature_names.o obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += common_64.o bugs_64.o -obj-$(CONFIG_X86_32) += amd.o -obj-$(CONFIG_X86_64) += amd_64.o -obj-$(CONFIG_X86_32) += cyrix.o -obj-$(CONFIG_X86_32) += centaur.o -obj-$(CONFIG_X86_64) += centaur_64.o -obj-$(CONFIG_X86_32) += transmeta.o -obj-$(CONFIG_X86_32) += intel.o -obj-$(CONFIG_X86_64) += intel_64.o -obj-$(CONFIG_X86_32) += umc.o + +obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o +obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o +obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o +obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o +obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o +obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o +obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o +obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ -- cgit v1.2.2 From b6c8051311e1a14d229df05ea39d0a1b2ce90cdd Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:49 +0400 Subject: x86: apic - rearrange maxcpu definition Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 ++-- arch/x86/kernel/apic_64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 41134d4e6a66..8d1febf05da1 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -114,6 +114,8 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static int enabled_via_apicbase; static unsigned long apic_phys; +unsigned int __cpuinitdata maxcpus = NR_CPUS; + /* * Get the LAPIC version @@ -1459,8 +1461,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) } } -unsigned int __cpuinitdata maxcpus = NR_CPUS; - void __cpuinit generic_processor_info(int apicid, int version) { int cpu; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ad532dccd11e..46acb9b47ae7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -98,10 +98,10 @@ static struct clock_event_device lapic_clockevent = { static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static unsigned long apic_phys; +unsigned int __cpuinitdata maxcpus = NR_CPUS; unsigned long mp_lapic_addr; -unsigned int __cpuinitdata maxcpus = NR_CPUS; /* * Get the LAPIC version */ -- cgit v1.2.2 From f1ee37891dab6014f6b0ec77b18bc07e2369a55f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:50 +0400 Subject: x86: apic - unify setup_boot_APIC_clock Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 8d1febf05da1..80db3e0426c4 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -612,6 +612,7 @@ void __init setup_boot_APIC_clock(void) * broadcast mechanism is used. On UP systems simply ignore it. */ if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); /* No broadcast on UP ! */ if (num_possible_cpus() > 1) { lapic_clockevent.mult = 1; -- cgit v1.2.2 From 990b183e58cb513a62492b6218987750e106cbfb Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:51 +0400 Subject: x86: apic - unify disable_local_APIC Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 +++- arch/x86/kernel/apic_64.c | 14 ++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 80db3e0426c4..13c4b79441da 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -796,7 +796,7 @@ void clear_local_APIC(void) */ void disable_local_APIC(void) { - unsigned long value; + unsigned int value; clear_local_APIC(); @@ -808,6 +808,7 @@ void disable_local_APIC(void) value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); +#ifdef CONFIG_X86_32 /* * When LAPIC was disabled by the BIOS and enabled by the kernel, * restore the disabled state. @@ -819,6 +820,7 @@ void disable_local_APIC(void) l &= ~MSR_IA32_APICBASE_ENABLE; wrmsr(MSR_IA32_APICBASE, l, h); } +#endif } /* diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 46acb9b47ae7..4fb903b2fc39 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -691,6 +691,20 @@ void disable_local_APIC(void) value = apic_read(APIC_SPIV); value &= ~APIC_SPIV_APIC_ENABLED; apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif } void lapic_shutdown(void) -- cgit v1.2.2 From fe4024dcb0c01e5399394d2807406a2c13fb1eb7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:52 +0400 Subject: x86: apic - unify lapic_shutdown Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 9 ++++++--- arch/x86/kernel/apic_64.c | 14 +++++++++++++- 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 13c4b79441da..d4efe86adc72 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -838,10 +838,13 @@ void lapic_shutdown(void) local_irq_save(flags); - if (enabled_via_apicbase) - disable_local_APIC(); - else +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) clear_local_APIC(); + else +#endif + disable_local_APIC(); + local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4fb903b2fc39..48806546d49f 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -707,6 +707,12 @@ void disable_local_APIC(void) #endif } +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ void lapic_shutdown(void) { unsigned long flags; @@ -716,7 +722,13 @@ void lapic_shutdown(void) local_irq_save(flags); - disable_local_APIC(); +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + local_irq_restore(flags); } -- cgit v1.2.2 From 36c9d6742897fa414f51c4e9d0f20ab4e6bf942c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:53 +0400 Subject: x86: apic - unify connect_bsp_APIC Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 ++ arch/x86/kernel/apic_64.c | 20 ++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index d4efe86adc72..6d230e9d0a7d 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1387,6 +1387,7 @@ void smp_error_interrupt(struct pt_regs *regs) */ void __init connect_bsp_APIC(void) { +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Do not trust the local APIC being empty at bootup. @@ -1401,6 +1402,7 @@ void __init connect_bsp_APIC(void) outb(0x70, 0x22); outb(0x01, 0x23); } +#endif enable_apic_mode(); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 48806546d49f..5579e213b5d2 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1285,10 +1285,26 @@ asmlinkage void smp_error_interrupt(void) } /** - * * connect_bsp_APIC - attach the APIC to the interrupt system - * */ + * connect_bsp_APIC - attach the APIC to the interrupt system + */ void __init connect_bsp_APIC(void) { +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +#endif enable_apic_mode(); } -- cgit v1.2.2 From c43da2f5e92fe3bcc256f0c0d6cb858368da5bd9 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:54 +0400 Subject: x86: apic - unify lapic_setup_esr We use 32bit code former for 64bit mode since it's much better implementation and easier to merge. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 21 +++++++++---------- arch/x86/kernel/apic_64.c | 51 ++++++++++++++++++++++++++++++++++++----------- 2 files changed, 50 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 6d230e9d0a7d..3c1562afa274 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -983,6 +983,16 @@ static void __cpuinit lapic_setup_esr(void) { unsigned long oldvalue, value, maxlvt; if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } /* !82489DX */ maxlvt = lapic_get_maxlvt(); if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ @@ -1003,16 +1013,7 @@ static void __cpuinit lapic_setup_esr(void) "vector: 0x%08lx after: 0x%08lx\n", oldvalue, value); } else { - if (esr_disable) - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - else - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "No ESR for 82489DX.\n"); } } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 5579e213b5d2..d74abf7e92f7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -863,6 +863,45 @@ void __init init_bsp_APIC(void) apic_write(APIC_LVT1, value); } +static void __cpuinit lapic_setup_esr(void) +{ + unsigned long oldvalue, value, maxlvt; + if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } + /* !82489DX */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + printk(KERN_INFO "No ESR for 82489DX.\n"); + } +} + + /** * setup_local_APIC - setup the local APIC */ @@ -968,18 +1007,6 @@ void __cpuinit setup_local_APIC(void) preempt_enable(); } -static void __cpuinit lapic_setup_esr(void) -{ - unsigned maxlvt = lapic_get_maxlvt(); - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); -} - void __cpuinit end_local_APIC_setup(void) { lapic_setup_esr(); -- cgit v1.2.2 From c40aaec6868401671a0ca14ed77e9b2da2d1f223 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:55 +0400 Subject: x86: apic - unify __setup_APIC_LVTT Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 10 +++++++--- arch/x86/kernel/apic_64.c | 12 ++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3c1562afa274..65419c7d437f 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -248,8 +248,12 @@ int lapic_get_maxlvt(void) * Local APIC timer */ -/* Clock divisor is set to 16 */ +/* Clock divisor */ +#ifdef CONFG_X86_64 +#define APIC_DIVISOR 1 +#else #define APIC_DIVISOR 16 +#endif /* * This function sets up the local APIC timer, with a timeout of @@ -281,8 +285,8 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) */ tmp_value = apic_read(APIC_TDCR); apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) apic_write(APIC_TMICT, clocks / APIC_DIVISOR); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d74abf7e92f7..fe57db9f3fbb 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -259,8 +259,12 @@ int lapic_get_maxlvt(void) * Local APIC timer */ -/* Clock divisor is set to 1 */ +/* Clock divisor */ +#ifdef CONFG_X86_64 #define APIC_DIVISOR 1 +#else +#define APIC_DIVISOR 16 +#endif /* * This function sets up the local APIC timer, with a timeout of @@ -291,9 +295,9 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) * Divide PICLK by 16 */ tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, (tmp_value - & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) - | APIC_TDR_DIV_16); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); if (!oneshot) apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -- cgit v1.2.2 From c177b0bc03e0e11623e2099db42903fb0caf0fd3 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:56 +0400 Subject: x86: apic - unify disconnect_bsp_APIC - just #ifdef added Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 66 ++++++++++++++++++++++++----------------------- arch/x86/kernel/apic_64.c | 23 +++++++++++++++-- 2 files changed, 55 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 65419c7d437f..3095bb71eaed 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1420,6 +1420,7 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { +#ifdef CONFIG_X86_32 if (pic_mode) { /* * Put the board back into PIC mode (has an effect only on @@ -1431,47 +1432,48 @@ void disconnect_bsp_APIC(int virt_wire_setup) "entering PIC mode.\n"); outb(0x70, 0x22); outb(0x00, 0x23); - } else { - /* Go back to Virtual Wire compatibility mode */ - unsigned long value; + return; + } +#endif - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); + /* Go back to Virtual Wire compatibility mode */ + unsigned int value; - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + if (!virt_wire_setup) { /* - * For LVT1 make it edge triggered, active high, nmi and - * enabled + * For LVT0 make it edge triggered, active high, + * external and enabled */ - value = apic_read(APIC_LVT1); - value &= ~( - APIC_MODE_MASK | APIC_SEND_PENDING | + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); } void __cpuinit generic_processor_info(int apicid, int version) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index fe57db9f3fbb..0d969103fce7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1348,8 +1348,24 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + return; + } +#endif + /* Go back to Virtual Wire compatibility mode */ - unsigned long value; + unsigned int value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); @@ -1375,7 +1391,10 @@ void disconnect_bsp_APIC(int virt_wire_setup) apic_write(APIC_LVT0, APIC_LVT_MASKED); } - /* For LVT1 make it edge triggered, active high, nmi and enabled */ + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ value = apic_read(APIC_LVT1); value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | -- cgit v1.2.2 From 1b313f4a6d7bee7b2c034b3f1e203bc360a71cca Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:57 +0400 Subject: x86: apic - generic_processor_info - use physid_set instead of phys_cpu and physids_or - set phys_cpu_present_map bit AFTER check for allowed number of processors - add checking for APIC valid version in 64bit mode (mostly not needed but added for merging purpose) - add apic_version definition for 64bit mode which is used now Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 15 ++++++++------- arch/x86/kernel/apic_64.c | 41 +++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 3095bb71eaed..c3a252b1a8be 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1480,7 +1480,6 @@ void __cpuinit generic_processor_info(int apicid, int version) { int cpu; cpumask_t tmp_map; - physid_mask_t phys_cpu; /* * Validate version @@ -1493,9 +1492,6 @@ void __cpuinit generic_processor_info(int apicid, int version) } apic_version[apicid] = version; - phys_cpu = apicid_to_cpu_present(apicid); - physids_or(phys_cpu_present_map, phys_cpu_present_map, phys_cpu); - if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." " Processor ignored.\n", NR_CPUS); @@ -1512,17 +1508,19 @@ void __cpuinit generic_processor_info(int apicid, int version) cpus_complement(tmp_map, cpu_present_map); cpu = first_cpu(tmp_map); - if (apicid == boot_cpu_physical_apicid) + physid_set(apicid, phys_cpu_present_map); + if (apicid == boot_cpu_physical_apicid) { /* * x86_bios_cpu_apicid is required to have processors listed * in same order as logical cpu numbers. Hence the first * entry is BSP, and so on. */ cpu = 0; - + } if (apicid > max_physical_apicid) max_physical_apicid = apicid; +#ifdef CONFIG_X86_32 /* * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y * but we need to work other dependencies like SMP_SUSPEND etc @@ -1542,7 +1540,9 @@ void __cpuinit generic_processor_info(int apicid, int version) def_to_bigsmp = 1; } } -#ifdef CONFIG_SMP +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) /* are we being called early in kernel startup? */ if (early_per_cpu_ptr(x86_cpu_to_apicid)) { u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); @@ -1555,6 +1555,7 @@ void __cpuinit generic_processor_info(int apicid, int version) per_cpu(x86_bios_cpu_apicid, cpu) = apicid; } #endif + cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 0d969103fce7..76c20773bedf 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1215,6 +1215,8 @@ void __init init_apic_mappings(void) * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ +int apic_version[MAX_APICS]; + int __init APIC_init_uniprocessor(void) { if (disable_apic) { @@ -1409,15 +1411,26 @@ void __cpuinit generic_processor_info(int apicid, int version) int cpu; cpumask_t tmp_map; + /* + * Validate version + */ + if (version == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); + version = 0x10; + } + apic_version[apicid] = version; + if (num_processors >= NR_CPUS) { printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); + " Processor ignored.\n", NR_CPUS); return; } if (num_processors >= maxcpus) { printk(KERN_WARNING "WARNING: maxcpus limit of %i reached." - " Processor ignored.\n", maxcpus); + " Processor ignored.\n", maxcpus); return; } @@ -1437,6 +1450,29 @@ void __cpuinit generic_processor_info(int apicid, int version) if (apicid > max_physical_apicid) max_physical_apicid = apicid; +#ifdef CONFIG_X86_32 + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) /* are we being called early in kernel startup? */ if (early_per_cpu_ptr(x86_cpu_to_apicid)) { u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); @@ -1448,6 +1484,7 @@ void __cpuinit generic_processor_info(int apicid, int version) per_cpu(x86_cpu_to_apicid, cpu) = apicid; per_cpu(x86_bios_cpu_apicid, cpu) = apicid; } +#endif cpu_set(cpu, cpu_possible_map); cpu_set(cpu, cpu_present_map); -- cgit v1.2.2 From fa6b95fc7c6f2f3eb1560e1f33cd13197546c5a0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:58 +0400 Subject: x86: apic - unify end_local_APIC_setup Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 ++++-- arch/x86/kernel/apic_64.c | 9 +++++++++ 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index c3a252b1a8be..f1882329b9cf 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1155,13 +1155,15 @@ void __cpuinit setup_local_APIC(void) void __cpuinit end_local_APIC_setup(void) { - unsigned long value; - lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + unsigned int value; /* Disable the local apic timer */ value = apic_read(APIC_LVTT); value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); apic_write(APIC_LVTT, value); +#endif setup_apic_nmi_watchdog(NULL); apic_pm_activate(); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 76c20773bedf..eec10b34dc49 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1014,6 +1014,15 @@ void __cpuinit setup_local_APIC(void) void __cpuinit end_local_APIC_setup(void) { lapic_setup_esr(); + +#ifdef CONFIG_X86_32 + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); +#endif + setup_apic_nmi_watchdog(NULL); apic_pm_activate(); } -- cgit v1.2.2 From 0b23e8cf553f5e706b0057363f1319867bcd1a7d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:45:59 +0400 Subject: x86: apic - unify local_apic_timer_interrupt Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 4 ++++ arch/x86/kernel/apic_64.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f1882329b9cf..af227bce23b1 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -685,7 +685,11 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ +#ifdef CONFIG_X86_64 + add_pda(apic_timer_irqs, 1); +#else per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif evt->event_handler(evt); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index eec10b34dc49..a9ad2cb4ff95 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -567,7 +567,11 @@ static void local_apic_timer_interrupt(void) /* * the NMI deadlock-detector uses this. */ +#ifdef CONFIG_X86_64 add_pda(apic_timer_irqs, 1); +#else + per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif evt->event_handler(evt); } -- cgit v1.2.2 From 79af9bec60e6e218a1e48e8830d603d64a7fc441 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:46:00 +0400 Subject: x86: apic - unify apic_set_verbosity Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 17 ++++++++++++++--- arch/x86/kernel/apic_64.c | 46 +++++++++++++++++++++++++--------------------- 2 files changed, 39 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index af227bce23b1..acbc4dea2ee8 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1768,13 +1768,24 @@ early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); static int __init apic_set_verbosity(char *arg) { - if (!arg) + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; +#endif return -EINVAL; + } - if (strcmp(arg, "debug") == 0) + if (strcmp("debug", arg) == 0) apic_verbosity = APIC_DEBUG; - else if (strcmp(arg, "verbose") == 0) + else if (strcmp("verbose", arg) == 0) apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } return 0; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index a9ad2cb4ff95..999781810c09 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1759,27 +1759,6 @@ early_param("nox2apic", setup_nox2apic); /* * APIC command line parameters */ -static int __init apic_set_verbosity(char *str) -{ - if (str == NULL) { - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; - } - if (strcmp("debug", str) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", str) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", str); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - static __init int setup_disableapic(char *str) { disable_apic = 1; @@ -1824,6 +1803,31 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + ioapic_force = 1; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + static int __init lapic_insert_resource(void) { if (!apic_phys) -- cgit v1.2.2 From 789fa735712d726b5cfa5c6be57171b5637a4872 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:46:01 +0400 Subject: x86: apic - unify disableapic and nolapic setup handlers Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 11 +++++++++-- arch/x86/kernel/apic_64.c | 6 +++--- 2 files changed, 12 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index acbc4dea2ee8..5680bda62f78 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1737,13 +1737,20 @@ static int __init parse_lapic(char *arg) } early_param("lapic", parse_lapic); -static int __init parse_nolapic(char *arg) +static int __init setup_disableapic(char *arg) { disable_apic = 1; setup_clear_cpu_cap(X86_FEATURE_APIC); return 0; } -early_param("nolapic", parse_nolapic); +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); static int __init parse_disable_apic_timer(char *arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 999781810c09..7a317180ba2e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1759,7 +1759,7 @@ early_param("nox2apic", setup_nox2apic); /* * APIC command line parameters */ -static __init int setup_disableapic(char *str) +static int __init setup_disableapic(char *arg) { disable_apic = 1; setup_clear_cpu_cap(X86_FEATURE_APIC); @@ -1768,9 +1768,9 @@ static __init int setup_disableapic(char *str) early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) +static int __init setup_nolapic(char *arg) { - return setup_disableapic(str); + return setup_disableapic(arg); } early_param("nolapic", setup_nolapic); -- cgit v1.2.2 From 3415610b8e38b26b52a430b8846665c2d6bb3558 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:46:02 +0400 Subject: x86: apic - rearrange parse_lapic_timer_c2_ok Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 5680bda62f78..885e306420ac 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1752,6 +1752,13 @@ static int __init setup_nolapic(char *arg) } early_param("nolapic", setup_nolapic); +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + static int __init parse_disable_apic_timer(char *arg) { disable_apic_timer = 1; @@ -1766,13 +1773,6 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - static int __init apic_set_verbosity(char *arg) { if (!arg) { -- cgit v1.2.2 From e75bedf415f300a08e9bbcc755784e488574a73e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 20:46:03 +0400 Subject: x86: apic - lapic_resume 32bit - unification fix Just add parenthesis to be identical of current 64bit implementation (so diff will not complain). Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 885e306420ac..e975562a76a5 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1646,6 +1646,7 @@ static int lapic_resume(struct sys_device *dev) enable_x2apic(); else #endif + { /* * Make sure the APICBASE points to the right address * @@ -1656,6 +1657,7 @@ static int lapic_resume(struct sys_device *dev) l &= ~MSR_IA32_APICBASE_BASE; l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; wrmsr(MSR_IA32_APICBASE, l, h); + } apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); apic_write(APIC_ID, apic_pm_state.apic_id); -- cgit v1.2.2 From 1b4ee4e4096d430c4c12516c1d30a6b0b4f9e9e4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 18 Aug 2008 23:12:33 +0400 Subject: x86: apic - compilation warnings fix Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 15 +++++++++------ arch/x86/kernel/apic_64.c | 15 +++++++++------ 2 files changed, 18 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index e975562a76a5..b8d80c291650 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1162,11 +1162,13 @@ void __cpuinit end_local_APIC_setup(void) lapic_setup_esr(); #ifdef CONFIG_X86_32 - unsigned int value; - /* Disable the local apic timer */ - value = apic_read(APIC_LVTT); - value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, value); + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } #endif setup_apic_nmi_watchdog(NULL); @@ -1426,6 +1428,8 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { + unsigned int value; + #ifdef CONFIG_X86_32 if (pic_mode) { /* @@ -1443,7 +1447,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) #endif /* Go back to Virtual Wire compatibility mode */ - unsigned int value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 7a317180ba2e..37e037606f30 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1020,11 +1020,13 @@ void __cpuinit end_local_APIC_setup(void) lapic_setup_esr(); #ifdef CONFIG_X86_32 - unsigned int value; - /* Disable the local apic timer */ - value = apic_read(APIC_LVTT); - value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, value); + { + unsigned int value; + /* Disable the local apic timer */ + value = apic_read(APIC_LVTT); + value |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, value); + } #endif setup_apic_nmi_watchdog(NULL); @@ -1363,6 +1365,8 @@ void __init connect_bsp_APIC(void) */ void disconnect_bsp_APIC(int virt_wire_setup) { + unsigned int value; + #ifdef CONFIG_X86_32 if (pic_mode) { /* @@ -1380,7 +1384,6 @@ void disconnect_bsp_APIC(int virt_wire_setup) #endif /* Go back to Virtual Wire compatibility mode */ - unsigned int value; /* For the spurious interrupt use vector F, and enable it */ value = apic_read(APIC_SPIV); -- cgit v1.2.2 From 467cd0529a480c9c25f06154b788d1d1ba77828a Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 18 Aug 2008 16:56:29 -0700 Subject: x86: early_printk.c trivial sparse fixes arch/x86/kernel/early_printk.c:404:13: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:404:13: expected restricted __le16 [assigned] [usertype] wValue arch/x86/kernel/early_printk.c:404:13: got int [signed] value arch/x86/kernel/early_printk.c:405:13: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:405:13: expected restricted __le16 [assigned] [usertype] wIndex arch/x86/kernel/early_printk.c:405:13: got int [signed] index arch/x86/kernel/early_printk.c:406:14: warning: incorrect type in assignment (different base types) arch/x86/kernel/early_printk.c:406:14: expected restricted __le16 [assigned] [usertype] wLength arch/x86/kernel/early_printk.c:406:14: got int [signed] size arch/x86/kernel/early_printk.c:845:16: warning: Using plain integer as NULL pointer arch/x86/kernel/early_printk.c:992:13: warning: symbol 'enable_debug_console' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 28155acf706e..825e9136b026 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -401,9 +401,9 @@ static int dbgp_control_msg(unsigned devnum, int requesttype, int request, /* Compute the control message */ req.bRequestType = requesttype; req.bRequest = request; - req.wValue = value; - req.wIndex = index; - req.wLength = size; + req.wValue = cpu_to_le16(value); + req.wIndex = cpu_to_le16(index); + req.wLength = cpu_to_le16(size); pids = DBGP_PID_SET(USB_PID_DATA0, USB_PID_SETUP); addr = DBGP_EPADDR(devnum, 0); @@ -842,7 +842,7 @@ static int __init early_dbgp_init(char *s) ret = ehci_setup(); if (ret < 0) { dbgp_printk("ehci_setup failed\n"); - ehci_debug = 0; + ehci_debug = NULL; return -1; } @@ -989,7 +989,7 @@ static int __init setup_early_printk(char *buf) return 0; } -void __init enable_debug_console(char *buf) +static void __init enable_debug_console(char *buf) { #ifdef DBGP_DEBUG struct console *old_early_console = NULL; -- cgit v1.2.2 From 8a7c5ef3ba8cca8f9c4f91ddef3d081f517914ae Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 19 Aug 2008 02:13:55 +0200 Subject: x86 iommu: remove unneeded parenthesis The parenthesis in __iommu_queue_command() are not needed when assigning into 'target' variable. Signed-off-by: Jiri Kosina Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index de39e1f2ede5..69b4d060b21c 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -65,7 +65,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) u8 *target; tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); - target = (iommu->cmd_buf + tail); + target = iommu->cmd_buf + tail; memcpy_toio(target, cmd, sizeof(*cmd)); tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); -- cgit v1.2.2 From e2fe16d91228a005811335fbc4fbad5d4f5b75af Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 15 Aug 2008 15:36:31 -0700 Subject: x86: boot: stub out unimplemented CPU feature words The CPU feature detection code in the boot code is somewhat minimal, and doesn't include all possible CPUID words. In particular, it doesn't contain the code for CPU feature words 2 (Transmeta), 3 (Linux-specific), 5 (VIA), or 7 (scattered). Zero them out, so we can still set those bits as known at compile time; in particular, this allows creating a Linux-specific NOPL flag and have it required (and therefore resolvable at compile time) in 64-bit mode. Signed-off-by: H. Peter Anvin --- arch/x86/boot/cpucheck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 7804389ee005..19b14f7ef9f1 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -46,12 +46,12 @@ static const u32 req_flags[NCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, - REQUIRED_MASK2, - REQUIRED_MASK3, + 0, /* REQUIRED_MASK2 not implemented in this file */ + 0, /* REQUIRED_MASK3 not implemented in this file */ REQUIRED_MASK4, - REQUIRED_MASK5, + 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, - REQUIRED_MASK7, + 0, /* REQUIRED_MASK7 not implemented in this file */ }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) -- cgit v1.2.2 From 7e00df5818964298c9821365a6cb7a8304227c5c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:39:32 -0700 Subject: x86: add NOPL as a synthetic CPU feature bit The long noops ("NOPL") are supposed to be detected by family >= 6. Unfortunately, several non-Intel x86 implementations, both hardware and software, don't obey this dictum. Instead, probe for NOPL directly by executing a NOPL instruction and see if we get #UD. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/common_64.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/feature_names.c | 3 ++- 3 files changed, 69 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..0785b3c8d043 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include @@ -341,6 +342,35 @@ static void __init early_cpu_detect(void) early_get_cap(c); } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -395,8 +425,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017e..c3afba5a81a7 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } + detect_nopl(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cba..c9017799497c 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ -- cgit v1.2.2 From f1c5d30e1d79bbfb60eaf189db862d3cb2bcac92 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:50:33 -0700 Subject: x86: use X86_FEATURE_NOPL in alternatives Use X86_FEATURE_NOPL to determine if it is safe to use P6 NOPs in alternatives. Also, replace table and loop with simple if statement. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..65a0c1b48696 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.2 From 20211e4d344729f4d4c93da37a590fc1c3a1fd9b Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Mon, 18 Aug 2008 21:25:38 +0200 Subject: x86: Coding style fixes to arch/x86/oprofile/op_model_p4.c A coding style patch to arch/x86/oprofile/op_model_p4.c that removes 87 errors and 4 warnings. Before: total: 89 errors, 13 warnings, 722 lines checked After: total: 2 errors, 9 warnings, 721 lines checked Compile tested, binary verified as follow: paolo@paolo-desktop:~/linux.trees.git$ size /tmp/op_model_p4.o.* text data bss dec hex filename 2691 968 32 3691 e6b /tmp/op_model_p4.o.after 2691 968 32 3691 e6b /tmp/op_model_p4.o.before paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/op_model_p4.o.* 8c1c9823bab33333e1f7f76574e62561 /tmp/op_model_p4.o.after 8c1c9823bab33333e1f7f76574e62561 /tmp/op_model_p4.o.before Signed-off-by: Paolo Ciarrocchi Cc: robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_p4.c | 175 ++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 56b4757a1f47..43ac5af338d8 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -10,11 +10,12 @@ #include #include +#include +#include #include -#include #include #include -#include + #include "op_x86_model.h" #include "op_counter.h" @@ -40,7 +41,7 @@ static unsigned int num_controls = NUM_CONTROLS_NON_HT; static inline void setup_num_counters(void) { #ifdef CONFIG_SMP - if (smp_num_siblings == 2){ + if (smp_num_siblings == 2) { num_counters = NUM_COUNTERS_HT2; num_controls = NUM_CONTROLS_HT2; } @@ -86,7 +87,7 @@ struct p4_event_binding { #define CTR_FLAME_2 (1 << 6) #define CTR_IQ_5 (1 << 7) -static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { +static struct p4_counter_binding p4_counters[NUM_COUNTERS_NON_HT] = { { CTR_BPU_0, MSR_P4_BPU_PERFCTR0, MSR_P4_BPU_CCCR0 }, { CTR_MS_0, MSR_P4_MS_PERFCTR0, MSR_P4_MS_CCCR0 }, { CTR_FLAME_0, MSR_P4_FLAME_PERFCTR0, MSR_P4_FLAME_CCCR0 }, @@ -97,32 +98,32 @@ static struct p4_counter_binding p4_counters [NUM_COUNTERS_NON_HT] = { { CTR_IQ_5, MSR_P4_IQ_PERFCTR5, MSR_P4_IQ_CCCR5 } }; -#define NUM_UNUSED_CCCRS NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT +#define NUM_UNUSED_CCCRS (NUM_CCCRS_NON_HT - NUM_COUNTERS_NON_HT) /* p4 event codes in libop/op_event.h are indices into this table. */ static struct p4_event_binding p4_events[NUM_EVENTS] = { - + { /* BRANCH_RETIRED */ - 0x05, 0x06, + 0x05, 0x06, { {CTR_IQ_4, MSR_P4_CRU_ESCR2}, {CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, - + { /* MISPRED_BRANCH_RETIRED */ - 0x04, 0x03, + 0x04, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, - + { /* TC_DELIVER_MODE */ 0x01, 0x01, - { { CTR_MS_0, MSR_P4_TC_ESCR0}, + { { CTR_MS_0, MSR_P4_TC_ESCR0}, { CTR_MS_2, MSR_P4_TC_ESCR1} } }, - + { /* BPU_FETCH_REQUEST */ - 0x00, 0x03, + 0x00, 0x03, { { CTR_BPU_0, MSR_P4_BPU_ESCR0}, { CTR_BPU_2, MSR_P4_BPU_ESCR1} } }, @@ -146,7 +147,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* LOAD_PORT_REPLAY */ - 0x02, 0x04, + 0x02, 0x04, { { CTR_FLAME_0, MSR_P4_SAAT_ESCR0}, { CTR_FLAME_2, MSR_P4_SAAT_ESCR1} } }, @@ -170,43 +171,43 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* BSQ_CACHE_REFERENCE */ - 0x07, 0x0c, + 0x07, 0x0c, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { CTR_BPU_2, MSR_P4_BSU_ESCR1} } }, { /* IOQ_ALLOCATION */ - 0x06, 0x03, + 0x06, 0x03, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { 0, 0 } } }, { /* IOQ_ACTIVE_ENTRIES */ - 0x06, 0x1a, + 0x06, 0x1a, { { CTR_BPU_2, MSR_P4_FSB_ESCR1}, { 0, 0 } } }, { /* FSB_DATA_ACTIVITY */ - 0x06, 0x17, + 0x06, 0x17, { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, { /* BSQ_ALLOCATION */ - 0x07, 0x05, + 0x07, 0x05, { { CTR_BPU_0, MSR_P4_BSU_ESCR0}, { 0, 0 } } }, { /* BSQ_ACTIVE_ENTRIES */ 0x07, 0x06, - { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, + { { CTR_BPU_2, MSR_P4_BSU_ESCR1 /* guess */}, { 0, 0 } } }, { /* X87_ASSIST */ - 0x05, 0x03, + 0x05, 0x03, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, @@ -216,21 +217,21 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* PACKED_SP_UOP */ - 0x01, 0x08, + 0x01, 0x08, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* PACKED_DP_UOP */ - 0x01, 0x0c, + 0x01, 0x0c, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* SCALAR_SP_UOP */ - 0x01, 0x0a, + 0x01, 0x0a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, @@ -242,31 +243,31 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* 64BIT_MMX_UOP */ - 0x01, 0x02, + 0x01, 0x02, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* 128BIT_MMX_UOP */ - 0x01, 0x1a, + 0x01, 0x1a, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, { /* X87_FP_UOP */ - 0x01, 0x04, + 0x01, 0x04, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* X87_SIMD_MOVES_UOP */ - 0x01, 0x2e, + 0x01, 0x2e, { { CTR_FLAME_0, MSR_P4_FIRM_ESCR0}, { CTR_FLAME_2, MSR_P4_FIRM_ESCR1} } }, - + { /* MACHINE_CLEAR */ - 0x05, 0x02, + 0x05, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR2}, { CTR_IQ_5, MSR_P4_CRU_ESCR3} } }, @@ -276,9 +277,9 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { { CTR_BPU_0, MSR_P4_FSB_ESCR0}, { CTR_BPU_2, MSR_P4_FSB_ESCR1} } }, - + { /* TC_MS_XFER */ - 0x00, 0x05, + 0x00, 0x05, { { CTR_MS_0, MSR_P4_MS_ESCR0}, { CTR_MS_2, MSR_P4_MS_ESCR1} } }, @@ -308,7 +309,7 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { }, { /* INSTR_RETIRED */ - 0x04, 0x02, + 0x04, 0x02, { { CTR_IQ_4, MSR_P4_CRU_ESCR0}, { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, @@ -319,14 +320,14 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { { CTR_IQ_5, MSR_P4_CRU_ESCR1} } }, - { /* UOP_TYPE */ - 0x02, 0x02, + { /* UOP_TYPE */ + 0x02, 0x02, { { CTR_IQ_4, MSR_P4_RAT_ESCR0}, { CTR_IQ_5, MSR_P4_RAT_ESCR1} } }, { /* RETIRED_MISPRED_BRANCH_TYPE */ - 0x02, 0x05, + 0x02, 0x05, { { CTR_MS_0, MSR_P4_TBPU_ESCR0}, { CTR_MS_2, MSR_P4_TBPU_ESCR1} } }, @@ -349,8 +350,8 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define ESCR_SET_OS_1(escr, os) ((escr) |= (((os) & 1) << 1)) #define ESCR_SET_EVENT_SELECT(escr, sel) ((escr) |= (((sel) & 0x3f) << 25)) #define ESCR_SET_EVENT_MASK(escr, mask) ((escr) |= (((mask) & 0xffff) << 9)) -#define ESCR_READ(escr,high,ev,i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) -#define ESCR_WRITE(escr,high,ev,i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high));} while (0) +#define ESCR_READ(escr, high, ev, i) do {rdmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) +#define ESCR_WRITE(escr, high, ev, i) do {wrmsr(ev->bindings[(i)].escr_address, (escr), (high)); } while (0) #define CCCR_RESERVED_BITS 0x38030FFF #define CCCR_CLEAR(cccr) ((cccr) &= CCCR_RESERVED_BITS) @@ -360,15 +361,15 @@ static struct p4_event_binding p4_events[NUM_EVENTS] = { #define CCCR_SET_PMI_OVF_1(cccr) ((cccr) |= (1<<27)) #define CCCR_SET_ENABLE(cccr) ((cccr) |= (1<<12)) #define CCCR_SET_DISABLE(cccr) ((cccr) &= ~(1<<12)) -#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) -#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high));} while (0) +#define CCCR_READ(low, high, i) do {rdmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) +#define CCCR_WRITE(low, high, i) do {wrmsr(p4_counters[(i)].cccr_address, (low), (high)); } while (0) #define CCCR_OVF_P(cccr) ((cccr) & (1U<<31)) #define CCCR_CLEAR_OVF(cccr) ((cccr) &= (~(1U<<31))) -#define CTRL_IS_RESERVED(msrs,c) (msrs->controls[(c)].addr ? 1 : 0) -#define CTR_IS_RESERVED(msrs,c) (msrs->counters[(c)].addr ? 1 : 0) -#define CTR_READ(l,h,i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h));} while (0) -#define CTR_WRITE(l,i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1);} while (0) +#define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) +#define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) +#define CTR_READ(l, h, i) do {rdmsr(p4_counters[(i)].counter_address, (l), (h)); } while (0) +#define CTR_WRITE(l, i) do {wrmsr(p4_counters[(i)].counter_address, -(u32)(l), -1); } while (0) #define CTR_OVERFLOW_P(ctr) (!((ctr) & 0x80000000)) @@ -380,7 +381,7 @@ static unsigned int get_stagger(void) #ifdef CONFIG_SMP int cpu = smp_processor_id(); return (cpu != first_cpu(per_cpu(cpu_sibling_map, cpu))); -#endif +#endif return 0; } @@ -395,25 +396,23 @@ static unsigned long reset_value[NUM_COUNTERS_NON_HT]; static void p4_fill_in_addresses(struct op_msrs * const msrs) { - unsigned int i; + unsigned int i; unsigned int addr, cccraddr, stag; setup_num_counters(); stag = get_stagger(); /* initialize some registers */ - for (i = 0; i < num_counters; ++i) { + for (i = 0; i < num_counters; ++i) msrs->counters[i].addr = 0; - } - for (i = 0; i < num_controls; ++i) { + for (i = 0; i < num_controls; ++i) msrs->controls[i].addr = 0; - } - + /* the counter & cccr registers we pay attention to */ for (i = 0; i < num_counters; ++i) { addr = p4_counters[VIRT_CTR(stag, i)].counter_address; cccraddr = p4_counters[VIRT_CTR(stag, i)].cccr_address; - if (reserve_perfctr_nmi(addr)){ + if (reserve_perfctr_nmi(addr)) { msrs->counters[i].addr = addr; msrs->controls[i].addr = cccraddr; } @@ -447,22 +446,22 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs) if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } - + for (addr = MSR_P4_MS_ESCR0 + stag; - addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { + addr <= MSR_P4_TC_ESCR1; ++i, addr += addr_increment()) { if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } - + for (addr = MSR_P4_IX_ESCR0 + stag; - addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { + addr <= MSR_P4_CRU_ESCR3; ++i, addr += addr_increment()) { if (reserve_evntsel_nmi(addr)) msrs->controls[i].addr = addr; } /* there are 2 remaining non-contiguously located ESCRs */ - if (num_counters == NUM_COUNTERS_NON_HT) { + if (num_counters == NUM_COUNTERS_NON_HT) { /* standard non-HT CPUs handle both remaining ESCRs*/ if (reserve_evntsel_nmi(MSR_P4_CRU_ESCR5)) msrs->controls[i++].addr = MSR_P4_CRU_ESCR5; @@ -498,20 +497,20 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) unsigned int stag; stag = get_stagger(); - + /* convert from counter *number* to counter *bit* */ counter_bit = 1 << VIRT_CTR(stag, ctr); - + /* find our event binding structure. */ if (counter_config[ctr].event <= 0 || counter_config[ctr].event > NUM_EVENTS) { - printk(KERN_ERR - "oprofile: P4 event code 0x%lx out of range\n", + printk(KERN_ERR + "oprofile: P4 event code 0x%lx out of range\n", counter_config[ctr].event); return; } - + ev = &(p4_events[counter_config[ctr].event - 1]); - + for (i = 0; i < maxbind; i++) { if (ev->bindings[i].virt_counter & counter_bit) { @@ -526,25 +525,24 @@ static void pmc_setup_one_p4_counter(unsigned int ctr) ESCR_SET_OS_1(escr, counter_config[ctr].kernel); } ESCR_SET_EVENT_SELECT(escr, ev->event_select); - ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); + ESCR_SET_EVENT_MASK(escr, counter_config[ctr].unit_mask); ESCR_WRITE(escr, high, ev, i); - + /* modify CCCR */ CCCR_READ(cccr, high, VIRT_CTR(stag, ctr)); CCCR_CLEAR(cccr); CCCR_SET_REQUIRED_BITS(cccr); CCCR_SET_ESCR_SELECT(cccr, ev->escr_select); - if (stag == 0) { + if (stag == 0) CCCR_SET_PMI_OVF_0(cccr); - } else { + else CCCR_SET_PMI_OVF_1(cccr); - } CCCR_WRITE(cccr, high, VIRT_CTR(stag, ctr)); return; } } - printk(KERN_ERR + printk(KERN_ERR "oprofile: P4 event code 0x%lx no binding, stag %d ctr %d\n", counter_config[ctr].event, stag, ctr); } @@ -559,14 +557,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) stag = get_stagger(); rdmsr(MSR_IA32_MISC_ENABLE, low, high); - if (! MISC_PMC_ENABLED_P(low)) { + if (!MISC_PMC_ENABLED_P(low)) { printk(KERN_ERR "oprofile: P4 PMC not available\n"); return; } /* clear the cccrs we will use */ for (i = 0 ; i < num_counters ; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; rdmsr(p4_counters[VIRT_CTR(stag, i)].cccr_address, low, high); CCCR_CLEAR(low); @@ -576,14 +574,14 @@ static void p4_setup_ctrs(struct op_msrs const * const msrs) /* clear all escrs (including those outside our concern) */ for (i = num_counters; i < num_controls; i++) { - if (unlikely(!CTRL_IS_RESERVED(msrs,i))) + if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; wrmsr(msrs->controls[i].addr, 0, 0); } /* setup all counters */ for (i = 0 ; i < num_counters ; ++i) { - if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs,i))) { + if ((counter_config[i].enabled) && (CTRL_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; pmc_setup_one_p4_counter(i); CTR_WRITE(counter_config[i].count, VIRT_CTR(stag, i)); @@ -603,11 +601,11 @@ static int p4_check_ctrs(struct pt_regs * const regs, stag = get_stagger(); for (i = 0; i < num_counters; ++i) { - - if (!reset_value[i]) + + if (!reset_value[i]) continue; - /* + /* * there is some eccentricity in the hardware which * requires that we perform 2 extra corrections: * @@ -616,24 +614,24 @@ static int p4_check_ctrs(struct pt_regs * const regs, * * - write the counter back twice to ensure it gets * updated properly. - * + * * the former seems to be related to extra NMIs happening * during the current NMI; the latter is reported as errata * N15 in intel doc 249199-029, pentium 4 specification * update, though their suggested work-around does not * appear to solve the problem. */ - + real = VIRT_CTR(stag, i); CCCR_READ(low, high, real); - CTR_READ(ctr, high, real); + CTR_READ(ctr, high, real); if (CCCR_OVF_P(low) || CTR_OVERFLOW_P(ctr)) { oprofile_add_sample(regs, i); - CTR_WRITE(reset_value[i], real); + CTR_WRITE(reset_value[i], real); CCCR_CLEAR_OVF(low); CCCR_WRITE(low, high, real); - CTR_WRITE(reset_value[i], real); + CTR_WRITE(reset_value[i], real); } } @@ -683,15 +681,16 @@ static void p4_shutdown(struct op_msrs const * const msrs) int i; for (i = 0 ; i < num_counters ; ++i) { - if (CTR_IS_RESERVED(msrs,i)) + if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(msrs->counters[i].addr); } - /* some of the control registers are specially reserved in + /* + * some of the control registers are specially reserved in * conjunction with the counter registers (hence the starting offset). * This saves a few bits. */ for (i = num_counters ; i < num_controls ; ++i) { - if (CTRL_IS_RESERVED(msrs,i)) + if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(msrs->controls[i].addr); } } -- cgit v1.2.2 From f607e3a03c90e8c050cb0c12ec9967c2925cc812 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 19 Aug 2008 13:34:59 -0700 Subject: Revert "[CPUFREQ][2/2] preregister support for powernow-k8" This reverts commit 34ae7f35a21694aa5cb8829dc5142c39d73d6ba0, which has been reported to cause a number of problems. During suspend and resume, it apparently causes a crash in a CPU hotplug notifier to happen, although the exact details are sketchy because of the inability to get good traces during the suspend sequence. See buzilla entries http://bugzilla.kernel.org/show_bug.cgi?id=11296 http://bugzilla.kernel.org/show_bug.cgi?id=11339 for more examples and details. [ Mark: "Revert the patch for now. I'm still looking into getting a reliable reproduction and I do not have a fix at this time." ] Requested-by: Rafael J. Wysocki Acked-by: Mark Langsdorf Acked-by: Dave Jones Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 109 ++++++++++-------------------- arch/x86/kernel/cpu/cpufreq/powernow-k8.h | 3 +- 2 files changed, 37 insertions(+), 75 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 4e7271999a74..84bb395038d8 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -737,63 +737,44 @@ static int find_psb_table(struct powernow_k8_data *data) #ifdef CONFIG_X86_POWERNOW_K8_ACPI static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { - if (!data->acpi_data->state_count || (cpu_family == CPU_HW_PSTATE)) + if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) return; - data->irt = (data->acpi_data->states[index].control >> IRT_SHIFT) & IRT_MASK; - data->rvo = (data->acpi_data->states[index].control >> RVO_SHIFT) & RVO_MASK; - data->exttype = (data->acpi_data->states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; - data->plllock = (data->acpi_data->states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; - data->vidmvs = 1 << ((data->acpi_data->states[index].control >> MVS_SHIFT) & MVS_MASK); - data->vstable = (data->acpi_data->states[index].control >> VST_SHIFT) & VST_MASK; -} - - -static struct acpi_processor_performance *acpi_perf_data; -static int preregister_valid; - -static int powernow_k8_cpu_preinit_acpi(void) -{ - acpi_perf_data = alloc_percpu(struct acpi_processor_performance); - if (!acpi_perf_data) - return -ENODEV; - - if (acpi_processor_preregister_performance(acpi_perf_data)) - return -ENODEV; - else - preregister_valid = 1; - return 0; + data->irt = (data->acpi_data.states[index].control >> IRT_SHIFT) & IRT_MASK; + data->rvo = (data->acpi_data.states[index].control >> RVO_SHIFT) & RVO_MASK; + data->exttype = (data->acpi_data.states[index].control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK; + data->plllock = (data->acpi_data.states[index].control >> PLL_L_SHIFT) & PLL_L_MASK; + data->vidmvs = 1 << ((data->acpi_data.states[index].control >> MVS_SHIFT) & MVS_MASK); + data->vstable = (data->acpi_data.states[index].control >> VST_SHIFT) & VST_MASK; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { struct cpufreq_frequency_table *powernow_table; int ret_val; - int cpu = 0; - data->acpi_data = percpu_ptr(acpi_perf_data, cpu); - if (acpi_processor_register_performance(data->acpi_data, data->cpu)) { + if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { dprintk("register performance failed: bad ACPI data\n"); return -EIO; } /* verify the data contained in the ACPI structures */ - if (data->acpi_data->state_count <= 1) { + if (data->acpi_data.state_count <= 1) { dprintk("No ACPI P-States\n"); goto err_out; } - if ((data->acpi_data->control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || - (data->acpi_data->status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { + if ((data->acpi_data.control_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE) || + (data->acpi_data.status_register.space_id != ACPI_ADR_SPACE_FIXED_HARDWARE)) { dprintk("Invalid control/status registers (%x - %x)\n", - data->acpi_data->control_register.space_id, - data->acpi_data->status_register.space_id); + data->acpi_data.control_register.space_id, + data->acpi_data.status_register.space_id); goto err_out; } /* fill in data->powernow_table */ powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table) - * (data->acpi_data->state_count + 1)), GFP_KERNEL); + * (data->acpi_data.state_count + 1)), GFP_KERNEL); if (!powernow_table) { dprintk("powernow_table memory alloc failure\n"); goto err_out; @@ -806,12 +787,12 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) if (ret_val) goto err_out_mem; - powernow_table[data->acpi_data->state_count].frequency = CPUFREQ_TABLE_END; - powernow_table[data->acpi_data->state_count].index = 0; + powernow_table[data->acpi_data.state_count].frequency = CPUFREQ_TABLE_END; + powernow_table[data->acpi_data.state_count].index = 0; data->powernow_table = powernow_table; /* fill in data */ - data->numps = data->acpi_data->state_count; + data->numps = data->acpi_data.state_count; if (first_cpu(per_cpu(cpu_core_map, data->cpu)) == data->cpu) print_basics(data); powernow_k8_acpi_pst_values(data, 0); @@ -819,31 +800,16 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) /* notify BIOS that we exist */ acpi_processor_notify_smm(THIS_MODULE); - /* determine affinity, from ACPI if available */ - if (preregister_valid) { - if ((data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ALL) || - (data->acpi_data->shared_type == CPUFREQ_SHARED_TYPE_ANY)) - data->starting_core_affinity = data->acpi_data->shared_cpu_map; - else - data->starting_core_affinity = cpumask_of_cpu(data->cpu); - } else { - /* best guess from family if not */ - if (cpu_family == CPU_HW_PSTATE) - data->starting_core_affinity = cpumask_of_cpu(data->cpu); - else - data->starting_core_affinity = per_cpu(cpu_core_map, data->cpu); - } - return 0; err_out_mem: kfree(powernow_table); err_out: - acpi_processor_unregister_performance(data->acpi_data, data->cpu); + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); /* data->acpi_data.state_count informs us at ->exit() whether ACPI was used */ - data->acpi_data->state_count = 0; + data->acpi_data.state_count = 0; return -ENODEV; } @@ -855,10 +821,10 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; - for (i = 0; i < data->acpi_data->state_count; i++) { + for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; - index = data->acpi_data->states[i].control & HW_PSTATE_MASK; + index = data->acpi_data.states[i].control & HW_PSTATE_MASK; if (index > data->max_hw_pstate) { printk(KERN_ERR PFX "invalid pstate %d - bad value %d.\n", i, index); printk(KERN_ERR PFX "Please report to BIOS manufacturer\n"); @@ -874,7 +840,7 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpuf powernow_table[i].index = index; - powernow_table[i].frequency = data->acpi_data->states[i].core_frequency * 1000; + powernow_table[i].frequency = data->acpi_data.states[i].core_frequency * 1000; } return 0; } @@ -883,16 +849,16 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf { int i; int cntlofreq = 0; - for (i = 0; i < data->acpi_data->state_count; i++) { + for (i = 0; i < data->acpi_data.state_count; i++) { u32 fid; u32 vid; if (data->exttype) { - fid = data->acpi_data->states[i].status & EXT_FID_MASK; - vid = (data->acpi_data->states[i].status >> VID_SHIFT) & EXT_VID_MASK; + fid = data->acpi_data.states[i].status & EXT_FID_MASK; + vid = (data->acpi_data.states[i].status >> VID_SHIFT) & EXT_VID_MASK; } else { - fid = data->acpi_data->states[i].control & FID_MASK; - vid = (data->acpi_data->states[i].control >> VID_SHIFT) & VID_MASK; + fid = data->acpi_data.states[i].control & FID_MASK; + vid = (data->acpi_data.states[i].control >> VID_SHIFT) & VID_MASK; } dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid); @@ -933,10 +899,10 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf cntlofreq = i; } - if (powernow_table[i].frequency != (data->acpi_data->states[i].core_frequency * 1000)) { + if (powernow_table[i].frequency != (data->acpi_data.states[i].core_frequency * 1000)) { printk(KERN_INFO PFX "invalid freq entries %u kHz vs. %u kHz\n", powernow_table[i].frequency, - (unsigned int) (data->acpi_data->states[i].core_frequency * 1000)); + (unsigned int) (data->acpi_data.states[i].core_frequency * 1000)); powernow_table[i].frequency = CPUFREQ_ENTRY_INVALID; continue; } @@ -946,12 +912,11 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpuf static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { - if (data->acpi_data->state_count) - acpi_processor_unregister_performance(data->acpi_data, data->cpu); + if (data->acpi_data.state_count) + acpi_processor_unregister_performance(&data->acpi_data, data->cpu); } #else -static int powernow_k8_cpu_preinit_acpi(void) { return -ENODEV; } static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data) { return -ENODEV; } static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data) { return; } static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index) { return; } @@ -1136,7 +1101,7 @@ static int powernowk8_verify(struct cpufreq_policy *pol) static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) { struct powernow_k8_data *data; - cpumask_t oldmask = CPU_MASK_ALL; + cpumask_t oldmask; int rc; if (!cpu_online(pol->cpu)) @@ -1209,7 +1174,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* run on any CPU again */ set_cpus_allowed_ptr(current, &oldmask); - pol->cpus = data->starting_core_affinity; + if (cpu_family == CPU_HW_PSTATE) + pol->cpus = cpumask_of_cpu(pol->cpu); + else + pol->cpus = per_cpu(cpu_core_map, pol->cpu); data->available_cores = &(pol->cpus); /* Take a crude guess here. @@ -1332,7 +1300,6 @@ static int __cpuinit powernowk8_init(void) } if (supported_cpus == num_online_cpus()) { - powernow_k8_cpu_preinit_acpi(); printk(KERN_INFO PFX "Found %d %s " "processors (%d cpu cores) (" VERSION ")\n", num_online_nodes(), @@ -1349,10 +1316,6 @@ static void __exit powernowk8_exit(void) dprintk("exit\n"); cpufreq_unregister_driver(&cpufreq_amd64_driver); - -#ifdef CONFIG_X86_POWERNOW_K8_ACPI - free_percpu(acpi_perf_data); -#endif } MODULE_AUTHOR("Paul Devriendt and Mark Langsdorf "); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index a62612cd4be8..ab48cfed4d96 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -33,13 +33,12 @@ struct powernow_k8_data { #ifdef CONFIG_X86_POWERNOW_K8_ACPI /* the acpi table needs to be kept. it's only available if ACPI was * used to determine valid frequency/vid/fid states */ - struct acpi_processor_performance *acpi_data; + struct acpi_processor_performance acpi_data; #endif /* we need to keep track of associated cores, but let cpufreq * handle hotplug events - so just point at cpufreq pol->cpus * structure */ cpumask_t *available_cores; - cpumask_t starting_core_affinity; }; -- cgit v1.2.2 From c6744955d0ec0cb485c28c51eeb7185e260f6172 Mon Sep 17 00:00:00 2001 From: Samuel Sieb Date: Wed, 6 Aug 2008 22:06:29 -0700 Subject: x86: fix "kernel won't boot on a Cyrix MediaGXm (Geode)" Cyrix MediaGXm/Cx5530 Unicorn Revision 1.19.3B has stopped booting starting at v2.6.22. The reason is this commit: > commit f25f64ed5bd3c2932493681bdfdb483ea707da0a > Author: Juergen Beisert > Date: Sun Jul 22 11:12:38 2007 +0200 > > x86: Replace NSC/Cyrix specific chipset access macros by inlined functions. this commit activated a macro which was dormant before due to (buggy) macro side-effects. I've looked through various datasheets and found that the GXm and GXLV Geode processors don't have an incrementor. Remove the incrementor setup entirely. As the incrementor value differs according to clock speed and we would hope that the BIOS configures it correctly, it is probably the right solution. Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cyrix.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3fd7a67bb06a..e710a21bb6e8 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -134,23 +134,6 @@ static void __cpuinit set_cx86_memwb(void) setCx86(CX86_CCR2, getCx86(CX86_CCR2) | 0x14); } -static void __cpuinit set_cx86_inc(void) -{ - unsigned char ccr3; - - printk(KERN_INFO "Enable Incrementor on Cyrix/NSC processor.\n"); - - ccr3 = getCx86(CX86_CCR3); - setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10); /* enable MAPEN */ - /* PCR1 -- Performance Control */ - /* Incrementor on, whatever that is */ - setCx86(CX86_PCR1, getCx86(CX86_PCR1) | 0x02); - /* PCR0 -- Performance Control */ - /* Incrementor Margin 10 */ - setCx86(CX86_PCR0, getCx86(CX86_PCR0) | 0x04); - setCx86(CX86_CCR3, ccr3); /* disable MAPEN */ -} - /* * Configure later MediaGX and/or Geode processor. */ @@ -174,7 +157,6 @@ static void __cpuinit geode_configure(void) set_cx86_memwb(); set_cx86_reorder(); - set_cx86_inc(); local_irq_restore(flags); } -- cgit v1.2.2 From c171f465b7281f2d3b03e9145ec763d6a8bab176 Mon Sep 17 00:00:00 2001 From: Uros Bizjak Date: Wed, 20 Aug 2008 10:44:47 +0200 Subject: x86, cleanup: use X86_CR4_PGE in x86/power/hibernate_asm_32.S Signed-off-by: Uros Bizjak Signed-off-by: Ingo Molnar --- arch/x86/power/hibernate_asm_32.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S index 4fc7e872c85e..d1e9b53f9d33 100644 --- a/arch/x86/power/hibernate_asm_32.S +++ b/arch/x86/power/hibernate_asm_32.S @@ -1,5 +1,3 @@ -.text - /* * This may not use any stack, nor any variable that is not "NoSave": * @@ -12,17 +10,18 @@ #include #include #include +#include - .text +.text ENTRY(swsusp_arch_suspend) - movl %esp, saved_context_esp movl %ebx, saved_context_ebx movl %ebp, saved_context_ebp movl %esi, saved_context_esi movl %edi, saved_context_edi - pushfl ; popl saved_context_eflags + pushfl + popl saved_context_eflags call swsusp_save ret @@ -59,7 +58,7 @@ done: movl mmu_cr4_features, %ecx jecxz 1f # cr4 Pentium and higher, skip if zero movl %ecx, %edx - andl $~(1<<7), %edx; # PGE + andl $~(X86_CR4_PGE), %edx movl %edx, %cr4; # turn off PGE 1: movl %cr3, %eax; # flush TLB @@ -74,7 +73,8 @@ done: movl saved_context_esi, %esi movl saved_context_edi, %edi - pushl saved_context_eflags ; popfl + pushl saved_context_eflags + popfl xorl %eax, %eax -- cgit v1.2.2 From 80c5e73d6028e0f03ab0c70e7c4cbf98ac2e0c43 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Tue, 19 Aug 2008 16:28:01 -0700 Subject: x86: fix Xorg startup/shutdown slowdown with PAT Rene Herman reported significant Xorg startup/shutdown slowdown due to PAT. It turns out that the memtype list has thousands of entries. Add cached_entry to list add routine, in order to speed up the lookup for sequential reserve_memtype calls. Reported-by: Rene Herman Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 2fe30916d4b6..bb6e8a267bfe 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -207,6 +207,9 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, return -EBUSY; } +static struct memtype *cached_entry; +static u64 cached_start; + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -280,11 +283,17 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, spin_lock(&memtype_lock); + if (cached_entry && start >= cached_start) + entry = cached_entry; + else + entry = list_entry(&memtype_list, struct memtype, nd); + /* Search for existing mapping that overlaps the current range */ where = NULL; - list_for_each_entry(entry, &memtype_list, nd) { + list_for_each_entry_continue(entry, &memtype_list, nd) { if (end <= entry->start) { where = entry->nd.prev; + cached_entry = list_entry(where, struct memtype, nd); break; } else if (start <= entry->start) { /* end > entry->start */ err = chk_conflict(new, entry, new_type); @@ -292,6 +301,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); where = entry->nd.prev; + cached_entry = list_entry(where, + struct memtype, nd); } break; } else if (start < entry->end) { /* start > entry->start */ @@ -299,7 +310,20 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!err) { dprintk("Overlap at 0x%Lx-0x%Lx\n", entry->start, entry->end); - where = &entry->nd; + cached_entry = list_entry(entry->nd.prev, + struct memtype, nd); + + /* + * Move to right position in the linked + * list to add this new entry + */ + list_for_each_entry_continue(entry, + &memtype_list, nd) { + if (start <= entry->start) { + where = entry->nd.prev; + break; + } + } } break; } @@ -314,6 +338,8 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, return err; } + cached_start = start; + if (where) list_add(&new->nd, where); else @@ -343,6 +369,9 @@ int free_memtype(u64 start, u64 end) spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { + if (cached_entry == entry || cached_start == start) + cached_entry = NULL; + list_del(&entry->nd); kfree(entry); err = 0; -- cgit v1.2.2 From 8343ef2437c599d30568e6b5a257a40bf2f4902b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 20 Aug 2008 00:16:13 +0200 Subject: x86-microcode: fix unbalanced use of get_cpu() Don't use get_cpu() at all. Resort to checking a boot-up CPU (#0) in microcode_{intel,amd}_module_init(). Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 10 ++++++---- arch/x86/kernel/microcode_intel.c | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 33b2a217a8c5..a6e76ccf8158 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -500,13 +500,15 @@ static struct microcode_ops microcode_amd_ops = { static int __init microcode_amd_module_init(void) { - struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + struct cpuinfo_x86 *c = &cpu_data(0); equiv_cpu_table = NULL; - if (c->x86_vendor == X86_VENDOR_AMD) - return microcode_init(µcode_amd_ops, THIS_MODULE); - else + if (c->x86_vendor != X86_VENDOR_AMD) { + printk(KERN_ERR "microcode: CPU platform is not AMD-capable\n"); return -ENODEV; + } + + return microcode_init(µcode_amd_ops, THIS_MODULE); } static void __exit microcode_amd_module_exit(void) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index d2d9d74f4cbb..6dd8907ff22e 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -531,12 +531,14 @@ static struct microcode_ops microcode_intel_ops = { static int __init microcode_intel_module_init(void) { - struct cpuinfo_x86 *c = &cpu_data(get_cpu()); + struct cpuinfo_x86 *c = &cpu_data(0); - if (c->x86_vendor == X86_VENDOR_INTEL) - return microcode_init(µcode_intel_ops, THIS_MODULE); - else + if (c->x86_vendor != X86_VENDOR_INTEL) { + printk(KERN_ERR "microcode: CPU platform is not Intel-capable\n"); return -ENODEV; + } + + return microcode_init(µcode_intel_ops, THIS_MODULE); } static void __exit microcode_intel_module_exit(void) -- cgit v1.2.2 From d45de40934897c6ee5b05141f7895bbb28512395 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Wed, 20 Aug 2008 00:22:26 +0200 Subject: x86-microcode: generic interface refactoring This is the 1st patch in the series. Here the aim was to avoid any significant changes, logically-wise. So it's mainly about generic interface refactoring: e.g. make microcode_{intel,amd}.c more about arch-specific details and less about policies like make-sure-we-run-on-a-target-cpu (no more set_cpus_allowed_ptr() here) and generic synchronization (no more microcode_mutex here). All in all, more line have been deleted than added. 4 files changed, 145 insertions(+), 198 deletions(-) Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 155 +++++++++++++++++++++++++------------- arch/x86/kernel/microcode_amd.c | 77 +++---------------- arch/x86/kernel/microcode_intel.c | 91 ++++------------------ 3 files changed, 129 insertions(+), 194 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index ad136ad99cb3..b2f84ce5eed3 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -71,7 +71,7 @@ * Thanks to Stuart Swales for pointing out this bug. */ -/*#define DEBUG pr_debug */ +/* #define DEBUG pr_debug */ #include #include #include @@ -104,8 +104,7 @@ MODULE_LICENSE("GPL"); struct microcode_ops *microcode_ops; /* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -DEFINE_MUTEX(microcode_mutex); -EXPORT_SYMBOL_GPL(microcode_mutex); +static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); @@ -234,22 +233,6 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); struct platform_device *microcode_pdev; EXPORT_SYMBOL_GPL(microcode_pdev); -static void microcode_init_cpu(int cpu, int resume) -{ - cpumask_t old; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(µcode_mutex); - microcode_ops->collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING && !resume) - microcode_ops->cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); -} - static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t sz) @@ -266,14 +249,15 @@ static ssize_t reload_store(struct sys_device *dev, cpumask_t old = current->cpus_allowed; get_online_cpus(); - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - mutex_lock(µcode_mutex); - if (uci->valid) - err = microcode_ops->cpu_request_microcode(cpu); - mutex_unlock(µcode_mutex); + if (cpu_online(cpu)) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + if (uci->valid) + err = microcode_ops->cpu_request_microcode(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); + } put_online_cpus(); - set_cpus_allowed_ptr(current, &old); } if (err) return err; @@ -285,7 +269,7 @@ static ssize_t version_show(struct sys_device *dev, { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - return sprintf(buf, "0x%x\n", uci->rev); + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } static ssize_t pf_show(struct sys_device *dev, @@ -293,7 +277,7 @@ static ssize_t pf_show(struct sys_device *dev, { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - return sprintf(buf, "0x%x\n", uci->pf); + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } static SYSDEV_ATTR(reload, 0200, NULL, reload_store); @@ -312,7 +296,85 @@ static struct attribute_group mc_attr_group = { .name = "microcode", }; -static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + microcode_ops->microcode_fini_cpu(cpu); + uci->valid = 0; + mutex_unlock(µcode_mutex); +} + +static void collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) + uci->valid = 1; +} + +static void microcode_resume_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpu_signature nsig; + + pr_debug("microcode: CPU%d resumed\n", cpu); + + if (!uci->mc.valid_mc) + return; + + /* + * Let's verify that the 'cached' ucode does belong + * to this cpu (a bit of paranoia): + */ + if (microcode_ops->collect_cpu_info(cpu, &nsig)) { + microcode_fini_cpu(cpu); + return; + } + + if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { + microcode_fini_cpu(cpu); + /* Should we look for a new ucode here? */ + return; + } + + microcode_ops->apply_microcode(cpu); +} + +void microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); + /* + * Check if the system resume is in progress (uci->valid != NULL), + * otherwise just request a firmware: + */ + if (uci->valid) { + microcode_resume_cpu(cpu); + } else { + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING) + microcode_ops->cpu_request_microcode(cpu); + } + mutex_unlock(µcode_mutex); +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + microcode_update_cpu(cpu); + set_cpus_allowed_ptr(current, &old); +} + +static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -327,16 +389,10 @@ static int __mc_sysdev_add(struct sys_device *sys_dev, int resume) if (err) return err; - microcode_init_cpu(cpu, resume); - + microcode_init_cpu(cpu); return 0; } -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - return __mc_sysdev_add(sys_dev, 0); -} - static int mc_sysdev_remove(struct sys_device *sys_dev) { int cpu = sys_dev->id; @@ -345,7 +401,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) return 0; pr_debug("microcode: CPU%d removed\n", cpu); - microcode_ops->microcode_fini_cpu(cpu); + microcode_fini_cpu(cpu); sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); return 0; } @@ -376,33 +432,26 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) sys_dev = get_cpu_sysdev(cpu); switch (action) { - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_ops->microcode_fini_cpu(cpu); - break; case CPU_ONLINE: - case CPU_DOWN_FAILED: - mc_sysdev_add(sys_dev); - break; case CPU_ONLINE_FROZEN: - /* System-wide resume is in progress, try to apply microcode */ - if (microcode_ops->apply_microcode_check_cpu(cpu)) { - /* The application of microcode failed */ - microcode_ops->microcode_fini_cpu(cpu); - __mc_sysdev_add(sys_dev, 1); - break; - } + microcode_init_cpu(cpu); + case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) printk(KERN_ERR "microcode: Failed to create the sysfs " "group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: - mc_sysdev_remove(sys_dev); - break; case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + pr_debug("microcode: CPU%d removed\n", cpu); + break; + case CPU_DEAD: + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); break; } return NOTIFY_OK; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index a6e76ccf8158..4006e5e3adf0 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -59,38 +59,28 @@ MODULE_LICENSE("GPL v2"); /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -extern struct mutex (microcode_mutex); - struct equiv_cpu_entry *equiv_cpu_table; -extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info_amd(int cpu) +static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - uci->rev = 0; - uci->pf = 0; - uci->mc.mc_amd = NULL; - uci->valid = 1; + memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { printk(KERN_ERR "microcode: CPU%d not a capable AMD processor\n", cpu); - uci->valid = 0; - return; + return -1; } asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (uci->rev) + : "=a" (csig->rev) : "i" (0x0000008B) : "ecx"); printk(KERN_INFO "microcode: collect_cpu_info_amd : patch_id=0x%x\n", - uci->rev); + csig->rev); + + return 0; } static int get_matching_microcode_amd(void *mc, int cpu) @@ -119,7 +109,7 @@ static int get_matching_microcode_amd(void *mc, int cpu) if (equiv_cpu_table == NULL) { printk(KERN_INFO "microcode: CPU%d microcode update with " "version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->rev); + cpu, mc_header->patch_id, uci->cpu_sig.rev); goto out; } @@ -185,12 +175,12 @@ static int get_matching_microcode_amd(void *mc, int cpu) pci_dev_put(sb_pci_dev); } - if (mc_header->patch_id <= uci->rev) + if (mc_header->patch_id <= uci->cpu_sig.rev) return 0; printk(KERN_INFO "microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->rev); + cpu, mc_header->patch_id, uci->cpu_sig.rev); out: new_mc = vmalloc(UCODE_MAX_SIZE); @@ -250,9 +240,9 @@ static void apply_microcode_amd(int cpu) printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x \n", - cpu_num, uci->rev, uci->mc.mc_amd->hdr.patch_id); + cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id); - uci->rev = rev; + uci->cpu_sig.rev = rev; } #ifdef CONFIG_MICROCODE_OLD_INTERFACE @@ -437,61 +427,18 @@ static int cpu_request_microcode_amd(int cpu) return error; } -static int apply_microcode_check_cpu_amd(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - unsigned int rev; - cpumask_t old; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc.mc_amd) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 16) - err = -EINVAL; - - if (!err) { - asm volatile("movl %1, %%ecx; rdmsr" - : "=a" (rev) - : "i" (0x0000008B) : "ecx"); - - if (uci->rev != rev) - err = -EINVAL; - } - - if (!err) - apply_microcode_amd(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " rev=0x%x\n", - cpu, uci->rev); - - set_cpus_allowed(current, old); - return err; -} - static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); - uci->valid = 0; vfree(uci->mc.mc_amd); uci->mc.mc_amd = NULL; - mutex_unlock(µcode_mutex); } static struct microcode_ops microcode_amd_ops = { .get_next_ucode = get_next_ucode_amd, .get_matching_microcode = get_matching_microcode_amd, .microcode_sanity_check = NULL, - .apply_microcode_check_cpu = apply_microcode_check_cpu_amd, .cpu_request_microcode = cpu_request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 6dd8907ff22e..c9b53202ba3d 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -122,46 +122,37 @@ MODULE_LICENSE("GPL"); /* serialize access to the physical write to MSR 0x79 */ static DEFINE_SPINLOCK(microcode_update_lock); -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -extern struct mutex microcode_mutex; - -extern struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; - -static void collect_cpu_info(int cpu_num) +static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; unsigned int val[2]; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu_num); - uci->pf = uci->rev = 0; - uci->mc.mc_intel = NULL; - uci->valid = 1; + memset(csig, 0, sizeof(*csig)); if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || cpu_has(c, X86_FEATURE_IA64)) { printk(KERN_ERR "microcode: CPU%d not a capable Intel " "processor\n", cpu_num); - uci->valid = 0; - return; + return -1; } - uci->sig = cpuid_eax(0x00000001); + csig->sig = cpuid_eax(0x00000001); if ((c->x86_model >= 5) || (c->x86 > 6)) { /* get processor flags from MSR 0x17 */ rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - uci->pf = 1 << ((val[1] >> 18) & 7); + csig->pf = 1 << ((val[1] >> 18) & 7); } wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], uci->rev); + rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - uci->sig, uci->pf, uci->rev); + csig->sig, csig->pf, csig->rev); + + return 0; } static inline int microcode_update_match(int cpu_num, @@ -169,8 +160,8 @@ static inline int microcode_update_match(int cpu_num, { struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; - if (!sigmatch(sig, uci->sig, pf, uci->pf) - || mc_header->rev <= uci->rev) + if (!sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) + || mc_header->rev <= uci->cpu_sig.rev) return 0; return 1; } @@ -289,7 +280,7 @@ static int get_matching_microcode(void *mc, int cpu) find: pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, mc_header->rev, uci->rev); + cpu, mc_header->rev, uci->cpu_sig.rev); new_mc = vmalloc(total_size); if (!new_mc) { printk(KERN_ERR "microcode: error! Can not allocate memory\n"); @@ -335,16 +326,16 @@ static void apply_microcode(int cpu) spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != uci->mc.mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", cpu_num, uci->rev, val[1]); + "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->rev, val[1], + cpu_num, uci->cpu_sig.rev, val[1], uci->mc.mc_intel->hdr.date & 0xffff, uci->mc.mc_intel->hdr.date >> 24, (uci->mc.mc_intel->hdr.date >> 16) & 0xff); - uci->rev = val[1]; + uci->cpu_sig.rev = val[1]; } #ifdef CONFIG_MICROCODE_OLD_INTERFACE @@ -459,70 +450,18 @@ static int cpu_request_microcode(int cpu) return error; } -static int apply_microcode_check_cpu(int cpu) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - cpumask_t old; - unsigned int val[2]; - int err = 0; - - /* Check if the microcode is available */ - if (!uci->mc.mc_intel) - return 0; - - old = current->cpus_allowed; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - - /* Check if the microcode we have in memory matches the CPU */ - if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || - cpu_has(c, X86_FEATURE_IA64) || uci->sig != cpuid_eax(0x00000001)) - err = -EINVAL; - - if (!err && ((c->x86_model >= 5) || (c->x86 > 6))) { - /* get processor flags from MSR 0x17 */ - rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); - if (uci->pf != (1 << ((val[1] >> 18) & 7))) - err = -EINVAL; - } - - if (!err) { - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - if (uci->rev != val[1]) - err = -EINVAL; - } - - if (!err) - apply_microcode(cpu); - else - printk(KERN_ERR "microcode: Could not apply microcode to CPU%d:" - " sig=0x%x, pf=0x%x, rev=0x%x\n", - cpu, uci->sig, uci->pf, uci->rev); - - set_cpus_allowed_ptr(current, &old); - return err; -} - static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - mutex_lock(µcode_mutex); - uci->valid = 0; vfree(uci->mc.mc_intel); uci->mc.mc_intel = NULL; - mutex_unlock(µcode_mutex); } static struct microcode_ops microcode_intel_ops = { .get_next_ucode = get_next_ucode, .get_matching_microcode = get_matching_microcode, .microcode_sanity_check = microcode_sanity_check, - .apply_microcode_check_cpu = apply_microcode_check_cpu, .cpu_request_microcode = cpu_request_microcode, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, -- cgit v1.2.2 From b6edbb1e045a7116d5571544dae25c6c37c94a48 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:04:19 -0700 Subject: x86_64: use save/loadsegment in ia32 compat Use savesegment and loadsegment consistently in ia32 compat code. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_aout.c | 11 +++++++---- arch/x86/ia32/ia32_signal.c | 21 ++++++++++----------- 2 files changed, 17 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index a0e1dbe67dc1..127ec3f07214 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -85,8 +85,10 @@ static void dump_thread32(struct pt_regs *regs, struct user32 *dump) dump->regs.ax = regs->ax; dump->regs.ds = current->thread.ds; dump->regs.es = current->thread.es; - asm("movl %%fs,%0" : "=r" (fs)); dump->regs.fs = fs; - asm("movl %%gs,%0" : "=r" (gs)); dump->regs.gs = gs; + savesegment(fs, fs); + dump->regs.fs = fs; + savesegment(gs, gs); + dump->regs.gs = gs; dump->regs.orig_ax = regs->orig_ax; dump->regs.ip = regs->ip; dump->regs.cs = regs->cs; @@ -430,8 +432,9 @@ beyond_if: current->mm->start_stack = (unsigned long)create_aout_tables((char __user *)bprm->p, bprm); /* start thread */ - asm volatile("movl %0,%%fs" :: "r" (0)); \ - asm volatile("movl %0,%%es; movl %0,%%ds": :"r" (__USER32_DS)); + loadsegment(fs, 0); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); load_gs_index(0); (regs)->ip = ex.a_entry; (regs)->sp = current->mm->start_stack; diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 20af4c79579a..f1a2ac777faf 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -206,7 +206,7 @@ struct rt_sigframe { unsigned int cur; \ unsigned short pre; \ err |= __get_user(pre, &sc->seg); \ - asm volatile("movl %%" #seg ",%0" : "=r" (cur)); \ + savesegment(seg, cur); \ pre |= mask; \ if (pre != cur) loadsegment(seg, pre); } @@ -235,7 +235,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, */ err |= __get_user(gs, &sc->gs); gs |= 3; - asm("movl %%gs,%0" : "=r" (oldgs)); + savesegment(gs, oldgs); if (gs != oldgs) load_gs_index(gs); @@ -355,14 +355,13 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, { int tmp, err = 0; - tmp = 0; - __asm__("movl %%gs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(gs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->gs); - __asm__("movl %%fs,%0" : "=r"(tmp): "0"(tmp)); + savesegment(fs, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->fs); - __asm__("movl %%ds,%0" : "=r"(tmp): "0"(tmp)); + savesegment(ds, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->ds); - __asm__("movl %%es,%0" : "=r"(tmp): "0"(tmp)); + savesegment(es, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->es); err |= __put_user((u32)regs->di, &sc->di); @@ -498,8 +497,8 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, regs->dx = 0; regs->cx = 0; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); regs->cs = __USER32_CS; regs->ss = __USER32_DS; @@ -591,8 +590,8 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->dx = (unsigned long) &frame->info; regs->cx = (unsigned long) &frame->uc; - asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); - asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); + loadsegment(ds, __USER32_DS); + loadsegment(es, __USER32_DS); regs->cs = __USER32_CS; regs->ss = __USER32_DS; -- cgit v1.2.2 From 99dd8713306a89f3e106143581244e550e00a644 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Tue, 19 Aug 2008 12:51:59 -0500 Subject: x86, SGI UV: hardcode the TLB flush interrupt system vector The UV TLB shootdown mechanism needs a system interrupt vector. Its vector had been hardcoded as 200, but needs to moved to the reserved system vector range so that it does not collide with some device vector. This is still temporary until dynamic system IRQ allocation is provided. But it will be needed when real UV hardware becomes available and runs 2.6.27. Signed-off-by: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d0fbb7712ab0..8b8c0d6640fa 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -783,7 +784,7 @@ static int __init uv_bau_init(void) uv_init_blade(blade, node, cur_cpu); cur_cpu += uv_blade_nr_possible_cpus(blade); } - set_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); uv_enable_timeouts(); return 0; -- cgit v1.2.2 From 27990eac52dae87b909ef4f7e796fb6ec758bb94 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:10:07 -0700 Subject: x86: another user of PTE_FLAGS_MASK Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index a20d1fa64b4e..e7277cbcfb40 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -148,8 +148,8 @@ static void note_page(struct seq_file *m, struct pg_state *st, * we have now. "break" is either changing perms, levels or * address space marker. */ - prot = pgprot_val(new_prot) & ~(PTE_PFN_MASK); - cur = pgprot_val(st->current_prot) & ~(PTE_PFN_MASK); + prot = pgprot_val(new_prot) & PTE_FLAGS_MASK; + cur = pgprot_val(st->current_prot) & PTE_FLAGS_MASK; if (!st->level) { /* First entry */ -- cgit v1.2.2 From 6e833587e11ed0dbf12e647127f2650e2f80b26d Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:16:17 -0700 Subject: xen: clean up domain mode predicates There are four operating modes Xen code may find itself running in: - native - hvm domain - pv dom0 - pv domU Clean up predicates for testing for these states to make them more consistent. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 53afa14eb314..b106e825d266 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -56,6 +56,9 @@ EXPORT_SYMBOL_GPL(hypercall_page); DEFINE_PER_CPU(struct vcpu_info *, xen_vcpu); DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info); +enum xen_domain_type xen_domain_type = XEN_NATIVE; +EXPORT_SYMBOL_GPL(xen_domain_type); + /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -1613,6 +1616,8 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_start_info) return; + xen_domain_type = XEN_PV_DOMAIN; + BUG_ON(memcmp(xen_start_info->magic, "xen-3", 5) != 0); xen_setup_features(); @@ -1650,7 +1655,7 @@ asmlinkage void __init xen_start_kernel(void) /* Prevent unwanted bits from being set in PTEs. */ __supported_pte_mask &= ~_PAGE_GLOBAL; - if (!is_initial_xendomain()) + if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); /* Don't do the full vcpu_info placement stuff until we have a @@ -1685,7 +1690,7 @@ asmlinkage void __init xen_start_kernel(void) boot_params.hdr.ramdisk_size = xen_start_info->mod_len; boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line); - if (!is_initial_xendomain()) { + if (!xen_initial_domain()) { add_preferred_console("xenboot", 0, NULL); add_preferred_console("tty", 0, NULL); add_preferred_console("hvc", 0, NULL); -- cgit v1.2.2 From 63d3a75d6f1fcf2f33e6abbe84e1f428c3586152 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:19:36 -0700 Subject: x86/paravirt: add spin_lock_flags lock op It is useful for a pv_lock_ops backend to know whether interrupts are enabled or not in the context a spin_lock is being called. This allows it to enable interrupts while spinning, which could be particularly helpful when spinning becomes blocking. The default implementation just calls the normal spin_lock op, ignoring the flags. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/paravirt-spinlocks.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 38d7f7f1dbc9..206359a8e43f 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -7,12 +7,18 @@ #include +static void default_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __raw_spin_lock(lock); +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .spin_is_locked = __ticket_spin_is_locked, .spin_is_contended = __ticket_spin_is_contended, .spin_lock = __ticket_spin_lock, + .spin_lock_flags = default_spin_lock_flags, .spin_trylock = __ticket_spin_trylock, .spin_unlock = __ticket_spin_unlock, #endif -- cgit v1.2.2 From 11ad93e59d114f4b218873f1c93261be725d2e22 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:32:51 -0700 Subject: xen: clarify locking used when pinning a pagetable. Add some comments explaining the locking and pinning algorithm when using split pte locks. Also implement a minor optimisation of not pinning the PTE when not using split pte locks. Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 41 +++++++++++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..d3752b6ce6e6 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -590,8 +590,6 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmdidx_limit = 0; #endif - flush |= (*func)(virt_to_page(pgd), PT_PGD); - for (pgdidx = 0; pgdidx <= pgdidx_limit; pgdidx++) { pud_t *pud; @@ -637,7 +635,11 @@ static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), } } } + out: + /* Do the top level last, so that the callbacks can use it as + a cue to do final things like tlb flushes. */ + flush |= (*func)(virt_to_page(pgd), PT_PGD); return flush; } @@ -691,6 +693,26 @@ static int pin_page(struct page *page, enum pt_level level) flush = 0; + /* + * We need to hold the pagetable lock between the time + * we make the pagetable RO and when we actually pin + * it. If we don't, then other users may come in and + * attempt to update the pagetable by writing it, + * which will fail because the memory is RO but not + * pinned, so Xen won't do the trap'n'emulate. + * + * If we're using split pte locks, we can't hold the + * entire pagetable's worth of locks during the + * traverse, because we may wrap the preempt count (8 + * bits). The solution is to mark RO and pin each PTE + * page while holding the lock. This means the number + * of locks we end up holding is never more than a + * batch size (~32 entries, at present). + * + * If we're not using split pte locks, we needn't pin + * the PTE pages independently, because we're + * protected by the overall pagetable lock. + */ ptl = NULL; if (level == PT_PTE) ptl = lock_pte(page); @@ -699,10 +721,9 @@ static int pin_page(struct page *page, enum pt_level level) pfn_pte(pfn, PAGE_KERNEL_RO), level == PT_PGD ? UVMF_TLB_FLUSH : 0); - if (level == PT_PTE) + if (ptl) { xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); - if (ptl) { /* Queue a deferred unlock for when this batch is completed. */ xen_mc_callback(do_unlock, ptl); @@ -796,10 +817,18 @@ static int unpin_page(struct page *page, enum pt_level level) spinlock_t *ptl = NULL; struct multicall_space mcs; + /* + * Do the converse to pin_page. If we're using split + * pte locks, we must be holding the lock for while + * the pte page is unpinned but still RO to prevent + * concurrent updates from seeing it in this + * partially-pinned state. + */ if (level == PT_PTE) { ptl = lock_pte(page); - xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); + if (ptl) + xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); } mcs = __xen_mc_entry(0); @@ -837,7 +866,7 @@ static void xen_pgd_unpin(pgd_t *pgd) #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif pgd_walk(pgd, unpin_page, USER_LIMIT); -- cgit v1.2.2 From 7708ad64a24a674f7905aa7a5099a50f055debec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 19 Aug 2008 13:34:22 -0700 Subject: xen: add xen_ prefixes to make tracing with ftrace easier It's easier to pattern match on Xen function if they all start with xen_. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 66 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d3752b6ce6e6..d9a35a363095 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -229,14 +229,14 @@ void make_lowmem_page_readwrite(void *vaddr) } -static bool page_pinned(void *ptr) +static bool xen_page_pinned(void *ptr) { struct page *page = virt_to_page(ptr); return PagePinned(page); } -static void extend_mmu_update(const struct mmu_update *update) +static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; struct mmu_update *u; @@ -265,7 +265,7 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pmd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -276,7 +276,7 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -334,7 +334,7 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -400,7 +400,7 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) /* ptr may be ioremapped for 64-bit pagetable setup */ u.ptr = arbitrary_virt_to_machine(ptr).maddr; u.val = pud_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); xen_mc_issue(PARAVIRT_LAZY_MMU); @@ -411,7 +411,7 @@ void xen_set_pud(pud_t *ptr, pud_t val) { /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; return; } @@ -490,7 +490,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) u.ptr = virt_to_machine(ptr).maddr; u.val = pgd_val_ma(val); - extend_mmu_update(&u); + xen_extend_mmu_update(&u); } /* @@ -519,10 +519,10 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) /* If page is not pinned, we can just update the entry directly */ - if (!page_pinned(ptr)) { + if (!xen_page_pinned(ptr)) { *ptr = val; if (user_ptr) { - WARN_ON(page_pinned(user_ptr)); + WARN_ON(xen_page_pinned(user_ptr)); *user_ptr = val; } return; @@ -555,8 +555,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), - unsigned long limit) +static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), + unsigned long limit) { int flush = 0; unsigned hole_low, hole_high; @@ -644,7 +644,9 @@ out: return flush; } -static spinlock_t *lock_pte(struct page *page) +/* If we're using split pte locks, then take the page's lock and + return a pointer to it. Otherwise return NULL. */ +static spinlock_t *xen_pte_lock(struct page *page) { spinlock_t *ptl = NULL; @@ -656,7 +658,7 @@ static spinlock_t *lock_pte(struct page *page) return ptl; } -static void do_unlock(void *v) +static void xen_pte_unlock(void *v) { spinlock_t *ptl = v; spin_unlock(ptl); @@ -674,7 +676,7 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -715,7 +717,7 @@ static int pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = lock_pte(page); + ptl = xen_pte_lock(page); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -726,7 +728,7 @@ static int pin_page(struct page *page, enum pt_level level) /* Queue a deferred unlock for when this batch is completed. */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -740,7 +742,7 @@ void xen_pgd_pin(pgd_t *pgd) { xen_mc_batch(); - if (pgd_walk(pgd, pin_page, USER_LIMIT)) { + if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -754,14 +756,14 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ @@ -796,7 +798,7 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct page *page, enum pt_level level) { SetPagePinned(page); return 0; @@ -804,10 +806,10 @@ static __init int mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - pgd_walk(init_mm.pgd, mark_pinned, FIXADDR_TOP); + xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); } -static int unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct page *page, enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -825,7 +827,7 @@ static int unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = lock_pte(page); + ptl = xen_pte_lock(page); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -839,7 +841,7 @@ static int unpin_page(struct page *page, enum pt_level level) if (ptl) { /* unlock when batch completed */ - xen_mc_callback(do_unlock, ptl); + xen_mc_callback(xen_pte_unlock, ptl); } } @@ -859,17 +861,17 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); #endif - pgd_walk(pgd, unpin_page, USER_LIMIT); + xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } @@ -936,7 +938,7 @@ static void drop_other_mm_ref(void *info) } } -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { cpumask_t mask; unsigned cpu; @@ -966,7 +968,7 @@ static void drop_mm_ref(struct mm_struct *mm) smp_call_function_mask(mask, drop_other_mm_ref, mm, 1); } #else -static void drop_mm_ref(struct mm_struct *mm) +static void xen_drop_mm_ref(struct mm_struct *mm) { if (current->active_mm == mm) load_cr3(swapper_pg_dir); @@ -990,13 +992,13 @@ static void drop_mm_ref(struct mm_struct *mm) void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ - drop_mm_ref(mm); + xen_drop_mm_ref(mm); put_cpu(); spin_lock(&mm->page_table_lock); /* pgd may not be pinned in the error exit path of execve */ - if (page_pinned(mm->pgd)) + if (xen_page_pinned(mm->pgd)) xen_pgd_unpin(mm->pgd); spin_unlock(&mm->page_table_lock); -- cgit v1.2.2 From 80a8c9fffa78f57d7d4351af2f15a56386805ceb Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 19 Aug 2008 03:13:38 +0200 Subject: x86: fix oprofile + hibernation badness Vegard Nossum reported oprofile + hibernation problems: > Now some warnings: > > ------------[ cut here ]------------ > WARNING: at /uio/arkimedes/s29/vegardno/git-working/linux-2.6/kernel/smp.c:328 s > mp_call_function_mask+0x194/0x1a0() The usual problem: the suspend function when interrupts are already disabled calls smp_call_function which is not allowed with interrupt off. But at this point all the other CPUs should be already down anyways, so it should be enough to just drop that. This patch should fix that problem at least by fixing cpu hotplug& suspend support. [ mingo@elte.hu: fixed 5 coding style errors. ] Signed-off-by: Andi Kleen Tested-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3f90289410e6..0227694f7dab 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -28,23 +29,48 @@ static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); +static void nmi_cpu_start(void *dummy); +static void nmi_cpu_stop(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; +#ifdef CONFIG_SMP +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { + .notifier_call = oprofile_cpu_notifier +}; +#endif + #ifdef CONFIG_PM static int nmi_suspend(struct sys_device *dev, pm_message_t state) { + /* Only one CPU left, just stop that one */ if (nmi_enabled == 1) - nmi_stop(); + nmi_cpu_stop(NULL); return 0; } static int nmi_resume(struct sys_device *dev) { if (nmi_enabled == 1) - nmi_start(); + nmi_cpu_start(NULL); return 0; } @@ -463,6 +489,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) } init_sysfs(); +#ifdef CONFIG_SMP + register_cpu_notifier(&oprofile_cpu_nb); +#endif using_nmi = 1; ops->create_files = nmi_create_files; ops->setup = nmi_setup; @@ -476,6 +505,10 @@ int __init op_nmi_init(struct oprofile_operations *ops) void op_nmi_exit(void) { - if (using_nmi) + if (using_nmi) { exit_sysfs(); +#ifdef CONFIG_SMP + unregister_cpu_notifier(&oprofile_cpu_nb); +#endif + } } -- cgit v1.2.2 From 85d577979a8a94a32ca1d828a91c2eb8300bc9f2 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 18 Aug 2008 11:58:17 +0200 Subject: werror: fix pci calgary Fix an integer comparison always false warning in the PCI Calgary 64 driver. A u8 is being compared to something that's 512 by default, resulting in the following warning: arch/x86/kernel/pci-calgary_64.c:1285: warning: comparison is always false due to limited range of data type This was introduced by patch b34e90b8f0f30151349134f87b5dc6ef75a5218c. Signed-off-by: David Howells Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 218d783ed7a8..363a74ec7a76 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -1269,13 +1269,15 @@ static inline int __init determine_tce_table_size(u64 ram) static int __init build_detail_arrays(void) { unsigned long ptr; - int i, scal_detail_size, rio_detail_size; + unsigned numnodes, i; + int scal_detail_size, rio_detail_size; - if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + numnodes = rio_table_hdr->num_scal_dev; + if (numnodes > MAX_NUMNODES){ printk(KERN_WARNING "Calgary: MAX_NUMNODES too low! Defined as %d, " "but system has %d nodes.\n", - MAX_NUMNODES, rio_table_hdr->num_scal_dev); + MAX_NUMNODES, numnodes); return -ENODEV; } @@ -1296,8 +1298,7 @@ static int __init build_detail_arrays(void) } ptr = ((unsigned long)rio_table_hdr) + 3; - for (i = 0; i < rio_table_hdr->num_scal_dev; - i++, ptr += scal_detail_size) + for (i = 0; i < numnodes; i++, ptr += scal_detail_size) scal_devs[i] = (struct scal_detail *)ptr; for (i = 0; i < rio_table_hdr->num_rio_dev; -- cgit v1.2.2 From 0c072bb452f0cfd4959bc01ff3627d6385255b20 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 8 Jul 2008 09:50:22 -0700 Subject: x86: use WARN() in arch/x86/mm/ioremap.c Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. Signed-off-by: Arjan van de Ven Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6ba6f889c79d..d4b6e6a29ae3 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -553,13 +553,11 @@ static int __init check_early_ioremap_leak(void) { if (!early_ioremap_nested) return 0; - - printk(KERN_WARNING + WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + early_ioremap_nested); printk(KERN_WARNING - "please boot with early_ioremap_debug and report the dmesg.\n"); - WARN_ON(1); + "please boot with early_ioremap_debug and report the dmesg.\n"); return 1; } -- cgit v1.2.2 From bde78a79a6eb015f33aa4660df1b06f5135def20 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 8 Jul 2008 09:51:56 -0700 Subject: x86: use WARN() in arch/x86/kernel Use WARN() instead of a printk+WARN_ON() pair; this way the message becomes part of the warning section for better reporting/collection. This also allowed the folding of some if()'s into the WARN() Signed-off-by: Arjan van de Ven Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 5 +---- arch/x86/kernel/pci-calgary_64.c | 3 +-- arch/x86/kernel/tsc_sync.c | 6 ++---- 3 files changed, 4 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 6f23969c8faf..b117d7f8a564 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1496,11 +1496,8 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) /* kvm/qemu doesn't have mtrr set right, don't trim them all */ if (!highest_pfn) { - if (!kvm_para_available()) { - printk(KERN_WARNING + WARN(!kvm_para_available(), KERN_WARNING "WARNING: strange, CPU MTRRs all blank?\n"); - WARN_ON(1); - } return 0; } diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 363a74ec7a76..dcdac6c826e9 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -343,9 +343,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { - printk(KERN_ERR "Calgary: driver tried unmapping bad DMA " + WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " "address 0x%Lx\n", dma_addr); - WARN_ON(1); return; } diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0577825cf89b..9ffb01c31c40 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -88,11 +88,9 @@ static __cpuinit void check_tsc_warp(void) __raw_spin_unlock(&sync_lock); } } - if (!(now-start)) { - printk("Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", + WARN(!(now-start), + "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n", now-start, end-start); - WARN_ON(1); - } } /* -- cgit v1.2.2 From 2d96ae6b0dc03a568398c1655fb8967f01f8e40a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 20 Aug 2008 16:43:02 -0700 Subject: arch/x86/pci/irq.c: attempt to clean up code layout Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/pci/irq.c | 67 +++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index fec0123b33a9..d781cc4f725a 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -1041,35 +1041,44 @@ static void __init pcibios_fixup_irqs(void) if (io_apic_assign_pci_irqs) { int irq; - if (pin) { - /* - * interrupt pins are numbered starting - * from 1 - */ - pin--; - irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, - PCI_SLOT(dev->devfn), pin); - /* - * Busses behind bridges are typically not listed in the MP-table. - * In this case we have to look up the IRQ based on the parent bus, - * parent slot, and pin number. The SMP code detects such bridged - * busses itself so we should get into this branch reliably. - */ - if (irq < 0 && dev->bus->parent) { /* go back to the bridge */ - struct pci_dev *bridge = dev->bus->self; - - pin = (pin + PCI_SLOT(dev->devfn)) % 4; - irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number, - PCI_SLOT(bridge->devfn), pin); - if (irq >= 0) - dev_warn(&dev->dev, "using bridge %s INT %c to get IRQ %d\n", - pci_name(bridge), - 'A' + pin, irq); - } - if (irq >= 0) { - dev_info(&dev->dev, "PCI->APIC IRQ transform: INT %c -> IRQ %d\n", 'A' + pin, irq); - dev->irq = irq; - } + if (!pin) + continue; + + /* + * interrupt pins are numbered starting from 1 + */ + pin--; + irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, + PCI_SLOT(dev->devfn), pin); + /* + * Busses behind bridges are typically not listed in the + * MP-table. In this case we have to look up the IRQ + * based on the parent bus, parent slot, and pin number. + * The SMP code detects such bridged busses itself so we + * should get into this branch reliably. + */ + if (irq < 0 && dev->bus->parent) { + /* go back to the bridge */ + struct pci_dev *bridge = dev->bus->self; + int bus; + + pin = (pin + PCI_SLOT(dev->devfn)) % 4; + bus = bridge->bus->number; + irq = IO_APIC_get_PCI_irq_vector(bus, + PCI_SLOT(bridge->devfn), pin); + if (irq >= 0) + dev_warn(&dev->dev, + "using bridge %s INT %c to " + "get IRQ %d\n", + pci_name(bridge), + 'A' + pin, irq); + } + if (irq >= 0) { + dev_info(&dev->dev, + "PCI->APIC IRQ transform: INT %c " + "-> IRQ %d\n", + 'A' + pin, irq); + dev->irq = irq; } } #endif -- cgit v1.2.2 From e621bd18958ef5dbace3129ebe17a0a475e127d9 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 20 Aug 2008 16:43:03 -0700 Subject: i386: vmalloc size fix Booting kernel with vmalloc=[any size<=16m] will oops on my pc (i386/1G memory). BUG_ON in arch/x86/mm/init_32.c triggered: BUG_ON((unsigned long)high_memory > VMALLOC_START); It's due to the vm area hole. In include/asm-x86/pgtable_32.h: #define VMALLOC_OFFSET (8 * 1024 * 1024) #define VMALLOC_START (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) \ & ~(VMALLOC_OFFSET - 1)) There's several related point: 1. MAXMEM : (-__PAGE_OFFSET - __VMALLOC_RESERVE). The space after VMALLOC_END is included as well, I set it to (VMALLOC_END - PAGE_OFFSET - __VMALLOC_RESERVE) 2. VMALLOC_OFFSET is not considered in __VMALLOC_RESERVE fixed by adding VMALLOC_OFFSET to it. 3. VMALLOC_START : (((unsigned long)high_memory + 2 * VMALLOC_OFFSET - 1) & ~(VMALLOC_OFFSET - 1)) So it's not always 8M, bigger than 8M possible. I set it to ((unsigned long)high_memory + VMALLOC_OFFSET) 4. the VMALLOC_RESERVE is an unused macro, so remove it here. Signed-off-by: Dave Young Cc: akpm@linux-foundation.org Cc: hidave.darkstar@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/x86/mm/pgtable_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cab0abbd1ebe..0951db9ee519 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -123,7 +123,8 @@ static int __init parse_vmalloc(char *arg) if (!arg) return -EINVAL; - __VMALLOC_RESERVE = memparse(arg, &arg); + /* Add VMALLOC_OFFSET to the parsed value due to vm area guard hole*/ + __VMALLOC_RESERVE = memparse(arg, &arg) + VMALLOC_OFFSET; return 0; } early_param("vmalloc", parse_vmalloc); -- cgit v1.2.2 From 11494547b1754c4f3bd7f707ab869e2adf54d52f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 01:01:19 -0700 Subject: x86: fix apic version warning after following patch, commit 1b313f4a6d7bee7b2c034b3f1e203bc360a71cca Author: Cyrill Gorcunov Date: Mon Aug 18 20:45:57 2008 +0400 x86: apic - generic_processor_info - use physid_set instead of phys_cpu and physids_or - set phys_cpu_present_map bit AFTER check for allowed number of processors - add checking for APIC valid version in 64bit mode (mostly not needed but added for merging purpose) - add apic_version definition for 64bit mode which is used now we are getting warning for acpi path on 64 bit system. make the 64-bit side fill in apic_version[] as well. [ mingo@elte.hu: build fix ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 12e260e8fb2a..d9770a56511a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -239,10 +239,8 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled) return; } -#ifdef CONFIG_X86_32 if (boot_cpu_physical_apicid != -1U) ver = apic_version[boot_cpu_physical_apicid]; -#endif generic_processor_info(id, ver); } @@ -762,10 +760,8 @@ static void __init acpi_register_lapic_address(unsigned long address) set_fixmap_nocache(FIX_APIC_BASE, address); if (boot_cpu_physical_apicid == -1U) { boot_cpu_physical_apicid = read_apic_id(); -#ifdef CONFIG_X86_32 apic_version[boot_cpu_physical_apicid] = GET_APIC_VERSION(apic_read(APIC_LVR)); -#endif } } -- cgit v1.2.2 From 7701e8c59b0e7cbcf21c1bc49d8b8c6db43027a4 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Wed, 20 Aug 2008 21:07:47 +0200 Subject: x86, mmiotrace: silence section mismatch warning - leave_uniprocessor WARNING: vmlinux.o(.text+0x180af): Section mismatch in reference from the function leave_uniprocessor() to the function .cpuinit.text:cpu_up() The function leave_uniprocessor() references the function __cpuinit cpu_up(). This is often because leave_uniprocessor lacks a __cpuinit annotation or the annotation of cpu_up is wrong. leave_uniprocessor calls cpu_up only when CONFIG_HOTPLUG_CPU is set, so it can be safely annotated as __ref Signed-off-by: Marcin Slusarz Cc: Pekka Paalanen Signed-off-by: Ingo Molnar Cc: Ingo Molnar Cc: Pekka Paalanen --- arch/x86/mm/mmio-mod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index e7397e108beb..635b50e85581 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -430,7 +430,9 @@ static void enter_uniprocessor(void) "may miss events.\n"); } -static void leave_uniprocessor(void) +/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, + but this whole function is ifdefed CONFIG_HOTPLUG_CPU */ +static void __ref leave_uniprocessor(void) { int cpu; int err; -- cgit v1.2.2 From 7946612de2087e163308e26034286fc2dc9dacf1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 11:31:07 -0700 Subject: x86: export pv_lock_ops non-GPL None of the spinlock API is exported GPL, so there's no reason for pv_lock_ops to be. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Cc: drago01 --- arch/x86/kernel/paravirt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..300da17e61cb 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -471,7 +471,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; -EXPORT_SYMBOL_GPL(pv_lock_ops); +EXPORT_SYMBOL(pv_lock_ops); EXPORT_SYMBOL_GPL(pv_time_ops); EXPORT_SYMBOL (pv_cpu_ops); -- cgit v1.2.2 From b2a6a58ca6a3ddf4e278a53199c5b8fd39adbc0e Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Wed, 20 Aug 2008 18:18:26 +0200 Subject: x86: fix BUG: unable to handle kernel paging request (numaq_tsc_disable) This section mismatch: >> Seems to be a section mismatch; init_intel() is __cpuinit while >> numaq_tsc_disable() is __init. Seems to be introduced in: >> >> commit 64898a8bad8c94ad7a4bd5cc86b66edfbb081f4a >> Author: Yinghai Lu >> Date: Sat Jul 19 18:01:16 2008 -0700 >> >> x86: extend and use x86_quirks to clean up NUMAQ code > > Oops, I am wrong about numaq_tsc_disable() being __init. Still, I > believe that Yinghai might be able to say what's really wrong :-) Would lead to this crash: BUG: unable to handle kernel paging request at c08a45f0 IP: [] numaq_tsc_disable+0x0/0x40 Fixed by the patch below. Signed-off-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/kernel/numaq_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/numaq_32.c b/arch/x86/kernel/numaq_32.c index b8c45610b20a..eecc8c18f010 100644 --- a/arch/x86/kernel/numaq_32.c +++ b/arch/x86/kernel/numaq_32.c @@ -73,7 +73,7 @@ static void __init smp_dump_qct(void) } -void __init numaq_tsc_disable(void) +void __cpuinit numaq_tsc_disable(void) { if (!found_numaq) return; -- cgit v1.2.2 From 4e1d112cac08049e764fe3c4f6d3ec92529f9f68 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Wed, 20 Aug 2008 16:43:07 -0700 Subject: arch/x86/kernel/apm_32.c: remove duplicated #include Removed duplicated include file in arch/x86/kernel/apm_32.c. Signed-off-by: Huang Weiyi Acked-by: Pavel Machek Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- arch/x86/kernel/apm_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9ee24e6bc4b0..b93d069aea72 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -228,7 +228,6 @@ #include #include #include -#include #include #include -- cgit v1.2.2 From 671eef85a3e885dff4ce210d8774ad50a91d5967 Mon Sep 17 00:00:00 2001 From: "Cihula, Joseph" Date: Wed, 20 Aug 2008 16:43:07 -0700 Subject: x86, e820: add support for AddressRangeUnusuable ACPI memory type Add support for the E820_UNUSABLE memory type, which is defined in Revision 3.0b (Oct. 10, 2006) of the ACPI Specification on p. 394 Table 14-1: AddressRangeUnusuable This range of address contains memory in which errors have been detected. This range must not be used by the OSPM. Signed-off-by: Joseph Cihula Signed-off-by: Shane Wang Signed-off-by: Gang Wei Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7bb..291e6cd9f9c0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -148,6 +148,9 @@ void __init e820_print_map(char *who) case E820_NVS: printk(KERN_CONT "(ACPI NVS)\n"); break; + case E820_UNUSABLE: + printk("(unusable)\n"); + break; default: printk(KERN_CONT "type %u\n", e820.map[i].type); break; @@ -1260,6 +1263,7 @@ static inline const char *e820_type_to_string(int e820_type) case E820_RAM: return "System RAM"; case E820_ACPI: return "ACPI Tables"; case E820_NVS: return "ACPI Non-volatile Storage"; + case E820_UNUSABLE: return "Unusable memory"; default: return "reserved"; } } -- cgit v1.2.2 From c15238df3b65e34fadb1021b0fb0d5aebc7c42c6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Wed, 20 Aug 2008 16:45:51 -0700 Subject: x86: PAT proper tracking of set_memory_uc and friends Big thinko in pat memtype tracking code. reserve_memtype should be called with physical address and not virtual address. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f5f5154ea11e..43e2f8483e4f 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -849,7 +849,7 @@ int set_memory_uc(unsigned long addr, int numpages) /* * for now UC MINUS. see comments in ioremap_nocache() */ - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_UC_MINUS, NULL)) return -EINVAL; @@ -868,7 +868,7 @@ int set_memory_wc(unsigned long addr, int numpages) if (!pat_enabled) return set_memory_uc(addr, numpages); - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE, _PAGE_CACHE_WC, NULL)) return -EINVAL; @@ -884,7 +884,7 @@ int _set_memory_wb(unsigned long addr, int numpages) int set_memory_wb(unsigned long addr, int numpages) { - free_memtype(addr, addr + numpages * PAGE_SIZE); + free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE); return _set_memory_wb(addr, numpages); } -- cgit v1.2.2 From 28df82ebab79c6a2b4295dd94fd8de88196a49df Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Wed, 20 Aug 2008 16:45:52 -0700 Subject: devmem, x86: PAT Change /dev/mem mmap with O_SYNC to use UC_MINUS All kernel mappings like ioremap(), etc uses UC_MINUS as the type. /dev/mem mappings with /dev/mem being opened with O_SYNC however was using UC, resulting in a conflict with /dev/mem mmap failing. This seems to be affecting some apps (one being flashrom) which are using O_SYNC and which were working before. Switch /dev/mem with O_SYNC also to UC_MINUS. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index bb6e8a267bfe..2a50e0fa64a5 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -390,14 +390,6 @@ int free_memtype(u64 start, u64 end) } -/* - * /dev/mem mmap interface. The memtype used for mapping varies: - * - Use UC for mappings with O_SYNC flag - * - Without O_SYNC flag, if there is any conflict in reserve_memtype, - * inherit the memtype from existing mapping. - * - Else use UC_MINUS memtype (for backward compatibility with existing - * X drivers. - */ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -435,14 +427,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, unsigned long size, pgprot_t *vma_prot) { u64 offset = ((u64) pfn) << PAGE_SHIFT; - unsigned long flags = _PAGE_CACHE_UC_MINUS; + unsigned long flags = -1; int retval; if (!range_is_allowed(pfn, size)) return 0; if (file->f_flags & O_SYNC) { - flags = _PAGE_CACHE_UC; + flags = _PAGE_CACHE_UC_MINUS; } #ifdef CONFIG_X86_32 @@ -465,13 +457,14 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, #endif /* - * With O_SYNC, we can only take UC mapping. Fail if we cannot. + * With O_SYNC, we can only take UC_MINUS mapping. Fail if we cannot. + * * Without O_SYNC, we want to get * - WB for WB-able memory and no other conflicting mappings * - UC_MINUS for non-WB-able memory with no other conflicting mappings * - Inherit from confliting mappings otherwise */ - if (flags != _PAGE_CACHE_UC_MINUS) { + if (flags != -1) { retval = reserve_memtype(offset, offset + size, flags, NULL); } else { retval = reserve_memtype(offset, offset + size, -1, &flags); -- cgit v1.2.2 From 8323444b5dba3fe55e56a95d20d8f55c1d6745af Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Wed, 20 Aug 2008 16:45:53 -0700 Subject: x86: PAT Update validate_pat_support for intel CPUs Pentium III and Core Solo/Duo CPUs have an erratum " Page with PAT set to WC while associated MTRR is UC may consolidate to UC " which can result in WC setting in PAT to be ineffective. We will disable PAT on such CPUs, so that we can continue to use MTRR WC setting. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a6072..a6ef672adbba 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -56,9 +56,22 @@ void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: - if (c->x86 == 0xF || (c->x86 == 6 && c->x86_model >= 15)) + /* + * There is a known erratum on Pentium III and Core Solo + * and Core Duo CPUs. + * " Page with PAT set to WC while associated MTRR is UC + * may consolidate to UC " + * Because of this erratum, it is better to stick with + * setting WC in MTRR rather than using PAT on these CPUs. + * + * Enable PAT WC only on P4, Core 2 or later CPUs. + */ + if (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 15)) return; - break; + + pat_disable("PAT WC disabled due to known CPU erratum."); + return; + case X86_VENDOR_AMD: case X86_VENDOR_CENTAUR: case X86_VENDOR_TRANSMETA: -- cgit v1.2.2 From cacf890694a36124ceddce44ff4c7b02d372ce7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Aug 2008 13:46:33 +0200 Subject: Revert "introduce two APIs for page attribute" This reverts commit 1ac2f7d55b7ee1613c90631e87fea22ec06781e5. --- arch/x86/mm/pageattr.c | 58 +++++++------------------------------------------- 1 file changed, 8 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4adb33628dec..5c06469a0653 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -752,12 +752,12 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int do_change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split, int *tlb_flush) + int force_split) { struct cpa_data cpa; - int ret, checkalias; + int ret, cache, checkalias; /* * Check, if we are requested to change a not supported @@ -795,22 +795,9 @@ static int do_change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - *tlb_flush = cpa.flushtlb; - cpa_fill_pool(NULL); - - return ret; -} - -static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr, - int force_split) -{ - int cache, flush_cache = 0, ret; - - ret = do_change_page_attr_set_clr(addr, numpages, mask_set, mask_clr, - force_split, &flush_cache); - if (!flush_cache) + if (!cpa.flushtlb) goto out; + /* * No need to flush, when we did not set any of the caching * attributes: @@ -827,7 +814,10 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_range(addr, numpages, cache); else cpa_flush_all(cache); + out: + cpa_fill_pool(NULL); + return ret; } @@ -865,30 +855,6 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); -int set_memory_uc_noflush(unsigned long addr, int numpages) -{ - int flush; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ - if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) - return -EINVAL; - /* - * for now UC MINUS. see comments in ioremap_nocache() - */ - return do_change_page_attr_set_clr(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS), - __pgprot(0), 0, &flush); -} -EXPORT_SYMBOL(set_memory_uc_noflush); - -void set_memory_flush_all(void) -{ - cpa_flush_all(1); -} -EXPORT_SYMBOL(set_memory_flush_all); - int _set_memory_wc(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, @@ -963,14 +929,6 @@ int set_pages_uc(struct page *page, int numpages) } EXPORT_SYMBOL(set_pages_uc); -int set_pages_uc_noflush(struct page *page, int numpages) -{ - unsigned long addr = (unsigned long)page_address(page); - - return set_memory_uc_noflush(addr, numpages); -} -EXPORT_SYMBOL(set_pages_uc_noflush); - int set_pages_wb(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); -- cgit v1.2.2 From d75586ad01e6c5a30e7337fb87d61e03556a1ecb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Aug 2008 10:46:06 +0800 Subject: x86, pageattr: introduce APIs to change pageattr of a page array Add array interface APIs of pageattr. page based cache flush is quite slow for a lot of pages. If pages are more than 1024 (4M), the patch will use a wbinvd(). We have a simple test here (run a 3d game - open arena), nearly all agp memory allocation are small (< 1M), so suppose this will not impact runtime performance. Signed-off-by: Dave Airlie Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 216 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 50 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 5c06469a0653..041e81ef673a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -25,15 +25,19 @@ * The current flushing context - we pass it instead of 5 arguments: */ struct cpa_data { - unsigned long vaddr; + unsigned long *vaddr; pgprot_t mask_set; pgprot_t mask_clr; int numpages; - int flushtlb; + int flags; unsigned long pfn; unsigned force_split : 1; + int curpage; }; +#define CPA_FLUSHTLB 1 +#define CPA_ARRAY 2 + #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -184,6 +188,41 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } +static void cpa_flush_array(unsigned long *start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long *addr; + + BUG_ON(irqs_disabled()); + + on_each_cpu(__cpa_flush_range, NULL, 1); + + if (!cache) + return; + + /* 4M threshold */ + if (numpages >= 1024) { + if (boot_cpu_data.x86_model >= 4) + wbinvd(); + return; + } + /* + * We only need to flush on one CPU, + * clflush is a MESI-coherent instruction that + * will cause all other CPUs to flush the same + * cachelines: + */ + for (i = 0, addr = start; i < numpages; i++, addr++) { + pte_t *pte = lookup_address(*addr, &level); + + /* + * Only flush present addresses: + */ + if (pte && (pte_val(*pte) & _PAGE_PRESENT)) + clflush_cache_range((void *) *addr, PAGE_SIZE); + } +} + /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this @@ -392,7 +431,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, */ new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); __set_pmd_pte(kpte, address, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; do_split = 0; } @@ -578,11 +617,16 @@ out_unlock: static int __change_page_attr(struct cpa_data *cpa, int primary) { - unsigned long address = cpa->vaddr; + unsigned long address; int do_split, err; unsigned int level; pte_t *kpte, old_pte; + if (cpa->flags & CPA_ARRAY) + address = cpa->vaddr[cpa->curpage]; + else + address = *cpa->vaddr; + repeat: kpte = lookup_address(address, &level); if (!kpte) @@ -594,8 +638,8 @@ repeat: return 0; printk(KERN_WARNING "CPA: called for zero pte. " "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); WARN_ON(1); + *cpa->vaddr); return -EINVAL; } @@ -621,7 +665,7 @@ repeat: */ if (pte_val(old_pte) != pte_val(new_pte)) { set_pte_atomic(kpte, new_pte); - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; } cpa->numpages = 1; return 0; @@ -645,7 +689,7 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flushtlb = 1; + cpa->flags |= CPA_FLUSHTLB; goto repeat; } @@ -658,6 +702,7 @@ static int cpa_process_alias(struct cpa_data *cpa) { struct cpa_data alias_cpa; int ret = 0; + unsigned long temp_cpa_vaddr, vaddr; if (cpa->pfn >= max_pfn_mapped) return 0; @@ -670,16 +715,24 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (!(within(cpa->vaddr, PAGE_OFFSET, + if (cpa->flags & CPA_ARRAY) + vaddr = cpa->vaddr[cpa->curpage]; + else + vaddr = *cpa->vaddr; + + if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_low_pfn_mapped << PAGE_SHIFT)) #ifdef CONFIG_X86_64 - || within(cpa->vaddr, PAGE_OFFSET + (1UL<<32), + || within(vaddr, PAGE_OFFSET + (1UL<<32), PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT)) #endif )) { alias_cpa = *cpa; - alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + temp_cpa_vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; + ret = __change_page_attr_set_clr(&alias_cpa, 0); } @@ -691,7 +744,7 @@ static int cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the high * mapping already: */ - if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + if (within(vaddr, (unsigned long) _text, (unsigned long) _end)) return 0; /* @@ -702,8 +755,9 @@ static int cpa_process_alias(struct cpa_data *cpa) return 0; alias_cpa = *cpa; - alias_cpa.vaddr = - (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + temp_cpa_vaddr = (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + alias_cpa.vaddr = &temp_cpa_vaddr; + alias_cpa.flags &= ~CPA_ARRAY; /* * The high mapping range is imprecise, so ignore the return value. @@ -723,6 +777,9 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * preservation check. */ cpa->numpages = numpages; + /* for array changes, we can't use large page */ + if (cpa->flags & CPA_ARRAY) + cpa->numpages = 1; ret = __change_page_attr(cpa, checkalias); if (ret) @@ -741,7 +798,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages); numpages -= cpa->numpages; - cpa->vaddr += cpa->numpages * PAGE_SIZE; + if (cpa->flags & CPA_ARRAY) + cpa->curpage++; + else + *cpa->vaddr += cpa->numpages * PAGE_SIZE; + } return 0; } @@ -752,9 +813,9 @@ static inline int cache_attr(pgprot_t attr) (_PAGE_PAT | _PAGE_PAT_LARGE | _PAGE_PWT | _PAGE_PCD); } -static int change_page_attr_set_clr(unsigned long addr, int numpages, +static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, - int force_split) + int force_split, int array) { struct cpa_data cpa; int ret, cache, checkalias; @@ -769,12 +830,22 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, return 0; /* Ensure we are PAGE_SIZE aligned */ - if (addr & ~PAGE_MASK) { - addr &= PAGE_MASK; - /* - * People should not be passing in unaligned addresses: - */ - WARN_ON_ONCE(1); + if (!array) { + if (*addr & ~PAGE_MASK) { + *addr &= PAGE_MASK; + /* + * People should not be passing in unaligned addresses: + */ + WARN_ON_ONCE(1); + } + } else { + int i; + for (i = 0; i < numpages; i++) { + if (addr[i] & ~PAGE_MASK) { + addr[i] &= PAGE_MASK; + WARN_ON_ONCE(1); + } + } } /* Must avoid aliasing mappings in the highmem code */ @@ -784,9 +855,13 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.numpages = numpages; cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; - cpa.flushtlb = 0; + cpa.flags = 0; + cpa.curpage = 0; cpa.force_split = force_split; + if (array) + cpa.flags |= CPA_ARRAY; + /* No alias checking for _NX bit modifications */ checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; @@ -795,7 +870,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, /* * Check whether we really changed something: */ - if (!cpa.flushtlb) + if (!(cpa.flags & CPA_FLUSHTLB)) goto out; /* @@ -810,9 +885,12 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, * error case we fall back to cpa_flush_all (which uses * wbindv): */ - if (!ret && cpu_has_clflush) - cpa_flush_range(addr, numpages, cache); - else + if (!ret && cpu_has_clflush) { + if (cpa.flags & CPA_ARRAY) + cpa_flush_array(addr, numpages, cache); + else + cpa_flush_range(*addr, numpages, cache); + } else cpa_flush_all(cache); out: @@ -821,16 +899,18 @@ out: return ret; } -static inline int change_page_attr_set(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_set(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0, + array); } -static inline int change_page_attr_clear(unsigned long addr, int numpages, - pgprot_t mask) +static inline int change_page_attr_clear(unsigned long *addr, int numpages, + pgprot_t mask, int array) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0, + array); } int _set_memory_uc(unsigned long addr, int numpages) @@ -838,8 +918,8 @@ int _set_memory_uc(unsigned long addr, int numpages) /* * for now UC MINUS. see comments in ioremap_nocache() */ - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_UC_MINUS)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_UC_MINUS), 0); } int set_memory_uc(unsigned long addr, int numpages) @@ -855,10 +935,31 @@ int set_memory_uc(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_uc); +int set_memory_array_uc(unsigned long *addr, int addrinarray) +{ + int i; + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + for (i = 0; i < addrinarray; i++) { + if (reserve_memtype(addr[i], addr[i] + PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + goto out; + } + + return change_page_attr_set(addr, addrinarray, + __pgprot(_PAGE_CACHE_UC_MINUS), 1); +out: + while (--i >= 0) + free_memtype(addr[i], addr[i] + PAGE_SIZE); + return -EINVAL; +} +EXPORT_SYMBOL(set_memory_array_uc); + int _set_memory_wc(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_CACHE_WC)); + return change_page_attr_set(&addr, numpages, + __pgprot(_PAGE_CACHE_WC), 0); } int set_memory_wc(unsigned long addr, int numpages) @@ -876,8 +977,8 @@ EXPORT_SYMBOL(set_memory_wc); int _set_memory_wb(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_CACHE_MASK)); + return change_page_attr_clear(&addr, numpages, + __pgprot(_PAGE_CACHE_MASK), 0); } int set_memory_wb(unsigned long addr, int numpages) @@ -888,37 +989,48 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); +int set_memory_array_wb(unsigned long *addr, int addrinarray) +{ + int i; + for (i = 0; i < addrinarray; i++) + free_memtype(addr[i], addr[i] + PAGE_SIZE); + + return change_page_attr_clear(addr, addrinarray, + __pgprot(_PAGE_CACHE_MASK), 1); +} +EXPORT_SYMBOL(set_memory_array_wb); + int set_memory_x(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_x); int set_memory_nx(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_NX)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); } EXPORT_SYMBOL(set_memory_nx); int set_memory_ro(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_rw(unsigned long addr, int numpages) { - return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); + return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_RW), 0); } int set_memory_np(unsigned long addr, int numpages) { - return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); + return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_PRESENT), 0); } int set_memory_4k(unsigned long addr, int numpages) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), - __pgprot(0), 1); + return change_page_attr_set_clr(&addr, numpages, __pgprot(0), + __pgprot(0), 1, 0); } int set_pages_uc(struct page *page, int numpages) @@ -971,20 +1083,24 @@ int set_pages_rw(struct page *page, int numpages) static int __set_pages_p(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), - .mask_clr = __pgprot(0)}; + .mask_clr = __pgprot(0), + .flags = 0}; return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) { - struct cpa_data cpa = { .vaddr = (unsigned long) page_address(page), + unsigned long tempaddr = (unsigned long) page_address(page); + struct cpa_data cpa = { .vaddr = &tempaddr, .numpages = numpages, .mask_set = __pgprot(0), - .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; + .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), + .flags = 0}; return __change_page_attr_set_clr(&cpa, 1); } -- cgit v1.2.2 From ab7e79243746e2a9c5f00243e60108189c44c9eb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 21 Aug 2008 10:46:21 +0800 Subject: x86: fix pageattr-test We changed the interface of some pageattr, this patch makes pageattr-test compile. Signed-off-by: Dave Airlie Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 0dcd42eb94e6..6ae1f28a7ff2 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -118,6 +118,7 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; + unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -172,7 +173,8 @@ static int pageattr_test(void) continue; } - err = change_page_attr_set(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_set(&test_addr, len[i], PAGE_TESTBIT, 0); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -204,7 +206,8 @@ static int pageattr_test(void) failed++; continue; } - err = change_page_attr_clear(addr[i], len[i], PAGE_TESTBIT); + test_addr = addr[i]; + err = change_page_attr_clear(&test_addr, len[i], PAGE_TESTBIT, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; -- cgit v1.2.2 From 168d2f464ab9860f0d1e66cf1f9684973222f1c6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:18 -0700 Subject: xen: save previous spinlock when blocking A spinlock can be interrupted while spinning, so make sure we preserve the previous lock of interest if we're taking a lock from within an interrupt handler. We also need to deal with the case where the blocking path gets interrupted between testing to see if the lock is free and actually blocking. If we get interrupted there and end up in the state where the lock is free but the irq isn't pending, then we'll block indefinitely in the hypervisor. This fix is to make sure that any nested lock-takers will always leave the irq pending if there's any chance the outer lock became free. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 65 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 8dc4d31da67f..4884bc603aa7 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -47,25 +47,41 @@ static int xen_spin_trylock(struct raw_spinlock *lock) static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(struct xen_spinlock *, lock_spinners); -static inline void spinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as interested in a lock. Returns the CPU's previous + * lock of interest, in case we got preempted by an interrupt. + */ +static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl) { + struct xen_spinlock *prev; + + prev = __get_cpu_var(lock_spinners); __get_cpu_var(lock_spinners) = xl; + wmb(); /* set lock of interest before count */ + asm(LOCK_PREFIX " incw %0" : "+m" (xl->spinners) : : "memory"); + + return prev; } -static inline void unspinning_lock(struct xen_spinlock *xl) +/* + * Mark a cpu as no longer interested in a lock. Restores previous + * lock of interest (NULL for none). + */ +static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock *prev) { asm(LOCK_PREFIX " decw %0" : "+m" (xl->spinners) : : "memory"); - wmb(); /* decrement count before clearing lock */ - __get_cpu_var(lock_spinners) = NULL; + wmb(); /* decrement count before restoring lock */ + __get_cpu_var(lock_spinners) = prev; } static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; @@ -74,23 +90,42 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) return 0; /* announce we're spinning */ - spinning_lock(xl); + prev = spinning_lock(xl); - /* clear pending */ - xen_clear_irq_pending(irq); + do { + /* clear pending */ + xen_clear_irq_pending(irq); + + /* check again make sure it didn't become free while + we weren't looking */ + ret = xen_spin_trylock(lock); + if (ret) { + /* + * If we interrupted another spinlock while it + * was blocking, make sure it doesn't block + * without rechecking the lock. + */ + if (prev != NULL) + xen_set_irq_pending(irq); + goto out; + } - /* check again make sure it didn't become free while - we weren't looking */ - ret = xen_spin_trylock(lock); - if (ret) - goto out; + /* + * Block until irq becomes pending. If we're + * interrupted at this point (after the trylock but + * before entering the block), then the nested lock + * handler guarantees that the irq will be left + * pending if there's any chance the lock became free; + * xen_poll_irq() returns immediately if the irq is + * pending. + */ + xen_poll_irq(irq); + } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - /* block until irq becomes pending */ - xen_poll_irq(irq); kstat_this_cpu.irqs[irq]++; out: - unspinning_lock(xl); + unspinning_lock(xl, prev); return ret; } -- cgit v1.2.2 From 994025caba3e6beade9bde84dd1b70d9d250f27b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:19 -0700 Subject: xen: add debugfs support Add support for exporting statistics on mmu updates, multicall batching and pv spinlocks into debugfs. The base path is xen/ and each subsystem adds its own directory: mmu, multicalls, spinlocks. In each directory, writing 1 to "zero_stats" will cause the corresponding stats to be zeroed the next time they're updated. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 10 ++- arch/x86/xen/Makefile | 3 +- arch/x86/xen/debugfs.c | 123 ++++++++++++++++++++++++++++++++++ arch/x86/xen/debugfs.h | 10 +++ arch/x86/xen/mmu.c | 163 ++++++++++++++++++++++++++++++++++++++++++++- arch/x86/xen/multicalls.c | 115 +++++++++++++++++++++++++++++++- arch/x86/xen/spinlock.c | 165 +++++++++++++++++++++++++++++++++++++++++++++- 7 files changed, 580 insertions(+), 9 deletions(-) create mode 100644 arch/x86/xen/debugfs.c create mode 100644 arch/x86/xen/debugfs.h (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index 3815e425f470..d3e68465ace9 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -27,4 +27,12 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool depends on PM - default y \ No newline at end of file + default y + +config XEN_DEBUG_FS + bool "Enable Xen debug and tuning parameters in debugfs" + depends on XEN && DEBUG_FS + default n + help + Enable statistics output and various tuning options in debugfs. + Enabling this option may incur a significant performance overhead. \ No newline at end of file diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile index 9ee745fa5527..313947940a1a 100644 --- a/arch/x86/xen/Makefile +++ b/arch/x86/xen/Makefile @@ -8,4 +8,5 @@ endif obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ time.o xen-asm_$(BITS).o grant-table.o suspend.o -obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_SMP) += smp.o spinlock.o +obj-$(CONFIG_XEN_DEBUG_FS) += debugfs.o \ No newline at end of file diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c new file mode 100644 index 000000000000..b53225d2cac3 --- /dev/null +++ b/arch/x86/xen/debugfs.c @@ -0,0 +1,123 @@ +#include +#include +#include + +#include "debugfs.h" + +static struct dentry *d_xen_debug; + +struct dentry * __init xen_init_debugfs(void) +{ + if (!d_xen_debug) { + d_xen_debug = debugfs_create_dir("xen", NULL); + + if (!d_xen_debug) + pr_warning("Could not create 'xen' debugfs directory\n"); + } + + return d_xen_debug; +} + +struct array_data +{ + void *array; + unsigned elements; +}; + +static int u32_array_open(struct inode *inode, struct file *file) +{ + file->private_data = NULL; + return nonseekable_open(inode, file); +} + +static size_t format_array(char *buf, size_t bufsize, const char *fmt, + u32 *array, unsigned array_size) +{ + size_t ret = 0; + unsigned i; + + for(i = 0; i < array_size; i++) { + size_t len; + + len = snprintf(buf, bufsize, fmt, array[i]); + len++; /* ' ' or '\n' */ + ret += len; + + if (buf) { + buf += len; + bufsize -= len; + buf[-1] = (i == array_size-1) ? '\n' : ' '; + } + } + + ret++; /* \0 */ + if (buf) + *buf = '\0'; + + return ret; +} + +static char *format_array_alloc(const char *fmt, u32 *array, unsigned array_size) +{ + size_t len = format_array(NULL, 0, fmt, array, array_size); + char *ret; + + ret = kmalloc(len, GFP_KERNEL); + if (ret == NULL) + return NULL; + + format_array(ret, len, fmt, array, array_size); + return ret; +} + +static ssize_t u32_array_read(struct file *file, char __user *buf, size_t len, + loff_t *ppos) +{ + struct inode *inode = file->f_path.dentry->d_inode; + struct array_data *data = inode->i_private; + size_t size; + + if (*ppos == 0) { + if (file->private_data) { + kfree(file->private_data); + file->private_data = NULL; + } + + file->private_data = format_array_alloc("%u", data->array, data->elements); + } + + size = 0; + if (file->private_data) + size = strlen(file->private_data); + + return simple_read_from_buffer(buf, len, ppos, file->private_data, size); +} + +static int xen_array_release(struct inode *inode, struct file *file) +{ + kfree(file->private_data); + + return 0; +} + +static struct file_operations u32_array_fops = { + .owner = THIS_MODULE, + .open = u32_array_open, + .release= xen_array_release, + .read = u32_array_read, +}; + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements) +{ + struct array_data *data = kmalloc(sizeof(*data), GFP_KERNEL); + + if (data == NULL) + return NULL; + + data->array = array; + data->elements = elements; + + return debugfs_create_file(name, mode, parent, data, &u32_array_fops); +} diff --git a/arch/x86/xen/debugfs.h b/arch/x86/xen/debugfs.h new file mode 100644 index 000000000000..e28132084832 --- /dev/null +++ b/arch/x86/xen/debugfs.h @@ -0,0 +1,10 @@ +#ifndef _XEN_DEBUGFS_H +#define _XEN_DEBUGFS_H + +struct dentry * __init xen_init_debugfs(void); + +struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, + struct dentry *parent, + u32 *array, unsigned elements); + +#endif /* _XEN_DEBUGFS_H */ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index d9a35a363095..f5af913fd7b0 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -40,6 +40,7 @@ */ #include #include +#include #include #include @@ -57,6 +58,61 @@ #include "multicalls.h" #include "mmu.h" +#include "debugfs.h" + +#define MMU_UPDATE_HISTO 30 + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct { + u32 pgd_update; + u32 pgd_update_pinned; + u32 pgd_update_batched; + + u32 pud_update; + u32 pud_update_pinned; + u32 pud_update_batched; + + u32 pmd_update; + u32 pmd_update_pinned; + u32 pmd_update_batched; + + u32 pte_update; + u32 pte_update_pinned; + u32 pte_update_batched; + + u32 mmu_update; + u32 mmu_update_extended; + u32 mmu_update_histo[MMU_UPDATE_HISTO]; + + u32 prot_commit; + u32 prot_commit_batched; + + u32 set_pte_at; + u32 set_pte_at_batched; + u32 set_pte_at_pinned; + u32 set_pte_at_current; + u32 set_pte_at_kernel; +} mmu_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mmu_stats, 0, sizeof(mmu_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); mmu_stats.elem += (val); } while(0) + +#else /* !CONFIG_XEN_DEBUG_FS */ + +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +#endif /* CONFIG_XEN_DEBUG_FS */ /* * Just beyond the highest usermode address. STACK_TOP_MAX has a @@ -243,11 +299,21 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); - if (mcs.mc != NULL) + if (mcs.mc != NULL) { + ADD_STATS(mmu_update_extended, 1); + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); + mcs.mc->args[1]++; - else { + + if (mcs.mc->args[1] < MMU_UPDATE_HISTO) + ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); + else + ADD_STATS(mmu_update_histo[0], 1); + } else { + ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); + ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; @@ -267,6 +333,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -274,6 +342,8 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) void xen_set_pmd(pmd_t *ptr, pmd_t val) { + ADD_STATS(pmd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -281,6 +351,8 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } + ADD_STATS(pmd_update_pinned, 1); + xen_set_pmd_hyper(ptr, val); } @@ -300,12 +372,18 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, if (mm == &init_mm) preempt_disable(); + ADD_STATS(set_pte_at, 1); +// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); + ADD_STATS(set_pte_at_current, mm == current->mm); + ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (mm == current->mm || mm == &init_mm) { if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { struct multicall_space mcs; mcs = xen_mc_entry(0); MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); + ADD_STATS(set_pte_at_batched, 1); xen_mc_issue(PARAVIRT_LAZY_MMU); goto out; } else @@ -336,6 +414,9 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); + ADD_STATS(prot_commit, 1); + ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -402,6 +483,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); + ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); @@ -409,6 +492,8 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) void xen_set_pud(pud_t *ptr, pud_t val) { + ADD_STATS(pud_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -416,11 +501,17 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } + ADD_STATS(pud_update_pinned, 1); + xen_set_pud_hyper(ptr, val); } void xen_set_pte(pte_t *ptep, pte_t pte) { + ADD_STATS(pte_update, 1); +// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); + ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + #ifdef CONFIG_X86_PAE ptep->pte_high = pte.pte_high; smp_wmb(); @@ -517,6 +608,8 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); + ADD_STATS(pgd_update, 1); + /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -528,6 +621,9 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } + ADD_STATS(pgd_update_pinned, 1); + ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); + /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -1003,3 +1099,66 @@ void xen_exit_mmap(struct mm_struct *mm) spin_unlock(&mm->page_table_lock); } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mmu_debug; + +static int __init xen_mmu_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mmu_debug = debugfs_create_dir("mmu", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); + + debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); + debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pgd_update_pinned); + + debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); + debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, + &mmu_stats.pud_update_pinned); + + debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); + debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, + &mmu_stats.pmd_update_pinned); + + debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); +// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, +// &mmu_stats.pte_update_pinned); + debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, + &mmu_stats.pte_update_pinned); + + debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); + debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, + &mmu_stats.mmu_update_extended); + xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, + mmu_stats.mmu_update_histo, 20); + + debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); + debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_batched); + debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_current); + debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, + &mmu_stats.set_pte_at_kernel); + + debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); + debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, + &mmu_stats.prot_commit_batched); + + return 0; +} +fs_initcall(xen_mmu_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c index 9efd1c6c9776..8ea8a0d0b0de 100644 --- a/arch/x86/xen/multicalls.c +++ b/arch/x86/xen/multicalls.c @@ -21,16 +21,20 @@ */ #include #include +#include #include #include "multicalls.h" +#include "debugfs.h" + +#define MC_BATCH 32 #define MC_DEBUG 1 -#define MC_BATCH 32 #define MC_ARGS (MC_BATCH * 16) + struct mc_buffer { struct multicall_entry entries[MC_BATCH]; #if MC_DEBUG @@ -47,6 +51,76 @@ struct mc_buffer { static DEFINE_PER_CPU(struct mc_buffer, mc_buffer); DEFINE_PER_CPU(unsigned long, xen_mc_irq_flags); +/* flush reasons 0- slots, 1- args, 2- callbacks */ +enum flush_reasons +{ + FL_SLOTS, + FL_ARGS, + FL_CALLBACKS, + + FL_N_REASONS +}; + +#ifdef CONFIG_XEN_DEBUG_FS +#define NHYPERCALLS 40 /* not really */ + +static struct { + unsigned histo[MC_BATCH+1]; + + unsigned issued; + unsigned arg_total; + unsigned hypercalls; + unsigned histo_hypercalls[NHYPERCALLS]; + + unsigned flush[FL_N_REASONS]; +} mc_stats; + +static u8 zero_stats; + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&mc_stats, 0, sizeof(mc_stats)); + zero_stats = 0; + } +} + +static void mc_add_stats(const struct mc_buffer *mc) +{ + int i; + + check_zero(); + + mc_stats.issued++; + mc_stats.hypercalls += mc->mcidx; + mc_stats.arg_total += mc->argidx; + + mc_stats.histo[mc->mcidx]++; + for(i = 0; i < mc->mcidx; i++) { + unsigned op = mc->entries[i].op; + if (op < NHYPERCALLS) + mc_stats.histo_hypercalls[op]++; + } +} + +static void mc_stats_flush(enum flush_reasons idx) +{ + check_zero(); + + mc_stats.flush[idx]++; +} + +#else /* !CONFIG_XEN_DEBUG_FS */ + +static inline void mc_add_stats(const struct mc_buffer *mc) +{ +} + +static inline void mc_stats_flush(enum flush_reasons idx) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ + void xen_mc_flush(void) { struct mc_buffer *b = &__get_cpu_var(mc_buffer); @@ -60,6 +134,8 @@ void xen_mc_flush(void) something in the middle */ local_irq_save(flags); + mc_add_stats(b); + if (b->mcidx) { #if MC_DEBUG memcpy(b->debug, b->entries, @@ -115,6 +191,7 @@ struct multicall_space __xen_mc_entry(size_t args) if (b->mcidx == MC_BATCH || (argidx + args) > MC_ARGS) { + mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); xen_mc_flush(); argidx = roundup(b->argidx, sizeof(u64)); } @@ -158,10 +235,44 @@ void xen_mc_callback(void (*fn)(void *), void *data) struct mc_buffer *b = &__get_cpu_var(mc_buffer); struct callback *cb; - if (b->cbidx == MC_BATCH) + if (b->cbidx == MC_BATCH) { + mc_stats_flush(FL_CALLBACKS); xen_mc_flush(); + } cb = &b->callbacks[b->cbidx++]; cb->fn = fn; cb->data = data; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_mc_debug; + +static int __init xen_mc_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_mc_debug = debugfs_create_dir("multicalls", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_mc_debug, &zero_stats); + + debugfs_create_u32("batches", 0444, d_mc_debug, &mc_stats.issued); + debugfs_create_u32("hypercalls", 0444, d_mc_debug, &mc_stats.hypercalls); + debugfs_create_u32("arg_total", 0444, d_mc_debug, &mc_stats.arg_total); + + xen_debugfs_create_u32_array("batch_histo", 0444, d_mc_debug, + mc_stats.histo, MC_BATCH); + xen_debugfs_create_u32_array("hypercall_histo", 0444, d_mc_debug, + mc_stats.histo_hypercalls, NHYPERCALLS); + xen_debugfs_create_u32_array("flush_reasons", 0444, d_mc_debug, + mc_stats.flush, FL_N_REASONS); + + return 0; +} +fs_initcall(xen_mc_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 4884bc603aa7..0d8f3b2d9bec 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -4,6 +4,8 @@ */ #include #include +#include +#include #include @@ -11,6 +13,93 @@ #include #include "xen-ops.h" +#include "debugfs.h" + +#ifdef CONFIG_XEN_DEBUG_FS +static struct xen_spinlock_stats +{ + u64 taken; + u32 taken_slow; + u32 taken_slow_nested; + u32 taken_slow_pickup; + u32 taken_slow_spurious; + + u64 released; + u32 released_slow; + u32 released_slow_kicked; + +#define HISTO_BUCKETS 20 + u32 histo_spin_fast[HISTO_BUCKETS+1]; + u32 histo_spin[HISTO_BUCKETS+1]; + + u64 spinning_time; + u64 total_time; +} spinlock_stats; + +static u8 zero_stats; + +static unsigned lock_timeout = 1 << 10; +#define TIMEOUT lock_timeout + +static inline void check_zero(void) +{ + if (unlikely(zero_stats)) { + memset(&spinlock_stats, 0, sizeof(spinlock_stats)); + zero_stats = 0; + } +} + +#define ADD_STATS(elem, val) \ + do { check_zero(); spinlock_stats.elem += (val); } while(0) + +static inline u64 spin_time_start(void) +{ + return xen_clocksource_read(); +} + +static void __spin_time_accum(u64 delta, u32 *array) +{ + unsigned index = ilog2(delta); + + check_zero(); + + if (index < HISTO_BUCKETS) + array[index]++; + else + array[HISTO_BUCKETS]++; +} + +static inline void spin_time_accum_fast(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_fast); + spinlock_stats.spinning_time += delta; +} + +static inline void spin_time_accum(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin); + spinlock_stats.total_time += delta; +} +#else /* !CONFIG_XEN_DEBUG_FS */ +#define TIMEOUT (1 << 10) +#define ADD_STATS(elem, val) do { (void)(val); } while(0) + +static inline u64 spin_time_start(void) +{ + return 0; +} + +static inline void spin_time_accum_fast(u64 start) +{ +} +static inline void spin_time_accum(u64 start) +{ +} +#endif /* CONFIG_XEN_DEBUG_FS */ struct xen_spinlock { unsigned char lock; /* 0 -> free; 1 -> locked */ @@ -92,6 +181,9 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + ADD_STATS(taken_slow, 1); + ADD_STATS(taken_slow_nested, prev != NULL); + do { /* clear pending */ xen_clear_irq_pending(irq); @@ -100,6 +192,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) we weren't looking */ ret = xen_spin_trylock(lock); if (ret) { + ADD_STATS(taken_slow_pickup, 1); + /* * If we interrupted another spinlock while it * was blocking, make sure it doesn't block @@ -120,6 +214,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) * pending. */ xen_poll_irq(irq); + ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ kstat_this_cpu.irqs[irq]++; @@ -132,11 +227,18 @@ out: static void xen_spin_lock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; - int timeout; + unsigned timeout; u8 oldval; + u64 start_spin; + + ADD_STATS(taken, 1); + + start_spin = spin_time_start(); do { - timeout = 1 << 10; + u64 start_spin_fast = spin_time_start(); + + timeout = TIMEOUT; asm("1: xchgb %1,%0\n" " testb %1,%1\n" @@ -151,16 +253,22 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "1" (1) : "memory"); - } while (unlikely(oldval != 0 && !xen_spin_lock_slow(lock))); + spin_time_accum_fast(start_spin_fast); + } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + spin_time_accum(start_spin); } static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; + ADD_STATS(released_slow, 1); + for_each_online_cpu(cpu) { /* XXX should mix up next cpu selection */ if (per_cpu(lock_spinners, cpu) == xl) { + ADD_STATS(released_slow_kicked, 1); xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); break; } @@ -171,6 +279,8 @@ static void xen_spin_unlock(struct raw_spinlock *lock) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; + ADD_STATS(released, 1); + smp_wmb(); /* make sure no writes get moved after unlock */ xl->lock = 0; /* release lock */ @@ -216,3 +326,52 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } + +#ifdef CONFIG_XEN_DEBUG_FS + +static struct dentry *d_spin_debug; + +static int __init xen_spinlock_debugfs(void) +{ + struct dentry *d_xen = xen_init_debugfs(); + + if (d_xen == NULL) + return -ENOMEM; + + d_spin_debug = debugfs_create_dir("spinlocks", d_xen); + + debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); + + debugfs_create_u32("timeout", 0644, d_spin_debug, &lock_timeout); + + debugfs_create_u64("taken", 0444, d_spin_debug, &spinlock_stats.taken); + debugfs_create_u32("taken_slow", 0444, d_spin_debug, + &spinlock_stats.taken_slow); + debugfs_create_u32("taken_slow_nested", 0444, d_spin_debug, + &spinlock_stats.taken_slow_nested); + debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, + &spinlock_stats.taken_slow_pickup); + debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, + &spinlock_stats.taken_slow_spurious); + + debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); + debugfs_create_u32("released_slow", 0444, d_spin_debug, + &spinlock_stats.released_slow); + debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, + &spinlock_stats.released_slow_kicked); + + debugfs_create_u64("time_spinning", 0444, d_spin_debug, + &spinlock_stats.spinning_time); + debugfs_create_u64("time_total", 0444, d_spin_debug, + &spinlock_stats.total_time); + + xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, + spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, + spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + + return 0; +} +fs_initcall(xen_spinlock_debugfs); + +#endif /* CONFIG_XEN_DEBUG_FS */ -- cgit v1.2.2 From 1e696f638b523958ee3d95e655becdb4c4b6fcde Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:20 -0700 Subject: xen: allow interrupts to be enabled while doing a blocking spin If spin_lock is called in an interrupts-enabled context, we can safely enable interrupts while spinning. We don't bother for the actual spin loop, but if we timeout and fall back to blocking, it's definitely worthwhile enabling interrupts if possible. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 0d8f3b2d9bec..e6061f2e64f2 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -23,6 +23,7 @@ static struct xen_spinlock_stats u32 taken_slow_nested; u32 taken_slow_pickup; u32 taken_slow_spurious; + u32 taken_slow_irqenable; u64 released; u32 released_slow; @@ -167,12 +168,13 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock __get_cpu_var(lock_spinners) = prev; } -static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) +static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; struct xen_spinlock *prev; int irq = __get_cpu_var(lock_kicker_irq); int ret; + unsigned long flags; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) @@ -181,6 +183,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) /* announce we're spinning */ prev = spinning_lock(xl); + flags = __raw_local_save_flags(); + if (irq_enable) { + ADD_STATS(taken_slow_irqenable, 1); + raw_local_irq_enable(); + } + ADD_STATS(taken_slow, 1); ADD_STATS(taken_slow_nested, prev != NULL); @@ -220,11 +228,12 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock) kstat_this_cpu.irqs[irq]++; out: + raw_local_irq_restore(flags); unspinning_lock(xl, prev); return ret; } -static void xen_spin_lock(struct raw_spinlock *lock) +static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) { struct xen_spinlock *xl = (struct xen_spinlock *)lock; unsigned timeout; @@ -254,11 +263,23 @@ static void xen_spin_lock(struct raw_spinlock *lock) : "memory"); spin_time_accum_fast(start_spin_fast); - } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock)))); + + } while (unlikely(oldval != 0 && + (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); spin_time_accum(start_spin); } +static void xen_spin_lock(struct raw_spinlock *lock) +{ + __xen_spin_lock(lock, false); +} + +static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) +{ + __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); +} + static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl) { int cpu; @@ -323,6 +344,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.spin_is_locked = xen_spin_is_locked; pv_lock_ops.spin_is_contended = xen_spin_is_contended; pv_lock_ops.spin_lock = xen_spin_lock; + pv_lock_ops.spin_lock_flags = xen_spin_lock_flags; pv_lock_ops.spin_trylock = xen_spin_trylock; pv_lock_ops.spin_unlock = xen_spin_unlock; } @@ -353,6 +375,8 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.taken_slow_pickup); debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, &spinlock_stats.taken_slow_spurious); + debugfs_create_u32("taken_slow_irqenable", 0444, d_spin_debug, + &spinlock_stats.taken_slow_irqenable); debugfs_create_u64("released", 0444, d_spin_debug, &spinlock_stats.released); debugfs_create_u32("released_slow", 0444, d_spin_debug, -- cgit v1.2.2 From f8eca41f0afcc7042dc3398c8ba5878c59b1cf1b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 17:02:21 -0700 Subject: xen: measure how long spinlocks spend blocking Measure how long spinlocks spend blocked. Also rename some fields to be more consistent. Signed-off-by: Jeremy Fitzhardinge Acked-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/xen/spinlock.c | 62 +++++++++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index e6061f2e64f2..d072823bc06d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -29,12 +29,14 @@ static struct xen_spinlock_stats u32 released_slow; u32 released_slow_kicked; -#define HISTO_BUCKETS 20 - u32 histo_spin_fast[HISTO_BUCKETS+1]; - u32 histo_spin[HISTO_BUCKETS+1]; - - u64 spinning_time; - u64 total_time; +#define HISTO_BUCKETS 30 + u32 histo_spin_total[HISTO_BUCKETS+1]; + u32 histo_spin_spinning[HISTO_BUCKETS+1]; + u32 histo_spin_blocked[HISTO_BUCKETS+1]; + + u64 time_total; + u64 time_spinning; + u64 time_blocked; } spinlock_stats; static u8 zero_stats; @@ -70,20 +72,28 @@ static void __spin_time_accum(u64 delta, u32 *array) array[HISTO_BUCKETS]++; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_spinning(u64 start) +{ + u32 delta = xen_clocksource_read() - start; + + __spin_time_accum(delta, spinlock_stats.histo_spin_spinning); + spinlock_stats.time_spinning += delta; +} + +static inline void spin_time_accum_total(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_fast); - spinlock_stats.spinning_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_total); + spinlock_stats.time_total += delta; } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { u32 delta = xen_clocksource_read() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin); - spinlock_stats.total_time += delta; + __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); + spinlock_stats.time_blocked += delta; } #else /* !CONFIG_XEN_DEBUG_FS */ #define TIMEOUT (1 << 10) @@ -94,10 +104,13 @@ static inline u64 spin_time_start(void) return 0; } -static inline void spin_time_accum_fast(u64 start) +static inline void spin_time_accum_total(u64 start) +{ +} +static inline void spin_time_accum_spinning(u64 start) { } -static inline void spin_time_accum(u64 start) +static inline void spin_time_accum_blocked(u64 start) { } #endif /* CONFIG_XEN_DEBUG_FS */ @@ -175,11 +188,14 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl int irq = __get_cpu_var(lock_kicker_irq); int ret; unsigned long flags; + u64 start; /* If kicker interrupts not initialized yet, just spin */ if (irq == -1) return 0; + start = spin_time_start(); + /* announce we're spinning */ prev = spinning_lock(xl); @@ -230,6 +246,8 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl out: raw_local_irq_restore(flags); unspinning_lock(xl, prev); + spin_time_accum_blocked(start); + return ret; } @@ -262,12 +280,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) : "1" (1) : "memory"); - spin_time_accum_fast(start_spin_fast); + spin_time_accum_spinning(start_spin_fast); } while (unlikely(oldval != 0 && (TIMEOUT == ~0 || !xen_spin_lock_slow(lock, irq_enable)))); - spin_time_accum(start_spin); + spin_time_accum_total(start_spin); } static void xen_spin_lock(struct raw_spinlock *lock) @@ -385,14 +403,18 @@ static int __init xen_spinlock_debugfs(void) &spinlock_stats.released_slow_kicked); debugfs_create_u64("time_spinning", 0444, d_spin_debug, - &spinlock_stats.spinning_time); + &spinlock_stats.time_spinning); + debugfs_create_u64("time_blocked", 0444, d_spin_debug, + &spinlock_stats.time_blocked); debugfs_create_u64("time_total", 0444, d_spin_debug, - &spinlock_stats.total_time); + &spinlock_stats.time_total); xen_debugfs_create_u32_array("histo_total", 0444, d_spin_debug, - spinlock_stats.histo_spin, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_total, HISTO_BUCKETS + 1); xen_debugfs_create_u32_array("histo_spinning", 0444, d_spin_debug, - spinlock_stats.histo_spin_fast, HISTO_BUCKETS + 1); + spinlock_stats.histo_spin_spinning, HISTO_BUCKETS + 1); + xen_debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, + spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); return 0; } -- cgit v1.2.2 From f86399396ce7a4f4069828b7dceac5aa5113dfb5 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Wed, 30 Jul 2008 18:32:27 -0300 Subject: x86, paravirt_ops: use unsigned long instead of u32 for alloc_p*() pfn args This patch changes the pfn args from 'u32' to 'unsigned long' on alloc_p*() functions on paravirt_ops, and the corresponding implementations for Xen and VMI. The prototypes for CONFIG_PARAVIRT=n are already using unsigned long, so paravirt.h now matches the prototypes on asm-x86/pgalloc.h. It shouldn't result in any changes on generated code on 32-bit, with or without CONFIG_PARAVIRT. On both cases, 'codiff -f' didn't show any change after applying this patch. On 64-bit, there are (expected) binary changes only when CONFIG_PARAVIRT is enabled, as the patch is really supposed to change the size of the pfn args. [ v2: KVM_GUEST: use the right parameter type on kvm_release_pt() ] Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Acked-by: Zachary Amsden Signed-off-by: Ingo Molnar --- arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/vmi_32.c | 10 +++++----- arch/x86/xen/enlighten.c | 20 ++++++++++---------- 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8b7a3cf37d2b..478bca986eca 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -178,7 +178,7 @@ static void kvm_flush_tlb(void) kvm_deferred_mmu_op(&ftlb, sizeof ftlb); } -static void kvm_release_pt(u32 pfn) +static void kvm_release_pt(unsigned long pfn) { struct kvm_mmu_op_release_pt rpt = { .header.op = KVM_MMU_OP_RELEASE_PT, diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 0a1b1a9d922d..340b03643b95 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -392,13 +392,13 @@ static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type) } #endif -static void vmi_allocate_pte(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) { vmi_set_page_type(pfn, VMI_PAGE_L1); vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); } -static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) +static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn) { /* * This call comes in very early, before mem_map is setup. @@ -409,20 +409,20 @@ static void vmi_allocate_pmd(struct mm_struct *mm, u32 pfn) vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0); } -static void vmi_allocate_pmd_clone(u32 pfn, u32 clonepfn, u32 start, u32 count) +static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) { vmi_set_page_type(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE); vmi_check_page_type(clonepfn, VMI_PAGE_L2); vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count); } -static void vmi_release_pte(u32 pfn) +static void vmi_release_pte(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L1); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); } -static void vmi_release_pmd(u32 pfn) +static void vmi_release_pmd(unsigned long pfn) { vmi_ops.release_page(pfn, VMI_PAGE_L2); vmi_set_page_type(pfn, VMI_PAGE_NORMAL); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..db970bdc5e3d 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -812,7 +812,7 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high) /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ -static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) +static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ @@ -822,7 +822,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, u32 pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(u32 pfn) +static void xen_release_pte_init(unsigned long pfn) { make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } @@ -838,7 +838,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) /* This needs to make sure the new pte page is pinned iff its being attached to a pinned pagetable. */ -static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) +static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -856,12 +856,12 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) } } -static void xen_alloc_pte(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PTE); } -static void xen_alloc_pmd(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PMD); } @@ -909,7 +909,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) } /* This should never happen until we're OK to use struct page */ -static void xen_release_ptpage(u32 pfn, unsigned level) +static void xen_release_ptpage(unsigned long pfn, unsigned level) { struct page *page = pfn_to_page(pfn); @@ -923,23 +923,23 @@ static void xen_release_ptpage(u32 pfn, unsigned level) } } -static void xen_release_pte(u32 pfn) +static void xen_release_pte(unsigned long pfn) { xen_release_ptpage(pfn, PT_PTE); } -static void xen_release_pmd(u32 pfn) +static void xen_release_pmd(unsigned long pfn) { xen_release_ptpage(pfn, PT_PMD); } #if PAGETABLE_LEVELS == 4 -static void xen_alloc_pud(struct mm_struct *mm, u32 pfn) +static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) { xen_alloc_ptpage(mm, pfn, PT_PUD); } -static void xen_release_pud(u32 pfn) +static void xen_release_pud(unsigned long pfn) { xen_release_ptpage(pfn, PT_PUD); } -- cgit v1.2.2 From 38cc1c3df77c1bb739a4766788eb9fa49f16ffdf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 20:24:24 -0700 Subject: x86: work around MTRR mask setting Joshua Hoblitt reported that only 3 GB of his 16 GB of RAM is usable. Booting with mtrr_show showed us the BIOS-initialized MTRR settings - which are all wrong. So the root cause is that the BIOS has not set the mask correctly: > [ 0.429971] MSR00000200: 00000000d0000000 > [ 0.433305] MSR00000201: 0000000ff0000800 > should be ==> [ 0.433305] MSR00000201: 0000003ff0000800 > > [ 0.436638] MSR00000202: 00000000e0000000 > [ 0.439971] MSR00000203: 0000000fe0000800 > should be ==> [ 0.439971] MSR00000203: 0000003fe0000800 > > [ 0.443304] MSR00000204: 0000000000000006 > [ 0.446637] MSR00000205: 0000000c00000800 > should be ==> [ 0.446637] MSR00000205: 0000003c00000800 > > [ 0.449970] MSR00000206: 0000000400000006 > [ 0.453303] MSR00000207: 0000000fe0000800 > should be ==> [ 0.453303] MSR00000207: 0000003fe0000800 > > [ 0.456636] MSR00000208: 0000000420000006 > [ 0.459970] MSR00000209: 0000000ff0000800 > should be ==> [ 0.459970] MSR00000209: 0000003ff0000800 So detect this borkage and add the prefix 111. Signed-off-by: Yinghai Lu Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 509bd3d9eacd..43102e03e2d1 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -379,6 +379,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, unsigned long *size, mtrr_type *type) { unsigned int mask_lo, mask_hi, base_lo, base_hi; + unsigned int tmp, hi; rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); if ((mask_lo & 0x800) == 0) { @@ -392,8 +393,18 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, rdmsr(MTRRphysBase_MSR(reg), base_lo, base_hi); /* Work out the shifted address mask. */ - mask_lo = size_or_mask | mask_hi << (32 - PAGE_SHIFT) - | mask_lo >> PAGE_SHIFT; + tmp = mask_hi << (32 - PAGE_SHIFT) | mask_lo >> PAGE_SHIFT; + mask_lo = size_or_mask | tmp; + /* Expand tmp with high bits to all 1s*/ + hi = fls(tmp); + if (hi > 0) { + tmp |= ~((1<<(hi - 1)) - 1); + + if (tmp != mask_lo) { + WARN_ON("mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + mask_lo = tmp; + } + } /* This works correctly if size is a power of two, i.e. a contiguous range. */ -- cgit v1.2.2 From 9a79f4f491f92bc713e1f28f96516b141b752600 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Fri, 22 Aug 2008 00:10:13 +0200 Subject: x86: {reverve,free}_memtype() take a physical address The new set_memory_array_{uc,wb}() pass virtual addresses to {reserve,free}_memtype() it seems. Signed-off-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 1785591808bd..fed6ba2a8e7e 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -947,7 +947,7 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray) * for now UC MINUS. see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - if (reserve_memtype(addr[i], addr[i] + PAGE_SIZE, + if (reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, _PAGE_CACHE_UC_MINUS, NULL)) goto out; } @@ -956,7 +956,7 @@ int set_memory_array_uc(unsigned long *addr, int addrinarray) __pgprot(_PAGE_CACHE_UC_MINUS), 1); out: while (--i >= 0) - free_memtype(addr[i], addr[i] + PAGE_SIZE); + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return -EINVAL; } EXPORT_SYMBOL(set_memory_array_uc); @@ -998,7 +998,7 @@ int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; for (i = 0; i < addrinarray; i++) - free_memtype(addr[i], addr[i] + PAGE_SIZE); + free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); -- cgit v1.2.2 From c5e147cf5aeb31aa1a9030be9727914855fc4133 Mon Sep 17 00:00:00 2001 From: Rene Herman Date: Fri, 22 Aug 2008 01:02:20 +0200 Subject: x86: have set_memory_array_{uc,wb} coalesce memtypes. Actually, might as well simply reconstruct the memtype list at free time I guess. How is this for a coalescing version of the array functions? Compiles, boots and provides me with: root@7ixe4:~# wc -l /debug/x86/pat_memtype_list 53 /debug/x86/pat_memtype_list otherwise (down from 16384+). Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index fed6ba2a8e7e..497108825da9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -942,21 +942,38 @@ EXPORT_SYMBOL(set_memory_uc); int set_memory_array_uc(unsigned long *addr, int addrinarray) { + unsigned long start; + unsigned long end; int i; /* * for now UC MINUS. see comments in ioremap_nocache() */ for (i = 0; i < addrinarray; i++) { - if (reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, - _PAGE_CACHE_UC_MINUS, NULL)) + start = __pa(addr[i]); + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL)) goto out; } return change_page_attr_set(addr, addrinarray, __pgprot(_PAGE_CACHE_UC_MINUS), 1); out: - while (--i >= 0) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + for (i = 0; i < addrinarray; i++) { + unsigned long tmp = __pa(addr[i]); + + if (tmp == start) + break; + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(tmp, end); + } return -EINVAL; } EXPORT_SYMBOL(set_memory_array_uc); @@ -997,9 +1014,18 @@ EXPORT_SYMBOL(set_memory_wb); int set_memory_array_wb(unsigned long *addr, int addrinarray) { int i; - for (i = 0; i < addrinarray; i++) - free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); + for (i = 0; i < addrinarray; i++) { + unsigned long start = __pa(addr[i]); + unsigned long end; + + for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + if (end != __pa(addr[i + 1])) + break; + i++; + } + free_memtype(start, end); + } return change_page_attr_clear(addr, addrinarray, __pgprot(_PAGE_CACHE_MASK), 1); } -- cgit v1.2.2 From 5b792d320f28ff83dd4c13f984807e26235f7703 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 21 Aug 2008 13:43:51 -0700 Subject: x86, microcode_amd: fix shift warning microcode_amd.c uses ">> 32" on a 32-bit value, so gcc warns about that. The code could use something like this *untested* patch. linux-next-20080821/arch/x86/kernel/microcode_amd.c:229: warning: right shift count >= width of type Signed-off-by: Randy Dunlap Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 4006e5e3adf0..d606a05545ca 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -206,6 +206,7 @@ static void apply_microcode_amd(int cpu) unsigned int rev; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -215,10 +216,9 @@ static void apply_microcode_amd(int cpu) spin_lock_irqsave(µcode_update_lock, flags); - edx = (unsigned int)(((unsigned long) - &(uci->mc.mc_amd->hdr.data_code)) >> 32); - eax = (unsigned int)(((unsigned long) - &(uci->mc.mc_amd->hdr.data_code)) & 0xffffffffL); + addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code; + edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); + eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); asm volatile("movl %0, %%ecx; wrmsr" : : "i" (0xc0010020), "a" (eax), "d" (edx) : "ecx"); -- cgit v1.2.2 From 8ae3a5a8dff2c92bd1087bb97c4a3bb61174303e Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 21 Aug 2008 14:27:22 +0100 Subject: x86: fix 1:1 mapping init on 64-bit (memory hotplug case) While I don't have a hotplug capable system at hand, I think two issues need fixing: - pud_phys (in kernel_physical_ampping_init()) would remain uninitialized in the after_bootmem case - the locking done just around phys_pmd_{init,update}() would leave out pgd updates, and it was needlessly covering code portions that do allocations (perhaps using a more friendly gfp value in alloc_low_page() would then be possible) Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a87ea0e4b3dc..8f487705c3e6 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -336,9 +336,12 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, } if (pmd_val(*pmd)) { - if (!pmd_large(*pmd)) + if (!pmd_large(*pmd)) { + spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end); + spin_unlock(&init_mm.page_table_lock); + } /* Count entries we're using from level2_ident_pgt */ if (start == 0) pages++; @@ -347,8 +350,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } @@ -357,7 +362,9 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, last_map_addr = phys_pte_init(pte, address, end); unmap_low_page(pte); + spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); + spin_unlock(&init_mm.page_table_lock); } update_page_count(PG_LEVEL_2M, pages); return last_map_addr; @@ -370,9 +377,7 @@ phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); - spin_unlock(&init_mm.page_table_lock); __flush_tlb_all(); return last_map_addr; } @@ -408,20 +413,21 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + spin_unlock(&init_mm.page_table_lock); last_map_addr = (addr & PUD_MASK) + PUD_SIZE; continue; } pmd = alloc_low_page(&pmd_phys); - - spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); unmap_low_page(pmd); + + spin_lock(&init_mm.page_table_lock); pud_populate(&init_mm, pud, __va(pmd_phys)); spin_unlock(&init_mm.page_table_lock); - } __flush_tlb_all(); update_page_count(PG_LEVEL_1G, pages); @@ -513,16 +519,14 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, continue; } - if (after_bootmem) - pud = pud_offset(pgd, start & PGDIR_MASK); - else - pud = alloc_low_page(&pud_phys); - + pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); unmap_low_page(pud); - pgd_populate(&init_mm, pgd_offset_k(start), - __va(pud_phys)); + + spin_lock(&init_mm.page_table_lock); + pgd_populate(&init_mm, pgd, __va(pud_phys)); + spin_unlock(&init_mm.page_table_lock); } return last_map_addr; -- cgit v1.2.2 From 9482ac6e34dd1890a9a956d460a135bf992cb54a Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 21 Aug 2008 14:28:42 +0100 Subject: x86: fix two modpost warnings in mm/init_64.c early_io{re,un}map() are __init and hence can't be called from __meminit functions. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 8f487705c3e6..d3746efb060d 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -241,7 +241,7 @@ static unsigned long __initdata table_start; static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; -static __meminit void *alloc_low_page(unsigned long *phys) +static __ref void *alloc_low_page(unsigned long *phys) { unsigned long pfn = table_end++; void *adr; @@ -262,7 +262,7 @@ static __meminit void *alloc_low_page(unsigned long *phys) return adr; } -static __meminit void unmap_low_page(void *adr) +static __ref void unmap_low_page(void *adr) { if (after_bootmem) return; -- cgit v1.2.2 From 3a6ddd5f18405ca92e004416af8ed44b9c9783d7 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Thu, 21 Aug 2008 11:32:26 -0700 Subject: x86: fix VMI for early params while fixing a different bug i moved the call to vmi_init before early params could be parsed. This broke the vmi specific commandline parameters. Fix that, by moving vmi initialization after kernel has got a chance to parse early parameters. Signed-off-by: Alok N Kataria Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a4656adab53b..362d4e7f2d38 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -604,14 +604,6 @@ void __init setup_arch(char **cmdline_p) early_cpu_init(); early_ioremap_init(); -#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) - /* - * Must be before kernel pagetables are setup - * or fixmap area is touched. - */ - vmi_init(); -#endif - ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; @@ -678,6 +670,14 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#if defined(CONFIG_VMI) && defined(CONFIG_X86_32) + /* + * Must be before kernel pagetables are setup + * or fixmap area is touched. + */ + vmi_init(); +#endif + /* after early param, so could get panic from serial */ reserve_early_setup_data(); -- cgit v1.2.2 From ee7686bc043921a488d9bf89fe93241c5457a74e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 21 Aug 2008 13:17:56 -0700 Subject: x86: build fix in "xen spinlock updates and performance measurements" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Ingo Molnar wrote: > -tip testing found this build failure: > > arch/x86/xen/spinlock.c: In function ‘spin_time_start’: > arch/x86/xen/spinlock.c:60: error: implicit declaration of function ‘xen_clocksource_read’ > > i've excluded these new commits for now from tip/master - could you > please send a delta fix against tip/x86/xen? Make xen_clocksource_read non-static. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/time.c | 4 +--- arch/x86/xen/xen-ops.h | 2 ++ 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 685b77470fc3..20182d9072c4 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -30,8 +30,6 @@ #define TIMER_SLOP 100000 #define NS_PER_TICK (1000000000LL / HZ) -static cycle_t xen_clocksource_read(void); - /* runstate info updated by Xen */ static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); @@ -213,7 +211,7 @@ unsigned long xen_tsc_khz(void) return xen_khz; } -static cycle_t xen_clocksource_read(void) +cycle_t xen_clocksource_read(void) { struct pvclock_vcpu_time_info *src; cycle_t ret; diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 3c70ebc50b1b..1e8bfdaa20d3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -2,6 +2,7 @@ #define XEN_OPS_H #include +#include #include #include @@ -33,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); void __init xen_time_init(void); -- cgit v1.2.2 From 94581094e774402a11887719bac10505236c2d51 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:39 +0200 Subject: x86: add alloc_coherent dma_ops callback to GART driver [ v2 - x86: make gart_alloc_coherent return zeroed memory FUJITA Tomonori pointed it out that the dma_alloc_coherent function should return memory set to zero. This patch adds this to the GART implementation too. ] Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 49285f8fd4d5..076e64b2d4f3 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -499,6 +499,26 @@ error: return 0; } +/* allocate and map a coherent mapping */ +static void * +gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, + gfp_t flag) +{ + void *vaddr; + + vaddr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); + if (!vaddr) + return NULL; + + *dma_addr = gart_map_single(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL); + if (*dma_addr != bad_dma_address) + return vaddr; + + free_pages((unsigned long)vaddr, get_order(size)); + + return NULL; +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -701,6 +721,7 @@ static struct dma_mapping_ops gart_dma_ops = { .sync_sg_for_device = NULL, .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, + .alloc_coherent = gart_alloc_coherent, }; void gart_iommu_shutdown(void) -- cgit v1.2.2 From 43a5a5a09b8cfd56047706cf904718d073ccfd33 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:40 +0200 Subject: x86: add free_coherent dma_ops callback to GART driver Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 076e64b2d4f3..ef753e233580 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -519,6 +519,15 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, return NULL; } +/* free a coherent mapping */ +static void +gart_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + gart_unmap_single(dev, dma_addr, size, DMA_BIDIRECTIONAL); + free_pages((unsigned long)vaddr, get_order(size)); +} + static int no_agp; static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) @@ -722,6 +731,7 @@ static struct dma_mapping_ops gart_dma_ops = { .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .alloc_coherent = gart_alloc_coherent, + .free_coherent = gart_free_coherent, }; void gart_iommu_shutdown(void) -- cgit v1.2.2 From e4ad68b651f22fa71820c0b30bb2807999b2b49f Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:41 +0200 Subject: x86: add free_coherent dma_ops callback to Calgary IOMMU driver Signed-off-by: Joerg Roedel Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 218d783ed7a8..afb020fdb19c 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -511,8 +511,22 @@ error: return ret; } +static void calgary_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_handle) +{ + unsigned int npages; + struct iommu_table *tbl = find_iommu_table(dev); + + size = PAGE_ALIGN(size); + npages = size >> PAGE_SHIFT; + + iommu_free(tbl, dma_handle, npages); + free_pages((unsigned long)vaddr, get_order(size)); +} + static struct dma_mapping_ops calgary_dma_ops = { .alloc_coherent = calgary_alloc_coherent, + .free_coherent = calgary_free_coherent, .map_single = calgary_map_single, .unmap_single = calgary_unmap_single, .map_sg = calgary_map_sg, -- cgit v1.2.2 From c5e835f9641e9fcb95d1afb24906821e98b2c6a8 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:42 +0200 Subject: x86: add alloc_coherent dma_ops callback to NOMMU driver Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 55 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 3f91f71cdc3e..b8ce83c98211 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,7 +72,62 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } +static void * +nommu_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_addr, gfp_t gfp) +{ + unsigned long dma_mask; + int node; + struct page *page; + + if (hwdev->dma_mask == NULL) + return NULL; + + gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + gfp |= __GFP_ZERO; + + dma_mask = hwdev->coherent_dma_mask; + if (!dma_mask) + dma_mask = *(hwdev->dma_mask); + + if (dma_mask < DMA_24BIT_MASK) + return NULL; + + node = dev_to_node(hwdev); + +#ifdef CONFIG_X86_64 + if (dma_mask <= DMA_32BIT_MASK) + gfp |= GFP_DMA32; +#endif + + /* No alloc-free penalty for ISA devices */ + if (dma_mask == DMA_24BIT_MASK) + gfp |= GFP_DMA; + +again: + page = alloc_pages_node(node, gfp, get_order(size)); + if (!page) + return NULL; + + if ((page_to_phys(page) + size > dma_mask) && !(gfp & GFP_DMA)) { + free_pages((unsigned long)page_address(page), get_order(size)); + gfp |= GFP_DMA; + goto again; + } + + *dma_addr = page_to_phys(page); + if (check_addr("alloc_coherent", hwdev, *dma_addr, size)) { + flush_write_buffers(); + return page_address(page); + } + + free_pages((unsigned long)page_address(page), get_order(size)); + + return NULL; +} + struct dma_mapping_ops nommu_dma_ops = { + .alloc_coherent = nommu_alloc_coherent, .map_single = nommu_map_single, .map_sg = nommu_map_sg, .is_phys = 1, -- cgit v1.2.2 From a3a76532e0caa093c279806d8fe8608232538af0 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:43 +0200 Subject: x86: add free_coherent dma_ops callback to NOMMU driver Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index b8ce83c98211..73853d3fdcac 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -126,8 +126,15 @@ again: return NULL; } +static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, + dma_addr_t dma_addr) +{ + free_pages((unsigned long)vaddr, get_order(size)); +} + struct dma_mapping_ops nommu_dma_ops = { .alloc_coherent = nommu_alloc_coherent, + .free_coherent = nommu_free_coherent, .map_single = nommu_map_single, .map_sg = nommu_map_sg, .is_phys = 1, -- cgit v1.2.2 From c647c3bb2d16246a87f49035985ddb7c1eb030df Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:44 +0200 Subject: x86: cleanup dma_*_coherent functions All dma_ops implementations support the alloc_coherent and free_coherent callbacks now. This allows a big simplification of the dma_alloc_coherent function which is done with this patch. The dma_free_coherent functions is also cleaned up and calls now the free_coherent callback of the dma_ops implementation. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 121 +++++----------------------------------------- 1 file changed, 12 insertions(+), 109 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 87d4d6964ec2..613332b26e31 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -241,33 +241,15 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -/* Allocate DMA memory on node near device */ -static noinline struct page * -dma_alloc_pages(struct device *dev, gfp_t gfp, unsigned order) -{ - int node; - - node = dev_to_node(dev); - - return alloc_pages_node(node, gfp, order); -} - /* * Allocate memory for a coherent mapping. */ -void * + void * dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t gfp) { struct dma_mapping_ops *ops = get_dma_ops(dev); - void *memory = NULL; - struct page *page; - unsigned long dma_mask = 0; - dma_addr_t bus; - int noretry = 0; - - /* ignore region specifiers */ - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + void *memory; if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) return memory; @@ -276,89 +258,10 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, dev = &fallback_dev; gfp |= GFP_DMA; } - dma_mask = dev->coherent_dma_mask; - if (dma_mask == 0) - dma_mask = (gfp & GFP_DMA) ? DMA_24BIT_MASK : DMA_32BIT_MASK; - - /* Device not DMA able */ - if (dev->dma_mask == NULL) - return NULL; - - /* Don't invoke OOM killer or retry in lower 16MB DMA zone */ - if (gfp & __GFP_DMA) - noretry = 1; - -#ifdef CONFIG_X86_64 - /* Why <=? Even when the mask is smaller than 4GB it is often - larger than 16MB and in this case we have a chance of - finding fitting memory in the next higher zone first. If - not retry with true GFP_DMA. -AK */ - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp |= GFP_DMA32; - if (dma_mask < DMA_32BIT_MASK) - noretry = 1; - } -#endif - again: - page = dma_alloc_pages(dev, - noretry ? gfp | __GFP_NORETRY : gfp, get_order(size)); - if (page == NULL) - return NULL; - - { - int high, mmu; - bus = page_to_phys(page); - memory = page_address(page); - high = (bus + size) >= dma_mask; - mmu = high; - if (force_iommu && !(gfp & GFP_DMA)) - mmu = 1; - else if (high) { - free_pages((unsigned long)memory, - get_order(size)); - - /* Don't use the 16MB ZONE_DMA unless absolutely - needed. It's better to use remapping first. */ - if (dma_mask < DMA_32BIT_MASK && !(gfp & GFP_DMA)) { - gfp = (gfp & ~GFP_DMA32) | GFP_DMA; - goto again; - } - - /* Let low level make its own zone decisions */ - gfp &= ~(GFP_DMA32|GFP_DMA); - - if (ops->alloc_coherent) - return ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; - } - - memset(memory, 0, size); - if (!mmu) { - *dma_handle = bus; - return memory; - } - } - - if (ops->alloc_coherent) { - free_pages((unsigned long)memory, get_order(size)); - gfp &= ~(GFP_DMA|GFP_DMA32); - return ops->alloc_coherent(dev, size, dma_handle, gfp); - } - - if (ops->map_simple) { - *dma_handle = ops->map_simple(dev, virt_to_phys(memory), - size, - PCI_DMA_BIDIRECTIONAL); - if (*dma_handle != bad_dma_address) - return memory; - } - - if (panic_on_overflow) - panic("dma_alloc_coherent: IOMMU overflow by %lu bytes\n", - (unsigned long)size); - free_pages((unsigned long)memory, get_order(size)); + if (ops->alloc_coherent) + return ops->alloc_coherent(dev, size, + dma_handle, gfp); return NULL; } EXPORT_SYMBOL(dma_alloc_coherent); @@ -368,17 +271,17 @@ EXPORT_SYMBOL(dma_alloc_coherent); * The caller must ensure that the device has finished accessing the mapping. */ void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) + void *vaddr, dma_addr_t bus) { struct dma_mapping_ops *ops = get_dma_ops(dev); - int order = get_order(size); - WARN_ON(irqs_disabled()); /* for portability */ - if (dma_release_from_coherent(dev, order, vaddr)) + WARN_ON(irqs_disabled()); /* for portability */ + + if (dma_release_from_coherent(dev, get_order(size), vaddr)) return; - if (ops->unmap_single) - ops->unmap_single(dev, bus, size, 0); - free_pages((unsigned long)vaddr, order); + + if (ops->free_coherent) + ops->free_coherent(dev, size, vaddr, bus); } EXPORT_SYMBOL(dma_free_coherent); -- cgit v1.2.2 From 6c505ce3930c6a6b455cda53fab3e88ae44f8221 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:45 +0200 Subject: x86: move dma_*_coherent functions to include file All the x86 DMA-API functions are defined in asm/dma-mapping.h. This patch moves the dma_*_coherent functions also to this header file because they are now small enough to do so. This is done as a separate patch because it also includes some renaming and restructuring of the dma-mapping.h file. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 49 +++---------------------------------------- arch/x86/kernel/pci-gart_64.c | 4 ++-- 2 files changed, 5 insertions(+), 48 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 613332b26e31..0a1408abcc62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -41,11 +41,12 @@ EXPORT_SYMBOL(bad_dma_address); /* Dummy device used for NULL arguments (normally ISA). Better would be probably a smaller DMA mask, but this is bug-to-bug compatible to older i386. */ -struct device fallback_dev = { +struct device x86_dma_fallback_dev = { .bus_id = "fallback device", .coherent_dma_mask = DMA_32BIT_MASK, - .dma_mask = &fallback_dev.coherent_dma_mask, + .dma_mask = &x86_dma_fallback_dev.coherent_dma_mask, }; +EXPORT_SYMBOL(x86_dma_fallback_dev); int dma_set_mask(struct device *dev, u64 mask) { @@ -241,50 +242,6 @@ int dma_supported(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_supported); -/* - * Allocate memory for a coherent mapping. - */ - void * -dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, - gfp_t gfp) -{ - struct dma_mapping_ops *ops = get_dma_ops(dev); - void *memory; - - if (dma_alloc_from_coherent(dev, size, dma_handle, &memory)) - return memory; - - if (!dev) { - dev = &fallback_dev; - gfp |= GFP_DMA; - } - - if (ops->alloc_coherent) - return ops->alloc_coherent(dev, size, - dma_handle, gfp); - return NULL; -} -EXPORT_SYMBOL(dma_alloc_coherent); - -/* - * Unmap coherent memory. - * The caller must ensure that the device has finished accessing the mapping. - */ -void dma_free_coherent(struct device *dev, size_t size, - void *vaddr, dma_addr_t bus) -{ - struct dma_mapping_ops *ops = get_dma_ops(dev); - - WARN_ON(irqs_disabled()); /* for portability */ - - if (dma_release_from_coherent(dev, get_order(size), vaddr)) - return; - - if (ops->free_coherent) - ops->free_coherent(dev, size, vaddr, bus); -} -EXPORT_SYMBOL(dma_free_coherent); - static int __init pci_iommu_init(void) { calgary_iommu_init(); diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index ef753e233580..e81df25dc067 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -276,7 +276,7 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) unsigned long bus; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; if (!need_iommu(dev, paddr, size)) return paddr; @@ -427,7 +427,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) return 0; if (!dev) - dev = &fallback_dev; + dev = &x86_dma_fallback_dev; out = 0; start = 0; -- cgit v1.2.2 From 2cd54961caff9fe9109807c6603a0af0729b9591 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 19 Aug 2008 16:32:46 +0200 Subject: x86, AMD IOMMU: remove obsolete FIXME comment Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index de39e1f2ede5..d15081c38237 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1038,8 +1038,6 @@ out: /* * The exported free_coherent function for dma_ops. - * FIXME: fix the generic x86 DMA layer so that it actually calls that - * function. */ static void free_coherent(struct device *dev, size_t size, void *virt_addr, dma_addr_t dma_addr) -- cgit v1.2.2 From 766af9fa812f49feb4a3e62cf92f3d37f33c7fb6 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 22 Aug 2008 00:12:09 +0900 Subject: dma-mapping.h, x86: remove last user of dma_mapping_ops->map_simple pci-dma.c doesn't use map_simple hook any more so we can remove it from struct dma_mapping_ops now. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index e81df25dc067..cfd7ec25e37e 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -720,7 +720,6 @@ extern int agp_amd64_init(void); static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, - .map_simple = gart_map_simple, .unmap_single = gart_unmap_single, .sync_single_for_cpu = NULL, .sync_single_for_device = NULL, -- cgit v1.2.2 From 421076e2bec1b917bdcf83da35b24f6349bf35db Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 22 Aug 2008 16:29:10 +0900 Subject: x86: dma_*_coherent rework patchset v2, fix alloc_coherent dma_ops callback was added to GART, however, it doesn't return a size aligned address wrt dma_alloc_coherent, as DMA-mapping.txt defines. This patch fixes it. This patch also removes unused gart_map_simple (dma_mapping_ops->map_simple has gone). Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 338c4f241559..4d0864900b8c 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -261,20 +261,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); } -static dma_addr_t -gart_map_simple(struct device *dev, phys_addr_t paddr, size_t size, int dir) -{ - dma_addr_t map; - unsigned long align_mask; - - align_mask = (1UL << get_order(size)) - 1; - map = dma_map_area(dev, paddr, size, dir, align_mask); - - flush_gart(); - - return map; -} - /* Map a single area into the IOMMU */ static dma_addr_t gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) @@ -512,12 +498,21 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { void *vaddr; + unsigned long align_mask; vaddr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); if (!vaddr) return NULL; - *dma_addr = gart_map_single(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL); + align_mask = (1UL << get_order(size)) - 1; + + if (!dev) + dev = &x86_dma_fallback_dev; + + *dma_addr = dma_map_area(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL, + align_mask); + flush_gart(); + if (*dma_addr != bad_dma_address) return vaddr; -- cgit v1.2.2 From b05f78f5c713eda2c34e495d92495ee4f1c3b5e1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 22 Aug 2008 01:32:50 -0700 Subject: x86_64: printout msr -v2 commandline show_msr=1 for bsp, show_msr=32 for all 32 cpus. [ mingo@elte.hu: added documentation ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 51 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/paravirt.c | 1 + 2 files changed, 52 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017e..bca2d6980e82 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -394,6 +394,49 @@ static __init int setup_noclflush(char *arg) } __setup("noclflush", setup_noclflush); +struct msr_range { + unsigned min; + unsigned max; +}; + +static struct msr_range msr_range_array[] __cpuinitdata = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; + +static void __cpuinit print_cpu_msr(void) +{ + unsigned index; + u64 val; + int i; + unsigned index_min, index_max; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + } + } +} + +static int show_msr __cpuinitdata; +static __init int setup_show_msr(char *arg) +{ + int num; + + get_option(&arg, &num); + + if (num > 0) + show_msr = num; + return 1; +} +__setup("show_msr=", setup_show_msr); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) @@ -403,6 +446,14 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT " stepping %02x\n", c->x86_mask); else printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif } static __init int setup_disablecpuid(char *arg) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 94da4d52d798..c6044682e1e7 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -330,6 +330,7 @@ struct pv_cpu_ops pv_cpu_ops = { #endif .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, + .read_msr_amd = native_read_msr_amd_safe, .write_msr = native_write_msr_safe, .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, -- cgit v1.2.2 From c4bd1fdab0deec0f69aeabab22075cb22ac8ad44 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Thu, 21 Aug 2008 20:49:05 +0200 Subject: x86: fix section mismatch warning - uv_cpu_init WARNING: vmlinux.o(.cpuinit.text+0x3cc4): Section mismatch in reference from the function uv_cpu_init() to the function .init.text:uv_system_init() The function __cpuinit uv_cpu_init() references a function __init uv_system_init(). If uv_system_init is only used by uv_cpu_init then annotate uv_system_init with a matching annotation. uv_system_init was ment to be called only once, so do it from codepath (native_smp_prepare_cpus) which is called once, right before activation of other cpus (smp_init). Note: old code relied on uv_node_to_blade being initialized to 0, but it'a not initialized from anywhere. Signed-off-by: Marcin Slusarz Acked-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 8 +++++--- arch/x86/kernel/smpboot.c | 3 +++ 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 2d7e307c7779..bfa837cb16be 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -293,7 +293,9 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } -static __init void uv_system_init(void) +static bool uv_system_inited; + +void __init uv_system_init(void) { union uvh_si_addr_map_config_u m_n_config; union uvh_node_id_u node_id; @@ -383,6 +385,7 @@ static __init void uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); + uv_system_inited = true; } /* @@ -391,8 +394,7 @@ static __init void uv_system_init(void) */ void __cpuinit uv_cpu_init(void) { - if (!uv_node_to_blade) - uv_system_init(); + BUG_ON(!uv_system_inited); uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e139e617f422..7985c5b3f916 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1221,6 +1221,9 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) printk(KERN_INFO "CPU%d: ", 0); print_cpu_info(&cpu_data(0)); setup_boot_clock(); + + if (is_uv_system()) + uv_system_init(); out: preempt_enable(); } -- cgit v1.2.2 From 9754a5b840a209bc1f192d59f63e81b698a55ac8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 Aug 2008 08:22:23 +0200 Subject: x86: work around MTRR mask setting, v2 improve the debug printout: - make it actually display something - print it only once would be nice to have a WARN_ONCE() facility, to feed such things to kerneloops.org. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 43102e03e2d1..cb7d3b6a80eb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -401,7 +401,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - WARN_ON("mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + static int once = 1; + + if (once) { + printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); + once = 0; + } mask_lo = tmp; } } -- cgit v1.2.2 From 25258ef762bc4a05fa9c4523f7dae56e3fd01864 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 20 Aug 2008 11:31:07 -0700 Subject: x86: export pv_lock_ops non-GPL None of the spinlock API is exported GPL, so there's no reason for pv_lock_ops to be. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar Cc: drago01 --- arch/x86/kernel/paravirt-spinlocks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 206359a8e43f..0e9f1982b1dd 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -23,7 +23,7 @@ struct pv_lock_ops pv_lock_ops = { .spin_unlock = __ticket_spin_unlock, #endif }; -EXPORT_SYMBOL_GPL(pv_lock_ops); +EXPORT_SYMBOL(pv_lock_ops); void __init paravirt_use_bytelocks(void) { -- cgit v1.2.2 From 01de05af94db5d5214b0a5e191068d19c82059a8 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Fri, 22 Aug 2008 12:08:17 -0700 Subject: x86: have set_memory_array_{uc,wb} coalesce memtypes, fix Fix the start addr for free_memtype calls in the error path. Signed-off-by: Venkatesh Pallipadi Acked-by: Rene Herman Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 497108825da9..4b6968ba0864 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -967,7 +967,7 @@ out: if (tmp == start) break; - for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { + for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) { if (end != __pa(addr[i + 1])) break; i++; -- cgit v1.2.2 From 9b4e27b52853c5da77e61a4e36fbc40688b7a829 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 Aug 2008 20:23:37 +0200 Subject: x86: fix: do not run code in amd_bus.c on non-AMD CPUs Jan Beulich wrote: > Even worse - this would even try to access the MSR on non-AMD CPUs > (currently probably prevented just by the fact that only AMD ones use > family values of 0x10 or higher). This patch adds cpu vendor check to the postcore_initcalls. Reported-by: Jan Beulich Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/pci/amd_bus.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index dbf532369711..4a6f1a6a3aa9 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -555,9 +555,11 @@ static int __init early_fill_mp_bus_info(void) return 0; } -postcore_initcall(early_fill_mp_bus_info); +#else /* !CONFIG_X86_64 */ -#endif +static int __init early_fill_mp_bus_info(void) { return 0; } + +#endif /* !CONFIG_X86_64 */ /* common 32/64 bit code */ @@ -583,4 +585,15 @@ static int __init enable_pci_io_ecs(void) return 0; } -postcore_initcall(enable_pci_io_ecs); +static int __init amd_postcore_init(void) +{ + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) + return 0; + + early_fill_mp_bus_info(); + enable_pci_io_ecs(); + + return 0; +} + +postcore_initcall(amd_postcore_init); -- cgit v1.2.2 From 91ede005d72df60d6b3f252be177a4743a6aa46a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 Aug 2008 20:23:38 +0200 Subject: x86: fix: make PCI ECS for AMD CPUs hotplug capable Until now, PCI ECS setup was performed at boot time only and for cpus that are enabled then. This patch fixes this and adds cpu hotplug. Tests sequence (check if ECS bit is set when bringing cpu online again): # ( perl -e 'sysseek(STDIN, 0xC001001F, 0)'; hexdump -n 8 -e '2/4 "%08x " "\n"' ) < /dev/cpu/1/msr 00000008 00404010 # ( perl -e 'sysseek(STDOUT, 0xC001001F, 0); print pack "l*", 8, 0x00400010' ) > /dev/cpu/1/msr # ( perl -e 'sysseek(STDIN, 0xC001001F, 0)'; hexdump -n 8 -e '2/4 "%08x " "\n"' ) < /dev/cpu/1/msr 00000008 00400010 # echo 0 > /sys/devices/system/cpu/cpu1/online # echo 1 > /sys/devices/system/cpu/cpu1/online # ( perl -e 'sysseek(STDIN, 0xC001001F, 0)'; hexdump -n 8 -e '2/4 "%08x " "\n"' ) < /dev/cpu/1/msr 00000008 00404010 Reported-by: Yinghai Lu Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/pci/amd_bus.c | 35 +++++++++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 4a6f1a6a3aa9..6a0fca78c362 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -1,6 +1,7 @@ #include #include #include +#include #include "pci.h" #ifdef CONFIG_X86_64 @@ -565,7 +566,7 @@ static int __init early_fill_mp_bus_info(void) { return 0; } #define ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs_per_cpu(void *unused) +static void enable_pci_io_ecs(void *unused) { u64 reg; rdmsrl(MSR_AMD64_NB_CFG, reg); @@ -575,13 +576,39 @@ static void enable_pci_io_ecs_per_cpu(void *unused) } } -static int __init enable_pci_io_ecs(void) +static int __cpuinit amd_cpu_notify(struct notifier_block *self, + unsigned long action, void *hcpu) { + int cpu = (long)hcpu; + switch(action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); + break; + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata amd_cpu_notifier = { + .notifier_call = amd_cpu_notify, +}; + +static int __init pci_io_ecs_init(void) +{ + int cpu; + /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) return 0; - on_each_cpu(enable_pci_io_ecs_per_cpu, NULL, 1); + + register_cpu_notifier(&amd_cpu_notifier); + for_each_online_cpu(cpu) + amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, + (void *)(long)cpu); pci_probe |= PCI_HAS_IO_ECS; + return 0; } @@ -591,7 +618,7 @@ static int __init amd_postcore_init(void) return 0; early_fill_mp_bus_info(); - enable_pci_io_ecs(); + pci_io_ecs_init(); return 0; } -- cgit v1.2.2 From bbb65d2d365efe9951290e61678dcf81ec60add4 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sat, 23 Aug 2008 17:47:10 +0200 Subject: x86: use cpuid vector 0xb when available for detecting cpu topology cpuid leaf 0xb provides extended topology enumeration. This interface provides the 32-bit x2APIC id of the logical processor and it also provides a new mechanism to detect SMT and core siblings (which provides increased addressability). Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 86 ++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common_64.c | 3 ++ arch/x86/kernel/cpu/intel.c | 13 +++-- arch/x86/kernel/cpu/intel_64.c | 5 +- 4 files changed, 103 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index 84a8220a6072..aa9641ae6703 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -7,6 +7,8 @@ #include #include +#include + struct cpuid_bit { u16 feature; u8 reg; @@ -48,6 +50,90 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) } } +/* leaf 0xb SMT level */ +#define SMT_LEVEL 0 + +/* leaf 0xb sub-leaf types */ +#define INVALID_TYPE 0 +#define SMT_TYPE 1 +#define CORE_TYPE 2 + +#define LEAFB_SUBTYPE(ecx) (((ecx) >> 8) & 0xff) +#define BITS_SHIFT_NEXT_LEVEL(eax) ((eax) & 0x1f) +#define LEVEL_MAX_SIBLINGS(ebx) ((ebx) & 0xffff) + +/* + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. + */ +void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx, sub_index; + unsigned int ht_mask_width, core_plus_mask_width; + unsigned int core_select_mask, core_level_siblings; + + if (c->cpuid_level < 0xb) + return; + + cpuid_count(0xb, SMT_LEVEL, &eax, &ebx, &ecx, &edx); + + /* + * check if the cpuid leaf 0xb is actually implemented. + */ + if (ebx == 0 || (LEAFB_SUBTYPE(ecx) != SMT_TYPE)) + return; + + set_cpu_cap(c, X86_FEATURE_XTOPOLOGY); + + /* + * initial apic id, which also represents 32-bit extended x2apic id. + */ + c->initial_apicid = edx; + + /* + * Populate HT related information from sub-leaf level 0. + */ + core_level_siblings = smp_num_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = ht_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + + sub_index = 1; + do { + cpuid_count(0xb, sub_index, &eax, &ebx, &ecx, &edx); + + /* + * Check for the Core type in the implemented sub leaves. + */ + if (LEAFB_SUBTYPE(ecx) == CORE_TYPE) { + core_level_siblings = LEVEL_MAX_SIBLINGS(ebx); + core_plus_mask_width = BITS_SHIFT_NEXT_LEVEL(eax); + break; + } + + sub_index++; + } while (LEAFB_SUBTYPE(ecx) != INVALID_TYPE); + + core_select_mask = (~(-1 << core_plus_mask_width)) >> ht_mask_width; + +#ifdef CONFIG_X86_32 + c->cpu_core_id = phys_pkg_id(c->initial_apicid, ht_mask_width) + & core_select_mask; + c->phys_proc_id = phys_pkg_id(c->initial_apicid, core_plus_mask_width); +#else + c->cpu_core_id = phys_pkg_id(ht_mask_width) & core_select_mask; + c->phys_proc_id = phys_pkg_id(core_plus_mask_width); +#endif + c->x86_max_cores = (core_level_siblings / smp_num_siblings); + + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + return; +} + #ifdef CONFIG_X86_PAT void __cpuinit validate_pat_support(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index af569a964e74..a2888c7b56ac 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -128,6 +128,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; int index_msb, core_bits; + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; + cpuid(1, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 77618c717d76..58a6f1a0b297 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -176,9 +176,16 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); - c->x86_max_cores = num_cpu_cores(c); - - detect_ht(c); + detect_extended_topology(c); + + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = num_cpu_cores(c); + detect_ht(c); + } /* Work around errata */ Intel_errata_workarounds(c); diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index 1019c58d39f0..42d501a8c418 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -80,7 +80,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - c->x86_max_cores = intel_num_cpu_cores(c); + + detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) + c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); } -- cgit v1.2.2 From e17941b0c140562d92e5a3bc12b4ad88281c7926 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sat, 23 Aug 2008 17:47:11 +0200 Subject: x86: use x2apic id reported by cpuid during topology discovery use x2apic id reported by cpuid during topology discovery, instead of the apic id configured in the APIC. For most of the systems, x2apic id reported by cpuid leaf 0xb will be same as the physical apic id reported by the APIC_ID register of the APIC. We follow the suggested guidelines and use the apic id reported by the cpuid. No change to non-generic UV platforms, will use the apic id reported in the APIC_ID register as the cpuid reported apic id's may not be unique. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_cluster.c | 7 +------ arch/x86/kernel/genx2apic_phys.c | 7 +------ 2 files changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index ef3f3182d50a..0bf0a8624b8c 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -120,14 +120,9 @@ static unsigned long set_apic_id(unsigned int id) return x; } -static unsigned int x2apic_read_id(void) -{ - return apic_read(APIC_ID); -} - static unsigned int phys_pkg_id(int index_msb) { - return x2apic_read_id() >> index_msb; + return current_cpu_data.initial_apicid >> index_msb; } static void x2apic_send_IPI_self(int vector) diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 3229c68aedd4..e2401ab0c46e 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -118,14 +118,9 @@ static unsigned long set_apic_id(unsigned int id) return x; } -static unsigned int x2apic_read_id(void) -{ - return apic_read(APIC_ID); -} - static unsigned int phys_pkg_id(int index_msb) { - return x2apic_read_id() >> index_msb; + return current_cpu_data.initial_apicid >> index_msb; } void x2apic_send_IPI_self(int vector) -- cgit v1.2.2 From 8735728ef8dc935c4fb351f913758fdbb62c308d Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 22 Aug 2008 22:23:09 +0200 Subject: x86 MCE: Fix CPU hotplug problem with multiple multicore AMD CPUs During CPU hot-remove the sysfs directory created by threshold_create_bank(), defined in arch/x86/kernel/cpu/mcheck/mce_amd_64.c, has to be removed before its parent directory, created by mce_create_device(), defined in arch/x86/kernel/cpu/mcheck/mce_64.c . Moreover, when the CPU in question is hotplugged again, obviously the latter has to be created before the former. At present, the right ordering is not enforced, because all of these operations are carried out by CPU hotplug notifiers which are not appropriately ordered with respect to each other. This leads to serious problems on systems with two or more multicore AMD CPUs, among other things during suspend and hibernation. Fix the problem by placing threshold bank CPU hotplug callbacks in mce_cpu_callback(), so that they are invoked at the right places, if defined. Additionally, use kobject_del() to remove the sysfs directory associated with the kobject created by kobject_create_and_add() in threshold_create_bank(), to prevent the kernel from crashing during CPU hotplug operations on systems with two or more multicore AMD CPUs. This patch fixes bug #11337. Signed-off-by: Rafael J. Wysocki Acked-by: Andi Kleen Tested-by: Mark Langsdorf Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_64.c | 5 +++++ arch/x86/kernel/cpu/mcheck/mce_amd_64.c | 18 +++++------------- 2 files changed, 10 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 65a339678ece..726a5fcdf341 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -759,6 +759,7 @@ static struct sysdev_class mce_sysclass = { }; DEFINE_PER_CPU(struct sys_device, device_mce); +void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu) __cpuinitdata; /* Why are there no generic functions for this? */ #define ACCESSOR(name, var, start) \ @@ -883,9 +884,13 @@ static int __cpuinit mce_cpu_callback(struct notifier_block *nfb, case CPU_ONLINE: case CPU_ONLINE_FROZEN: mce_create_device(cpu); + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); break; case CPU_DEAD: case CPU_DEAD_FROZEN: + if (threshold_cpu_callback) + threshold_cpu_callback(action, cpu); mce_remove_device(cpu); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c index 88736cadbaa6..5eb390a4b2e9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd_64.c @@ -628,6 +628,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) deallocate_threshold_block(cpu, bank); free_out: + kobject_del(b->kobj); kobject_put(b->kobj); kfree(b); per_cpu(threshold_banks, cpu)[bank] = NULL; @@ -645,14 +646,11 @@ static void threshold_remove_device(unsigned int cpu) } /* get notified when a cpu comes on/off */ -static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static void __cpuinit amd_64_threshold_cpu_callback(unsigned long action, + unsigned int cpu) { - /* cpu was unsigned int to begin with */ - unsigned int cpu = (unsigned long)hcpu; - if (cpu >= NR_CPUS) - goto out; + return; switch (action) { case CPU_ONLINE: @@ -666,14 +664,8 @@ static int __cpuinit threshold_cpu_callback(struct notifier_block *nfb, default: break; } - out: - return NOTIFY_OK; } -static struct notifier_block threshold_cpu_notifier __cpuinitdata = { - .notifier_call = threshold_cpu_callback, -}; - static __init int threshold_init_device(void) { unsigned lcpu = 0; @@ -684,7 +676,7 @@ static __init int threshold_init_device(void) if (err) return err; } - register_hotcpu_notifier(&threshold_cpu_notifier); + threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } -- cgit v1.2.2 From 7a8fc9b248e77a4eab0613acf30a6811799786b3 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sun, 17 Aug 2008 17:36:59 +0300 Subject: removed unused #include 's This patch lets the files using linux/version.h match the files that #include it. Signed-off-by: Adrian Bunk Signed-off-by: Linus Torvalds --- arch/x86/mach-rdc321x/platform.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mach-rdc321x/platform.c b/arch/x86/mach-rdc321x/platform.c index a037041817c7..4f4e50c3ad3b 100644 --- a/arch/x86/mach-rdc321x/platform.c +++ b/arch/x86/mach-rdc321x/platform.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include -- cgit v1.2.2 From 060700b571717c997a2ea5e2049b848fa248ee13 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 24 Aug 2008 11:52:06 -0700 Subject: x86: do not enable TSC notifier if we don't need it Impact: crash on non-TSC-equipped CPUs Don't enable the TSC notifier if we *either*: 1. don't have a CPU, or 2. have a CPU with constant TSC. In either of those cases, the notifier is either damaging (1) or useless(2). From: Linus Torvalds Signed-off-by: H. Peter Anvin --- arch/x86/kernel/tsc.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 46af71676738..9bed5cae4bdc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -325,6 +325,10 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { + if (!cpu_has_tsc) + return 0; + if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + return 0; cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER); return 0; -- cgit v1.2.2 From a2bd7274b47124d2fc4dfdb8c0591f545ba749dd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 25 Aug 2008 00:56:08 -0700 Subject: x86: fix HPET regression in 2.6.26 versus 2.6.25, check hpet against BAR, v3 David Witbrodt tracked down (and bisected) a hpet bootup hang on his system to the following problem: a BIOS bug made the hpet device visible as a generic PCI device. If e820 reserved entries happen to be registered first in the resource tree [which v2.6.26 started doing], then the PCI code will reallocate that device's BAR to some other address - breaking timer IRQs and hanging the system. ( Normally hpet devices are hidden by the BIOS from the OS's PCI discovery via chipset magic. Sometimes the hpet is not a PCI device at all. ) Solve this fundamental fragility by making non-PCI platform drivers insert resources into the resource tree even if it overlaps the e820 reserved entry, to keep the resource manager from updating the BAR. Also do these checks for the ioapic and mmconfig addresses, and emit a warning if this happens. Bisected-by: David Witbrodt Signed-off-by: Yinghai Lu Tested-by: David Witbrodt Signed-off-by: Ingo Molnar --- arch/x86/pci/i386.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5807d1bc73f7..d765da913842 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -31,8 +31,11 @@ #include #include #include +#include #include +#include +#include #include "pci.h" @@ -77,6 +80,77 @@ pcibios_align_resource(void *data, struct resource *res, } EXPORT_SYMBOL(pcibios_align_resource); +static int check_res_with_valid(struct pci_dev *dev, struct resource *res) +{ + unsigned long base; + unsigned long size; + int i; + + base = res->start; + size = (res->start == 0 && res->end == res->start) ? 0 : + (res->end - res->start + 1); + + if (!base || !size) + return 0; + +#ifdef CONFIG_HPET_TIMER + /* for hpet */ + if (base == hpet_address && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has HPET at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } +#endif + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < nr_ioapics; i++) { + unsigned long ioapic_phys = mp_ioapics[i].mp_apicaddr; + + if (base == ioapic_phys && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has ioapic at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } + } +#endif + +#ifdef CONFIG_PCI_MMCONFIG + for (i = 0; i < pci_mmcfg_config_num; i++) { + unsigned long addr; + + addr = pci_mmcfg_config[i].address; + if (base == addr && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has MMCONFIG at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } + } +#endif + + return 0; +} + +static int check_platform(struct pci_dev *dev, struct resource *res) +{ + struct resource *root = NULL; + + /* + * forcibly insert it into the + * resource tree + */ + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + else if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + + if (root && check_res_with_valid(dev, res)) { + insert_resource(root, res); + + return 1; + } + + return 0; +} /* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. @@ -128,6 +202,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { + if (check_platform(dev, r)) + continue; dev_err(&dev->dev, "BAR %d: can't " "allocate resource\n", idx); /* @@ -171,6 +247,8 @@ static void __init pcibios_allocate_resources(int pass) r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { + if (check_platform(dev, r)) + continue; dev_err(&dev->dev, "BAR %d: can't " "allocate resource\n", idx); /* We'll assign a new address later */ -- cgit v1.2.2 From 93be71b672f167b1e8c23725114f86305354f0ac Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:11 +0100 Subject: x86: add cpu hotplug hooks into smp_ops Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- arch/x86/kernel/smp.c | 6 +++++- arch/x86/kernel/smpboot.c | 8 ++++---- 4 files changed, 13 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..e382fe0ccd66 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -91,7 +91,7 @@ static void cpu_exit_clear(void) } /* We don't actually take CPU down, just spin without interrupts. */ -static inline void play_dead(void) +void native_play_dead(void) { /* This must be done before dead CPU ack */ cpu_exit_clear(); @@ -107,7 +107,7 @@ static inline void play_dead(void) wbinvd_halt(); } #else -static inline void play_dead(void) +void native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..dfd3f5752085 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -90,7 +90,7 @@ DECLARE_PER_CPU(int, cpu_state); #include /* We halt the CPU with physical CPU hotplug */ -static inline void play_dead(void) +void native_play_dead(void) { idle_task_exit(); mb(); @@ -102,7 +102,7 @@ static inline void play_dead(void) wbinvd_halt(); } #else -static inline void play_dead(void) +void native_play_dead(void) { BUG(); } diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 361b7a4c640c..18f9b19f5f8f 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -214,12 +214,16 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, - .cpu_up = native_cpu_up, .smp_cpus_done = native_smp_cpus_done, .smp_send_stop = native_smp_send_stop, .smp_send_reschedule = native_smp_send_reschedule, + .cpu_up = native_cpu_up, + .cpu_die = native_cpu_die, + .cpu_disable = native_cpu_disable, + .play_dead = native_play_dead, + .send_call_func_ipi = native_send_call_func_ipi, .send_call_func_single_ipi = native_send_call_func_single_ipi, }; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..c414cee296ba 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1346,7 +1346,7 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -int __cpu_disable(void) +int native_cpu_disable(void) { int cpu = smp_processor_id(); @@ -1385,7 +1385,7 @@ int __cpu_disable(void) return 0; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We don't do anything here: idle task is faking death itself. */ unsigned int i; @@ -1403,12 +1403,12 @@ void __cpu_die(unsigned int cpu) printk(KERN_ERR "CPU %u didn't die...\n", cpu); } #else /* ... !CONFIG_HOTPLUG_CPU */ -int __cpu_disable(void) +int native_cpu_disable(void) { return -ENOSYS; } -void __cpu_die(unsigned int cpu) +void native_cpu_die(unsigned int cpu) { /* We said "no" in __cpu_disable */ BUG(); -- cgit v1.2.2 From 379002586368ae22916f668011c9118c8ce8189c Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:12 +0100 Subject: x86_32: clean up play_dead The removal of the CPU from the various maps was redundant as it already happened in cpu_disable. After cleaning this up, cpu_uninit only resets the tlb state, so rename it and create a noop version for the X86_64 case (so the two play_deads can be unified later). Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 +----- arch/x86/kernel/process_32.c | 17 ++++------------- 2 files changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..18f5551b32dd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -714,14 +714,10 @@ void __cpuinit cpu_init(void) mxcsr_feature_mask_init(); } -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) +void reset_lazy_tlbstate(void) { int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); - /* lazy TLB state */ per_cpu(cpu_tlbstate, cpu).state = 0; per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; } -#endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index e382fe0ccd66..d55eb49584b1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -75,26 +75,17 @@ unsigned long thread_saved_pc(struct task_struct *tsk) #ifdef CONFIG_HOTPLUG_CPU #include -static void cpu_exit_clear(void) +/* We don't actually take CPU down, just spin without interrupts. */ +void native_play_dead(void) { int cpu = raw_smp_processor_id(); idle_task_exit(); - cpu_uninit(); - irq_ctx_exit(cpu); + reset_lazy_tlbstate(); - cpu_clear(cpu, cpu_callout_map); - cpu_clear(cpu, cpu_callin_map); - - numa_remove_cpu(cpu); -} + irq_ctx_exit(cpu); -/* We don't actually take CPU down, just spin without interrupts. */ -void native_play_dead(void) -{ - /* This must be done before dead CPU ack */ - cpu_exit_clear(); mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; -- cgit v1.2.2 From a21f5d88c17a40941f6239d1959d89e8493e8e01 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:13 +0100 Subject: x86: unify x86_32 and x86_64 play_dead into one function Add the new play_dead into smpboot.c, as it fits more cleanly in there alongside other CONFIG_HOTPLUG functions. Separate out the common code into its own function. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 32 -------------------------------- arch/x86/kernel/process_64.c | 23 ----------------------- arch/x86/kernel/smpboot.c | 29 +++++++++++++++++++++++++++++ 3 files changed, 29 insertions(+), 55 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d55eb49584b1..633cf06578fa 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -72,38 +72,6 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } -#ifdef CONFIG_HOTPLUG_CPU -#include - -/* We don't actually take CPU down, just spin without interrupts. */ -void native_play_dead(void) -{ - int cpu = raw_smp_processor_id(); - - idle_task_exit(); - - reset_lazy_tlbstate(); - - irq_ctx_exit(cpu); - - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - /* - * With physical CPU hotplug, we should halt the cpu - */ - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else -void native_play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index dfd3f5752085..aa28ed01cdf3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -85,29 +85,6 @@ void exit_idle(void) __exit_idle(); } -#ifdef CONFIG_HOTPLUG_CPU -DECLARE_PER_CPU(int, cpu_state); - -#include -/* We halt the CPU with physical CPU hotplug */ -void native_play_dead(void) -{ - idle_task_exit(); - mb(); - /* Ack it */ - __get_cpu_var(cpu_state) = CPU_DEAD; - - local_irq_disable(); - /* mask all interrupts, flush any and all caches, and halt */ - wbinvd_halt(); -} -#else -void native_play_dead(void) -{ - BUG(); -} -#endif /* CONFIG_HOTPLUG_CPU */ - /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c414cee296ba..28b4287296aa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1402,6 +1402,29 @@ void native_cpu_die(unsigned int cpu) } printk(KERN_ERR "CPU %u didn't die...\n", cpu); } + +void play_dead_common(void) +{ + idle_task_exit(); + reset_lazy_tlbstate(); + irq_ctx_exit(raw_smp_processor_id()); + + mb(); + /* Ack it */ + __get_cpu_var(cpu_state) = CPU_DEAD; + + /* + * With physical CPU hotplug, we should halt the cpu + */ + local_irq_disable(); +} + +void native_play_dead(void) +{ + play_dead_common(); + wbinvd_halt(); +} + #else /* ... !CONFIG_HOTPLUG_CPU */ int native_cpu_disable(void) { @@ -1413,4 +1436,10 @@ void native_cpu_die(unsigned int cpu) /* We said "no" in __cpu_disable */ BUG(); } + +void native_play_dead(void) +{ + BUG(); +} + #endif -- cgit v1.2.2 From 8227dce7dc2cfdcc28ee0eadfb482a7ee77fba03 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:14 +0100 Subject: x86: separate generic cpu disabling code from APIC writes in cpu_disable It allows paravirt implementations of cpu_disable to share the cpu_disable_common code, without having to take on board APIC writes, which may not be appropriate. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 40 +++++++++++++++++++++++----------------- 1 file changed, 23 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 28b4287296aa..66b04e598817 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1346,25 +1346,9 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -int native_cpu_disable(void) +void cpu_disable_common(void) { int cpu = smp_processor_id(); - - /* - * Perhaps use cpufreq to drop frequency, but that could go - * into generic code. - * - * We won't take down the boot processor on i386 due to some - * interrupts only being able to be serviced by the BSP. - * Especially so if we're not using an IOAPIC -zwane - */ - if (cpu == 0) - return -EBUSY; - - if (nmi_watchdog == NMI_LOCAL_APIC) - stop_apic_nmi_watchdog(NULL); - clear_local_APIC(); - /* * HACK: * Allow any queued timer interrupts to get serviced @@ -1382,6 +1366,28 @@ int native_cpu_disable(void) remove_cpu_from_maps(cpu); unlock_vector_lock(); fixup_irqs(cpu_online_map); +} + +int native_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + /* + * Perhaps use cpufreq to drop frequency, but that could go + * into generic code. + * + * We won't take down the boot processor on i386 due to some + * interrupts only being able to be serviced by the BSP. + * Especially so if we're not using an IOAPIC -zwane + */ + if (cpu == 0) + return -EBUSY; + + if (nmi_watchdog == NMI_LOCAL_APIC) + stop_apic_nmi_watchdog(NULL); + clear_local_APIC(); + + cpu_disable_common(); return 0; } -- cgit v1.2.2 From ed21763e7b0b3fb50e4efd9d4bc17ef5b035d304 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 22 Aug 2008 20:23:38 +0200 Subject: x86: cleanup in amd_cpu_notify() small coding style fix. Signed-off-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/pci/amd_bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index 6a0fca78c362..22e057665e55 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -580,7 +580,7 @@ static int __cpuinit amd_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { int cpu = (long)hcpu; - switch(action) { + switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); -- cgit v1.2.2 From d68d82afd4c88e25763b23cd9cd4974573a3706f Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Fri, 22 Aug 2008 11:52:15 +0100 Subject: xen: implement CPU hotplugging Note the changes from 2.6.18-xen CPU hotplugging: A vcpu_down request from the remote admin via Xenbus both hotunplugs the CPU, and disables it by removing it from the cpu_present map, and removing its entry in /sys. A vcpu_up request from the remote admin only re-enables the CPU, and does not immediately bring the CPU up. A udev event is emitted, which can be caught by the user if he wishes to automatically re-up CPUs when available, or implement a more complex policy. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 60 +++++++++++++++++++++++++++++++++++++++---------- arch/x86/xen/spinlock.c | 5 +++++ arch/x86/xen/time.c | 8 +++++++ arch/x86/xen/xen-ops.h | 6 +++++ 4 files changed, 67 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index baca7f2fbd8a..be5cbb2b7c60 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -11,8 +11,6 @@ * useful topology information for the kernel to make use of. As a * result, all CPUs are treated as if they're single-core and * single-threaded. - * - * This does not handle HOTPLUG_CPU yet. */ #include #include @@ -61,11 +59,12 @@ static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static __cpuinit void cpu_bringup_and_idle(void) +static __cpuinit void cpu_bringup(void) { int cpu = smp_processor_id(); cpu_init(); + touch_softlockup_watchdog(); preempt_disable(); xen_enable_sysenter(); @@ -86,6 +85,11 @@ static __cpuinit void cpu_bringup_and_idle(void) local_irq_enable(); wmb(); /* make sure everything is out */ +} + +static __cpuinit void cpu_bringup_and_idle(void) +{ + cpu_bringup(); cpu_idle(); } @@ -209,8 +213,6 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus) cpu_set(cpu, cpu_present_map); } - - //init_xenbus_allowed_cpumask(); } static __cpuinit int @@ -278,12 +280,6 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) struct task_struct *idle = idle_task(cpu); int rc; -#if 0 - rc = cpu_up_check(cpu); - if (rc) - return rc; -#endif - #ifdef CONFIG_X86_64 /* Allocate node local memory for AP pdas */ WARN_ON(cpu == 0); @@ -336,6 +332,42 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +int xen_cpu_disable(void) +{ + unsigned int cpu = smp_processor_id(); + if (cpu == 0) + return -EBUSY; + + cpu_disable_common(); + + load_cr3(swapper_pg_dir); + return 0; +} + +void xen_cpu_die(unsigned int cpu) +{ + while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { + current->state = TASK_UNINTERRUPTIBLE; + schedule_timeout(HZ/10); + } + unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); + unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + + if (num_online_cpus() == 1) + alternatives_smp_switch(0); +} + +void xen_play_dead(void) +{ + play_dead_common(); + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + cpu_bringup(); +} + static void stop_self(void *v) { int cpu = smp_processor_id(); @@ -419,9 +451,13 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id) static const struct smp_ops xen_smp_ops __initdata = { .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, .smp_prepare_cpus = xen_smp_prepare_cpus, - .cpu_up = xen_cpu_up, .smp_cpus_done = xen_smp_cpus_done, + .cpu_up = xen_cpu_up, + .cpu_die = xen_cpu_die, + .cpu_disable = xen_cpu_disable, + .play_dead = xen_play_dead, + .smp_send_stop = xen_smp_send_stop, .smp_send_reschedule = xen_smp_send_reschedule, diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index d072823bc06d..dd71e3a021cd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -357,6 +357,11 @@ void __cpuinit xen_init_lock_cpu(int cpu) printk("cpu %d spinlock event irq %d\n", cpu, irq); } +void xen_uninit_lock_cpu(int cpu) +{ + unbind_from_irqhandler(per_cpu(lock_kicker_irq, cpu), NULL); +} + void __init xen_init_spinlocks(void) { pv_lock_ops.spin_is_locked = xen_spin_is_locked; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 20182d9072c4..004ba86326ae 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -450,6 +450,14 @@ void xen_setup_timer(int cpu) setup_runstate_info(cpu); } +void xen_teardown_timer(int cpu) +{ + struct clock_event_device *evt; + BUG_ON(cpu == 0); + evt = &per_cpu(xen_clock_events, cpu); + unbind_from_irqhandler(evt->irq, NULL); +} + void xen_setup_cpu_clockevents(void) { BUG_ON(preemptible()); diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1e8bfdaa20d3..8dbd97fd7f18 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -34,6 +34,7 @@ void __init xen_build_dynamic_phys_to_machine(void); void xen_init_irq_ops(void); void xen_setup_timer(int cpu); +void xen_teardown_timer(int cpu); cycle_t xen_clocksource_read(void); void xen_setup_cpu_clockevents(void); unsigned long xen_tsc_khz(void); @@ -50,11 +51,16 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); +void xen_play_dead(void); +void xen_cpu_die(unsigned int cpu); +int xen_cpu_disable(void); + #ifdef CONFIG_SMP void xen_smp_init(void); void __init xen_init_spinlocks(void); __cpuinit void xen_init_lock_cpu(int cpu); +void xen_uninit_lock_cpu(int cpu); extern cpumask_t xen_cpu_initialized_map; #else -- cgit v1.2.2 From c7ffa6c26277b403920e2255d10df849bd613380 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 25 Aug 2008 13:11:27 +0300 Subject: x86: default to reboot via ACPI Triple-fault and keyboard reset may assert INIT instead of RESET; however INIT is blocked when Intel VT is enabled. This leads to a partially reset machine when invoking emergency_restart via sysrq-b: the processor is still working but other parts of the system are dead. Default to rebooting via ACPI, which correctly asserts RESET and reboots the machine. This is safe since we will fall back to keyboard reset and triple fault if acpi is not enabled or if the reset is not successful. Signed-off-by: Avi Kivity Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 724adfc63cb9..f4c93f1cfc19 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -29,7 +29,11 @@ EXPORT_SYMBOL(pm_power_off); static const struct desc_ptr no_idt = {}; static int reboot_mode; -enum reboot_type reboot_type = BOOT_KBD; +/* + * Keyboard reset and triple fault may result in INIT, not RESET, which + * doesn't work when we're in vmx root mode. Try ACPI first. + */ +enum reboot_type reboot_type = BOOT_ACPI; int reboot_force; #if defined(CONFIG_X86_32) && defined(CONFIG_SMP) -- cgit v1.2.2 From 52a8968ce95da8469ba0a9b3e4010fe31caf77a3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 Aug 2008 13:35:06 +0200 Subject: x86: fix cpufreq + sched_clock() regression I noticed that my sched_clock() was slow on a number of machine, so I started looking at cpufreq. The below seems to fix the problem for me. Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9bed5cae4bdc..8e786b0d665a 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -314,7 +314,7 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, mark_tsc_unstable("cpufreq changes"); } - set_cyc2ns_scale(tsc_khz_ref, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); return 0; } -- cgit v1.2.2 From cd5998ebfbc9e6cb44408efa217c15d7eea13675 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:19:38 +0300 Subject: KVM: MMU: Fix torn shadow pte The shadow code assigns a pte directly in one place, which is nonatomic on i386 can can cause random memory references. Fix by using an atomic setter. Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index f72ac1fa35f0..4a814bff21f2 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -345,7 +345,7 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, shadow_addr = __pa(shadow_page->spt); shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; - *shadow_ent = shadow_pte; + set_shadow_pte(shadow_ent, shadow_pte); } mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, -- cgit v1.2.2 From d25e26b61d59370eee8b7f2634641eb0fa76e952 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 25 Aug 2008 14:15:38 -0700 Subject: [x86] Clean up MAXSMP Kconfig, and limit NR_CPUS to 512 This fixes a regression that was indirectly caused by commit 1184dc2ffe2c8fb9afb766d870850f2c3165ef25 ("x86: modify Kconfig to allow up to 4096 cpus"). Allowing 4k CPU's is not practical at this time, because we still have a number of places that have several 'cpumask_t's on the stack, and a 4k-bit cpumask is 512 bytes of stack-space for each such variable. This literally caused functions like 'smp_call_function_mask' to have a 2.5kB stack frame, and several functions to have 2kB stackframes. With an 8kB stack total, smashing the stack was simply much too likely. At least bugzilla entry http://bugzilla.kernel.org/show_bug.cgi?id=11342 was due to this. The earlier commit to not inline load_module() into sys_init_module() fixed the particular symptoms of this that Alan Brunelle saw in that bugzilla entry, but the huge stack waste by cpumask_t's was the more direct cause. Some day we'll have allocation helpers that allocate large CPU masks dynamically, but in the meantime we simply cannot allow cpumasks this large. Cc: Alan D. Brunelle Cc: Mike Travis Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 68d91c8233f4..ed92864d1325 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -577,35 +577,29 @@ config SWIOTLB config IOMMU_HELPER def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) + config MAXSMP bool "Configure Maximum number of SMP Processors and NUMA Nodes" - depends on X86_64 && SMP + depends on X86_64 && SMP && BROKEN default n help Configure maximum number of CPUS and NUMA Nodes for this architecture. If unsure, say N. -if MAXSMP -config NR_CPUS - int - default "4096" -endif - -if !MAXSMP config NR_CPUS - int "Maximum number of CPUs (2-4096)" - range 2 4096 + int "Maximum number of CPUs (2-512)" if !MAXSMP + range 2 512 depends on SMP + default "4096" if MAXSMP default "32" if X86_NUMAQ || X86_SUMMIT || X86_BIGSMP || X86_ES7000 default "8" help This allows you to specify the maximum number of CPUs which this - kernel will support. The maximum supported value is 4096 and the + kernel will support. The maximum supported value is 512 and the minimum value which makes sense is 2. This is purely to save memory - each supported CPU adds approximately eight kilobytes to the kernel image. -endif config SCHED_SMT bool "SMT (Hyperthreading) scheduler support" @@ -996,17 +990,10 @@ config NUMA_EMU into virtual nodes when booted with "numa=fake=N", where N is the number of nodes. This is only useful for debugging. -if MAXSMP - config NODES_SHIFT - int - default "9" -endif - -if !MAXSMP -config NODES_SHIFT - int "Maximum NUMA Nodes (as a power of 2)" + int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP range 1 9 if X86_64 + default "9" if MAXSMP default "6" if X86_64 default "4" if X86_NUMAQ default "3" @@ -1014,7 +1001,6 @@ config NODES_SHIFT help Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accomodate various tables. -endif config HAVE_ARCH_BOOTMEM_NODE def_bool y -- cgit v1.2.2 From c6f31932d0a1d2b13952f506ebc92675e2d8df80 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:27:21 -0700 Subject: x86: msr: propagate errors from smp_call_function_single() Propagate error (-ENXIO) from smp_call_function_single(). These errors can happen when a CPU is unplugged while the MSR driver is open. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 14 ++++++++++---- arch/x86/lib/msr-on-cpu.c | 22 ++++++++++++---------- 2 files changed, 22 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index e43938086885..9c34a1005dba 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -79,8 +79,11 @@ static ssize_t msr_read(struct file *file, char __user *buf, for (; count; count -= 8) { err = rdmsr_safe_on_cpu(cpu, reg, &data[0], &data[1]); - if (err) - return -EIO; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + return err; + } if (copy_to_user(tmp, &data, 8)) return -EFAULT; tmp += 2; @@ -105,8 +108,11 @@ static ssize_t msr_write(struct file *file, const char __user *buf, if (copy_from_user(&data, tmp, 8)) return -EFAULT; err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); - if (err) - return -EIO; + if (err) { + if (err == -EFAULT) /* Fix idiotic error code */ + err = -EIO; + return err; + } tmp += 2; } diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index d5a2b39f882b..01b868ba82f8 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -30,10 +30,11 @@ static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) rv.msr_no = msr_no; if (safe) { - smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); - err = rv.err; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, + &rv, 1); + err = err ? err : rv.err; } else { - smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); } *l = rv.l; *h = rv.h; @@ -64,23 +65,24 @@ static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) rv.l = l; rv.h = h; if (safe) { - smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); - err = rv.err; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, + &rv, 1); + err = err ? err : rv.err; } else { - smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); } return err; } -void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { - _wrmsr_on_cpu(cpu, msr_no, l, h, 0); + return _wrmsr_on_cpu(cpu, msr_no, l, h, 0); } -void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { - _rdmsr_on_cpu(cpu, msr_no, l, h, 0); + return _rdmsr_on_cpu(cpu, msr_no, l, h, 0); } /* These "safe" variants are slower and should be used when the target MSR -- cgit v1.2.2 From 4b46ca701bdcdc19fcf32823f9fcabf8236e4e78 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:28:20 -0700 Subject: x86: cpuid: propagate error from smp_call_function_single() Propagate error (-ENXIO) from smp_call_function_single() in the CPUID driver. This can happen when a CPU is unplugged while the CPUID driver is open. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 14b11b3be31c..23e8316c8357 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -89,6 +89,7 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); u64 pos = *ppos; + int err; if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -96,7 +97,9 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, for (; count; count -= 16) { cmd.eax = pos; cmd.ecx = pos >> 32; - smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); + if (err) + return err; if (copy_to_user(tmp, &cmd, 16)) return -EFAULT; tmp += 16; -- cgit v1.2.2 From 85f1cb60157e06d9e8996b02fad9ba6964523d75 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:34:27 -0700 Subject: x86: msr: correct return value on partial operations Return the correct return value when the MSR driver partially completes a request (we should return the number of bytes actually read or written, instead of the error code.) Signed-off-by: H. Peter Anvin --- arch/x86/kernel/msr.c | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 9c34a1005dba..2e2af5d18191 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -72,7 +72,8 @@ static ssize_t msr_read(struct file *file, char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -82,14 +83,17 @@ static ssize_t msr_read(struct file *file, char __user *buf, if (err) { if (err == -EFAULT) /* Fix idiotic error code */ err = -EIO; - return err; + break; + } + if (copy_to_user(tmp, &data, 8)) { + err = -EFAULT; + break; } - if (copy_to_user(tmp, &data, 8)) - return -EFAULT; tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static ssize_t msr_write(struct file *file, const char __user *buf, @@ -99,24 +103,28 @@ static ssize_t msr_write(struct file *file, const char __user *buf, u32 data[2]; u32 reg = *ppos; int cpu = iminor(file->f_path.dentry->d_inode); - int err; + int err = 0; + ssize_t bytes = 0; if (count % 8) return -EINVAL; /* Invalid chunk size */ for (; count; count -= 8) { - if (copy_from_user(&data, tmp, 8)) - return -EFAULT; + if (copy_from_user(&data, tmp, 8)) { + err = -EFAULT; + break; + } err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]); if (err) { if (err == -EFAULT) /* Fix idiotic error code */ err = -EIO; - return err; + break; } tmp += 2; + bytes += 8; } - return ((char __user *)tmp) - buf; + return bytes ? bytes : err; } static int msr_open(struct inode *inode, struct file *file) -- cgit v1.2.2 From 9ea2b82ed6265a31f9a84886d74d8a2ef01b27c8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:35:25 -0700 Subject: x86: cpuid: correct return value on partial operations Return the correct return value when the CPUID driver partially completes a request (we should return the number of bytes actually read or written, instead of the error code.) Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpuid.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 23e8316c8357..8e9cd6a8ec12 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -89,7 +89,8 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, struct cpuid_regs cmd; int cpu = iminor(file->f_path.dentry->d_inode); u64 pos = *ppos; - int err; + ssize_t bytes = 0; + int err = 0; if (count % 16) return -EINVAL; /* Invalid chunk size */ @@ -99,14 +100,17 @@ static ssize_t cpuid_read(struct file *file, char __user *buf, cmd.ecx = pos >> 32; err = smp_call_function_single(cpu, cpuid_smp_cpuid, &cmd, 1); if (err) - return err; - if (copy_to_user(tmp, &cmd, 16)) - return -EFAULT; + break; + if (copy_to_user(tmp, &cmd, 16)) { + err = -EFAULT; + break; + } tmp += 16; + bytes += 16; *ppos = ++pos; } - return tmp - buf; + return bytes ? bytes : err; } static int cpuid_open(struct inode *inode, struct file *file) -- cgit v1.2.2 From bdd314616f7218e325aa9637a46159ecba44cfeb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 17:44:03 -0700 Subject: x86: msr-on-cpu: remove unnecessary level of abstraction Remove an unnecessary level of abstraction in the msr-on-cpu library. Although this duplicates some code, the duplicated code is less than the additional code, and this way should be faster. Additionally, change the order of the functions to make the regular structure of this file more obvious. Signed-off-by: H. Peter Anvin --- arch/x86/lib/msr-on-cpu.c | 78 ++++++++++++++++++++++------------------------- 1 file changed, 36 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/msr-on-cpu.c b/arch/x86/lib/msr-on-cpu.c index 01b868ba82f8..321cf720dbb6 100644 --- a/arch/x86/lib/msr-on-cpu.c +++ b/arch/x86/lib/msr-on-cpu.c @@ -16,37 +16,46 @@ static void __rdmsr_on_cpu(void *info) rdmsr(rv->msr_no, rv->l, rv->h); } -static void __rdmsr_safe_on_cpu(void *info) +static void __wrmsr_on_cpu(void *info) { struct msr_info *rv = info; - rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); + wrmsr(rv->msr_no, rv->l, rv->h); } -static int _rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h, int safe) +int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { - int err = 0; + int err; struct msr_info rv; rv.msr_no = msr_no; - if (safe) { - err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, - &rv, 1); - err = err ? err : rv.err; - } else { - err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); - } + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); *l = rv.l; *h = rv.h; return err; } -static void __wrmsr_on_cpu(void *info) +int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) +{ + int err; + struct msr_info rv; + + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} + +/* These "safe" variants are slower and should be used when the target MSR + may not actually exist. */ +static void __rdmsr_safe_on_cpu(void *info) { struct msr_info *rv = info; - wrmsr(rv->msr_no, rv->l, rv->h); + rv->err = rdmsr_safe(rv->msr_no, &rv->l, &rv->h); } static void __wrmsr_safe_on_cpu(void *info) @@ -56,45 +65,30 @@ static void __wrmsr_safe_on_cpu(void *info) rv->err = wrmsr_safe(rv->msr_no, rv->l, rv->h); } -static int _wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h, int safe) +int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) { - int err = 0; + int err; struct msr_info rv; rv.msr_no = msr_no; - rv.l = l; - rv.h = h; - if (safe) { - err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, - &rv, 1); - err = err ? err : rv.err; - } else { - err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); - } - - return err; -} + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *l = rv.l; + *h = rv.h; -int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) -{ - return _wrmsr_on_cpu(cpu, msr_no, l, h, 0); + return err ? err : rv.err; } -int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - return _rdmsr_on_cpu(cpu, msr_no, l, h, 0); -} - -/* These "safe" variants are slower and should be used when the target MSR - may not actually exist. */ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { - return _wrmsr_on_cpu(cpu, msr_no, l, h, 1); -} + int err; + struct msr_info rv; -int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) -{ - return _rdmsr_on_cpu(cpu, msr_no, l, h, 1); + rv.msr_no = msr_no; + rv.l = l; + rv.h = h; + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; } EXPORT_SYMBOL(rdmsr_on_cpu); -- cgit v1.2.2 From c1b362e3b4d331a63915b268a33207311a439d60 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 25 Aug 2008 15:06:35 -0700 Subject: x86: update defconfigs Enable some option commonly used by testers in defconfig, including some very common device drivers and network boot support. defconfig is still not meant to be a kitchen-sink configuration. Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 303 ++++++++++++++++++++++---------------- arch/x86/configs/x86_64_defconfig | 258 +++++++++++++++++++------------- 2 files changed, 334 insertions(+), 227 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 4d73f53287b6..104275e191a8 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,13 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.26-rc1 -# Sun May 4 19:59:02 2008 +# Linux kernel version: 2.6.27-rc4 +# Mon Aug 25 15:04:00 2008 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y -CONFIG_DEFCONFIG_LIST="arch/x86/configs/i386_defconfig" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" # CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -53,6 +53,7 @@ CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y CONFIG_KTIME_SCALAR=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # General setup @@ -82,6 +83,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_NS=y # CONFIG_CGROUP_DEVICE is not set CONFIG_CPUSETS=y +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y CONFIG_GROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y # CONFIG_RT_GROUP_SCHED is not set @@ -105,7 +107,6 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y @@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y # CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y @@ -132,27 +134,35 @@ CONFIG_MARKERS=y # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y CONFIG_KRETPROBES=y +CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_ARCH_TRACEHOOK is not set # CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_USE_GENERIC_SMP_HELPERS=y +# CONFIG_HAVE_CLK is not set CONFIG_PROC_PAGE_MONITOR=y +CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set -# CONFIG_KMOD is not set +CONFIG_KMOD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set CONFIG_BLK_DEV_IO_TRACE=y # CONFIG_LSF is not set CONFIG_BLK_DEV_BSG=y +# CONFIG_BLK_DEV_INTEGRITY is not set # # IO Schedulers @@ -176,19 +186,17 @@ CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y CONFIG_X86_PC=y # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set -# CONFIG_X86_NUMAQ is not set -# CONFIG_X86_SUMMIT is not set -# CONFIG_X86_BIGSMP is not set -# CONFIG_X86_VISWS is not set # CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_ES7000 is not set -# CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set +# CONFIG_X86_RDC321X is not set CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set +# CONFIG_MEMTEST is not set # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -215,21 +223,19 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_MPSC is not set CONFIG_MCORE2=y # CONFIG_GENERIC_CPU is not set -# CONFIG_X86_GENERIC is not set +CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_XADD=y CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y CONFIG_X86_POPAD_OK=y -CONFIG_X86_GOOD_APIC=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y -CONFIG_X86_MINIMUM_CPU_FAMILY=6 +CONFIG_X86_MINIMUM_CPU_FAMILY=4 CONFIG_X86_DEBUGCTLMSR=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y @@ -247,7 +253,7 @@ CONFIG_X86_IO_APIC=y CONFIG_VM86=y # CONFIG_TOSHIBA is not set # CONFIG_I8K is not set -# CONFIG_X86_REBOOTFIXUPS is not set +CONFIG_X86_REBOOTFIXUPS=y # CONFIG_MICROCODE is not set CONFIG_X86_MSR=y CONFIG_X86_CPUID=y @@ -256,32 +262,28 @@ CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set CONFIG_PAGE_OFFSET=0xC0000000 CONFIG_HIGHMEM=y -CONFIG_NEED_NODE_MEMMAP_SIZE=y CONFIG_ARCH_FLATMEM_ENABLE=y CONFIG_ARCH_SPARSEMEM_ENABLE=y CONFIG_ARCH_SELECT_MEMORY_MODEL=y CONFIG_SELECT_MEMORY_MODEL=y -# CONFIG_FLATMEM_MANUAL is not set +CONFIG_FLATMEM_MANUAL=y # CONFIG_DISCONTIGMEM_MANUAL is not set -CONFIG_SPARSEMEM_MANUAL=y -CONFIG_SPARSEMEM=y -CONFIG_HAVE_MEMORY_PRESENT=y +# CONFIG_SPARSEMEM_MANUAL is not set +CONFIG_FLATMEM=y +CONFIG_FLAT_NODE_MEM_MAP=y CONFIG_SPARSEMEM_STATIC=y # CONFIG_SPARSEMEM_VMEMMAP_ENABLE is not set - -# -# Memory hotplug is currently incompatible with Software Suspend -# CONFIG_PAGEFLAGS_EXTENDED=y CONFIG_SPLIT_PTLOCK_CPUS=4 CONFIG_RESOURCES_64BIT=y CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y -# CONFIG_HIGHPTE is not set +CONFIG_HIGHPTE=y # CONFIG_MATH_EMULATION is not set CONFIG_MTRR=y -# CONFIG_X86_PAT is not set +# CONFIG_MTRR_SANITIZER is not set +CONFIG_X86_PAT=y CONFIG_EFI=y # CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y @@ -293,6 +295,7 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 CONFIG_RELOCATABLE=y CONFIG_PHYSICAL_ALIGN=0x200000 @@ -312,6 +315,7 @@ CONFIG_PM_TRACE_RTC=y CONFIG_PM_SLEEP_SMP=y CONFIG_PM_SLEEP=y CONFIG_SUSPEND=y +# CONFIG_PM_TEST_SUSPEND is not set CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" @@ -337,6 +341,7 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y +# CONFIG_ACPI_PCI_SLOT is not set CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y @@ -395,8 +400,8 @@ CONFIG_PCI=y # CONFIG_PCI_GOBIOS is not set # CONFIG_PCI_GOMMCONFIG is not set # CONFIG_PCI_GODIRECT is not set -CONFIG_PCI_GOANY=y # CONFIG_PCI_GOOLPC is not set +CONFIG_PCI_GOANY=y CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y @@ -448,10 +453,6 @@ CONFIG_HOTPLUG_PCI=y CONFIG_BINFMT_ELF=y # CONFIG_BINFMT_AOUT is not set CONFIG_BINFMT_MISC=y - -# -# Networking -# CONFIG_NET=y # @@ -475,7 +476,10 @@ CONFIG_IP_FIB_HASH=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set CONFIG_IP_MROUTE=y @@ -618,7 +622,6 @@ CONFIG_NET_SCHED=y # CONFIG_NET_SCH_HTB is not set # CONFIG_NET_SCH_HFSC is not set # CONFIG_NET_SCH_PRIO is not set -# CONFIG_NET_SCH_RR is not set # CONFIG_NET_SCH_RED is not set # CONFIG_NET_SCH_SFQ is not set # CONFIG_NET_SCH_TEQL is not set @@ -680,28 +683,19 @@ CONFIG_FIB_RULES=y CONFIG_CFG80211=y CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y +CONFIG_WIRELESS_EXT_SYSFS=y CONFIG_MAC80211=y # # Rate control algorithm selection # +CONFIG_MAC80211_RC_PID=y CONFIG_MAC80211_RC_DEFAULT_PID=y -# CONFIG_MAC80211_RC_DEFAULT_NONE is not set - -# -# Selecting 'y' for an algorithm will -# - -# -# build the algorithm into mac80211. -# CONFIG_MAC80211_RC_DEFAULT="pid" -CONFIG_MAC80211_RC_PID=y # CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set -# CONFIG_MAC80211_DEBUG is not set +# CONFIG_MAC80211_DEBUG_MENU is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -717,6 +711,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" # CONFIG_DEBUG_DRIVER is not set CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set @@ -749,6 +745,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set @@ -760,10 +757,12 @@ CONFIG_MISC_DEVICES=y # CONFIG_FUJITSU_LAPTOP is not set # CONFIG_TC1100_WMI is not set # CONFIG_MSI_LAPTOP is not set +# CONFIG_COMPAL_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set # CONFIG_INTEL_MENLOW is not set # CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_HP_ILO is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -802,12 +801,13 @@ CONFIG_SCSI_WAIT_SCAN=m # CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=y # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set # CONFIG_SCSI_SRP_ATTRS is not set # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set +# CONFIG_SCSI_DH is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -842,7 +842,7 @@ CONFIG_PATA_AMD=y # CONFIG_PATA_CS5536 is not set # CONFIG_PATA_CYPRESS is not set # CONFIG_PATA_EFAR is not set -# CONFIG_ATA_GENERIC is not set +CONFIG_ATA_GENERIC=y # CONFIG_PATA_HPT366 is not set # CONFIG_PATA_HPT37X is not set # CONFIG_PATA_HPT3X2N is not set @@ -852,7 +852,7 @@ CONFIG_PATA_AMD=y # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set # CONFIG_PATA_MARVELL is not set -# CONFIG_PATA_MPIIX is not set +CONFIG_PATA_MPIIX=y CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_NETCELL is not set # CONFIG_PATA_NINJA32 is not set @@ -871,6 +871,7 @@ CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set +CONFIG_PATA_SCH=y CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_LINEAR is not set @@ -894,13 +895,16 @@ CONFIG_DM_ZERO=y # # IEEE 1394 (FireWire) support # + +# +# Enable only one of the two stacks, unless you know what you are doing +# # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -# CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -910,7 +914,23 @@ CONFIG_NETDEVICES=y # CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set -# CONFIG_PHYLIB is not set +CONFIG_PHYLIB=y + +# +# MII PHY device drivers +# +# CONFIG_MARVELL_PHY is not set +# CONFIG_DAVICOM_PHY is not set +# CONFIG_QSEMI_PHY is not set +# CONFIG_LXT_PHY is not set +# CONFIG_CICADA_PHY is not set +# CONFIG_VITESSE_PHY is not set +# CONFIG_SMSC_PHY is not set +# CONFIG_BROADCOM_PHY is not set +# CONFIG_ICPLUS_PHY is not set +# CONFIG_REALTEK_PHY is not set +# CONFIG_FIXED_PHY is not set +# CONFIG_MDIO_BITBANG is not set CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -943,10 +963,10 @@ CONFIG_FORCEDETH=y CONFIG_E100=y # CONFIG_FEALNX is not set # CONFIG_NATSEMI is not set -# CONFIG_NE2K_PCI is not set +CONFIG_NE2K_PCI=y # CONFIG_8139CP is not set CONFIG_8139TOO=y -CONFIG_8139TOO_PIO=y +# CONFIG_8139TOO_PIO is not set # CONFIG_8139TOO_TUNE_TWISTER is not set # CONFIG_8139TOO_8129 is not set # CONFIG_8139_OLD_RX_RESET is not set @@ -961,25 +981,24 @@ CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set -# CONFIG_E1000E is not set -# CONFIG_E1000E_ENABLED is not set +CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set -# CONFIG_R8169 is not set +CONFIG_R8169=y # CONFIG_SIS190 is not set # CONFIG_SKGE is not set CONFIG_SKY2=y # CONFIG_SKY2_DEBUG is not set # CONFIG_VIA_VELOCITY is not set CONFIG_TIGON3=y -# CONFIG_BNX2 is not set +CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set @@ -1019,13 +1038,14 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8180 is not set # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set +# CONFIG_MAC80211_HWSIM is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set -# CONFIG_IWLWIFI is not set +# CONFIG_ATH9K is not set # CONFIG_IWLCORE is not set # CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWL4965 is not set +# CONFIG_IWLAGN is not set # CONFIG_IWL3945 is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set @@ -1105,6 +1125,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_PS2_TOUCHKIT is not set # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set +# CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set CONFIG_INPUT_JOYSTICK=y # CONFIG_JOYSTICK_ANALOG is not set @@ -1139,12 +1160,14 @@ CONFIG_INPUT_TOUCHSCREEN=y # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set # CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_INEXIO is not set # CONFIG_TOUCHSCREEN_MK712 is not set # CONFIG_TOUCHSCREEN_PENMOUNT is not set # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set # CONFIG_TOUCHSCREEN_TOUCHWIN is not set # CONFIG_TOUCHSCREEN_UCB1400 is not set # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set CONFIG_INPUT_MISC=y # CONFIG_INPUT_PCSPKR is not set # CONFIG_INPUT_APANEL is not set @@ -1173,6 +1196,7 @@ CONFIG_SERIO_LIBPS2=y # Character devices # CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y @@ -1223,8 +1247,8 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y -# CONFIG_HW_RANDOM_INTEL is not set -# CONFIG_HW_RANDOM_AMD is not set +CONFIG_HW_RANDOM_INTEL=y +CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y CONFIG_HW_RANDOM_VIA=y CONFIG_NVRAM=y @@ -1245,7 +1269,6 @@ CONFIG_NVRAM=y # CONFIG_CS5535_GPIO is not set # CONFIG_RAW_DRIVER is not set CONFIG_HPET=y -# CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set @@ -1254,43 +1277,64 @@ CONFIG_DEVPORT=y CONFIG_I2C=y CONFIG_I2C_BOARDINFO=y # CONFIG_I2C_CHARDEV is not set +CONFIG_I2C_HELPER_AUTO=y # # I2C Hardware Bus support # + +# +# PC SMBus host controller drivers +# # CONFIG_I2C_ALI1535 is not set # CONFIG_I2C_ALI1563 is not set # CONFIG_I2C_ALI15X3 is not set # CONFIG_I2C_AMD756 is not set # CONFIG_I2C_AMD8111 is not set CONFIG_I2C_I801=y -# CONFIG_I2C_I810 is not set +# CONFIG_I2C_ISCH is not set # CONFIG_I2C_PIIX4 is not set # CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set -# CONFIG_SCx200_ACB is not set # CONFIG_I2C_SIS5595 is not set # CONFIG_I2C_SIS630 is not set # CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_TAOS_EVM is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set # CONFIG_I2C_VIA is not set # CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set + +# +# Graphics adapter I2C/DDC channel drivers +# # CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# # CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set +# CONFIG_SCx200_ACB is not set # # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set +# CONFIG_AT24 is not set # CONFIG_SENSORS_EEPROM is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set @@ -1299,6 +1343,8 @@ CONFIG_I2C_I801=y # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set +CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y +# CONFIG_GPIOLIB is not set # CONFIG_W1 is not set CONFIG_POWER_SUPPLY=y # CONFIG_POWER_SUPPLY_DEBUG is not set @@ -1360,8 +1406,10 @@ CONFIG_SSB_POSSIBLE=y # # Multifunction device drivers # +# CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_MFD_TMIO is not set # # Multimedia devices @@ -1372,6 +1420,7 @@ CONFIG_SSB_POSSIBLE=y # # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set +# CONFIG_VIDEO_MEDIA is not set # # Multimedia drivers @@ -1418,7 +1467,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y # CONFIG_FB_SYS_IMAGEBLIT is not set # CONFIG_FB_FOREIGN_ENDIAN is not set # CONFIG_FB_SYS_FOPS is not set -CONFIG_FB_DEFERRED_IO=y # CONFIG_FB_SVGALIB is not set # CONFIG_FB_MACMODES is not set # CONFIG_FB_BACKLIGHT is not set @@ -1463,6 +1511,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set # CONFIG_FB_GEODE is not set # CONFIG_FB_VIRTUAL is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y @@ -1470,6 +1519,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_BACKLIGHT_CORGI is not set # CONFIG_BACKLIGHT_PROGEAR is not set +# CONFIG_BACKLIGHT_MBP_NVIDIA is not set # # Display device support @@ -1489,15 +1539,7 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_LOGO_LINUX_CLUT224=y - -# -# Sound -# CONFIG_SOUND=y - -# -# Advanced Linux Sound Architecture -# CONFIG_SND=y CONFIG_SND_TIMER=y CONFIG_SND_PCM=y @@ -1515,20 +1557,14 @@ CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y - -# -# Generic devices -# +CONFIG_SND_DRIVERS=y # CONFIG_SND_PCSP is not set # CONFIG_SND_DUMMY is not set # CONFIG_SND_VIRMIDI is not set # CONFIG_SND_MTPAV is not set # CONFIG_SND_SERIAL_U16550 is not set # CONFIG_SND_MPU401 is not set - -# -# PCI devices -# +CONFIG_SND_PCI=y # CONFIG_SND_AD1889 is not set # CONFIG_SND_ALS300 is not set # CONFIG_SND_ALS4000 is not set @@ -1603,36 +1639,14 @@ CONFIG_SND_HDA_GENERIC=y # CONFIG_SND_VIRTUOSO is not set # CONFIG_SND_VX222 is not set # CONFIG_SND_YMFPCI is not set - -# -# USB devices -# +CONFIG_SND_USB=y # CONFIG_SND_USB_AUDIO is not set # CONFIG_SND_USB_USX2Y is not set # CONFIG_SND_USB_CAIAQ is not set - -# -# PCMCIA devices -# +CONFIG_SND_PCMCIA=y # CONFIG_SND_VXPOCKET is not set # CONFIG_SND_PDAUDIOCF is not set - -# -# System on Chip audio support -# # CONFIG_SND_SOC is not set - -# -# ALSA SoC audio for Freescale SOCs -# - -# -# SoC Audio for the Texas Instruments OMAP -# - -# -# Open Sound System -# # CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y @@ -1668,6 +1682,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DYNAMIC_MINORS is not set CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set +CONFIG_USB_MON=y # # USB Host Controller Drivers @@ -1691,6 +1706,7 @@ CONFIG_USB_UHCI_HCD=y # # CONFIG_USB_ACM is not set CONFIG_USB_PRINTER=y +# CONFIG_USB_WDM is not set # # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' @@ -1712,6 +1728,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_ALAUDA is not set # CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_SIERRA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set CONFIG_USB_LIBUSUAL=y @@ -1720,7 +1737,6 @@ CONFIG_USB_LIBUSUAL=y # # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -CONFIG_USB_MON=y # # USB port drivers @@ -1733,7 +1749,6 @@ CONFIG_USB_MON=y # CONFIG_USB_EMI62 is not set # CONFIG_USB_EMI26 is not set # CONFIG_USB_ADUTUX is not set -# CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set @@ -1750,6 +1765,7 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set +# CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1759,7 +1775,9 @@ CONFIG_LEDS_CLASS=y # # LED drivers # +# CONFIG_LEDS_PCA9532 is not set # CONFIG_LEDS_CLEVO_MAIL is not set +# CONFIG_LEDS_PCA955X is not set # # LED Triggers @@ -1805,6 +1823,7 @@ CONFIG_RTC_INTF_DEV=y # CONFIG_RTC_DRV_PCF8583 is not set # CONFIG_RTC_DRV_M41T80 is not set # CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set # # SPI RTC drivers @@ -1837,11 +1856,13 @@ CONFIG_DMADEVICES=y # Firmware Drivers # # CONFIG_EDD is not set +CONFIG_FIRMWARE_MEMMAP=y CONFIG_EFI_VARS=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y -# CONFIG_ISCSI_IBFT_FIND is not set +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=y # # File systems @@ -1920,14 +1941,27 @@ CONFIG_HUGETLB_PAGE=y # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y -# CONFIG_NFS_FS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y # CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +# CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set @@ -2001,9 +2035,9 @@ CONFIG_NLS_UTF8=y # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_PRINTK_TIME is not set -# CONFIG_ENABLE_WARN_DEPRECATED is not set -# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_PRINTK_TIME=y +CONFIG_ENABLE_WARN_DEPRECATED=y +CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set @@ -2033,6 +2067,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_WRITECOUNT is not set +CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set CONFIG_FRAME_POINTER=y @@ -2043,23 +2078,32 @@ CONFIG_FRAME_POINTER=y # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_HAVE_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +# CONFIG_FTRACE is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SYSPROF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set -# CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set # CONFIG_STRICT_DEVMEM is not set +CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y # CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set -CONFIG_X86_FIND_SMP_CONFIG=y -CONFIG_X86_MPPARSE=y CONFIG_DOUBLEFAULT=y +# CONFIG_MMIOTRACE is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2071,6 +2115,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set +# CONFIG_OPTIMIZE_INLINING is not set # # Security options @@ -2080,7 +2125,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y # CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_CAPABILITIES=y CONFIG_SECURITY_FILE_CAPABILITIES=y # CONFIG_SECURITY_ROOTPLUG is not set CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 @@ -2141,6 +2185,10 @@ CONFIG_CRYPTO_HMAC=y # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y # CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_RMD128 is not set +# CONFIG_CRYPTO_RMD160 is not set +# CONFIG_CRYPTO_RMD256 is not set +# CONFIG_CRYPTO_RMD320 is not set CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set @@ -2151,7 +2199,7 @@ CONFIG_CRYPTO_SHA1=y # Ciphers # CONFIG_CRYPTO_AES=y -# CONFIG_CRYPTO_AES_586 is not set +CONFIG_CRYPTO_AES_586=y # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2193,6 +2241,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set +CONFIG_CRC_T10DIF=y # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC7 is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index a40452429625..678c8acefe04 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,13 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.26-rc1 -# Sun May 4 19:59:57 2008 +# Linux kernel version: 2.6.27-rc4 +# Mon Aug 25 14:40:46 2008 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y -CONFIG_DEFCONFIG_LIST="arch/x86/configs/x86_64_defconfig" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" # CONFIG_GENERIC_LOCKBREAK is not set CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -53,6 +53,7 @@ CONFIG_X86_HT=y CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" # # General setup @@ -82,6 +83,7 @@ CONFIG_CGROUPS=y CONFIG_CGROUP_NS=y # CONFIG_CGROUP_DEVICE is not set CONFIG_CPUSETS=y +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y CONFIG_GROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y # CONFIG_RT_GROUP_SCHED is not set @@ -105,7 +107,6 @@ CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y -CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y @@ -113,6 +114,7 @@ CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y # CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y @@ -132,25 +134,33 @@ CONFIG_MARKERS=y # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y CONFIG_KPROBES=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y CONFIG_KRETPROBES=y +CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y +# CONFIG_HAVE_ARCH_TRACEHOOK is not set # CONFIG_HAVE_DMA_ATTRS is not set +CONFIG_USE_GENERIC_SMP_HELPERS=y +# CONFIG_HAVE_CLK is not set CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y # CONFIG_TINY_SHMEM is not set CONFIG_BASE_SMALL=0 CONFIG_MODULES=y +# CONFIG_MODULE_FORCE_LOAD is not set CONFIG_MODULE_UNLOAD=y CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODVERSIONS is not set # CONFIG_MODULE_SRCVERSION_ALL is not set -# CONFIG_KMOD is not set +CONFIG_KMOD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y +# CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y # @@ -175,20 +185,15 @@ CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y +CONFIG_X86_FIND_SMP_CONFIG=y +CONFIG_X86_MPPARSE=y CONFIG_X86_PC=y # CONFIG_X86_ELAN is not set # CONFIG_X86_VOYAGER is not set -# CONFIG_X86_NUMAQ is not set -# CONFIG_X86_SUMMIT is not set -# CONFIG_X86_BIGSMP is not set -# CONFIG_X86_VISWS is not set # CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_ES7000 is not set -# CONFIG_X86_RDC321X is not set # CONFIG_X86_VSMP is not set # CONFIG_PARAVIRT_GUEST is not set -CONFIG_MEMTEST_BOOTPARAM=y -CONFIG_MEMTEST_BOOTPARAM_VALUE=0 +# CONFIG_MEMTEST is not set # CONFIG_M386 is not set # CONFIG_M486 is not set # CONFIG_M586 is not set @@ -220,11 +225,12 @@ CONFIG_X86_L1_CACHE_BYTES=64 CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y CONFIG_X86_L1_CACHE_SHIFT=6 -CONFIG_X86_GOOD_APIC=y +CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y @@ -234,8 +240,10 @@ CONFIG_DMI=y CONFIG_GART_IOMMU=y CONFIG_CALGARY_IOMMU=y CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y +CONFIG_AMD_IOMMU=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y +# CONFIG_MAXSMP is not set CONFIG_NR_CPUS=4 # CONFIG_SCHED_SMT is not set CONFIG_SCHED_MC=y @@ -281,6 +289,7 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_MTRR=y +# CONFIG_MTRR_SANITIZER is not set # CONFIG_X86_PAT is not set CONFIG_EFI=y CONFIG_SECCOMP=y @@ -313,6 +322,7 @@ CONFIG_PM_TRACE_RTC=y CONFIG_PM_SLEEP_SMP=y CONFIG_PM_SLEEP=y CONFIG_SUSPEND=y +# CONFIG_PM_TEST_SUSPEND is not set CONFIG_SUSPEND_FREEZER=y CONFIG_HIBERNATION=y CONFIG_PM_STD_PARTITION="" @@ -339,6 +349,7 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set CONFIG_ACPI_EC=y +# CONFIG_ACPI_PCI_SLOT is not set CONFIG_ACPI_POWER=y CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y @@ -437,10 +448,6 @@ CONFIG_IA32_EMULATION=y CONFIG_COMPAT=y CONFIG_COMPAT_FOR_U64_ALIGNMENT=y CONFIG_SYSVIPC_COMPAT=y - -# -# Networking -# CONFIG_NET=y # @@ -464,7 +471,10 @@ CONFIG_IP_FIB_HASH=y CONFIG_IP_MULTIPLE_TABLES=y CONFIG_IP_ROUTE_MULTIPATH=y CONFIG_IP_ROUTE_VERBOSE=y -# CONFIG_IP_PNP is not set +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y # CONFIG_NET_IPIP is not set # CONFIG_NET_IPGRE is not set CONFIG_IP_MROUTE=y @@ -607,7 +617,6 @@ CONFIG_NET_SCHED=y # CONFIG_NET_SCH_HTB is not set # CONFIG_NET_SCH_HFSC is not set # CONFIG_NET_SCH_PRIO is not set -# CONFIG_NET_SCH_RR is not set # CONFIG_NET_SCH_RED is not set # CONFIG_NET_SCH_SFQ is not set # CONFIG_NET_SCH_TEQL is not set @@ -669,28 +678,19 @@ CONFIG_FIB_RULES=y CONFIG_CFG80211=y CONFIG_NL80211=y CONFIG_WIRELESS_EXT=y +CONFIG_WIRELESS_EXT_SYSFS=y CONFIG_MAC80211=y # # Rate control algorithm selection # +CONFIG_MAC80211_RC_PID=y CONFIG_MAC80211_RC_DEFAULT_PID=y -# CONFIG_MAC80211_RC_DEFAULT_NONE is not set - -# -# Selecting 'y' for an algorithm will -# - -# -# build the algorithm into mac80211. -# CONFIG_MAC80211_RC_DEFAULT="pid" -CONFIG_MAC80211_RC_PID=y # CONFIG_MAC80211_MESH is not set CONFIG_MAC80211_LEDS=y # CONFIG_MAC80211_DEBUGFS is not set -# CONFIG_MAC80211_DEBUG_PACKET_ALIGNMENT is not set -# CONFIG_MAC80211_DEBUG is not set +# CONFIG_MAC80211_DEBUG_MENU is not set # CONFIG_IEEE80211 is not set # CONFIG_RFKILL is not set # CONFIG_NET_9P is not set @@ -706,6 +706,8 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_STANDALONE=y CONFIG_PREVENT_FIRMWARE_BUILD=y CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" # CONFIG_DEBUG_DRIVER is not set CONFIG_DEBUG_DEVRES=y # CONFIG_SYS_HYPERVISOR is not set @@ -738,6 +740,7 @@ CONFIG_BLK_DEV_RAM_SIZE=16384 # CONFIG_BLK_DEV_XIP is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set +# CONFIG_BLK_DEV_HD is not set CONFIG_MISC_DEVICES=y # CONFIG_IBM_ASM is not set # CONFIG_PHANTOM is not set @@ -748,10 +751,14 @@ CONFIG_MISC_DEVICES=y # CONFIG_ASUS_LAPTOP is not set # CONFIG_FUJITSU_LAPTOP is not set # CONFIG_MSI_LAPTOP is not set +# CONFIG_COMPAL_LAPTOP is not set # CONFIG_SONY_LAPTOP is not set # CONFIG_THINKPAD_ACPI is not set # CONFIG_INTEL_MENLOW is not set # CONFIG_ENCLOSURE_SERVICES is not set +# CONFIG_SGI_XP is not set +# CONFIG_HP_ILO is not set +# CONFIG_SGI_GRU is not set CONFIG_HAVE_IDE=y # CONFIG_IDE is not set @@ -790,12 +797,13 @@ CONFIG_SCSI_WAIT_SCAN=m # CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_FC_ATTRS is not set -# CONFIG_SCSI_ISCSI_ATTRS is not set +CONFIG_SCSI_ISCSI_ATTRS=y # CONFIG_SCSI_SAS_ATTRS is not set # CONFIG_SCSI_SAS_LIBSAS is not set # CONFIG_SCSI_SRP_ATTRS is not set # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set +# CONFIG_SCSI_DH is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -857,6 +865,7 @@ CONFIG_PATA_OLDPIIX=y # CONFIG_PATA_SIS is not set # CONFIG_PATA_VIA is not set # CONFIG_PATA_WINBOND is not set +CONFIG_PATA_SCH=y CONFIG_MD=y CONFIG_BLK_DEV_MD=y # CONFIG_MD_LINEAR is not set @@ -880,13 +889,16 @@ CONFIG_DM_ZERO=y # # IEEE 1394 (FireWire) support # + +# +# Enable only one of the two stacks, unless you know what you are doing +# # CONFIG_FIREWIRE is not set # CONFIG_IEEE1394 is not set # CONFIG_I2O is not set CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y -# CONFIG_NETDEVICES_MULTIQUEUE is not set # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -896,7 +908,23 @@ CONFIG_NETDEVICES=y # CONFIG_VETH is not set # CONFIG_NET_SB1000 is not set # CONFIG_ARCNET is not set -# CONFIG_PHYLIB is not set +CONFIG_PHYLIB=y + +# +# MII PHY device drivers +# +# CONFIG_MARVELL_PHY is not set +# CONFIG_DAVICOM_PHY is not set +# CONFIG_QSEMI_PHY is not set +# CONFIG_LXT_PHY is not set +# CONFIG_CICADA_PHY is not set +# CONFIG_VITESSE_PHY is not set +# CONFIG_SMSC_PHY is not set +# CONFIG_BROADCOM_PHY is not set +# CONFIG_ICPLUS_PHY is not set +# CONFIG_REALTEK_PHY is not set +# CONFIG_FIXED_PHY is not set +# CONFIG_MDIO_BITBANG is not set CONFIG_NET_ETHERNET=y CONFIG_MII=y # CONFIG_HAPPYMEAL is not set @@ -940,16 +968,15 @@ CONFIG_8139TOO_PIO=y # CONFIG_SIS900 is not set # CONFIG_EPIC100 is not set # CONFIG_SUNDANCE is not set +# CONFIG_TLAN is not set # CONFIG_VIA_RHINE is not set # CONFIG_SC92031 is not set CONFIG_NETDEV_1000=y # CONFIG_ACENIC is not set # CONFIG_DL2K is not set CONFIG_E1000=y -# CONFIG_E1000_NAPI is not set # CONFIG_E1000_DISABLE_PACKET_SPLIT is not set # CONFIG_E1000E is not set -# CONFIG_E1000E_ENABLED is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set # CONFIG_NS83820 is not set @@ -965,6 +992,7 @@ CONFIG_TIGON3=y # CONFIG_BNX2 is not set # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set +# CONFIG_ATL1E is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set # CONFIG_CHELSIO_T3 is not set @@ -1003,13 +1031,14 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8180 is not set # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set +# CONFIG_MAC80211_HWSIM is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set -# CONFIG_IWLWIFI is not set +# CONFIG_ATH9K is not set # CONFIG_IWLCORE is not set # CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWL4965 is not set +# CONFIG_IWLAGN is not set # CONFIG_IWL3945 is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set @@ -1088,6 +1117,7 @@ CONFIG_MOUSE_PS2_TRACKPOINT=y # CONFIG_MOUSE_PS2_TOUCHKIT is not set # CONFIG_MOUSE_SERIAL is not set # CONFIG_MOUSE_APPLETOUCH is not set +# CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set CONFIG_INPUT_JOYSTICK=y # CONFIG_JOYSTICK_ANALOG is not set @@ -1122,12 +1152,14 @@ CONFIG_INPUT_TOUCHSCREEN=y # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set # CONFIG_TOUCHSCREEN_MTOUCH is not set +# CONFIG_TOUCHSCREEN_INEXIO is not set # CONFIG_TOUCHSCREEN_MK712 is not set # CONFIG_TOUCHSCREEN_PENMOUNT is not set # CONFIG_TOUCHSCREEN_TOUCHRIGHT is not set # CONFIG_TOUCHSCREEN_TOUCHWIN is not set # CONFIG_TOUCHSCREEN_UCB1400 is not set # CONFIG_TOUCHSCREEN_USB_COMPOSITE is not set +# CONFIG_TOUCHSCREEN_TOUCHIT213 is not set CONFIG_INPUT_MISC=y # CONFIG_INPUT_PCSPKR is not set # CONFIG_INPUT_APANEL is not set @@ -1155,6 +1187,7 @@ CONFIG_SERIO_LIBPS2=y # Character devices # CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y CONFIG_VT_CONSOLE=y CONFIG_HW_CONSOLE=y CONFIG_VT_HW_CONSOLE_BINDING=y @@ -1222,7 +1255,6 @@ CONFIG_NVRAM=y # CONFIG_PC8736x_GPIO is not set # CONFIG_RAW_DRIVER is not set CONFIG_HPET=y -# CONFIG_HPET_RTC_IRQ is not set # CONFIG_HPET_MMAP is not set # CONFIG_HANGCHECK_TIMER is not set # CONFIG_TCG_TPM is not set @@ -1231,42 +1263,63 @@ CONFIG_DEVPORT=y CONFIG_I2C=y CONFIG_I2C_BOARDINFO=y # CONFIG_I2C_CHARDEV is not set +CONFIG_I2C_HELPER_AUTO=y # # I2C Hardware Bus support # + +# +# PC SMBus host controller drivers +# # CONFIG_I2C_ALI1535 is not set # CONFIG_I2C_ALI1563 is not set # CONFIG_I2C_ALI15X3 is not set # CONFIG_I2C_AMD756 is not set # CONFIG_I2C_AMD8111 is not set CONFIG_I2C_I801=y -# CONFIG_I2C_I810 is not set +# CONFIG_I2C_ISCH is not set # CONFIG_I2C_PIIX4 is not set # CONFIG_I2C_NFORCE2 is not set -# CONFIG_I2C_OCORES is not set -# CONFIG_I2C_PARPORT_LIGHT is not set -# CONFIG_I2C_PROSAVAGE is not set -# CONFIG_I2C_SAVAGE4 is not set -# CONFIG_I2C_SIMTEC is not set # CONFIG_I2C_SIS5595 is not set # CONFIG_I2C_SIS630 is not set # CONFIG_I2C_SIS96X is not set -# CONFIG_I2C_TAOS_EVM is not set -# CONFIG_I2C_STUB is not set -# CONFIG_I2C_TINY_USB is not set # CONFIG_I2C_VIA is not set # CONFIG_I2C_VIAPRO is not set + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set +# CONFIG_I2C_TINY_USB is not set + +# +# Graphics adapter I2C/DDC channel drivers +# # CONFIG_I2C_VOODOO3 is not set + +# +# Other I2C/SMBus bus drivers +# # CONFIG_I2C_PCA_PLATFORM is not set +# CONFIG_I2C_STUB is not set # # Miscellaneous I2C Chip support # # CONFIG_DS1682 is not set +# CONFIG_AT24 is not set # CONFIG_SENSORS_EEPROM is not set # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set # CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set @@ -1275,6 +1328,8 @@ CONFIG_I2C_I801=y # CONFIG_I2C_DEBUG_BUS is not set # CONFIG_I2C_DEBUG_CHIP is not set # CONFIG_SPI is not set +CONFIG_ARCH_WANT_OPTIONAL_GPIOLIB=y +# CONFIG_GPIOLIB is not set # CONFIG_W1 is not set CONFIG_POWER_SUPPLY=y # CONFIG_POWER_SUPPLY_DEBUG is not set @@ -1335,8 +1390,10 @@ CONFIG_SSB_POSSIBLE=y # # Multifunction device drivers # +# CONFIG_MFD_CORE is not set # CONFIG_MFD_SM501 is not set # CONFIG_HTC_PASIC3 is not set +# CONFIG_MFD_TMIO is not set # # Multimedia devices @@ -1347,6 +1404,7 @@ CONFIG_SSB_POSSIBLE=y # # CONFIG_VIDEO_DEV is not set # CONFIG_DVB_CORE is not set +# CONFIG_VIDEO_MEDIA is not set # # Multimedia drivers @@ -1387,7 +1445,6 @@ CONFIG_FB_CFB_IMAGEBLIT=y # CONFIG_FB_SYS_IMAGEBLIT is not set # CONFIG_FB_FOREIGN_ENDIAN is not set # CONFIG_FB_SYS_FOPS is not set -CONFIG_FB_DEFERRED_IO=y # CONFIG_FB_SVGALIB is not set # CONFIG_FB_MACMODES is not set # CONFIG_FB_BACKLIGHT is not set @@ -1430,6 +1487,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set # CONFIG_FB_GEODE is not set # CONFIG_FB_VIRTUAL is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y @@ -1437,6 +1495,7 @@ CONFIG_BACKLIGHT_LCD_SUPPORT=y CONFIG_BACKLIGHT_CLASS_DEVICE=y # CONFIG_BACKLIGHT_CORGI is not set # CONFIG_BACKLIGHT_PROGEAR is not set +# CONFIG_BACKLIGHT_MBP_NVIDIA is not set # # Display device support @@ -1456,15 +1515,7 @@ CONFIG_LOGO=y # CONFIG_LOGO_LINUX_MONO is not set # CONFIG_LOGO_LINUX_VGA16 is not set CONFIG_LOGO_LINUX_CLUT224=y - -# -# Sound -# CONFIG_SOUND=y - -# -# Advanced Linux Sound Architecture -# CONFIG_SND=y CONFIG_SND_TIMER=y CONFIG_SND_PCM=y @@ -1482,20 +1533,14 @@ CONFIG_SND_VERBOSE_PROCFS=y # CONFIG_SND_VERBOSE_PRINTK is not set # CONFIG_SND_DEBUG is not set CONFIG_SND_VMASTER=y - -# -# Generic devices -# +CONFIG_SND_DRIVERS=y # CONFIG_SND_PCSP is not set # CONFIG_SND_DUMMY is not set # CONFIG_SND_VIRMIDI is not set # CONFIG_SND_MTPAV is not set # CONFIG_SND_SERIAL_U16550 is not set # CONFIG_SND_MPU401 is not set - -# -# PCI devices -# +CONFIG_SND_PCI=y # CONFIG_SND_AD1889 is not set # CONFIG_SND_ALS300 is not set # CONFIG_SND_ALS4000 is not set @@ -1568,36 +1613,14 @@ CONFIG_SND_HDA_GENERIC=y # CONFIG_SND_VIRTUOSO is not set # CONFIG_SND_VX222 is not set # CONFIG_SND_YMFPCI is not set - -# -# USB devices -# +CONFIG_SND_USB=y # CONFIG_SND_USB_AUDIO is not set # CONFIG_SND_USB_USX2Y is not set # CONFIG_SND_USB_CAIAQ is not set - -# -# PCMCIA devices -# +CONFIG_SND_PCMCIA=y # CONFIG_SND_VXPOCKET is not set # CONFIG_SND_PDAUDIOCF is not set - -# -# System on Chip audio support -# # CONFIG_SND_SOC is not set - -# -# ALSA SoC audio for Freescale SOCs -# - -# -# SoC Audio for the Texas Instruments OMAP -# - -# -# Open Sound System -# # CONFIG_SOUND_PRIME is not set CONFIG_HID_SUPPORT=y CONFIG_HID=y @@ -1633,6 +1656,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_DYNAMIC_MINORS is not set CONFIG_USB_SUSPEND=y # CONFIG_USB_OTG is not set +CONFIG_USB_MON=y # # USB Host Controller Drivers @@ -1656,6 +1680,7 @@ CONFIG_USB_UHCI_HCD=y # # CONFIG_USB_ACM is not set CONFIG_USB_PRINTER=y +# CONFIG_USB_WDM is not set # # NOTE: USB_STORAGE enables SCSI, and 'SCSI disk support' @@ -1677,6 +1702,7 @@ CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_ALAUDA is not set # CONFIG_USB_STORAGE_ONETOUCH is not set # CONFIG_USB_STORAGE_KARMA is not set +# CONFIG_USB_STORAGE_SIERRA is not set # CONFIG_USB_STORAGE_CYPRESS_ATACB is not set CONFIG_USB_LIBUSUAL=y @@ -1685,7 +1711,6 @@ CONFIG_USB_LIBUSUAL=y # # CONFIG_USB_MDC800 is not set # CONFIG_USB_MICROTEK is not set -CONFIG_USB_MON=y # # USB port drivers @@ -1698,7 +1723,6 @@ CONFIG_USB_MON=y # CONFIG_USB_EMI62 is not set # CONFIG_USB_EMI26 is not set # CONFIG_USB_ADUTUX is not set -# CONFIG_USB_AUERSWALD is not set # CONFIG_USB_RIO500 is not set # CONFIG_USB_LEGOTOWER is not set # CONFIG_USB_LCD is not set @@ -1715,6 +1739,7 @@ CONFIG_USB_MON=y # CONFIG_USB_TRANCEVIBRATOR is not set # CONFIG_USB_IOWARRIOR is not set # CONFIG_USB_TEST is not set +# CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_GADGET is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1724,7 +1749,9 @@ CONFIG_LEDS_CLASS=y # # LED drivers # +# CONFIG_LEDS_PCA9532 is not set # CONFIG_LEDS_CLEVO_MAIL is not set +# CONFIG_LEDS_PCA955X is not set # # LED Triggers @@ -1770,6 +1797,7 @@ CONFIG_RTC_INTF_DEV=y # CONFIG_RTC_DRV_PCF8583 is not set # CONFIG_RTC_DRV_M41T80 is not set # CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set # # SPI RTC drivers @@ -1802,11 +1830,13 @@ CONFIG_DMADEVICES=y # Firmware Drivers # # CONFIG_EDD is not set +CONFIG_FIRMWARE_MEMMAP=y CONFIG_EFI_VARS=y # CONFIG_DELL_RBU is not set # CONFIG_DCDBAS is not set CONFIG_DMIID=y -# CONFIG_ISCSI_IBFT_FIND is not set +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=y # # File systems @@ -1886,14 +1916,27 @@ CONFIG_HUGETLB_PAGE=y # CONFIG_CRAMFS is not set # CONFIG_VXFS_FS is not set # CONFIG_MINIX_FS is not set +# CONFIG_OMFS_FS is not set # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set CONFIG_NETWORK_FILESYSTEMS=y -# CONFIG_NFS_FS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V3=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y # CONFIG_NFSD is not set +CONFIG_LOCKD=y +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=y +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=y +CONFIG_SUNRPC_GSS=y +CONFIG_RPCSEC_GSS_KRB5=y +# CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set # CONFIG_CIFS is not set # CONFIG_NCP_FS is not set @@ -1967,9 +2010,9 @@ CONFIG_NLS_UTF8=y # Kernel hacking # CONFIG_TRACE_IRQFLAGS_SUPPORT=y -# CONFIG_PRINTK_TIME is not set -# CONFIG_ENABLE_WARN_DEPRECATED is not set -# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_PRINTK_TIME=y +CONFIG_ENABLE_WARN_DEPRECATED=y +CONFIG_ENABLE_MUST_CHECK=y CONFIG_FRAME_WARN=2048 CONFIG_MAGIC_SYSRQ=y # CONFIG_UNUSED_SYMBOLS is not set @@ -1998,6 +2041,7 @@ CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set # CONFIG_DEBUG_VM is not set # CONFIG_DEBUG_WRITECOUNT is not set +CONFIG_DEBUG_MEMORY_INIT=y # CONFIG_DEBUG_LIST is not set # CONFIG_DEBUG_SG is not set CONFIG_FRAME_POINTER=y @@ -2008,11 +2052,20 @@ CONFIG_FRAME_POINTER=y # CONFIG_LKDTM is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set +CONFIG_SYSCTL_SYSCALL_CHECK=y +CONFIG_HAVE_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +# CONFIG_FTRACE is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SYSPROF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y # CONFIG_SAMPLES is not set -# CONFIG_KGDB is not set CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set # CONFIG_STRICT_DEVMEM is not set +CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_EARLY_PRINTK=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y @@ -2023,8 +2076,8 @@ CONFIG_DEBUG_RODATA=y # CONFIG_DIRECT_GBPAGES is not set # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m -CONFIG_X86_MPPARSE=y # CONFIG_IOMMU_DEBUG is not set +# CONFIG_MMIOTRACE is not set CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2036,6 +2089,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set +# CONFIG_OPTIMIZE_INLINING is not set # # Security options @@ -2045,7 +2099,6 @@ CONFIG_KEYS_DEBUG_PROC_KEYS=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y # CONFIG_SECURITY_NETWORK_XFRM is not set -CONFIG_SECURITY_CAPABILITIES=y CONFIG_SECURITY_FILE_CAPABILITIES=y # CONFIG_SECURITY_ROOTPLUG is not set CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR=65536 @@ -2106,6 +2159,10 @@ CONFIG_CRYPTO_HMAC=y # CONFIG_CRYPTO_MD4 is not set CONFIG_CRYPTO_MD5=y # CONFIG_CRYPTO_MICHAEL_MIC is not set +# CONFIG_CRYPTO_RMD128 is not set +# CONFIG_CRYPTO_RMD160 is not set +# CONFIG_CRYPTO_RMD256 is not set +# CONFIG_CRYPTO_RMD320 is not set CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_SHA256 is not set # CONFIG_CRYPTO_SHA512 is not set @@ -2155,6 +2212,7 @@ CONFIG_GENERIC_FIND_FIRST_BIT=y CONFIG_GENERIC_FIND_NEXT_BIT=y # CONFIG_CRC_CCITT is not set # CONFIG_CRC16 is not set +CONFIG_CRC_T10DIF=y # CONFIG_CRC_ITU_T is not set CONFIG_CRC32=y # CONFIG_CRC7 is not set -- cgit v1.2.2 From a81726087428b541aa64604b8a94104a4d4aa8f9 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 26 Aug 2008 15:13:45 -0700 Subject: x86: acpi: move acpi_mcfg_64bit_base_addr into CONFIG_PCI_MMCONFIG acpi_mcfg_64bit_base_addr is used when CONFIG_PCI_MMCONFIG is enabled. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index f20470823d38..e5032d7b391d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -96,8 +96,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; - /* -------------------------------------------------------------------------- Boot-time Configuration -------------------------------------------------------------------------- */ @@ -159,6 +157,8 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg) { if (!strcmp(mcfg->header.oem_id, "SGI")) -- cgit v1.2.2 From 11c231a962c740b3216eb6565149ae5a7944cba7 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Sat, 23 Aug 2008 17:47:11 +0200 Subject: x86: use x2apic id reported by cpuid during topology discovery, fix v2: Fix for !SMP build Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/addon_cpuid_features.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c index aa9641ae6703..a2d1a545767d 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/addon_cpuid_features.c @@ -69,6 +69,7 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) */ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) { +#ifdef CONFIG_SMP unsigned int eax, ebx, ecx, edx, sub_index; unsigned int ht_mask_width, core_plus_mask_width; unsigned int core_select_mask, core_level_siblings; @@ -132,6 +133,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); return; +#endif } #ifdef CONFIG_X86_PAT -- cgit v1.2.2 From 83b8e28b14d63db928cb39e5c5ed2a548246bd71 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 27 Aug 2008 14:57:36 -0700 Subject: x86: xsave: restore xcr0 during resume Add the missing XCR0(XFEATURE_ENABLED_MASK) restore during resume. Reported-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/power/cpu_32.c | 7 +++++++ arch/x86/power/cpu_64.c | 7 +++++++ 2 files changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c index 7dc5d5cf50a2..02f36f53558c 100644 --- a/arch/x86/power/cpu_32.c +++ b/arch/x86/power/cpu_32.c @@ -11,6 +11,7 @@ #include #include #include +#include static struct saved_context saved_context; @@ -124,6 +125,12 @@ static void __restore_processor_state(struct saved_context *ctxt) if (boot_cpu_has(X86_FEATURE_SEP)) enable_sep_cpu(); + /* + * restore XCR0 for xsave capable cpu's. + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + fix_processor_context(); do_fpu_end(); mtrr_ap_init(); diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c index 66bdfb591fd8..e3b6cf70d62c 100644 --- a/arch/x86/power/cpu_64.c +++ b/arch/x86/power/cpu_64.c @@ -14,6 +14,7 @@ #include #include #include +#include static void fix_processor_context(void); @@ -122,6 +123,12 @@ static void __restore_processor_state(struct saved_context *ctxt) wrmsrl(MSR_GS_BASE, ctxt->gs_base); wrmsrl(MSR_KERNEL_GS_BASE, ctxt->gs_kernel_base); + /* + * restore XCR0 for xsave capable cpu's. + */ + if (cpu_has_xsave) + xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); + fix_processor_context(); do_fpu_end(); -- cgit v1.2.2 From 7414aa41a63348c3bc72d8c37b716024c29b6d50 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 27 Aug 2008 17:56:44 -0700 Subject: x86: generate names for /proc/cpuinfo from We have had a number of cases where (and its predecessors) have diverged substantially from the names list in /proc/cpuinfo. This patch generates the latter from the former. It retains the option for explicitly overriding the strings, but by making that require a separate action it should at least be less likely to happen. It would be good to do a future pass and rename strings that are gratuituously different in the kernel (/proc/cpuinfo is a userspace interface and must remain constant.) Signed-off-by: H. Peter Anvin --- arch/x86/boot/mkcpustr.c | 2 +- arch/x86/kernel/cpu/Makefile | 11 ++++- arch/x86/kernel/cpu/feature_names.c | 84 ------------------------------------- arch/x86/kernel/cpu/mkcapflags.pl | 32 ++++++++++++++ arch/x86/kernel/cpu/powerflags.c | 20 +++++++++ 5 files changed, 63 insertions(+), 86 deletions(-) delete mode 100644 arch/x86/kernel/cpu/feature_names.c create mode 100644 arch/x86/kernel/cpu/mkcapflags.pl create mode 100644 arch/x86/kernel/cpu/powerflags.c (limited to 'arch/x86') diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index bbe76953bae9..4589caa3e9d1 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -15,7 +15,7 @@ #include -#include "../kernel/cpu/feature_names.c" +#include "../kernel/cpu/capflags.c" #if NCAPFLAGS > 8 # error "Need to adjust the boot code handling of CPUID strings" diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index a0fc6c144384..3ede19a4e0b2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,7 +3,7 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o -obj-y += proc.o feature_names.o +obj-y += proc.o capflags.o powerflags.o obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += common_64.o bugs_64.o @@ -23,3 +23,12 @@ obj-$(CONFIG_MTRR) += mtrr/ obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o + +quiet_cmd_mkcapflags = MKCAP $@ + cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ + +cpufeature = $(src)/../../../../include/asm-x86/cpufeature.h + +targets += capflags.c +$(obj)/capflags.c: $(cpufeature) $(src)/mkcapflags.pl FORCE + $(call if_changed,mkcapflags) diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c deleted file mode 100644 index c9017799497c..000000000000 --- a/arch/x86/kernel/cpu/feature_names.c +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Strings for the various x86 capability flags. - * - * This file must not contain any executable code. - */ - -#include - -/* - * These flag bits must match the definitions in . - * NULL means this bit is undefined or reserved; either way it doesn't - * have meaning as far as Linux is concerned. Note that it's important - * to realize there is a difference between this table and CPUID -- if - * applications want to get the raw CPUID data, they should access - * /dev/cpu//cpuid instead. - */ -const char * const x86_cap_flags[NCAPINTS*32] = { - /* Intel-defined */ - "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", - "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", - "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", - - /* AMD-defined */ - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "syscall", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, "mp", "nx", NULL, "mmxext", NULL, - NULL, "fxsr_opt", "pdpe1gb", "rdtscp", NULL, "lm", - "3dnowext", "3dnow", - - /* Transmeta-defined */ - "recovery", "longrun", NULL, "lrti", NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Other (Linux-defined) */ - "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", - NULL, NULL, NULL, NULL, - "constant_tsc", "up", NULL, "arch_perfmon", - "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, - "nopl", NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Intel-defined (#2) */ - "pni", NULL, NULL, "monitor", "ds_cpl", "vmx", "smx", "est", - "tm2", "ssse3", "cid", NULL, NULL, "cx16", "xtpr", NULL, - NULL, NULL, "dca", "sse4_1", "sse4_2", NULL, NULL, "popcnt", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* VIA/Cyrix/Centaur-defined */ - NULL, NULL, "rng", "rng_en", NULL, NULL, "ace", "ace_en", - "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* AMD-defined (#2) */ - "lahf_lm", "cmp_legacy", "svm", "extapic", - "cr8_legacy", "abm", "sse4a", "misalignsse", - "3dnowprefetch", "osvw", "ibs", "sse5", - "skinit", "wdt", NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - - /* Auxiliary (Linux-defined) */ - "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, -}; - -const char *const x86_power_flags[32] = { - "ts", /* temperature sensor */ - "fid", /* frequency id control */ - "vid", /* voltage id control */ - "ttp", /* thermal trip */ - "tm", - "stc", - "100mhzsteps", - "hwpstate", - "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ -}; diff --git a/arch/x86/kernel/cpu/mkcapflags.pl b/arch/x86/kernel/cpu/mkcapflags.pl new file mode 100644 index 000000000000..dfea390e1608 --- /dev/null +++ b/arch/x86/kernel/cpu/mkcapflags.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl +# +# Generate the x86_cap_flags[] array from include/asm-x86/cpufeature.h +# + +($in, $out) = @ARGV; + +open(IN, "< $in\0") or die "$0: cannot open: $in: $!\n"; +open(OUT, "> $out\0") or die "$0: cannot create: $out: $!\n"; + +print OUT "#include \n\n"; +print OUT "const char * const x86_cap_flags[NCAPINTS*32] = {\n"; + +while (defined($line = )) { + if ($line =~ /^\s*\#\s*define\s+(X86_FEATURE_(\S+))\s+(.*)$/) { + $macro = $1; + $feature = $2; + $tail = $3; + if ($tail =~ /\/\*\s*\"([^"]*)\".*\*\//) { + $feature = $1; + } + + if ($feature ne '') { + printf OUT "\t%-32s = \"%s\",\n", + "[$macro]", "\L$feature"; + } + } +} +print OUT "};\n"; + +close(IN); +close(OUT); diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c new file mode 100644 index 000000000000..5abbea297e0c --- /dev/null +++ b/arch/x86/kernel/cpu/powerflags.c @@ -0,0 +1,20 @@ +/* + * Strings for the various x86 power flags + * + * This file must not contain any executable code. + */ + +#include + +const char *const x86_power_flags[32] = { + "ts", /* temperature sensor */ + "fid", /* frequency id control */ + "vid", /* voltage id control */ + "ttp", /* thermal trip */ + "tm", + "stc", + "100mhzsteps", + "hwpstate", + "", /* tsc invariant mapped to constant_tsc */ + /* nothing */ +}; -- cgit v1.2.2 From 2c7e9fd4c6cb7f4b0bc7162e9a30847e51a1ca1b Mon Sep 17 00:00:00 2001 From: Joe Korty Date: Wed, 27 Aug 2008 10:35:06 -0400 Subject: x86: make poll_idle behave more like the other idle methods Make poll_idle() behave more like the other idle methods. Currently, poll_idle() returns immediately. The other idle methods all wait indefinately for some condition to come true before returning. poll_idle should emulate these other methods and also wait for a return condition, in this case, for need_resched() to become 'true'. Without this delay the idle loop spends all of its time in the outer loop that calls poll_idle. This outer loop, these days, does real work, some of it under rcu locks. That work should only be done when idle is entered and when idle exits, not continuously while idle is spinning. Signed-off-by: Joe Korty Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..4e09d26748cf 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -185,7 +185,8 @@ static void mwait_idle(void) static void poll_idle(void) { local_irq_enable(); - cpu_relax(); + while (!need_resched()) + cpu_relax(); } /* -- cgit v1.2.2 From 8cb51ba8e06570a5fff674b3744d12a1b089f2d0 Mon Sep 17 00:00:00 2001 From: Austin Zhang Date: Thu, 7 Aug 2008 09:57:03 +0800 Subject: crypto: crc32c - Use Intel CRC32 instruction From NHM processor onward, Intel processors can support hardware accelerated CRC32c algorithm with the new CRC32 instruction in SSE 4.2 instruction set. The patch detects the availability of the feature, and chooses the most proper way to calculate CRC32c checksum. Byte code instructions are used for compiler compatibility. No MMX / XMM registers is involved in the implementation. Signed-off-by: Austin Zhang Signed-off-by: Kent Liu Signed-off-by: Herbert Xu --- arch/x86/crypto/Makefile | 2 + arch/x86/crypto/crc32c-intel.c | 197 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 199 insertions(+) create mode 100644 arch/x86/crypto/crc32c-intel.c (limited to 'arch/x86') diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 3874c2de5403..903de4aa5094 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -10,6 +10,8 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o +obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o + aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o diff --git a/arch/x86/crypto/crc32c-intel.c b/arch/x86/crypto/crc32c-intel.c new file mode 100644 index 000000000000..070afc5b6c94 --- /dev/null +++ b/arch/x86/crypto/crc32c-intel.c @@ -0,0 +1,197 @@ +/* + * Using hardware provided CRC32 instruction to accelerate the CRC32 disposal. + * CRC32C polynomial:0x1EDC6F41(BE)/0x82F63B78(LE) + * CRC32 is a new instruction in Intel SSE4.2, the reference can be found at: + * http://www.intel.com/products/processor/manuals/ + * Intel(R) 64 and IA-32 Architectures Software Developer's Manual + * Volume 2A: Instruction Set Reference, A-M + * + * Copyright (c) 2008 Austin Zhang + * Copyright (c) 2008 Kent Liu + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ +#include +#include +#include +#include +#include + +#include + +#define CHKSUM_BLOCK_SIZE 1 +#define CHKSUM_DIGEST_SIZE 4 + +#define SCALE_F sizeof(unsigned long) + +#ifdef CONFIG_X86_64 +#define REX_PRE "0x48, " +#else +#define REX_PRE +#endif + +static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) +{ + while (length--) { + __asm__ __volatile__( + ".byte 0xf2, 0xf, 0x38, 0xf0, 0xf1" + :"=S"(crc) + :"0"(crc), "c"(*data) + ); + data++; + } + + return crc; +} + +static u32 __pure crc32c_intel_le_hw(u32 crc, unsigned char const *p, size_t len) +{ + unsigned int iquotient = len / SCALE_F; + unsigned int iremainder = len % SCALE_F; + unsigned long *ptmp = (unsigned long *)p; + + while (iquotient--) { + __asm__ __volatile__( + ".byte 0xf2, " REX_PRE "0xf, 0x38, 0xf1, 0xf1;" + :"=S"(crc) + :"0"(crc), "c"(*ptmp) + ); + ptmp++; + } + + if (iremainder) + crc = crc32c_intel_le_hw_byte(crc, (unsigned char *)ptmp, + iremainder); + + return crc; +} + +/* + * Setting the seed allows arbitrary accumulators and flexible XOR policy + * If your algorithm starts with ~0, then XOR with ~0 before you set + * the seed. + */ +static int crc32c_intel_setkey(struct crypto_ahash *hash, const u8 *key, + unsigned int keylen) +{ + u32 *mctx = crypto_ahash_ctx(hash); + + if (keylen != sizeof(u32)) { + crypto_ahash_set_flags(hash, CRYPTO_TFM_RES_BAD_KEY_LEN); + return -EINVAL; + } + *mctx = le32_to_cpup((__le32 *)key); + return 0; +} + +static int crc32c_intel_init(struct ahash_request *req) +{ + u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); + u32 *crcp = ahash_request_ctx(req); + + *crcp = *mctx; + + return 0; +} + +static int crc32c_intel_update(struct ahash_request *req) +{ + struct crypto_hash_walk walk; + u32 *crcp = ahash_request_ctx(req); + u32 crc = *crcp; + int nbytes; + + for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; + nbytes = crypto_hash_walk_done(&walk, 0)) + crc = crc32c_intel_le_hw(crc, walk.data, nbytes); + + *crcp = crc; + return 0; +} + +static int crc32c_intel_final(struct ahash_request *req) +{ + u32 *crcp = ahash_request_ctx(req); + + *(__le32 *)req->result = ~cpu_to_le32p(crcp); + return 0; +} + +static int crc32c_intel_digest(struct ahash_request *req) +{ + struct crypto_hash_walk walk; + u32 *mctx = crypto_ahash_ctx(crypto_ahash_reqtfm(req)); + u32 crc = *mctx; + int nbytes; + + for (nbytes = crypto_hash_walk_first(req, &walk); nbytes; + nbytes = crypto_hash_walk_done(&walk, 0)) + crc = crc32c_intel_le_hw(crc, walk.data, nbytes); + + *(__le32 *)req->result = ~cpu_to_le32(crc); + return 0; +} + +static int crc32c_intel_cra_init(struct crypto_tfm *tfm) +{ + u32 *key = crypto_tfm_ctx(tfm); + + *key = ~0; + + tfm->crt_ahash.reqsize = sizeof(u32); + + return 0; +} + +static struct crypto_alg alg = { + .cra_name = "crc32c", + .cra_driver_name = "crc32c-intel", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_AHASH, + .cra_blocksize = CHKSUM_BLOCK_SIZE, + .cra_alignmask = 3, + .cra_ctxsize = sizeof(u32), + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(alg.cra_list), + .cra_init = crc32c_intel_cra_init, + .cra_type = &crypto_ahash_type, + .cra_u = { + .ahash = { + .digestsize = CHKSUM_DIGEST_SIZE, + .setkey = crc32c_intel_setkey, + .init = crc32c_intel_init, + .update = crc32c_intel_update, + .final = crc32c_intel_final, + .digest = crc32c_intel_digest, + } + } +}; + + +static int __init crc32c_intel_mod_init(void) +{ + if (cpu_has_xmm4_2) + return crypto_register_alg(&alg); + else + return -ENODEV; +} + +static void __exit crc32c_intel_mod_fini(void) +{ + crypto_unregister_alg(&alg); +} + +module_init(crc32c_intel_mod_init); +module_exit(crc32c_intel_mod_fini); + +MODULE_AUTHOR("Austin Zhang , Kent Liu "); +MODULE_DESCRIPTION("CRC32c (Castagnoli) optimization using Intel Hardware."); +MODULE_LICENSE("GPL"); + +MODULE_ALIAS("crc32c"); +MODULE_ALIAS("crc32c-intel"); + -- cgit v1.2.2 From b4609472116bb806a95e98d04767189406c74c70 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 29 Aug 2008 14:38:03 -0700 Subject: Revert "x86: fix HPET regression in 2.6.26 versus 2.6.25, check hpet against BAR, v3" This reverts commit a2bd7274b47124d2fc4dfdb8c0591f545ba749dd. It wasn't really right to begin with (there's a better fix for the problem with e820 reservations clashing with PCI BAR's pending), but it also actually causes more regressions, so it should be reverted even before the better fix is finalized. Rafael reports that this commit broke AHCI detection, and thus causes the kernel to not boot on his quad core test box. Reported-and-bisected-by: Rafael J. Wysocki Cc: Yinghai Lu Cc: David Witbrodt Cc: Ingo Molnar Signed-off-by: Linus Torvalds --- arch/x86/pci/i386.c | 78 ----------------------------------------------------- 1 file changed, 78 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index d765da913842..5807d1bc73f7 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -31,11 +31,8 @@ #include #include #include -#include #include -#include -#include #include "pci.h" @@ -80,77 +77,6 @@ pcibios_align_resource(void *data, struct resource *res, } EXPORT_SYMBOL(pcibios_align_resource); -static int check_res_with_valid(struct pci_dev *dev, struct resource *res) -{ - unsigned long base; - unsigned long size; - int i; - - base = res->start; - size = (res->start == 0 && res->end == res->start) ? 0 : - (res->end - res->start + 1); - - if (!base || !size) - return 0; - -#ifdef CONFIG_HPET_TIMER - /* for hpet */ - if (base == hpet_address && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has HPET at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } -#endif - -#ifdef CONFIG_X86_IO_APIC - for (i = 0; i < nr_ioapics; i++) { - unsigned long ioapic_phys = mp_ioapics[i].mp_apicaddr; - - if (base == ioapic_phys && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has ioapic at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } - } -#endif - -#ifdef CONFIG_PCI_MMCONFIG - for (i = 0; i < pci_mmcfg_config_num; i++) { - unsigned long addr; - - addr = pci_mmcfg_config[i].address; - if (base == addr && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has MMCONFIG at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } - } -#endif - - return 0; -} - -static int check_platform(struct pci_dev *dev, struct resource *res) -{ - struct resource *root = NULL; - - /* - * forcibly insert it into the - * resource tree - */ - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - - if (root && check_res_with_valid(dev, res)) { - insert_resource(root, res); - - return 1; - } - - return 0; -} /* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. @@ -202,8 +128,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { - if (check_platform(dev, r)) - continue; dev_err(&dev->dev, "BAR %d: can't " "allocate resource\n", idx); /* @@ -247,8 +171,6 @@ static void __init pcibios_allocate_resources(int pass) r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - if (check_platform(dev, r)) - continue; dev_err(&dev->dev, "BAR %d: can't " "allocate resource\n", idx); /* We'll assign a new address later */ -- cgit v1.2.2 From 011fec7486028977e2b4012afb84235759c6ad82 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 2 Sep 2008 10:38:28 -0700 Subject: Un-break printk strings in x86 PCI probing code Breaking lines due to some imaginary problem with a long line length is often stupid and wrong, but never more so when it splits a string that is printed out into multiple lines. This really ended up making it much harder to find where some error strings were printed out, because a simple 'grep' didn't work. I'm sure there is tons more of this particular idiocy hiding in other places, but this particular case hit me once more last week. So fix it. Signed-off-by: Linus Torvalds --- arch/x86/pci/i386.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 5807d1bc73f7..8791fc55e715 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -128,8 +128,7 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { - dev_err(&dev->dev, "BAR %d: can't " - "allocate resource\n", idx); + dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* * Something is wrong with the region. * Invalidate the resource to prevent @@ -164,15 +163,13 @@ static void __init pcibios_allocate_resources(int pass) else disabled = !(command & PCI_COMMAND_MEMORY); if (pass == disabled) { - dev_dbg(&dev->dev, "resource %#08llx-%#08llx " - "(f=%lx, d=%d, p=%d)\n", + dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", (unsigned long long) r->start, (unsigned long long) r->end, r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - dev_err(&dev->dev, "BAR %d: can't " - "allocate resource\n", idx); + dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; r->start = 0; -- cgit v1.2.2 From fbb16e243887332dd5754e48ffe5b963378f3cd2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 00:54:47 +0200 Subject: [x86] Fix TSC calibration issues Larry Finger reported at http://lkml.org/lkml/2008/9/1/90: An ancient laptop of mine started throwing errors from b43legacy when I started using 2.6.27 on it. This has been bisected to commit bfc0f59 "x86: merge tsc calibration". The unification of the TSC code adopted mostly the 64bit code, which prefers PMTIMER/HPET over the PIT calibration. Larrys system has an AMD K6 CPU. Such systems are known to have PMTIMER incarnations which run at double speed. This results in a miscalibration of the TSC by factor 0.5. So the resulting calibrated CPU/TSC speed is half of the real CPU speed, which means that the TSC based delay loop will run half the time it should run. That might explain why the b43legacy driver went berserk. On the other hand we know about systems, where the PIT based calibration results in random crap due to heavy SMI/SMM disturbance. On those systems the PMTIMER/HPET based calibration logic with SMI detection shows better results. According to Alok also virtualized systems suffer from the PIT calibration method. The solution is to use a more wreckage aware aproach than the current either/or decision. 1) reimplement the retry loop which was dropped from the 32bit code during the merge. It repeats the calibration and selects the lowest frequency value as this is probably the closest estimate to the real frequency 2) Monitor the delta of the TSC values in the delay loop which waits for the PIT counter to reach zero. If the maximum value is significantly different from the minimum, then we have a pretty safe indicator that the loop was disturbed by an SMI. 3) keep the pmtimer/hpet reference as a backup solution for systems where the SMI disturbance is a permanent point of failure for PIT based calibration 4) do the loop iteration for both methods, record the lowest value and decide after all iterations finished. 5) Set a clear preference to PIT based calibration when the result makes sense. The implementation does the reference calibration based on HPET/PMTIMER around the delay, which is necessary for the PIT anyway, but keeps separate TSC values to ensure the "independency" of the resulting calibration values. Tested on various 32bit/64bit machines including Geode 266Mhz, AMD K6 (affected machine with a double speed pmtimer which I grabbed out of the dump), Pentium class machines and AMD/Intel 64 bit boxen. Bisected-by: Larry Finger Signed-off-by: Thomas Gleixner Tested-by: Larry Finger Signed-off-by: Linus Torvalds --- arch/x86/kernel/tsc.c | 235 ++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 181 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 8e786b0d665a..ac79bd143da8 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -127,75 +127,202 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) */ unsigned long native_calibrate_tsc(void) { - unsigned long flags; - u64 tsc1, tsc2, tr1, tr2, delta, pm1, pm2, hpet1, hpet2; - int hpet = is_hpet_enabled(); - unsigned int tsc_khz_val = 0; + u64 tsc1, tsc2, tr1, tr2, tsc, delta, pm1, pm2, hpet1, hpet2; + unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; + unsigned long flags, tscmin, tscmax; + int hpet = is_hpet_enabled(), pitcnt, i; - local_irq_save(flags); - - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); - - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - tr1 = get_cycles(); - while ((inb(0x61) & 0x20) == 0); - tr2 = get_cycles(); - - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); - - local_irq_restore(flags); + /* + * Run 5 calibration loops to get the lowest frequency value + * (the best estimate). We use two different calibration modes + * here: + * + * 1) PIT loop. We set the PIT Channel 2 to oneshot mode and + * load a timeout of 50ms. We read the time right after we + * started the timer and wait until the PIT count down reaches + * zero. In each wait loop iteration we read the TSC and check + * the delta to the previous read. We keep track of the min + * and max values of that delta. The delta is mostly defined + * by the IO time of the PIT access, so we can detect when a + * SMI/SMM disturbance happend between the two reads. If the + * maximum time is significantly larger than the minimum time, + * then we discard the result and have another try. + * + * 2) Reference counter. If available we use the HPET or the + * PMTIMER as a reference to check the sanity of that value. + * We use separate TSC readouts and check inside of the + * reference read for a SMI/SMM disturbance. We dicard + * disturbed values here as well. We do that around the PIT + * calibration delay loop as we have to wait for a certain + * amount of time anyway. + */ + for (i = 0; i < 5; i++) { + + tscmin = ULONG_MAX; + tscmax = 0; + pitcnt = 0; + + local_irq_save(flags); + + /* + * Read the start value and the reference count of + * hpet/pmtimer when available: + */ + tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + * + * Some devices need a delay here. + */ + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + + tsc = tr1 = tr2 = get_cycles(); + + while ((inb(0x61) & 0x20) == 0) { + tr2 = get_cycles(); + delta = tr2 - tsc; + tsc = tr2; + if ((unsigned int) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned int) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * We waited at least 50ms above. Now read + * pmtimer/hpet reference again + */ + tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + + local_irq_restore(flags); + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than 5000 + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt > 5000 && tscmax < 10 * tscmin) { + + /* Calculate the PIT value */ + delta = tr2 - tr1; + do_div(delta, 50); + + /* We take the smallest value into account */ + tsc_pit_min = min(tsc_pit_min, (unsigned long) delta); + } + + /* hpet or pmtimer available ? */ + if (!hpet && !pm1 && !pm2) + continue; + + /* Check, whether the sampling was disturbed by an SMI */ + if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) + continue; + + tsc2 = (tsc2 - tsc1) * 1000000LL; + + if (hpet) { + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tsc1, 1000000); + } else { + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tsc1 = pm2 * 1000000000LL; + do_div(tsc1, PMTMR_TICKS_PER_SEC); + } + + do_div(tsc2, tsc1); + tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + } /* - * Preset the result with the raw and inaccurate PIT - * calibration value + * Now check the results. */ - delta = (tr2 - tr1); - do_div(delta, 50); - tsc_khz_val = delta; + if (tsc_pit_min == ULONG_MAX) { + /* PIT gave no useful value */ + printk(KERN_WARNING "TSC: PIT calibration failed due to " + "SMI disturbance.\n"); + + /* We don't have an alternative source, disable TSC */ + if (!hpet && !pm1 && !pm2) { + printk("TSC: No reference (HPET/PMTIMER) available\n"); + return 0; + } + + /* The alternative source failed as well, disable TSC */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " + "failed due to SMI disturbance.\n"); + return 0; + } + + /* Use the alternative source */ + printk(KERN_INFO "TSC: using %s reference calibration\n", + hpet ? "HPET" : "PMTIMER"); + + return tsc_ref_min; + } - /* hpet or pmtimer available ? */ + /* We don't have an alternative source, use the PIT calibration value */ if (!hpet && !pm1 && !pm2) { - printk(KERN_INFO "TSC calibrated against PIT\n"); - goto out; + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - /* Check, whether the sampling was disturbed by an SMI */ - if (tsc1 == ULLONG_MAX || tsc2 == ULLONG_MAX) { - printk(KERN_WARNING "TSC calibration disturbed by SMI, " - "using PIT calibration result\n"); - goto out; + /* The alternative source failed, use the PIT calibration value */ + if (tsc_ref_min == ULONG_MAX) { + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " + "to SMI disturbance. Using PIT calibration\n"); + return tsc_pit_min; } - tsc2 = (tsc2 - tsc1) * 1000000LL; - - if (hpet) { - printk(KERN_INFO "TSC calibrated against HPET\n"); - if (hpet2 < hpet1) - hpet2 += 0x100000000ULL; - hpet2 -= hpet1; - tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); - do_div(tsc1, 1000000); - } else { - printk(KERN_INFO "TSC calibrated against PM_TIMER\n"); - if (pm2 < pm1) - pm2 += (u64)ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = pm2 * 1000000000LL; - do_div(tsc1, PMTMR_TICKS_PER_SEC); + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 5% window, the we + * use the lower frequency of those as it is probably the + * closest estimate. + */ + if (delta >= 95 && delta <= 105) { + printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n", + hpet ? "HPET" : "PMTIMER"); + printk(KERN_INFO "TSC: using %s calibration value\n", + tsc_pit_min <= tsc_ref_min ? "PIT" : + hpet ? "HPET" : "PMTIMER"); + return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min; } - do_div(tsc2, tsc1); - tsc_khz_val = tsc2; + printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); -out: - return tsc_khz_val; + /* + * The calibration values differ too much. In doubt, we use + * the PIT value as we know that there are PMTIMERs around + * running at double speed. + */ + printk(KERN_INFO "TSC: Using PIT calibration value\n"); + return tsc_pit_min; } - #ifdef CONFIG_X86_32 /* Only called from the Powernow K7 cpu freq driver */ int recalibrate_cpu_khz(void) -- cgit v1.2.2 From ec0c15afb41fd9ad45b53468b60db50170e22346 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 3 Sep 2008 07:30:13 -0700 Subject: Split up PIT part of TSC calibration from native_calibrate_tsc The TSC calibration function is still very complicated, but this makes it at least a little bit less so by moving the PIT part out into a helper function of its own. Tested-by: Larry Finger Signed-of-by: Linus Torvalds --- arch/x86/kernel/tsc.c | 132 +++++++++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 61 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ac79bd143da8..346cae5ac423 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -122,15 +122,75 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } +/* + * Try to calibrate the TSC against the Programmable + * Interrupt Timer and return the frequency of the TSC + * in kHz. + * + * Return ULONG_MAX on failure to calibrate. + */ +static unsigned long pit_calibrate_tsc(void) +{ + u64 tsc, t1, t2, delta; + unsigned long tscmin, tscmax; + int pitcnt; + + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Setup CTC channel 2* for mode 0, (interrupt on terminal + * count mode), binary count. Set the latch register to 50ms + * (LSB then MSB) to begin countdown. + */ + outb(0xb0, 0x43); + outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); + outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + + tsc = t1 = t2 = get_cycles(); + + pitcnt = 0; + tscmax = 0; + tscmin = ULONG_MAX; + while ((inb(0x61) & 0x20) == 0) { + t2 = get_cycles(); + delta = t2 - tsc; + tsc = t2; + if ((unsigned long) delta < tscmin) + tscmin = (unsigned int) delta; + if ((unsigned long) delta > tscmax) + tscmax = (unsigned int) delta; + pitcnt++; + } + + /* + * Sanity checks: + * + * If we were not able to read the PIT more than 5000 + * times, then we have been hit by a massive SMI + * + * If the maximum is 10 times larger than the minimum, + * then we got hit by an SMI as well. + */ + if (pitcnt < 5000 || tscmax > 10 * tscmin) + return ULONG_MAX; + + /* Calculate the PIT value */ + delta = t2 - t1; + do_div(delta, 50); + return delta; +} + + /** * native_calibrate_tsc - calibrate the tsc on boot */ unsigned long native_calibrate_tsc(void) { - u64 tsc1, tsc2, tr1, tr2, tsc, delta, pm1, pm2, hpet1, hpet2; + u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, tscmin, tscmax; - int hpet = is_hpet_enabled(), pitcnt, i; + unsigned long flags; + int hpet = is_hpet_enabled(), i; /* * Run 5 calibration loops to get the lowest frequency value @@ -157,72 +217,22 @@ unsigned long native_calibrate_tsc(void) * amount of time anyway. */ for (i = 0; i < 5; i++) { - - tscmin = ULONG_MAX; - tscmax = 0; - pitcnt = 0; - - local_irq_save(flags); + unsigned long tsc_pit_khz; /* * Read the start value and the reference count of - * hpet/pmtimer when available: + * hpet/pmtimer when available. Then do the PIT + * calibration, which will take at least 50ms, and + * read the end value. */ + local_irq_save(flags); tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); - - /* Set the Gate high, disable speaker */ - outb((inb(0x61) & ~0x02) | 0x01, 0x61); - - /* - * Setup CTC channel 2* for mode 0, (interrupt on terminal - * count mode), binary count. Set the latch register to 50ms - * (LSB then MSB) to begin countdown. - * - * Some devices need a delay here. - */ - outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); - - tsc = tr1 = tr2 = get_cycles(); - - while ((inb(0x61) & 0x20) == 0) { - tr2 = get_cycles(); - delta = tr2 - tsc; - tsc = tr2; - if ((unsigned int) delta < tscmin) - tscmin = (unsigned int) delta; - if ((unsigned int) delta > tscmax) - tscmax = (unsigned int) delta; - pitcnt++; - } - - /* - * We waited at least 50ms above. Now read - * pmtimer/hpet reference again - */ + tsc_pit_khz = pit_calibrate_tsc(); tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); - local_irq_restore(flags); - /* - * Sanity checks: - * - * If we were not able to read the PIT more than 5000 - * times, then we have been hit by a massive SMI - * - * If the maximum is 10 times larger than the minimum, - * then we got hit by an SMI as well. - */ - if (pitcnt > 5000 && tscmax < 10 * tscmin) { - - /* Calculate the PIT value */ - delta = tr2 - tr1; - do_div(delta, 50); - - /* We take the smallest value into account */ - tsc_pit_min = min(tsc_pit_min, (unsigned long) delta); - } + /* Pick the lowest PIT TSC calibration so far */ + tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ if (!hpet && !pm1 && !pm2) -- cgit v1.2.2 From a57a5c2e8db8d80f460dcad77877895718c9f209 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 4 Sep 2008 01:03:02 +0400 Subject: x86 setup: remove remnants of CONFIG_VIDEO_SELECT (read: vga=) Impact: cleanup Video mode selection became always possible in 2.6.23-rc1 after i386 setup code rewrite in C. Regardless, VIDEO_SELECT is stupid config option because it affects only kernel setup code, not code which always stays in memory. vga= always possible now which is good. Signed-off-by: Alexey Dobriyan Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 2 files changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 9bc34e2033ec..a7ae4385670e 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1482,7 +1482,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ae5124e064d4..743e80392731 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1449,7 +1449,6 @@ CONFIG_BACKLIGHT_CLASS_DEVICE=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 -CONFIG_VIDEO_SELECT=y CONFIG_DUMMY_CONSOLE=y # CONFIG_FRAMEBUFFER_CONSOLE is not set CONFIG_LOGO=y -- cgit v1.2.2 From e6a5652fd156a286faadbf7a4062b5354d4e346e Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Wed, 3 Sep 2008 19:33:14 -0400 Subject: x86: add io delay quirk for Presario F700 Manually adding "io_delay=0xed" fixes system lockups in ioapic mode on this machine. System Information Manufacturer: Hewlett-Packard Product Name: Presario F700 (KA695EA#ABF) Base Board Information Manufacturer: Quanta Product Name: 30D3 Reference: https://bugzilla.redhat.com/show_bug.cgi?id=459546 Signed-off-by: Chuck Ebbert Signed-off-by: H. Peter Anvin --- arch/x86/kernel/io_delay.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_delay.c b/arch/x86/kernel/io_delay.c index 1c3a66a67f83..720d2607aacb 100644 --- a/arch/x86/kernel/io_delay.c +++ b/arch/x86/kernel/io_delay.c @@ -92,6 +92,14 @@ static struct dmi_system_id __initdata io_delay_0xed_port_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "30BF") } }, + { + .callback = dmi_io_delay_0xed_port, + .ident = "Presario F700", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "Quanta"), + DMI_MATCH(DMI_BOARD_NAME, "30D3") + } + }, { } }; -- cgit v1.2.2 From de014d617636d6a6bd5aef3b3d1f7f9a35669057 Mon Sep 17 00:00:00 2001 From: Alok N Kataria Date: Wed, 3 Sep 2008 18:18:01 -0700 Subject: x86: Change warning message in TSC calibration. When calibration against PIT fails, the warning that we print is misleading. In a virtualized environment the VM may get descheduled while calibration or, the check in PIT calibration may fail due to other virtualization overheads. The warning message explicitly assumes that calibration failed due to SMI's which may not be the case. Change that to something proper. Signed-off-by: Alok N Kataria Signed-off-by: Linus Torvalds --- arch/x86/kernel/tsc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 346cae5ac423..8f98e9de1b82 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -267,8 +267,7 @@ unsigned long native_calibrate_tsc(void) */ if (tsc_pit_min == ULONG_MAX) { /* PIT gave no useful value */ - printk(KERN_WARNING "TSC: PIT calibration failed due to " - "SMI disturbance.\n"); + printk(KERN_WARNING "TSC: Unable to calibrate against PIT\n"); /* We don't have an alternative source, disable TSC */ if (!hpet && !pm1 && !pm2) { -- cgit v1.2.2 From 7f16a339787d45f997d67c1a4dea3c357f48e121 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 4 Sep 2008 06:19:45 -0700 Subject: x86: boot/compressed/Makefile: fix "make clean" The Kbuild variable "targets" is supposed to be configuration-independent and reflect "all possible targets". This is required to make "make clean" work properly. Therefore, move all manipulation of "targets" as well as custom rules out of the x86-32 ifdef statement. Only leave inside the ifdefs the things that are genuinely configuration-dependent. Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 92fdd35bd93e..1771c804e02f 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -27,9 +27,8 @@ $(obj)/vmlinux.bin: vmlinux FORCE $(call if_changed,objcopy) -ifeq ($(CONFIG_X86_32),y) -targets += vmlinux.bin.all vmlinux.relocs -hostprogs-y := relocs +targets += vmlinux.bin.all vmlinux.relocs relocs +hostprogs-$(CONFIG_X86_32) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -43,6 +42,8 @@ quiet_cmd_relocbin = BUILD $@ $(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE $(call if_changed,relocbin) +ifeq ($(CONFIG_X86_32),y) + ifdef CONFIG_RELOCATABLE $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE $(call if_changed,gzip) @@ -59,6 +60,5 @@ $(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T endif - $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.gz FORCE $(call if_changed,ld) -- cgit v1.2.2 From 1625324d22409e32e3f8eb86018cad72e1c09d61 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 27 Aug 2008 23:01:16 -0700 Subject: x86: move dir es7000 to es7000_32.c to be aligned with numaq, summit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/es7000/Makefile | 5 - arch/x86/es7000/es7000.h | 114 -------------- arch/x86/es7000/es7000plat.c | 262 ------------------------------- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/es7000_32.c | 345 +++++++++++++++++++++++++++++++++++++++++ arch/x86/mach-generic/Makefile | 1 - 6 files changed, 346 insertions(+), 382 deletions(-) delete mode 100644 arch/x86/es7000/Makefile delete mode 100644 arch/x86/es7000/es7000.h delete mode 100644 arch/x86/es7000/es7000plat.c create mode 100644 arch/x86/kernel/es7000_32.c (limited to 'arch/x86') diff --git a/arch/x86/es7000/Makefile b/arch/x86/es7000/Makefile deleted file mode 100644 index 3ef8b43b62fc..000000000000 --- a/arch/x86/es7000/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the linux kernel. -# - -obj-$(CONFIG_X86_ES7000) := es7000plat.o diff --git a/arch/x86/es7000/es7000.h b/arch/x86/es7000/es7000.h deleted file mode 100644 index 4e62f6fa95b8..000000000000 --- a/arch/x86/es7000/es7000.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -/* - * ES7000 chipsets - */ - -#define NON_UNISYS 0 -#define ES7000_CLASSIC 1 -#define ES7000_ZORRO 2 - - -#define MIP_REG 1 -#define MIP_PSAI_REG 4 - -#define MIP_BUSY 1 -#define MIP_SPIN 0xf0000 -#define MIP_VALID 0x0100000000000000ULL -#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) - -#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) - -struct mip_reg_info { - unsigned long long mip_info; - unsigned long long delivery_info; - unsigned long long host_reg; - unsigned long long mip_reg; -}; - -struct part_info { - unsigned char type; - unsigned char length; - unsigned char part_id; - unsigned char apic_mode; - unsigned long snum; - char ptype[16]; - char sname[64]; - char pname[64]; -}; - -struct psai { - unsigned long long entry_type; - unsigned long long addr; - unsigned long long bep_addr; -}; - -struct es7000_mem_info { - unsigned char type; - unsigned char length; - unsigned char resv[6]; - unsigned long long start; - unsigned long long size; -}; - -struct es7000_oem_table { - unsigned long long hdr; - struct mip_reg_info mip; - struct part_info pif; - struct es7000_mem_info shm; - struct psai psai; -}; - -#ifdef CONFIG_ACPI - -struct oem_table { - struct acpi_table_header Header; - u32 OEMTableAddr; - u32 OEMTableSize; -}; - -extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); -#endif - -struct mip_reg { - unsigned long long off_0; - unsigned long long off_8; - unsigned long long off_10; - unsigned long long off_18; - unsigned long long off_20; - unsigned long long off_28; - unsigned long long off_30; - unsigned long long off_38; -}; - -#define MIP_SW_APIC 0x1020b -#define MIP_FUNC(VALUE) (VALUE & 0xff) - -extern int parse_unisys_oem (char *oemptr); -extern void setup_unisys(void); -extern int es7000_start_cpu(int cpu, unsigned long eip); -extern void es7000_sw_apic(void); diff --git a/arch/x86/es7000/es7000plat.c b/arch/x86/es7000/es7000plat.c deleted file mode 100644 index 7789fde13c3f..000000000000 --- a/arch/x86/es7000/es7000plat.c +++ /dev/null @@ -1,262 +0,0 @@ -/* - * Written by: Garry Forsgren, Unisys Corporation - * Natalie Protasevich, Unisys Corporation - * This file contains the code to configure and interface - * with Unisys ES7000 series hardware system manager. - * - * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. - * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write the Free Software Foundation, Inc., 59 - * Temple Place - Suite 330, Boston MA 02111-1307, USA. - * - * Contact information: Unisys Corporation, Township Line & Union Meeting - * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: - * - * http://www.unisys.com - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "es7000.h" -#include - -/* - * ES7000 Globals - */ - -static volatile unsigned long *psai = NULL; -static struct mip_reg *mip_reg; -static struct mip_reg *host_reg; -static int mip_port; -static unsigned long mip_addr, host_addr; - -int es7000_plat; - -/* - * GSI override for ES7000 platforms. - */ - -static unsigned int base; - -static int -es7000_rename_gsi(int ioapic, int gsi) -{ - if (es7000_plat == ES7000_ZORRO) - return gsi; - - if (!base) { - int i; - for (i = 0; i < nr_ioapics; i++) - base += nr_ioapic_registers[i]; - } - - if (!ioapic && (gsi < 16)) - gsi += base; - return gsi; -} - -void __init -setup_unisys(void) -{ - /* - * Determine the generation of the ES7000 currently running. - * - * es7000_plat = 1 if the machine is a 5xx ES7000 box - * es7000_plat = 2 if the machine is a x86_64 ES7000 box - * - */ - if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) - es7000_plat = ES7000_ZORRO; - else - es7000_plat = ES7000_CLASSIC; - ioapic_renumber_irq = es7000_rename_gsi; -} - -/* - * Parse the OEM Table - */ - -int __init -parse_unisys_oem (char *oemptr) -{ - int i; - int success = 0; - unsigned char type, size; - unsigned long val; - char *tp = NULL; - struct psai *psaip = NULL; - struct mip_reg_info *mi; - struct mip_reg *host, *mip; - - tp = oemptr; - - tp += 8; - - for (i=0; i <= 6; i++) { - type = *tp++; - size = *tp++; - tp -= 2; - switch (type) { - case MIP_REG: - mi = (struct mip_reg_info *)tp; - val = MIP_RD_LO(mi->host_reg); - host_addr = val; - host = (struct mip_reg *)val; - host_reg = __va(host); - val = MIP_RD_LO(mi->mip_reg); - mip_port = MIP_PORT(mi->mip_info); - mip_addr = val; - mip = (struct mip_reg *)val; - mip_reg = __va(mip); - pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", - (unsigned long)host_reg); - pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", - (unsigned long)mip_reg); - success++; - break; - case MIP_PSAI_REG: - psaip = (struct psai *)tp; - if (tp != NULL) { - if (psaip->addr) - psai = __va(psaip->addr); - else - psai = NULL; - success++; - } - break; - default: - break; - } - tp += size; - } - - if (success < 2) { - es7000_plat = NON_UNISYS; - } else - setup_unisys(); - return es7000_plat; -} - -#ifdef CONFIG_ACPI -int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr) -{ - struct acpi_table_header *header = NULL; - int i = 0; - while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { - if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { - struct oem_table *t = (struct oem_table *)header; - *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, - t->OEMTableSize); - return 0; - } - } - return -1; -} -#endif - -static void -es7000_spin(int n) -{ - int i = 0; - - while (i++ < n) - rep_nop(); -} - -static int __init -es7000_mip_write(struct mip_reg *mip_reg) -{ - int status = 0; - int spin; - - spin = MIP_SPIN; - while (((unsigned long long)host_reg->off_38 & - (unsigned long long)MIP_VALID) != 0) { - if (--spin <= 0) { - printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); - outb(1, mip_port); - - spin = MIP_SPIN; - - while (((unsigned long long)mip_reg->off_38 & - (unsigned long long)MIP_VALID) == 0) { - if (--spin <= 0) { - printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); - return -1; - } - es7000_spin(MIP_SPIN); - } - - status = ((unsigned long long)mip_reg->off_0 & - (unsigned long long)0xffff0000000000ULL) >> 48; - mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & - (unsigned long long)~MIP_VALID); - return status; -} - -int -es7000_start_cpu(int cpu, unsigned long eip) -{ - unsigned long vect = 0, psaival = 0; - - if (psai == NULL) - return -1; - - vect = ((unsigned long)__pa(eip)/0x1000) << 16; - psaival = (0x1000000 | vect | cpu); - - while (*psai & 0x1000000) - ; - - *psai = psaival; - - return 0; - -} - -void __init -es7000_sw_apic(void) -{ - if (es7000_plat) { - int mip_status; - struct mip_reg es7000_mip_reg; - - printk("ES7000: Enabling APIC mode.\n"); - memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); - es7000_mip_reg.off_0 = MIP_SW_APIC; - es7000_mip_reg.off_38 = (MIP_VALID); - while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) - printk("es7000_sw_apic: command failed, status = %x\n", - mip_status); - return; - } -} diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a07ec14f3312..dac24c4390dd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o obj-$(CONFIG_X86_NUMAQ) += numaq_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT_NUMA) += summit_32.o obj-y += vsmp_64.o obj-$(CONFIG_KPROBES) += kprobes.o diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c new file mode 100644 index 000000000000..849e5cd485b8 --- /dev/null +++ b/arch/x86/kernel/es7000_32.c @@ -0,0 +1,345 @@ +/* + * Written by: Garry Forsgren, Unisys Corporation + * Natalie Protasevich, Unisys Corporation + * This file contains the code to configure and interface + * with Unisys ES7000 series hardware system manager. + * + * Copyright (c) 2003 Unisys Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston MA 02111-1307, USA. + * + * Contact information: Unisys Corporation, Township Line & Union Meeting + * Roads-A, Unisys Way, Blue Bell, Pennsylvania, 19424, or: + * + * http://www.unisys.com + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ES7000 chipsets + */ + +#define NON_UNISYS 0 +#define ES7000_CLASSIC 1 +#define ES7000_ZORRO 2 + + +#define MIP_REG 1 +#define MIP_PSAI_REG 4 + +#define MIP_BUSY 1 +#define MIP_SPIN 0xf0000 +#define MIP_VALID 0x0100000000000000ULL +#define MIP_PORT(VALUE) ((VALUE >> 32) & 0xffff) + +#define MIP_RD_LO(VALUE) (VALUE & 0xffffffff) + +struct mip_reg_info { + unsigned long long mip_info; + unsigned long long delivery_info; + unsigned long long host_reg; + unsigned long long mip_reg; +}; + +struct part_info { + unsigned char type; + unsigned char length; + unsigned char part_id; + unsigned char apic_mode; + unsigned long snum; + char ptype[16]; + char sname[64]; + char pname[64]; +}; + +struct psai { + unsigned long long entry_type; + unsigned long long addr; + unsigned long long bep_addr; +}; + +struct es7000_mem_info { + unsigned char type; + unsigned char length; + unsigned char resv[6]; + unsigned long long start; + unsigned long long size; +}; + +struct es7000_oem_table { + unsigned long long hdr; + struct mip_reg_info mip; + struct part_info pif; + struct es7000_mem_info shm; + struct psai psai; +}; + +#ifdef CONFIG_ACPI + +struct oem_table { + struct acpi_table_header Header; + u32 OEMTableAddr; + u32 OEMTableSize; +}; + +extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +#endif + +struct mip_reg { + unsigned long long off_0; + unsigned long long off_8; + unsigned long long off_10; + unsigned long long off_18; + unsigned long long off_20; + unsigned long long off_28; + unsigned long long off_30; + unsigned long long off_38; +}; + +#define MIP_SW_APIC 0x1020b +#define MIP_FUNC(VALUE) (VALUE & 0xff) + +/* + * ES7000 Globals + */ + +static volatile unsigned long *psai = NULL; +static struct mip_reg *mip_reg; +static struct mip_reg *host_reg; +static int mip_port; +static unsigned long mip_addr, host_addr; + +int es7000_plat; + +/* + * GSI override for ES7000 platforms. + */ + +static unsigned int base; + +static int +es7000_rename_gsi(int ioapic, int gsi) +{ + if (es7000_plat == ES7000_ZORRO) + return gsi; + + if (!base) { + int i; + for (i = 0; i < nr_ioapics; i++) + base += nr_ioapic_registers[i]; + } + + if (!ioapic && (gsi < 16)) + gsi += base; + return gsi; +} + +void __init +setup_unisys(void) +{ + /* + * Determine the generation of the ES7000 currently running. + * + * es7000_plat = 1 if the machine is a 5xx ES7000 box + * es7000_plat = 2 if the machine is a x86_64 ES7000 box + * + */ + if (!(boot_cpu_data.x86 <= 15 && boot_cpu_data.x86_model <= 2)) + es7000_plat = ES7000_ZORRO; + else + es7000_plat = ES7000_CLASSIC; + ioapic_renumber_irq = es7000_rename_gsi; +} + +/* + * Parse the OEM Table + */ + +int __init +parse_unisys_oem (char *oemptr) +{ + int i; + int success = 0; + unsigned char type, size; + unsigned long val; + char *tp = NULL; + struct psai *psaip = NULL; + struct mip_reg_info *mi; + struct mip_reg *host, *mip; + + tp = oemptr; + + tp += 8; + + for (i=0; i <= 6; i++) { + type = *tp++; + size = *tp++; + tp -= 2; + switch (type) { + case MIP_REG: + mi = (struct mip_reg_info *)tp; + val = MIP_RD_LO(mi->host_reg); + host_addr = val; + host = (struct mip_reg *)val; + host_reg = __va(host); + val = MIP_RD_LO(mi->mip_reg); + mip_port = MIP_PORT(mi->mip_info); + mip_addr = val; + mip = (struct mip_reg *)val; + mip_reg = __va(mip); + pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", + (unsigned long)host_reg); + pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", + (unsigned long)mip_reg); + success++; + break; + case MIP_PSAI_REG: + psaip = (struct psai *)tp; + if (tp != NULL) { + if (psaip->addr) + psai = __va(psaip->addr); + else + psai = NULL; + success++; + } + break; + default: + break; + } + tp += size; + } + + if (success < 2) { + es7000_plat = NON_UNISYS; + } else + setup_unisys(); + return es7000_plat; +} + +#ifdef CONFIG_ACPI +int __init +find_unisys_acpi_oem_table(unsigned long *oem_addr) +{ + struct acpi_table_header *header = NULL; + int i = 0; + while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { + if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { + struct oem_table *t = (struct oem_table *)header; + *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, + t->OEMTableSize); + return 0; + } + } + return -1; +} +#endif + +static void +es7000_spin(int n) +{ + int i = 0; + + while (i++ < n) + rep_nop(); +} + +static int __init +es7000_mip_write(struct mip_reg *mip_reg) +{ + int status = 0; + int spin; + + spin = MIP_SPIN; + while (((unsigned long long)host_reg->off_38 & + (unsigned long long)MIP_VALID) != 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for Host Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + memcpy(host_reg, mip_reg, sizeof(struct mip_reg)); + outb(1, mip_port); + + spin = MIP_SPIN; + + while (((unsigned long long)mip_reg->off_38 & + (unsigned long long)MIP_VALID) == 0) { + if (--spin <= 0) { + printk("es7000_mip_write: Timeout waiting for MIP Valid Flag"); + return -1; + } + es7000_spin(MIP_SPIN); + } + + status = ((unsigned long long)mip_reg->off_0 & + (unsigned long long)0xffff0000000000ULL) >> 48; + mip_reg->off_38 = ((unsigned long long)mip_reg->off_38 & + (unsigned long long)~MIP_VALID); + return status; +} + +int +es7000_start_cpu(int cpu, unsigned long eip) +{ + unsigned long vect = 0, psaival = 0; + + if (psai == NULL) + return -1; + + vect = ((unsigned long)__pa(eip)/0x1000) << 16; + psaival = (0x1000000 | vect | cpu); + + while (*psai & 0x1000000) + ; + + *psai = psaival; + + return 0; + +} + +void __init +es7000_sw_apic(void) +{ + if (es7000_plat) { + int mip_status; + struct mip_reg es7000_mip_reg; + + printk("ES7000: Enabling APIC mode.\n"); + memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); + es7000_mip_reg.off_0 = MIP_SW_APIC; + es7000_mip_reg.off_38 = (MIP_VALID); + while ((mip_status = es7000_mip_write(&es7000_mip_reg)) != 0) + printk("es7000_sw_apic: command failed, status = %x\n", + mip_status); + return; + } +} diff --git a/arch/x86/mach-generic/Makefile b/arch/x86/mach-generic/Makefile index 4706de7676b1..6730f4e7c744 100644 --- a/arch/x86/mach-generic/Makefile +++ b/arch/x86/mach-generic/Makefile @@ -9,4 +9,3 @@ obj-$(CONFIG_X86_NUMAQ) += numaq.o obj-$(CONFIG_X86_SUMMIT) += summit.o obj-$(CONFIG_X86_BIGSMP) += bigsmp.o obj-$(CONFIG_X86_ES7000) += es7000.o -obj-$(CONFIG_X86_ES7000) += ../../x86/es7000/ -- cgit v1.2.2 From cce3e057242d3d46fea07b9eb3910b0076419be5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Sep 2008 15:18:44 +0000 Subject: x86: TSC: define the PIT latch value separate Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 346cae5ac423..aa11413e7c1d 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -122,6 +122,10 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } +#define CAL_MS 50 +#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_PIT_LOOPS 5000 + /* * Try to calibrate the TSC against the Programmable * Interrupt Timer and return the frequency of the TSC @@ -144,8 +148,8 @@ static unsigned long pit_calibrate_tsc(void) * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); - outb((CLOCK_TICK_RATE / (1000 / 50)) & 0xff, 0x42); - outb((CLOCK_TICK_RATE / (1000 / 50)) >> 8, 0x42); + outb(CAL_LATCH & 0xff, 0x42); + outb(CAL_LATCH >> 8, 0x42); tsc = t1 = t2 = get_cycles(); @@ -166,18 +170,18 @@ static unsigned long pit_calibrate_tsc(void) /* * Sanity checks: * - * If we were not able to read the PIT more than 5000 + * If we were not able to read the PIT more than PIT_MIN_LOOPS * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ - if (pitcnt < 5000 || tscmax > 10 * tscmin) + if (pitcnt < CAL_PIT_LOOPS || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; - do_div(delta, 50); + do_div(delta, CAL_MS); return delta; } -- cgit v1.2.2 From d683ef7afe8b6dbac6a3c681cef8a908357793ca Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Sep 2008 15:18:48 +0000 Subject: x86: TSC: separate hpet/pmtimer calculation out Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 56 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index aa11413e7c1d..ebb9bf824a07 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -122,6 +122,43 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) return ULLONG_MAX; } +/* + * Calculate the TSC frequency from HPET reference + */ +static unsigned long calc_hpet_ref(u64 deltatsc, u64 hpet1, u64 hpet2) +{ + u64 tmp; + + if (hpet2 < hpet1) + hpet2 += 0x100000000ULL; + hpet2 -= hpet1; + tmp = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); + do_div(tmp, 1000000); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + +/* + * Calculate the TSC frequency from PMTimer reference + */ +static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) +{ + u64 tmp; + + if (!pm1 && !pm2) + return ULONG_MAX; + + if (pm2 < pm1) + pm2 += (u64)ACPI_PM_OVRRUN; + pm2 -= pm1; + tmp = pm2 * 1000000000LL; + do_div(tmp, PMTMR_TICKS_PER_SEC); + do_div(deltatsc, tmp); + + return (unsigned long) deltatsc; +} + #define CAL_MS 50 #define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 5000 @@ -247,22 +284,11 @@ unsigned long native_calibrate_tsc(void) continue; tsc2 = (tsc2 - tsc1) * 1000000LL; + if (hpet) + tsc2 = calc_hpet_ref(tsc2, hpet1, hpet2); + else + tsc2 = calc_pmtimer_ref(tsc2, pm1, pm2); - if (hpet) { - if (hpet2 < hpet1) - hpet2 += 0x100000000ULL; - hpet2 -= hpet1; - tsc1 = ((u64)hpet2 * hpet_readl(HPET_PERIOD)); - do_div(tsc1, 1000000); - } else { - if (pm2 < pm1) - pm2 += (u64)ACPI_PM_OVRRUN; - pm2 -= pm1; - tsc1 = pm2 * 1000000000LL; - do_div(tsc1, PMTMR_TICKS_PER_SEC); - } - - do_div(tsc2, tsc1); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); } -- cgit v1.2.2 From 827014be05e4515fa0dfc32e3100c4dab2070a98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Sep 2008 15:18:53 +0000 Subject: x86: TSC: use one set of reference variables Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index ebb9bf824a07..52284d31fc9c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -104,7 +104,7 @@ __setup("notsc", notsc_setup); /* * Read TSC and the reference counters. Take care of SMI disturbance */ -static u64 tsc_read_refs(u64 *pm, u64 *hpet) +static u64 tsc_read_refs(u64 *p, int hpet) { u64 t1, t2; int i; @@ -112,9 +112,9 @@ static u64 tsc_read_refs(u64 *pm, u64 *hpet) for (i = 0; i < MAX_RETRIES; i++) { t1 = get_cycles(); if (hpet) - *hpet = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; + *p = hpet_readl(HPET_COUNTER) & 0xFFFFFFFF; else - *pm = acpi_pm_read_early(); + *p = acpi_pm_read_early(); t2 = get_cycles(); if ((t2 - t1) < SMI_TRESHOLD) return t2; @@ -228,7 +228,7 @@ static unsigned long pit_calibrate_tsc(void) */ unsigned long native_calibrate_tsc(void) { - u64 tsc1, tsc2, delta, pm1, pm2, hpet1, hpet2; + u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; unsigned long flags; int hpet = is_hpet_enabled(), i; @@ -267,16 +267,16 @@ unsigned long native_calibrate_tsc(void) * read the end value. */ local_irq_save(flags); - tsc1 = tsc_read_refs(&pm1, hpet ? &hpet1 : NULL); + tsc1 = tsc_read_refs(&ref1, hpet); tsc_pit_khz = pit_calibrate_tsc(); - tsc2 = tsc_read_refs(&pm2, hpet ? &hpet2 : NULL); + tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); /* Pick the lowest PIT TSC calibration so far */ tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); /* hpet or pmtimer available ? */ - if (!hpet && !pm1 && !pm2) + if (!hpet && !ref1 && !ref2) continue; /* Check, whether the sampling was disturbed by an SMI */ @@ -285,9 +285,9 @@ unsigned long native_calibrate_tsc(void) tsc2 = (tsc2 - tsc1) * 1000000LL; if (hpet) - tsc2 = calc_hpet_ref(tsc2, hpet1, hpet2); + tsc2 = calc_hpet_ref(tsc2, ref1, ref2); else - tsc2 = calc_pmtimer_ref(tsc2, pm1, pm2); + tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); } @@ -301,7 +301,7 @@ unsigned long native_calibrate_tsc(void) "SMI disturbance.\n"); /* We don't have an alternative source, disable TSC */ - if (!hpet && !pm1 && !pm2) { + if (!hpet && !ref1 && !ref2) { printk("TSC: No reference (HPET/PMTIMER) available\n"); return 0; } @@ -321,7 +321,7 @@ unsigned long native_calibrate_tsc(void) } /* We don't have an alternative source, use the PIT calibration value */ - if (!hpet && !pm1 && !pm2) { + if (!hpet && !ref1 && !ref2) { printk(KERN_INFO "TSC: Using PIT calibration value\n"); return tsc_pit_min; } -- cgit v1.2.2 From a977c400957451f3bd92b9ed6022f5fe8a6cbbf5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Sep 2008 15:18:59 +0000 Subject: x86: TSC make the calibration loop smarter The last changes made the calibration loop 250ms long which is far too much. Try to do that more clever. Experiments have shown that using a 10ms delay for the PIT based calibration gives us a good enough value. If we have a reference (HPET/PMTIMER) and the result of the PIT and the reference is close enough, then we can break out of the calibration loop on a match right away and use the reference value. Otherwise we just loop 3 times and decide then, which value to take. One caveat is that for virtualized environments the PIT calibration often does not work at all and I found out that 10us is a bit too short as well for the reference to give a sane result. The solution here is to make the last loop longer when the first two PIT calibrations failed. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 95 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 52284d31fc9c..da033b5b3e19 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -159,9 +159,14 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) return (unsigned long) deltatsc; } -#define CAL_MS 50 +#define CAL_MS 10 #define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) -#define CAL_PIT_LOOPS 5000 +#define CAL_PIT_LOOPS 1000 + +#define CAL2_MS 50 +#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_PIT_LOOPS 5000 + /* * Try to calibrate the TSC against the Programmable @@ -170,7 +175,7 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) * * Return ULONG_MAX on failure to calibrate. */ -static unsigned long pit_calibrate_tsc(void) +static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) { u64 tsc, t1, t2, delta; unsigned long tscmin, tscmax; @@ -185,8 +190,8 @@ static unsigned long pit_calibrate_tsc(void) * (LSB then MSB) to begin countdown. */ outb(0xb0, 0x43); - outb(CAL_LATCH & 0xff, 0x42); - outb(CAL_LATCH >> 8, 0x42); + outb(latch & 0xff, 0x42); + outb(latch >> 8, 0x42); tsc = t1 = t2 = get_cycles(); @@ -207,18 +212,18 @@ static unsigned long pit_calibrate_tsc(void) /* * Sanity checks: * - * If we were not able to read the PIT more than PIT_MIN_LOOPS + * If we were not able to read the PIT more than loopmin * times, then we have been hit by a massive SMI * * If the maximum is 10 times larger than the minimum, * then we got hit by an SMI as well. */ - if (pitcnt < CAL_PIT_LOOPS || tscmax > 10 * tscmin) + if (pitcnt < loopmin || tscmax > 10 * tscmin) return ULONG_MAX; /* Calculate the PIT value */ delta = t2 - t1; - do_div(delta, CAL_MS); + do_div(delta, ms); return delta; } @@ -230,8 +235,8 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags; - int hpet = is_hpet_enabled(), i; + unsigned long flags, latch, ms; + int hpet = is_hpet_enabled(), i, loopmin; /* * Run 5 calibration loops to get the lowest frequency value @@ -257,7 +262,13 @@ unsigned long native_calibrate_tsc(void) * calibration delay loop as we have to wait for a certain * amount of time anyway. */ - for (i = 0; i < 5; i++) { + + /* Preset PIT loop values */ + latch = CAL_LATCH; + ms = CAL_MS; + loopmin = CAL_PIT_LOOPS; + + for (i = 0; i < 3; i++) { unsigned long tsc_pit_khz; /* @@ -268,7 +279,7 @@ unsigned long native_calibrate_tsc(void) */ local_irq_save(flags); tsc1 = tsc_read_refs(&ref1, hpet); - tsc_pit_khz = pit_calibrate_tsc(); + tsc_pit_khz = pit_calibrate_tsc(latch, ms, loopmin); tsc2 = tsc_read_refs(&ref2, hpet); local_irq_restore(flags); @@ -290,6 +301,35 @@ unsigned long native_calibrate_tsc(void) tsc2 = calc_pmtimer_ref(tsc2, ref1, ref2); tsc_ref_min = min(tsc_ref_min, (unsigned long) tsc2); + + /* Check the reference deviation */ + delta = ((u64) tsc_pit_min) * 100; + do_div(delta, tsc_ref_min); + + /* + * If both calibration results are inside a 10% window + * then we can be sure, that the calibration + * succeeded. We break out of the loop right away. We + * use the reference value, as it is more precise. + */ + if (delta >= 90 && delta <= 110) { + printk(KERN_INFO + "TSC: PIT calibration matches %s. %d loops\n", + hpet ? "HPET" : "PMTIMER", i + 1); + return tsc_ref_min; + } + + /* + * Check whether PIT failed more than once. This + * happens in virtualized environments. We need to + * give the virtual PC a slightly longer timeframe for + * the HPET/PMTIMER to make the result precise. + */ + if (i == 1 && tsc_pit_min == ULONG_MAX) { + latch = CAL2_LATCH; + ms = CAL2_MS; + loopmin = CAL2_PIT_LOOPS; + } } /* @@ -309,7 +349,7 @@ unsigned long native_calibrate_tsc(void) /* The alternative source failed as well, disable TSC */ if (tsc_ref_min == ULONG_MAX) { printk(KERN_WARNING "TSC: HPET/PMTIMER calibration " - "failed due to SMI disturbance.\n"); + "failed.\n"); return 0; } @@ -328,37 +368,18 @@ unsigned long native_calibrate_tsc(void) /* The alternative source failed, use the PIT calibration value */ if (tsc_ref_min == ULONG_MAX) { - printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed due " - "to SMI disturbance. Using PIT calibration\n"); + printk(KERN_WARNING "TSC: HPET/PMTIMER calibration failed. " + "Using PIT calibration\n"); return tsc_pit_min; } - /* Check the reference deviation */ - delta = ((u64) tsc_pit_min) * 100; - do_div(delta, tsc_ref_min); - - /* - * If both calibration results are inside a 5% window, the we - * use the lower frequency of those as it is probably the - * closest estimate. - */ - if (delta >= 95 && delta <= 105) { - printk(KERN_INFO "TSC: PIT calibration confirmed by %s.\n", - hpet ? "HPET" : "PMTIMER"); - printk(KERN_INFO "TSC: using %s calibration value\n", - tsc_pit_min <= tsc_ref_min ? "PIT" : - hpet ? "HPET" : "PMTIMER"); - return tsc_pit_min <= tsc_ref_min ? tsc_pit_min : tsc_ref_min; - } - - printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", - hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); - /* * The calibration values differ too much. In doubt, we use * the PIT value as we know that there are PMTIMERs around - * running at double speed. + * running at double speed. At least we let the user know: */ + printk(KERN_WARNING "TSC: PIT calibration deviates from %s: %lu %lu.\n", + hpet ? "HPET" : "PMTIMER", tsc_pit_min, tsc_ref_min); printk(KERN_INFO "TSC: Using PIT calibration value\n"); return tsc_pit_min; } -- cgit v1.2.2 From 58f7c98850a226d3fb05b1095af9f7c4ea3507ba Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 28 Aug 2008 13:52:25 -0700 Subject: x86: split e820 reserved entries record to late v2 so could let BAR res register at first, or even pnp. v2: insert e820 reserve resources before pnp_system_init Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 20 +++++++++++-- arch/x86/pci/i386.c | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 97 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 291e6cd9f9c0..523d6c5605d1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1271,13 +1271,15 @@ static inline const char *e820_type_to_string(int e820_type) /* * Mark e820 reserved areas as busy for the resource manager. */ +struct resource __initdata *e820_res; void __init e820_reserve_resources(void) { int i; - struct resource *res; u64 end; + struct resource *res; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); + e820_res = res; for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; #ifndef CONFIG_RESOURCES_64BIT @@ -1291,7 +1293,8 @@ void __init e820_reserve_resources(void) res->end = end; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); + if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) + insert_resource(&iomem_resource, res); res++; } @@ -1303,6 +1306,19 @@ void __init e820_reserve_resources(void) } } +void __init e820_reserve_resources_late(void) +{ + int i; + struct resource *res; + + res = e820_res; + for (i = 0; i < e820.nr_map; i++) { + if (e820.map[i].type == E820_RESERVED && res->start >= (1ULL<<20)) + insert_resource(&iomem_resource, res); + res++; + } +} + char *__init default_machine_specific_memory_setup(void) { char *who = "BIOS-e820"; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 8791fc55e715..40811efaa25a 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -31,8 +31,12 @@ #include #include #include +#include #include +#include +#include +#include #include "pci.h" @@ -77,6 +81,77 @@ pcibios_align_resource(void *data, struct resource *res, } EXPORT_SYMBOL(pcibios_align_resource); +static int check_res_with_valid(struct pci_dev *dev, struct resource *res) +{ + unsigned long base; + unsigned long size; + int i; + + base = res->start; + size = (res->start == 0 && res->end == res->start) ? 0 : + (res->end - res->start + 1); + + if (!base || !size) + return 0; + +#ifdef CONFIG_HPET_TIMER + /* for hpet */ + if (base == hpet_address && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has HPET at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } +#endif + +#ifdef CONFIG_X86_IO_APIC + for (i = 0; i < nr_ioapics; i++) { + unsigned long ioapic_phys = mp_ioapics[i].mp_apicaddr; + + if (base == ioapic_phys && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has ioapic at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } + } +#endif + +#ifdef CONFIG_PCI_MMCONFIG + for (i = 0; i < pci_mmcfg_config_num; i++) { + unsigned long addr; + + addr = pci_mmcfg_config[i].address; + if (base == addr && (res->flags & IORESOURCE_MEM)) { + dev_info(&dev->dev, "BAR has MMCONFIG at %08lx-%08lx\n", + base, base + size - 1); + return 1; + } + } +#endif + + return 0; +} + +static int check_platform(struct pci_dev *dev, struct resource *res) +{ + struct resource *root = NULL; + + /* + * forcibly insert it into the + * resource tree + */ + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + else if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + + if (root && check_res_with_valid(dev, res)) { + insert_resource(root, res); + + return 1; + } + + return 0; +} /* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. @@ -128,6 +203,8 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { + if (check_platform(dev, r)) + continue; dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* * Something is wrong with the region. @@ -169,6 +246,8 @@ static void __init pcibios_allocate_resources(int pass) r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { + if (check_platform(dev, r)) + continue; dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; -- cgit v1.2.2 From a5444d15b611cf2ffe2bc52aaf11f2ac51882f89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Aug 2008 08:09:23 +0200 Subject: x86: split e820 reserved entries record to late v4 this one replaces: | commit a2bd7274b47124d2fc4dfdb8c0591f545ba749dd | Author: Yinghai Lu | Date: Mon Aug 25 00:56:08 2008 -0700 | | x86: fix HPET regression in 2.6.26 versus 2.6.25, check hpet against BAR, v3 v2: insert e820 reserve resources before pnp_system_init v3: fix merging problem in tip/x86/core v4: address Linus's review about comments and condition in _late() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 12 ++++++-- arch/x86/pci/i386.c | 80 ++------------------------------------------------ 2 files changed, 11 insertions(+), 81 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 523d6c5605d1..a7a71339bfb0 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1271,12 +1271,12 @@ static inline const char *e820_type_to_string(int e820_type) /* * Mark e820 reserved areas as busy for the resource manager. */ -struct resource __initdata *e820_res; +static struct resource __initdata *e820_res; void __init e820_reserve_resources(void) { int i; - u64 end; struct resource *res; + u64 end; res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); e820_res = res; @@ -1293,6 +1293,12 @@ void __init e820_reserve_resources(void) res->end = end; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; + + /* + * don't register the region that could be conflicted with + * pci device BAR resource and insert them later in + * pcibios_resource_survey() + */ if (e820.map[i].type != E820_RESERVED || res->start < (1ULL<<20)) insert_resource(&iomem_resource, res); res++; @@ -1313,7 +1319,7 @@ void __init e820_reserve_resources_late(void) res = e820_res; for (i = 0; i < e820.nr_map; i++) { - if (e820.map[i].type == E820_RESERVED && res->start >= (1ULL<<20)) + if (!res->parent && res->end) insert_resource(&iomem_resource, res); res++; } diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 40811efaa25a..844df0cbbd3e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -31,11 +31,8 @@ #include #include #include -#include #include -#include -#include #include #include "pci.h" @@ -81,77 +78,6 @@ pcibios_align_resource(void *data, struct resource *res, } EXPORT_SYMBOL(pcibios_align_resource); -static int check_res_with_valid(struct pci_dev *dev, struct resource *res) -{ - unsigned long base; - unsigned long size; - int i; - - base = res->start; - size = (res->start == 0 && res->end == res->start) ? 0 : - (res->end - res->start + 1); - - if (!base || !size) - return 0; - -#ifdef CONFIG_HPET_TIMER - /* for hpet */ - if (base == hpet_address && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has HPET at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } -#endif - -#ifdef CONFIG_X86_IO_APIC - for (i = 0; i < nr_ioapics; i++) { - unsigned long ioapic_phys = mp_ioapics[i].mp_apicaddr; - - if (base == ioapic_phys && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has ioapic at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } - } -#endif - -#ifdef CONFIG_PCI_MMCONFIG - for (i = 0; i < pci_mmcfg_config_num; i++) { - unsigned long addr; - - addr = pci_mmcfg_config[i].address; - if (base == addr && (res->flags & IORESOURCE_MEM)) { - dev_info(&dev->dev, "BAR has MMCONFIG at %08lx-%08lx\n", - base, base + size - 1); - return 1; - } - } -#endif - - return 0; -} - -static int check_platform(struct pci_dev *dev, struct resource *res) -{ - struct resource *root = NULL; - - /* - * forcibly insert it into the - * resource tree - */ - if (res->flags & IORESOURCE_MEM) - root = &iomem_resource; - else if (res->flags & IORESOURCE_IO) - root = &ioport_resource; - - if (root && check_res_with_valid(dev, res)) { - insert_resource(root, res); - - return 1; - } - - return 0; -} /* * Handle resources of PCI devices. If the world were perfect, we could * just allocate all the resource regions and do nothing more. It isn't. @@ -203,8 +129,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list) pr = pci_find_parent_resource(dev, r); if (!r->start || !pr || request_resource(pr, r) < 0) { - if (check_platform(dev, r)) - continue; dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* * Something is wrong with the region. @@ -246,8 +170,6 @@ static void __init pcibios_allocate_resources(int pass) r->flags, disabled, pass); pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { - if (check_platform(dev, r)) - continue; dev_err(&dev->dev, "BAR %d: can't allocate resource\n", idx); /* We'll assign a new address later */ r->end -= r->start; @@ -306,6 +228,8 @@ void __init pcibios_resource_survey(void) pcibios_allocate_bus_resources(&pci_root_buses); pcibios_allocate_resources(0); pcibios_allocate_resources(1); + + e820_reserve_resources_late(); } /** -- cgit v1.2.2 From fb481dd56adf3c5b0993b8f052cc9ba966e3959d Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Sep 2008 13:46:11 +0200 Subject: x86: drop -funroll-loops for csum_partial_64.c Impact: performance optimization I did some rebenchmarking with modern compilers and dropping -funroll-loops makes the function consistently go faster by a few percent. So drop that flag. Thanks to Richard Guenther for a hint. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/lib/Makefile | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index aa3fa4119424..55e11aa6d66c 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -17,9 +17,6 @@ ifeq ($(CONFIG_X86_32),y) lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o else obj-y += io_64.o iomap_copy_64.o - - CFLAGS_csum-partial_64.o := -funroll-loops - lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o lib-y += thunk_64.o clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o -- cgit v1.2.2 From dc44e65943169de2d1a1b494876f48a65a9737f1 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 4 Sep 2008 13:47:38 +0200 Subject: x86: capitalize function call interrupts consistently Impact: aestetic Capitalize function call interrupts consistently. All other descriptions in /proc/interrupts are capitalized except for "function call interrupts". Capitalize it too for consistency. While that's technically a published ABI I think the risk of anyone relying on that text to stay the same is negligible. Signed-off-by: Andi Kleen Signed-off-by: H. Peter Anvin --- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 1cf8c1fcc088..b71e02d42f4f 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -325,7 +325,7 @@ skip: for_each_online_cpu(j) seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " function call interrupts\n"); + seq_printf(p, " Function call interrupts\n"); seq_printf(p, "TLB: "); for_each_online_cpu(j) seq_printf(p, "%10u ", diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 1f78b238d8d2..f065fe9071b9 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -129,7 +129,7 @@ skip: seq_printf(p, "CAL: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " function call interrupts\n"); + seq_printf(p, " Function call interrupts\n"); seq_printf(p, "TLB: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); -- cgit v1.2.2 From fac8f1e4f99dff7a0c3a929f327d66f46de6fa21 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:59:22 +0200 Subject: x86: split e820 reserved entries record to late, v7 try to insert_resource second time, by expanding the resource... for case: e820 reserved entry is partially overlapped with bar res... hope it will never happen Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a7a71339bfb0..e24d1bc47b46 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1320,7 +1320,7 @@ void __init e820_reserve_resources_late(void) res = e820_res; for (i = 0; i < e820.nr_map; i++) { if (!res->parent && res->end) - insert_resource(&iomem_resource, res); + reserve_region_with_split(&iomem_resource, res->start, res->end, res->name); res++; } } -- cgit v1.2.2 From ebd60cd64f8ab1170102c3ab072eb73042b7a33d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:04:32 +0200 Subject: x86: unify using pci_mmcfg_insert_resource even with known_bridge insert them late too. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/pci/mmconfig-shared.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index d9635764ce3d..654a2234f8f3 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -209,7 +209,7 @@ static int __init pci_mmcfg_check_hostbridge(void) return name != NULL; } -static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) +static void __init pci_mmcfg_insert_resources(void) { #define PCI_MMCFG_RESOURCE_NAME_LEN 19 int i; @@ -233,7 +233,7 @@ static void __init pci_mmcfg_insert_resources(unsigned long resource_flags) cfg->pci_segment); res->start = cfg->address; res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | resource_flags; + res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; insert_resource(&iomem_resource, res); names += PCI_MMCFG_RESOURCE_NAME_LEN; } @@ -434,11 +434,9 @@ static void __init __pci_mmcfg_init(int early) (pci_mmcfg_config[0].address == 0)) return; - if (pci_mmcfg_arch_init()) { - if (known_bridge) - pci_mmcfg_insert_resources(IORESOURCE_BUSY); + if (pci_mmcfg_arch_init()) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; - } else { + else { /* * Signal not to attempt to insert mmcfg resources because * the architecture mmcfg setup could not initialize. @@ -475,7 +473,7 @@ static int __init pci_mmcfg_late_insert_resources(void) * marked so it won't cause request errors when __request_region is * called. */ - pci_mmcfg_insert_resources(0); + pci_mmcfg_insert_resources(); return 0; } -- cgit v1.2.2 From 5fef55fddb7317585cabc1eae38dbd57f1c59729 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: move mtrr cpu cap setting early in early_init_xxxx Krzysztof Helt found MTRR is not detected on k6-2 root cause: we moved mtrr_bp_init() early for mtrr trimming, and in early_detect we only read the CPU capability from cpuid, so some cpu doesn't have that bit in cpuid. So we need to add early_init_xxxx to preset those bit before mtrr_bp_init for those earlier cpus. this patch is for v2.6.27 Reported-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++---- arch/x86/kernel/cpu/centaur.c | 11 +++++++++++ arch/x86/kernel/cpu/cyrix.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc3031..18514ed26104 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a55..a0534c04d38a 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index ada50505a5c8..13f8fa16b815 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -416,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; -- cgit v1.2.2 From 5031088dbc0cd30e893eb27d53f0e0eee6eb1c00 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: delay early cpu initialization until cpuid is done Move early cpu initialization after cpu early get cap so the early cpu initialization can fix up cpu caps. Signed-off-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0785b3c8d043..8aab8517642e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -335,11 +335,11 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); - - early_get_cap(c); } /* -- cgit v1.2.2 From 3da99c97763703b23cbf2bd6e96252256d4e4617 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:44 +0200 Subject: x86: make (early)_identify_cpu more the same between 32bit and 64 bit 1. add extended_cpuid_level for 32bit 2. add generic_identify for 64bit 3. add early_identify_cpu for 32bit 4. early_identify_cpu not be called by identify_cpu 5. remove early in get_cpu_vendor for 32bit 6. add get_cpu_cap 7. add cpu_detect for 64bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 138 +++++++++++++++------------------------ arch/x86/kernel/cpu/common_64.c | 139 +++++++++++++++++++++++++--------------- 2 files changed, 141 insertions(+), 136 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8aab8517642e..96e1b8698d3a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -96,7 +96,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) unsigned int *v; char *p, *q; - if (cpuid_eax(0x80000000) < 0x80000004) + if (c->extended_cpuid_level < 0x80000004) return 0; v = (unsigned int *) c->x86_model_id; @@ -125,7 +125,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ecx, edx, l2size; - n = cpuid_eax(0x80000000); + n = c->extended_cpuid_level; if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); @@ -186,7 +186,7 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) } -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) +static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { char *v = c->x86_vendor_id; int i; @@ -198,8 +198,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c, int early) (cpu_devs[i]->c_ident[1] && !strcmp(v, cpu_devs[i]->c_ident[1]))) { c->x86_vendor = i; - if (!early) - this_cpu = cpu_devs[i]; + this_cpu = cpu_devs[i]; return; } } @@ -284,34 +283,30 @@ void __init cpu_detect(struct cpuinfo_x86 *c) } } } -static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) + +static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; - unsigned int ebx; + u32 ebx; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (have_cpuid_p()) { - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - } + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } + /* AMD-defined flags: level 0x80000001 */ + xlvl = cpuid_eax(0x80000000); + c->extended_cpuid_level = xlvl; + if ((xlvl & 0xffff0000) == 0x80000000) { + if (xlvl >= 0x80000001) { + c->x86_capability[1] = cpuid_edx(0x80000001); + c->x86_capability[6] = cpuid_ecx(0x80000001); } - } - } - /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -321,25 +316,29 @@ static void __cpuinit early_get_cap(struct cpuinfo_x86 *c) * WARNING: this function is only called on the BP. Don't add code here * that is supposed to run on all CPUs. */ -static void __init early_cpu_detect(void) +static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - struct cpuinfo_x86 *c = &boot_cpu_data; - c->x86_cache_alignment = 32; c->x86_clflush_size = 32; if (!have_cpuid_p()) return; + c->extended_cpuid_level = 0; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + cpu_detect(c); - get_cpu_vendor(c, 1); + get_cpu_vendor(c); - early_get_cap(c); + get_cpu_cap(c); if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); + + validate_pat_support(c); } /* @@ -373,60 +372,32 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { - u32 tfms, xlvl; - unsigned int ebx; - - if (have_cpuid_p()) { - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - get_cpu_vendor(c, 0); - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; - c->initial_apicid = (ebx >> 24) & 0xFF; + if (!have_cpuid_p()) + return; + + c->extended_cpuid_level = 0; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; #ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); - c->phys_proc_id = c->initial_apicid; + c->apicid = phys_pkg_id(c->initial_apicid, 0); + c->phys_proc_id = c->initial_apicid; #else - c->apicid = c->initial_apicid; + c->apicid = c->initial_apicid; #endif - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) - c->x86_clflush_size = ((ebx >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; - } + } - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ - } + if (c->extended_cpuid_level >= 0x80000004) + get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); - detect_nopl(c); - } + init_scattered_cpuid_features(c); + detect_nopl(c); } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) @@ -651,13 +622,10 @@ void __init early_cpu_init(void) { struct cpu_vendor_dev *cvdev; - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) + for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_detect(); - validate_pat_support(&boot_cpu_data); + early_identify_cpu(&boot_cpu_data); } /* Make sure %fs is initialized properly in idle threads */ diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 699ecbfb63ec..28719fe07941 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -195,6 +195,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) printk(KERN_ERR "CPU: Your system may be unstable.\n"); } c->x86_vendor = X86_VENDOR_UNKNOWN; + this_cpu = &default_cpu; } static void __init early_cpu_support_print(void) @@ -249,56 +250,18 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) } } -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); - -void __init early_cpu_init(void) +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start ; - cvdev < __x86cpuvendor_end ; - cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - early_cpu_support_print(); - early_identify_cpu(&boot_cpu_data); -} - -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ -static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; - c->x86_max_cores = 1; - c->x86_coreid_bits = 0; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, (unsigned int *)&c->x86_vendor_id[0], (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); - get_cpu_vendor(c); - - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { - __u32 misc; - cpuid(0x00000001, &tfms, &misc, &c->x86_capability[4], - &c->x86_capability[0]); + u32 junk, tfms, cap0, misc; + cpuid(0x00000001, &tfms, &misc, &junk, &cap0); c->x86 = (tfms >> 8) & 0xf; c->x86_model = (tfms >> 4) & 0xf; c->x86_mask = tfms & 0xf; @@ -306,17 +269,32 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (test_cpu_cap(c, X86_FEATURE_CLFLSH)) + if (cap0 & (1<<19)) c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; } else { /* Have CPUID level 0 only - unheard of */ c->x86 = 4; } +} + + +static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) +{ + u32 tfms, xlvl; + u32 ebx; + + /* Initialize the standard set of capabilities */ + /* Note that the vendor-specific code below might override */ + + /* Intel-defined flags: level 0x00000001 */ + if (c->cpuid_level >= 0x00000001) { + u32 capability, excap; + + cpuid(0x00000001, &tfms, &ebx, &excap, &capability); + c->x86_capability[0] = capability; + c->x86_capability[4] = excap; + } - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; -#endif /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -325,8 +303,6 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[1] = cpuid_edx(0x80000001); c->x86_capability[6] = cpuid_ecx(0x80000001); } - if (xlvl >= 0x80000004) - get_model_name(c); /* Default name */ } /* Transmeta-defined flags: level 0x80860001 */ @@ -346,8 +322,26 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } +} - detect_nopl(c); +/* Do some early cpuid on the boot CPU to get some parameter that are + needed before check_bugs. Everything advanced is in identify_cpu + below. */ +static void __init early_identify_cpu(struct cpuinfo_x86 *c) +{ + + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + + memset(&c->x86_capability, 0, sizeof c->x86_capability); + + c->extended_cpuid_level = 0; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) @@ -356,6 +350,39 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) validate_pat_support(c); } +void __init early_cpu_init(void) +{ + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + + early_cpu_support_print(); + early_identify_cpu(&boot_cpu_data); +} + +static void __cpuinit generic_identify(struct cpuinfo_x86 *c) +{ + c->extended_cpuid_level = 0; + + cpu_detect(c); + + get_cpu_vendor(c); + + get_cpu_cap(c); + + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; +#ifdef CONFIG_SMP + c->phys_proc_id = c->initial_apicid; +#endif + + if (c->extended_cpuid_level >= 0x80000004) + get_model_name(c); /* Default name */ + + init_scattered_cpuid_features(c); + detect_nopl(c); +} + /* * This does the hard work of actually picking apart the CPU stuff... */ @@ -363,9 +390,19 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) { int i; - early_identify_cpu(c); + c->loops_per_jiffy = loops_per_jiffy; + c->x86_cache_size = -1; + c->x86_vendor = X86_VENDOR_UNKNOWN; + c->x86_model = c->x86_mask = 0; /* So far unknown... */ + c->x86_vendor_id[0] = '\0'; /* Unset */ + c->x86_model_id[0] = '\0'; /* Unset */ + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; + c->x86_max_cores = 1; + c->x86_coreid_bits = 0; + memset(&c->x86_capability, 0, sizeof c->x86_capability); - init_scattered_cpuid_features(c); + generic_identify(c); c->apicid = phys_pkg_id(0); -- cgit v1.2.2 From 9d31d35b5f9d619bb2482235cc889326de049e29 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:44 +0200 Subject: x86: order functions in cpu/common.c and cpu/common_64.c v2 v2: make 64 bit get c->x86_cache_alignment = c->x86_clfush_size Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 190 ++++++++++++++++++++++------------------ arch/x86/kernel/cpu/common_64.c | 106 +++++++++++----------- 2 files changed, 156 insertions(+), 140 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 96e1b8698d3a..10e89ae5a600 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -60,6 +60,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +/* Current gdt points %fs at the "master" per-cpu area: after this, + * it's on the real one. */ +void switch_to_new_gdt(void) +{ + struct desc_ptr gdt_descr; + + gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); + gdt_descr.size = GDT_SIZE - 1; + load_gdt(&gdt_descr); + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +} + static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -123,15 +135,15 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, ecx, edx, l2size; + unsigned int n, dummy, ebx, ecx, edx, l2size; n = c->extended_cpuid_level; if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &dummy, &ecx, &edx); + cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24)+(edx>>24); + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + c->x86_cache_size = (ecx>>24) + (edx>>24); } if (n < 0x80000006) /* Some chips just has a large L1. */ @@ -185,6 +197,51 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } +#ifdef CONFIG_X86_HT +void __cpuinit detect_ht(struct cpuinfo_x86 *c) +{ + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; + + cpuid(1, &eax, &ebx, &ecx, &edx); + + if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + return; + + smp_num_siblings = (ebx & 0xff0000) >> 16; + + if (smp_num_siblings == 1) { + printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); + } else if (smp_num_siblings > 1) { + + if (smp_num_siblings > NR_CPUS) { + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); + smp_num_siblings = 1; + return; + } + + index_msb = get_count_order(smp_num_siblings); + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); + + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + + smp_num_siblings = smp_num_siblings / c->x86_max_cores; + + index_msb = get_count_order(smp_num_siblings); + + core_bits = get_count_order(c->x86_max_cores); + + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); + + if (c->x86_max_cores > 1) + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); + } +} +#endif static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { @@ -258,7 +315,26 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -void __init cpu_detect(struct cpuinfo_x86 *c) +static void __init early_cpu_support_print(void) +{ + int i,j; + struct cpu_dev *cpu_devx; + + printk("KERNEL supported cpus:\n"); + for (i = 0; i < X86_VENDOR_NUM; i++) { + cpu_devx = cpu_devs[i]; + if (!cpu_devx) + continue; + for (j = 0; j < 2; j++) { + if (!cpu_devx->c_ident[j]) + continue; + printk(" %s %s\n", cpu_devx->c_vendor, + cpu_devx->c_ident[j]); + } + } +} + +void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ cpuid(0x00000000, (unsigned int *)&c->cpuid_level, @@ -267,19 +343,20 @@ void __init cpu_detect(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[4]); c->x86 = 4; + /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 15; - c->x86_model = (tfms >> 4) & 15; + c->x86 = (tfms >> 8) & 0xf; + c->x86_model = (tfms >> 4) & 0xf; + c->x86_mask = tfms & 0xf; if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - c->x86_mask = tfms & 15; + c->x86_model += ((tfms >> 16) & 0xf) << 4; if (cap0 & (1<<19)) { - c->x86_cache_alignment = ((misc >> 8) & 0xff) * 8; c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; + c->x86_cache_alignment = c->x86_clflush_size; } } } @@ -341,6 +418,17 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) validate_pat_support(c); } +void __init early_cpu_init(void) +{ + struct cpu_vendor_dev *cvdev; + + for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) + cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + + early_cpu_support_print(); + early_identify_cpu(&boot_cpu_data); +} + /* * The NOPL instruction is supposed to exist on all CPUs with * family >= 6, unfortunately, that's not true in practice because @@ -500,7 +588,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) */ if (c != &boot_cpu_data) { /* AND the already accumulated flags with these */ - for (i = 0 ; i < NCAPINTS ; i++) + for (i = 0; i < NCAPINTS; i++) boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; } @@ -529,52 +617,6 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); } -#ifdef CONFIG_X86_HT -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) - return; - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of the " - "siblings %d", smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings) ; - - core_bits = get_count_order(c->x86_max_cores); - - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); - - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } -} -#endif - static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLSH); @@ -592,17 +634,17 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) vendor = c->x86_vendor_id; if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk("%s ", vendor); + printk(KERN_CONT "%s ", vendor); - if (!c->x86_model_id[0]) - printk("%d86", c->x86); + if (c->x86_model_id[0]) + printk(KERN_CONT "%s", c->x86_model_id); else - printk("%s", c->x86_model_id); + printk(KERN_CONT "%d86", c->x86); if (c->x86_mask || c->cpuid_level >= 0) - printk(" stepping %02x\n", c->x86_mask); + printk(KERN_CONT " stepping %02x\n", c->x86_mask); else - printk("\n"); + printk(KERN_CONT "\n"); } static __init int setup_disablecpuid(char *arg) @@ -618,16 +660,6 @@ __setup("clearcpuid=", setup_disablecpuid); cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; -void __init early_cpu_init(void) -{ - struct cpu_vendor_dev *cvdev; - - for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; - - early_identify_cpu(&boot_cpu_data); -} - /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { @@ -636,18 +668,6 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) return regs; } -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -} - /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 28719fe07941..522a5f2e405d 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -103,9 +103,8 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n >= 0x80000005) { cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), " - "D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); + printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", + edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; @@ -143,8 +142,8 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } else if (smp_num_siblings > 1) { if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of " - "siblings %d", smp_num_siblings); + printk(KERN_WARNING "CPU: Unsupported number of siblings %d", + smp_num_siblings); smp_num_siblings = 1; return; } @@ -182,7 +181,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (cpu_devs[i]) { if (!strcmp(v, cpu_devs[i]->c_ident[0]) || (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { + !strcmp(v, cpu_devs[i]->c_ident[1]))) { c->x86_vendor = i; this_cpu = cpu_devs[i]; return; @@ -217,39 +216,6 @@ static void __init early_cpu_support_print(void) } } -/* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6, unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. Hence, probe for it based on first - * principles. - * - * Note: no 64-bit chip is known to lack these, but put the code here - * for consistency with 32 bits, and to make it utterly trivial to - * diagnose the problem should it ever surface. - */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) -{ - const u32 nopl_signature = 0x888c53b1; /* Random number */ - u32 has_nopl = nopl_signature; - - clear_cpu_cap(c, X86_FEATURE_NOPL); - if (c->x86 >= 6) { - asm volatile("\n" - "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ - "2:\n" - " .section .fixup,\"ax\"\n" - "3: xor %0,%0\n" - " jmp 2b\n" - " .previous\n" - _ASM_EXTABLE(1b,3b) - : "+a" (has_nopl)); - - if (has_nopl == nopl_signature) - set_cpu_cap(c, X86_FEATURE_NOPL); - } -} - void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ @@ -258,6 +224,7 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) (unsigned int *)&c->x86_vendor_id[8], (unsigned int *)&c->x86_vendor_id[4]); + c->x86 = 4; /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 junk, tfms, cap0, misc; @@ -268,12 +235,11 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) if (c->x86 == 0xf) c->x86 += (tfms >> 20) & 0xff; if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xF) << 4; - if (cap0 & (1<<19)) + c->x86_model += ((tfms >> 16) & 0xf) << 4; + if (cap0 & (1<<19)) { c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - } else { - /* Have CPUID level 0 only - unheard of */ - c->x86 = 4; + c->x86_cache_alignment = c->x86_clflush_size; + } } } @@ -283,9 +249,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) u32 tfms, xlvl; u32 ebx; - /* Initialize the standard set of capabilities */ - /* Note that the vendor-specific code below might override */ - /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 capability, excap; @@ -361,6 +324,39 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; @@ -448,7 +444,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) } -void __cpuinit identify_boot_cpu(void) +void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); } @@ -460,13 +456,6 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); } -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - struct msr_range { unsigned min; unsigned max; @@ -510,6 +499,13 @@ static __init int setup_show_msr(char *arg) } __setup("show_msr=", setup_show_msr); +static __init int setup_noclflush(char *arg) +{ + setup_clear_cpu_cap(X86_FEATURE_CLFLSH); + return 1; +} +__setup("noclflush", setup_noclflush); + void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { if (c->x86_model_id[0]) -- cgit v1.2.2 From 10a434fcb23a57c385177a0086955fae01003f64 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:45 +0200 Subject: x86: remove cpu_vendor_dev 1. add c_x86_vendor into cpu_dev 2. change cpu_devs to static 3. check c_x86_vendor before put that cpu_dev into array 4. remove alignment for 64bit 5. order the sequence in cpu_devs according to link sequence... so could put intel at first, then amd... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 4 +-- arch/x86/kernel/cpu/amd.c | 3 +- arch/x86/kernel/cpu/amd_64.c | 4 +-- arch/x86/kernel/cpu/centaur.c | 3 +- arch/x86/kernel/cpu/centaur_64.c | 3 +- arch/x86/kernel/cpu/common.c | 71 ++++++++++++++++++++-------------------- arch/x86/kernel/cpu/common_64.c | 71 ++++++++++++++++++++-------------------- arch/x86/kernel/cpu/cpu.h | 18 ++++------ arch/x86/kernel/cpu/cyrix.c | 6 ++-- arch/x86/kernel/cpu/intel.c | 3 +- arch/x86/kernel/cpu/intel_64.c | 3 +- arch/x86/kernel/cpu/transmeta.c | 3 +- arch/x86/kernel/cpu/umc.c | 3 +- arch/x86/kernel/vmlinux_32.lds.S | 8 ++--- arch/x86/kernel/vmlinux_64.lds.S | 9 +++-- 15 files changed, 106 insertions(+), 106 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 3ede19a4e0b2..403e689df0b8 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,14 +8,14 @@ obj-y += proc.o capflags.o powerflags.o obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += common_64.o bugs_64.o +obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o +obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o -obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o -obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o obj-$(CONFIG_X86_MCE) += mcheck/ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 18514ed26104..d64ea6097ca7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -298,6 +298,7 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_early_init = early_init_amd, .c_init = init_amd, .c_size_cache = amd_size_cache, + .c_x86_vendor = X86_VENDOR_AMD, }; -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); +cpu_dev_register(amd_cpu_dev); diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index d1692b2a41ff..d1c721c0c49f 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -218,7 +218,7 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_ident = { "AuthenticAMD" }, .c_early_init = early_init_amd, .c_init = init_amd, + .c_x86_vendor = X86_VENDOR_AMD, }; -cpu_vendor_dev_register(X86_VENDOR_AMD, &amd_cpu_dev); - +cpu_dev_register(amd_cpu_dev); diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index a0534c04d38a..e5f6d89521bf 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -475,6 +475,7 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 1d181c40e2e1..49cfc6d2f2fb 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -29,7 +29,8 @@ static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_ident = { "CentaurHauls" }, .c_early_init = early_init_centaur, .c_init = init_centaur, + .c_x86_vendor = X86_VENDOR_CENTAUR, }; -cpu_vendor_dev_register(X86_VENDOR_CENTAUR, ¢aur_cpu_dev); +cpu_dev_register(centaur_cpu_dev); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 10e89ae5a600..ee1044ca481d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -75,7 +75,7 @@ void switch_to_new_gdt(void) static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { @@ -93,8 +93,9 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; +static struct cpu_dev *this_cpu __cpuinitdata; static int __init cachesize_setup(char *str) { @@ -250,21 +251,24 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; } } + if (!printed) { printed++; printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } + c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } @@ -315,25 +319,6 @@ static int __cpuinit have_cpuid_p(void) return flag_is_changeable_p(X86_EFLAGS_ID); } -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) { - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ @@ -411,21 +396,35 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); validate_pat_support(c); } void __init early_cpu_init(void) { - struct cpu_vendor_dev *cvdev; + struct cpu_dev **cdev; + int count = 0; + + printk("KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + struct cpu_dev *cpudev = *cdev; + unsigned int j; - for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } - early_cpu_support_print(); early_identify_cpu(&boot_cpu_data); } diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 522a5f2e405d..b82479c10835 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -66,7 +66,7 @@ void switch_to_new_gdt(void) load_gdt(&gdt_descr); } -struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; +static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { @@ -76,8 +76,9 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) static struct cpu_dev __cpuinitdata default_cpu = { .c_init = default_init, .c_vendor = "Unknown", + .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata = &default_cpu; +static struct cpu_dev *this_cpu __cpuinitdata; int __cpuinit get_model_name(struct cpuinfo_x86 *c) { @@ -178,44 +179,28 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) static int printed; for (i = 0; i < X86_VENDOR_NUM; i++) { - if (cpu_devs[i]) { - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - c->x86_vendor = i; - this_cpu = cpu_devs[i]; - return; - } + if (!cpu_devs[i]) + break; + + if (!strcmp(v, cpu_devs[i]->c_ident[0]) || + (cpu_devs[i]->c_ident[1] && + !strcmp(v, cpu_devs[i]->c_ident[1]))) { + this_cpu = cpu_devs[i]; + c->x86_vendor = this_cpu->c_x86_vendor; + return; } } + if (!printed) { printed++; printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } + c->x86_vendor = X86_VENDOR_UNKNOWN; this_cpu = &default_cpu; } -static void __init early_cpu_support_print(void) -{ - int i,j; - struct cpu_dev *cpu_devx; - - printk("KERNEL supported cpus:\n"); - for (i = 0; i < X86_VENDOR_NUM; i++) { - cpu_devx = cpu_devs[i]; - if (!cpu_devx) - continue; - for (j = 0; j < 2; j++) { - if (!cpu_devx->c_ident[j]) - continue; - printk(" %s %s\n", cpu_devx->c_vendor, - cpu_devx->c_ident[j]); - } - } -} - void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ @@ -306,21 +291,35 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) get_cpu_cap(c); - if (c->x86_vendor != X86_VENDOR_UNKNOWN && - cpu_devs[c->x86_vendor]->c_early_init) - cpu_devs[c->x86_vendor]->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); validate_pat_support(c); } void __init early_cpu_init(void) { - struct cpu_vendor_dev *cvdev; + struct cpu_dev **cdev; + int count = 0; + + printk("KERNEL supported cpus:\n"); + for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { + struct cpu_dev *cpudev = *cdev; + unsigned int j; - for (cvdev = __x86cpuvendor_start; cvdev < __x86cpuvendor_end; cvdev++) - cpu_devs[cvdev->vendor] = cvdev->cpu_dev; + if (count >= X86_VENDOR_NUM) + break; + cpu_devs[count] = cpudev; + count++; + + for (j = 0; j < 2; j++) { + if (!cpudev->c_ident[j]) + continue; + printk(" %s %s\n", cpudev->c_vendor, + cpudev->c_ident[j]); + } + } - early_cpu_support_print(); early_identify_cpu(&boot_cpu_data); } diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 4d894e8565fe..3cc9d92afd8f 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -21,21 +21,15 @@ struct cpu_dev { void (*c_init)(struct cpuinfo_x86 * c); void (*c_identify)(struct cpuinfo_x86 * c); unsigned int (*c_size_cache)(struct cpuinfo_x86 * c, unsigned int size); + int c_x86_vendor; }; -extern struct cpu_dev * cpu_devs [X86_VENDOR_NUM]; +#define cpu_dev_register(cpu_devX) \ + static struct cpu_dev *__cpu_dev_##cpu_devX __used \ + __attribute__((__section__(".x86_cpu_dev.init"))) = \ + &cpu_devX; -struct cpu_vendor_dev { - int vendor; - struct cpu_dev *cpu_dev; -}; - -#define cpu_vendor_dev_register(cpu_vendor_id, cpu_dev) \ - static struct cpu_vendor_dev __cpu_vendor_dev_##cpu_vendor_id __used \ - __attribute__((__section__(".x86cpuvendor.init"))) = \ - { cpu_vendor_id, cpu_dev } - -extern struct cpu_vendor_dev __x86cpuvendor_start[], __x86cpuvendor_end[]; +extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 13f8fa16b815..3f8c7283d816 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -442,14 +442,16 @@ static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, + .c_x86_vendor = X86_VENDOR_CYRIX, }; -cpu_vendor_dev_register(X86_VENDOR_CYRIX, &cyrix_cpu_dev); +cpu_dev_register(cyrix_cpu_dev); static struct cpu_dev nsc_cpu_dev __cpuinitdata = { .c_vendor = "NSC", .c_ident = { "Geode by NSC" }, .c_init = init_nsc, + .c_x86_vendor = X86_VENDOR_NSC, }; -cpu_vendor_dev_register(X86_VENDOR_NSC, &nsc_cpu_dev); +cpu_dev_register(nsc_cpu_dev); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 77618c717d76..c5ac08124adc 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -303,9 +303,10 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_early_init = early_init_intel, .c_init = init_intel, .c_size_cache = intel_size_cache, + .c_x86_vendor = X86_VENDOR_INTEL, }; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); +cpu_dev_register(intel_cpu_dev); /* arch_initcall(intel_cpu_init); */ diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index 1019c58d39f0..0a8128a240df 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -90,6 +90,7 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_ident = { "GenuineIntel" }, .c_early_init = early_init_intel, .c_init = init_intel, + .c_x86_vendor = X86_VENDOR_INTEL, }; -cpu_vendor_dev_register(X86_VENDOR_INTEL, &intel_cpu_dev); +cpu_dev_register(intel_cpu_dev); diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index b911a2c61b8f..7c46e6ecedca 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -102,6 +102,7 @@ static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_ident = { "GenuineTMx86", "TransmetaCPU" }, .c_init = init_transmeta, .c_identify = transmeta_identify, + .c_x86_vendor = X86_VENDOR_TRANSMETA, }; -cpu_vendor_dev_register(X86_VENDOR_TRANSMETA, &transmeta_cpu_dev); +cpu_dev_register(transmeta_cpu_dev); diff --git a/arch/x86/kernel/cpu/umc.c b/arch/x86/kernel/cpu/umc.c index b1fc90989d75..e777f79e0960 100644 --- a/arch/x86/kernel/cpu/umc.c +++ b/arch/x86/kernel/cpu/umc.c @@ -19,7 +19,8 @@ static struct cpu_dev umc_cpu_dev __cpuinitdata = { } }, }, + .c_x86_vendor = X86_VENDOR_UMC, }; -cpu_vendor_dev_register(X86_VENDOR_UMC, &umc_cpu_dev); +cpu_dev_register(umc_cpu_dev); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index af5bdad84604..21b4f7eefaab 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -140,10 +140,10 @@ SECTIONS *(.con_initcall.init) __con_initcall_end = .; } - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - __x86cpuvendor_start = .; - *(.x86cpuvendor.init) - __x86cpuvendor_end = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; } SECURITY_INIT . = ALIGN(4); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 63e5c1a22e88..201e81a91a95 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -168,13 +168,12 @@ SECTIONS *(.con_initcall.init) } __con_initcall_end = .; - . = ALIGN(16); - __x86cpuvendor_start = .; - .x86cpuvendor.init : AT(ADDR(.x86cpuvendor.init) - LOAD_OFFSET) { - *(.x86cpuvendor.init) + __x86_cpu_dev_start = .; + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + *(.x86_cpu_dev.init) } - __x86cpuvendor_end = .; SECURITY_INIT + __x86_cpu_dev_end = .; . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { -- cgit v1.2.2 From a0854a46c5eb82588126421a9cbceb973384a644 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:46 +0200 Subject: x86: make 32bit support show_msr like 64 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 51 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ee1044ca481d..a79cf5c52b6a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -616,6 +616,49 @@ void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); } +struct msr_range { + unsigned min; + unsigned max; +}; + +static struct msr_range msr_range_array[] __cpuinitdata = { + { 0x00000000, 0x00000418}, + { 0xc0000000, 0xc000040b}, + { 0xc0010000, 0xc0010142}, + { 0xc0011000, 0xc001103b}, +}; + +static void __cpuinit print_cpu_msr(void) +{ + unsigned index; + u64 val; + int i; + unsigned index_min, index_max; + + for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { + index_min = msr_range_array[i].min; + index_max = msr_range_array[i].max; + for (index = index_min; index < index_max; index++) { + if (rdmsrl_amd_safe(index, &val)) + continue; + printk(KERN_INFO " MSR%08x: %016llx\n", index, val); + } + } +} + +static int show_msr __cpuinitdata; +static __init int setup_show_msr(char *arg) +{ + int num; + + get_option(&arg, &num); + + if (num > 0) + show_msr = num; + return 1; +} +__setup("show_msr=", setup_show_msr); + static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLSH); @@ -644,6 +687,14 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) printk(KERN_CONT " stepping %02x\n", c->x86_mask); else printk(KERN_CONT "\n"); + +#ifdef CONFIG_SMP + if (c->cpu_index < show_msr) + print_cpu_msr(); +#else + if (show_msr) + print_cpu_msr(); +#endif } static __init int setup_disablecpuid(char *arg) -- cgit v1.2.2 From 01b2e16a7a9be6573cba5d594d6659b3c6cb46a0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:46 +0200 Subject: x86: make get_mode_name of 64bit the same as 32bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index b82479c10835..f1fb94e766bb 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -83,6 +83,7 @@ static struct cpu_dev *this_cpu __cpuinitdata; int __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; + char *p, *q; if (c->extended_cpuid_level < 0x80000004) return 0; @@ -92,6 +93,19 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); c->x86_model_id[48] = 0; + + /* Intel chips right-justify this string for some dumb reason; + undo that brain damage */ + p = q = &c->x86_model_id[0]; + while (*p == ' ') + p++; + if (p != q) { + while (*p) + *q++ = *p++; + while (q <= &c->x86_model_id[48]) + *q++ = '\0'; /* Zero-pad the rest */ + } + return 1; } -- cgit v1.2.2 From 0a488a53d7ca46ac638c30079072c57e50cfcc7b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:47 +0200 Subject: x86: move 32bit related functions together Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 251 ++++++++++++++++++++-------------------- arch/x86/kernel/cpu/common_64.c | 34 +++--- 2 files changed, 143 insertions(+), 142 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a79cf5c52b6a..2b2c170e8d69 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -22,6 +22,8 @@ #include "cpu.h" +static struct cpu_dev *this_cpu __cpuinitdata; + DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, @@ -58,6 +60,109 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +static int cachesize_override __cpuinitdata = -1; +static int disable_x86_serial_nr __cpuinitdata = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + asm("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, @@ -72,9 +177,6 @@ void switch_to_new_gdt(void) asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); } -static int cachesize_override __cpuinitdata = -1; -static int disable_x86_serial_nr __cpuinitdata = 1; - static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) @@ -95,14 +197,6 @@ static struct cpu_dev __cpuinitdata default_cpu = { .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); int __cpuinit get_model_name(struct cpuinfo_x86 *c) { @@ -133,7 +227,6 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) return 1; } - void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -150,7 +243,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (n < 0x80000006) /* Some chips just has a large L1. */ return; - ecx = cpuid_ecx(0x80000006); + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; /* do processor-specific cache resizing */ @@ -167,48 +260,23 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) c->x86_cache_size = l2size; printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); -} - -/* - * Naming convention should be: [()] - * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info; - - if (c->x86_model >= 16) - return NULL; /* Range check */ - - if (!this_cpu) - return NULL; - - info = this_cpu->c_models; - - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; - } - return NULL; /* Not found */ + l2size, ecx & 0xFF); } #ifdef CONFIG_X86_HT void __cpuinit detect_ht(struct cpuinfo_x86 *c) { - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - cpuid(1, &eax, &ebx, &ecx, &edx); + u32 eax, ebx, ecx, edx; + int index_msb, core_bits; - if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY)) + if (!cpu_has(c, X86_FEATURE_HT)) return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) + goto out; + + cpuid(1, &eax, &ebx, &ecx, &edx); + smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { @@ -225,8 +293,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) index_msb = get_count_order(smp_num_siblings); c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -236,10 +302,14 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); + } - if (c->x86_max_cores > 1) - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); +out: + if ((c->x86_max_cores * smp_num_siblings) > 1) { + printk(KERN_INFO "CPU: Physical Processor ID: %d\n", + c->phys_proc_id); + printk(KERN_INFO "CPU: Processor Core ID: %d\n", + c->cpu_core_id); } } #endif @@ -273,52 +343,6 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) this_cpu = &default_cpu; } - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - - -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - - -/* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - void __cpuinit cpu_detect(struct cpuinfo_x86 *c) { /* Get vendor name */ @@ -380,16 +404,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - c->x86_cache_alignment = 32; c->x86_clflush_size = 32; + c->x86_cache_alignment = c->x86_clflush_size; if (!have_cpuid_p()) return; - c->extended_cpuid_level = 0; - memset(&c->x86_capability, 0, sizeof c->x86_capability); + c->extended_cpuid_level = 0; + cpu_detect(c); get_cpu_vendor(c); @@ -487,31 +511,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) detect_nopl(c); } -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } -} - -static int __init x86_serial_nr_setup(char *s) -{ - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); - - - /* * This does the hard work of actually picking apart the CPU stuff... */ diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index f1fb94e766bb..b2da04135326 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -37,6 +37,8 @@ #include "cpu.h" +static struct cpu_dev *this_cpu __cpuinitdata; + /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout @@ -78,7 +80,6 @@ static struct cpu_dev __cpuinitdata default_cpu = { .c_vendor = "Unknown", .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -static struct cpu_dev *this_cpu __cpuinitdata; int __cpuinit get_model_name(struct cpuinfo_x86 *c) { @@ -112,7 +113,7 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { - unsigned int n, dummy, ebx, ecx, edx; + unsigned int n, dummy, ebx, ecx, edx, l2size; n = c->extended_cpuid_level; @@ -125,15 +126,17 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) c->x86_tlbsize = 0; } - if (n >= 0x80000006) { - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - ecx = cpuid_ecx(0x80000006); - c->x86_cache_size = ecx >> 16; - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + if (n < 0x80000006) /* Some chips just has a large L1. */ + return; - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - c->x86_cache_size, ecx & 0xFF); - } + cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); + l2size = ecx >> 16; + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); + + c->x86_cache_size = l2size; + + printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", + l2size, ecx & 0xFF); } void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -142,14 +145,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) u32 eax, ebx, ecx, edx; int index_msb, core_bits; - cpuid(1, &eax, &ebx, &ecx, &edx); - - if (!cpu_has(c, X86_FEATURE_HT)) return; if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; + cpuid(1, &eax, &ebx, &ecx, &edx); + smp_num_siblings = (ebx & 0xff0000) >> 16; if (smp_num_siblings == 1) { @@ -175,6 +177,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); } + out: if ((c->x86_max_cores * smp_num_siblings) > 1) { printk(KERN_INFO "CPU: Physical Processor ID: %d\n", @@ -182,7 +185,6 @@ out: printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); } - #endif } @@ -405,10 +407,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_clflush_size = 64; - c->x86_cache_alignment = c->x86_clflush_size; c->x86_max_cores = 1; c->x86_coreid_bits = 0; + c->x86_clflush_size = 64; + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); generic_identify(c); -- cgit v1.2.2 From 6ac40ed0413ef4096720f966e11c7cdf259eee3f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 4 Sep 2008 10:41:22 -0700 Subject: x86: quick TSC calibration Introduce a fast TSC-calibration method on sane hardware. It only uses 17920 PIT timer ticks to calibrate the TSC, plus 256 ticks on each side to make sure the TSC values were very close to the tick, so the whole calibration takes 15ms. Yet, despite only takign 15ms, we can actually give pretty stringent guarantees of accuracy: - the code requires that we hit each 256-counter block at least 50 times, so the TSC error is basically at *MOST* just a few PIT cycles off in any direction. In practice, it's going to be about one microseconds off (which is how long it takes to read the counter) - so over 17920 PIT cycles, we can pretty much guarantee that the calibration error is less than one half of a percent. My testing bears this out: on my machine, the quick-calibration reports 2934.085kHz, while the slow one reports 2933.415. Yes, the slower calibration is still more precise. For me, the slow calibration is stable to within about one hundreth of a percent, so it's (at a guess) roughly an order-and-a-half of magnitude more precise. The longer you wait, the more precise you can be. However, the nice thing about the fast TSC PIT synchronization is that it's pretty much _guaranteed_ to give that 0.5% precision, and fail gracefully (and very quickly) if it doesn't get it. And it really is fairly simple (even if there's a lot of _details_ there, and I didn't get all of those right ont he first try or even the second ;) The patch says "110 insertions", but 63 of those new lines are actually comments. Signed-off-by: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 111 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 files changed, 110 insertions(+), 1 deletions(-) --- arch/x86/kernel/tsc.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 118 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index da033b5b3e19..839070ba8465 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -227,6 +227,117 @@ static unsigned long pit_calibrate_tsc(u32 latch, unsigned long ms, int loopmin) return delta; } +/* + * This reads the current MSB of the PIT counter, and + * checks if we are running on sufficiently fast and + * non-virtualized hardware. + * + * Our expectations are: + * + * - the PIT is running at roughly 1.19MHz + * + * - each IO is going to take about 1us on real hardware, + * but we allow it to be much faster (by a factor of 10) or + * _slightly_ slower (ie we allow up to a 2us read+counter + * update - anything else implies a unacceptably slow CPU + * or PIT for the fast calibration to work. + * + * - with 256 PIT ticks to read the value, we have 214us to + * see the same MSB (and overhead like doing a single TSC + * read per MSB value etc). + * + * - We're doing 2 reads per loop (LSB, MSB), and we expect + * them each to take about a microsecond on real hardware. + * So we expect a count value of around 100. But we'll be + * generous, and accept anything over 50. + * + * - if the PIT is stuck, and we see *many* more reads, we + * return early (and the next caller of pit_expect_msb() + * then consider it a failure when they don't see the + * next expected value). + * + * These expectations mean that we know that we have seen the + * transition from one expected value to another with a fairly + * high accuracy, and we didn't miss any events. We can thus + * use the TSC value at the transitions to calculate a pretty + * good value for the TSC frequencty. + */ +static inline int pit_expect_msb(unsigned char val) +{ + int count = 0; + + for (count = 0; count < 50000; count++) { + /* Ignore LSB */ + inb(0x42); + if (inb(0x42) != val) + break; + } + return count > 50; +} + +/* + * How many MSB values do we want to see? We aim for a + * 15ms calibration, which assuming a 2us counter read + * error should give us roughly 150 ppm precision for + * the calibration. + */ +#define QUICK_PIT_MS 15 +#define QUICK_PIT_ITERATIONS (QUICK_PIT_MS * PIT_TICK_RATE / 1000 / 256) + +static unsigned long quick_pit_calibrate(void) +{ + /* Set the Gate high, disable speaker */ + outb((inb(0x61) & ~0x02) | 0x01, 0x61); + + /* + * Counter 2, mode 0 (one-shot), binary count + * + * NOTE! Mode 2 decrements by two (and then the + * output is flipped each time, giving the same + * final output frequency as a decrement-by-one), + * so mode 0 is much better when looking at the + * individual counts. + */ + outb(0xb0, 0x43); + + /* Start at 0xffff */ + outb(0xff, 0x42); + outb(0xff, 0x42); + + if (pit_expect_msb(0xff)) { + int i; + u64 t1, t2, delta; + unsigned char expect = 0xfe; + + t1 = get_cycles(); + for (i = 0; i < QUICK_PIT_ITERATIONS; i++, expect--) { + if (!pit_expect_msb(expect)) + goto failed; + } + t2 = get_cycles(); + + /* + * Ok, if we get here, then we've seen the + * MSB of the PIT decrement QUICK_PIT_ITERATIONS + * times, and each MSB had many hits, so we never + * had any sudden jumps. + * + * As a result, we can depend on there not being + * any odd delays anywhere, and the TSC reads are + * reliable. + * + * kHz = ticks / time-in-seconds / 1000; + * kHz = (t2 - t1) / (QPI * 256 / PIT_TICK_RATE) / 1000 + * kHz = ((t2 - t1) * PIT_TICK_RATE) / (QPI * 256 * 1000) + */ + delta = (t2 - t1)*PIT_TICK_RATE; + do_div(delta, QUICK_PIT_ITERATIONS*256*1000); + printk("Fast TSC calibration using PIT\n"); + return delta; + } +failed: + return 0; +} /** * native_calibrate_tsc - calibrate the tsc on boot @@ -235,9 +346,15 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms; + unsigned long flags, latch, ms, fast_calibrate; int hpet = is_hpet_enabled(), i, loopmin; + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + if (fast_calibrate) + return fast_calibrate; + /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). We use two different calibration modes -- cgit v1.2.2 From 4156e9a8ef5b521185f451213d33fa661f38512e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Sep 2008 22:47:47 +0200 Subject: x86: quick TSC calibration, improve - make sure the final TSC timestamp is reliable too Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 839070ba8465..6dab90f68515 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -316,6 +316,12 @@ static unsigned long quick_pit_calibrate(void) } t2 = get_cycles(); + /* + * Make sure we can rely on the second TSC timestamp: + */ + if (!pit_expect_msb(--expect)) + goto failed; + /* * Ok, if we get here, then we've seen the * MSB of the PIT decrement QUICK_PIT_ITERATIONS -- cgit v1.2.2 From 97e4db7c8719f67c52793eeca5ec4df4c3407f2a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:08:59 -0700 Subject: x86: make detect_ht depend on CONFIG_X86_HT 64-bit has X86_HT set too, so use that instead of SMP. This also removes a include/asm-x86/processor.h ifdef. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- arch/x86/kernel/cpu/common_64.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7d5a07f0fd24..1f80ce07daa2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -263,9 +263,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) l2size, ecx & 0xFF); } -#ifdef CONFIG_X86_HT void __cpuinit detect_ht(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; @@ -311,8 +311,8 @@ out: printk(KERN_INFO "CPU: Processor Core ID: %d\n", c->cpu_core_id); } -} #endif +} static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) { diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index bcb48ce05d23..8e630734e1fd 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -141,7 +141,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) void __cpuinit detect_ht(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT u32 eax, ebx, ecx, edx; int index_msb, core_bits; -- cgit v1.2.2 From f0fc4aff1fa4db1c201593422dcbf85c9897ec4f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:00 -0700 Subject: x86: make header file the same in arch/x86/kernel/cpu/common_xx.c Make the files more similar in preparation to unification, no code changed. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 22 +++++++++++++++++++--- arch/x86/kernel/cpu/common_64.c | 2 +- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1f80ce07daa2..6bbdbc24f2ba 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1,25 +1,41 @@ #include +#include +#include #include +#include +#include +#include +#include +#include #include #include -#include #include -#include -#include #include #include #include +#include #include #include #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include #include +#include #endif +#include +#include +#include +#include +#include +#include +#include +#include + #include "cpu.h" static struct cpu_dev *this_cpu __cpuinitdata; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 8e630734e1fd..5daec699acf0 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -24,6 +24,7 @@ #include #include #include +#include #endif #include #include @@ -33,7 +34,6 @@ #include #include #include -#include #include "cpu.h" -- cgit v1.2.2 From 950ad7ff6ec17fc1b47abc95c5c74628eb1adf8b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:01 -0700 Subject: x86: same gdt_page with macro Move the 32-bit and 64-bit gdt_page definitions next to each other, separated with an #ifdef. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 17 +++++++++++++++++ arch/x86/kernel/cpu/common_64.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6bbdbc24f2ba..e0ca51f4f2d3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -40,6 +40,22 @@ static struct cpu_dev *this_cpu __cpuinitdata; +#ifdef CONFIG_X86_64 +/* We need valid kernel segments for data and code in long mode too + * IRET will check the segment types kkeil 2000/10/28 + * Also sysret mandates a special GDT layout + */ +/* The TLS descriptors are currently at a different place compared to i386. + Hopefully nobody expects them at a fixed place (Wine?) */ +DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, + [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, +} }; +#else DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, @@ -74,6 +90,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, } }; +#endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int cachesize_override __cpuinitdata = -1; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 5daec699acf0..b4890504284d 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -39,6 +39,7 @@ static struct cpu_dev *this_cpu __cpuinitdata; +#ifdef CONFIG_X86_64 /* We need valid kernel segments for data and code in long mode too * IRET will check the segment types kkeil 2000/10/28 * Also sysret mandates a special GDT layout @@ -53,6 +54,42 @@ DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, } }; +#else +DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { + [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, + [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, + [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, + [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, + /* + * Segments used for calling PnP BIOS have byte granularity. + * They code segments and data segments have fixed 64k limits, + * the transfer segment sizes are set at run time. + */ + /* 32-bit code */ + [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, + /* 16-bit data */ + [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, + /* + * The APM segments have byte granularity and their bases + * are set at run time. All have 64k limits. + */ + /* 32-bit code */ + [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, + /* 16-bit code */ + [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, + /* data */ + [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, + + [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, + [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, +} }; +#endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; -- cgit v1.2.2 From ba51dced0b55eb62874e1646d5eeff344c3d4e67 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:02 -0700 Subject: x86: cpu/common.c, let 64-bit code have 32-bit only functions No effect on 64-bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 8 +++ arch/x86/kernel/cpu/common_64.c | 111 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 119 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e0ca51f4f2d3..9128ba0c8d33 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -93,6 +93,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +#ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -195,6 +196,13 @@ static int __init x86_serial_nr_setup(char *s) return 1; } __setup("serialnumber", x86_serial_nr_setup); +#else +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +#endif __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index b4890504284d..40c9d89cc14a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -92,6 +92,117 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +#ifdef CONFIG_X86_32 +static int cachesize_override __cpuinitdata = -1; +static int disable_x86_serial_nr __cpuinitdata = 1; + +static int __init cachesize_setup(char *str) +{ + get_option(&str, &cachesize_override); + return 1; +} +__setup("cachesize=", cachesize_setup); + +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + +static int __init x86_fxsr_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_FXSR); + setup_clear_cpu_cap(X86_FEATURE_XMM); + return 1; +} +__setup("nofxsr", x86_fxsr_setup); + +static int __init x86_sep_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_SEP); + return 1; +} +__setup("nosep", x86_sep_setup); + +/* Standard macro to see if a specific flag is changeable */ +static inline int flag_is_changeable_p(u32 flag) +{ + u32 f1, f2; + + asm("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); + + return ((f1^f2) & flag) != 0; +} + +/* Probe for the CPUID instruction */ +static int __cpuinit have_cpuid_p(void) +{ + return flag_is_changeable_p(X86_EFLAGS_ID); +} + +static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { + /* Disable processor serial number */ + unsigned long lo, hi; + rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + lo |= 0x200000; + wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); + printk(KERN_NOTICE "CPU serial number disabled.\n"); + clear_cpu_cap(c, X86_FEATURE_PN); + + /* Disabling the serial number may affect the cpuid level */ + c->cpuid_level = cpuid_eax(0); + } +} + +static int __init x86_serial_nr_setup(char *s) +{ + disable_x86_serial_nr = 0; + return 1; +} +__setup("serialnumber", x86_serial_nr_setup); +#else +/* Probe for the CPUID instruction */ +static inline int have_cpuid_p(void) +{ + return 1; +} +#endif + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, -- cgit v1.2.2 From d5494d4f517158b1d2eea1ae33f6c264eb12675f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:03 -0700 Subject: x86: cpu/common*.c, make 32-bit have 64-bit only functions Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 138 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common_64.c | 12 ++++ 2 files changed, 150 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9128ba0c8d33..dcd3ebd5ba62 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -750,6 +750,143 @@ __setup("clearcpuid=", setup_disablecpuid); cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; +#ifdef CONFIG_X86_64 +struct x8664_pda **_cpu_pda __read_mostly; +EXPORT_SYMBOL(_cpu_pda); + +struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; + +char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; + +unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +static int do_not_nx __cpuinitdata; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +int force_personality32; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + +void pda_init(int cpu) +{ + struct x8664_pda *pda = cpu_pda(cpu); + + /* Setup up data that may be needed in __get_free_pages early */ + loadsegment(fs, 0); + loadsegment(gs, 0); + /* Memory clobbers used to order PDA accessed */ + mb(); + wrmsrl(MSR_GS_BASE, pda); + mb(); + + pda->cpunumber = cpu; + pda->irqcount = -1; + pda->kernelstack = (unsigned long)stack_thread_info() - + PDA_STACKOFFSET + THREAD_SIZE; + pda->active_mm = &init_mm; + pda->mmu_state = 0; + + if (cpu == 0) { + /* others are initialized in smpboot.c */ + pda->pcurrent = &init_task; + pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; + } else { + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } + + if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) + pda->nodenumber = cpu_to_node(cpu); + } +} + +char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + + DEBUG_STKSZ] __page_aligned_bss; + +extern asmlinkage void ignore_sysret(void); + +/* May not be marked __init: used by software suspend */ +void syscall_init(void) +{ + /* + * LSTAR and STAR live in a bit strange symbiosis. + * They both write to the same internal register. STAR allows to + * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); + wrmsrl(MSR_CSTAR, ignore_sysret); + +#ifdef CONFIG_IA32_EMULATION + syscall32_cpu_init(); +#endif + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, + X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +} + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +unsigned long kernel_eflags; + +/* + * Copies of the original ist values from the tss are only accessed during + * debugging, no special alignment required. + */ +DEFINE_PER_CPU(struct orig_ist, orig_ist); + +#else + /* Make sure %fs is initialized properly in idle threads */ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) { @@ -757,6 +894,7 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) regs->fs = __KERNEL_PERCPU; return regs; } +#endif /* * cpu_init() initializes state that is per-CPU. Some data is already diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 40c9d89cc14a..9f2a6ece82dd 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -704,6 +704,7 @@ __setup("clearcpuid=", setup_disablecpuid); cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; +#ifdef CONFIG_X86_64 struct x8664_pda **_cpu_pda __read_mostly; EXPORT_SYMBOL(_cpu_pda); @@ -838,6 +839,17 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +#else + +/* Make sure %fs is initialized properly in idle threads */ +struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) +{ + memset(regs, 0, sizeof(struct pt_regs)); + regs->fs = __KERNEL_PERCPU; + return regs; +} +#endif + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT -- cgit v1.2.2 From 1ba76586f778be327e452180d8378e40ee70f066 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:04 -0700 Subject: x86: cpu/common*.c have same cpu_init(), with copying and #ifdef hard to merge by lines... (as here we have material differences between 32-bit and 64-bit mode) - will try to do it later. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 124 ++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common_64.c | 88 +++++++++++++++++++++++++++- 2 files changed, 211 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dcd3ebd5ba62..f44678db1616 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -901,7 +901,129 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. + * A lot of state is already set up in PDA init for 64 bit */ +#ifdef CONFIG_X86_64 +void __cpuinit cpu_init(void) +{ + int cpu = stack_smp_processor_id(); + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); + unsigned long v; + char *estacks = NULL; + struct task_struct *me; + int i; + + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) + pda_init(cpu); + else + estacks = boot_exception_stacks; + + me = current; + + if (cpu_test_and_set(cpu, cpu_initialized)) + panic("CPU#%d already initialized!\n", cpu); + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + /* + * Initialize the per-CPU GDT with the boot GDT, + * and set up the GDT descriptor: + */ + + switch_to_new_gdt(); + load_idt((const struct desc_ptr *)&idt_descr); + + memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); + syscall_init(); + + wrmsrl(MSR_FS_BASE, 0); + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + + check_efer(); + if (cpu != 0 && x2apic) + enable_x2apic(); + + /* + * set up and load the per-CPU TSS + */ + if (!orig_ist->ist[0]) { + static const unsigned int order[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + }; + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; + } + } + + t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + /* + * <= is required because the CPU will access up to + * 8 bits beyond the end of the IO permission bitmap. + */ + for (i = 0; i <= IO_BITMAP_LONGS; i++) + t->io_bitmap[i] = ~0UL; + + atomic_inc(&init_mm.mm_count); + me->active_mm = &init_mm; + if (me->mm) + BUG(); + enter_lazy_tlb(&init_mm, me); + + load_sp0(t, ¤t->thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_KGDB + /* + * If the kgdb is connected no debug regs should be altered. This + * is only applicable when KGDB and a KGDB I/O module are built + * into the kernel and you are using early debugging with + * kgdbwait. KGDB will control the kernel HW breakpoint registers. + */ + if (kgdb_connected && arch_kgdb_ops.correct_hw_break) + arch_kgdb_ops.correct_hw_break(); + else { +#endif + /* + * Clear all 6 debug registers: + */ + + set_debugreg(0UL, 0); + set_debugreg(0UL, 1); + set_debugreg(0UL, 2); + set_debugreg(0UL, 3); + set_debugreg(0UL, 6); + set_debugreg(0UL, 7); +#ifdef CONFIG_KGDB + /* If the kgdb is connected no debug regs should be altered. */ + } +#endif + + fpu_init(); + + raw_local_save_flags(kernel_eflags); + + if (is_uv_system()) + uv_cpu_init(); +} + +#else + void __cpuinit cpu_init(void) { int cpu = smp_processor_id(); @@ -982,3 +1104,5 @@ void __cpuinit cpu_uninit(void) per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; } #endif + +#endif diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 9f2a6ece82dd..2bd0ed5abb0a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -855,8 +855,9 @@ struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init. + * A lot of state is already set up in PDA init for 64 bit */ +#ifdef CONFIG_X86_64 void __cpuinit cpu_init(void) { int cpu = stack_smp_processor_id(); @@ -974,3 +975,88 @@ void __cpuinit cpu_init(void) if (is_uv_system()) uv_cpu_init(); } + +#else + +void __cpuinit cpu_init(void) +{ + int cpu = smp_processor_id(); + struct task_struct *curr = current; + struct tss_struct *t = &per_cpu(init_tss, cpu); + struct thread_struct *thread = &curr->thread; + + if (cpu_test_and_set(cpu, cpu_initialized)) { + printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); + for (;;) local_irq_enable(); + } + + printk(KERN_INFO "Initializing CPU#%d\n", cpu); + + if (cpu_has_vme || cpu_has_tsc || cpu_has_de) + clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); + + load_idt(&idt_descr); + switch_to_new_gdt(); + + /* + * Set up and load the per-CPU TSS and LDT + */ + atomic_inc(&init_mm.mm_count); + curr->active_mm = &init_mm; + if (curr->mm) + BUG(); + enter_lazy_tlb(&init_mm, curr); + + load_sp0(t, thread); + set_tss_desc(cpu, t); + load_TR_desc(); + load_LDT(&init_mm.context); + +#ifdef CONFIG_DOUBLEFAULT + /* Set up doublefault TSS pointer in the GDT */ + __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); +#endif + + /* Clear %gs. */ + asm volatile ("mov %0, %%gs" : : "r" (0)); + + /* Clear all 6 debug registers: */ + set_debugreg(0, 0); + set_debugreg(0, 1); + set_debugreg(0, 2); + set_debugreg(0, 3); + set_debugreg(0, 6); + set_debugreg(0, 7); + + /* + * Force FPU initialization: + */ + if (cpu_has_xsave) + current_thread_info()->status = TS_XSAVE; + else + current_thread_info()->status = 0; + clear_used_math(); + mxcsr_feature_mask_init(); + + /* + * Boot processor to setup the FP and extended state context info. + */ + if (!smp_processor_id()) + init_thread_xstate(); + + xsave_init(); +} + +#ifdef CONFIG_HOTPLUG_CPU +void __cpuinit cpu_uninit(void) +{ + int cpu = raw_smp_processor_id(); + cpu_clear(cpu, cpu_initialized); + + /* lazy TLB state */ + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} +#endif + +#endif -- cgit v1.2.2 From fab334c1d5f24d23d12f98ad652d399279cd03ce Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:05 -0700 Subject: x86: cpu/common*.c, merge switch_to_new_gdt() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/cpu/common_64.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f44678db1616..43d5287bb2a4 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -215,7 +215,9 @@ void switch_to_new_gdt(void) gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); +#ifdef CONFIG_X86_32 asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +#endif } static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 2bd0ed5abb0a..d7b996518f86 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -214,6 +214,9 @@ void switch_to_new_gdt(void) gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); +#ifdef CONFIG_X86_32 + asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); +#endif } static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; -- cgit v1.2.2 From b9e67f00424e164dcd29391eb48dc941db8691ad Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:06 -0700 Subject: x86: cpu/common.c, merge default_init() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++++ arch/x86/kernel/cpu/common_64.c | 12 ++++++++++++ 2 files changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 43d5287bb2a4..2c4bfa2e56ad 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -224,6 +224,9 @@ static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 + display_cacheinfo(c); +#else /* Not much we can do here... */ /* Check if at least it has cpuid */ if (c->cpuid_level == -1) { @@ -233,6 +236,7 @@ static void __cpuinit default_init(struct cpuinfo_x86 *c) else if (c->x86 == 3) strcpy(c->x86_model_id, "386"); } +#endif } static struct cpu_dev __cpuinitdata default_cpu = { diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index d7b996518f86..2fda10974813 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -223,7 +223,19 @@ static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; static void __cpuinit default_init(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 display_cacheinfo(c); +#else + /* Not much we can do here... */ + /* Check if at least it has cpuid */ + if (c->cpuid_level == -1) { + /* No cpuid. It must be an ancient CPU */ + if (c->x86 == 4) + strcpy(c->x86_model_id, "486"); + else if (c->x86 == 3) + strcpy(c->x86_model_id, "386"); + } +#endif } static struct cpu_dev __cpuinitdata default_cpu = { -- cgit v1.2.2 From 140fc72709278989f08eb756d16a70008bdcc409 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:07 -0700 Subject: x86: cpu/common*.c, merge display_cacheinfo() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 8 ++++++++ arch/x86/kernel/cpu/common_64.c | 17 +++++++++++++++++ 2 files changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2c4bfa2e56ad..f9191207718b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -285,6 +285,10 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 + /* On K8 L1 TLB is inclusive, so don't count it */ + c->x86_tlbsize = 0; +#endif } if (n < 0x80000006) /* Some chips just has a large L1. */ @@ -293,6 +297,9 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; +#ifdef CONFIG_X86_64 + c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) l2size = this_cpu->c_size_cache(c, l2size); @@ -303,6 +310,7 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) if (l2size == 0) return; /* Again, no L2 cache is possible */ +#endif c->x86_cache_size = l2size; diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 2fda10974813..f7a2d524b1e7 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -285,8 +285,10 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); c->x86_cache_size = (ecx>>24) + (edx>>24); +#ifdef CONFIG_X86_64 /* On K8 L1 TLB is inclusive, so don't count it */ c->x86_tlbsize = 0; +#endif } if (n < 0x80000006) /* Some chips just has a large L1. */ @@ -294,7 +296,22 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); l2size = ecx >> 16; + +#ifdef CONFIG_X86_64 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); +#else + + /* do processor-specific cache resizing */ + if (this_cpu->c_size_cache) + l2size = this_cpu->c_size_cache(c, l2size); + + /* Allow user to override all this if necessary. */ + if (cachesize_override != -1) + l2size = cachesize_override; + + if (l2size == 0) + return; /* Again, no L2 cache is possible */ +#endif c->x86_cache_size = l2size; -- cgit v1.2.2 From 1cd78776c7d5487e3000426d0ce1ff203c5d60cd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:08 -0700 Subject: x86: cpu/common*.c, merge detect_ht() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 13 ++++++++++++- arch/x86/kernel/cpu/common_64.c | 9 +++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f9191207718b..b2018f76c94d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -330,6 +330,9 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; + if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) + return; + cpuid(1, &eax, &ebx, &ecx, &edx); smp_num_siblings = (ebx & 0xff0000) >> 16; @@ -346,8 +349,11 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } index_msb = get_count_order(smp_num_siblings); +#ifdef CONFIG_X86_64 + c->phys_proc_id = phys_pkg_id(index_msb); +#else c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); - +#endif smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -355,8 +361,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); +#ifdef CONFIG_X86_64 + c->cpu_core_id = phys_pkg_id(index_msb) & + ((1 << core_bits) - 1); +#else c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & ((1 << core_bits) - 1); +#endif } out: diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index f7a2d524b1e7..7ac0a00743c7 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -349,7 +349,11 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) } index_msb = get_count_order(smp_num_siblings); +#ifdef CONFIG_X86_64 c->phys_proc_id = phys_pkg_id(index_msb); +#else + c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); +#endif smp_num_siblings = smp_num_siblings / c->x86_max_cores; @@ -357,8 +361,13 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) core_bits = get_count_order(c->x86_max_cores); +#ifdef CONFIG_X86_64 c->cpu_core_id = phys_pkg_id(index_msb) & ((1 << core_bits) - 1); +#else + c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & + ((1 << core_bits) - 1); +#endif } out: -- cgit v1.2.2 From 5122c890ba969e6efeaba9ed45f5936e62d3c6d5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:09 -0700 Subject: x86: cpu/common.c: merge get_cpu_cap() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 20 ++++++++++++++++++++ arch/x86/kernel/cpu/common_64.c | 2 ++ 2 files changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b2018f76c94d..ffc2f5ed09a3 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -458,6 +458,26 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[6] = cpuid_ecx(0x80000001); } } + +#ifdef CONFIG_X86_64 + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + /* Don't set x86_cpuid_level here for now to not confuse. */ + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + + if (c->extended_cpuid_level >= 0x80000008) { + u32 eax = cpuid_eax(0x80000008); + + c->x86_virt_bits = (eax >> 8) & 0xff; + c->x86_phys_bits = eax & 0xff; + } +#endif } /* * Do minimum CPU detection early. diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 7ac0a00743c7..75bb7ff99ee3 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -461,6 +461,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) } } +#ifdef CONFIG_X86_64 /* Transmeta-defined flags: level 0x80860001 */ xlvl = cpuid_eax(0x80860000); if ((xlvl & 0xffff0000) == 0x80860000) { @@ -478,6 +479,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_virt_bits = (eax >> 8) & 0xff; c->x86_phys_bits = eax & 0xff; } +#endif } /* Do some early cpuid on the boot CPU to get some parameter that are -- cgit v1.2.2 From 6627d2423067f2c6eedb422a59fba9270a3c5e36 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:10 -0700 Subject: x86: cpu/common*.c, merge early_identify_cpu() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++++ arch/x86/kernel/cpu/common_64.c | 19 ++++++++++++++++--- 2 files changed, 20 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ffc2f5ed09a3..61a70748213e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -490,7 +490,11 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 + c->x86_clflush_size = 64; +#else c->x86_clflush_size = 32; +#endif c->x86_cache_alignment = c->x86_clflush_size; if (!have_cpuid_p()) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 75bb7ff99ee3..6475548d64eb 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -482,15 +482,28 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) #endif } -/* Do some early cpuid on the boot CPU to get some parameter that are - needed before check_bugs. Everything advanced is in identify_cpu - below. */ +/* + * Do minimum CPU detection early. + * Fields really needed: vendor, cpuid_level, family, model, mask, + * cache alignment. + * The others are not touched to avoid unwanted side effects. + * + * WARNING: this function is only called on the BP. Don't add code here + * that is supposed to run on all CPUs. + */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { +#ifdef CONFIG_X86_64 c->x86_clflush_size = 64; +#else + c->x86_clflush_size = 32; +#endif c->x86_cache_alignment = c->x86_clflush_size; + if (!have_cpuid_p()) + return; + memset(&c->x86_capability, 0, sizeof c->x86_capability); c->extended_cpuid_level = 0; -- cgit v1.2.2 From 56f0d033be2ebb983993e5d7f24ae232c9a1e7f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:11 -0700 Subject: x86: cpu/common*.c: merge print_cpu_info() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 6475548d64eb..68b06a39dcac 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -730,8 +730,20 @@ __setup("noclflush", setup_noclflush); void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) { + char *vendor = NULL; + + if (c->x86_vendor < X86_VENDOR_NUM) + vendor = this_cpu->c_vendor; + else if (c->cpuid_level >= 0) + vendor = c->x86_vendor_id; + + if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + printk(KERN_CONT "%s ", vendor); + if (c->x86_model_id[0]) printk(KERN_CONT "%s", c->x86_model_id); + else + printk(KERN_CONT "%d86", c->x86); if (c->x86_mask || c->cpuid_level >= 0) printk(KERN_CONT " stepping %02x\n", c->x86_mask); -- cgit v1.2.2 From b89d3b3e2caeab3d895301d1aee3cd2a91eddb79 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:12 -0700 Subject: x86: cpu/common*.c, merge generic_identify() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 15 ++++++++++++--- arch/x86/kernel/cpu/common_64.c | 19 ++++++++++++++++--- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 61a70748213e..f35baa7f3036 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -548,6 +548,10 @@ void __init early_cpu_init(void) * of early VIA chips and (more importantly) broken virtualizers that * are not easy to detect. Hence, probe for it based on first * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { @@ -586,11 +590,16 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) if (c->cpuid_level >= 0x00000001) { c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_HT +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT c->apicid = phys_pkg_id(c->initial_apicid, 0); - c->phys_proc_id = c->initial_apicid; -#else +# else c->apicid = c->initial_apicid; +# endif +#endif + +#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; #endif } diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 68b06a39dcac..869a6ff9f7dc 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -581,6 +581,9 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { + if (!have_cpuid_p()) + return; + c->extended_cpuid_level = 0; cpu_detect(c); @@ -589,11 +592,21 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_cpu_cap(c); - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xff; -#ifdef CONFIG_SMP - c->phys_proc_id = c->initial_apicid; + if (c->cpuid_level >= 0x00000001) { + c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; +#ifdef CONFIG_X86_32 +# ifdef CONFIG_X86_HT + c->apicid = phys_pkg_id(c->initial_apicid, 0); +# else + c->apicid = c->initial_apicid; +# endif #endif +#ifdef CONFIG_X86_HT + c->phys_proc_id = c->initial_apicid; +#endif + } + if (c->extended_cpuid_level >= 0x80000004) get_model_name(c); /* Default name */ -- cgit v1.2.2 From 102bbe3ab83c7ac04461d277df23cd5acdb49892 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:13 -0700 Subject: x86: cpu/common*.c, merge identify_cpu() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 89 ++++++++++++++++++++---------- arch/x86/kernel/cpu/common_64.c | 116 ++++++++++++++++++++++++++++++---------- 2 files changed, 147 insertions(+), 58 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f35baa7f3036..eef868c97b89 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -104,34 +104,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -/* - * Naming convention should be: [()] - * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info; - - if (c->x86_model >= 16) - return NULL; /* Range check */ - - if (!this_cpu) - return NULL; - - info = this_cpu->c_models; - - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; - } - return NULL; /* Not found */ -} - static int __init x86_fxsr_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_FXSR); @@ -197,13 +169,48 @@ static int __init x86_serial_nr_setup(char *s) } __setup("serialnumber", x86_serial_nr_setup); #else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} /* Probe for the CPUID instruction */ static inline int have_cpuid_p(void) { return 1; } +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} #endif +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, @@ -620,12 +627,18 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->loops_per_jiffy = loops_per_jiffy; c->x86_cache_size = -1; c->x86_vendor = X86_VENDOR_UNKNOWN; - c->cpuid_level = -1; /* CPUID not detected */ c->x86_model = c->x86_mask = 0; /* So far unknown... */ c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; +#ifdef CONFIG_X86_64 + c->x86_coreid_bits = 0; + c->x86_clflush_size = 64; +#else + c->cpuid_level = -1; /* CPUID not detected */ c->x86_clflush_size = 32; +#endif + c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); if (!have_cpuid_p()) { @@ -644,6 +657,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); +#ifdef CONFIG_X86_64 + c->apicid = phys_pkg_id(0); +#endif + /* * Vendor-specific initialization. In this section we * canonicalize the feature flags, meaning if there are @@ -677,6 +694,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86, c->x86_model); } +#ifdef CONFIG_X86_64 + detect_ht(c); +#endif + /* * On SMP, boot_cpu_data holds the common feature set between * all CPUs; so make sure that we indicate which features are @@ -693,24 +714,34 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) for (i = 0; i < NCAPINTS; i++) c->x86_capability[i] &= ~cleared_cpu_caps[i]; +#ifdef CONFIG_X86_MCE /* Init Machine Check Exception if available. */ mcheck_init(c); +#endif select_idle_routine(c); + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + numa_add_cpu(smp_processor_id()); +#endif } void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#endif } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); +#ifdef CONFIG_X86_32 enable_sep_cpu(); +#endif mtrr_ap_init(); } diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 869a6ff9f7dc..6a42371a609f 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -103,34 +103,6 @@ static int __init cachesize_setup(char *str) } __setup("cachesize=", cachesize_setup); -/* - * Naming convention should be: [()] - * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info; - - if (c->x86_model >= 16) - return NULL; /* Range check */ - - if (!this_cpu) - return NULL; - - info = this_cpu->c_models; - - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; - } - return NULL; /* Not found */ -} - static int __init x86_fxsr_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_FXSR); @@ -196,13 +168,48 @@ static int __init x86_serial_nr_setup(char *s) } __setup("serialnumber", x86_serial_nr_setup); #else +static inline int flag_is_changeable_p(u32 flag) +{ + return 1; +} /* Probe for the CPUID instruction */ static inline int have_cpuid_p(void) { return 1; } +static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) +{ +} #endif +/* + * Naming convention should be: [()] + * This table only is used unless init_() below doesn't set it; + * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used + * + */ + +/* Look up CPU names by table lookup. */ +static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) +{ + struct cpu_model_info *info; + + if (c->x86_model >= 16) + return NULL; /* Range check */ + + if (!this_cpu) + return NULL; + + info = this_cpu->c_models; + + while (info && info->family) { + if (info->family == c->x86) + return info->model_names[c->x86_model]; + info++; + } + return NULL; /* Not found */ +} + __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, @@ -628,14 +635,35 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; +#ifdef CONFIG_X86_64 c->x86_coreid_bits = 0; c->x86_clflush_size = 64; +#else + c->cpuid_level = -1; /* CPUID not detected */ + c->x86_clflush_size = 32; +#endif c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); + if (!have_cpuid_p()) { + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + } + generic_identify(c); + if (this_cpu->c_identify) + this_cpu->c_identify(c); + +#ifdef CONFIG_X86_64 c->apicid = phys_pkg_id(0); +#endif /* * Vendor-specific initialization. In this section we @@ -650,7 +678,29 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_init) this_cpu->c_init(c); + /* Disable the PN if appropriate */ + squash_the_stupid_serial_number(c); + + /* + * The vendor-specific functions might have changed features. Now + * we do "generic changes." + */ + + /* If the model name is still unset, do table lookup. */ + if (!c->x86_model_id[0]) { + char *p; + p = table_lookup_model(c); + if (p) + strcpy(c->x86_model_id, p); + else + /* Last resort... */ + sprintf(c->x86_model_id, "%02x/%02x", + c->x86, c->x86_model); + } + +#ifdef CONFIG_X86_64 detect_ht(c); +#endif /* * On SMP, boot_cpu_data holds the common feature set between @@ -669,11 +719,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[i] &= ~cleared_cpu_caps[i]; #ifdef CONFIG_X86_MCE + /* Init Machine Check Exception if available. */ mcheck_init(c); #endif select_idle_routine(c); -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif @@ -682,12 +733,19 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); +#ifdef CONFIG_X86_32 + sysenter_setup(); + enable_sep_cpu(); +#endif } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) { BUG_ON(c == &boot_cpu_data); identify_cpu(c); +#ifdef CONFIG_X86_32 + enable_sep_cpu(); +#endif mtrr_ap_init(); } -- cgit v1.2.2 From 143b604a2d07ff69cc2bc02ecd9395b28e0b5292 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Sep 2008 09:37:15 +0200 Subject: x86: cpu/common*.c, merge whitespaces Merge leftover whitespaces, to make arch/x86/kernel/cpu/common_64.c exactly identical to arch/x86/kernel/cpu/common.c. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 6a42371a609f..eef868c97b89 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -26,6 +26,7 @@ #include #include #endif + #include #include #include @@ -280,7 +281,6 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) return 1; } - void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) { unsigned int n, dummy, ebx, ecx, edx, l2size; @@ -307,7 +307,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_64 c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); #else - /* do processor-specific cache resizing */ if (this_cpu->c_size_cache) l2size = this_cpu->c_size_cache(c, l2size); @@ -334,6 +333,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HT)) return; + if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) goto out; @@ -443,7 +443,6 @@ void __cpuinit cpu_detect(struct cpuinfo_x86 *c) } } - static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -452,7 +451,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) /* Intel-defined flags: level 0x00000001 */ if (c->cpuid_level >= 0x00000001) { u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); c->x86_capability[0] = capability; c->x86_capability[4] = excap; @@ -488,7 +486,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) } #endif } - /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -500,7 +497,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) */ static void __init early_identify_cpu(struct cpuinfo_x86 *c) { - #ifdef CONFIG_X86_64 c->x86_clflush_size = 64; #else @@ -722,12 +718,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) /* Init Machine Check Exception if available. */ mcheck_init(c); #endif + select_idle_routine(c); #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) numa_add_cpu(smp_processor_id()); #endif - } void __init identify_boot_cpu(void) -- cgit v1.2.2 From f5017cfa3591d8eaf5c792e40ff30f18e498b68c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:09:14 -0700 Subject: x86: use cpu/common.c on 64 bit Use cpu/common.c on both 64-bit and 32-bit and remove cpu/common_64.c. We started out with this linecount: 816 arch/x86/kernel/cpu/common_64.c 805 arch/x86/kernel/cpu/common.c and the resulting common.c is 1197 lines long, so there's already 424 lines of code eliminated in this phase of the unification. Signed-off-by: Yinghai Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/Makefile | 6 +- arch/x86/kernel/cpu/common_64.c | 1197 --------------------------------------- 2 files changed, 3 insertions(+), 1200 deletions(-) delete mode 100644 arch/x86/kernel/cpu/common_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 403e689df0b8..d031f248dfc0 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -3,10 +3,10 @@ # obj-y := intel_cacheinfo.o addon_cpuid_features.o -obj-y += proc.o capflags.o powerflags.o +obj-y += proc.o capflags.o powerflags.o common.o -obj-$(CONFIG_X86_32) += common.o bugs.o cmpxchg.o -obj-$(CONFIG_X86_64) += common_64.o bugs_64.o +obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c deleted file mode 100644 index eef868c97b89..000000000000 --- a/arch/x86/kernel/cpu/common_64.c +++ /dev/null @@ -1,1197 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#include -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cpu.h" - -static struct cpu_dev *this_cpu __cpuinitdata; - -#ifdef CONFIG_X86_64 -/* We need valid kernel segments for data and code in long mode too - * IRET will check the segment types kkeil 2000/10/28 - * Also sysret mandates a special GDT layout - */ -/* The TLS descriptors are currently at a different place compared to i386. - Hopefully nobody expects them at a fixed place (Wine?) */ -DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL32_CS] = { { { 0x0000ffff, 0x00cf9b00 } } }, - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00af9b00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9300 } } }, - [GDT_ENTRY_DEFAULT_USER32_CS] = { { { 0x0000ffff, 0x00cffb00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff300 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00affb00 } } }, -} }; -#else -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { - [GDT_ENTRY_KERNEL_CS] = { { { 0x0000ffff, 0x00cf9a00 } } }, - [GDT_ENTRY_KERNEL_DS] = { { { 0x0000ffff, 0x00cf9200 } } }, - [GDT_ENTRY_DEFAULT_USER_CS] = { { { 0x0000ffff, 0x00cffa00 } } }, - [GDT_ENTRY_DEFAULT_USER_DS] = { { { 0x0000ffff, 0x00cff200 } } }, - /* - * Segments used for calling PnP BIOS have byte granularity. - * They code segments and data segments have fixed 64k limits, - * the transfer segment sizes are set at run time. - */ - /* 32-bit code */ - [GDT_ENTRY_PNPBIOS_CS32] = { { { 0x0000ffff, 0x00409a00 } } }, - /* 16-bit code */ - [GDT_ENTRY_PNPBIOS_CS16] = { { { 0x0000ffff, 0x00009a00 } } }, - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_DS] = { { { 0x0000ffff, 0x00009200 } } }, - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS1] = { { { 0x00000000, 0x00009200 } } }, - /* 16-bit data */ - [GDT_ENTRY_PNPBIOS_TS2] = { { { 0x00000000, 0x00009200 } } }, - /* - * The APM segments have byte granularity and their bases - * are set at run time. All have 64k limits. - */ - /* 32-bit code */ - [GDT_ENTRY_APMBIOS_BASE] = { { { 0x0000ffff, 0x00409a00 } } }, - /* 16-bit code */ - [GDT_ENTRY_APMBIOS_BASE+1] = { { { 0x0000ffff, 0x00009a00 } } }, - /* data */ - [GDT_ENTRY_APMBIOS_BASE+2] = { { { 0x0000ffff, 0x00409200 } } }, - - [GDT_ENTRY_ESPFIX_SS] = { { { 0x00000000, 0x00c09200 } } }, - [GDT_ENTRY_PERCPU] = { { { 0x00000000, 0x00000000 } } }, -} }; -#endif -EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); - -#ifdef CONFIG_X86_32 -static int cachesize_override __cpuinitdata = -1; -static int disable_x86_serial_nr __cpuinitdata = 1; - -static int __init cachesize_setup(char *str) -{ - get_option(&str, &cachesize_override); - return 1; -} -__setup("cachesize=", cachesize_setup); - -static int __init x86_fxsr_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_FXSR); - setup_clear_cpu_cap(X86_FEATURE_XMM); - return 1; -} -__setup("nofxsr", x86_fxsr_setup); - -static int __init x86_sep_setup(char *s) -{ - setup_clear_cpu_cap(X86_FEATURE_SEP); - return 1; -} -__setup("nosep", x86_sep_setup); - -/* Standard macro to see if a specific flag is changeable */ -static inline int flag_is_changeable_p(u32 flag) -{ - u32 f1, f2; - - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); - - return ((f1^f2) & flag) != 0; -} - -/* Probe for the CPUID instruction */ -static int __cpuinit have_cpuid_p(void) -{ - return flag_is_changeable_p(X86_EFLAGS_ID); -} - -static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -{ - if (cpu_has(c, X86_FEATURE_PN) && disable_x86_serial_nr) { - /* Disable processor serial number */ - unsigned long lo, hi; - rdmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - lo |= 0x200000; - wrmsr(MSR_IA32_BBL_CR_CTL, lo, hi); - printk(KERN_NOTICE "CPU serial number disabled.\n"); - clear_cpu_cap(c, X86_FEATURE_PN); - - /* Disabling the serial number may affect the cpuid level */ - c->cpuid_level = cpuid_eax(0); - } -} - -static int __init x86_serial_nr_setup(char *s) -{ - disable_x86_serial_nr = 0; - return 1; -} -__setup("serialnumber", x86_serial_nr_setup); -#else -static inline int flag_is_changeable_p(u32 flag) -{ - return 1; -} -/* Probe for the CPUID instruction */ -static inline int have_cpuid_p(void) -{ - return 1; -} -static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) -{ -} -#endif - -/* - * Naming convention should be: [()] - * This table only is used unless init_() below doesn't set it; - * in particular, if CPUID levels 0x80000002..4 are supported, this isn't used - * - */ - -/* Look up CPU names by table lookup. */ -static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) -{ - struct cpu_model_info *info; - - if (c->x86_model >= 16) - return NULL; /* Range check */ - - if (!this_cpu) - return NULL; - - info = this_cpu->c_models; - - while (info && info->family) { - if (info->family == c->x86) - return info->model_names[c->x86_model]; - info++; - } - return NULL; /* Not found */ -} - -__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; - -/* Current gdt points %fs at the "master" per-cpu area: after this, - * it's on the real one. */ -void switch_to_new_gdt(void) -{ - struct desc_ptr gdt_descr; - - gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); - gdt_descr.size = GDT_SIZE - 1; - load_gdt(&gdt_descr); -#ifdef CONFIG_X86_32 - asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); -#endif -} - -static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; - -static void __cpuinit default_init(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - display_cacheinfo(c); -#else - /* Not much we can do here... */ - /* Check if at least it has cpuid */ - if (c->cpuid_level == -1) { - /* No cpuid. It must be an ancient CPU */ - if (c->x86 == 4) - strcpy(c->x86_model_id, "486"); - else if (c->x86 == 3) - strcpy(c->x86_model_id, "386"); - } -#endif -} - -static struct cpu_dev __cpuinitdata default_cpu = { - .c_init = default_init, - .c_vendor = "Unknown", - .c_x86_vendor = X86_VENDOR_UNKNOWN, -}; - -int __cpuinit get_model_name(struct cpuinfo_x86 *c) -{ - unsigned int *v; - char *p, *q; - - if (c->extended_cpuid_level < 0x80000004) - return 0; - - v = (unsigned int *) c->x86_model_id; - cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); - cpuid(0x80000003, &v[4], &v[5], &v[6], &v[7]); - cpuid(0x80000004, &v[8], &v[9], &v[10], &v[11]); - c->x86_model_id[48] = 0; - - /* Intel chips right-justify this string for some dumb reason; - undo that brain damage */ - p = q = &c->x86_model_id[0]; - while (*p == ' ') - p++; - if (p != q) { - while (*p) - *q++ = *p++; - while (q <= &c->x86_model_id[48]) - *q++ = '\0'; /* Zero-pad the rest */ - } - - return 1; -} - -void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) -{ - unsigned int n, dummy, ebx, ecx, edx, l2size; - - n = c->extended_cpuid_level; - - if (n >= 0x80000005) { - cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); - printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n", - edx>>24, edx&0xFF, ecx>>24, ecx&0xFF); - c->x86_cache_size = (ecx>>24) + (edx>>24); -#ifdef CONFIG_X86_64 - /* On K8 L1 TLB is inclusive, so don't count it */ - c->x86_tlbsize = 0; -#endif - } - - if (n < 0x80000006) /* Some chips just has a large L1. */ - return; - - cpuid(0x80000006, &dummy, &ebx, &ecx, &edx); - l2size = ecx >> 16; - -#ifdef CONFIG_X86_64 - c->x86_tlbsize += ((ebx >> 16) & 0xfff) + (ebx & 0xfff); -#else - /* do processor-specific cache resizing */ - if (this_cpu->c_size_cache) - l2size = this_cpu->c_size_cache(c, l2size); - - /* Allow user to override all this if necessary. */ - if (cachesize_override != -1) - l2size = cachesize_override; - - if (l2size == 0) - return; /* Again, no L2 cache is possible */ -#endif - - c->x86_cache_size = l2size; - - printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n", - l2size, ecx & 0xFF); -} - -void __cpuinit detect_ht(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_HT - u32 eax, ebx, ecx, edx; - int index_msb, core_bits; - - if (!cpu_has(c, X86_FEATURE_HT)) - return; - - if (cpu_has(c, X86_FEATURE_CMP_LEGACY)) - goto out; - - if (cpu_has(c, X86_FEATURE_XTOPOLOGY)) - return; - - cpuid(1, &eax, &ebx, &ecx, &edx); - - smp_num_siblings = (ebx & 0xff0000) >> 16; - - if (smp_num_siblings == 1) { - printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); - } else if (smp_num_siblings > 1) { - - if (smp_num_siblings > NR_CPUS) { - printk(KERN_WARNING "CPU: Unsupported number of siblings %d", - smp_num_siblings); - smp_num_siblings = 1; - return; - } - - index_msb = get_count_order(smp_num_siblings); -#ifdef CONFIG_X86_64 - c->phys_proc_id = phys_pkg_id(index_msb); -#else - c->phys_proc_id = phys_pkg_id(c->initial_apicid, index_msb); -#endif - - smp_num_siblings = smp_num_siblings / c->x86_max_cores; - - index_msb = get_count_order(smp_num_siblings); - - core_bits = get_count_order(c->x86_max_cores); - -#ifdef CONFIG_X86_64 - c->cpu_core_id = phys_pkg_id(index_msb) & - ((1 << core_bits) - 1); -#else - c->cpu_core_id = phys_pkg_id(c->initial_apicid, index_msb) & - ((1 << core_bits) - 1); -#endif - } - -out: - if ((c->x86_max_cores * smp_num_siblings) > 1) { - printk(KERN_INFO "CPU: Physical Processor ID: %d\n", - c->phys_proc_id); - printk(KERN_INFO "CPU: Processor Core ID: %d\n", - c->cpu_core_id); - } -#endif -} - -static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) -{ - char *v = c->x86_vendor_id; - int i; - static int printed; - - for (i = 0; i < X86_VENDOR_NUM; i++) { - if (!cpu_devs[i]) - break; - - if (!strcmp(v, cpu_devs[i]->c_ident[0]) || - (cpu_devs[i]->c_ident[1] && - !strcmp(v, cpu_devs[i]->c_ident[1]))) { - this_cpu = cpu_devs[i]; - c->x86_vendor = this_cpu->c_x86_vendor; - return; - } - } - - if (!printed) { - printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); - printk(KERN_ERR "CPU: Your system may be unstable.\n"); - } - - c->x86_vendor = X86_VENDOR_UNKNOWN; - this_cpu = &default_cpu; -} - -void __cpuinit cpu_detect(struct cpuinfo_x86 *c) -{ - /* Get vendor name */ - cpuid(0x00000000, (unsigned int *)&c->cpuid_level, - (unsigned int *)&c->x86_vendor_id[0], - (unsigned int *)&c->x86_vendor_id[8], - (unsigned int *)&c->x86_vendor_id[4]); - - c->x86 = 4; - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 junk, tfms, cap0, misc; - cpuid(0x00000001, &tfms, &misc, &junk, &cap0); - c->x86 = (tfms >> 8) & 0xf; - c->x86_model = (tfms >> 4) & 0xf; - c->x86_mask = tfms & 0xf; - if (c->x86 == 0xf) - c->x86 += (tfms >> 20) & 0xff; - if (c->x86 >= 0x6) - c->x86_model += ((tfms >> 16) & 0xf) << 4; - if (cap0 & (1<<19)) { - c->x86_clflush_size = ((misc >> 8) & 0xff) * 8; - c->x86_cache_alignment = c->x86_clflush_size; - } - } -} - -static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) -{ - u32 tfms, xlvl; - u32 ebx; - - /* Intel-defined flags: level 0x00000001 */ - if (c->cpuid_level >= 0x00000001) { - u32 capability, excap; - cpuid(0x00000001, &tfms, &ebx, &excap, &capability); - c->x86_capability[0] = capability; - c->x86_capability[4] = excap; - } - - /* AMD-defined flags: level 0x80000001 */ - xlvl = cpuid_eax(0x80000000); - c->extended_cpuid_level = xlvl; - if ((xlvl & 0xffff0000) == 0x80000000) { - if (xlvl >= 0x80000001) { - c->x86_capability[1] = cpuid_edx(0x80000001); - c->x86_capability[6] = cpuid_ecx(0x80000001); - } - } - -#ifdef CONFIG_X86_64 - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - - if (c->extended_cpuid_level >= 0x80000008) { - u32 eax = cpuid_eax(0x80000008); - - c->x86_virt_bits = (eax >> 8) & 0xff; - c->x86_phys_bits = eax & 0xff; - } -#endif -} -/* - * Do minimum CPU detection early. - * Fields really needed: vendor, cpuid_level, family, model, mask, - * cache alignment. - * The others are not touched to avoid unwanted side effects. - * - * WARNING: this function is only called on the BP. Don't add code here - * that is supposed to run on all CPUs. - */ -static void __init early_identify_cpu(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_64 - c->x86_clflush_size = 64; -#else - c->x86_clflush_size = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - - if (!have_cpuid_p()) - return; - - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - c->extended_cpuid_level = 0; - - cpu_detect(c); - - get_cpu_vendor(c); - - get_cpu_cap(c); - - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); - - validate_pat_support(c); -} - -void __init early_cpu_init(void) -{ - struct cpu_dev **cdev; - int count = 0; - - printk("KERNEL supported cpus:\n"); - for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { - struct cpu_dev *cpudev = *cdev; - unsigned int j; - - if (count >= X86_VENDOR_NUM) - break; - cpu_devs[count] = cpudev; - count++; - - for (j = 0; j < 2; j++) { - if (!cpudev->c_ident[j]) - continue; - printk(" %s %s\n", cpudev->c_vendor, - cpudev->c_ident[j]); - } - } - - early_identify_cpu(&boot_cpu_data); -} - -/* - * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6, unfortunately, that's not true in practice because - * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. Hence, probe for it based on first - * principles. - * - * Note: no 64-bit chip is known to lack these, but put the code here - * for consistency with 32 bits, and to make it utterly trivial to - * diagnose the problem should it ever surface. - */ -static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) -{ - const u32 nopl_signature = 0x888c53b1; /* Random number */ - u32 has_nopl = nopl_signature; - - clear_cpu_cap(c, X86_FEATURE_NOPL); - if (c->x86 >= 6) { - asm volatile("\n" - "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ - "2:\n" - " .section .fixup,\"ax\"\n" - "3: xor %0,%0\n" - " jmp 2b\n" - " .previous\n" - _ASM_EXTABLE(1b,3b) - : "+a" (has_nopl)); - - if (has_nopl == nopl_signature) - set_cpu_cap(c, X86_FEATURE_NOPL); - } -} - -static void __cpuinit generic_identify(struct cpuinfo_x86 *c) -{ - if (!have_cpuid_p()) - return; - - c->extended_cpuid_level = 0; - - cpu_detect(c); - - get_cpu_vendor(c); - - get_cpu_cap(c); - - if (c->cpuid_level >= 0x00000001) { - c->initial_apicid = (cpuid_ebx(1) >> 24) & 0xFF; -#ifdef CONFIG_X86_32 -# ifdef CONFIG_X86_HT - c->apicid = phys_pkg_id(c->initial_apicid, 0); -# else - c->apicid = c->initial_apicid; -# endif -#endif - -#ifdef CONFIG_X86_HT - c->phys_proc_id = c->initial_apicid; -#endif - } - - if (c->extended_cpuid_level >= 0x80000004) - get_model_name(c); /* Default name */ - - init_scattered_cpuid_features(c); - detect_nopl(c); -} - -/* - * This does the hard work of actually picking apart the CPU stuff... - */ -static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) -{ - int i; - - c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; - c->x86_vendor = X86_VENDOR_UNKNOWN; - c->x86_model = c->x86_mask = 0; /* So far unknown... */ - c->x86_vendor_id[0] = '\0'; /* Unset */ - c->x86_model_id[0] = '\0'; /* Unset */ - c->x86_max_cores = 1; -#ifdef CONFIG_X86_64 - c->x86_coreid_bits = 0; - c->x86_clflush_size = 64; -#else - c->cpuid_level = -1; /* CPUID not detected */ - c->x86_clflush_size = 32; -#endif - c->x86_cache_alignment = c->x86_clflush_size; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - - if (!have_cpuid_p()) { - /* - * First of all, decide if this is a 486 or higher - * It's a 486 if we can modify the AC flag - */ - if (flag_is_changeable_p(X86_EFLAGS_AC)) - c->x86 = 4; - else - c->x86 = 3; - } - - generic_identify(c); - - if (this_cpu->c_identify) - this_cpu->c_identify(c); - -#ifdef CONFIG_X86_64 - c->apicid = phys_pkg_id(0); -#endif - - /* - * Vendor-specific initialization. In this section we - * canonicalize the feature flags, meaning if there are - * features a certain CPU supports which CPUID doesn't - * tell us, CPUID claiming incorrect flags, or other bugs, - * we handle them here. - * - * At the end of this section, c->x86_capability better - * indicate the features this CPU genuinely supports! - */ - if (this_cpu->c_init) - this_cpu->c_init(c); - - /* Disable the PN if appropriate */ - squash_the_stupid_serial_number(c); - - /* - * The vendor-specific functions might have changed features. Now - * we do "generic changes." - */ - - /* If the model name is still unset, do table lookup. */ - if (!c->x86_model_id[0]) { - char *p; - p = table_lookup_model(c); - if (p) - strcpy(c->x86_model_id, p); - else - /* Last resort... */ - sprintf(c->x86_model_id, "%02x/%02x", - c->x86, c->x86_model); - } - -#ifdef CONFIG_X86_64 - detect_ht(c); -#endif - - /* - * On SMP, boot_cpu_data holds the common feature set between - * all CPUs; so make sure that we indicate which features are - * common between the CPUs. The first time this routine gets - * executed, c == &boot_cpu_data. - */ - if (c != &boot_cpu_data) { - /* AND the already accumulated flags with these */ - for (i = 0; i < NCAPINTS; i++) - boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; - } - - /* Clear all flags overriden by options */ - for (i = 0; i < NCAPINTS; i++) - c->x86_capability[i] &= ~cleared_cpu_caps[i]; - -#ifdef CONFIG_X86_MCE - /* Init Machine Check Exception if available. */ - mcheck_init(c); -#endif - - select_idle_routine(c); - -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) - numa_add_cpu(smp_processor_id()); -#endif -} - -void __init identify_boot_cpu(void) -{ - identify_cpu(&boot_cpu_data); -#ifdef CONFIG_X86_32 - sysenter_setup(); - enable_sep_cpu(); -#endif -} - -void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) -{ - BUG_ON(c == &boot_cpu_data); - identify_cpu(c); -#ifdef CONFIG_X86_32 - enable_sep_cpu(); -#endif - mtrr_ap_init(); -} - -struct msr_range { - unsigned min; - unsigned max; -}; - -static struct msr_range msr_range_array[] __cpuinitdata = { - { 0x00000000, 0x00000418}, - { 0xc0000000, 0xc000040b}, - { 0xc0010000, 0xc0010142}, - { 0xc0011000, 0xc001103b}, -}; - -static void __cpuinit print_cpu_msr(void) -{ - unsigned index; - u64 val; - int i; - unsigned index_min, index_max; - - for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { - index_min = msr_range_array[i].min; - index_max = msr_range_array[i].max; - for (index = index_min; index < index_max; index++) { - if (rdmsrl_amd_safe(index, &val)) - continue; - printk(KERN_INFO " MSR%08x: %016llx\n", index, val); - } - } -} - -static int show_msr __cpuinitdata; -static __init int setup_show_msr(char *arg) -{ - int num; - - get_option(&arg, &num); - - if (num > 0) - show_msr = num; - return 1; -} -__setup("show_msr=", setup_show_msr); - -static __init int setup_noclflush(char *arg) -{ - setup_clear_cpu_cap(X86_FEATURE_CLFLSH); - return 1; -} -__setup("noclflush", setup_noclflush); - -void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) -{ - char *vendor = NULL; - - if (c->x86_vendor < X86_VENDOR_NUM) - vendor = this_cpu->c_vendor; - else if (c->cpuid_level >= 0) - vendor = c->x86_vendor_id; - - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) - printk(KERN_CONT "%s ", vendor); - - if (c->x86_model_id[0]) - printk(KERN_CONT "%s", c->x86_model_id); - else - printk(KERN_CONT "%d86", c->x86); - - if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT " stepping %02x\n", c->x86_mask); - else - printk(KERN_CONT "\n"); - -#ifdef CONFIG_SMP - if (c->cpu_index < show_msr) - print_cpu_msr(); -#else - if (show_msr) - print_cpu_msr(); -#endif -} - -static __init int setup_disablecpuid(char *arg) -{ - int bit; - if (get_option(&arg, &bit) && bit < NCAPINTS*32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; -} -__setup("clearcpuid=", setup_disablecpuid); - -cpumask_t cpu_initialized __cpuinitdata = CPU_MASK_NONE; - -#ifdef CONFIG_X86_64 -struct x8664_pda **_cpu_pda __read_mostly; -EXPORT_SYMBOL(_cpu_pda); - -struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; - -char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; - -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - -void pda_init(int cpu) -{ - struct x8664_pda *pda = cpu_pda(cpu); - - /* Setup up data that may be needed in __get_free_pages early */ - loadsegment(fs, 0); - loadsegment(gs, 0); - /* Memory clobbers used to order PDA accessed */ - mb(); - wrmsrl(MSR_GS_BASE, pda); - mb(); - - pda->cpunumber = cpu; - pda->irqcount = -1; - pda->kernelstack = (unsigned long)stack_thread_info() - - PDA_STACKOFFSET + THREAD_SIZE; - pda->active_mm = &init_mm; - pda->mmu_state = 0; - - if (cpu == 0) { - /* others are initialized in smpboot.c */ - pda->pcurrent = &init_task; - pda->irqstackptr = boot_cpu_stack; - pda->irqstackptr += IRQSTACKSIZE - 64; - } else { - if (!pda->irqstackptr) { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", - cpu); - pda->irqstackptr += IRQSTACKSIZE - 64; - } - - if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) - pda->nodenumber = cpu_to_node(cpu); - } -} - -char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + - DEBUG_STKSZ] __page_aligned_bss; - -extern asmlinkage void ignore_sysret(void); - -/* May not be marked __init: used by software suspend */ -void syscall_init(void) -{ - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ - wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); -#endif - - /* Flags to clear on syscall */ - wrmsrl(MSR_SYSCALL_MASK, - X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); -} - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - -unsigned long kernel_eflags; - -/* - * Copies of the original ist values from the tss are only accessed during - * debugging, no special alignment required. - */ -DEFINE_PER_CPU(struct orig_ist, orig_ist); - -#else - -/* Make sure %fs is initialized properly in idle threads */ -struct pt_regs * __cpuinit idle_regs(struct pt_regs *regs) -{ - memset(regs, 0, sizeof(struct pt_regs)); - regs->fs = __KERNEL_PERCPU; - return regs; -} -#endif - -/* - * cpu_init() initializes state that is per-CPU. Some data is already - * initialized (naturally) in the bootstrap process, such as the GDT - * and IDT. We reload them nevertheless, this function acts as a - * 'CPU state barrier', nothing should get across. - * A lot of state is already set up in PDA init for 64 bit - */ -#ifdef CONFIG_X86_64 -void __cpuinit cpu_init(void) -{ - int cpu = stack_smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct orig_ist *orig_ist = &per_cpu(orig_ist, cpu); - unsigned long v; - char *estacks = NULL; - struct task_struct *me; - int i; - - /* CPU 0 is initialised in head64.c */ - if (cpu != 0) - pda_init(cpu); - else - estacks = boot_exception_stacks; - - me = current; - - if (cpu_test_and_set(cpu, cpu_initialized)) - panic("CPU#%d already initialized!\n", cpu); - - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - /* - * Initialize the per-CPU GDT with the boot GDT, - * and set up the GDT descriptor: - */ - - switch_to_new_gdt(); - load_idt((const struct desc_ptr *)&idt_descr); - - memset(me->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); - syscall_init(); - - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); - barrier(); - - check_efer(); - if (cpu != 0 && x2apic) - enable_x2apic(); - - /* - * set up and load the per-CPU TSS - */ - if (!orig_ist->ist[0]) { - static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER - }; - for (v = 0; v < N_EXCEPTION_STACKS; v++) { - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception " - "stack %ld %d\n", v, cpu); - } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = - (unsigned long)estacks; - } - } - - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); - /* - * <= is required because the CPU will access up to - * 8 bits beyond the end of the IO permission bitmap. - */ - for (i = 0; i <= IO_BITMAP_LONGS; i++) - t->io_bitmap[i] = ~0UL; - - atomic_inc(&init_mm.mm_count); - me->active_mm = &init_mm; - if (me->mm) - BUG(); - enter_lazy_tlb(&init_mm, me); - - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else { -#endif - /* - * Clear all 6 debug registers: - */ - - set_debugreg(0UL, 0); - set_debugreg(0UL, 1); - set_debugreg(0UL, 2); - set_debugreg(0UL, 3); - set_debugreg(0UL, 6); - set_debugreg(0UL, 7); -#ifdef CONFIG_KGDB - /* If the kgdb is connected no debug regs should be altered. */ - } -#endif - - fpu_init(); - - raw_local_save_flags(kernel_eflags); - - if (is_uv_system()) - uv_cpu_init(); -} - -#else - -void __cpuinit cpu_init(void) -{ - int cpu = smp_processor_id(); - struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); - struct thread_struct *thread = &curr->thread; - - if (cpu_test_and_set(cpu, cpu_initialized)) { - printk(KERN_WARNING "CPU#%d already initialized!\n", cpu); - for (;;) local_irq_enable(); - } - - printk(KERN_INFO "Initializing CPU#%d\n", cpu); - - if (cpu_has_vme || cpu_has_tsc || cpu_has_de) - clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); - - load_idt(&idt_descr); - switch_to_new_gdt(); - - /* - * Set up and load the per-CPU TSS and LDT - */ - atomic_inc(&init_mm.mm_count); - curr->active_mm = &init_mm; - if (curr->mm) - BUG(); - enter_lazy_tlb(&init_mm, curr); - - load_sp0(t, thread); - set_tss_desc(cpu, t); - load_TR_desc(); - load_LDT(&init_mm.context); - -#ifdef CONFIG_DOUBLEFAULT - /* Set up doublefault TSS pointer in the GDT */ - __set_tss_desc(cpu, GDT_ENTRY_DOUBLEFAULT_TSS, &doublefault_tss); -#endif - - /* Clear %gs. */ - asm volatile ("mov %0, %%gs" : : "r" (0)); - - /* Clear all 6 debug registers: */ - set_debugreg(0, 0); - set_debugreg(0, 1); - set_debugreg(0, 2); - set_debugreg(0, 3); - set_debugreg(0, 6); - set_debugreg(0, 7); - - /* - * Force FPU initialization: - */ - if (cpu_has_xsave) - current_thread_info()->status = TS_XSAVE; - else - current_thread_info()->status = 0; - clear_used_math(); - mxcsr_feature_mask_init(); - - /* - * Boot processor to setup the FP and extended state context info. - */ - if (!smp_processor_id()) - init_thread_xstate(); - - xsave_init(); -} - -#ifdef CONFIG_HOTPLUG_CPU -void __cpuinit cpu_uninit(void) -{ - int cpu = raw_smp_processor_id(); - cpu_clear(cpu, cpu_initialized); - - /* lazy TLB state */ - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} -#endif - -#endif -- cgit v1.2.2 From bd220a24a9263eaa70d5e6821383085950b5a13c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Sep 2008 00:58:28 -0700 Subject: x86: move nonx_setup etc from common.c to init_64.c like 32 bit put it in init_32.c Signed-off-by: Yinghai Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 54 -------------------------------------------- arch/x86/mm/init_64.c | 54 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eef868c97b89..286c89a991bd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -847,51 +847,6 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; -unsigned long __supported_pte_mask __read_mostly = ~0UL; -EXPORT_SYMBOL_GPL(__supported_pte_mask); - -static int do_not_nx __cpuinitdata; - -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - do_not_nx = 0; - } else if (!strncmp(str, "off", 3)) { - do_not_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -int force_personality32; - -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ -static int __init nonx32_setup(char *str) -{ - if (!strcmp(str, "on")) - force_personality32 &= ~READ_IMPLIES_EXEC; - else if (!strcmp(str, "off")) - force_personality32 |= READ_IMPLIES_EXEC; - return 1; -} -__setup("noexec32=", nonx32_setup); - void pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); @@ -957,15 +912,6 @@ void syscall_init(void) X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); } -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || do_not_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - unsigned long kernel_eflags; /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d3746efb060d..9c750d6307b0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -88,6 +88,60 @@ early_param("gbpages", parse_direct_gbpages_on); int after_bootmem; +unsigned long __supported_pte_mask __read_mostly = ~0UL; +EXPORT_SYMBOL_GPL(__supported_pte_mask); + +static int do_not_nx __cpuinitdata; + +/* noexec=on|off +Control non executable mappings for 64bit processes. + +on Enable(default) +off Disable +*/ +static int __init nonx_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + do_not_nx = 0; + } else if (!strncmp(str, "off", 3)) { + do_not_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", nonx_setup); + +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || do_not_nx) + __supported_pte_mask &= ~_PAGE_NX; +} + +int force_personality32; + +/* noexec32=on|off +Control non executable heap for 32bit processes. +To control the stack too use noexec=off + +on PROT_READ does not imply PROT_EXEC for 32bit processes (default) +off PROT_READ implies PROT_EXEC +*/ +static int __init nonx32_setup(char *str) +{ + if (!strcmp(str, "on")) + force_personality32 &= ~READ_IMPLIES_EXEC; + else if (!strcmp(str, "off")) + force_personality32 |= READ_IMPLIES_EXEC; + return 1; +} +__setup("noexec32=", nonx32_setup); + /* * NOTE: This function is marked __ref because it calls __init function * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. -- cgit v1.2.2 From deed05b7c017e49473e8c386efba196410fd18b3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 5 Sep 2008 10:23:26 +0200 Subject: x86, init_64.c: cleanup Clean up comments. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9c750d6307b0..1f6806b62eb8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -93,12 +93,13 @@ EXPORT_SYMBOL_GPL(__supported_pte_mask); static int do_not_nx __cpuinitdata; -/* noexec=on|off -Control non executable mappings for 64bit processes. - -on Enable(default) -off Disable -*/ +/* + * noexec=on|off + * Control non-executable mappings for 64-bit processes. + * + * on Enable (default) + * off Disable + */ static int __init nonx_setup(char *str) { if (!str) @@ -125,13 +126,14 @@ void __cpuinit check_efer(void) int force_personality32; -/* noexec32=on|off -Control non executable heap for 32bit processes. -To control the stack too use noexec=off - -on PROT_READ does not imply PROT_EXEC for 32bit processes (default) -off PROT_READ implies PROT_EXEC -*/ +/* + * noexec32=on|off + * Control non executable heap for 32bit processes. + * To control the stack too use noexec=off + * + * on PROT_READ does not imply PROT_EXEC for 32-bit processes (default) + * off PROT_READ implies PROT_EXEC + */ static int __init nonx32_setup(char *str) { if (!strcmp(str, "on")) -- cgit v1.2.2 From 7cfb0435330364f90f274a26ecdc5f47f738498c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:24 +0000 Subject: HPET: make minimum reprogramming delta useful The minimum reprogramming delta was hardcoded in HPET ticks, which is stupid as it does not work with faster running HPETs. The C1E idle patches made this prominent on AMD/RS690 chipsets, where the HPET runs with 25MHz. Set it to 5us which seems to be a reasonable value and fixes the problems on the bug reporters machines. We have a further sanity check now in the clock events, which increases the delta when it is not sufficient. Signed-off-by: Thomas Gleixner Tested-by: Luiz Fernando N. Capitulino Tested-by: Dmitry Nezhevenko Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 59fd3b6b1303..2256315416d8 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it -- cgit v1.2.2 From 551b4545bf9658dc5ae5a1277dcd4d7bf0028b28 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 5 Sep 2008 17:58:49 +0900 Subject: x86: gart alloc_coherent doesn't need to check NULL device argument asm/dma-mapping.h guarantees that gart alloc_coherent doesn't get NULL device argument. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 4d0864900b8c..0b99d4a06f74 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -506,9 +506,6 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, align_mask = (1UL << get_order(size)) - 1; - if (!dev) - dev = &x86_dma_fallback_dev; - *dma_addr = dma_map_area(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL, align_mask); flush_gart(); -- cgit v1.2.2 From 110e0358e7dfd9cc56d47077068f3680dae10b56 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 28 Aug 2008 13:58:39 -0700 Subject: x86: make sure the CPA test code's use of _PAGE_UNUSED1 is obvious The CPA test code uses _PAGE_UNUSED1, so make sure its obvious. Signed-off-by: Jeremy Fitzhardinge Cc: Nick Piggin Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 7c3017287119..e1d106909218 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -32,7 +32,7 @@ enum { GPS = (1<<30) }; -#define PAGE_TESTBIT __pgprot(_PAGE_UNUSED1) +#define PAGE_CPA_TEST __pgprot(_PAGE_CPA_TEST) static int pte_testbit(pte_t pte) { @@ -174,7 +174,7 @@ static int pageattr_test(void) } test_addr = addr[i]; - err = change_page_attr_set(&test_addr, len[i], PAGE_TESTBIT, 0); + err = change_page_attr_set(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++; @@ -207,7 +207,7 @@ static int pageattr_test(void) continue; } test_addr = addr[i]; - err = change_page_attr_clear(&test_addr, len[i], PAGE_TESTBIT, 0); + err = change_page_attr_clear(&test_addr, len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; -- cgit v1.2.2 From 913da64b54b2b3bb212a59aba2e6f2b8294ca1fa Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:30:23 +0100 Subject: x86: build fix for !CONFIG_SMP Move reset_lazy_tlbstate into tlb_32.c, and define noop versions of play_dead() in process_{32,64}.c when !CONFIG_SMP. Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/process_32.c | 7 +++++++ arch/x86/kernel/process_64.c | 7 +++++++ arch/x86/kernel/tlb_32.c | 8 ++++++++ 4 files changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 18f5551b32dd..76d10d5c2fa7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -714,10 +714,3 @@ void __cpuinit cpu_init(void) mxcsr_feature_mask_init(); } -void reset_lazy_tlbstate(void) -{ - int cpu = raw_smp_processor_id(); - - per_cpu(cpu_tlbstate, cpu).state = 0; - per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; -} diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 633cf06578fa..b76b38ff962b 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -72,6 +72,13 @@ unsigned long thread_saved_pc(struct task_struct *tsk) return ((unsigned long *)tsk->thread.sp)[3]; } +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index aa28ed01cdf3..ec27afa43d7e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -85,6 +85,13 @@ void exit_idle(void) __exit_idle(); } +#ifndef CONFIG_SMP +static inline void play_dead(void) +{ + BUG(); +} +#endif + /* * The idle thread. There's no useful work to be * done, so just try to conserve power and have a diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c index fec1ecedc9b7..e00534b33534 100644 --- a/arch/x86/kernel/tlb_32.c +++ b/arch/x86/kernel/tlb_32.c @@ -241,3 +241,11 @@ void flush_tlb_all(void) on_each_cpu(do_flush_tlb_all, NULL, 1); } +void reset_lazy_tlbstate(void) +{ + int cpu = raw_smp_processor_id(); + + per_cpu(cpu_tlbstate, cpu).state = 0; + per_cpu(cpu_tlbstate, cpu).active_mm = &init_mm; +} + -- cgit v1.2.2 From efd327a2d41214dded03cbfbb6d447530964cddd Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:36:40 +0100 Subject: x86/paravirt: Remove duplicate paravirt_pagetable_setup_{start, done}() They were already called once in arch/x86/kernel/setup.c - we don't need to call them again. Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..60ec1d08ff24 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -458,11 +458,7 @@ static void __init pagetable_init(void) { pgd_t *pgd_base = swapper_pg_dir; - paravirt_pagetable_setup_start(pgd_base); - permanent_kmaps_init(pgd_base); - - paravirt_pagetable_setup_done(pgd_base); } #ifdef CONFIG_ACPI_SLEEP -- cgit v1.2.2 From 5b7e41ff37267c35b0fcf9162ca0c32c3d8d2c5c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 3 Sep 2008 17:24:03 -0700 Subject: x86: additional defconfig updates Additional updates to the x86 defconfigs. The goals are, as before: - Make them usable to testers, more so than distributors or end users, both of which are likely to have their own config already. - Keep 32 and 64 bits as similar as is practical. Changes: - Use a more generic CPU type (ppro and generic, respectively). - Bump number of CPUs to 64 (few if any NR_CPUS arrays left). - Enable PAT. - Enable OPTIMIZE_INLINE. - Enable microcode update support. - Build SMT scheduler support (in addition to MC). Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/configs/i386_defconfig | 19 +++++++++++-------- arch/x86/configs/x86_64_defconfig | 29 +++++++++++++---------------- 2 files changed, 24 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 104275e191a8..ef9a52005ec9 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.27-rc4 -# Mon Aug 25 15:04:00 2008 +# Linux kernel version: 2.6.27-rc5 +# Wed Sep 3 17:23:09 2008 # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -202,7 +202,7 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_M586 is not set # CONFIG_M586TSC is not set # CONFIG_M586MMX is not set -# CONFIG_M686 is not set +CONFIG_M686=y # CONFIG_MPENTIUMII is not set # CONFIG_MPENTIUMIII is not set # CONFIG_MPENTIUMM is not set @@ -221,13 +221,14 @@ CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER=y # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set -CONFIG_MCORE2=y +# CONFIG_MCORE2 is not set # CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y CONFIG_X86_CMPXCHG=y CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_XADD=y +# CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_INVLPG=y CONFIG_X86_BSWAP=y @@ -235,14 +236,15 @@ CONFIG_X86_POPAD_OK=y CONFIG_X86_INTEL_USERCOPY=y CONFIG_X86_USE_PPRO_CHECKSUM=y CONFIG_X86_TSC=y +CONFIG_X86_CMOV=y CONFIG_X86_MINIMUM_CPU_FAMILY=4 CONFIG_X86_DEBUGCTLMSR=y CONFIG_HPET_TIMER=y CONFIG_HPET_EMULATE_RTC=y CONFIG_DMI=y # CONFIG_IOMMU_HELPER is not set -CONFIG_NR_CPUS=4 -# CONFIG_SCHED_SMT is not set +CONFIG_NR_CPUS=64 +CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y @@ -254,7 +256,8 @@ CONFIG_VM86=y # CONFIG_TOSHIBA is not set # CONFIG_I8K is not set CONFIG_X86_REBOOTFIXUPS=y -# CONFIG_MICROCODE is not set +CONFIG_MICROCODE=y +CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y # CONFIG_NOHIGHMEM is not set @@ -2115,7 +2118,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set -# CONFIG_OPTIMIZE_INLINING is not set +CONFIG_OPTIMIZE_INLINING=y # # Security options diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 678c8acefe04..e620ea6e2a7a 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.27-rc4 -# Mon Aug 25 14:40:46 2008 +# Linux kernel version: 2.6.27-rc5 +# Wed Sep 3 17:13:39 2008 # CONFIG_64BIT=y # CONFIG_X86_32 is not set @@ -218,17 +218,14 @@ CONFIG_X86_PC=y # CONFIG_MVIAC3_2 is not set # CONFIG_MVIAC7 is not set # CONFIG_MPSC is not set -CONFIG_MCORE2=y -# CONFIG_GENERIC_CPU is not set +# CONFIG_MCORE2 is not set +CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=64 -CONFIG_X86_INTERNODE_CACHE_BYTES=64 +CONFIG_X86_L1_CACHE_BYTES=128 +CONFIG_X86_INTERNODE_CACHE_BYTES=128 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=7 CONFIG_X86_WP_WORKS_OK=y -CONFIG_X86_INTEL_USERCOPY=y -CONFIG_X86_USE_PPRO_CHECKSUM=y -CONFIG_X86_P6_NOP=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y CONFIG_X86_CMOV=y @@ -243,9 +240,8 @@ CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT=y CONFIG_AMD_IOMMU=y CONFIG_SWIOTLB=y CONFIG_IOMMU_HELPER=y -# CONFIG_MAXSMP is not set -CONFIG_NR_CPUS=4 -# CONFIG_SCHED_SMT is not set +CONFIG_NR_CPUS=64 +CONFIG_SCHED_SMT=y CONFIG_SCHED_MC=y # CONFIG_PREEMPT_NONE is not set CONFIG_PREEMPT_VOLUNTARY=y @@ -254,7 +250,8 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y # CONFIG_X86_MCE is not set # CONFIG_I8K is not set -# CONFIG_MICROCODE is not set +CONFIG_MICROCODE=y +CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y CONFIG_NUMA=y @@ -290,7 +287,7 @@ CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set -# CONFIG_X86_PAT is not set +CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set @@ -2089,7 +2086,7 @@ CONFIG_IO_DELAY_0X80=y CONFIG_DEFAULT_IO_DELAY_TYPE=0 CONFIG_DEBUG_BOOT_PARAMS=y # CONFIG_CPA_DEBUG is not set -# CONFIG_OPTIMIZE_INLINING is not set +CONFIG_OPTIMIZE_INLINING=y # # Security options -- cgit v1.2.2 From e7250b8ae3870f37f660c2f65cafcaba85e3bfd3 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 5 Sep 2008 18:33:26 +0200 Subject: x86: hpet: modify IXP400 quirk to enable interrupts The current quirk is incomplete. Some more chipset fiddling has to be done to enable HPET interrupts. This patch aims to do this. From my tests it seems to work faultlessly. But the official statement is that HPET is not supported on SB4X0. Users will still have to use hpet=force to enable it. Use it at your own risk. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index d13858818100..f6a11b9b1f98 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -354,9 +354,27 @@ static void ati_force_hpet_resume(void) printk(KERN_DEBUG "Force enabled HPET at resume\n"); } +static u32 ati_ixp4x0_rev(struct pci_dev *dev) +{ + u32 d; + u8 b; + + pci_read_config_byte(dev, 0xac, &b); + b &= ~(1<<5); + pci_write_config_byte(dev, 0xac, b); + pci_read_config_dword(dev, 0x70, &d); + d |= 1<<8; + pci_write_config_dword(dev, 0x70, d); + pci_read_config_dword(dev, 0x8, &d); + d &= 0xff; + dev_printk(KERN_DEBUG, &dev->dev, "SB4X0 revision 0x%x\n", d); + return d; +} + static void ati_force_enable_hpet(struct pci_dev *dev) { - u32 uninitialized_var(val); + u32 d, val; + u8 b; if (hpet_address || force_hpet_address) return; @@ -366,14 +384,33 @@ static void ati_force_enable_hpet(struct pci_dev *dev) return; } + d = ati_ixp4x0_rev(dev); + if (d < 0x82) + return; + + /* base address */ pci_write_config_dword(dev, 0x14, 0xfed00000); pci_read_config_dword(dev, 0x14, &val); + + /* enable interrupt */ + outb(0x72, 0xcd6); b = inb(0xcd7); + b |= 0x1; + outb(0x72, 0xcd6); outb(b, 0xcd7); + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x1)) + return; + pci_read_config_dword(dev, 0x64, &d); + d |= (1<<10); + pci_write_config_dword(dev, 0x64, d); + pci_read_config_dword(dev, 0x64, &d); + if (!(d & (1<<10))) + return; + force_hpet_address = val; force_hpet_resume_type = ATI_FORCE_HPET_RESUME; dev_printk(KERN_DEBUG, &dev->dev, "Force enabled HPET at 0x%lx\n", force_hpet_address); cached_dev = dev; - return; } DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, ati_force_enable_hpet); -- cgit v1.2.2 From cf169702ba6928cee9d4f4adf3e932b643b8db7a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 2 Sep 2008 13:13:40 +0200 Subject: x86, gart: add detection of AMD family 0x11 northbridges This patch adds the detection of the northbridges in the AMD family 0x11 processors. It also fixes the magic numbers there while changing this code. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/k8.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c index 7377ccb21335..304d8bad6559 100644 --- a/arch/x86/kernel/k8.c +++ b/arch/x86/kernel/k8.c @@ -16,8 +16,9 @@ EXPORT_SYMBOL(num_k8_northbridges); static u32 *flush_words; struct pci_device_id k8_nb_ids[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_11H_NB_MISC) }, {} }; EXPORT_SYMBOL(k8_nb_ids); -- cgit v1.2.2 From e51af6630848406fc97adbd71443818cdcda297b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 4 Sep 2008 09:54:37 +0100 Subject: x86: blacklist DMAR on Intel G31/G33 chipsets Some BIOSes (the Intel DG33BU, for example) wrongly claim to have DMAR when they don't. Avoid the resulting crashes when it doesn't work as expected. I'd still be grateful if someone could test it on a DG33BU with the old BIOS though, since I've killed mine. I tested the DMI version, but not this one. Signed-off-by: David Woodhouse Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 4353cf5e6fac..24bb5faf5efa 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,6 +95,20 @@ static void __init nvidia_bugs(int num, int slot, int func) } +#ifdef CONFIG_DMAR +static void __init intel_g33_dmar(int num, int slot, int func) +{ + struct acpi_table_header *dmar_tbl; + acpi_status status; + + status = acpi_get_table(ACPI_SIG_DMAR, 0, &dmar_tbl); + if (ACPI_SUCCESS(status)) { + printk(KERN_INFO "BIOS BUG: DMAR advertised on Intel G31/G33 chipset -- ignoring\n"); + dmar_disabled = 1; + } +} +#endif + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -114,6 +128,10 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, +#ifdef CONFIG_DMAR + { PCI_VENDOR_ID_INTEL, 0x29c0, + PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, +#endif {} }; -- cgit v1.2.2 From b74b06c5f6612a72298f37baa65460a59c26ca67 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 15 Aug 2008 15:36:31 -0700 Subject: x86: boot: stub out unimplemented CPU feature words The CPU feature detection code in the boot code is somewhat minimal, and doesn't include all possible CPUID words. In particular, it doesn't contain the code for CPU feature words 2 (Transmeta), 3 (Linux-specific), 5 (VIA), or 7 (scattered). Zero them out, so we can still set those bits as known at compile time; in particular, this allows creating a Linux-specific NOPL flag and have it required (and therefore resolvable at compile time) in 64-bit mode. Signed-off-by: H. Peter Anvin --- arch/x86/boot/cpucheck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4b9ae7c56748..4d3ff037201f 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -38,12 +38,12 @@ static const u32 req_flags[NCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, - REQUIRED_MASK2, - REQUIRED_MASK3, + 0, /* REQUIRED_MASK2 not implemented in this file */ + 0, /* REQUIRED_MASK3 not implemented in this file */ REQUIRED_MASK4, - REQUIRED_MASK5, + 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, - REQUIRED_MASK7, + 0, /* REQUIRED_MASK7 not implemented in this file */ }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) -- cgit v1.2.2 From b6734c35af028f06772c0b2c836c7d579e6d4dad Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:39:32 -0700 Subject: x86: add NOPL as a synthetic CPU feature bit The long noops ("NOPL") are supposed to be detected by family >= 6. Unfortunately, several non-Intel x86 implementations, both hardware and software, don't obey this dictum. Instead, probe for NOPL directly by executing a NOPL instruction and see if we get #UD. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/common_64.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/feature_names.c | 3 ++- 3 files changed, 69 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa39..0785b3c8d043 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include @@ -341,6 +342,35 @@ static void __init early_cpu_detect(void) early_get_cap(c); } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -395,8 +425,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017e..c3afba5a81a7 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } + detect_nopl(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cba..c9017799497c 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ -- cgit v1.2.2 From f31d731e4467e61de51d7f6d7115f3b712d9354c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:50:33 -0700 Subject: x86: use X86_FEATURE_NOPL in alternatives Use X86_FEATURE_NOPL to determine if it is safe to use P6 NOPs in alternatives. Also, replace table and loop with simple if statement. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b553..65a0c1b48696 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.2 From beb20d52d03a51218827fb4a36a4b583debb03f9 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 1 Sep 2008 14:55:57 -0700 Subject: hrtimer: convert kvm to the new hrtimer apis In order to be able to do range hrtimers we need to use accessor functions to the "expire" member of the hrtimer struct. This patch converts KVM to these accessors. Signed-off-by: Arjan van de Ven --- arch/x86/kvm/i8254.c | 6 +++--- arch/x86/kvm/lapic.c | 6 ++---- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a9124..1bf8f57a3041 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -205,8 +205,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) wake_up_interruptible(&vcpu0->wq); } - pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); - pt->scheduled = ktime_to_ns(pt->timer.expires); + hrtimer_add_expires_ns(&pt->timer, pt->period); + pt->scheduled = ktime_to_ns(hrtimer_get_expires(&pt->timer)); return (pt->period == 0 ? 0 : 1); } @@ -246,7 +246,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) timer = &pit->pit_state.pit_timer.timer; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } static void destroy_pit_timer(struct kvm_kpit_timer *pt) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f67..a5b61de6adf1 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -953,9 +953,7 @@ static int __apic_timer_fn(struct kvm_lapic *apic) } if (apic_lvtt_period(apic)) { result = 1; - apic->timer.dev.expires = ktime_add_ns( - apic->timer.dev.expires, - apic->timer.period); + hrtimer_add_expires_ns(&apic->timer.dev, apic->timer.period); } return result; } @@ -1124,7 +1122,7 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) timer = &apic->timer.dev; if (hrtimer_cancel(timer)) - hrtimer_start(timer, timer->expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From d2f37384fc9957ad0162d5285a5660f0a86ef243 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 5 Sep 2008 21:28:27 -0700 Subject: x86: when building image.iso, use isohybrid if it exists When building image.iso (make isoimage), use the isohybrid tool if it exists. isohybrid is a script included with Syslinux 3.72 and higher, which creates an image that can be booted either as a hard disk (including removable, e.g. USB disk) or as a CD-ROM. If isohybrid doesn't exist, then this has no effect. Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 7ee102f9c4f8..cceba1f46365 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -181,6 +181,7 @@ isoimage: $(BOOTIMAGE) mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \ -no-emul-boot -boot-load-size 4 -boot-info-table \ $(obj)/isoimage + isohybrid $(obj)/image.iso 2>/dev/null || true rm -rf $(obj)/isoimage zlilo: $(BOOTIMAGE) -- cgit v1.2.2 From f7676254f179eac6b5244a80195ec8ae0e9d4606 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:03:32 +0200 Subject: x86: HPET fix moronic 32/64bit thinko We use the HPET only in 32bit mode because: 1) some HPETs are 32bit only 2) on i386 there is no way to read/write the HPET atomic 64bit wide The HPET code unification done by the "moron of the year" did not take into account that unsigned long is different on 32 and 64 bit. This thinko results in a possible endless loop in the clockevents code, when the return comparison fails due to the 64bit/332bit unawareness. unsigned long cnt = (u32) hpet_read() + delta can wrap over 32bit. but the final compare will fail and return -ETIME causing endless loops. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2256315416d8..801497a16e0e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -270,15 +270,15 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } /* -- cgit v1.2.2 From 72d43d9bc9210d24d09202eaf219eac09e17b339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:06:08 +0200 Subject: x86: HPET: read back compare register before reading counter After fixing the u32 thinko I sill had occasional hickups on ATI chipsets with small deltas. There seems to be a delay between writing the compare register and the transffer to the internal register which triggers the interrupt. Reading back the value makes sure, that it hit the internal match register befor we compare against the counter value. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 801497a16e0e..73deaffadd03 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -278,6 +278,13 @@ static int hpet_legacy_next_event(unsigned long delta, cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. + */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } -- cgit v1.2.2 From 1b05d60d60e81c6594da8298107a05b506f01797 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Sep 2008 01:52:27 -0700 Subject: x86: remove duplicated get_model_name() calling Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 3 --- arch/x86/kernel/cpu/amd_64.c | 3 +-- arch/x86/kernel/cpu/centaur.c | 1 - arch/x86/kernel/cpu/common.c | 9 +++------ arch/x86/kernel/cpu/cpu.h | 1 - arch/x86/kernel/cpu/cyrix.c | 1 - arch/x86/kernel/cpu/transmeta.c | 1 - 7 files changed, 4 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d64ea6097ca7..e0ba2c7c5a18 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -42,7 +42,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) { u32 l, h; int mbytes = num_physpages >> (20-PAGE_SHIFT); - int r; #ifdef CONFIG_SMP unsigned long long value; @@ -75,8 +74,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) */ clear_cpu_cap(c, 0*32+31); - r = get_model_name(c); - switch (c->x86) { case 4: /* diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index d1c721c0c49f..c5fbf7477ead 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -157,8 +157,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - level = get_model_name(c); - if (!level) { + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: /* Should distinguish Models here, but this is only diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e5f6d89521bf..89bfdd9cacc6 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -289,7 +289,6 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) if (c->x86_model >= 6 && c->x86_model < 9) set_cpu_cap(c, X86_FEATURE_3DNOW); - get_model_name(c); display_cacheinfo(c); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 286c89a991bd..a8b9b7242428 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -252,13 +252,13 @@ static struct cpu_dev __cpuinitdata default_cpu = { .c_x86_vendor = X86_VENDOR_UNKNOWN, }; -int __cpuinit get_model_name(struct cpuinfo_x86 *c) +static void __cpuinit get_model_name(struct cpuinfo_x86 *c) { unsigned int *v; char *p, *q; if (c->extended_cpuid_level < 0x80000004) - return 0; + return; v = (unsigned int *) c->x86_model_id; cpuid(0x80000002, &v[0], &v[1], &v[2], &v[3]); @@ -277,8 +277,6 @@ int __cpuinit get_model_name(struct cpuinfo_x86 *c) while (q <= &c->x86_model_id[48]) *q++ = '\0'; /* Zero-pad the rest */ } - - return 1; } void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) @@ -610,8 +608,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) #endif } - if (c->extended_cpuid_level >= 0x80000004) - get_model_name(c); /* Default name */ + get_model_name(c); /* Default name */ init_scattered_cpuid_features(c); detect_nopl(c); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 3cc9d92afd8f..de4094a39210 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -31,7 +31,6 @@ struct cpu_dev { extern struct cpu_dev *__x86_cpu_dev_start[], *__x86_cpu_dev_end[]; -extern int get_model_name(struct cpuinfo_x86 *c); extern void display_cacheinfo(struct cpuinfo_x86 *c); #endif diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index 3f8c7283d816..ffd0f5ed071a 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -301,7 +301,6 @@ static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) */ if ((0x30 <= dir1 && dir1 <= 0x6f) || (0x80 <= dir1 && dir1 <= 0x8f)) geode_configure(); - get_model_name(c); /* get CPU marketing name */ return; } else { /* MediaGX */ Cx86_cb[2] = (dir0_lsn & 1) ? '3' : '4'; diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 7c46e6ecedca..738e03244f95 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -12,7 +12,6 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; - get_model_name(c); /* Same as AMD/Cyrix */ display_cacheinfo(c); /* Print CMS and CPU revision */ -- cgit v1.2.2 From e3224234717b4228c235cee401af89212f17a3a4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 6 Sep 2008 01:52:28 -0700 Subject: x86, cpu init: call early_init_xxx in init_xxx so we: 1. could set some cap to ap 2. restore some cap after memset in identify_cpu for boot cpu esp for CONSTANT_TSC this matters, as: before this patch: flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow rep_good nopl pni monitor cx16 lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs after this patch: flags : fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm 3dnowext 3dnow constant_tsc rep_good nopl pni monitor cx16 lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs so constant_tsc is back... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 ++----- arch/x86/kernel/cpu/amd_64.c | 2 ++ arch/x86/kernel/cpu/centaur_64.c | 2 ++ arch/x86/kernel/cpu/common.c | 7 ++++--- arch/x86/kernel/cpu/intel_64.c | 2 ++ 5 files changed, 12 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e0ba2c7c5a18..c3175da7bc69 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -26,11 +26,8 @@ __asm__(".align 4\nvide: ret"); static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { - if (cpuid_eax(0x80000000) >= 0x80000007) { - c->x86_power = cpuid_edx(0x80000007); - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - } + if (c->x86_power & (1<<8)) + set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); /* Set MTRR capability flag if appropriate */ if (c->x86_model == 13 || c->x86_model == 9 || diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index c5fbf7477ead..8c2d07f06f1d 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -140,6 +140,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } #endif + early_init_amd(c); + /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 49cfc6d2f2fb..0e5cf17a3c89 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -16,6 +16,8 @@ static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { + early_init_centaur(c); + if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a8b9b7242428..e8045c4ef1c1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -473,9 +473,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } - if (c->extended_cpuid_level >= 0x80000007) - c->x86_power = cpuid_edx(0x80000007); - if (c->extended_cpuid_level >= 0x80000008) { u32 eax = cpuid_eax(0x80000008); @@ -483,6 +480,10 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } #endif + + if (c->extended_cpuid_level >= 0x80000007) + c->x86_power = cpuid_edx(0x80000007); + } /* * Do minimum CPU detection early. diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index 0c0a58dfe099..14a2cd98d480 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -54,6 +54,8 @@ static void __cpuinit srat_detect_node(void) static void __cpuinit init_intel(struct cpuinfo_x86 *c) { + early_init_intel(c); + init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); -- cgit v1.2.2 From bb57925f5057837c248b57a51181fd6b138a32f3 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:26:55 -0700 Subject: x86_32: signal: use syscall_get_nr and error Use asm/syscall.h interfaces that do the same things. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b21070ea33a4..3e4a688bb84f 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "sigframe.h" @@ -507,9 +508,9 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, int ret; /* Are we from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTART_RESTARTBLOCK: case -ERESTARTNOHAND: regs->ax = -EINTR; @@ -623,9 +624,9 @@ static void do_signal(struct pt_regs *regs) } /* Did we come from a system call? */ - if ((long)regs->orig_ax >= 0) { + if (syscall_get_nr(current, regs) >= 0) { /* Restart the system call - no handlers present */ - switch (regs->ax) { + switch (syscall_get_error(current, regs)) { case -ERESTARTNOHAND: case -ERESTARTSYS: case -ERESTARTNOINTR: -- cgit v1.2.2 From 72fa50f4ef9014f4212945b766af84ea94308903 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:27:11 -0700 Subject: x86_32: signal: introduce signal_fault() implement signal_fault() for 32bit. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 3e4a688bb84f..76d05d703845 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -243,7 +243,7 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - force_sig(SIGSEGV, current); + signal_fault(regs, frame, "rt sigreturn"); return 0; } @@ -669,3 +669,19 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) clear_thread_flag(TIF_IRET); } + +void signal_fault(struct pt_regs *regs, void __user *frame, char *where) +{ + struct task_struct *me = current; + + if (show_unhandled_signals && printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); + print_vma_addr(" in ", regs->ip); + printk(KERN_CONT "\n"); + } + + force_sig(SIGSEGV, me); +} -- cgit v1.2.2 From 8fcd8e20f388d787f6abf701b11037b122029d5b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:27:39 -0700 Subject: x86: signal: make NR_restart_syscall make NR_restart_syscall macro for cosmetic unification of handle_signal(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 3 ++- arch/x86/kernel/signal_64.c | 6 +++--- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 76d05d703845..bd9b65031a9a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -572,6 +572,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return 0; } +#define NR_restart_syscall __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -635,7 +636,7 @@ static void do_signal(struct pt_regs *regs) break; case -ERESTART_RESTARTBLOCK: - regs->ax = __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 823a55bf8c39..19e2b9143205 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -362,6 +362,8 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, return ret; } +#define NR_restart_syscall \ + test_thread_flag(TIF_IA32) ? __NR_ia32_restart_syscall : __NR_restart_syscall /* * Note that 'init' is a special process: it doesn't get signals it doesn't * want to handle. Thus you cannot kill init even with a SIGKILL even by @@ -423,9 +425,7 @@ static void do_signal(struct pt_regs *regs) regs->ip -= 2; break; case -ERESTART_RESTARTBLOCK: - regs->ax = test_thread_flag(TIF_IA32) ? - __NR_ia32_restart_syscall : - __NR_restart_syscall; + regs->ax = NR_restart_syscall; regs->ip -= 2; break; } -- cgit v1.2.2 From 1d13024e624d0e2e47f117b383917249a41ef5ce Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:28:06 -0700 Subject: x86: signal: split out frame setups Make setup_rt_frame() and split out frame setups from handle_signal(). This is for cosmetic unification of handle_signal(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 29 ++++++++++++++++++++--------- arch/x86/kernel/signal_64.c | 30 ++++++++++++++++++++---------- 2 files changed, 40 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index bd9b65031a9a..48982f7ce886 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -338,8 +338,8 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size, } static int -setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, - struct pt_regs *regs) +__setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, + struct pt_regs *regs) { struct sigframe __user *frame; void __user *restorer; @@ -416,8 +416,8 @@ give_sigsegv: return -EFAULT; } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *restorer; @@ -501,6 +501,21 @@ give_sigsegv: /* * OK, we're invoking a handler: */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int ret; + + /* Set up the stack frame */ + if (ka->sa.sa_flags & SA_SIGINFO) + ret = __setup_rt_frame(sig, ka, info, set, regs); + else + ret = __setup_frame(sig, ka, set, regs); + + return ret; +} + static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) @@ -537,11 +552,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; - /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = setup_frame(sig, ka, oldset, regs); + ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret) return ret; diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 19e2b9143205..8fbdd23d5cc6 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -195,8 +195,8 @@ get_stack(struct k_sigaction *ka, struct pt_regs *regs, unsigned long size) return (void __user *)round_down(sp - size, 64); } -static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, - sigset_t *set, struct pt_regs *regs) +static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) { struct rt_sigframe __user *frame; void __user *fp = NULL; @@ -280,6 +280,24 @@ give_sigsegv: /* * OK, we're invoking a handler */ +static int +setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, + sigset_t *set, struct pt_regs *regs) +{ + int ret; + +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = ia32_setup_rt_frame(sig, ka, info, set, regs); + else + ret = ia32_setup_frame(sig, ka, set, regs); + } else +#endif + ret = __setup_rt_frame(sig, ka, info, set, regs); + + return ret; +} static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, @@ -317,14 +335,6 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, likely(test_and_clear_thread_flag(TIF_FORCED_TF))) regs->flags &= ~X86_EFLAGS_TF; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { - if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, oldset, regs); - else - ret = ia32_setup_frame(sig, ka, oldset, regs); - } else -#endif ret = setup_rt_frame(sig, ka, info, oldset, regs); if (ret == 0) { -- cgit v1.2.2 From 13ad7725e9955ac68fff670a039da99ab33e86a0 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 5 Sep 2008 16:28:38 -0700 Subject: x86_32: signal: move signal number conversion to upper layer Bring signal number conversion in __setup_frame() and __setup_rt_frame() up into the common part setup_frame(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 31 ++++++++++++------------------- 1 file changed, 12 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 48982f7ce886..b668efc18eae 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -344,7 +344,6 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, struct sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -352,13 +351,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err = __put_user(usig, &frame->sig); + err = __put_user(sig, &frame->sig); if (err) goto give_sigsegv; @@ -422,7 +415,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, struct rt_sigframe __user *frame; void __user *restorer; int err = 0; - int usig; void __user *fpstate = NULL; frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); @@ -430,13 +422,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) goto give_sigsegv; - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; - - err |= __put_user(usig, &frame->sig); + err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); @@ -482,7 +468,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; regs->ip = (unsigned long)ka->sa.sa_handler; - regs->ax = (unsigned long)usig; + regs->ax = (unsigned long)sig; regs->dx = (unsigned long)&frame->info; regs->cx = (unsigned long)&frame->uc; @@ -506,12 +492,19 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { int ret; + int usig; + + usig = current_thread_info()->exec_domain + && current_thread_info()->exec_domain->signal_invmap + && sig < 32 + ? current_thread_info()->exec_domain->signal_invmap[sig] + : sig; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(sig, ka, info, set, regs); + ret = __setup_rt_frame(usig, ka, info, set, regs); else - ret = __setup_frame(sig, ka, set, regs); + ret = __setup_frame(usig, ka, set, regs); return ret; } -- cgit v1.2.2 From 464f04c9e9b3b1c4f5ffb89c51d8ba2a2034c846 Mon Sep 17 00:00:00 2001 From: Andrey Borzenkov Date: Sat, 6 Sep 2008 12:40:21 +0400 Subject: x86: fix ghost EDD devices in /sys again > This is regression but old enough. Apparently I had for whatever reasons > EDD turned off till recently. This is 2.6.27-rc5 just in case. > > In 2006 I fixed ghost devices due to buggy BIOS: > > http://marc.info/?l=linux-kernel&m=114087765422490&w=2 > > Later edd.S has been rewritten in C, and apparently this patch has been > lost: > > {pts/1}% ls /sys/firmware/edd > int13_dev80/ int13_dev84/ int13_dev88/ int13_dev8c/ > int13_dev81/ int13_dev85/ int13_dev89/ int13_dev8d/ > int13_dev82/ int13_dev86/ int13_dev8a/ int13_dev8e/ > int13_dev83/ int13_dev87/ int13_dev8b/ int13_dev8f/ > > But I have just a single disk. This is the same system BTW. Some BIOSes do not always set CF on error before return from int13. The patch adds additional check for status being zero (AH == 0). This was fixed for edd.S in http://marc.info/?l=linux-kernel&m=114087765422490&w=2, but lost again when edd.S was rewritten in C. Signed-off-by: Andrey Borzenkov Signed-off-by: Ingo Molnar --- arch/x86/boot/edd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index d93cbc6464d0..bf4ae6ff518e 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -32,7 +32,9 @@ static int read_mbr(u8 devno, void *buf) : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) : : "esi", "edi", "memory"); - return -(u8)ax; /* 0 or -1 */ + /* Some BIOSes do not set carry flag on error but still return + * error in AH. The condition below is expected to catch both */ + return -!!ax; /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) -- cgit v1.2.2 From 12cf105cd66d95cf32c73cfa847a50bd1b700f23 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: delay early cpu initialization until cpuid is done Move early cpu initialization after cpu early get cap so the early cpu initialization can fix up cpu caps. Signed-off-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0785b3c8d043..8aab8517642e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -335,11 +335,11 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); - - early_get_cap(c); } /* -- cgit v1.2.2 From dd786dd12c99634055a9066f25ea957f29991c22 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: move mtrr cpu cap setting early in early_init_xxxx Krzysztof Helt found MTRR is not detected on k6-2 root cause: we moved mtrr_bp_init() early for mtrr trimming, and in early_detect we only read the CPU capability from cpuid, so some cpu doesn't have that bit in cpuid. So we need to add early_init_xxxx to preset those bit before mtrr_bp_init for those earlier cpus. this patch is for v2.6.27 Reported-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++---- arch/x86/kernel/cpu/centaur.c | 11 +++++++++++ arch/x86/kernel/cpu/cyrix.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc3031..18514ed26104 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a55..a0534c04d38a 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index e710a21bb6e8..898a5a2002ed 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -416,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; -- cgit v1.2.2 From 0722bba8f14eb5271c8b67e97def74da50eceb15 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Sep 2008 18:14:51 +0200 Subject: x86: kill sys32_pause It's an unused duplicate of the generic sys_pause. Signed-off-by: Christoph Hellwig Signed-off-by: Ingo Molnar --- arch/x86/ia32/sys_ia32.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index d3c64088b981..beda4232ce69 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -556,15 +556,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig, return ret; } -/* These are here just in case some old ia32 binary calls it. */ -asmlinkage long sys32_pause(void) -{ - current->state = TASK_INTERRUPTIBLE; - schedule(); - return -ERESTARTNOHAND; -} - - #ifdef CONFIG_SYSCTL_SYSCALL struct sysctl_ia32 { unsigned int name; -- cgit v1.2.2 From a19aac8548d85cf15d744335b8e82201da760d80 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 30 Aug 2008 06:03:34 +0400 Subject: x86: make setup_xstate_init() __init WARNING: vmlinux.o(.text+0x22453): Section mismatch in reference from the function setup_xstate_init() to the function .init.text:__alloc_bootmem() The function setup_xstate_init() references the function __init __alloc_bootmem(). This is often because setup_xstate_init lacks a __init annotation or the annotation of __alloc_bootmem is wrong. Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 07713d64debe..ed5274a5bb0f 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -272,7 +272,7 @@ void __cpuinit xsave_init(void) /* * setup the xstate image representing the init state */ -void setup_xstate_init(void) +static void __init setup_xstate_init(void) { init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; -- cgit v1.2.2 From 30cec97967beb5aa08e893e8ff69623d5fecd9b7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:49:55 +0100 Subject: x86: init annotations in early_printk() setup Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 825e9136b026..02fc5b40c14b 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -120,7 +120,7 @@ static __init void early_serial_init(char *s) if (!strncmp(s, "0x", 2)) { early_serial_base = simple_strtoul(s, &e, 16); } else { - static int bases[] = { 0x3f8, 0x2f8 }; + static const int __initconst bases[] = { 0x3f8, 0x2f8 }; if (!strncmp(s, "ttyS", 4)) s += 4; @@ -920,7 +920,7 @@ static struct console simnow_console = { /* Direct interface for emergencies */ static struct console *early_console = &early_vga_console; -static int early_console_initialized; +static int __initdata early_console_initialized; asmlinkage void early_printk(const char *fmt, ...) { -- cgit v1.2.2 From 17b746278da8d6642bc487ec35efe4be2333f03f Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:51:32 +0100 Subject: x86: pgd_{c,d}tor() cleanup Giving pgd_ctor() a properly typed parameter allows eliminating a local variable. Adjust pgd_dtor() to match. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Cc: "Jeremy Fitzhardinge" Signed-off-by: Ingo Molnar --- arch/x86/mm/pgtable.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d50302774fe2..86f2ffc43c3d 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -63,10 +63,8 @@ static inline void pgd_list_del(pgd_t *pgd) #define UNSHARED_PTRS_PER_PGD \ (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) -static void pgd_ctor(void *p) +static void pgd_ctor(pgd_t *pgd) { - pgd_t *pgd = p; - /* If the pgd points to a shared pagetable level (either the ptes in non-PAE, or shared PMD in PAE), then just copy the references from swapper_pg_dir. */ @@ -87,7 +85,7 @@ static void pgd_ctor(void *p) pgd_list_add(pgd); } -static void pgd_dtor(void *pgd) +static void pgd_dtor(pgd_t *pgd) { unsigned long flags; /* can be called from interrupt context */ -- cgit v1.2.2 From cc643d4687533345fd8ebcba836f9ee25df7c458 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:53:45 +0100 Subject: x86: adjust vmalloc_sync_all() for Xen (2nd try) Since the fourth PDPT entry cannot be shared under Xen, vmalloc_sync_all() must iterate over pmd-s rather than pgd-s here. Luckily, the code isn't used for native PAE (SHARED_KERNEL_PMD is 1) and the change is benign to non-PAE. Also do a little more cleanup in that function. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar Cc: Jeremy Fitzhardinge --- arch/x86/mm/fault.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 455f3fe67b42..356ed2dec3a6 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -915,15 +915,15 @@ LIST_HEAD(pgd_list); void vmalloc_sync_all(void) { -#ifdef CONFIG_X86_32 - unsigned long start = VMALLOC_START & PGDIR_MASK; unsigned long address; +#ifdef CONFIG_X86_32 if (SHARED_KERNEL_PMD) return; - BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); - for (address = start; address >= TASK_SIZE; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PMD_MASK; + address >= TASK_SIZE && address < FIXADDR_TOP; + address += PMD_SIZE) { unsigned long flags; struct page *page; @@ -936,10 +936,8 @@ void vmalloc_sync_all(void) spin_unlock_irqrestore(&pgd_lock, flags); } #else /* CONFIG_X86_64 */ - unsigned long start = VMALLOC_START & PGDIR_MASK; - unsigned long address; - - for (address = start; address <= VMALLOC_END; address += PGDIR_SIZE) { + for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END; + address += PGDIR_SIZE) { const pgd_t *pgd_ref = pgd_offset_k(address); unsigned long flags; struct page *page; -- cgit v1.2.2 From 2d9cd6c27f00958a31622b8e9909d5e35f069b28 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 13:15:04 +0100 Subject: x86-64: add two __cpuinit annotations Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/mcheck/mce_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e8045c4ef1c1..5cde4b18448c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -845,7 +845,7 @@ struct desc_ptr idt_descr = { 256 * 16 - 1, (unsigned long) idt_table }; char boot_cpu_stack[IRQSTACKSIZE] __page_aligned_bss; -void pda_init(int cpu) +void __cpuinit pda_init(int cpu) { struct x8664_pda *pda = cpu_pda(cpu); diff --git a/arch/x86/kernel/cpu/mcheck/mce_64.c b/arch/x86/kernel/cpu/mcheck/mce_64.c index 726a5fcdf341..4b031a4ac856 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_64.c @@ -860,7 +860,7 @@ error: return err; } -static void mce_remove_device(unsigned int cpu) +static __cpuinit void mce_remove_device(unsigned int cpu) { int i; -- cgit v1.2.2 From e4a6be4d2850da032a782b5296c07dfdf583af86 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08f..a4e201b47f64 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1324,7 +1324,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.2.2 From 6b213e1bc27da6f6280386b1ff0e817e602c7b7a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 16 Jun 2008 12:39:13 +0100 Subject: Remove redundant CONFIG_ARCH_SUPPORTS_AOUT We don't need this any more; arguably we never really did. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..2e8fa40b3cdf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -151,9 +151,6 @@ config AUDIT_ARCH bool default X86_64 -config ARCH_SUPPORTS_AOUT - def_bool y - config ARCH_SUPPORTS_OPTIMIZED_INLINING def_bool y @@ -1759,7 +1756,7 @@ config IA32_EMULATION config IA32_AOUT tristate "IA32 a.out support" - depends on IA32_EMULATION && ARCH_SUPPORTS_AOUT + depends on IA32_EMULATION help Support old a.out binaries in the 32bit emulation. -- cgit v1.2.2 From e17c6d56160e4fb9e8c2830e30cc9741d4309989 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 17 Jun 2008 12:19:34 +0100 Subject: Introduce HAVE_AOUT symbol to remove hard-coded arch list for BINFMT_AOUT HAVE_AOUT doesn't quite do the same thing as the recently removed ARCH_SUPPORTS_AOUT config option. That was set even on platforms where binfmt_aout isn't supported, although it's not entirely clear why. So it's best just to introduce a new symbol, handled consistently with other similar HAVE_xxx symbols; with a simple 'select' in the arch Kconfig. Signed-off-by: David Woodhouse --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2e8fa40b3cdf..59b1d65a85e9 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -18,6 +18,7 @@ config X86_64 ### Arch settings config X86 def_bool y + select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE -- cgit v1.2.2 From d04ec773d7ca1bbc05a2768be95c1cebe2b07757 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:27:30 +0200 Subject: x86: pda_init(): fix memory leak when using CPU hotplug pda->irqstackptr is allocated whenever a CPU is set online. But it is never freed. This results in a memory leak of 16K for each CPU offline/online cycle. Fix is to allocate pda->irqstackptr only once. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index c3afba5a81a7..4f2eeb5652ea 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -529,17 +529,20 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + -- cgit v1.2.2 From 23952a96ae738277f3139b63d622e22984589031 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:29:37 +0200 Subject: x86: cpu_init(): fix memory leak when using CPU hotplug Exception stacks are allocated each time a CPU is set online. But the allocated space is never freed. Thus with one CPU hotplug offline/online cycle there is a memory leak of 24K (6 pages) for a CPU. Fix is to allocate exception stacks only once -- when the CPU is set online for the first time. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 4f2eeb5652ea..a11f5d4477cd 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -640,19 +640,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -- cgit v1.2.2 From 5df45515512436a808d3476a90e83f2efb022422 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Sep 2008 23:55:40 +0200 Subject: x86, tsc calibration: fix my brown paperbag day ... Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 6dab90f68515..4847a9280505 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -319,7 +319,7 @@ static unsigned long quick_pit_calibrate(void) /* * Make sure we can rely on the second TSC timestamp: */ - if (!pit_expect_msb(--expect)) + if (!pit_expect_msb(expect)) goto failed; /* -- cgit v1.2.2 From 5394f80f92642c61fc2a95385be85f2fdcfb5adb Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 01:51:32 -0700 Subject: x86: check for and defend against BIOS memory corruption Some BIOSes have been observed to corrupt memory in the low 64k. This change: - Reserves all memory which does not have to be in that area, to prevent it from being used as general memory by the kernel. Things like the SMP trampoline are still in the memory, however. - Clears the reserved memory so we can observe changes to it. - Adds a function check_for_bios_corruption() which checks and reports on memory becoming unexpectedly non-zero. Currently it's called in the x86 fault handler, and the powermanagement debug output. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 ++ arch/x86/kernel/setup.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/mm/fault.c | 2 ++ 3 files changed, 92 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..1bb52e2ca02e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -201,6 +201,9 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) default y +config X86_CHECK_BIOS_CORRUPTION + def_bool y + config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 362d4e7f2d38..ee89ebc5aabc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -578,6 +578,89 @@ static struct x86_quirks default_x86_quirks __initdata; struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; +/* + * Some BIOSes seem to corrupt the low 64k of memory during events + * like suspend/resume and unplugging an HDMI cable. Reserve all + * remaining free memory in that area and fill it with a distinct + * pattern. + */ +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +#define MAX_SCAN_AREAS 8 +static struct e820entry scan_areas[MAX_SCAN_AREAS]; +static int num_scan_areas; + +static void __init setup_bios_corruption_check(void) +{ + u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + + while(addr < 0x10000 && num_scan_areas < MAX_SCAN_AREAS) { + u64 size; + addr = find_e820_area_size(addr, &size, PAGE_SIZE); + + if (addr == 0) + break; + + if ((addr + size) > 0x10000) + size = 0x10000 - addr; + + if (size == 0) + break; + + e820_update_range(addr, size, E820_RAM, E820_RESERVED); + scan_areas[num_scan_areas].addr = addr; + scan_areas[num_scan_areas].size = size; + num_scan_areas++; + + /* Assume we've already mapped this early memory */ + memset(__va(addr), 0, size); + + addr += size; + } + + printk(KERN_INFO "scanning %d areas for BIOS corruption\n", + num_scan_areas); + update_e820(); +} + +static int __read_mostly bios_corruption_check = 1; + +void check_for_bios_corruption(void) +{ + int i; + int corruption = 0; + + if (!bios_corruption_check) + return; + + for(i = 0; i < num_scan_areas; i++) { + unsigned long *addr = __va(scan_areas[i].addr); + unsigned long size = scan_areas[i].size; + + for(; size; addr++, size -= sizeof(unsigned long)) { + if (!*addr) + continue; + printk(KERN_ERR "Corrupted low memory at %p (%lx phys) = %08lx\n", + addr, __pa(addr), *addr); + corruption = 1; + *addr = 0; + } + } + + if (corruption) + dump_stack(); +} + +static int set_bios_corruption_check(char *arg) +{ + char *end; + + bios_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("bios_corruption_check", set_bios_corruption_check); +#endif + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -750,6 +833,10 @@ void __init setup_arch(char **cmdline_p) high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; #endif +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION + setup_bios_corruption_check(); +#endif + /* max_pfn_mapped is updated here */ max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Sun, 7 Sep 2008 01:51:33 -0700 Subject: x86: add periodic corruption check Perodically check for corruption in low phusical memory. Don't bother checking at fault time, since it won't show anything useful. Signed-off-by: Hugh Dickins Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 17 +++++++++++++++++ arch/x86/mm/fault.c | 2 -- arch/x86/mm/init_32.c | 2 ++ arch/x86/mm/init_64.c | 2 ++ 4 files changed, 21 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ee89ebc5aabc..c239b3780973 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -623,6 +623,7 @@ static void __init setup_bios_corruption_check(void) } static int __read_mostly bios_corruption_check = 1; +static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) { @@ -650,6 +651,22 @@ void check_for_bios_corruption(void) dump_stack(); } +static void periodic_check_for_corruption(unsigned long data) +{ + check_for_bios_corruption(); + mod_timer(&periodic_check_timer, jiffies + 60*HZ); +} + +void start_periodic_check_for_corruption(void) +{ + if (!bios_corruption_check) + return; + + init_timer(&periodic_check_timer); + periodic_check_timer.function = &periodic_check_for_corruption; + periodic_check_for_corruption(0); +} + static int set_bios_corruption_check(char *arg) { char *end; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5140bdf03020..455f3fe67b42 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -848,8 +848,6 @@ no_context: * Oops. The kernel tried to access some bad page. We'll have to * terminate things with extreme prejudice. */ - check_for_bios_corruption(); - #ifdef CONFIG_X86_32 bust_spinlocks(1); #else diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..657a16ad61ba 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -907,6 +907,8 @@ void __init mem_init(void) int codesize, reservedpages, datasize, initsize; int tmp; + start_periodic_check_for_corruption(); + #ifdef CONFIG_FLATMEM BUG_ON(!mem_map); #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d3746efb060d..f4db5276fa21 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -769,6 +769,8 @@ void __init mem_init(void) { long codesize, reservedpages, datasize, initsize; + start_periodic_check_for_corruption(); + pci_iommu_alloc(); /* clear_bss() already clear the empty_zero_page */ -- cgit v1.2.2 From 9f077871ce7237e2387fc76542b3b4033cb05e49 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 01:51:34 -0700 Subject: x86: clean up memory corruption check and add more kernel parameters The corruption check is enabled in Kconfig by default, but disabled at runtime. This patch adds several kernel parameters to control the corruption check's behaviour; these are documented in kernel-parameters.txt. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 26 ++++++++++++++-- arch/x86/kernel/setup.c | 80 ++++++++++++++++++++++++++++++++++++------------- 2 files changed, 83 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1bb52e2ca02e..cbee4199689c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -201,9 +201,6 @@ config X86_TRAMPOLINE depends on X86_SMP || (X86_VOYAGER && SMP) || (64BIT && ACPI_SLEEP) default y -config X86_CHECK_BIOS_CORRUPTION - def_bool y - config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" @@ -1062,6 +1059,29 @@ config HIGHPTE low memory. Setting this option will put user-space page table entries in high memory. +config X86_CHECK_BIOS_CORRUPTION + bool "Check for low memory corruption" + default y + help + Periodically check for memory corruption in low memory, which + is suspected to be caused by BIOS. Even when enabled in the + configuration, it is disabled at runtime. Enable it by + setting "memory_corruption_check=1" on the kernel command + line. By default it scans the low 64k of memory every 60 + seconds; see the memory_corruption_check_size and + memory_corruption_check_period parameters in + Documentation/kernel-parameters.txt to adjust this. + + When enabled with the default parameters, this option has + almost no overhead, as it reserves a relatively small amount + of memory and scans it infrequently. It both detects corruption + and prevents it from affecting the running system. + + It is, however, intended as a diagnostic tool; if repeatable + BIOS-originated corruption always affects the same memory, + you can use memmap= to prevent the kernel from using that + memory. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c239b3780973..27ae91288855 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -586,22 +586,71 @@ struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; */ #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 + +static int __read_mostly memory_corruption_check = 0; +static unsigned __read_mostly corruption_check_size = 64*1024; +static unsigned __read_mostly corruption_check_period = 60; /* seconds */ + static struct e820entry scan_areas[MAX_SCAN_AREAS]; static int num_scan_areas; + +static int set_corruption_check(char *arg) +{ + char *end; + + memory_corruption_check = simple_strtol(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check", set_corruption_check); + +static int set_corruption_check_period(char *arg) +{ + char *end; + + corruption_check_period = simple_strtoul(arg, &end, 10); + + return (*end == 0) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_period", set_corruption_check_period); + +static int set_corruption_check_size(char *arg) +{ + char *end; + unsigned size; + + size = memparse(arg, &end); + + if (*end == '\0') + corruption_check_size = size; + + return (size == corruption_check_size) ? 0 : -EINVAL; +} +early_param("memory_corruption_check_size", set_corruption_check_size); + + static void __init setup_bios_corruption_check(void) { u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ - while(addr < 0x10000 && num_scan_areas < MAX_SCAN_AREAS) { + if (corruption_check_size == 0) + memory_corruption_check = 0; + + if (!memory_corruption_check) + return; + + corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); + + while(addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { u64 size; addr = find_e820_area_size(addr, &size, PAGE_SIZE); if (addr == 0) break; - if ((addr + size) > 0x10000) - size = 0x10000 - addr; + if ((addr + size) > corruption_check_size) + size = corruption_check_size - addr; if (size == 0) break; @@ -617,12 +666,11 @@ static void __init setup_bios_corruption_check(void) addr += size; } - printk(KERN_INFO "scanning %d areas for BIOS corruption\n", + printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas); update_e820(); } -static int __read_mostly bios_corruption_check = 1; static struct timer_list periodic_check_timer; void check_for_bios_corruption(void) @@ -630,7 +678,7 @@ void check_for_bios_corruption(void) int i; int corruption = 0; - if (!bios_corruption_check) + if (!memory_corruption_check) return; for(i = 0; i < num_scan_areas; i++) { @@ -647,35 +695,27 @@ void check_for_bios_corruption(void) } } - if (corruption) - dump_stack(); + WARN(corruption, KERN_ERR "Memory corruption detected in low memory\n"); } static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, jiffies + 60*HZ); + mod_timer(&periodic_check_timer, jiffies + corruption_check_period*HZ); } void start_periodic_check_for_corruption(void) { - if (!bios_corruption_check) + if (!memory_corruption_check || corruption_check_period == 0) return; + printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", + corruption_check_period); + init_timer(&periodic_check_timer); periodic_check_timer.function = &periodic_check_for_corruption; periodic_check_for_corruption(0); } - -static int set_bios_corruption_check(char *arg) -{ - char *end; - - bios_corruption_check = simple_strtol(arg, &end, 10); - - return (*end == 0) ? 0 : -EINVAL; -} -early_param("bios_corruption_check", set_bios_corruption_check); #endif /* -- cgit v1.2.2 From c885df50f571faf9fd9f395361cfff1b3a16e06e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 02:37:32 -0700 Subject: x86: default corruption check to off, but put parameter default in Kconfig Default the low memory corruption check to off, but make the default setting of the memory_corruption_check kernel parameter a config parameter. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 9 ++++++++- arch/x86/kernel/setup.c | 13 ++++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cbee4199689c..7820d447bb8d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1061,7 +1061,6 @@ config HIGHPTE config X86_CHECK_BIOS_CORRUPTION bool "Check for low memory corruption" - default y help Periodically check for memory corruption in low memory, which is suspected to be caused by BIOS. Even when enabled in the @@ -1082,6 +1081,14 @@ config X86_CHECK_BIOS_CORRUPTION you can use memmap= to prevent the kernel from using that memory. +config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + bool "Set the default setting of memory_corruption_check" + depends on X86_CHECK_BIOS_CORRUPTION + default y + help + Set whether the default state of memory_corruption_check is + on or off. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 27ae91288855..ec7e56c1b984 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -587,7 +587,8 @@ struct x86_quirks *x86_quirks __initdata = &default_x86_quirks; #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION #define MAX_SCAN_AREAS 8 -static int __read_mostly memory_corruption_check = 0; +static int __read_mostly memory_corruption_check = -1; + static unsigned __read_mostly corruption_check_size = 64*1024; static unsigned __read_mostly corruption_check_period = 60; /* seconds */ @@ -634,6 +635,16 @@ static void __init setup_bios_corruption_check(void) { u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + if (memory_corruption_check == -1) { + memory_corruption_check = +#ifdef CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK + 1 +#else + 0 +#endif + ; + } + if (corruption_check_size == 0) memory_corruption_check = 0; -- cgit v1.2.2 From 11fdd252bb5b461289fc5c21dc8fc87dc66f3284 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:50 -0700 Subject: x86: cpu make amd.c more like amd_64.c v2 1. make 32bit have early_init_amd_mc and amd_detect_cmp 2. seperate init_amd_k5/k6/k7 ... v2: fix compiling for !CONFIG_SMP Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 396 ++++++++++++++++++++++++------------------- arch/x86/kernel/cpu/amd_64.c | 17 +- arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 235 insertions(+), 180 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c3175da7bc69..a3a9e3cbdea9 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -24,8 +24,200 @@ extern void vide(void); __asm__(".align 4\nvide: ret"); +static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) +{ +/* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl (CBAR) & CBAR_ENB) + outl (0 | CBAR_KEY, CBAR); + } +} + + +static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) +{ + u32 l, h; + int mbytes = num_physpages >> (20-PAGE_SHIFT); + + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_mask == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + printk("system stability may be impaired when more than 32 MB are used.\n"); + else + printk("probably OK (after B9730xxxx).\n"); + printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_mask > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +} + +static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + rdmsr(MSR_K7_HWCR, l, h); + l &= ~0x00008000; + wrmsr(MSR_K7_HWCR, l, h); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, + ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); +} + +/* + * On a AMD dual core setup the lower bits of the APIC id distingush the cores. + * Assumes number of cores is a power of two. + */ +static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits; + + bits = c->x86_coreid_bits; + + /* Low order bits define the core id (index of core in socket) */ + c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); + /* Convert the initial APIC ID into the socket ID */ + c->phys_proc_id = c->initial_apicid >> bits; +#endif +} + +static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_HT + unsigned bits, ecx; + + /* Multi core CPU? */ + if (c->extended_cpuid_level < 0x80000008) + return; + + ecx = cpuid_ecx(0x80000008); + + c->x86_max_cores = (ecx & 0xff) + 1; + + /* CPU telling us the core id bits shift? */ + bits = (ecx >> 12) & 0xF; + + /* Otherwise recompute */ + if (bits == 0) { + while ((1 << bits) < c->x86_max_cores) + bits++; + } + + c->x86_coreid_bits = bits; +#endif +} + static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { + early_init_amd_mc(c); + if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); @@ -37,9 +229,6 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) static void __cpuinit init_amd(struct cpuinfo_x86 *c) { - u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); - #ifdef CONFIG_SMP unsigned long long value; @@ -50,7 +239,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 63 for SH-B3 steppings * Errata 122 for all steppings (F+ have it disabled by default) */ - if (c->x86 == 15) { + if (c->x86 == 0xf) { rdmsrl(MSR_K7_HWCR, value); value |= 1 << 6; wrmsrl(MSR_K7_HWCR, value); @@ -73,192 +262,55 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) switch (c->x86) { case 4: - /* - * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux - * drivers subsequently pokes it, and changes the CPU speed. - * Workaround : Remove the unneeded alias. - */ -#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ -#define CBAR_ENB (0x80000000) -#define CBAR_KEY (0X000000CB) - if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); - } - break; + init_amd_k5(c); + break; case 5: - if (c->x86_model < 6) { - /* Based on AMD doc 20734R - June 2000 */ - if (c->x86_model == 0) { - clear_cpu_cap(c, X86_FEATURE_APIC); - set_cpu_cap(c, X86_FEATURE_PGE); - } - break; - } - - if (c->x86_model == 6 && c->x86_mask == 1) { - const int K6_BUG_LOOP = 1000000; - int n; - void (*f_vide)(void); - unsigned long d, d2; - - printk(KERN_INFO "AMD K6 stepping B detected - "); - - /* - * It looks like AMD fixed the 2.6.2 bug and improved indirect - * calls at the same time. - */ - - n = K6_BUG_LOOP; - f_vide = vide; - rdtscl(d); - while (n--) - f_vide(); - rdtscl(d2); - d = d2-d; - - if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); - else - printk("probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); - } - - /* K6 with old style WHCR */ - if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { - /* We can only write allocate on the low 508Mb */ - if (mbytes > 508) - mbytes = 508; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0x0000FFFF) == 0) { - unsigned long flags; - l = (1<<0)|((mbytes/4)<<1); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", - mbytes); - } - break; - } - - if ((c->x86_model == 8 && c->x86_mask > 7) || - c->x86_model == 9 || c->x86_model == 13) { - /* The more serious chips .. */ - - if (mbytes > 4092) - mbytes = 4092; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0xFFFF0000) == 0) { - unsigned long flags; - l = ((mbytes>>2)<<22)|(1<<16); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", - mbytes); - } - - break; - } - - if (c->x86_model == 10) { - /* AMD Geode LX is model 10 */ - /* placeholder for any needed mods */ - break; - } - break; - case 6: /* An Athlon/Duron */ - - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - break; - } - - switch (c->x86) { - case 15: - /* Use K8 tuning for Fam10h and Fam11h */ - case 0x10: - case 0x11: - set_cpu_cap(c, X86_FEATURE_K8); + init_amd_k6(c); break; - case 6: - set_cpu_cap(c, X86_FEATURE_K7); + case 6: /* An Athlon/Duron */ + init_amd_k7(c); break; } + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); + if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - display_cacheinfo(c); + if (!c->x86_model_id[0]) { + switch (c->x86) { + case 0xf: + /* Should distinguish Models here, but this is only + a fallback anyways. */ + strcpy(c->x86_model_id, "Hammer"); + break; + } + } - if (cpuid_eax(0x80000000) >= 0x80000008) - c->x86_max_cores = (cpuid_ecx(0x80000008) & 0xff) + 1; + display_cacheinfo(c); -#ifdef CONFIG_X86_HT - /* - * On a AMD multi core setup the lower bits of the APIC id - * distinguish the cores. - */ - if (c->x86_max_cores > 1) { - int cpu = smp_processor_id(); - unsigned bits = (cpuid_ecx(0x80000008) >> 12) & 0xf; + /* Multi core CPU? */ + if (c->extended_cpuid_level >= 0x80000008) + amd_detect_cmp(c); - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - c->cpu_core_id = c->phys_proc_id & ((1<phys_proc_id >>= bits; - printk(KERN_INFO "CPU %d(%d) -> Core %d\n", - cpu, c->x86_max_cores, c->cpu_core_id); - } -#endif + detect_ht(c); - if (cpuid_eax(0x80000000) >= 0x80000006) { - if ((c->x86 == 0x10) && (cpuid_edx(0x80000006) & 0xf000)) + if (c->extended_cpuid_level >= 0x80000006) { + if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) num_cache_leaves = 4; else num_cache_leaves = 3; } - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); + if (c->x86 >= 0xf && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has_xmm2) + if (cpu_has_xmm2) { + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 8c2d07f06f1d..7913e48f6739 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -174,17 +174,20 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - if (c->extended_cpuid_level >= 0x80000006 && - (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; + if (c->extended_cpuid_level >= 0x80000006) { + if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + } if (c->x86 >= 0xf && c->x86 <= 0x11) set_cpu_cap(c, X86_FEATURE_K8); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + if (cpu_has_xmm2) { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } if (c->x86 == 0x10) { /* do this for boot cpu */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5cde4b18448c..9a57106c1995 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -629,8 +629,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_vendor_id[0] = '\0'; /* Unset */ c->x86_model_id[0] = '\0'; /* Unset */ c->x86_max_cores = 1; -#ifdef CONFIG_X86_64 c->x86_coreid_bits = 0; +#ifdef CONFIG_X86_64 c->x86_clflush_size = 64; #else c->cpuid_level = -1; /* CPUID not detected */ -- cgit v1.2.2 From c58606ad551e9a1c85a9bd4f1698ca41ec279b2c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:51 -0700 Subject: x86: remove duplicated force_mwait Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd_64.c | 2 -- arch/x86/kernel/process.c | 1 - 2 files changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 7913e48f6739..466a1eaa81ea 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -9,8 +9,6 @@ #include "cpu.h" -int force_mwait __cpuinitdata; - #ifdef CONFIG_NUMA static int __cpuinit nearby_node(int apicid) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..9f94bb1c8117 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -15,7 +15,6 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; -static int force_mwait __cpuinitdata; int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { -- cgit v1.2.2 From 2b86473604f6e9aef0b5e89c56798a90ccddcdbe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:52 -0700 Subject: x86: add srat_detect_node for amd64 separate that from amd_detect_cmp() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd_64.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 466a1eaa81ea..20c3f12dfe78 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -36,19 +36,23 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP unsigned bits; -#ifdef CONFIG_NUMA - int cpu = smp_processor_id(); - int node = 0; - unsigned apicid = hard_smp_processor_id(); -#endif + bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); /* Convert the initial APIC ID into the socket ID */ c->phys_proc_id = c->initial_apicid >> bits; +#endif +} +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +{ #ifdef CONFIG_NUMA + int cpu = smp_processor_id(); + int node; + unsigned apicid = hard_smp_processor_id(); + node = c->phys_proc_id; if (apicid_to_node[apicid] != NUMA_NO_NODE) node = apicid_to_node[apicid]; @@ -76,7 +80,6 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); #endif -#endif } static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) @@ -169,8 +172,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) display_cacheinfo(c); /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) + if (c->extended_cpuid_level >= 0x80000008) { amd_detect_cmp(c); + srat_detect_node(c); + } if (c->extended_cpuid_level >= 0x80000006) { if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) -- cgit v1.2.2 From 8d71a2ea0ad4ef9b9076ffd44726bad1f0ccf59b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:53 -0700 Subject: x86: merge header in amd_64.c Singed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 8 ++++++++ arch/x86/kernel/cpu/amd_64.c | 13 ++++++++++--- 2 files changed, 18 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a3a9e3cbdea9..3c8090d10053 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,11 +1,19 @@ #include #include #include + #include #include #include +#ifdef CONFIG_X86_64 +# include +# include +# include +#endif + #include + #include "cpu.h" /* diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 20c3f12dfe78..1d83601e85ef 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -1,9 +1,16 @@ #include +#include #include -#include -#include -#include +#include +#include +#include + +#ifdef CONFIG_X86_64 +# include +# include +# include +#endif #include -- cgit v1.2.2 From 6c62aa4a3c12989a1f1fcbbe6f1ee5d4de4b2300 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:54 -0700 Subject: x86: make amd.c have 64bit support code Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 137 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 126 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 3c8090d10053..32e73520adf7 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -16,6 +16,7 @@ #include "cpu.h" +#ifdef CONFIG_X86_32 /* * B step AMD K6 before B 9730xxxx have hardware bugs that can cause * misexecution of code under Linux. Owners of such processors should @@ -177,6 +178,26 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K7); } +#endif + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) +static int __cpuinit nearby_node(int apicid) +{ + int i, node; + + for (i = apicid - 1; i >= 0; i--) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { + node = apicid_to_node[i]; + if (node != NUMA_NO_NODE && node_online(node)) + return node; + } + return first_node(node_online_map); /* Shouldn't happen */ +} +#endif /* * On a AMD dual core setup the lower bits of the APIC id distingush the cores. @@ -196,6 +217,42 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) #endif } +static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + int cpu = smp_processor_id(); + int node; + unsigned apicid = hard_smp_processor_id(); + + node = c->phys_proc_id; + if (apicid_to_node[apicid] != NUMA_NO_NODE) + node = apicid_to_node[apicid]; + if (!node_online(node)) { + /* Two possibilities here: + - The CPU is missing memory and no node was created. + In that case try picking one from a nearby CPU + - The APIC IDs differ from the HyperTransport node IDs + which the K8 northbridge parsing fills in. + Assume they are all increased by a constant offset, + but in the same order as the HT nodeids. + If that doesn't result in a usable node fall back to the + path for the previous case. */ + + int ht_nodeid = c->initial_apicid; + + if (ht_nodeid >= 0 && + apicid_to_node[ht_nodeid] != NUMA_NO_NODE) + node = apicid_to_node[ht_nodeid]; + /* Pick a nearby node */ + if (!node_online(node)) + node = nearby_node(apicid); + } + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_HT @@ -226,13 +283,19 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { early_init_amd_mc(c); + /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -256,18 +319,31 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) early_init_amd(c); - /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) - */ - /* * Bit 31 in normal CPUID used for nonstandard 3DNow ID; * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ clear_cpu_cap(c, 0*32+31); +#ifdef CONFIG_X86_64 + /* On C+ stepping K8 rep microcode works well for copy/memset */ + if (c->x86 == 0xf) { + u32 level; + + level = cpuid_eax(1); + if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); + } + if (c->x86 == 0x10 || c->x86 == 0x11) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ + switch (c->x86) { case 4: init_amd_k5(c); @@ -283,7 +359,9 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* K6s reports MCEs but don't actually have all the MSRs */ if (c->x86 < 6) clear_cpu_cap(c, X86_FEATURE_MCE); +#endif + /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); @@ -300,10 +378,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) display_cacheinfo(c); /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) + if (c->extended_cpuid_level >= 0x80000008) { amd_detect_cmp(c); + srat_detect_node(c); + } +#ifdef CONFIG_X86_32 detect_ht(c); +#endif if (c->extended_cpuid_level >= 0x80000006) { if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) @@ -319,8 +401,38 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } + +#ifdef CONFIG_X86_64 + if (c->x86 == 0x10) { + /* do this for boot cpu */ + if (c == &boot_cpu_data) + check_enable_amd_mmconf_dmi(); + + fam10h_check_enable_mmcfg(); + } + + if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + unsigned long long tseg; + + /* + * Split up direct mapping around the TSEG SMM area. + * Don't do it for gbpages because there seems very little + * benefit in doing so. + */ + if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { + printk(KERN_DEBUG "tseg: %010llx\n", tseg); + if ((tseg>>PMD_SHIFT) < + (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || + ((tseg>>PMD_SHIFT) < + (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && + (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) + set_memory_4k((unsigned long)__va(tseg), 1); + } + } +#endif } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* AMD errata T13 (order #21922) */ @@ -333,10 +445,12 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int } return size; } +#endif static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = { @@ -349,9 +463,10 @@ static struct cpu_dev amd_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = amd_size_cache, +#endif .c_early_init = early_init_amd, .c_init = init_amd, - .c_size_cache = amd_size_cache, .c_x86_vendor = X86_VENDOR_AMD, }; -- cgit v1.2.2 From 2a02505055fdd44958efd0e140dd87cb9fdecaf1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:55 -0700 Subject: x86: make amd_64 have 32 bit code Signed-off-by: Yinghai Lu Cc: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd_64.c | 259 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 247 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c index 1d83601e85ef..32e73520adf7 100644 --- a/arch/x86/kernel/cpu/amd_64.c +++ b/arch/x86/kernel/cpu/amd_64.c @@ -16,7 +16,171 @@ #include "cpu.h" -#ifdef CONFIG_NUMA +#ifdef CONFIG_X86_32 +/* + * B step AMD K6 before B 9730xxxx have hardware bugs that can cause + * misexecution of code under Linux. Owners of such processors should + * contact AMD for precise details and a CPU swap. + * + * See http://www.multimania.com/poulot/k6bug.html + * http://www.amd.com/K6/k6docs/revgd.html + * + * The following test is erm.. interesting. AMD neglected to up + * the chip setting when fixing the bug but they also tweaked some + * performance at the same time.. + */ + +extern void vide(void); +__asm__(".align 4\nvide: ret"); + +static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) +{ +/* + * General Systems BIOSen alias the cpu frequency registers + * of the Elan at 0x000df000. Unfortuantly, one of the Linux + * drivers subsequently pokes it, and changes the CPU speed. + * Workaround : Remove the unneeded alias. + */ +#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ +#define CBAR_ENB (0x80000000) +#define CBAR_KEY (0X000000CB) + if (c->x86_model == 9 || c->x86_model == 10) { + if (inl (CBAR) & CBAR_ENB) + outl (0 | CBAR_KEY, CBAR); + } +} + + +static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) +{ + u32 l, h; + int mbytes = num_physpages >> (20-PAGE_SHIFT); + + if (c->x86_model < 6) { + /* Based on AMD doc 20734R - June 2000 */ + if (c->x86_model == 0) { + clear_cpu_cap(c, X86_FEATURE_APIC); + set_cpu_cap(c, X86_FEATURE_PGE); + } + return; + } + + if (c->x86_model == 6 && c->x86_mask == 1) { + const int K6_BUG_LOOP = 1000000; + int n; + void (*f_vide)(void); + unsigned long d, d2; + + printk(KERN_INFO "AMD K6 stepping B detected - "); + + /* + * It looks like AMD fixed the 2.6.2 bug and improved indirect + * calls at the same time. + */ + + n = K6_BUG_LOOP; + f_vide = vide; + rdtscl(d); + while (n--) + f_vide(); + rdtscl(d2); + d = d2-d; + + if (d > 20*K6_BUG_LOOP) + printk("system stability may be impaired when more than 32 MB are used.\n"); + else + printk("probably OK (after B9730xxxx).\n"); + printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); + } + + /* K6 with old style WHCR */ + if (c->x86_model < 8 || + (c->x86_model == 8 && c->x86_mask < 8)) { + /* We can only write allocate on the low 508Mb */ + if (mbytes > 508) + mbytes = 508; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0x0000FFFF) == 0) { + unsigned long flags; + l = (1<<0)|((mbytes/4)<<1); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", + mbytes); + } + return; + } + + if ((c->x86_model == 8 && c->x86_mask > 7) || + c->x86_model == 9 || c->x86_model == 13) { + /* The more serious chips .. */ + + if (mbytes > 4092) + mbytes = 4092; + + rdmsr(MSR_K6_WHCR, l, h); + if ((l&0xFFFF0000) == 0) { + unsigned long flags; + l = ((mbytes>>2)<<22)|(1<<16); + local_irq_save(flags); + wbinvd(); + wrmsr(MSR_K6_WHCR, l, h); + local_irq_restore(flags); + printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", + mbytes); + } + + return; + } + + if (c->x86_model == 10) { + /* AMD Geode LX is model 10 */ + /* placeholder for any needed mods */ + return; + } +} + +static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) +{ + u32 l, h; + + /* + * Bit 15 of Athlon specific MSR 15, needs to be 0 + * to enable SSE on Palomino/Morgan/Barton CPU's. + * If the BIOS didn't enable it already, enable it here. + */ + if (c->x86_model >= 6 && c->x86_model <= 10) { + if (!cpu_has(c, X86_FEATURE_XMM)) { + printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); + rdmsr(MSR_K7_HWCR, l, h); + l &= ~0x00008000; + wrmsr(MSR_K7_HWCR, l, h); + set_cpu_cap(c, X86_FEATURE_XMM); + } + } + + /* + * It's been determined by AMD that Athlons since model 8 stepping 1 + * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx + * As per AMD technical note 27212 0.2 + */ + if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { + rdmsr(MSR_K7_CLK_CTL, l, h); + if ((l & 0xfff00000) != 0x20000000) { + printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, + ((l & 0x000fffff)|0x20000000)); + wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); + } + } + + set_cpu_cap(c, X86_FEATURE_K7); +} +#endif + +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) static int __cpuinit nearby_node(int apicid) { int i, node; @@ -41,7 +205,7 @@ static int __cpuinit nearby_node(int apicid) */ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT unsigned bits; bits = c->x86_coreid_bits; @@ -55,7 +219,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) { -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) int cpu = smp_processor_id(); int node; unsigned apicid = hard_smp_processor_id(); @@ -91,7 +255,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP +#ifdef CONFIG_X86_HT unsigned bits, ecx; /* Multi core CPU? */ @@ -112,7 +276,6 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) } c->x86_coreid_bits = bits; - #endif } @@ -124,15 +287,21 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSCALL32); +#else + /* Set MTRR capability flag if appropriate */ + if (c->x86 == 5) + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); +#endif } static void __cpuinit init_amd(struct cpuinfo_x86 *c) { - unsigned level; - #ifdef CONFIG_SMP - unsigned long value; + unsigned long long value; /* * Disable TLB flush filter by setting HWCR.FFDIS on K8 @@ -142,26 +311,55 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Errata 122 for all steppings (F+ have it disabled by default) */ if (c->x86 == 0xf) { - rdmsrl(MSR_K8_HWCR, value); + rdmsrl(MSR_K7_HWCR, value); value |= 1 << 6; - wrmsrl(MSR_K8_HWCR, value); + wrmsrl(MSR_K7_HWCR, value); } #endif early_init_amd(c); - /* Bit 31 in normal CPUID used for nonstandard 3DNow ID; - 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway */ + /* + * Bit 31 in normal CPUID used for nonstandard 3DNow ID; + * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway + */ clear_cpu_cap(c, 0*32+31); +#ifdef CONFIG_X86_64 /* On C+ stepping K8 rep microcode works well for copy/memset */ if (c->x86 == 0xf) { + u32 level; + level = cpuid_eax(1); if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) set_cpu_cap(c, X86_FEATURE_REP_GOOD); } if (c->x86 == 0x10 || c->x86 == 0x11) set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + + /* + * FIXME: We should handle the K5 here. Set up the write + * range and also turn on MSR 83 bits 4 and 31 (write alloc, + * no bus pipeline) + */ + + switch (c->x86) { + case 4: + init_amd_k5(c); + break; + case 5: + init_amd_k6(c); + break; + case 6: /* An Athlon/Duron */ + init_amd_k7(c); + break; + } + + /* K6s reports MCEs but don't actually have all the MSRs */ + if (c->x86 < 6) + clear_cpu_cap(c, X86_FEATURE_MCE); +#endif /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) @@ -176,6 +374,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) break; } } + display_cacheinfo(c); /* Multi core CPU? */ @@ -184,6 +383,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) srat_detect_node(c); } +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + if (c->extended_cpuid_level >= 0x80000006) { if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) num_cache_leaves = 4; @@ -199,6 +402,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } +#ifdef CONFIG_X86_64 if (c->x86 == 0x10) { /* do this for boot cpu */ if (c == &boot_cpu_data) @@ -225,11 +429,42 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) set_memory_4k((unsigned long)__va(tseg), 1); } } +#endif +} + +#ifdef CONFIG_X86_32 +static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* AMD errata T13 (order #21922) */ + if ((c->x86 == 6)) { + if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ + size = 64; + if (c->x86_model == 4 && + (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ + size = 256; + } + return size; } +#endif static struct cpu_dev amd_cpu_dev __cpuinitdata = { .c_vendor = "AMD", .c_ident = { "AuthenticAMD" }, +#ifdef CONFIG_X86_32 + .c_models = { + { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = + { + [3] = "486 DX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB", + [14] = "Am5x86-WT", + [15] = "Am5x86-WB" + } + }, + }, + .c_size_cache = amd_size_cache, +#endif .c_early_init = early_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, -- cgit v1.2.2 From ff73152ced60871f7d5fb7dee52fa499902a3c6d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:56 -0700 Subject: x86: make 64 bit to use amd.c arch/x86/kernel/cpu/amd.c is now 100% identical to arch/x86/kernel/cpu/amd_64.c, so use amd.c on 64-bit too and fix up the namespace impact. Simplify the Kconfig glue as well. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 10 +- arch/x86/kernel/cpu/Makefile | 3 +- arch/x86/kernel/cpu/amd_64.c | 473 ------------------------------------------- 3 files changed, 2 insertions(+), 484 deletions(-) delete mode 100644 arch/x86/kernel/cpu/amd_64.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6156ac25ff8c..ab77d409fee0 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -444,17 +444,9 @@ config CPU_SUP_CYRIX_32 help This enables extended support for Cyrix processors -config CPU_SUP_AMD_32 +config CPU_SUP_AMD default y bool "Support AMD processors" if PROCESSOR_SELECT - depends on !64BIT - help - This enables extended support for AMD processors - -config CPU_SUP_AMD_64 - default y - bool "Support AMD processors" if PROCESSOR_SELECT - depends on 64BIT help This enables extended support for AMD processors diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index d031f248dfc0..510d1bcb058a 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -10,8 +10,7 @@ obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o -obj-$(CONFIG_CPU_SUP_AMD_32) += amd.o -obj-$(CONFIG_CPU_SUP_AMD_64) += amd_64.o +obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o diff --git a/arch/x86/kernel/cpu/amd_64.c b/arch/x86/kernel/cpu/amd_64.c deleted file mode 100644 index 32e73520adf7..000000000000 --- a/arch/x86/kernel/cpu/amd_64.c +++ /dev/null @@ -1,473 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#ifdef CONFIG_X86_64 -# include -# include -# include -#endif - -#include - -#include "cpu.h" - -#ifdef CONFIG_X86_32 -/* - * B step AMD K6 before B 9730xxxx have hardware bugs that can cause - * misexecution of code under Linux. Owners of such processors should - * contact AMD for precise details and a CPU swap. - * - * See http://www.multimania.com/poulot/k6bug.html - * http://www.amd.com/K6/k6docs/revgd.html - * - * The following test is erm.. interesting. AMD neglected to up - * the chip setting when fixing the bug but they also tweaked some - * performance at the same time.. - */ - -extern void vide(void); -__asm__(".align 4\nvide: ret"); - -static void __cpuinit init_amd_k5(struct cpuinfo_x86 *c) -{ -/* - * General Systems BIOSen alias the cpu frequency registers - * of the Elan at 0x000df000. Unfortuantly, one of the Linux - * drivers subsequently pokes it, and changes the CPU speed. - * Workaround : Remove the unneeded alias. - */ -#define CBAR (0xfffc) /* Configuration Base Address (32-bit) */ -#define CBAR_ENB (0x80000000) -#define CBAR_KEY (0X000000CB) - if (c->x86_model == 9 || c->x86_model == 10) { - if (inl (CBAR) & CBAR_ENB) - outl (0 | CBAR_KEY, CBAR); - } -} - - -static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) -{ - u32 l, h; - int mbytes = num_physpages >> (20-PAGE_SHIFT); - - if (c->x86_model < 6) { - /* Based on AMD doc 20734R - June 2000 */ - if (c->x86_model == 0) { - clear_cpu_cap(c, X86_FEATURE_APIC); - set_cpu_cap(c, X86_FEATURE_PGE); - } - return; - } - - if (c->x86_model == 6 && c->x86_mask == 1) { - const int K6_BUG_LOOP = 1000000; - int n; - void (*f_vide)(void); - unsigned long d, d2; - - printk(KERN_INFO "AMD K6 stepping B detected - "); - - /* - * It looks like AMD fixed the 2.6.2 bug and improved indirect - * calls at the same time. - */ - - n = K6_BUG_LOOP; - f_vide = vide; - rdtscl(d); - while (n--) - f_vide(); - rdtscl(d2); - d = d2-d; - - if (d > 20*K6_BUG_LOOP) - printk("system stability may be impaired when more than 32 MB are used.\n"); - else - printk("probably OK (after B9730xxxx).\n"); - printk(KERN_INFO "Please see http://membres.lycos.fr/poulot/k6bug.html\n"); - } - - /* K6 with old style WHCR */ - if (c->x86_model < 8 || - (c->x86_model == 8 && c->x86_mask < 8)) { - /* We can only write allocate on the low 508Mb */ - if (mbytes > 508) - mbytes = 508; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0x0000FFFF) == 0) { - unsigned long flags; - l = (1<<0)|((mbytes/4)<<1); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling old style K6 write allocation for %d Mb\n", - mbytes); - } - return; - } - - if ((c->x86_model == 8 && c->x86_mask > 7) || - c->x86_model == 9 || c->x86_model == 13) { - /* The more serious chips .. */ - - if (mbytes > 4092) - mbytes = 4092; - - rdmsr(MSR_K6_WHCR, l, h); - if ((l&0xFFFF0000) == 0) { - unsigned long flags; - l = ((mbytes>>2)<<22)|(1<<16); - local_irq_save(flags); - wbinvd(); - wrmsr(MSR_K6_WHCR, l, h); - local_irq_restore(flags); - printk(KERN_INFO "Enabling new style K6 write allocation for %d Mb\n", - mbytes); - } - - return; - } - - if (c->x86_model == 10) { - /* AMD Geode LX is model 10 */ - /* placeholder for any needed mods */ - return; - } -} - -static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) -{ - u32 l, h; - - /* - * Bit 15 of Athlon specific MSR 15, needs to be 0 - * to enable SSE on Palomino/Morgan/Barton CPU's. - * If the BIOS didn't enable it already, enable it here. - */ - if (c->x86_model >= 6 && c->x86_model <= 10) { - if (!cpu_has(c, X86_FEATURE_XMM)) { - printk(KERN_INFO "Enabling disabled K7/SSE Support.\n"); - rdmsr(MSR_K7_HWCR, l, h); - l &= ~0x00008000; - wrmsr(MSR_K7_HWCR, l, h); - set_cpu_cap(c, X86_FEATURE_XMM); - } - } - - /* - * It's been determined by AMD that Athlons since model 8 stepping 1 - * are more robust with CLK_CTL set to 200xxxxx instead of 600xxxxx - * As per AMD technical note 27212 0.2 - */ - if ((c->x86_model == 8 && c->x86_mask >= 1) || (c->x86_model > 8)) { - rdmsr(MSR_K7_CLK_CTL, l, h); - if ((l & 0xfff00000) != 0x20000000) { - printk ("CPU: CLK_CTL MSR was %x. Reprogramming to %x\n", l, - ((l & 0x000fffff)|0x20000000)); - wrmsr(MSR_K7_CLK_CTL, (l & 0x000fffff)|0x20000000, h); - } - } - - set_cpu_cap(c, X86_FEATURE_K7); -} -#endif - -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) -static int __cpuinit nearby_node(int apicid) -{ - int i, node; - - for (i = apicid - 1; i >= 0; i--) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { - node = apicid_to_node[i]; - if (node != NUMA_NO_NODE && node_online(node)) - return node; - } - return first_node(node_online_map); /* Shouldn't happen */ -} -#endif - -/* - * On a AMD dual core setup the lower bits of the APIC id distingush the cores. - * Assumes number of cores is a power of two. - */ -static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_HT - unsigned bits; - - bits = c->x86_coreid_bits; - - /* Low order bits define the core id (index of core in socket) */ - c->cpu_core_id = c->initial_apicid & ((1 << bits)-1); - /* Convert the initial APIC ID into the socket ID */ - c->phys_proc_id = c->initial_apicid >> bits; -#endif -} - -static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) -{ -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) - int cpu = smp_processor_id(); - int node; - unsigned apicid = hard_smp_processor_id(); - - node = c->phys_proc_id; - if (apicid_to_node[apicid] != NUMA_NO_NODE) - node = apicid_to_node[apicid]; - if (!node_online(node)) { - /* Two possibilities here: - - The CPU is missing memory and no node was created. - In that case try picking one from a nearby CPU - - The APIC IDs differ from the HyperTransport node IDs - which the K8 northbridge parsing fills in. - Assume they are all increased by a constant offset, - but in the same order as the HT nodeids. - If that doesn't result in a usable node fall back to the - path for the previous case. */ - - int ht_nodeid = c->initial_apicid; - - if (ht_nodeid >= 0 && - apicid_to_node[ht_nodeid] != NUMA_NO_NODE) - node = apicid_to_node[ht_nodeid]; - /* Pick a nearby node */ - if (!node_online(node)) - node = nearby_node(apicid); - } - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_HT - unsigned bits, ecx; - - /* Multi core CPU? */ - if (c->extended_cpuid_level < 0x80000008) - return; - - ecx = cpuid_ecx(0x80000008); - - c->x86_max_cores = (ecx & 0xff) + 1; - - /* CPU telling us the core id bits shift? */ - bits = (ecx >> 12) & 0xF; - - /* Otherwise recompute */ - if (bits == 0) { - while ((1 << bits) < c->x86_max_cores) - bits++; - } - - c->x86_coreid_bits = bits; -#endif -} - -static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) -{ - early_init_amd_mc(c); - - /* c->x86_power is 8000_0007 edx. Bit 8 is constant TSC */ - if (c->x86_power & (1<<8)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - -#ifdef CONFIG_X86_64 - set_cpu_cap(c, X86_FEATURE_SYSCALL32); -#else - /* Set MTRR capability flag if appropriate */ - if (c->x86 == 5) - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); -#endif -} - -static void __cpuinit init_amd(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_SMP - unsigned long long value; - - /* - * Disable TLB flush filter by setting HWCR.FFDIS on K8 - * bit 6 of msr C001_0015 - * - * Errata 63 for SH-B3 steppings - * Errata 122 for all steppings (F+ have it disabled by default) - */ - if (c->x86 == 0xf) { - rdmsrl(MSR_K7_HWCR, value); - value |= 1 << 6; - wrmsrl(MSR_K7_HWCR, value); - } -#endif - - early_init_amd(c); - - /* - * Bit 31 in normal CPUID used for nonstandard 3DNow ID; - * 3DNow is IDd by bit 31 in extended CPUID (1*32+31) anyway - */ - clear_cpu_cap(c, 0*32+31); - -#ifdef CONFIG_X86_64 - /* On C+ stepping K8 rep microcode works well for copy/memset */ - if (c->x86 == 0xf) { - u32 level; - - level = cpuid_eax(1); - if((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); - } - if (c->x86 == 0x10 || c->x86 == 0x11) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); -#else - - /* - * FIXME: We should handle the K5 here. Set up the write - * range and also turn on MSR 83 bits 4 and 31 (write alloc, - * no bus pipeline) - */ - - switch (c->x86) { - case 4: - init_amd_k5(c); - break; - case 5: - init_amd_k6(c); - break; - case 6: /* An Athlon/Duron */ - init_amd_k7(c); - break; - } - - /* K6s reports MCEs but don't actually have all the MSRs */ - if (c->x86 < 6) - clear_cpu_cap(c, X86_FEATURE_MCE); -#endif - - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) - set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - - if (!c->x86_model_id[0]) { - switch (c->x86) { - case 0xf: - /* Should distinguish Models here, but this is only - a fallback anyways. */ - strcpy(c->x86_model_id, "Hammer"); - break; - } - } - - display_cacheinfo(c); - - /* Multi core CPU? */ - if (c->extended_cpuid_level >= 0x80000008) { - amd_detect_cmp(c); - srat_detect_node(c); - } - -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - - if (c->extended_cpuid_level >= 0x80000006) { - if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) - num_cache_leaves = 4; - else - num_cache_leaves = 3; - } - - if (c->x86 >= 0xf && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_K8); - - if (cpu_has_xmm2) { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); - } - -#ifdef CONFIG_X86_64 - if (c->x86 == 0x10) { - /* do this for boot cpu */ - if (c == &boot_cpu_data) - check_enable_amd_mmconf_dmi(); - - fam10h_check_enable_mmcfg(); - } - - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { - unsigned long long tseg; - - /* - * Split up direct mapping around the TSEG SMM area. - * Don't do it for gbpages because there seems very little - * benefit in doing so. - */ - if (!rdmsrl_safe(MSR_K8_TSEG_ADDR, &tseg)) { - printk(KERN_DEBUG "tseg: %010llx\n", tseg); - if ((tseg>>PMD_SHIFT) < - (max_low_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) || - ((tseg>>PMD_SHIFT) < - (max_pfn_mapped>>(PMD_SHIFT-PAGE_SHIFT)) && - (tseg>>PMD_SHIFT) >= (1ULL<<(32 - PMD_SHIFT)))) - set_memory_4k((unsigned long)__va(tseg), 1); - } - } -#endif -} - -#ifdef CONFIG_X86_32 -static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c, unsigned int size) -{ - /* AMD errata T13 (order #21922) */ - if ((c->x86 == 6)) { - if (c->x86_model == 3 && c->x86_mask == 0) /* Duron Rev A0 */ - size = 64; - if (c->x86_model == 4 && - (c->x86_mask == 0 || c->x86_mask == 1)) /* Tbird rev A1/A2 */ - size = 256; - } - return size; -} -#endif - -static struct cpu_dev amd_cpu_dev __cpuinitdata = { - .c_vendor = "AMD", - .c_ident = { "AuthenticAMD" }, -#ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_AMD, .family = 4, .model_names = - { - [3] = "486 DX/2", - [7] = "486 DX/2-WB", - [8] = "486 DX/4", - [9] = "486 DX/4-WB", - [14] = "Am5x86-WT", - [15] = "Am5x86-WB" - } - }, - }, - .c_size_cache = amd_size_cache, -#endif - .c_early_init = early_init_amd, - .c_init = init_amd, - .c_x86_vendor = X86_VENDOR_AMD, -}; - -cpu_dev_register(amd_cpu_dev); -- cgit v1.2.2 From f69feff720497237ae9dd2f4604921bd3080c421 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:58 -0700 Subject: x86: little clean up of intel.c/intel_64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 7 +++---- arch/x86/kernel/cpu/intel_64.c | 8 +++----- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 959417b8cd64..4a8ac9d4ff50 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -76,7 +76,7 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) /* * find out the number of processor cores on the die */ -static int __cpuinit num_cpu_cores(struct cpuinfo_x86 *c) +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) { unsigned int eax, ebx, ecx, edx; @@ -183,7 +183,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) * let's use the legacy cpuid vector 0x1 and 0x4 for topology * detection. */ - c->x86_max_cores = num_cpu_cores(c); + c->x86_max_cores = intel_num_cpu_cores(c); detect_ht(c); } @@ -210,9 +210,8 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (c->x86 == 15) { + if (c->x86 == 15) set_cpu_cap(c, X86_FEATURE_P4); - } if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_P3); if (cpu_has_ds) { diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index 14a2cd98d480..aef4f2826313 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -71,17 +71,15 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); - } - - - if (cpu_has_bts) ds_init_intel(c); + } if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); detect_extended_topology(c); if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) -- cgit v1.2.2 From de9f521fb72dd091aa4989fe2e004ecf4785a850 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:10:11 +0900 Subject: x86: move pci-nommu's dma_mask check to common code The check to see if dev->dma_mask is NULL in pci-nommu is more appropriate for dma_alloc_coherent(). Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 73853d3fdcac..0f51883cc6a8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -80,9 +80,6 @@ nommu_alloc_coherent(struct device *hwdev, size_t size, int node; struct page *page; - if (hwdev->dma_mask == NULL) - return NULL; - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); gfp |= __GFP_ZERO; -- cgit v1.2.2 From 8a53ad675f86ee003482b557da944e070d3c4859 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:10:12 +0900 Subject: x86: fix nommu_alloc_coherent allocation with NULL device argument We need to use __GFP_DMA for NULL device argument (fallback_dev) with pci-nommu. It's a hack for ISA (and some old code) so we need to use GFP_DMA. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 0f51883cc6a8..ada1c87cafc2 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -80,7 +80,6 @@ nommu_alloc_coherent(struct device *hwdev, size_t size, int node; struct page *page; - gfp &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); gfp |= __GFP_ZERO; dma_mask = hwdev->coherent_dma_mask; @@ -93,7 +92,7 @@ nommu_alloc_coherent(struct device *hwdev, size_t size, node = dev_to_node(hwdev); #ifdef CONFIG_X86_64 - if (dma_mask <= DMA_32BIT_MASK) + if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) gfp |= GFP_DMA32; #endif -- cgit v1.2.2 From 823e7e8c6ef12cd1943dc42fe7595ca74e8cc3d7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 8 Sep 2008 18:10:13 +0900 Subject: x86: dma_alloc_coherent sets gfp flags properly Non real IOMMU implemenations (which doesn't do virtual mappings, e.g. swiotlb, pci-nommu, etc) need to use proper gfp flags and dma_mask to allocate pages in their own dma_alloc_coherent() (allocated page need to be suitable for device's coherent_dma_mask). This patch makes dma_alloc_coherent do this job so that IOMMUs don't need to take care of it any more. Real IOMMU implemenataions can simply ignore the gfp flags. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index ada1c87cafc2..8e398b56f50b 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -80,26 +80,11 @@ nommu_alloc_coherent(struct device *hwdev, size_t size, int node; struct page *page; - gfp |= __GFP_ZERO; - - dma_mask = hwdev->coherent_dma_mask; - if (!dma_mask) - dma_mask = *(hwdev->dma_mask); + dma_mask = dma_alloc_coherent_mask(hwdev, gfp); - if (dma_mask < DMA_24BIT_MASK) - return NULL; + gfp |= __GFP_ZERO; node = dev_to_node(hwdev); - -#ifdef CONFIG_X86_64 - if (dma_mask <= DMA_32BIT_MASK && !(gfp & GFP_DMA)) - gfp |= GFP_DMA32; -#endif - - /* No alloc-free penalty for ISA devices */ - if (dma_mask == DMA_24BIT_MASK) - gfp |= GFP_DMA; - again: page = alloc_pages_node(node, gfp, get_order(size)); if (!page) -- cgit v1.2.2 From 2737146b3aa2cf8b5d5ae87a18c49fe1c374528b Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:33 +0100 Subject: x86, xen: fix build when !CONFIG_HOTPLUG_CPU Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index be5cbb2b7c60..bf51a466d62c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -332,6 +332,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) { } +#ifdef CONFIG_HOTPLUG_CPU int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); @@ -368,6 +369,23 @@ void xen_play_dead(void) cpu_bringup(); } +#else /* !CONFIG_HOTPLUG_CPU */ +int xen_cpu_disable(void) +{ + return -ENOSYS; +} + +void xen_cpu_die(unsigned int cpu) +{ + BUG(); +} + +void xen_play_dead(void) +{ + BUG(); +} + +#endif static void stop_self(void *v) { int cpu = smp_processor_id(); -- cgit v1.2.2 From 26fd10517e810dd59ea050b052de24a75ee6dc07 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 8 Sep 2008 13:43:34 +0100 Subject: xen: make CPU hotplug functions static There's no need for these functions to be accessed from outside of xen/smp.c Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/smp.c | 12 ++++++------ arch/x86/xen/xen-ops.h | 4 ---- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index bf51a466d62c..d77da613b1d2 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -333,7 +333,7 @@ static void xen_smp_cpus_done(unsigned int max_cpus) } #ifdef CONFIG_HOTPLUG_CPU -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { unsigned int cpu = smp_processor_id(); if (cpu == 0) @@ -345,7 +345,7 @@ int xen_cpu_disable(void) return 0; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { current->state = TASK_UNINTERRUPTIBLE; @@ -362,7 +362,7 @@ void xen_cpu_die(unsigned int cpu) alternatives_smp_switch(0); } -void xen_play_dead(void) +static void xen_play_dead(void) { play_dead_common(); HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); @@ -370,17 +370,17 @@ void xen_play_dead(void) } #else /* !CONFIG_HOTPLUG_CPU */ -int xen_cpu_disable(void) +static int xen_cpu_disable(void) { return -ENOSYS; } -void xen_cpu_die(unsigned int cpu) +static void xen_cpu_die(unsigned int cpu) { BUG(); } -void xen_play_dead(void) +static void xen_play_dead(void) { BUG(); } diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 8dbd97fd7f18..d7422dc2a55c 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -51,10 +51,6 @@ void xen_mark_init_mm_pinned(void); void __init xen_setup_vcpu_info_placement(void); -void xen_play_dead(void); -void xen_cpu_die(unsigned int cpu); -int xen_cpu_disable(void); - #ifdef CONFIG_SMP void xen_smp_init(void); -- cgit v1.2.2 From e545a6140b698b2494daf0b32107bdcc5e901390 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 16:57:22 +0200 Subject: kernel/cpu.c: create a CPU_STARTING cpu_chain notifier Right now, there is no notifier that is called on a new cpu, before the new cpu begins processing interrupts/softirqs. Various kernel function would need that notification, e.g. kvm works around by calling smp_call_function_single(), rcu polls cpu_online_map. The patch adds a CPU_STARTING notification. It also adds a helper function that sends the message to all cpu_chain handlers. Tested on x86-64. All other archs are untested. Especially on sparc, I'm not sure if I got it right. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 1 + arch/x86/mach-voyager/voyager_smp.c | 2 ++ 2 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7985c5b3f916..0b8261c3cac2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -257,6 +257,7 @@ static void __cpuinit smp_callin(void) end_local_APIC_setup(); map_cpu_to_logical_apicid(); + notify_cpu_starting(cpuid); /* * Get our bogomips. * diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index ee0fba092157..199a5f4a873c 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -448,6 +448,8 @@ static void __init start_secondary(void *unused) VDEBUG(("VOYAGER SMP: CPU%d, stack at about %p\n", cpuid, &cpuid)); + notify_cpu_starting(cpuid); + /* enable interrupts */ local_irq_enable(); -- cgit v1.2.2 From 14469a8dd23677921db5e7354a602c98d9c6300f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Sep 2008 09:30:14 -0700 Subject: x86: disable static NOPLs on 32 bits On 32-bit, at least the generic nops are fairly reasonable, but the default nops for 64-bit really look pretty sad, and the P6 nops really do look better. So I would suggest perhaps moving the static P6 nop selection into the CONFIG_X86_64 thing. The alternative is to just get rid of that static nop selection, and just have two cases: 32-bit and 64-bit, and just pick obviously safe cases for them. Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52ec..b225219c448c 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -382,14 +382,17 @@ config X86_OOSTORE # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. # Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). As a result, disallow these if we're -# compiling X86_GENERIC but not X86_64 (these NOPs do work on all -# x86-64 capable chips); the list of processors in the right-hand clause -# are the cores that benefit from this optimization. +# (which work on all CPUs). In addition, it looks like Virtual PC +# does not understand them. +# +# As a result, disallow these if we're not compiling for X86_64 (these +# NOPs do work on all x86-64 capable chips); the list of processors in +# the right-hand clause are the cores that benefit from this optimization. # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) + depends on X86_64 + depends on (MCORE2 || MPENTIUM4 || MPSC) config X86_TSC def_bool y -- cgit v1.2.2 From 9c0bbee8a6fc14107e9a7af6750bfe1056cbf4bc Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 9 Sep 2008 11:01:31 +0400 Subject: seccomp: drop now bogus dependency on PROC_FS seccomp is prctl(2)-driven now. Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 68d91c8233f4..1e2afe60ba99 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1205,7 +1205,6 @@ config IRQBALANCE config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" - depends on PROC_FS help This kernel feature is useful for number crunching applications that may need to compute untrusted bytecode during their @@ -1213,7 +1212,7 @@ config SECCOMP the process as file descriptors supporting the read/write syscalls, it's possible to isolate those applications in their own address space using seccomp. Once seccomp is - enabled via /proc//seccomp, it cannot be disabled + enabled via prctl(PR_SET_SECCOMP), it cannot be disabled and the task is only allowed to execute a few safe syscalls defined by each seccomp mode. -- cgit v1.2.2 From 28f7e66fc1da53997a545684b21b91fb3ca3f321 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 8 Sep 2008 12:01:48 -0700 Subject: x86: prevent binutils from being "smart" and generating NOPLs for us binutils, contrary to documented behaviour, will generate long NOPs (a P6-or-higher instruction which is broken on at least some VIA chips, Virtual PC/Virtual Server, and some versions of Qemu) depending on the -mtune= option, which is not supposed to change architectural behaviour. Pass an explicit override to the assembler, in case ends up passing the -mtune= parameter to gas (gcc 4.3.0 does not appear to.) Signed-off-by: H. Peter Anvin --- arch/x86/Makefile_32.cpu | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index e372b584e919..b72b4f753113 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -45,3 +45,8 @@ cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx # cpu entries cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) +# Bug fix for binutils: this option is required in order to keep +# binutils from generating NOPL instructions against our will. +ifneq ($(CONFIG_X86_P6_NOP),y) +cflags-y += $(call cc-option,-Wa$(comma)-mtune=generic32,) +endif -- cgit v1.2.2 From d6be118a97ce51ca84035270f91c2bccecbfac5f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 9 Sep 2008 09:56:08 -0400 Subject: x86: fix memmap=exactmap boot argument When using kdump modifying the e820 map is yielding strange results. For example starting with BIOS-provided physical RAM map: BIOS-e820: 0000000000000100 - 0000000000093400 (usable) BIOS-e820: 0000000000093400 - 00000000000a0000 (reserved) BIOS-e820: 0000000000100000 - 000000003fee0000 (usable) BIOS-e820: 000000003fee0000 - 000000003fef3000 (ACPI data) BIOS-e820: 000000003fef3000 - 000000003ff80000 (ACPI NVS) BIOS-e820: 000000003ff80000 - 0000000040000000 (reserved) BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved) BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved) BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved) BIOS-e820: 00000000ff000000 - 0000000100000000 (reserved) and booting with args memmap=exactmap memmap=640K@0K memmap=5228K@16384K memmap=125188K@22252K memmap=76K#1047424K memmap=564K#1047500K resulted in: user-defined physical RAM map: user: 0000000000000000 - 0000000000093400 (usable) user: 0000000000093400 - 00000000000a0000 (reserved) user: 0000000000100000 - 000000003fee0000 (usable) user: 000000003fee0000 - 000000003fef3000 (ACPI data) user: 000000003fef3000 - 000000003ff80000 (ACPI NVS) user: 000000003ff80000 - 0000000040000000 (reserved) user: 00000000e0000000 - 00000000f0000000 (reserved) user: 00000000fec00000 - 00000000fec10000 (reserved) user: 00000000fee00000 - 00000000fee01000 (reserved) user: 00000000ff000000 - 0000000100000000 (reserved) But should have resulted in: user-defined physical RAM map: user: 0000000000000000 - 00000000000a0000 (usable) user: 0000000001000000 - 000000000151b000 (usable) user: 00000000015bb000 - 0000000008ffc000 (usable) user: 000000003fee0000 - 000000003ff80000 (ACPI data) This is happening because of an improper usage of strcmp() in the e820 parsing code. The strcmp() always returns !0 and never resets the value for e820.nr_map and returns an incorrect user-defined map. This patch fixes the problem. Signed-off-by: Prarit Bhargava Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7bb..66e48aa2dd1b 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1203,7 +1203,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know -- cgit v1.2.2 From 185f3b9da24c09c26b784591ed354fe57998a7b1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 16:40:35 -0700 Subject: x86: make intel.c have 64-bit support code prepare for unification. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 119 ++++++++++++++++++++++++++++++-------------- 1 file changed, 83 insertions(+), 36 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index a66989586a84..365a008080c2 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -15,6 +15,11 @@ #include #include +#ifdef CONFIG_X86_64 +#include +#include +#endif + #include "cpu.h" #ifdef CONFIG_X86_LOCAL_APIC @@ -25,14 +30,20 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { - /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ - if (c->x86 == 15 && c->x86_cache_alignment == 64) - c->x86_cache_alignment = 128; if ((c->x86 == 0xf && c->x86_model >= 0x03) || (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + +#ifdef CONFIG_X86_64 + set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif } +#ifdef CONFIG_X86_32 /* * Early probe support logic for ppro memory erratum #50 * @@ -73,6 +84,40 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) } + +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif +#endif + +static void __cpuinit srat_detect_node(void) +{ +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) + unsigned node; + int cpu = smp_processor_id(); + int apicid = hard_smp_processor_id(); + + /* Don't do the funky fallback heuristics the AMD version employs + for now. */ + node = apicid_to_node[apicid]; + if (node == NUMA_NO_NODE || !node_online(node)) + node = first_node(node_online_map); + numa_set_node(cpu, node); + + printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); +#endif +} + /* * find out the number of processor cores on the die */ @@ -91,20 +136,6 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); -} -#endif - static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -139,6 +170,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } +#ifdef CONFIG_X86_32 /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) clear_cpu_cap(c, X86_FEATURE_SEP); @@ -176,18 +208,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (p) strcpy(c->x86_model_id, p); - detect_extended_topology(c); - - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - c->x86_max_cores = intel_num_cpu_cores(c); - detect_ht(c); - } - - /* Work around errata */ Intel_errata_workarounds(c); #ifdef CONFIG_X86_INTEL_USERCOPY @@ -206,14 +226,12 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) movsl_mask.mask = 7; break; } +#endif + #endif if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); if (cpu_has_ds) { unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); @@ -224,6 +242,17 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) ds_init_intel(c); } +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else + if (c->x86 == 15) + set_cpu_cap(c, X86_FEATURE_P4); + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_P3); + if (cpu_has_bts) ptrace_bts_init_intel(c); @@ -240,8 +269,25 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) #ifdef CONFIG_X86_NUMAQ numaq_tsc_disable(); #endif +#endif + + detect_extended_topology(c); + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ + c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + + /* Work around errata */ + srat_detect_node(); } +#ifdef CONFIG_X86_32 static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) { /* @@ -254,10 +300,12 @@ static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned i size = 256; return size; } +#endif static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 .c_models = { { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = { @@ -307,13 +355,12 @@ static struct cpu_dev intel_cpu_dev __cpuinitdata = { } }, }, + .c_size_cache = intel_size_cache, +#endif .c_early_init = early_init_intel, .c_init = init_intel, - .c_size_cache = intel_size_cache, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); -/* arch_initcall(intel_cpu_init); */ - -- cgit v1.2.2 From 58602c1681bdfa1a0deaa5574b8a72d6e30c0e97 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 16:40:36 -0700 Subject: x86: make intel_64.c the same as intel.c No change in functionality intended - this only adds the 32-bit side. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_64.c | 301 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 284 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c index aef4f2826313..365a008080c2 100644 --- a/arch/x86/kernel/cpu/intel_64.c +++ b/arch/x86/kernel/cpu/intel_64.c @@ -1,42 +1,108 @@ #include +#include + +#include +#include #include +#include +#include + #include +#include +#include +#include #include +#include +#include + +#ifdef CONFIG_X86_64 #include #include +#endif #include "cpu.h" +#ifdef CONFIG_X86_LOCAL_APIC +#include +#include +#include +#endif + static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) { if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) + (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); +#ifdef CONFIG_X86_64 set_cpu_cap(c, X86_FEATURE_SYSENTER32); +#else + /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ + if (c->x86 == 15 && c->x86_cache_alignment == 64) + c->x86_cache_alignment = 128; +#endif } +#ifdef CONFIG_X86_32 /* - * find out the number of processor cores on the die + * Early probe support logic for ppro memory erratum #50 + * + * This is called before we do cpu ident work */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, t; - if (c->cpuid_level < 4) +int __cpuinit ppro_with_ram_bug(void) +{ + /* Uses data from early_cpu_detect now */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6 && + boot_cpu_data.x86_model == 1 && + boot_cpu_data.x86_mask < 8) { + printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); return 1; + } + return 0; +} - cpuid_count(4, 0, &eax, &t, &t, &t); - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; +/* + * P4 Xeon errata 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ +static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) +{ + unsigned long lo, hi; + + if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { + rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); + if ((lo & (1<<9)) == 0) { + printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); + printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); + lo |= (1<<9); /* Disable hw prefetching */ + wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); + } + } +} + + + +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); } +#endif +#endif static void __cpuinit srat_detect_node(void) { -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) unsigned node; int cpu = smp_processor_id(); int apicid = hard_smp_processor_id(); @@ -52,11 +118,51 @@ static void __cpuinit srat_detect_node(void) #endif } +/* + * find out the number of processor cores on the die + */ +static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) +{ + unsigned int eax, ebx, ecx, edx; + + if (c->cpuid_level < 4) + return 1; + + /* Intel has a non-standard dependency on %ecx for this CPUID level. */ + cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); + if (eax & 0x1f) + return ((eax >> 26) + 1); + else + return 1; +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { + unsigned int l2 = 0; + char *p = NULL; + early_init_intel(c); - init_intel_cacheinfo(c); +#ifdef CONFIG_X86_F00F_BUG + /* + * All current models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the system. + * Note that the workaround only should be initialized once... + */ + c->f00f_bug = 0; + if (!paravirt_enabled() && c->x86 == 5) { + static int f00f_workaround_enabled; + + c->f00f_bug = 1; + if (!f00f_workaround_enabled) { + trap_init_f00f_bug(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { unsigned eax = cpuid_eax(10); /* Check for version and the number of counters */ @@ -64,8 +170,70 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } +#ifdef CONFIG_X86_32 + /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * Names for the Pentium II/Celeron processors + * detectable only by also checking the cache size. + * Dixon is NOT a Celeron. + */ + if (c->x86 == 6) { + switch (c->x86_model) { + case 5: + if (c->x86_mask == 0) { + if (l2 == 0) + p = "Celeron (Covington)"; + else if (l2 == 256) + p = "Mobile Pentium II (Dixon)"; + } + break; + + case 6: + if (l2 == 128) + p = "Celeron (Mendocino)"; + else if (c->x86_mask == 0 || c->x86_mask == 5) + p = "Celeron-A"; + break; + + case 8: + if (l2 == 128) + p = "Celeron (Coppermine)"; + break; + } + } + + if (p) + strcpy(c->x86_model_id, p); + + Intel_errata_workarounds(c); + +#ifdef CONFIG_X86_INTEL_USERCOPY + /* + * Set up the preferred alignment for movsl bulk memory moves + */ + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } +#endif + +#endif + + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { - unsigned int l1, l2; + unsigned int l1; rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); if (!(l1 & (1<<11))) set_cpu_cap(c, X86_FEATURE_BTS); @@ -74,26 +242,125 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) ds_init_intel(c); } +#ifdef CONFIG_X86_64 if (c->x86 == 15) c->x86_cache_alignment = c->x86_clflush_size * 2; if (c->x86 == 6) set_cpu_cap(c, X86_FEATURE_REP_GOOD); - if (cpu_has_xmm2) - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); +#else + if (c->x86 == 15) + set_cpu_cap(c, X86_FEATURE_P4); + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_P3); + + if (cpu_has_bts) + ptrace_bts_init_intel(c); + + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif +#endif detect_extended_topology(c); - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) + if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { + /* + * let's use the legacy cpuid vector 0x1 and 0x4 for topology + * detection. + */ c->x86_max_cores = intel_num_cpu_cores(c); +#ifdef CONFIG_X86_32 + detect_ht(c); +#endif + } + /* Work around errata */ srat_detect_node(); } +#ifdef CONFIG_X86_32 +static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) +{ + /* + * Intel PIII Tualatin. This comes in two flavours. + * One has 256kb of cache, the other 512. We have no way + * to determine which, so we use a boottime override + * for the 512kb model, and assume 256 otherwise. + */ + if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) + size = 256; + return size; +} +#endif + static struct cpu_dev intel_cpu_dev __cpuinitdata = { .c_vendor = "Intel", .c_ident = { "GenuineIntel" }, +#ifdef CONFIG_X86_32 + .c_models = { + { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = + { + [0] = "486 DX-25/33", + [1] = "486 DX-50", + [2] = "486 SX", + [3] = "486 DX/2", + [4] = "486 SL", + [5] = "486 SX/2", + [7] = "486 DX/2-WB", + [8] = "486 DX/4", + [9] = "486 DX/4-WB" + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = + { + [0] = "Pentium 60/66 A-step", + [1] = "Pentium 60/66", + [2] = "Pentium 75 - 200", + [3] = "OverDrive PODP5V83", + [4] = "Pentium MMX", + [7] = "Mobile Pentium 75 - 200", + [8] = "Mobile Pentium MMX" + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = + { + [0] = "Pentium Pro A-step", + [1] = "Pentium Pro", + [3] = "Pentium II (Klamath)", + [4] = "Pentium II (Deschutes)", + [5] = "Pentium II (Deschutes)", + [6] = "Mobile Pentium II", + [7] = "Pentium III (Katmai)", + [8] = "Pentium III (Coppermine)", + [10] = "Pentium III (Cascades)", + [11] = "Pentium III (Tualatin)", + } + }, + { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = + { + [0] = "Pentium 4 (Unknown)", + [1] = "Pentium 4 (Willamette)", + [2] = "Pentium 4 (Northwood)", + [4] = "Pentium 4 (Foster)", + [5] = "Pentium 4 (Foster)", + } + }, + }, + .c_size_cache = intel_size_cache, +#endif .c_early_init = early_init_intel, .c_init = init_intel, .c_x86_vendor = X86_VENDOR_INTEL, }; cpu_dev_register(intel_cpu_dev); + -- cgit v1.2.2 From 879d792b66d633bbe466974f61d1acc9aa8c78eb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 16:40:37 -0700 Subject: x86: let intel 64-bit use intel.c now that arch/x86/kernel/cpu/intel_64.c and arch/x86/kernel/cpu/intel.c are equal, drop arch/x86/kernel/cpu/intel_64.c and fix up the glue. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 10 +- arch/x86/kernel/cpu/Makefile | 3 +- arch/x86/kernel/cpu/intel_64.c | 366 ----------------------------------------- 3 files changed, 2 insertions(+), 377 deletions(-) delete mode 100644 arch/x86/kernel/cpu/intel_64.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 1b29d6a87563..6761848329fa 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -423,17 +423,9 @@ menuconfig PROCESSOR_SELECT This lets you choose what x86 vendor support code your kernel will include. -config CPU_SUP_INTEL_32 +config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT - depends on !64BIT - help - This enables extended support for Intel processors - -config CPU_SUP_INTEL_64 - default y - bool "Support Intel processors" if PROCESSOR_SELECT - depends on 64BIT help This enables extended support for Intel processors diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 510d1bcb058a..7f0b45a5d788 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -8,8 +8,7 @@ obj-y += proc.o capflags.o powerflags.o common.o obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o obj-$(CONFIG_X86_64) += bugs_64.o -obj-$(CONFIG_CPU_SUP_INTEL_32) += intel.o -obj-$(CONFIG_CPU_SUP_INTEL_64) += intel_64.o +obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR_32) += centaur.o diff --git a/arch/x86/kernel/cpu/intel_64.c b/arch/x86/kernel/cpu/intel_64.c deleted file mode 100644 index 365a008080c2..000000000000 --- a/arch/x86/kernel/cpu/intel_64.c +++ /dev/null @@ -1,366 +0,0 @@ -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_X86_64 -#include -#include -#endif - -#include "cpu.h" - -#ifdef CONFIG_X86_LOCAL_APIC -#include -#include -#include -#endif - -static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) -{ - if ((c->x86 == 0xf && c->x86_model >= 0x03) || - (c->x86 == 0x6 && c->x86_model >= 0x0e)) - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); - -#ifdef CONFIG_X86_64 - set_cpu_cap(c, X86_FEATURE_SYSENTER32); -#else - /* Netburst reports 64 bytes clflush size, but does IO in 128 bytes */ - if (c->x86 == 15 && c->x86_cache_alignment == 64) - c->x86_cache_alignment = 128; -#endif -} - -#ifdef CONFIG_X86_32 -/* - * Early probe support logic for ppro memory erratum #50 - * - * This is called before we do cpu ident work - */ - -int __cpuinit ppro_with_ram_bug(void) -{ - /* Uses data from early_cpu_detect now */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == 1 && - boot_cpu_data.x86_mask < 8) { - printk(KERN_INFO "Pentium Pro with Errata#50 detected. Taking evasive action.\n"); - return 1; - } - return 0; -} - - -/* - * P4 Xeon errata 037 workaround. - * Hardware prefetcher may cause stale data to be loaded into the cache. - */ -static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) -{ - unsigned long lo, hi; - - if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { - rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); - if ((lo & (1<<9)) == 0) { - printk (KERN_INFO "CPU: C0 stepping P4 Xeon detected.\n"); - printk (KERN_INFO "CPU: Disabling hardware prefetching (Errata 037)\n"); - lo |= (1<<9); /* Disable hw prefetching */ - wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); - } - } -} - - - -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); - - /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. - */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); -} -#endif -#endif - -static void __cpuinit srat_detect_node(void) -{ -#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) - unsigned node; - int cpu = smp_processor_id(); - int apicid = hard_smp_processor_id(); - - /* Don't do the funky fallback heuristics the AMD version employs - for now. */ - node = apicid_to_node[apicid]; - if (node == NUMA_NO_NODE || !node_online(node)) - node = first_node(node_online_map); - numa_set_node(cpu, node); - - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); -#endif -} - -/* - * find out the number of processor cores on the die - */ -static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) -{ - unsigned int eax, ebx, ecx, edx; - - if (c->cpuid_level < 4) - return 1; - - /* Intel has a non-standard dependency on %ecx for this CPUID level. */ - cpuid_count(4, 0, &eax, &ebx, &ecx, &edx); - if (eax & 0x1f) - return ((eax >> 26) + 1); - else - return 1; -} - -static void __cpuinit init_intel(struct cpuinfo_x86 *c) -{ - unsigned int l2 = 0; - char *p = NULL; - - early_init_intel(c); - -#ifdef CONFIG_X86_F00F_BUG - /* - * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. - * Note that the workaround only should be initialized once... - */ - c->f00f_bug = 0; - if (!paravirt_enabled() && c->x86 == 5) { - static int f00f_workaround_enabled; - - c->f00f_bug = 1; - if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); - f00f_workaround_enabled = 1; - } - } -#endif - - l2 = init_intel_cacheinfo(c); - if (c->cpuid_level > 9) { - unsigned eax = cpuid_eax(10); - /* Check for version and the number of counters */ - if ((eax & 0xff) && (((eax>>8) & 0xff) > 1)) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - } - -#ifdef CONFIG_X86_32 - /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) - clear_cpu_cap(c, X86_FEATURE_SEP); - - /* - * Names for the Pentium II/Celeron processors - * detectable only by also checking the cache size. - * Dixon is NOT a Celeron. - */ - if (c->x86 == 6) { - switch (c->x86_model) { - case 5: - if (c->x86_mask == 0) { - if (l2 == 0) - p = "Celeron (Covington)"; - else if (l2 == 256) - p = "Mobile Pentium II (Dixon)"; - } - break; - - case 6: - if (l2 == 128) - p = "Celeron (Mendocino)"; - else if (c->x86_mask == 0 || c->x86_mask == 5) - p = "Celeron-A"; - break; - - case 8: - if (l2 == 128) - p = "Celeron (Coppermine)"; - break; - } - } - - if (p) - strcpy(c->x86_model_id, p); - - Intel_errata_workarounds(c); - -#ifdef CONFIG_X86_INTEL_USERCOPY - /* - * Set up the preferred alignment for movsl bulk memory moves - */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ - movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; - } -#endif - -#endif - - if (cpu_has_xmm2) - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (cpu_has_ds) { - unsigned int l1; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - ds_init_intel(c); - } - -#ifdef CONFIG_X86_64 - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); -#else - if (c->x86 == 15) - set_cpu_cap(c, X86_FEATURE_P4); - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_P3); - - if (cpu_has_bts) - ptrace_bts_init_intel(c); - - /* - * See if we have a good local APIC by checking for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) - set_cpu_cap(c, X86_FEATURE_11AP); - -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); -#endif -#endif - - detect_extended_topology(c); - if (!cpu_has(c, X86_FEATURE_XTOPOLOGY)) { - /* - * let's use the legacy cpuid vector 0x1 and 0x4 for topology - * detection. - */ - c->x86_max_cores = intel_num_cpu_cores(c); -#ifdef CONFIG_X86_32 - detect_ht(c); -#endif - } - - /* Work around errata */ - srat_detect_node(); -} - -#ifdef CONFIG_X86_32 -static unsigned int __cpuinit intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) -{ - /* - * Intel PIII Tualatin. This comes in two flavours. - * One has 256kb of cache, the other 512. We have no way - * to determine which, so we use a boottime override - * for the 512kb model, and assume 256 otherwise. - */ - if ((c->x86 == 6) && (c->x86_model == 11) && (size == 0)) - size = 256; - return size; -} -#endif - -static struct cpu_dev intel_cpu_dev __cpuinitdata = { - .c_vendor = "Intel", - .c_ident = { "GenuineIntel" }, -#ifdef CONFIG_X86_32 - .c_models = { - { .vendor = X86_VENDOR_INTEL, .family = 4, .model_names = - { - [0] = "486 DX-25/33", - [1] = "486 DX-50", - [2] = "486 SX", - [3] = "486 DX/2", - [4] = "486 SL", - [5] = "486 SX/2", - [7] = "486 DX/2-WB", - [8] = "486 DX/4", - [9] = "486 DX/4-WB" - } - }, - { .vendor = X86_VENDOR_INTEL, .family = 5, .model_names = - { - [0] = "Pentium 60/66 A-step", - [1] = "Pentium 60/66", - [2] = "Pentium 75 - 200", - [3] = "OverDrive PODP5V83", - [4] = "Pentium MMX", - [7] = "Mobile Pentium 75 - 200", - [8] = "Mobile Pentium MMX" - } - }, - { .vendor = X86_VENDOR_INTEL, .family = 6, .model_names = - { - [0] = "Pentium Pro A-step", - [1] = "Pentium Pro", - [3] = "Pentium II (Klamath)", - [4] = "Pentium II (Deschutes)", - [5] = "Pentium II (Deschutes)", - [6] = "Mobile Pentium II", - [7] = "Pentium III (Katmai)", - [8] = "Pentium III (Coppermine)", - [10] = "Pentium III (Cascades)", - [11] = "Pentium III (Tualatin)", - } - }, - { .vendor = X86_VENDOR_INTEL, .family = 15, .model_names = - { - [0] = "Pentium 4 (Unknown)", - [1] = "Pentium 4 (Willamette)", - [2] = "Pentium 4 (Northwood)", - [4] = "Pentium 4 (Foster)", - [5] = "Pentium 4 (Foster)", - } - }, - }, - .c_size_cache = intel_size_cache, -#endif - .c_early_init = early_init_intel, - .c_init = init_intel, - .c_x86_vendor = X86_VENDOR_INTEL, -}; - -cpu_dev_register(intel_cpu_dev); - -- cgit v1.2.2 From 4052704d92c3172ebc13cc0cc8df8ba2bd446a5c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 16:40:38 -0700 Subject: x86: intel.c put workaround for old cpus together consolidate the code some more. No change in functionality intended. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 192 ++++++++++++++++++++++---------------------- 1 file changed, 98 insertions(+), 94 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 365a008080c2..5f76bf139fda 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -63,15 +63,54 @@ int __cpuinit ppro_with_ram_bug(void) return 0; } +#ifdef CONFIG_X86_F00F_BUG +static void __cpuinit trap_init_f00f_bug(void) +{ + __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); -/* - * P4 Xeon errata 037 workaround. - * Hardware prefetcher may cause stale data to be loaded into the cache. - */ -static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) + /* + * Update the IDT descriptor and reload the IDT so that + * it uses the read-only mapped virtual address. + */ + idt_descr.address = fix_to_virt(FIX_F00F_IDT); + load_idt(&idt_descr); +} +#endif + +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) { unsigned long lo, hi; +#ifdef CONFIG_X86_F00F_BUG + /* + * All current models of Pentium and Pentium with MMX technology CPUs + * have the F0 0F bug, which lets nonprivileged users lock up the system. + * Note that the workaround only should be initialized once... + */ + c->f00f_bug = 0; + if (!paravirt_enabled() && c->x86 == 5) { + static int f00f_workaround_enabled; + + c->f00f_bug = 1; + if (!f00f_workaround_enabled) { + trap_init_f00f_bug(); + printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); + f00f_workaround_enabled = 1; + } + } +#endif + + /* + * SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until + * model 3 mask 3 + */ + if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) + clear_cpu_cap(c, X86_FEATURE_SEP); + + /* + * P4 Xeon errata 037 workaround. + * Hardware prefetcher may cause stale data to be loaded into the cache. + */ if ((c->x86 == 15) && (c->x86_model == 1) && (c->x86_mask == 1)) { rdmsr(MSR_IA32_MISC_ENABLE, lo, hi); if ((lo & (1<<9)) == 0) { @@ -81,23 +120,44 @@ static void __cpuinit Intel_errata_workarounds(struct cpuinfo_x86 *c) wrmsr (MSR_IA32_MISC_ENABLE, lo, hi); } } -} - + /* + * See if we have a good local APIC by checking for buggy Pentia, + * i.e. all B steppings and the C2 stepping of P54C when using their + * integrated APIC (see 11AP erratum in "Pentium Processor + * Specification Update"). + */ + if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && + (c->x86_mask < 0x6 || c->x86_mask == 0xb)) + set_cpu_cap(c, X86_FEATURE_11AP); -#ifdef CONFIG_X86_F00F_BUG -static void __cpuinit trap_init_f00f_bug(void) -{ - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); +#ifdef CONFIG_X86_INTEL_USERCOPY /* - * Update the IDT descriptor and reload the IDT so that - * it uses the read-only mapped virtual address. + * Set up the preferred alignment for movsl bulk memory moves */ - idt_descr.address = fix_to_virt(FIX_F00F_IDT); - load_idt(&idt_descr); -} + switch (c->x86) { + case 4: /* 486: untested */ + break; + case 5: /* Old Pentia: untested */ + break; + case 6: /* PII/PIII only like movsl with 8-byte alignment */ + movsl_mask.mask = 7; + break; + case 15: /* P4 is OK down to 8-byte alignment */ + movsl_mask.mask = 7; + break; + } #endif + +#ifdef CONFIG_X86_NUMAQ + numaq_tsc_disable(); +#endif +} +#else +static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) +{ +} #endif static void __cpuinit srat_detect_node(void) @@ -139,28 +199,10 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; - char *p = NULL; early_init_intel(c); -#ifdef CONFIG_X86_F00F_BUG - /* - * All current models of Pentium and Pentium with MMX technology CPUs - * have the F0 0F bug, which lets nonprivileged users lock up the system. - * Note that the workaround only should be initialized once... - */ - c->f00f_bug = 0; - if (!paravirt_enabled() && c->x86 == 5) { - static int f00f_workaround_enabled; - - c->f00f_bug = 1; - if (!f00f_workaround_enabled) { - trap_init_f00f_bug(); - printk(KERN_NOTICE "Intel Pentium with F0 0F bug - workaround enabled.\n"); - f00f_workaround_enabled = 1; - } - } -#endif + intel_workarounds(c); l2 = init_intel_cacheinfo(c); if (c->cpuid_level > 9) { @@ -170,17 +212,32 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } -#ifdef CONFIG_X86_32 - /* SEP CPUID bug: Pentium Pro reports SEP but doesn't have it until model 3 mask 3 */ - if ((c->x86<<8 | c->x86_model<<4 | c->x86_mask) < 0x633) - clear_cpu_cap(c, X86_FEATURE_SEP); + if (cpu_has_xmm2) + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + if (cpu_has_ds) { + unsigned int l1; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_cpu_cap(c, X86_FEATURE_BTS); + if (!(l1 & (1<<12))) + set_cpu_cap(c, X86_FEATURE_PEBS); + ds_init_intel(c); + } +#ifdef CONFIG_X86_64 + if (c->x86 == 15) + c->x86_cache_alignment = c->x86_clflush_size * 2; + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_REP_GOOD); +#else /* * Names for the Pentium II/Celeron processors * detectable only by also checking the cache size. * Dixon is NOT a Celeron. */ if (c->x86 == 6) { + char *p = NULL; + switch (c->x86_model) { case 5: if (c->x86_mask == 0) { @@ -203,51 +260,11 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) p = "Celeron (Coppermine)"; break; } - } - if (p) - strcpy(c->x86_model_id, p); - - Intel_errata_workarounds(c); - -#ifdef CONFIG_X86_INTEL_USERCOPY - /* - * Set up the preferred alignment for movsl bulk memory moves - */ - switch (c->x86) { - case 4: /* 486: untested */ - break; - case 5: /* Old Pentia: untested */ - break; - case 6: /* PII/PIII only like movsl with 8-byte alignment */ - movsl_mask.mask = 7; - break; - case 15: /* P4 is OK down to 8-byte alignment */ - movsl_mask.mask = 7; - break; + if (p) + strcpy(c->x86_model_id, p); } -#endif - -#endif - if (cpu_has_xmm2) - set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); - if (cpu_has_ds) { - unsigned int l1; - rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); - if (!(l1 & (1<<11))) - set_cpu_cap(c, X86_FEATURE_BTS); - if (!(l1 & (1<<12))) - set_cpu_cap(c, X86_FEATURE_PEBS); - ds_init_intel(c); - } - -#ifdef CONFIG_X86_64 - if (c->x86 == 15) - c->x86_cache_alignment = c->x86_clflush_size * 2; - if (c->x86 == 6) - set_cpu_cap(c, X86_FEATURE_REP_GOOD); -#else if (c->x86 == 15) set_cpu_cap(c, X86_FEATURE_P4); if (c->x86 == 6) @@ -256,19 +273,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) if (cpu_has_bts) ptrace_bts_init_intel(c); - /* - * See if we have a good local APIC by checking for buggy Pentia, - * i.e. all B steppings and the C2 stepping of P54C when using their - * integrated APIC (see 11AP erratum in "Pentium Processor - * Specification Update"). - */ - if (cpu_has_apic && (c->x86<<8 | c->x86_model<<4) == 0x520 && - (c->x86_mask < 0x6 || c->x86_mask == 0xb)) - set_cpu_cap(c, X86_FEATURE_11AP); - -#ifdef CONFIG_X86_NUMAQ - numaq_tsc_disable(); -#endif #endif detect_extended_topology(c); -- cgit v1.2.2 From ec70cae8698632917f06110fdb70c6364281ecc6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 16:40:39 -0700 Subject: x86: centaur_64.c remove duplicated setting of CONSTANT_TSC Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/centaur_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/centaur_64.c b/arch/x86/kernel/cpu/centaur_64.c index 0e5cf17a3c89..a1625f5a1e78 100644 --- a/arch/x86/kernel/cpu/centaur_64.c +++ b/arch/x86/kernel/cpu/centaur_64.c @@ -20,7 +20,6 @@ static void __cpuinit init_centaur(struct cpuinfo_x86 *c) if (c->x86 == 0x6 && c->x86_model >= 0xf) { c->x86_cache_alignment = c->x86_clflush_size * 2; - set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_REP_GOOD); } set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); -- cgit v1.2.2 From b2994ef0de252d41f1511e5f87704bedda12dabc Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:18:50 -0700 Subject: x86: signal_64.c: clean up signal_fault() clean up and make signal_fault() same as 32bit. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 8fbdd23d5cc6..552a331313ff 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -473,12 +473,14 @@ void do_notify_resume(struct pt_regs *regs, void *unused, void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; + if (show_unhandled_signals && printk_ratelimit()) { - printk("%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", - me->comm, me->pid, where, frame, regs->ip, - regs->sp, regs->orig_ax); + printk(KERN_INFO + "%s[%d] bad frame in %s frame:%p ip:%lx sp:%lx orax:%lx", + me->comm, me->pid, where, frame, + regs->ip, regs->sp, regs->orig_ax); print_vma_addr(" in ", regs->ip); - printk("\n"); + printk(KERN_CONT "\n"); } force_sig(SIGSEGV, me); -- cgit v1.2.2 From 0c40ed71736c20f447843b3c96b715bb9c4904d7 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:21:17 -0700 Subject: x86: signal_64.c: arg for restore_i387_xstate() is void __user * restore_i387_xstate() is declared as: int restore_i387_xstate(void __user *buf); so, make the variable buf void __user *. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 552a331313ff..321da93f2fb7 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -94,7 +94,8 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, } { - struct _fpstate __user *buf; + void __user *buf; + err |= __get_user(buf, &sc->fpstate); err |= restore_i387_xstate(buf); } -- cgit v1.2.2 From 764e8d128f9343027cf09afe8a145e8ff186e129 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 9 Sep 2008 17:22:12 -0700 Subject: x86: signal_64.c: make handle_signal() similar Make handle_signal() same as 32bit. No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 57 +++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 321da93f2fb7..43e6862fc7a2 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -338,39 +338,40 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, ret = setup_rt_frame(sig, ka, info, oldset, regs); - if (ret == 0) { - /* - * This has nothing to do with segment registers, - * despite the name. This magic affects uaccess.h - * macros' behavior. Reset it to the normal setting. - */ - set_fs(USER_DS); + if (ret) + return ret; - /* - * Clear the direction flag as per the ABI for function entry. - */ - regs->flags &= ~X86_EFLAGS_DF; + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); - /* - * Clear TF when entering the signal handler, but - * notify any tracer that was single-stepping it. - * The tracer may want to single-step inside the - * handler too. - */ - regs->flags &= ~X86_EFLAGS_TF; + /* + * Clear the direction flag as per the ABI for function entry. + */ + regs->flags &= ~X86_EFLAGS_DF; - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + /* + * Clear TF when entering the signal handler, but + * notify any tracer that was single-stepping it. + * The tracer may want to single-step inside the + * handler too. + */ + regs->flags &= ~X86_EFLAGS_TF; - tracehook_signal_handler(sig, info, ka, regs, - test_thread_flag(TIF_SINGLESTEP)); - } + spin_lock_irq(¤t->sighand->siglock); + sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(¤t->blocked, sig); + recalc_sigpending(); + spin_unlock_irq(¤t->sighand->siglock); - return ret; + tracehook_signal_handler(sig, info, ka, regs, + test_thread_flag(TIF_SINGLESTEP)); + + return 0; } #define NR_restart_syscall \ -- cgit v1.2.2 From ac4ff656c07ada78316307b0c0ce8a8eb48aa6dd Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Sep 2008 01:06:47 +0900 Subject: x86: convert gart to use is_buffer_dma_capable helper function Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 0b99d4a06f74..1b0c412566e5 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -214,24 +214,14 @@ static void iommu_full(struct device *dev, size_t size, int dir) static inline int need_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - if (force_iommu) - mmu = 1; - - return mmu; + return force_iommu || + !is_buffer_dma_capable(*dev->dma_mask, addr, size); } static inline int nonforced_iommu(struct device *dev, unsigned long addr, size_t size) { - u64 mask = *dev->dma_mask; - int high = addr + size > mask; - int mmu = high; - - return mmu; + return !is_buffer_dma_capable(*dev->dma_mask, addr, size); } /* Map a single continuous physical area into the IOMMU. -- cgit v1.2.2 From 49fbf4e9f982c704dc365698c5b5efa780aadcb5 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Sep 2008 01:06:48 +0900 Subject: x86: convert pci-nommu to use is_buffer_dma_capable helper function Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-nommu.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 8e398b56f50b..1c1c98a31d57 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -14,7 +14,7 @@ static int check_addr(char *name, struct device *hwdev, dma_addr_t bus, size_t size) { - if (hwdev && bus + size > *hwdev->dma_mask) { + if (hwdev && !is_buffer_dma_capable(*hwdev->dma_mask, bus, size)) { if (*hwdev->dma_mask >= DMA_32BIT_MASK) printk(KERN_ERR "nommu_%s: overflow %Lx+%zu of device mask %Lx\n", @@ -79,6 +79,7 @@ nommu_alloc_coherent(struct device *hwdev, size_t size, unsigned long dma_mask; int node; struct page *page; + dma_addr_t addr; dma_mask = dma_alloc_coherent_mask(hwdev, gfp); @@ -90,14 +91,15 @@ again: if (!page) return NULL; - if ((page_to_phys(page) + size > dma_mask) && !(gfp & GFP_DMA)) { + addr = page_to_phys(page); + if (!is_buffer_dma_capable(dma_mask, addr, size) && !(gfp & GFP_DMA)) { free_pages((unsigned long)page_address(page), get_order(size)); gfp |= GFP_DMA; goto again; } - *dma_addr = page_to_phys(page); - if (check_addr("alloc_coherent", hwdev, *dma_addr, size)) { + if (check_addr("alloc_coherent", hwdev, addr, size)) { + *dma_addr = addr; flush_write_buffers(); return page_address(page); } -- cgit v1.2.2 From 315a6558f30a264c88274fa70626615d1c7851c7 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 9 Sep 2008 14:54:53 +0800 Subject: x86: move VMX MSRs to msr-index.h They are hardware specific MSRs, and we would use them in virtualization feature detection later. Signed-off-by: Sheng Yang Signed-off-by: Ingo Molnar --- arch/x86/kvm/vmx.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 425a13436b3f..b32d4e5b123d 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,21 +331,6 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define MSR_IA32_VMX_BASIC 0x480 -#define MSR_IA32_VMX_PINBASED_CTLS 0x481 -#define MSR_IA32_VMX_PROCBASED_CTLS 0x482 -#define MSR_IA32_VMX_EXIT_CTLS 0x483 -#define MSR_IA32_VMX_ENTRY_CTLS 0x484 -#define MSR_IA32_VMX_MISC 0x485 -#define MSR_IA32_VMX_CR0_FIXED0 0x486 -#define MSR_IA32_VMX_CR0_FIXED1 0x487 -#define MSR_IA32_VMX_CR4_FIXED0 0x488 -#define MSR_IA32_VMX_CR4_FIXED1 0x489 -#define MSR_IA32_VMX_VMCS_ENUM 0x48a -#define MSR_IA32_VMX_PROCBASED_CTLS2 0x48b -#define MSR_IA32_VMX_EPT_VPID_CAP 0x48c - -#define MSR_IA32_FEATURE_CONTROL 0x3a #define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 #define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 -- cgit v1.2.2 From e38e05a85828dac23540cd007df5f20985388afc Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 10 Sep 2008 18:53:34 +0800 Subject: x86: extended "flags" to show virtualization HW feature in /proc/cpuinfo The hardware virtualization technology evolves very fast. But currently it's hard to tell if your CPU support a certain kind of HW technology without digging into the source code. The patch add a new catagory in "flags" under /proc/cpuinfo. Now "flags" can indicate the (important) HW virtulization features the CPU supported as well. Current implementation just cover Intel VMX side. Signed-off-by: Sheng Yang Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel.c | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5f76bf139fda..99468dbd08da 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -196,6 +196,44 @@ static int __cpuinit intel_num_cpu_cores(struct cpuinfo_x86 *c) return 1; } +static void __cpuinit detect_vmx_virtcap(struct cpuinfo_x86 *c) +{ + /* Intel VMX MSR indicated features */ +#define X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW 0x00200000 +#define X86_VMX_FEATURE_PROC_CTLS_VNMI 0x00400000 +#define X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS 0x80000000 +#define X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC 0x00000001 +#define X86_VMX_FEATURE_PROC_CTLS2_EPT 0x00000002 +#define X86_VMX_FEATURE_PROC_CTLS2_VPID 0x00000020 + + u32 vmx_msr_low, vmx_msr_high, msr_ctl, msr_ctl2; + + clear_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + clear_cpu_cap(c, X86_FEATURE_VNMI); + clear_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + clear_cpu_cap(c, X86_FEATURE_EPT); + clear_cpu_cap(c, X86_FEATURE_VPID); + + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, vmx_msr_low, vmx_msr_high); + msr_ctl = vmx_msr_high | vmx_msr_low; + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW) + set_cpu_cap(c, X86_FEATURE_TPR_SHADOW); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_VNMI) + set_cpu_cap(c, X86_FEATURE_VNMI); + if (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_2ND_CTLS) { + rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2, + vmx_msr_low, vmx_msr_high); + msr_ctl2 = vmx_msr_high | vmx_msr_low; + if ((msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VIRT_APIC) && + (msr_ctl & X86_VMX_FEATURE_PROC_CTLS_TPR_SHADOW)) + set_cpu_cap(c, X86_FEATURE_FLEXPRIORITY); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_EPT) + set_cpu_cap(c, X86_FEATURE_EPT); + if (msr_ctl2 & X86_VMX_FEATURE_PROC_CTLS2_VPID) + set_cpu_cap(c, X86_FEATURE_VPID); + } +} + static void __cpuinit init_intel(struct cpuinfo_x86 *c) { unsigned int l2 = 0; @@ -289,6 +327,9 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) /* Work around errata */ srat_detect_node(); + + if (cpu_has(c, X86_FEATURE_VMX)) + detect_vmx_virtcap(c); } #ifdef CONFIG_X86_32 -- cgit v1.2.2 From f461a1d80c865e5ec4d24107adbab8b010b60e32 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 10 Sep 2008 13:57:43 +0200 Subject: arch/x86/kernel/kdebugfs.c: introduce missing kfree Error handling code following a kmalloc should free the allocated data. Note that at the point of the change, node has not yet been stored in d, so it is not affected by the existing cleanup code. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Ingo Molnar --- arch/x86/kernel/kdebugfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c index f2d43bc75514..ff7d3b0124f1 100644 --- a/arch/x86/kernel/kdebugfs.c +++ b/arch/x86/kernel/kdebugfs.c @@ -139,6 +139,7 @@ static int __init create_setup_data_nodes(struct dentry *parent) if (PageHighMem(pg)) { data = ioremap_cache(pa_data, sizeof(*data)); if (!data) { + kfree(node); error = -ENXIO; goto err_dir; } -- cgit v1.2.2 From f7d0b926ac8c8ec0c7a83ee69409bd2e6bb39f81 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:22 -0700 Subject: mm: define USE_SPLIT_PTLOCKS rather than repeating expression Define USE_SPLIT_PTLOCKS as a constant expression rather than repeating "NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS" all over the place. Signed-off-by: Jeremy Fitzhardinge Acked-by: Hugh Dickins Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index aa37469da696..2e1b64088490 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -646,7 +646,7 @@ static spinlock_t *lock_pte(struct page *page) { spinlock_t *ptl = NULL; -#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS +#if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); spin_lock(ptl); #endif -- cgit v1.2.2 From 6a9e91846bf52cc70a0417de19fdfac224c435c4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 9 Sep 2008 15:43:25 -0700 Subject: xen: fix pinning when not using split pte locks We only pin PTE pages when using split PTE locks, so don't do the pin/unpin when attaching/detaching pte pages to a pinned pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b106e825d266..8ca2f88bde1e 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -826,7 +826,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, u32 pfn, unsigned level) if (!PageHighMem(page)) { make_lowmem_page_readonly(__va(PFN_PHYS((unsigned long)pfn))); - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); } else /* make sure there are no stray mappings of @@ -894,7 +894,7 @@ static void xen_release_ptpage(u32 pfn, unsigned level) if (PagePinned(page)) { if (!PageHighMem(page)) { - if (level == PT_PTE) + if (level == PT_PTE && USE_SPLIT_PTLOCKS) pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -- cgit v1.2.2 From 44874f84918e37b64bec6df1587e5fe2fdf6ab62 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 27 Aug 2008 14:18:43 +0200 Subject: KVM: SVM: fix random segfaults with NPT enabled This patch introduces a guest TLB flush on every NPF exit in KVM. This fixes random segfaults and #UD exceptions in the guest seen under some workloads (e.g. long running compile workloads or tbench). A kernbench run with and without that fix showed that it has a slowdown lower than 0.5% Cc: stable@kernel.org Signed-off-by: Joerg Roedel Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e2ee264740c7..d1106cddab0d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -62,6 +62,7 @@ static int npt = 1; module_param(npt, int, S_IRUGO); static void kvm_reput_irq(struct vcpu_svm *svm); +static void svm_flush_tlb(struct kvm_vcpu *vcpu); static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) { @@ -1027,6 +1028,13 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(TDP_FAULT, &svm->vcpu, error_code, (u32)fault_address, (u32)(fault_address >> 32), handler); + /* + * FIXME: Tis shouldn't be necessary here, but there is a flush + * missing in the MMU code. Until we find this bug, flush the + * complete TLB here on an NPF + */ + if (npt_enabled) + svm_flush_tlb(&svm->vcpu); if (event_injection) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); -- cgit v1.2.2 From e5eab0cede4b1ffaca4ad857d840127622038e55 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Sep 2008 19:11:51 +0200 Subject: KVM: SVM: fix guest global tlb flushes with NPT Accesses to CR4 are intercepted even with Nested Paging enabled. But the code does not check if the guest wants to do a global TLB flush. So this flush gets lost. This patch adds the check and the flush to svm_set_cr4. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index d1106cddab0d..8233b86c778c 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -879,6 +879,10 @@ set: static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) { unsigned long host_cr4_mce = read_cr4() & X86_CR4_MCE; + unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; + + if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) + force_new_asid(vcpu); vcpu->arch.cr4 = cr4; if (!npt_enabled) -- cgit v1.2.2 From 534e38b447df47f129a6d3ec3af6705c1e3f651e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 8 Sep 2008 15:12:30 +0800 Subject: KVM: VMX: Always return old for clear_flush_young() when using EPT As well as discard fake accessed bit and dirty bit of EPT. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 4 ++++ arch/x86/kvm/vmx.c | 3 +-- arch/x86/kvm/vmx.h | 2 -- 3 files changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 0bfe2bd305eb..3da2508eb22a 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -711,6 +711,10 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp) u64 *spte; int young = 0; + /* always return old for EPT */ + if (!shadow_accessed_mask) + return 0; + spte = rmap_next(kvm, rmapp, NULL); while (spte) { int _young; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a69773e3b26..7041cc52b562 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3301,8 +3301,7 @@ static int __init vmx_init(void) kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK | VMX_EPT_WRITABLE_MASK | VMX_EPT_DEFAULT_MT << VMX_EPT_MT_EPTE_SHIFT); - kvm_mmu_set_mask_ptes(0ull, VMX_EPT_FAKE_ACCESSED_MASK, - VMX_EPT_FAKE_DIRTY_MASK, 0ull, + kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, VMX_EPT_EXECUTABLE_MASK); kvm_enable_tdp(); } else diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 425a13436b3f..23e8373507ad 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -370,8 +370,6 @@ enum vmcs_field { #define VMX_EPT_READABLE_MASK 0x1ull #define VMX_EPT_WRITABLE_MASK 0x2ull #define VMX_EPT_EXECUTABLE_MASK 0x4ull -#define VMX_EPT_FAKE_ACCESSED_MASK (1ull << 62) -#define VMX_EPT_FAKE_DIRTY_MASK (1ull << 63) #define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul -- cgit v1.2.2 From 0ad5bce7409d681a5655c66e64bb0eb740b89c1f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 16:42:00 -0700 Subject: x86: fix possible x86_64 and EFI regression Russ Anderson reported a boot crash with EFI and latest mainline: BIOS-e820: 00000000fffa0000 - 00000000fffac000 (reserved) Pid: 0, comm: swapper Not tainted 2.6.27-rc5-00100-gec0c15a-dirty #5 Call Trace: [] early_idt_handler+0x55/0x69 [] __memcpy+0x12/0xa4 [] efi_init+0xce/0x932 [] setup_early_serial8250_console+0x2d/0x36a [] __insert_resource+0x18/0xc8 [] setup_arch+0x3a7/0x632 [] start_kernel+0x91/0x367 [] x86_64_start_kernel+0xe3/0xe7 [] x86_64_start_kernel+0x0/0xe7 RIP 0x10 Such a crash is possible if the CPU in this system is a 64-bit processor which doesn't support NX (ie, old Intel P4 -based64-bit processors). Certainly, if we support such processors, then we should start with _PAGE_NX initially clear in __supported_pte_flags, and then set it once we've established that the processor does indeed support NX. That will prevent early_ioremap - or anything else - from trying to set it. The simple fix is to simply call check_efer() earlier. Reported-by: Russ Anderson Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 362d4e7f2d38..9838f2539dfc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -670,6 +670,10 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_X86_64 + check_efer(); +#endif + #if defined(CONFIG_VMI) && defined(CONFIG_X86_32) /* * Must be before kernel pagetables are setup @@ -738,7 +742,6 @@ void __init setup_arch(char **cmdline_p) #else num_physpages = max_pfn; - check_efer(); /* How many end-of-memory variables you have, grandma! */ /* need this before calling reserve_initrd */ -- cgit v1.2.2 From a0a29b62a9cac6b7d83b7514679f2ed8d33d4372 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 11 Sep 2008 23:27:52 +0200 Subject: x86, microcode rework, v2 this is a rework of the microcode splitup in tip/x86/microcode (1) I think this new interface is cleaner (look at the changes in 'struct microcode_ops' in microcode.h); (2) it's -64 lines of code; Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 84 ++++------ arch/x86/kernel/microcode_amd.c | 329 ++++++++++++++++++-------------------- arch/x86/kernel/microcode_intel.c | 220 +++++++++++-------------- 3 files changed, 285 insertions(+), 348 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index b2f84ce5eed3..902dada2eb6d 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -110,50 +110,28 @@ struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); #ifdef CONFIG_MICROCODE_OLD_INTERFACE -void __user *user_buffer; /* user area microcode data buffer */ -EXPORT_SYMBOL_GPL(user_buffer); -unsigned int user_buffer_size; /* it's size */ -EXPORT_SYMBOL_GPL(user_buffer_size); - -static int do_microcode_update(void) +static int do_microcode_update(const void __user *buf, size_t size) { - long cursor = 0; + cpumask_t old; int error = 0; - void *new_mc = NULL; int cpu; - cpumask_t old; old = current->cpus_allowed; - while ((cursor = microcode_ops->get_next_ucode(&new_mc, cursor)) > 0) { - if (microcode_ops->microcode_sanity_check != NULL) - error = microcode_ops->microcode_sanity_check(new_mc); - if (error) + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + error = microcode_ops->request_microcode_user(cpu, buf, size); + if (error < 0) goto out; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->get_matching_microcode(new_mc, - cpu); - if (error < 0) - goto out; - if (error == 1) - microcode_ops->apply_microcode(cpu); - } - vfree(new_mc); + if (!error) + microcode_ops->apply_microcode(cpu); } out: - if (cursor > 0) - vfree(new_mc); - if (cursor < 0) - error = cursor; set_cpus_allowed_ptr(current, &old); return error; } @@ -178,10 +156,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, get_online_cpus(); mutex_lock(µcode_mutex); - user_buffer = (void __user *) buf; - user_buffer_size = (int) len; - - ret = do_microcode_update(); + ret = do_microcode_update(buf, len); if (!ret) ret = (ssize_t)len; @@ -231,7 +206,6 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ struct platform_device *microcode_pdev; -EXPORT_SYMBOL_GPL(microcode_pdev); static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, @@ -252,8 +226,12 @@ static ssize_t reload_store(struct sys_device *dev, if (cpu_online(cpu)) { set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); mutex_lock(µcode_mutex); - if (uci->valid) - err = microcode_ops->cpu_request_microcode(cpu); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(cpu); + } mutex_unlock(µcode_mutex); set_cpus_allowed_ptr(current, &old); } @@ -315,7 +293,7 @@ static void collect_cpu_info(int cpu) uci->valid = 1; } -static void microcode_resume_cpu(int cpu) +static int microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct cpu_signature nsig; @@ -323,7 +301,7 @@ static void microcode_resume_cpu(int cpu) pr_debug("microcode: CPU%d resumed\n", cpu); if (!uci->mc.valid_mc) - return; + return 1; /* * Let's verify that the 'cached' ucode does belong @@ -331,21 +309,22 @@ static void microcode_resume_cpu(int cpu) */ if (microcode_ops->collect_cpu_info(cpu, &nsig)) { microcode_fini_cpu(cpu); - return; + return -1; } if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { microcode_fini_cpu(cpu); /* Should we look for a new ucode here? */ - return; + return 1; } - microcode_ops->apply_microcode(cpu); + return 0; } void microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int err = 0; /* We should bind the task to the CPU */ BUG_ON(raw_smp_processor_id() != cpu); @@ -356,12 +335,17 @@ void microcode_update_cpu(int cpu) * otherwise just request a firmware: */ if (uci->valid) { - microcode_resume_cpu(cpu); + err = microcode_resume_cpu(cpu); } else { collect_cpu_info(cpu); if (uci->valid && system_state == SYSTEM_RUNNING) - microcode_ops->cpu_request_microcode(cpu); + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); } + + if (!err) + microcode_ops->apply_microcode(cpu); + mutex_unlock(µcode_mutex); } @@ -414,7 +398,7 @@ static int mc_sysdev_resume(struct sys_device *dev) return 0; pr_debug("microcode: CPU%d resumed\n", cpu); /* only CPU 0 will apply ucode here */ - microcode_ops->apply_microcode(0); + microcode_update_cpu(0); return 0; } diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d606a05545ca..6815837a7753 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -59,7 +59,7 @@ MODULE_LICENSE("GPL v2"); /* serialize access to the physical write */ static DEFINE_SPINLOCK(microcode_update_lock); -struct equiv_cpu_entry *equiv_cpu_table; +static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { @@ -83,36 +83,37 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode_amd(void *mc, int cpu) +static int get_matching_microcode(int cpu, void *mc, int rev) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_header = mc; - unsigned long total_size = get_totalsize(mc_header); - void *new_mc; struct pci_dev *nb_pci_dev, *sb_pci_dev; unsigned int current_cpu_id; unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - /* This is a tricky part. We might be called from a write operation */ - /* to the device file instead of the usual process of firmware */ - /* loading. This routine needs to be able to distinguish both */ -/* cases. This is done by checking if there alread is a equivalent */ - /* CPU table installed. If not, we're written through */ - /* /dev/cpu/microcode. */ -/* Since we ignore all checks. The error case in which going through */ -/* firmware loading and that table is not loaded has already been */ - /* checked earlier. */ + /* + * dimm: do we need this? Why an update via /dev/... is different + * from the one via firmware? + * + * This is a tricky part. We might be called from a write operation + * to the device file instead of the usual process of firmware + * loading. This routine needs to be able to distinguish both + * cases. This is done by checking if there alread is a equivalent + * CPU table installed. If not, we're written through + * /dev/cpu/microcode. + * Since we ignore all checks. The error case in which going through + * firmware loading and that table is not loaded has already been + * checked earlier. + */ + BUG_ON(equiv_cpu_table == NULL); +#if 0 if (equiv_cpu_table == NULL) { printk(KERN_INFO "microcode: CPU%d microcode update with " "version 0x%x (current=0x%x)\n", cpu, mc_header->patch_id, uci->cpu_sig.rev); goto out; } - +#endif current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { @@ -175,27 +176,9 @@ static int get_matching_microcode_amd(void *mc, int cpu) pci_dev_put(sb_pci_dev); } - if (mc_header->patch_id <= uci->cpu_sig.rev) + if (mc_header->patch_id <= rev) return 0; - printk(KERN_INFO "microcode: CPU%d found a matching microcode " - "update with version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->cpu_sig.rev); - -out: - new_mc = vmalloc(UCODE_MAX_SIZE); - if (!new_mc) { - printk(KERN_ERR "microcode: error, can't allocate memory\n"); - return -ENOMEM; - } - memset(new_mc, 0, UCODE_MAX_SIZE); - - /* free previous update file */ - vfree(uci->mc.mc_amd); - - memcpy(new_mc, mc, total_size); - - uci->mc.mc_amd = new_mc; return 1; } @@ -245,104 +228,65 @@ static void apply_microcode_amd(int cpu) uci->cpu_sig.rev = rev; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -extern void __user *user_buffer; /* user area microcode data buffer */ -extern unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode_amd(void **mc, long offset) -{ - struct microcode_header_amd mc_header; - unsigned long total_size; - - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(UCODE_MAX_SIZE); - if (!*mc) - return -ENOMEM; - memset(*mc, 0, UCODE_MAX_SIZE); - - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} -#else -#define get_next_ucode_amd() NULL -#endif - -static long get_next_ucode_from_buffer_amd(void **mc, void *buf, - unsigned long size, long offset) +static void * get_next_ucode(u8 *buf, unsigned int size, + int (*get_ucode_data)(void *, const void *, size_t), + unsigned int *mc_size) { - struct microcode_header_amd *mc_header; - unsigned long total_size; - unsigned char *buf_pos = buf; + unsigned int total_size; +#define UCODE_UNKNOWN_HDR 8 + u8 hdr[UCODE_UNKNOWN_HDR]; + void *mc; - /* No more data */ - if (offset >= size) - return 0; + if (get_ucode_data(hdr, buf, UCODE_UNKNOWN_HDR)) + return NULL; - if (buf_pos[offset] != UCODE_UCODE_TYPE) { + if (hdr[0] != UCODE_UCODE_TYPE) { printk(KERN_ERR "microcode: error! " "Wrong microcode payload type field\n"); - return -EINVAL; + return NULL; } - mc_header = (struct microcode_header_amd *)(&buf_pos[offset+8]); + /* Why not by means of get_totalsize(hdr)? */ + total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); - total_size = (unsigned long) (buf_pos[offset+4] + - (buf_pos[offset+5] << 8)); + printk(KERN_INFO "microcode: size %u, total_size %u\n", + size, total_size); - printk(KERN_INFO "microcode: size %lu, total_size %lu, offset %ld\n", - size, total_size, offset); - - if (offset + total_size > size) { + if (total_size > size || total_size > UCODE_MAX_SIZE) { printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; + return NULL; } - *mc = vmalloc(UCODE_MAX_SIZE); - if (!*mc) { - printk(KERN_ERR "microcode: error! " - "Can not allocate memory for microcode patch\n"); - return -ENOMEM; + mc = vmalloc(UCODE_MAX_SIZE); + if (mc) { + memset(mc, 0, UCODE_MAX_SIZE); + if (get_ucode_data(mc, buf + UCODE_UNKNOWN_HDR, total_size)) { + vfree(mc); + mc = NULL; + } else + *mc_size = total_size + UCODE_UNKNOWN_HDR; } - - memset(*mc, 0, UCODE_MAX_SIZE); - memcpy(*mc, buf + offset + 8, total_size); - - return offset + total_size + 8; +#undef UCODE_UNKNOWN_HDR + return mc; } -static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) + +static int install_equiv_cpu_table(u8 *buf, + int (*get_ucode_data)(void *, const void *, size_t)) { - unsigned int *buf_pos = buf; +#define UCODE_HEADER_SIZE 12 + u8 *hdr[UCODE_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)hdr; + unsigned long size; - /* No more data */ - if (offset >= size) + if (get_ucode_data(&hdr, buf, UCODE_HEADER_SIZE)) return 0; - if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE) { - printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table type field\n"); - return 0; - } + size = buf_pos[2]; - if (size == 0) { + if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { printk(KERN_ERR "microcode: error! " - "Wrong microcode equivalnet cpu table length\n"); + "Wrong microcode equivalnet cpu table\n"); return 0; } @@ -352,79 +296,118 @@ static long install_equiv_cpu_table(void *buf, unsigned long size, long offset) return 0; } - memset(equiv_cpu_table, 0, size); - memcpy(equiv_cpu_table, &buf_pos[3], size); + buf += UCODE_HEADER_SIZE; + if (get_ucode_data(equiv_cpu_table, buf, size)) { + vfree(equiv_cpu_table); + return 0; + } - return size + 12; /* add header length */ + return size + UCODE_HEADER_SIZE; /* add header length */ +#undef UCODE_HEADER_SIZE } -/* fake device for request_firmware */ -extern struct platform_device *microcode_pdev; - -static int cpu_request_microcode_amd(int cpu) +static void free_equiv_cpu_table(void) { - char name[30]; - const struct firmware *firmware; - void *buf; - unsigned int *buf_pos; - unsigned long size; - long offset = 0; - int error; - void *mc; - - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - sprintf(name, "amd-ucode/microcode_amd.bin"); - error = request_firmware(&firmware, "amd-ucode/microcode_amd.bin", - µcode_pdev->dev); - if (error) { - printk(KERN_ERR "microcode: ucode data file %s load failed\n", - name); - return error; - } - - buf_pos = (unsigned int *)firmware->data; - buf = (void *)firmware->data; - size = firmware->size; - - if (buf_pos[0] != UCODE_MAGIC) { - printk(KERN_ERR "microcode: error! Wrong microcode patch file magic\n"); - return -EINVAL; + if (equiv_cpu_table) { + vfree(equiv_cpu_table); + equiv_cpu_table = NULL; } +} - offset = install_equiv_cpu_table(buf, buf_pos[2], offset); +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover; + unsigned long offset; + offset = install_equiv_cpu_table(ucode_ptr, get_ucode_data); if (!offset) { printk(KERN_ERR "microcode: installing equivalent cpu table failed\n"); return -EINVAL; } - while ((offset = - get_next_ucode_from_buffer_amd(&mc, buf, size, offset)) > 0) { - error = get_matching_microcode_amd(mc, cpu); - if (error < 0) + ucode_ptr += offset; + leftover = size - offset; + + while (leftover) { + unsigned int mc_size; + struct microcode_header_amd *mc_header; + + mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); + if (!mc) break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode_amd(cpu); - error = 0; - } - vfree(mc); + + mc_header = (struct microcode_header_amd *)mc; + if (get_matching_microcode(cpu, mc, new_rev)) { + new_rev = mc_header->patch_id; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; } - if (offset > 0) { - vfree(mc); - vfree(equiv_cpu_table); - equiv_cpu_table = NULL; + + if (new_mc) { + if (!leftover) { + if (uci->mc.mc_amd) + vfree(uci->mc.mc_amd); + uci->mc.mc_amd = (struct microcode_amd *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, uci->mc.mc_amd->hdr.patch_id, uci->cpu_sig.rev); + } else + vfree(new_mc); } - if (offset < 0) - error = offset; + + free_equiv_cpu_table(); + + return (int)leftover; +} + +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} + +static int request_microcode_fw(int cpu, struct device *device) +{ + const char *fw_name = "amd-ucode/microcode_amd.bin"; + const struct firmware *firmware; + int ret; + + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + ret = request_firmware(&firmware, fw_name, device); + if (ret) { + printk(KERN_ERR "microcode: ucode data file %s load failed\n", fw_name); + return ret; + } + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + release_firmware(firmware); - return error; + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); } static void microcode_fini_cpu_amd(int cpu) @@ -436,10 +419,8 @@ static void microcode_fini_cpu_amd(int cpu) } static struct microcode_ops microcode_amd_ops = { - .get_next_ucode = get_next_ucode_amd, - .get_matching_microcode = get_matching_microcode_amd, - .microcode_sanity_check = NULL, - .cpu_request_microcode = cpu_request_microcode_amd, + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index c9b53202ba3d..f4930b55c6a0 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -155,15 +155,15 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -static inline int microcode_update_match(int cpu_num, - struct microcode_header_intel *mc_header, int sig, int pf) +static inline int update_match_cpu(struct cpu_signature *csig, int sig, int pf) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + return (!sigmatch(sig, csig->sig, pf, csig->pf)) ? 0 : 1; +} - if (!sigmatch(sig, uci->cpu_sig.sig, pf, uci->cpu_sig.pf) - || mc_header->rev <= uci->cpu_sig.rev) - return 0; - return 1; +static inline int +update_match_revision(struct microcode_header_intel *mc_header, int rev) +{ + return (mc_header->rev <= rev) ? 0 : 1; } static int microcode_sanity_check(void *mc) @@ -248,51 +248,36 @@ static int microcode_sanity_check(void *mc) /* * return 0 - no update found * return 1 - found update - * return < 0 - error */ -static int get_matching_microcode(void *mc, int cpu) +static int +get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; - void *new_mc; - if (microcode_update_match(cpu, mc_header, - mc_header->sig, mc_header->pf)) - goto find; + if (!update_match_revision(mc_header, rev)) + return 0; + + if (update_match_cpu(cpu_sig, mc_header->sig, mc_header->pf)) + return 1; + /* Look for ext. headers: */ if (total_size <= get_datasize(mc_header) + MC_HEADER_SIZE) return 0; ext_header = mc + get_datasize(mc_header) + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; + for (i = 0; i < ext_sigcount; i++) { - if (microcode_update_match(cpu, mc_header, - ext_sig->sig, ext_sig->pf)) - goto find; + if (update_match_cpu(cpu_sig, ext_sig->sig, ext_sig->pf)) + return 1; ext_sig++; } return 0; -find: - pr_debug("microcode: CPU%d found a matching microcode update with" - " version 0x%x (current=0x%x)\n", - cpu, mc_header->rev, uci->cpu_sig.rev); - new_mc = vmalloc(total_size); - if (!new_mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; - } - - /* free previous update file */ - vfree(uci->mc.mc_intel); - - memcpy(new_mc, mc, total_size); - uci->mc.mc_intel = new_mc; - return 1; } static void apply_microcode(int cpu) @@ -300,7 +285,7 @@ static void apply_microcode(int cpu) unsigned long flags; unsigned int val[2]; int cpu_num = raw_smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -338,116 +323,105 @@ static void apply_microcode(int cpu) uci->cpu_sig.rev = val[1]; } -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -extern void __user *user_buffer; /* user area microcode data buffer */ -extern unsigned int user_buffer_size; /* it's size */ - -static long get_next_ucode(void **mc, long offset) +static int generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { - struct microcode_header_intel mc_header; - unsigned long total_size; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + u8 *ucode_ptr = data, *new_mc = NULL, *mc; + int new_rev = uci->cpu_sig.rev; + unsigned int leftover = size; - /* No more data */ - if (offset >= user_buffer_size) - return 0; - if (copy_from_user(&mc_header, user_buffer + offset, MC_HEADER_SIZE)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - return -EFAULT; - } - total_size = get_totalsize(&mc_header); - if (offset + total_size > user_buffer_size) { - printk(KERN_ERR "microcode: error! Bad total size in microcode " - "data file\n"); - return -EINVAL; - } - *mc = vmalloc(total_size); - if (!*mc) - return -ENOMEM; - if (copy_from_user(*mc, user_buffer + offset, total_size)) { - printk(KERN_ERR "microcode: error! Can not read user data\n"); - vfree(*mc); - return -EFAULT; - } - return offset + total_size; -} -#endif + while (leftover) { + struct microcode_header_intel mc_header; + unsigned int mc_size; -static long get_next_ucode_from_buffer(void **mc, const u8 *buf, - unsigned long size, long offset) -{ - struct microcode_header_intel *mc_header; - unsigned long total_size; + if (get_ucode_data(&mc_header, ucode_ptr, sizeof(mc_header))) + break; - /* No more data */ - if (offset >= size) - return 0; - mc_header = (struct microcode_header_intel *)(buf + offset); - total_size = get_totalsize(mc_header); + mc_size = get_totalsize(&mc_header); + if (!mc_size || mc_size > leftover) { + printk(KERN_ERR "microcode: error!" + "Bad data in microcode data file\n"); + break; + } - if (offset + total_size > size) { - printk(KERN_ERR "microcode: error! Bad data in microcode data file\n"); - return -EINVAL; + mc = vmalloc(mc_size); + if (!mc) + break; + + if (get_ucode_data(mc, ucode_ptr, mc_size) || + microcode_sanity_check(mc) < 0) { + vfree(mc); + break; + } + + if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + new_rev = mc_header.rev; + new_mc = mc; + } else + vfree(mc); + + ucode_ptr += mc_size; + leftover -= mc_size; } - *mc = vmalloc(total_size); - if (!*mc) { - printk(KERN_ERR "microcode: error! Can not allocate memory\n"); - return -ENOMEM; + if (new_mc) { + if (!leftover) { + if (uci->mc.mc_intel) + vfree(uci->mc.mc_intel); + uci->mc.mc_intel = (struct microcode_intel *)new_mc; + pr_debug("microcode: CPU%d found a matching microcode update with" + " version 0x%x (current=0x%x)\n", + cpu, uci->mc.mc_intel->hdr.rev, uci->cpu_sig.rev); + } else + vfree(new_mc); } - memcpy(*mc, buf + offset, total_size); - return offset + total_size; + + return (int)leftover; } -/* fake device for request_firmware */ -extern struct platform_device *microcode_pdev; +static int get_ucode_fw(void *to, const void *from, size_t n) +{ + memcpy(to, from, n); + return 0; +} -static int cpu_request_microcode(int cpu) +static int request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - const u8 *buf; - unsigned long size; - long offset = 0; - int error; - void *mc; + int ret; /* We should bind the task to the CPU */ BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - error = request_firmware(&firmware, name, µcode_pdev->dev); - if (error) { + ret = request_firmware(&firmware, name, device); + if (ret) { pr_debug("microcode: data file %s load failed\n", name); - return error; - } - buf = firmware->data; - size = firmware->size; - while ((offset = get_next_ucode_from_buffer(&mc, buf, size, offset)) - > 0) { - error = microcode_sanity_check(mc); - if (error) - break; - error = get_matching_microcode(mc, cpu); - if (error < 0) - break; - /* - * It's possible the data file has multiple matching ucode, - * lets keep searching till the latest version - */ - if (error == 1) { - apply_microcode(cpu); - error = 0; - } - vfree(mc); + return ret; } - if (offset > 0) - vfree(mc); - if (offset < 0) - error = offset; + + ret = generic_load_microcode(cpu, (void*)firmware->data, firmware->size, + &get_ucode_fw); + release_firmware(firmware); - return error; + return ret; +} + +static int get_ucode_user(void *to, const void *from, size_t n) +{ + return copy_from_user(to, from, n); +} + +static int request_microcode_user(int cpu, const void __user *buf, size_t size) +{ + /* We should bind the task to the CPU */ + BUG_ON(cpu != raw_smp_processor_id()); + + return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); } static void microcode_fini_cpu(int cpu) @@ -459,10 +433,8 @@ static void microcode_fini_cpu(int cpu) } static struct microcode_ops microcode_intel_ops = { - .get_next_ucode = get_next_ucode, - .get_matching_microcode = get_matching_microcode, - .microcode_sanity_check = microcode_sanity_check, - .cpu_request_microcode = cpu_request_microcode, + .request_microcode_user = request_microcode_user, + .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode, .microcode_fini_cpu = microcode_fini_cpu, -- cgit v1.2.2 From aef93c8bd506a78983d91cd576a97fee6e934ac1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:15 -0700 Subject: x86: identify_cpu_without_cpuid v2 Krzysztof found some old cyrix cpu where an mtrr-alike cpu feature was not detected properly. this one is based on Krzysztof' patch, and we call ->c_identify() in early_identify_cpu. need to call c_identify() for cpus without cpuid even earlier ... v2: Krzysztof point out need to give cyrix another chance about cpuid checking again, after ->c_identify() enables cpuid for it Signed-off-by: Yinghai Lu Cc: Krzysztof Helt Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 57 +++++++++++++++++++++++++++++++------------- 1 file changed, 40 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 9a57106c1995..f5f25b85be95 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -485,6 +485,33 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_power = cpuid_edx(0x80000007); } + +static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + int i; + + /* + * First of all, decide if this is a 486 or higher + * It's a 486 if we can modify the AC flag + */ + if (flag_is_changeable_p(X86_EFLAGS_AC)) + c->x86 = 4; + else + c->x86 = 3; + + for (i = 0; i < X86_VENDOR_NUM; i++) + if (cpu_devs[i] && cpu_devs[i]->c_identify) { + c->x86_vendor_id[0] = 0; + cpu_devs[i]->c_identify(c); + if (c->x86_vendor_id[0]) { + get_cpu_vendor(c); + break; + } + } +#endif +} + /* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, @@ -503,13 +530,16 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) #endif c->x86_cache_alignment = c->x86_clflush_size; - if (!have_cpuid_p()) - return; - memset(&c->x86_capability, 0, sizeof c->x86_capability); - c->extended_cpuid_level = 0; + if (!have_cpuid_p()) + identify_cpu_without_cpuid(c); + + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid()) + return; + cpu_detect(c); get_cpu_vendor(c); @@ -583,10 +613,14 @@ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { + c->extended_cpuid_level = 0; + if (!have_cpuid_p()) - return; + identify_cpu_without_cpuid(c); - c->extended_cpuid_level = 0; + /* cyrix could have cpuid enabled via c_identify()*/ + if (!have_cpuid()) + return; cpu_detect(c); @@ -639,17 +673,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_cache_alignment = c->x86_clflush_size; memset(&c->x86_capability, 0, sizeof c->x86_capability); - if (!have_cpuid_p()) { - /* - * First of all, decide if this is a 486 or higher - * It's a 486 if we can modify the AC flag - */ - if (flag_is_changeable_p(X86_EFLAGS_AC)) - c->x86 = 4; - else - c->x86 = 3; - } - generic_identify(c); if (this_cpu->c_identify) -- cgit v1.2.2 From afae865613c8292e8bdcce4f0b161988d761f033 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:16 -0700 Subject: x86: move transmeta cap read to early_init_transmeta() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 8 -------- arch/x86/kernel/cpu/transmeta.c | 28 +++++++++++++++------------- 2 files changed, 15 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f5f25b85be95..3e2ce4d33d3a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -465,14 +465,6 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - /* Don't set x86_cpuid_level here for now to not confuse. */ - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } - if (c->extended_cpuid_level >= 0x80000008) { u32 eax = cpuid_eax(0x80000008); diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c index 738e03244f95..52b3fefbd5af 100644 --- a/arch/x86/kernel/cpu/transmeta.c +++ b/arch/x86/kernel/cpu/transmeta.c @@ -5,6 +5,18 @@ #include #include "cpu.h" +static void __cpuinit early_init_transmeta(struct cpuinfo_x86 *c) +{ + u32 xlvl; + + /* Transmeta-defined flags: level 0x80860001 */ + xlvl = cpuid_eax(0x80860000); + if ((xlvl & 0xffff0000) == 0x80860000) { + if (xlvl >= 0x80860001) + c->x86_capability[2] = cpuid_edx(0x80860001); + } +} + static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) { unsigned int cap_mask, uk, max, dummy; @@ -12,6 +24,8 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) unsigned int cpu_rev, cpu_freq = 0, cpu_flags, new_cpu_rev; char cpu_info[65]; + early_init_transmeta(c); + display_cacheinfo(c); /* Print CMS and CPU revision */ @@ -84,23 +98,11 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c) #endif } -static void __cpuinit transmeta_identify(struct cpuinfo_x86 *c) -{ - u32 xlvl; - - /* Transmeta-defined flags: level 0x80860001 */ - xlvl = cpuid_eax(0x80860000); - if ((xlvl & 0xffff0000) == 0x80860000) { - if (xlvl >= 0x80860001) - c->x86_capability[2] = cpuid_edx(0x80860001); - } -} - static struct cpu_dev transmeta_cpu_dev __cpuinitdata = { .c_vendor = "Transmeta", .c_ident = { "GenuineTMx86", "TransmetaCPU" }, + .c_early_init = early_init_transmeta, .c_init = init_transmeta, - .c_identify = transmeta_identify, .c_x86_vendor = X86_VENDOR_TRANSMETA, }; -- cgit v1.2.2 From a9853dd6d285c30a3ddeb3cce8c05e1678400bef Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:46:58 +0200 Subject: x86: cpuid, fix typo Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 3e2ce4d33d3a..cef3519b3bbb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -529,7 +529,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid()) + if (!have_cpuid_p()) return; cpu_detect(c); @@ -611,7 +611,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid()) + if (!have_cpuid_p()) return; cpu_detect(c); -- cgit v1.2.2 From a1c75cc5018f17ff6d80ce45a13435b1536f76db Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:50:26 +0200 Subject: x86, microcode rework, v2, fix based on patch from Dmitry Adamushko. - add missing vfree() - update debug printks Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode.c | 4 +--- arch/x86/kernel/microcode_amd.c | 6 ++++-- arch/x86/kernel/microcode_intel.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c index 902dada2eb6d..0c2634f4fd7c 100644 --- a/arch/x86/kernel/microcode.c +++ b/arch/x86/kernel/microcode.c @@ -70,8 +70,6 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ - -/* #define DEBUG pr_debug */ #include #include #include @@ -396,7 +394,7 @@ static int mc_sysdev_resume(struct sys_device *dev) if (!cpu_online(cpu)) return 0; - pr_debug("microcode: CPU%d resumed\n", cpu); + /* only CPU 0 will apply ucode here */ microcode_update_cpu(0); return 0; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 6815837a7753..48aec9f48e4f 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -92,7 +92,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) unsigned int i = 0; /* - * dimm: do we need this? Why an update via /dev/... is different + * FIXME! dimm: do we need this? Why an update via /dev/... is different * from the one via firmware? * * This is a tricky part. We might be called from a write operation @@ -246,7 +246,7 @@ static void * get_next_ucode(u8 *buf, unsigned int size, return NULL; } - /* Why not by means of get_totalsize(hdr)? */ + /* FIXME! dimm: Why not by means of get_totalsize(hdr)? */ total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); printk(KERN_INFO "microcode: size %u, total_size %u\n", @@ -342,6 +342,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, mc_header = (struct microcode_header_amd *)mc; if (get_matching_microcode(cpu, mc, new_rev)) { + if (new_mc) + vfree(new_mc); new_rev = mc_header->patch_id; new_mc = mc; } else diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index f4930b55c6a0..48ed3cef58c1 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,8 +70,6 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ - -/* #define DEBUG */ /* pr_debug */ #include #include #include @@ -356,6 +354,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, } if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { + if (new_mc) + vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; } else -- cgit v1.2.2 From b899219572350685e6163ce7535efb5ad9bcd6a4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 14 Sep 2008 13:44:41 +0400 Subject: x86: simpler SYSVIPC_COMPAT definition X86_64 part is entirely redundant. Signed-off-by: Alexey Dobriyan Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1e2afe60ba99..4c1475119cef 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1786,7 +1786,7 @@ config COMPAT_FOR_U64_ALIGNMENT config SYSVIPC_COMPAT def_bool y - depends on X86_64 && COMPAT && SYSVIPC + depends on COMPAT && SYSVIPC endmenu -- cgit v1.2.2 From 3d0aedd9538e6be8afec1a9d8b084bf90bc91495 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:01:09 -0700 Subject: x86: signal: put give_sigsegv of setup frames together When setup frame fails, force_sigsegv is called and returns -EFAULT. There is similar code in ia32_setup_frame(), ia32_setup_rt_frame(), __setup_frame() and __setup_rt_frame(). Make them identical. No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 +++++++++----------------- arch/x86/kernel/signal_32.c | 31 ++++++++++++++----------------- arch/x86/kernel/signal_64.c | 17 +++++++++-------- 3 files changed, 32 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 8d64c1bc8474..5f42cfcc1c5a 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -444,21 +444,21 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); if (err) - goto give_sigsegv; + return -EFAULT; err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) - goto give_sigsegv; + return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { err |= __copy_to_user(frame->extramask, &set->sig[1], sizeof(frame->extramask)); if (err) - goto give_sigsegv; + return -EFAULT; } if (ka->sa.sa_flags & SA_RESTORER) { @@ -479,7 +479,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -502,10 +502,6 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -533,14 +529,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); err |= __put_user(ptr_to_compat(&frame->info), &frame->pinfo); err |= __put_user(ptr_to_compat(&frame->uc), &frame->puc); err |= copy_siginfo_to_user32(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ if (cpu_has_xsave) @@ -556,7 +552,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_RESTORER) restorer = ka->sa.sa_restorer; @@ -571,7 +567,7 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, */ err |= __copy_to_user(frame->retcode, &code, 8); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long) frame; @@ -599,8 +595,4 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b668efc18eae..1c22e0067fe7 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -349,21 +349,21 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err = __put_user(sig, &frame->sig); if (err) - goto give_sigsegv; + return -EFAULT; err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); if (err) - goto give_sigsegv; + return -EFAULT; if (_NSIG_WORDS > 1) { err = __copy_to_user(&frame->extramask, &set->sig[1], sizeof(frame->extramask)); if (err) - goto give_sigsegv; + return -EFAULT; } if (current->mm->context.vdso) @@ -388,7 +388,7 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -403,10 +403,6 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -420,14 +416,14 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; err |= __put_user(sig, &frame->sig); err |= __put_user(&frame->info, &frame->pinfo); err |= __put_user(&frame->uc, &frame->puc); err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; /* Create the ucontext. */ if (cpu_has_xsave) @@ -443,7 +439,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs, set->sig[0]); err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up to return from userspace. */ restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -463,7 +459,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->sp = (unsigned long)frame; @@ -478,10 +474,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* @@ -506,6 +498,11 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, else ret = __setup_frame(usig, ka, set, regs); + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + return ret; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 43e6862fc7a2..3b79e179ba3a 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -215,12 +215,12 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) - goto give_sigsegv; + return -EFAULT; if (ka->sa.sa_flags & SA_SIGINFO) { err |= copy_siginfo_to_user(&frame->info, info); if (err) - goto give_sigsegv; + return -EFAULT; } /* Create the ucontext. */ @@ -248,11 +248,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); } else { /* could use a vstub here */ - goto give_sigsegv; + return -EFAULT; } if (err) - goto give_sigsegv; + return -EFAULT; /* Set up registers for signal handler */ regs->di = sig; @@ -272,10 +272,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->cs = __USER_CS; return 0; - -give_sigsegv: - force_sigsegv(sig, current); - return -EFAULT; } /* @@ -297,6 +293,11 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, #endif ret = __setup_rt_frame(sig, ka, info, set, regs); + if (ret) { + force_sigsegv(sig, current); + return -EFAULT; + } + return ret; } -- cgit v1.2.2 From 2ba48e16e78216bb5b9fd08a088bfefda478df25 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:02:53 -0700 Subject: x86: signal: remove unneeded err handling This patch eliminates unused or unneeded variable handling. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 11 ++++------- arch/x86/kernel/signal_32.c | 11 ++++------- arch/x86/kernel/signal_64.c | 5 ++--- 3 files changed, 10 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 5f42cfcc1c5a..e47bed2440ee 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -446,18 +446,15 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - err |= __put_user(sig, &frame->sig); - if (err) + if (__put_user(sig, &frame->sig)) return -EFAULT; - err |= ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) + if (ia32_setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; if (_COMPAT_NSIG_WORDS > 1) { - err |= __copy_to_user(frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) + if (__copy_to_user(frame->extramask, &set->sig[1], + sizeof(frame->extramask))) return -EFAULT; } diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 1c22e0067fe7..d433861a6599 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -351,18 +351,15 @@ __setup_frame(int sig, struct k_sigaction *ka, sigset_t *set, if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) return -EFAULT; - err = __put_user(sig, &frame->sig); - if (err) + if (__put_user(sig, &frame->sig)) return -EFAULT; - err = setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0]); - if (err) + if (setup_sigcontext(&frame->sc, fpstate, regs, set->sig[0])) return -EFAULT; if (_NSIG_WORDS > 1) { - err = __copy_to_user(&frame->extramask, &set->sig[1], - sizeof(frame->extramask)); - if (err) + if (__copy_to_user(&frame->extramask, &set->sig[1], + sizeof(frame->extramask))) return -EFAULT; } diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 3b79e179ba3a..a21c85197295 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -210,7 +210,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, (unsigned long)fp - sizeof(struct rt_sigframe), 16) - 8; if (save_i387_xstate(fp) < 0) - err |= -1; + return -EFAULT; } else frame = get_stack(ka, regs, sizeof(struct rt_sigframe)) - 8; @@ -218,8 +218,7 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, return -EFAULT; if (ka->sa.sa_flags & SA_SIGINFO) { - err |= copy_siginfo_to_user(&frame->info, info); - if (err) + if (copy_siginfo_to_user(&frame->info, info)) return -EFAULT; } -- cgit v1.2.2 From e6babb6b7fed93c93f8fc5ef8ebd3a474fc2df3e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Fri, 12 Sep 2008 17:03:31 -0700 Subject: x86: signal: introduce do_rt_sigreturn() introduce do_rt_sigreturn(), to collect common part of sys_rt_sigreturn(). No change in functionality intended. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 12 +++++++++--- arch/x86/kernel/signal_64.c | 11 ++++++++--- 2 files changed, 17 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index d433861a6599..da3cf3270f83 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -215,9 +215,8 @@ badframe: return 0; } -asmlinkage int sys_rt_sigreturn(unsigned long __unused) +static long do_rt_sigreturn(struct pt_regs *regs) { - struct pt_regs *regs = (struct pt_regs *)&__unused; struct rt_sigframe __user *frame; unsigned long ax; sigset_t set; @@ -243,10 +242,17 @@ asmlinkage int sys_rt_sigreturn(unsigned long __unused) return ax; badframe: - signal_fault(regs, frame, "rt sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage int sys_rt_sigreturn(unsigned long __unused) +{ + struct pt_regs *regs = (struct pt_regs *)&__unused; + + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index a21c85197295..bf77d4789a2d 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -104,11 +104,11 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, return err; } -asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +static long do_rt_sigreturn(struct pt_regs *regs) { struct rt_sigframe __user *frame; - sigset_t set; unsigned long ax; + sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) @@ -131,10 +131,15 @@ asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) return ax; badframe: - signal_fault(regs, frame, "sigreturn"); + signal_fault(regs, frame, "rt_sigreturn"); return 0; } +asmlinkage long sys_rt_sigreturn(struct pt_regs *regs) +{ + return do_rt_sigreturn(regs); +} + /* * Set up a signal frame. */ -- cgit v1.2.2 From bee44f294efd8417f5e68553778a6cc957af1547 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Fri, 12 Sep 2008 19:42:35 +0900 Subject: x86: make GART to respect device's dma_mask about virtual mappings Currently, GART IOMMU ingores device's dma_mask when it does virtual mappings. So it could give a device a virtual address that the device can't access to. This patch fixes the above problem. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 39 ++++++++++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 1b0c412566e5..9972c42ac925 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -83,23 +83,34 @@ static unsigned long next_bit; /* protected by iommu_bitmap_lock */ static int need_flush; /* global flush state. set for each gart wrap */ static unsigned long alloc_iommu(struct device *dev, int size, - unsigned long align_mask) + unsigned long align_mask, u64 dma_mask) { unsigned long offset, flags; unsigned long boundary_size; unsigned long base_index; + unsigned long limit; base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; + limit = iommu_device_max_index(iommu_pages, + DIV_ROUND_UP(iommu_bus_base, PAGE_SIZE), + dma_mask >> PAGE_SHIFT); + spin_lock_irqsave(&iommu_bitmap_lock, flags); - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, + + if (limit <= next_bit) { + need_flush = 1; + next_bit = 0; + } + + offset = iommu_area_alloc(iommu_gart_bitmap, limit, next_bit, size, base_index, boundary_size, align_mask); - if (offset == -1) { + if (offset == -1 && next_bit) { need_flush = 1; - offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, + offset = iommu_area_alloc(iommu_gart_bitmap, limit, 0, size, base_index, boundary_size, align_mask); } @@ -228,12 +239,14 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) * Caller needs to check if the iommu is needed and flush. */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir, unsigned long align_mask) + size_t size, int dir, unsigned long align_mask, + u64 dma_mask) { unsigned long npages = iommu_num_pages(phys_mem, size); - unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); + unsigned long iommu_page; int i; + iommu_page = alloc_iommu(dev, npages, align_mask, dma_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; @@ -263,7 +276,7 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) if (!need_iommu(dev, paddr, size)) return paddr; - bus = dma_map_area(dev, paddr, size, dir, 0); + bus = dma_map_area(dev, paddr, size, dir, 0, dma_get_mask(dev)); flush_gart(); return bus; @@ -314,6 +327,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, { struct scatterlist *s; int i; + u64 dma_mask = dma_get_mask(dev); #ifdef CONFIG_IOMMU_DEBUG printk(KERN_DEBUG "dma_map_sg overflow\n"); @@ -323,7 +337,8 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir, 0); + addr = dma_map_area(dev, addr, s->length, dir, 0, + dma_mask); if (addr == bad_dma_address) { if (i > 0) gart_unmap_sg(dev, sg, i, dir); @@ -345,14 +360,16 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { - unsigned long iommu_start = alloc_iommu(dev, pages, 0); - unsigned long iommu_page = iommu_start; + unsigned long iommu_start; + unsigned long iommu_page; struct scatterlist *s; int i; + iommu_start = alloc_iommu(dev, pages, 0, dma_get_mask(dev)); if (iommu_start == -1) return -1; + iommu_page = iommu_start; for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; @@ -497,7 +514,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, align_mask = (1UL << get_order(size)) - 1; *dma_addr = dma_map_area(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL, - align_mask); + align_mask, dma_mask); flush_gart(); if (*dma_addr != bad_dma_address) -- cgit v1.2.2 From f10ac8a232496bf9271cfc67c6eea432891f04a6 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 11 Sep 2008 23:08:47 +0900 Subject: x86: avoid unnecessary low zone allocation in Calgary's alloc_coherent x86's common alloc_coherent (dma_alloc_coherent in dma-mapping.h) sets up the gfp flag according to the device dma_mask but Calgary doesn't need it because of virtual mappings. This patch avoids unnecessary low zone allocation. Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 8415d92853c4..fe7695e4caae 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -491,6 +491,8 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size, npages = size >> PAGE_SHIFT; order = get_order(size); + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + /* alloc enough pages (and possibly more) */ ret = (void *)__get_free_pages(flag, order); if (!ret) -- cgit v1.2.2 From f6a32a36ab96016675cd414802904feb288d7899 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 11 Sep 2008 23:08:48 +0900 Subject: x86: gart alloc_coherent does virtual mapppings only when necessary gart alloc_coherent need to do virtual mapppings only when an allocated buffer is not DMA-capable for a device. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 9972c42ac925..9739d5682093 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -505,15 +505,23 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { void *vaddr; + dma_addr_t paddr; unsigned long align_mask; + u64 dma_mask = dma_alloc_coherent_mask(dev, flag); vaddr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); if (!vaddr) return NULL; + paddr = virt_to_phys(vaddr); + if (is_buffer_dma_capable(dma_mask, paddr, size)) { + *dma_addr = paddr; + return vaddr; + } + align_mask = (1UL << get_order(size)) - 1; - *dma_addr = dma_map_area(dev, __pa(vaddr), size, DMA_BIDIRECTIONAL, + *dma_addr = dma_map_area(dev, paddr, size, DMA_BIDIRECTIONAL, align_mask, dma_mask); flush_gart(); -- cgit v1.2.2 From 5670a43d710a12fcbbfaefd3991002768b488d82 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 14 Sep 2008 07:42:23 -0700 Subject: xen: fix for xen guest with mem > 3.7G PFN_PHYS() can truncate large addresses unless its passed a suitable large type. This is fixed more generally in the patch series introducing phys_addr_t, but we need a short-term fix to solve a Xen regression reported by Roberto De Ioris. Reported-by: Roberto De Ioris Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b6acc3a0af46..d67901083888 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -42,7 +42,7 @@ char * __init xen_memory_setup(void) e820.nr_map = 0; - e820_add_region(0, PFN_PHYS(max_pfn), E820_RAM); + e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); /* * Even though this is normal, usable memory under Xen, reserve -- cgit v1.2.2 From 600715dcdf567c86f8b2c6173fcfb4b873e25a19 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:45 -0700 Subject: generic: add phys_addr_t for holding physical addresses Add a kernel-wide "phys_addr_t" which is guaranteed to be able to hold any physical address. By default it equals the word size of the architecture, but a 32-bit architecture can set ARCH_PHYS_ADDR_T_64BIT if it needs a 64-bit phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..a0ffb5188c8c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -932,6 +932,9 @@ config X86_PAE has the cost of more pagetable lookup overhead, and also consumes more pagetable space per process. +config ARCH_PHYS_ADDR_T_64BIT + def_bool X86_64 || X86_PAE + # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support (EXPERIMENTAL)" -- cgit v1.2.2 From 8308c54d7e312f7a03e2ce2057d0837e6fe3843f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 11 Sep 2008 01:31:50 -0700 Subject: generic: redefine resource_size_t as phys_addr_t There's no good reason why a resource_size_t shouldn't just be a physical address, so simply redefine it in terms of phys_addr_t. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 - arch/x86/kernel/e820.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a0ffb5188c8c..b4e1875f9861 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -925,7 +925,6 @@ config X86_PAE def_bool n prompt "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G - select RESOURCES_64BIT help PAE is required for NX support, and furthermore enables larger swapspace support for non-overcommit purposes. It diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 66e48aa2dd1b..477f4bb7e552 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1276,12 +1276,10 @@ void __init e820_reserve_resources(void) res = alloc_bootmem_low(sizeof(struct resource) * e820.nr_map); for (i = 0; i < e820.nr_map; i++) { end = e820.map[i].addr + e820.map[i].size - 1; -#ifndef CONFIG_RESOURCES_64BIT - if (end > 0x100000000ULL) { + if (end != (resource_size_t)end) { res++; continue; } -#endif res->name = e820_type_to_string(e820.map[i].type); res->start = e820.map[i].addr; res->end = end; -- cgit v1.2.2 From 5132895f14a57607152f7865dc862fb076ce2585 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Wed, 3 Sep 2008 14:36:40 +0100 Subject: x86/paravirt: Remove duplicate paravirt_pagetable_setup_{start, done}() They were already called once in arch/x86/kernel/setup.c - we don't need to call them again. fixes: http://bugzilla.kernel.org/show_bug.cgi?id=11485 Signed-off-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..60ec1d08ff24 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -458,11 +458,7 @@ static void __init pagetable_init(void) { pgd_t *pgd_base = swapper_pg_dir; - paravirt_pagetable_setup_start(pgd_base); - permanent_kmaps_init(pgd_base); - - paravirt_pagetable_setup_done(pgd_base); } #ifdef CONFIG_ACPI_SLEEP -- cgit v1.2.2 From d7451fca18e2ec62641ae4bbfe946069f13765a3 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Fri, 12 Sep 2008 11:45:22 -0600 Subject: x86, hpet: SB600 - remove HPET resources from PCI device Prevent the HPET resources from appearing in PCI device 14.0 which confuses the PCI resource engine. Signed-off-by: Jordan Crouse Signed-off-by: Ingo Molnar --- arch/x86/pci/fixup.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 4bdaa590375d..3c27a809393b 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -511,3 +511,31 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1201, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1202, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1203, fam10h_pci_cfg_space_size); DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_AMD, 0x1204, fam10h_pci_cfg_space_size); + +/* + * SB600: Disable BAR1 on device 14.0 to avoid HPET resources from + * confusing the PCI engine: + */ +static void sb600_disable_hpet_bar(struct pci_dev *dev) +{ + u8 val; + + /* + * The SB600 and SB700 both share the same device + * ID, but the PM register 0x55 does something different + * for the SB700, so make sure we are dealing with the + * SB600 before touching the bit: + */ + + pci_read_config_byte(dev, 0x08, &val); + + if (val < 0x2F) { + outb(0x55, 0xCD6); + val = inb(0xCD7); + + /* Set bit 7 in PM register 0x55 */ + outb(0x55, 0xCD6); + outb(val | 0x80, 0xCD7); + } +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_ATI, 0x4385, sb600_disable_hpet_bar); -- cgit v1.2.2 From 5649b7c30316a51792808422ac03ee825d26aa5e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 09:29:09 +0200 Subject: x86: add DMI quirk for AMI BIOS which corrupts address 0xc000 during resume Alan Jenkins and Andy Wettstein reported a suspend/resume memory corruption bug and extensively documented it here: http://bugzilla.kernel.org/show_bug.cgi?id=11237 The bug is that the BIOS overwrites 1K of memory at 0xc000 physical, without registering it in e820 as reserved or giving the kernel any idea about this. Detect AMI BIOSen and reserve that 1K. We paint this bug around with a very broad brush (reserving that 1K on all AMI BIOS systems), as the bug was extremely hard to find and needed several weeks and lots of debugging and patching. The bug was found via the CONFIG_X86_CHECK_BIOS_CORRUPTION=y debug feature, if similar bugs are suspected then this feature can be enabled on other systems as well to scan low memory for corrupted memory. Reported-by: Alan Jenkins Reported-by: Andy Wettstein Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ec7e56c1b984..3109ca37a67c 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -729,6 +729,29 @@ void start_periodic_check_for_corruption(void) } #endif +static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) +{ + printk(KERN_NOTICE + "%s detected: BIOS corrupts 0xc000, working it around.\n", + d->ident); + + reserve_early(0xc000, 0xc400, "BIOS quirk"); + + return 0; +} + +/* List of systems that have known low memory corruption BIOS problems */ +static struct dmi_system_id __initdata bad_bios_dmi_table[] = { + { + .callback = dmi_low_memory_corruption, + .ident = "AMI BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), + }, + }, + {} +}; + /* * Determine if we were loaded by an EFI loader. If so, then we have also been * passed the efi memmap, systab, etc., so we should use these data structures @@ -752,6 +775,8 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif + dmi_check_system(bad_bios_dmi_table); + early_cpu_init(); early_ioremap_init(); @@ -1037,3 +1062,5 @@ void __init setup_arch(char **cmdline_p) #endif #endif } + + -- cgit v1.2.2 From 1e22436eba84edfec9c25e5a25d09062c4f91ca9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 09:58:02 +0200 Subject: x86: reserve low 64K on AMI and Phoenix BIOS boxen there's multiple reports about suspend/resume related low memory corruption in this bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=11237 the common pattern is that the corruption is caused by the BIOS, and that it affects some portion of the first 64K of physical RAM. So add a DMI quirk This will waste 64K RAM on 'good' systems too, but without knowing the exact nature of this BIOS memory corruption this is the safest approach. This might as well solve a wide range of suspend/resume breakages under Linux. Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 3109ca37a67c..33719544a224 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -732,10 +732,10 @@ void start_periodic_check_for_corruption(void) static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) { printk(KERN_NOTICE - "%s detected: BIOS corrupts 0xc000, working it around.\n", + "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early(0xc000, 0xc400, "BIOS quirk"); + reserve_early(0x0, 0x10000, "BIOS quirk"); return 0; } @@ -749,6 +749,13 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."), }, }, + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), + }, + }, {} }; -- cgit v1.2.2 From fc38151947477596aa27df6c4306ad6008dc6711 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Sep 2008 10:07:34 +0200 Subject: x86: add X86_RESERVE_LOW_64K This bugzilla: http://bugzilla.kernel.org/show_bug.cgi?id=11237 Documents a wide range of systems where the BIOS utilizes the first 64K of physical memory during suspend/resume and other hardware events. Currently we reserve this memory on all AMI and Phoenix BIOS systems. Life is too short to hunt subtle memory corruption problems like this, so we try to be robust by default. Still, allow this to be overriden: allow users who want that first 64K of memory to be available to the kernel disable the quirk, via CONFIG_X86_RESERVE_LOW_64K=n. Also, allow the early reservation to overlap with other early reservations. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 20 ++++++++++++++++++++ arch/x86/kernel/setup.c | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7820d447bb8d..633f25dd9ee2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1089,6 +1089,26 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK Set whether the default state of memory_corruption_check is on or off. +config X86_RESERVE_LOW_64K + bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen" + default y + help + Reserve the first 64K of physical RAM on BIOSes that are known + to potentially corrupt that memory range. A numbers of BIOSes are + known to utilize this area during suspend/resume, so it must not + be used by the kernel. + + Set this to N if you are absolutely sure that you trust the BIOS + to get all its memory reservations and usages right. + + If you have doubts about the BIOS (e.g. suspend/resume does not + work or there's kernel crashes after certain hardware hotplug + events) and it's not AMI or Phoenix, then you might want to enable + X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical + corruption patterns. + + Say Y if unsure. + config MATH_EMULATION bool prompt "Math emulation" if X86_32 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 33719544a224..786c1886ae53 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -735,13 +735,14 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early(0x0, 0x10000, "BIOS quirk"); + reserve_early_overlap_ok(0x0, 0x10000, "BIOS quirk"); return 0; } /* List of systems that have known low memory corruption BIOS problems */ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { +#ifdef CONFIG_X86_RESERVE_LOW_64K { .callback = dmi_low_memory_corruption, .ident = "AMI BIOS", @@ -757,6 +758,7 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { }, }, {} +#endif }; /* -- cgit v1.2.2 From ba0593bf553c450a03dbc5f8c1f0ff58b778a0c8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Sep 2008 09:29:40 -0700 Subject: x86: completely disable NOPL on 32 bits Completely disable NOPL on 32 bits. It turns out that Microsoft Virtual PC is so broken it can't even reliably *fail* in the presence of NOPL. This leaves the infrastructure in place but disables it unconditionally. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/common.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 8aab8517642e..4e456bd955bb 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -344,31 +344,15 @@ static void __init early_cpu_detect(void) /* * The NOPL instruction is supposed to exist on all CPUs with - * family >= 6, unfortunately, that's not true in practice because + * family >= 6; unfortunately, that's not true in practice because * of early VIA chips and (more importantly) broken virtualizers that - * are not easy to detect. Hence, probe for it based on first - * principles. + * are not easy to detect. In the latter case it doesn't even *fail* + * reliably, so probing for it doesn't even work. Disable it completely + * unless we can find a reliable way to detect all the broken cases. */ static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) { - const u32 nopl_signature = 0x888c53b1; /* Random number */ - u32 has_nopl = nopl_signature; - clear_cpu_cap(c, X86_FEATURE_NOPL); - if (c->x86 >= 6) { - asm volatile("\n" - "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ - "2:\n" - " .section .fixup,\"ax\"\n" - "3: xor %0,%0\n" - " jmp 2b\n" - " .previous\n" - _ASM_EXTABLE(1b,3b) - : "+a" (has_nopl)); - - if (has_nopl == nopl_signature) - set_cpu_cap(c, X86_FEATURE_NOPL); - } } static void __cpuinit generic_identify(struct cpuinfo_x86 *c) -- cgit v1.2.2 From 998564789137921acae9e367b61c5a1dc295653d Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Tue, 16 Sep 2008 11:17:03 +0200 Subject: x86 setup: drop SWAP_DEV Impact: None (cleanup) SWAP_DEV is unused since 2.6.23-rc1. The comment was already incorrect since (at least) 2.6.12. Signed-off-by: Paul Bolle Signed-off-by: H. Peter Anvin --- arch/x86/boot/header.S | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index af86e431acfa..b993062e9a5f 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -30,7 +30,6 @@ SYSSEG = DEF_SYSSEG /* system loaded at 0x10000 (65536) */ SYSSIZE = DEF_SYSSIZE /* system size: # of 16-byte clicks */ /* to be loaded */ ROOT_DEV = 0 /* ROOT_DEV is now written by "build" */ -SWAP_DEV = 0 /* SWAP_DEV is now written by "build" */ #ifndef SVGA_MODE #define SVGA_MODE ASK_VGA -- cgit v1.2.2 From 97fc0555dae8f4d437c8672c14d7191962429be4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 16 Sep 2008 15:09:26 -0700 Subject: x86 setup: handle more than 8 CPU flag words Checkin e38e05a85828dac23540cd007df5f20985388afc added a 9th CPU flag word, but didn't adjust the boot code to match. This patch adds the necessary boot code support. Note: due to a typo in an #if statement, it didn't trigger the #error this was supposed to do. Signed-off-by: H. Peter Anvin --- arch/x86/boot/cpu.c | 17 +++++++++-------- arch/x86/boot/mkcpustr.c | 38 +++++++++++++++++++------------------- 2 files changed, 28 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 75298fe2edca..6ec6bb6e9957 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -59,17 +59,18 @@ int validate_cpu(void) u32 e = err_flags[i]; for (j = 0; j < 32; j++) { - int n = (i << 5)+j; - if (*msg_strs < n) { + if (msg_strs[0] < i || + (msg_strs[0] == i && msg_strs[1] < j)) { /* Skip to the next string */ - do { - msg_strs++; - } while (*msg_strs); - msg_strs++; + msg_strs += 2; + while (*msg_strs++) + ; } if (e & 1) { - if (*msg_strs == n && msg_strs[1]) - printf("%s ", msg_strs+1); + if (msg_strs[0] == i && + msg_strs[1] == j && + msg_strs[2]) + printf("%s ", msg_strs+2); else printf("%d:%d ", i, j); } diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c index 4589caa3e9d1..8ef60f20b371 100644 --- a/arch/x86/boot/mkcpustr.c +++ b/arch/x86/boot/mkcpustr.c @@ -17,31 +17,31 @@ #include "../kernel/cpu/capflags.c" -#if NCAPFLAGS > 8 -# error "Need to adjust the boot code handling of CPUID strings" -#endif - int main(void) { - int i; + int i, j; const char *str; printf("static const char x86_cap_strs[] = \n"); - for (i = 0; i < NCAPINTS*32; i++) { - str = x86_cap_flags[i]; - - if (i == NCAPINTS*32-1) { - /* The last entry must be unconditional; this - also consumes the compiler-added null character */ - if (!str) - str = ""; - printf("\t\"\\x%02x\"\"%s\"\n", i, str); - } else if (str) { - printf("#if REQUIRED_MASK%d & (1 << %d)\n" - "\t\"\\x%02x\"\"%s\\0\"\n" - "#endif\n", - i >> 5, i & 31, i, str); + for (i = 0; i < NCAPINTS; i++) { + for (j = 0; j < 32; j++) { + str = x86_cap_flags[i*32+j]; + + if (i == NCAPINTS-1 && j == 31) { + /* The last entry must be unconditional; this + also consumes the compiler-added null + character */ + if (!str) + str = ""; + printf("\t\"\\x%02x\\x%02x\"\"%s\"\n", + i, j, str); + } else if (str) { + printf("#if REQUIRED_MASK%d & (1 << %d)\n" + "\t\"\\x%02x\\x%02x\"\"%s\\0\"\n" + "#endif\n", + i, j, i, j, str); + } } } printf("\t;\n"); -- cgit v1.2.2 From 90f7d25c6b672137344f447a30a9159945ffea72 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Tue, 16 Sep 2008 11:27:30 -0700 Subject: x86: print DMI information in the oops trace in order to diagnose hard system specific issues, it's useful to have the system name in the oops (as provided by DMI) Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0c3927accb00..7b9ee9f09639 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,7 @@ void __show_registers(struct pt_regs *regs, int all) unsigned long d0, d1, d2, d3, d6, d7; unsigned long sp; unsigned short ss, gs; + const char *board; if (user_mode_vm(regs)) { sp = regs->sp; @@ -172,11 +174,15 @@ void __show_registers(struct pt_regs *regs, int all) } printk("\n"); - printk("Pid: %d, comm: %s %s (%s %.*s)\n", + + board = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!board) + board = ""; + printk("Pid: %d, comm: %s %s (%s %.*s) %s\n", task_pid_nr(current), current->comm, print_tainted(), init_utsname()->release, (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); + init_utsname()->version, board); printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", (u16)regs->cs, regs->ip, regs->flags, -- cgit v1.2.2 From ee2fa7435b6dddf1ca119f298ad0100cf50c0397 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 13:47:25 +0200 Subject: AMD IOMMU: set iommu sunc flag after command queuing The iommu->need_sync flag must be set after the command is queued to avoid race conditions. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 69b4d060b21c..a96d8c049a88 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -140,6 +140,7 @@ static int iommu_completion_wait(struct amd_iommu *iommu) static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) { struct iommu_cmd cmd; + int ret; BUG_ON(iommu == NULL); @@ -147,9 +148,11 @@ static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); cmd.data[0] = devid; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* @@ -159,6 +162,7 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, u64 address, u16 domid, int pde, int s) { struct iommu_cmd cmd; + int ret; memset(&cmd, 0, sizeof(cmd)); address &= PAGE_MASK; @@ -171,9 +175,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, if (pde) /* PDE bit - we wan't flush everything not only the PTEs */ cmd.data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK; + ret = iommu_queue_command(iommu, &cmd); + iommu->need_sync = 1; - return iommu_queue_command(iommu, &cmd); + return ret; } /* -- cgit v1.2.2 From 7e4f88da7bf1887563f70bd5edbbd0479e31dc12 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 14:19:15 +0200 Subject: AMD IOMMU: protect completion wait loop with iommu lock The unlocked polling of the ComWaitInt bit in the IOMMU completion wait path is racy. Protect it with the iommu lock. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a96d8c049a88..042fdc27bc92 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -101,10 +101,10 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) */ static int iommu_completion_wait(struct amd_iommu *iommu) { - int ret, ready = 0; + int ret = 0, ready = 0; unsigned status = 0; struct iommu_cmd cmd; - unsigned long i = 0; + unsigned long flags, i = 0; memset(&cmd, 0, sizeof(cmd)); cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; @@ -112,10 +112,12 @@ static int iommu_completion_wait(struct amd_iommu *iommu) iommu->need_sync = 0; - ret = iommu_queue_command(iommu, &cmd); + spin_lock_irqsave(&iommu->lock, flags); + + ret = __iommu_queue_command(iommu, &cmd); if (ret) - return ret; + goto out; while (!ready && (i < EXIT_LOOP_COUNT)) { ++i; @@ -130,6 +132,8 @@ static int iommu_completion_wait(struct amd_iommu *iommu) if (unlikely((i == EXIT_LOOP_COUNT) && printk_ratelimit())) printk(KERN_WARNING "AMD IOMMU: Completion wait loop failed\n"); +out: + spin_unlock_irqrestore(&iommu->lock, flags); return 0; } -- cgit v1.2.2 From 279b0bbba2bb647348ad90e183b3960aa99eccfd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 18 Sep 2008 23:55:27 -0700 Subject: x86: fix arch/x86/kernel/cpu/mtrr/main.c warning fix this warning reported by Andrew Morton: > arch/x86/kernel/cpu/mtrr/main.c: In function 'mtrr_bp_init': > arch/x86/kernel/cpu/mtrr/main.c:1170: warning: 'extra_remove_base' may be used uninitialized in this function the warning is bogus but the logic that prevents uninitialized use is a bit convoluted so simplify it all. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b117d7f8a564..8a7c79234be6 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1218,11 +1218,10 @@ static int __init mtrr_cleanup(unsigned address_bits) memset(range, 0, sizeof(range)); extra_remove_size = 0; - if (mtrr_tom2) { - extra_remove_base = 1 << (32 - PAGE_SHIFT); + extra_remove_base = 1 << (32 - PAGE_SHIFT); + if (mtrr_tom2) extra_remove_size = (mtrr_tom2 >> PAGE_SHIFT) - extra_remove_base; - } nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); range_sums = sum_ranges(range, nr_range); -- cgit v1.2.2 From dbcc112e3b5367e81a845b082933506b0ff1d1e2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Sep 2008 15:04:26 +0200 Subject: AMD IOMMU: check for invalid device pointers Currently AMD IOMMU code triggers a BUG_ON if NULL is passed as the device. This is inconsistent with other IOMMU implementations. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 43 +++++++++++++++++++++++++++++++++++-------- 1 file changed, 35 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 01c68c38840d..695e0fc41b10 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -645,6 +645,18 @@ static void set_device_domain(struct amd_iommu *iommu, * *****************************************************************************/ +/* + * This function checks if the driver got a valid device from the caller to + * avoid dereferencing invalid pointers. + */ +static bool check_device(struct device *dev) +{ + if (!dev || !dev->dma_mask) + return false; + + return true; +} + /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the @@ -661,18 +673,19 @@ static int get_device_resources(struct device *dev, struct pci_dev *pcidev; u16 _bdf; - BUG_ON(!dev || dev->bus != &pci_bus_type || !dev->dma_mask); + *iommu = NULL; + *domain = NULL; + *bdf = 0xffff; + + if (dev->bus != &pci_bus_type) + return 0; pcidev = to_pci_dev(dev); _bdf = calc_devid(pcidev->bus->number, pcidev->devfn); /* device not translated by any IOMMU in the system? */ - if (_bdf > amd_iommu_last_bdf) { - *iommu = NULL; - *domain = NULL; - *bdf = 0xffff; + if (_bdf > amd_iommu_last_bdf) return 0; - } *bdf = amd_iommu_alias_table[_bdf]; @@ -826,6 +839,9 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, u16 devid; dma_addr_t addr; + if (!check_device(dev)) + return bad_dma_address; + get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) @@ -860,7 +876,8 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, struct protection_domain *domain; u16 devid; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) /* device not handled by any AMD IOMMU */ return; @@ -910,6 +927,9 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, phys_addr_t paddr; int mapped_elems = 0; + if (!check_device(dev)) + return 0; + get_device_resources(dev, &iommu, &domain, &devid); if (!iommu || !domain) @@ -967,7 +987,8 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, u16 devid; int i; - if (!get_device_resources(dev, &iommu, &domain, &devid)) + if (!check_device(dev) || + !get_device_resources(dev, &iommu, &domain, &devid)) return; spin_lock_irqsave(&domain->lock, flags); @@ -999,6 +1020,9 @@ static void *alloc_coherent(struct device *dev, size_t size, u16 devid; phys_addr_t paddr; + if (!check_device(dev)) + return NULL; + virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return 0; @@ -1047,6 +1071,9 @@ static void free_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; + if (!check_device(dev)) + return; + get_device_resources(dev, &iommu, &domain, &devid); if (!iommu || !domain) -- cgit v1.2.2 From 270cab2426cdc6307725e4f1f46ecf8ab8e69193 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Sep 2008 15:49:46 +0200 Subject: AMD IOMMU: move TLB flushing to the map/unmap helper functions This patch moves the invocation of the flushing functions to the map/unmap helpers because its common code in all dma_ops relevant mapping/unmapping code. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 695e0fc41b10..691e023695ad 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -795,6 +795,9 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; + if (unlikely(iommu_has_npcache(iommu))) + iommu_flush_pages(iommu, dma_dom->domain.id, address, size); + out: return address; } @@ -825,6 +828,8 @@ static void __unmap_single(struct amd_iommu *iommu, } dma_ops_free_addresses(dma_dom, dma_addr, pages); + + iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); } /* @@ -853,9 +858,6 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, if (addr == bad_dma_address) goto out; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, addr, size); - if (iommu->need_sync) iommu_completion_wait(iommu); @@ -885,8 +887,6 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, __unmap_single(iommu, domain->priv, dma_addr, size, dir); - iommu_flush_pages(iommu, domain->id, dma_addr, size); - if (iommu->need_sync) iommu_completion_wait(iommu); @@ -948,9 +948,6 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, mapped_elems++; } else goto unmap; - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); } if (iommu->need_sync) @@ -996,8 +993,6 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, for_each_sg(sglist, s, nelems, i) { __unmap_single(iommu, domain->priv, s->dma_address, s->dma_length, dir); - iommu_flush_pages(iommu, domain->id, s->dma_address, - s->dma_length); s->dma_address = s->dma_length = 0; } @@ -1048,9 +1043,6 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out; } - if (iommu_has_npcache(iommu)) - iommu_flush_pages(iommu, domain->id, *dma_addr, size); - if (iommu->need_sync) iommu_completion_wait(iommu); @@ -1082,7 +1074,6 @@ static void free_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - iommu_flush_pages(iommu, domain->id, dma_addr, size); if (iommu->need_sync) iommu_completion_wait(iommu); -- cgit v1.2.2 From 2842e5bf3115193f05dc9dac20f940e7abf44c1a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 18 Sep 2008 15:23:43 +0200 Subject: x86: move GART TLB flushing options to generic code The GART currently implements the iommu=[no]fullflush command line parameters which influence its IO/TLB flushing strategy. This patch makes these parameters generic so that they can be used by the AMD IOMMU too. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 13 +++++++++++++ arch/x86/kernel/pci-gart_64.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a1408abcc62..d2f2c0158dc1 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -16,6 +16,15 @@ EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ +int iommu_fullflush; + #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; @@ -171,6 +180,10 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "nomerge", 7)) iommu_merge = 0; + if (!strncmp(p, "fullflush", 8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; if (!strncmp(p, "forcesac", 8)) iommu_sac_force = 1; if (!strncmp(p, "allowdac", 8)) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 9739d5682093..508ef470b27f 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -45,15 +45,6 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ -/* - * If this is disabled the IOMMU will use an optimized flushing strategy - * of only flushing when an mapping is reused. With it true the GART is - * flushed for every mapping. Problem is that doing the lazy flush seems - * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). - */ -int iommu_fullflush = 1; - /* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); /* Guarded by iommu_bitmap_lock: */ @@ -901,10 +892,6 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; - if (!strncmp(p, "fullflush", 8)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush", 11)) - iommu_fullflush = 0; if (!strncmp(p, "noagp", 5)) no_agp = 1; if (!strncmp(p, "noaperture", 10)) -- cgit v1.2.2 From 1c65577398589bb44ab0980f9b9d30804b48a5db Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Sep 2008 18:40:05 +0200 Subject: AMD IOMMU: implement lazy IO/TLB flushing The IO/TLB flushing on every unmaping operation is the most expensive part in AMD IOMMU code and not strictly necessary. It is sufficient to do the flush before any entries are reused. This is patch implements lazy IO/TLB flushing which does exactly this. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 26 ++++++++++++++++++++++---- arch/x86/kernel/amd_iommu_init.c | 7 ++++++- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 691e023695ad..679f2a8e22ee 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -203,6 +203,14 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, return 0; } +/* Flush the whole IO/TLB for a given protection domain */ +static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) +{ + u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; + + iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); +} + /**************************************************************************** * * The functions below are used the create the page table mappings for @@ -386,14 +394,18 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, PAGE_SIZE) >> PAGE_SHIFT; limit = limit < size ? limit : size; - if (dom->next_bit >= limit) + if (dom->next_bit >= limit) { dom->next_bit = 0; + dom->need_flush = true; + } address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, 0 , boundary_size, 0); - if (address == -1) + if (address == -1) { address = iommu_area_alloc(dom->bitmap, limit, 0, pages, 0, boundary_size, 0); + dom->need_flush = true; + } if (likely(address != -1)) { dom->next_bit = address + pages; @@ -553,6 +565,8 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->bitmap[0] = 1; dma_dom->next_bit = 0; + dma_dom->need_flush = false; + /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { @@ -795,7 +809,10 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; - if (unlikely(iommu_has_npcache(iommu))) + if (unlikely(dma_dom->need_flush && !iommu_fullflush)) { + iommu_flush_tlb(iommu, dma_dom->domain.id); + dma_dom->need_flush = false; + } else if (unlikely(iommu_has_npcache(iommu))) iommu_flush_pages(iommu, dma_dom->domain.id, address, size); out: @@ -829,7 +846,8 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); - iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); + if (iommu_fullflush) + iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); } /* diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index a69cc0f52042..f2fa8dc81beb 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -995,6 +995,11 @@ int __init amd_iommu_init(void) else printk("disabled\n"); + if (iommu_fullflush) + printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); + else + printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); + out: return ret; @@ -1057,7 +1062,7 @@ void __init amd_iommu_detect(void) static int __init parse_amd_iommu_options(char *str) { for (; *str; ++str) { - if (strcmp(str, "isolate") == 0) + if (strncmp(str, "isolate", 7) == 0) amd_iommu_isolate = 1; } -- cgit v1.2.2 From 5507eef835c9c941e69d6d96e4b43af23eeb4ac9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Sep 2008 19:01:02 +0200 Subject: AMD IOMMU: add branch hints to completion wait checks This patch adds branch hints to the cecks if a completion_wait is necessary. The completion_waits in the mapping paths are unlikly because they will only happen on software implementations of AMD IOMMU which don't exists today or with lazy IO/TLB flushing when the allocator wraps around the address space. With lazy IO/TLB flushing the completion_wait in the unmapping path is unlikely too. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 679f2a8e22ee..d743aa0adccc 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -876,7 +876,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, if (addr == bad_dma_address) goto out; - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -905,7 +905,7 @@ static void unmap_single(struct device *dev, dma_addr_t dma_addr, __unmap_single(iommu, domain->priv, dma_addr, size, dir); - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -968,7 +968,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, goto unmap; } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -1014,7 +1014,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist, s->dma_address = s->dma_length = 0; } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); @@ -1061,7 +1061,7 @@ static void *alloc_coherent(struct device *dev, size_t size, goto out; } - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); out: @@ -1093,7 +1093,7 @@ static void free_coherent(struct device *dev, size_t size, __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); - if (iommu->need_sync) + if (unlikely(iommu->need_sync)) iommu_completion_wait(iommu); spin_unlock_irqrestore(&domain->lock, flags); -- cgit v1.2.2 From 6d4f343f84993eb0d5864c0823dc9babd171a33a Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 4 Sep 2008 19:18:02 +0200 Subject: AMD IOMMU: align alloc_coherent addresses properly The API definition for dma_alloc_coherent states that the bus address has to be aligned to the next power of 2 boundary greater than the allocation size. This is violated by AMD IOMMU so far and this patch fixes it. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index d743aa0adccc..15792ed082e0 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -383,7 +383,8 @@ static unsigned long dma_mask_to_pages(unsigned long mask) */ static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, - unsigned int pages) + unsigned int pages, + unsigned long align_mask) { unsigned long limit = dma_mask_to_pages(*dev->dma_mask); unsigned long address; @@ -400,10 +401,10 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, } address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages, - 0 , boundary_size, 0); + 0 , boundary_size, align_mask); if (address == -1) { address = iommu_area_alloc(dom->bitmap, limit, 0, pages, - 0, boundary_size, 0); + 0, boundary_size, align_mask); dom->need_flush = true; } @@ -787,17 +788,22 @@ static dma_addr_t __map_single(struct device *dev, struct dma_ops_domain *dma_dom, phys_addr_t paddr, size_t size, - int dir) + int dir, + bool align) { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start; unsigned int pages; + unsigned long align_mask = 0; int i; pages = iommu_num_pages(paddr, size); paddr &= PAGE_MASK; - address = dma_ops_alloc_addresses(dev, dma_dom, pages); + if (align) + align_mask = (1UL << get_order(size)) - 1; + + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask); if (unlikely(address == bad_dma_address)) goto out; @@ -872,7 +878,7 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false); if (addr == bad_dma_address) goto out; @@ -959,7 +965,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, paddr = sg_phys(s); s->dma_address = __map_single(dev, iommu, domain->priv, - paddr, s->length, dir); + paddr, s->length, dir, false); if (s->dma_address) { s->dma_length = s->length; @@ -1053,7 +1059,7 @@ static void *alloc_coherent(struct device *dev, size_t size, spin_lock_irqsave(&domain->lock, flags); *dma_addr = __map_single(dev, iommu, domain->priv, paddr, - size, DMA_BIDIRECTIONAL); + size, DMA_BIDIRECTIONAL, true); if (*dma_addr == bad_dma_address) { free_pages((unsigned long)virt_addr, get_order(size)); -- cgit v1.2.2 From 335503e57b6b8de04cec5d27eb2c3d09ff98905b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 5 Sep 2008 14:29:07 +0200 Subject: AMD IOMMU: add event buffer allocation This patch adds the allocation of a event buffer for each AMD IOMMU in the system. The hardware will log events like device page faults or other errors to this buffer once this is enabled. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index f2fa8dc81beb..41ce8d5d626e 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -417,6 +417,30 @@ static void __init free_command_buffer(struct amd_iommu *iommu) free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); } +/* allocates the memory where the IOMMU will log its events to */ +static u8 * __init alloc_event_buffer(struct amd_iommu *iommu) +{ + u64 entry; + iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(EVT_BUFFER_SIZE)); + + if (iommu->evt_buf == NULL) + return NULL; + + entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK; + memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET, + &entry, sizeof(entry)); + + iommu->evt_buf_size = EVT_BUFFER_SIZE; + + return iommu->evt_buf; +} + +static void __init free_event_buffer(struct amd_iommu *iommu) +{ + free_pages((unsigned long)iommu->evt_buf, get_order(EVT_BUFFER_SIZE)); +} + /* sets a specific bit in the device table entry. */ static void set_dev_entry_bit(u16 devid, u8 bit) { @@ -622,6 +646,7 @@ static int __init init_iommu_devices(struct amd_iommu *iommu) static void __init free_iommu_one(struct amd_iommu *iommu) { free_command_buffer(iommu); + free_event_buffer(iommu); iommu_unmap_mmio_space(iommu); } @@ -661,6 +686,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->cmd_buf) return -ENOMEM; + iommu->evt_buf = alloc_event_buffer(iommu); + if (!iommu->evt_buf) + return -ENOMEM; + init_iommu_from_pci(iommu); init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); -- cgit v1.2.2 From ee893c24edb8ebab9a3fb66566855572579ad616 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Sep 2008 14:48:04 +0200 Subject: AMD IOMMU: save pci segment from ACPI tables This patch adds the pci_seg field to the amd_iommu structure and fills it with the corresponding value from the ACPI table. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 41ce8d5d626e..b50234ef91ed 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -676,6 +676,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) */ iommu->devid = h->devid; iommu->cap_ptr = h->cap_ptr; + iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; iommu->mmio_base = iommu_map_mmio_space(h->mmio_phys); if (!iommu->mmio_base) -- cgit v1.2.2 From 3eaf28a1cd2686aaa185b54d5a5e18e91b41f7f2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Mon, 8 Sep 2008 15:55:10 +0200 Subject: AMD IOMMU: save pci_dev instead of devid We need the pci_dev later anyways to enable MSI for the IOMMU hardware. So remove the devid pointing to the BDF and replace it with the pci_dev structure where the IOMMU is implemented. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index b50234ef91ed..a7eb89d8923d 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -242,9 +242,12 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) /* Function to enable the hardware */ void __init iommu_enable(struct amd_iommu *iommu) { - printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at "); - print_devid(iommu->devid, 0); - printk(" cap 0x%hx\n", iommu->cap_ptr); + printk(KERN_INFO "AMD IOMMU: Enabling IOMMU " + "at %02x:%02x.%x cap 0x%hx\n", + iommu->dev->bus->number, + PCI_SLOT(iommu->dev->devfn), + PCI_FUNC(iommu->dev->devfn), + iommu->cap_ptr); iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } @@ -511,15 +514,14 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) */ static void __init init_iommu_from_pci(struct amd_iommu *iommu) { - int bus = PCI_BUS(iommu->devid); - int dev = PCI_SLOT(iommu->devid); - int fn = PCI_FUNC(iommu->devid); int cap_ptr = iommu->cap_ptr; u32 range; - iommu->cap = read_pci_config(bus, dev, fn, cap_ptr+MMIO_CAP_HDR_OFFSET); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, + &iommu->cap); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, + &range); - range = read_pci_config(bus, dev, fn, cap_ptr+MMIO_RANGE_OFFSET); iommu->first_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_FD(range)); iommu->last_device = calc_devid(MMIO_GET_BUS(range), @@ -674,7 +676,10 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) /* * Copy data from ACPI table entry to the iommu struct */ - iommu->devid = h->devid; + iommu->dev = pci_get_bus_and_slot(PCI_BUS(h->devid), h->devid & 0xff); + if (!iommu->dev) + return 1; + iommu->cap_ptr = h->cap_ptr; iommu->pci_seg = h->pci_seg; iommu->mmio_phys = h->mmio_phys; @@ -695,6 +700,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); + pci_enable_device(iommu->dev); + return 0; } -- cgit v1.2.2 From a80dc3e0e0dc8393158de317d66ae0f345dc58f9 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 11 Sep 2008 16:51:41 +0200 Subject: AMD IOMMU: add MSI interrupt support The AMD IOMMU can generate interrupts for various reasons. This patch adds the basic interrupt enabling infrastructure to the driver. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/amd_iommu.c | 11 +++++ arch/x86/kernel/amd_iommu_init.c | 99 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 110 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..39fd3f42696d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -553,6 +553,7 @@ config CALGARY_IOMMU_ENABLED_BY_DEFAULT config AMD_IOMMU bool "AMD IOMMU support" select SWIOTLB + select PCI_MSI depends on X86_64 && PCI && ACPI help With this option you can enable support for AMD IOMMU hardware in diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 15792ed082e0..0e494b9d5f20 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -49,6 +49,17 @@ static int iommu_has_npcache(struct amd_iommu *iommu) return iommu->cap & IOMMU_CAP_NPCACHE; } +/**************************************************************************** + * + * Interrupt handling functions + * + ****************************************************************************/ + +irqreturn_t amd_iommu_int_handler(int irq, void *data) +{ + return IRQ_NONE; +} + /**************************************************************************** * * IOMMU command queuing functions diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index a7eb89d8923d..14a06464a694 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -515,17 +517,20 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m) static void __init init_iommu_from_pci(struct amd_iommu *iommu) { int cap_ptr = iommu->cap_ptr; - u32 range; + u32 range, misc; pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, &iommu->cap); pci_read_config_dword(iommu->dev, cap_ptr + MMIO_RANGE_OFFSET, &range); + pci_read_config_dword(iommu->dev, cap_ptr + MMIO_MISC_OFFSET, + &misc); iommu->first_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_FD(range)); iommu->last_device = calc_devid(MMIO_GET_BUS(range), MMIO_GET_LD(range)); + iommu->evt_msi_num = MMIO_MSI_NUM(misc); } /* @@ -696,6 +701,8 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) if (!iommu->evt_buf) return -ENOMEM; + iommu->int_enabled = false; + init_iommu_from_pci(iommu); init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); @@ -741,6 +748,95 @@ static int __init init_iommu_all(struct acpi_table_header *table) return 0; } +/**************************************************************************** + * + * The following functions initialize the MSI interrupts for all IOMMUs + * in the system. Its a bit challenging because there could be multiple + * IOMMUs per PCI BDF but we can call pci_enable_msi(x) only once per + * pci_dev. + * + ****************************************************************************/ + +static int __init iommu_setup_msix(struct amd_iommu *iommu) +{ + struct amd_iommu *curr; + struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */ + int nvec = 0, i; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) { + entries[nvec].entry = curr->evt_msi_num; + entries[nvec].vector = 0; + curr->int_enabled = true; + nvec++; + } + } + + if (pci_enable_msix(iommu->dev, entries, nvec)) { + pci_disable_msix(iommu->dev); + return 1; + } + + for (i = 0; i < nvec; ++i) { + int r = request_irq(entries->vector, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + if (r) + goto out_free; + } + + return 0; + +out_free: + for (i -= 1; i >= 0; --i) + free_irq(entries->vector, NULL); + + pci_disable_msix(iommu->dev); + + return 1; +} + +static int __init iommu_setup_msi(struct amd_iommu *iommu) +{ + int r; + struct amd_iommu *curr; + + list_for_each_entry(curr, &amd_iommu_list, list) { + if (curr->dev == iommu->dev) + curr->int_enabled = true; + } + + + if (pci_enable_msi(iommu->dev)) + return 1; + + r = request_irq(iommu->dev->irq, amd_iommu_int_handler, + IRQF_SAMPLE_RANDOM, + "AMD IOMMU", + NULL); + + if (r) { + pci_disable_msi(iommu->dev); + return 1; + } + + return 0; +} + +static int __init iommu_init_msi(struct amd_iommu *iommu) +{ + if (iommu->int_enabled) + return 0; + + if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX)) + return iommu_setup_msix(iommu); + else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI)) + return iommu_setup_msi(iommu); + + return 1; +} + /**************************************************************************** * * The next functions belong to the third pass of parsing the ACPI @@ -862,6 +958,7 @@ static void __init enable_iommus(void) list_for_each_entry(iommu, &amd_iommu_list, list) { iommu_set_exclusion_range(iommu); + iommu_init_msi(iommu); iommu_enable(iommu); } } -- cgit v1.2.2 From 90008ee4b811c944455752dcb72b291a5ba81b53 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Sep 2008 16:41:05 +0200 Subject: AMD IOMMU: add event handling code This patch adds code for polling and printing out events generated by the AMD IOMMU. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 87 +++++++++++++++++++++++++++++++++++++++- arch/x86/kernel/amd_iommu_init.c | 1 - 2 files changed, 86 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0e494b9d5f20..0cb8fd2359f5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -55,9 +55,94 @@ static int iommu_has_npcache(struct amd_iommu *iommu) * ****************************************************************************/ +static void iommu_print_event(void *__evt) +{ + u32 *event = __evt; + int type = (event[1] >> EVENT_TYPE_SHIFT) & EVENT_TYPE_MASK; + int devid = (event[0] >> EVENT_DEVID_SHIFT) & EVENT_DEVID_MASK; + int domid = (event[1] >> EVENT_DOMID_SHIFT) & EVENT_DOMID_MASK; + int flags = (event[1] >> EVENT_FLAGS_SHIFT) & EVENT_FLAGS_MASK; + u64 address = (u64)(((u64)event[3]) << 32) | event[2]; + + printk(KERN_ERR "AMD IOMMU: Event logged ["); + + switch (type) { + case EVENT_TYPE_ILL_DEV: + printk("ILLEGAL_DEV_TABLE_ENTRY device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_IO_FAULT: + printk("IO_PAGE_FAULT device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_DEV_TAB_ERR: + printk("DEV_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + case EVENT_TYPE_PAGE_TAB_ERR: + printk("PAGE_TAB_HARDWARE_ERROR device=%02x:%02x.%x " + "domain=0x%04x address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + domid, address, flags); + break; + case EVENT_TYPE_ILL_CMD: + printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); + break; + case EVENT_TYPE_CMD_HARD_ERR: + printk("COMMAND_HARDWARE_ERROR address=0x%016llx " + "flags=0x%04x]\n", address, flags); + break; + case EVENT_TYPE_IOTLB_INV_TO: + printk("IOTLB_INV_TIMEOUT device=%02x:%02x.%x " + "address=0x%016llx]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address); + break; + case EVENT_TYPE_INV_DEV_REQ: + printk("INVALID_DEVICE_REQUEST device=%02x:%02x.%x " + "address=0x%016llx flags=0x%04x]\n", + PCI_BUS(devid), PCI_SLOT(devid), PCI_FUNC(devid), + address, flags); + break; + default: + printk(KERN_ERR "UNKNOWN type=0x%02x]\n", type); + } +} + +static void iommu_poll_events(struct amd_iommu *iommu) +{ + u32 head, tail; + unsigned long flags; + + spin_lock_irqsave(&iommu->lock, flags); + + head = readl(iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + tail = readl(iommu->mmio_base + MMIO_EVT_TAIL_OFFSET); + + while (head != tail) { + iommu_print_event(iommu->evt_buf + head); + head = (head + EVENT_ENTRY_SIZE) % iommu->evt_buf_size; + } + + writel(head, iommu->mmio_base + MMIO_EVT_HEAD_OFFSET); + + spin_unlock_irqrestore(&iommu->lock, flags); +} + irqreturn_t amd_iommu_int_handler(int irq, void *data) { - return IRQ_NONE; + struct amd_iommu *iommu; + + list_for_each_entry(iommu, &amd_iommu_list, list) + iommu_poll_events(iommu); + + return IRQ_HANDLED; } /**************************************************************************** diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 14a06464a694..eed488892c01 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -32,7 +32,6 @@ /* * definitions for the ACPI scanning code */ -#define PCI_BUS(x) (((x) >> 8) & 0xff) #define IVRS_HEADER_LENGTH 48 #define ACPI_IVHD_TYPE 0x10 -- cgit v1.2.2 From 126c52be4b1d2eb667a1d140f0ceaff9d353f700 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Sep 2008 16:47:35 +0200 Subject: AMD IOMMU: enable event logging The code to log IOMMU events is in place now. So enable event logging with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index eed488892c01..1974b73fece6 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -253,6 +253,13 @@ void __init iommu_enable(struct amd_iommu *iommu) iommu_feature_enable(iommu, CONTROL_IOMMU_EN); } +/* Function to enable IOMMU event logging and event interrupts */ +void __init iommu_enable_event_logging(struct amd_iommu *iommu) +{ + iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN); + iommu_feature_enable(iommu, CONTROL_EVT_INT_EN); +} + /* * mapping and unmapping functions for the IOMMU MMIO space. Each AMD IOMMU in * the system has one. @@ -958,6 +965,7 @@ static void __init enable_iommus(void) list_for_each_entry(iommu, &amd_iommu_list, list) { iommu_set_exclusion_range(iommu); iommu_init_msi(iommu); + iommu_enable_event_logging(iommu); iommu_enable(iommu); } } -- cgit v1.2.2 From a22131a223147016041b5e5cd0ae5ab61ef4177e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Sep 2008 17:55:28 +0200 Subject: AMD IOMMU: allow IO page faults from devices There is a bit in the device entry to suppress all IO page faults generated by a device. This bit was set until now because there was no event logging. Now that there is event logging this patch allows IO page faults from devices to see them in the kernel log. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 1974b73fece6..8c1375985557 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -950,7 +950,6 @@ static void init_device_table(void) for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { set_dev_entry_bit(devid, DEV_ENTRY_VALID); set_dev_entry_bit(devid, DEV_ENTRY_TRANSLATION); - set_dev_entry_bit(devid, DEV_ENTRY_NO_PAGE_FAULT); } } -- cgit v1.2.2 From b39ba6ad004a31bf2a08ba2b08c1e0f9b3530bb7 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Tue, 9 Sep 2008 18:40:46 +0200 Subject: AMD IOMMU: add dma_supported callback This function determines if the AMD IOMMU implementation is responsible for a given device. So the DMA layer can get this information from the driver. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 0cb8fd2359f5..a6a6f8ed1cf5 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1204,6 +1204,30 @@ free_mem: free_pages((unsigned long)virt_addr, get_order(size)); } +/* + * This function is called by the DMA layer to find out if we can handle a + * particular device. It is part of the dma_ops. + */ +static int amd_iommu_dma_supported(struct device *dev, u64 mask) +{ + u16 bdf; + struct pci_dev *pcidev; + + /* No device or no PCI device */ + if (!dev || dev->bus != &pci_bus_type) + return 0; + + pcidev = to_pci_dev(dev); + + bdf = calc_devid(pcidev->bus->number, pcidev->devfn); + + /* Out of our scope? */ + if (bdf > amd_iommu_last_bdf) + return 0; + + return 1; +} + /* * The function for pre-allocating protection domains. * @@ -1247,6 +1271,7 @@ static struct dma_mapping_ops amd_iommu_dma_ops = { .unmap_single = unmap_single, .map_sg = map_sg, .unmap_sg = unmap_sg, + .dma_supported = amd_iommu_dma_supported, }; /* -- cgit v1.2.2 From bd60b735c658e6e8c656e89771d281bcfcf51279 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 11 Sep 2008 10:24:48 +0200 Subject: AMD IOMMU: don't assign preallocated protection domains to devices In isolation mode the protection domains for the devices are preallocated and preassigned. This is bad if a device should be passed to a virtualization guest because the IOMMU code does not know if it is in use by a driver. This patch changes the code to assign the device to the preallocated domain only if there are dma mapping requests for it. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 43 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 38 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a6a6f8ed1cf5..7c1791447451 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -33,6 +33,10 @@ static DEFINE_RWLOCK(amd_iommu_devtable_lock); +/* A list of preallocated protection domains */ +static LIST_HEAD(iommu_pd_list); +static DEFINE_SPINLOCK(iommu_pd_list_lock); + /* * general struct to manage commands send to an IOMMU */ @@ -663,6 +667,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, dma_dom->next_bit = 0; dma_dom->need_flush = false; + dma_dom->target_dev = 0xffff; /* Intialize the exclusion range if necessary */ if (iommu->exclusion_start && @@ -768,6 +773,33 @@ static bool check_device(struct device *dev) return true; } +/* + * In this function the list of preallocated protection domains is traversed to + * find the domain for a specific device + */ +static struct dma_ops_domain *find_protection_domain(u16 devid) +{ + struct dma_ops_domain *entry, *ret = NULL; + unsigned long flags; + + if (list_empty(&iommu_pd_list)) + return NULL; + + spin_lock_irqsave(&iommu_pd_list_lock, flags); + + list_for_each_entry(entry, &iommu_pd_list, list) { + if (entry->target_dev == devid) { + ret = entry; + list_del(&ret->list); + break; + } + } + + spin_unlock_irqrestore(&iommu_pd_list_lock, flags); + + return ret; +} + /* * In the dma_ops path we only have the struct device. This function * finds the corresponding IOMMU, the protection domain and the @@ -803,9 +835,11 @@ static int get_device_resources(struct device *dev, *iommu = amd_iommu_rlookup_table[*bdf]; if (*iommu == NULL) return 0; - dma_dom = (*iommu)->default_dom; *domain = domain_for_device(*bdf); if (*domain == NULL) { + dma_dom = find_protection_domain(*bdf); + if (!dma_dom) + dma_dom = (*iommu)->default_dom; *domain = &dma_dom->domain; set_device_domain(*iommu, *domain, *bdf); printk(KERN_INFO "AMD IOMMU: Using protection domain %d for " @@ -1257,10 +1291,9 @@ void prealloc_protection_domains(void) if (!dma_dom) continue; init_unity_mappings_for_device(dma_dom, devid); - set_device_domain(iommu, &dma_dom->domain, devid); - printk(KERN_INFO "AMD IOMMU: Allocated domain %d for device ", - dma_dom->domain.id); - print_devid(devid, 1); + dma_dom->target_dev = devid; + + list_add_tail(&dma_dom->list, &iommu_pd_list); } } -- cgit v1.2.2 From 38ddf41b198e21d3ecbe5752e875857b7ce7589e Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 11 Sep 2008 10:38:32 +0200 Subject: AMD IOMMU: some set_device_domain cleanups Remove some magic numbers and split the pte_root using standard functions. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 7c1791447451..a34d8e915e3a 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -739,12 +739,13 @@ static void set_device_domain(struct amd_iommu *iommu, u64 pte_root = virt_to_phys(domain->pt_root); - pte_root |= (domain->mode & 0x07) << 9; - pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | 2; + pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) + << DEV_ENTRY_MODE_SHIFT; + pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; write_lock_irqsave(&amd_iommu_devtable_lock, flags); - amd_iommu_dev_table[devid].data[0] = pte_root; - amd_iommu_dev_table[devid].data[1] = pte_root >> 32; + amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); + amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); amd_iommu_dev_table[devid].data[2] = domain->id; amd_iommu_pd_table[devid] = domain; -- cgit v1.2.2 From 13d9fead3daa0efa1b8bb6ae59650e4453b39128 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 10 Sep 2008 20:19:40 +0900 Subject: AMD IOMMU: avoid unnecessary low zone allocation in alloc_coherent x86's common alloc_coherent (dma_alloc_coherent in dma-mapping.h) sets up the gfp flag according to the device dma_mask but AMD IOMMU doesn't need it for devices that the IOMMU can do virtual mappings for. This patch avoids unnecessary low zone allocation. Signed-off-by: FUJITA Tomonori Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index a34d8e915e3a..e4866660463e 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1173,6 +1173,9 @@ static void *alloc_coherent(struct device *dev, size_t size, if (!check_device(dev)) return NULL; + if (!get_device_resources(dev, &iommu, &domain, &devid)) + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return 0; @@ -1180,8 +1183,6 @@ static void *alloc_coherent(struct device *dev, size_t size, memset(virt_addr, 0, size); paddr = virt_to_phys(virt_addr); - get_device_resources(dev, &iommu, &domain, &devid); - if (!iommu || !domain) { *dma_addr = (dma_addr_t)paddr; return virt_addr; -- cgit v1.2.2 From c97ac5359e6897abe22770740294dda185bac30d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 11 Sep 2008 10:59:15 +0200 Subject: AMD IOMMU: replace memset with __GFP_ZERO in alloc_coherent Remove the memset and use __GFP_ZERO at allocation time instead. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index e4866660463e..f405a61f61fc 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1176,11 +1176,11 @@ static void *alloc_coherent(struct device *dev, size_t size, if (!get_device_resources(dev, &iommu, &domain, &devid)) flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + flag |= __GFP_ZERO; virt_addr = (void *)__get_free_pages(flag, get_order(size)); if (!virt_addr) return 0; - memset(virt_addr, 0, size); paddr = virt_to_phys(virt_addr); if (!iommu || !domain) { -- cgit v1.2.2 From 6754086ce67c0a1f5d7eac612102368781e14588 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 12:17:00 +0200 Subject: AMD IOMMU: simplify dma_mask_to_pages The current calculation is very complicated. This patch replaces it with a much simpler version. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f405a61f61fc..db64482b1796 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -472,8 +472,7 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, ****************************************************************************/ static unsigned long dma_mask_to_pages(unsigned long mask) { - return (mask >> PAGE_SHIFT) + - (PAGE_ALIGN(mask & ~PAGE_MASK) >> PAGE_SHIFT); + return PAGE_ALIGN(mask) >> PAGE_SHIFT; } /* -- cgit v1.2.2 From d58befd3a0110c93d70756537b4d01d05a9e6e12 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 12:19:58 +0200 Subject: AMD IOMMU: free domain bitmap with its allocation order The amd_iommu_pd_alloc_bitmap is allocated with a calculated order and freed with order 1. This is not a bug since the calculated order always evaluates to 1, but its unclean code. So replace the 1 with the calculation in the release path. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 8c1375985557..e60f4cd29eb2 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -1144,7 +1144,8 @@ out: return ret; free: - free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1); + free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, + get_order(MAX_DOMAIN_ID/8)); free_pages((unsigned long)amd_iommu_pd_table, get_order(rlookup_table_size)); -- cgit v1.2.2 From 199d0d501202f077fe647a5c14fe046b17abc46b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 16:45:59 +0200 Subject: AMD IOMMU: remove unnecessary cast to u64 in the init code The ctrl variable is only u32 and readl also returns a 32 bit value. So the cast to u64 is pointless. Remove it with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index e60f4cd29eb2..505fc04e8969 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -235,7 +235,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit) { u32 ctrl; - ctrl = (u64)readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); + ctrl = readl(iommu->mmio_base + MMIO_CONTROL_OFFSET); ctrl &= ~(1 << bit); writel(ctrl, iommu->mmio_base + MMIO_CONTROL_OFFSET); } -- cgit v1.2.2 From b514e55569855bbaab782a8ec073630ed4e99c68 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 17:14:27 +0200 Subject: AMD IOMMU: calculate IVHD size with a function The current calculation of the IVHD entry size is hard to read. So move this code to a seperate function to make it more clear what this calculation does. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 505fc04e8969..80250e63bd07 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -296,6 +296,14 @@ static void __init iommu_unmap_mmio_space(struct amd_iommu *iommu) * ****************************************************************************/ +/* + * This function calculates the length of a given IVHD entry + */ +static inline int ivhd_entry_length(u8 *ivhd) +{ + return 0x04 << (*ivhd >> 6); +} + /* * This function reads the last device id the IOMMU has to handle from the PCI * capability header for this IOMMU @@ -340,7 +348,7 @@ static int __init find_last_devid_from_ivhd(struct ivhd_header *h) default: break; } - p += 0x04 << (*p >> 6); + p += ivhd_entry_length(p); } WARN_ON(p != end); @@ -641,7 +649,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu, break; } - p += 0x04 << (e->type >> 6); + p += ivhd_entry_length(p); } } -- cgit v1.2.2 From 23c1713fe9e6ac886a4d44415298d0cbb2e83df2 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 17 Sep 2008 17:18:17 +0200 Subject: AMD IOMMU: use cmd_buf_size when freeing the command buffer The command buffer release function uses the CMD_BUF_SIZE macro for get_order. Replace this with iommu->cmd_buf_size which is more reliable about the actual size of the buffer. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 80250e63bd07..db0c83af44de 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -433,7 +433,8 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu) static void __init free_command_buffer(struct amd_iommu *iommu) { - free_pages((unsigned long)iommu->cmd_buf, get_order(CMD_BUFFER_SIZE)); + free_pages((unsigned long)iommu->cmd_buf, + get_order(iommu->cmd_buf_size)); } /* allocates the memory where the IOMMU will log its events to */ -- cgit v1.2.2 From 832a90c30485117d65180cc9a8d9869c1b158570 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 18 Sep 2008 15:54:23 +0200 Subject: AMD IOMMU: use coherent_dma_mask in alloc_coherent The alloc_coherent implementation for AMD IOMMU currently uses *dev->dma_mask per default. This patch changes it to prefer dev->coherent_dma_mask if it is set. Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 29 ++++++++++++++++++++++------- 1 file changed, 22 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index db64482b1796..6f7b97445738 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -483,9 +483,10 @@ static unsigned long dma_mask_to_pages(unsigned long mask) static unsigned long dma_ops_alloc_addresses(struct device *dev, struct dma_ops_domain *dom, unsigned int pages, - unsigned long align_mask) + unsigned long align_mask, + u64 dma_mask) { - unsigned long limit = dma_mask_to_pages(*dev->dma_mask); + unsigned long limit = dma_mask_to_pages(dma_mask); unsigned long address; unsigned long size = dom->aperture_size >> PAGE_SHIFT; unsigned long boundary_size; @@ -919,7 +920,8 @@ static dma_addr_t __map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir, - bool align) + bool align, + u64 dma_mask) { dma_addr_t offset = paddr & ~PAGE_MASK; dma_addr_t address, start; @@ -933,7 +935,8 @@ static dma_addr_t __map_single(struct device *dev, if (align) align_mask = (1UL << get_order(size)) - 1; - address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask); + address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, + dma_mask); if (unlikely(address == bad_dma_address)) goto out; @@ -997,10 +1000,13 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, struct protection_domain *domain; u16 devid; dma_addr_t addr; + u64 dma_mask; if (!check_device(dev)) return bad_dma_address; + dma_mask = *dev->dma_mask; + get_device_resources(dev, &iommu, &domain, &devid); if (iommu == NULL || domain == NULL) @@ -1008,7 +1014,8 @@ static dma_addr_t map_single(struct device *dev, phys_addr_t paddr, return (dma_addr_t)paddr; spin_lock_irqsave(&domain->lock, flags); - addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false); + addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, + dma_mask); if (addr == bad_dma_address) goto out; @@ -1080,10 +1087,13 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, struct scatterlist *s; phys_addr_t paddr; int mapped_elems = 0; + u64 dma_mask; if (!check_device(dev)) return 0; + dma_mask = *dev->dma_mask; + get_device_resources(dev, &iommu, &domain, &devid); if (!iommu || !domain) @@ -1095,7 +1105,8 @@ static int map_sg(struct device *dev, struct scatterlist *sglist, paddr = sg_phys(s); s->dma_address = __map_single(dev, iommu, domain->priv, - paddr, s->length, dir, false); + paddr, s->length, dir, false, + dma_mask); if (s->dma_address) { s->dma_length = s->length; @@ -1168,6 +1179,7 @@ static void *alloc_coherent(struct device *dev, size_t size, struct protection_domain *domain; u16 devid; phys_addr_t paddr; + u64 dma_mask = dev->coherent_dma_mask; if (!check_device(dev)) return NULL; @@ -1187,10 +1199,13 @@ static void *alloc_coherent(struct device *dev, size_t size, return virt_addr; } + if (!dma_mask) + dma_mask = *dev->dma_mask; + spin_lock_irqsave(&domain->lock, flags); *dma_addr = __map_single(dev, iommu, domain->priv, paddr, - size, DMA_BIDIRECTIONAL, true); + size, DMA_BIDIRECTIONAL, true, dma_mask); if (*dma_addr == bad_dma_address) { free_pages((unsigned long)virt_addr, get_order(size)); -- cgit v1.2.2 From 5871c6b0a52bb052c3891a787655043b90ae18e3 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Sat, 20 Sep 2008 20:35:44 -0700 Subject: x86: use round_jiffies() for the corruption check timer the exact timing of the corruption check isn't too important (it's once a minute timer), use round_jiffies() to align it and avoid extra wakeups. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 786c1886ae53..161f1b33ecec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -712,7 +712,7 @@ void check_for_bios_corruption(void) static void periodic_check_for_corruption(unsigned long data) { check_for_bios_corruption(); - mod_timer(&periodic_check_timer, jiffies + corruption_check_period*HZ); + mod_timer(&periodic_check_timer, round_jiffies(jiffies + corruption_check_period*HZ)); } void start_periodic_check_for_corruption(void) -- cgit v1.2.2 From b61e06f258e50b25c38a73bea782bdb6876f0f70 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 20 Sep 2008 18:02:27 +0200 Subject: x86, oprofile: BUG scheduling while atomic nmi_shutdown() calls unregister_die_notifier() from an atomic context after setting preempt_disable() via get_cpu_var(): [ 1049.404154] BUG: scheduling while atomic: oprofiled/7796/0x00000002 [ 1049.404171] INFO: lockdep is turned off. [ 1049.404176] Modules linked in: oprofile af_packet rfcomm l2cap kvm_intel kvm i915 drm acpi_cpufreq cpufreq_userspace cpufreq_conservative cpufreq_ondemand cpufreq_powersave freq_table container sbs sbshc dm_mod arc4 ecb cryptomgr aead snd_hda_intel crypto_blkcipher snd_pcm_oss crypto_algapi snd_pcm iwlagn iwlcore snd_timer iTCO_wdt led_class btusb iTCO_vendor_support snd psmouse bluetooth mac80211 soundcore cfg80211 snd_page_alloc intel_agp video output button battery ac dcdbas evdev ext3 jbd mbcache sg sd_mod piix ata_piix libata scsi_mod dock tg3 libphy ehci_hcd uhci_hcd usbcore thermal processor fan fuse [ 1049.404362] Pid: 7796, comm: oprofiled Not tainted 2.6.27-rc5-mm1 #30 [ 1049.404368] Call Trace: [ 1049.404384] [] thread_return+0x4a0/0x7d3 [ 1049.404396] [] generic_exec_single+0x52/0xe0 [ 1049.404405] [] generic_exec_single+0xda/0xe0 [ 1049.404414] [] smp_call_function_single+0x73/0x150 [ 1049.404423] [] schedule_timeout+0x95/0xd0 [ 1049.404430] [] wait_for_common+0x43/0x180 [ 1049.404438] [] wait_for_common+0x114/0x180 [ 1049.404448] [] default_wake_function+0x0/0x10 [ 1049.404457] [] synchronize_rcu+0x30/0x40 [ 1049.404463] [] wakeme_after_rcu+0x0/0x10 [ 1049.404472] [] _spin_unlock_irqrestore+0x40/0x80 [ 1049.404482] [] atomic_notifier_chain_unregister+0x3f/0x60 [ 1049.404501] [] nmi_shutdown+0x51/0x90 [oprofile] [ 1049.404517] [] oprofile_shutdown+0x34/0x70 [oprofile] [ 1049.404532] [] event_buffer_release+0xe/0x40 [oprofile] [ 1049.404543] [] __fput+0xcd/0x240 [ 1049.404551] [] filp_close+0x54/0x90 [ 1049.404560] [] put_files_struct+0xb1/0xd0 [ 1049.404568] [] do_exit+0x18f/0x930 [ 1049.404576] [] restore_args+0x0/0x30 [ 1049.404584] [] do_group_exit+0x36/0xa0 [ 1049.404592] [] system_call_fastpath+0x16/0x1b This can be easily triggered with 'opcontrol --shutdown'. Simply move get_cpu_var() above unregister_die_notifier(). Signed-off-by: Andrea Righi Acked-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 0227694f7dab..8a5f1614a3d5 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -295,10 +295,12 @@ static void nmi_cpu_shutdown(void *dummy) static void nmi_shutdown(void) { - struct op_msrs *msrs = &get_cpu_var(cpu_msrs); + struct op_msrs *msrs; + nmi_enabled = 0; on_each_cpu(nmi_cpu_shutdown, NULL, 1); unregister_die_notifier(&profile_exceptions_nb); + msrs = &get_cpu_var(cpu_msrs); model->shutdown(msrs); free_msrs(); put_cpu_var(cpu_msrs); -- cgit v1.2.2 From 2216d199b1430d1c0affb1498a9ebdbd9c0de439 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 22 Sep 2008 02:52:26 -0700 Subject: x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. Also change the CONFIG_X86_RESERVE_LOW_64K quirk to operate on the e820 table directly instead of messing with early reservations - this handles overlaps (which do occur in this low range of RAM) more gracefully. Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 161f1b33ecec..d29951c11be0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -735,7 +735,8 @@ static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) "%s detected: BIOS may corrupt low RAM, working it around.\n", d->ident); - reserve_early_overlap_ok(0x0, 0x10000, "BIOS quirk"); + e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED); + sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); return 0; } @@ -784,8 +785,6 @@ void __init setup_arch(char **cmdline_p) printk(KERN_INFO "Command line: %s\n", boot_command_line); #endif - dmi_check_system(bad_bios_dmi_table); - early_cpu_init(); early_ioremap_init(); @@ -880,6 +879,10 @@ void __init setup_arch(char **cmdline_p) finish_e820_parsing(); + dmi_scan_machine(); + + dmi_check_system(bad_bios_dmi_table); + #ifdef CONFIG_X86_32 probe_roms(); #endif @@ -967,8 +970,6 @@ void __init setup_arch(char **cmdline_p) vsmp_init(); #endif - dmi_scan_machine(); - io_delay_init(); /* -- cgit v1.2.2 From af2d237bf574f89ae5a1b67f2556a324c8f64ff5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 21 Sep 2008 23:27:13 +0900 Subject: x86: check for ioremap() failure in copy_oldmem_page() Add a check for ioremap() failure in copy_oldmem_page(). This patch also includes small coding style fixes. Signed-off-by: Akinobu Mita Signed-off-by: Ingo Molnar --- arch/x86/kernel/crash_dump_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index 15e6c6bc4a46..280d6ef3af02 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -33,14 +33,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, return 0; vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); + if (!vaddr) + return -ENOMEM; if (userbuf) { - if (copy_to_user(buf, (vaddr + offset), csize)) { + if (copy_to_user(buf, vaddr + offset, csize)) { iounmap(vaddr); return -EFAULT; } } else - memcpy(buf, (vaddr + offset), csize); + memcpy(buf, vaddr + offset, csize); iounmap(vaddr); return csize; -- cgit v1.2.2 From 45f197ade73ba95681b9803680c75352fc0a1c0a Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 20 Sep 2008 12:58:40 +0200 Subject: x86, oprofile: BUG: using smp_processor_id() in preemptible code Add __raw access before setting per cpu variable switch_index, to avoid the following BUG: [ 449.166827] BUG: using smp_processor_id() in preemptible [00000000] code: modprobe/6998 [ 449.166848] caller is op_nmi_init+0xf0/0x2b0 [oprofile] [ 449.166855] Pid: 6998, comm: modprobe Not tainted 2.6.27-rc5-mm1 #29 [ 449.166860] Call Trace: [ 449.166872] [] debug_smp_processor_id+0xd7/0xe0 [ 449.166887] [] op_nmi_init+0xf0/0x2b0 [oprofile] [ 449.166902] [] oprofile_init+0x0/0x60 [oprofile] [ 449.166915] [] oprofile_arch_init+0x9/0x30 [oprofile] [ 449.166928] [] oprofile_init+0x1e/0x60 [oprofile] [ 449.166937] [] _stext+0x3b/0x160 [ 449.166946] [] __mutex_unlock_slowpath+0xe5/0x190 [ 449.166955] [] trace_hardirqs_on_caller+0xca/0x140 [ 449.166965] [] sys_init_module+0xdc/0x210 [ 449.166972] [] system_call_fastpath+0x16/0x1b Signed-off-by: Andrea Righi Acked-by: Robert Richter Signed-off-by: Ingo Molnar --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index fb4902bc6f14..4108d02c5292 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -551,7 +551,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ - __get_cpu_var(switch_index) = 0; + __raw_get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; -- cgit v1.2.2 From 153dab77e228931f3aff74f21b762927ac710ca7 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 21 Sep 2008 23:25:40 +0900 Subject: x86: use platform_device_register_simple() Cleanup pcspeaker.c Signed-off-by: Akinobu Mita Signed-off-by: Ingo Molnar --- arch/x86/kernel/pcspeaker.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pcspeaker.c b/arch/x86/kernel/pcspeaker.c index bc1f2d3ea277..a311ffcaad16 100644 --- a/arch/x86/kernel/pcspeaker.c +++ b/arch/x86/kernel/pcspeaker.c @@ -1,20 +1,13 @@ #include -#include +#include #include static __init int add_pcspkr(void) { struct platform_device *pd; - int ret; - pd = platform_device_alloc("pcspkr", -1); - if (!pd) - return -ENOMEM; + pd = platform_device_register_simple("pcspkr", -1, NULL, 0); - ret = platform_device_add(pd); - if (ret) - platform_device_put(pd); - - return ret; + return IS_ERR(pd) ? PTR_ERR(pd) : 0; } device_initcall(add_pcspkr); -- cgit v1.2.2 From 16dc552f35bc0ec6fec8ef83f8032eee352d17f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 11:45:04 -0700 Subject: x86: use WARN_ONCE in workaround for mtrr mask so could help catch attention about bug in bios about mtrr mask setting. WARN_ONCE got into mainline already, lets use it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index cb7d3b6a80eb..4e8d77f01eeb 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -401,12 +401,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, tmp |= ~((1<<(hi - 1)) - 1); if (tmp != mask_lo) { - static int once = 1; - - if (once) { - printk(KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); - once = 0; - } + WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); mask_lo = tmp; } } -- cgit v1.2.2 From d26dbc5cf94b0a28acc947285c3b54814a73cb2e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 22 Sep 2008 22:35:07 +0900 Subject: iommu: export iommu_area_reserve helper function x86 has set_bit_string() that does the exact same thing that set_bit_area() in lib/iommu-helper.c does. This patch exports set_bit_area() in lib/iommu-helper.c as iommu_area_reserve(), converts GART, Calgary, and AMD IOMMU to use it. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 2 +- arch/x86/kernel/pci-calgary_64.c | 2 +- arch/x86/kernel/pci-gart_64.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 6f7b97445738..70537d117a96 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -572,7 +572,7 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom, if (start_page + pages > last_page) pages = last_page - start_page; - set_bit_string(dom->bitmap, start_page, pages); + iommu_area_reserve(dom->bitmap, start_page, pages); } static void dma_ops_free_pagetable(struct dma_ops_domain *dma_dom) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fe7695e4caae..080d1d27f37a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -261,7 +261,7 @@ static void iommu_range_reserve(struct iommu_table *tbl, badbit, tbl, start_addr, npages); } - set_bit_string(tbl->it_map, index, npages); + iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); } diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 508ef470b27f..3dcb1ad86e38 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -827,7 +827,7 @@ void __init gart_iommu_init(void) * Out of IOMMU space handling. * Reserve some invalid pages at the beginning of the GART. */ - set_bit_string(iommu_gart_bitmap, 0, EMERGENCY_PAGES); + iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); agp_memory_reserved = iommu_size; printk(KERN_INFO -- cgit v1.2.2 From 28b166a700899a0f88b1cc283c449fb5bf72a635 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 22 Sep 2008 13:14:13 -0400 Subject: x86, NMI watchdog: when booting with reset_devices, clear the performance counters P4s have a quirk that makes necessary to clear P4_CCCR_OVF bit on the CCCR everytime the PMI is triggered. When booting the kernel with reset_devices (more specific kdump case), the counters reach zero and the PMI will be generated. This is not a problem on other processors but on P4s, it'll continue to generate NMIs until that bit is cleared. Since there may be other users of the performance counters, clear and disable all of them when booting with reset_devices option. We have a P4 box here that crashes because of this problem. Since the kdump kernel usually boots with only one processor active, the second logical unit won't be set up, therefore, MSR_P4_IQ_CCCR1 (and other performance counter registers) won't be cleared and P4_CCCR_OVF may be still set because the previous kernel was using this register. An NMI is triggered because of the MSR_P4_IQ_CCCR1 right after the NMI delivery is enabled, triggering the race fixed on my previous email. Signed-off-by: Aristeu Rozanski Acked-by: Don Zickus Acked-by: Prarit Bhargava Acked-by: Vivek Goyal Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 05cc22dbd4ff..62c010063974 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -432,6 +432,27 @@ static const struct wd_ops p6_wd_ops = { #define P4_CCCR_ENABLE (1 << 12) #define P4_CCCR_OVF (1 << 31) +#define P4_CONTROLS 18 +static unsigned int p4_controls[18] = { + MSR_P4_BPU_CCCR0, + MSR_P4_BPU_CCCR1, + MSR_P4_BPU_CCCR2, + MSR_P4_BPU_CCCR3, + MSR_P4_MS_CCCR0, + MSR_P4_MS_CCCR1, + MSR_P4_MS_CCCR2, + MSR_P4_MS_CCCR3, + MSR_P4_FLAME_CCCR0, + MSR_P4_FLAME_CCCR1, + MSR_P4_FLAME_CCCR2, + MSR_P4_FLAME_CCCR3, + MSR_P4_IQ_CCCR0, + MSR_P4_IQ_CCCR1, + MSR_P4_IQ_CCCR2, + MSR_P4_IQ_CCCR3, + MSR_P4_IQ_CCCR4, + MSR_P4_IQ_CCCR5, +}; /* * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter * CRU_ESCR0 (with any non-null event selector) through a complemented @@ -473,6 +494,26 @@ static int setup_p4_watchdog(unsigned nmi_hz) evntsel_msr = MSR_P4_CRU_ESCR0; cccr_msr = MSR_P4_IQ_CCCR0; cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4); + + /* + * If we're on the kdump kernel or other situation, we may + * still have other performance counter registers set to + * interrupt and they'll keep interrupting forever because + * of the P4_CCCR_OVF quirk. So we need to ACK all the + * pending interrupts and disable all the registers here, + * before reenabling the NMI delivery. Refer to p4_rearm() + * about the P4_CCCR_OVF quirk. + */ + if (reset_devices) { + unsigned int low, high; + int i; + + for (i = 0; i < P4_CONTROLS; i++) { + rdmsr(p4_controls[i], low, high); + low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF); + wrmsr(p4_controls[i], low, high); + } + } } else { /* logical cpu 1 */ perfctr_msr = MSR_P4_IQ_PERFCTR1; -- cgit v1.2.2 From b3e15bdef689641e7f1bb03efbe56112c3ee82e2 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Mon, 22 Sep 2008 13:13:59 -0400 Subject: x86, NMI watchdog: setup before enabling NMI watchdog There's a small window when NMI watchdog is being set up that if any NMIs are triggered, the NMI code will make make use of not initalized wd_ops elements: void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) return; /* cheap hack to support suspend/resume */ /* if cpu0 is not active neither should the other cpus */ if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0) return; switch (nmi_watchdog) { case NMI_LOCAL_APIC: /* enable it before to avoid race with handler */ --> __get_cpu_var(wd_enabled) = 1; --> if (lapic_watchdog_init(nmi_hz) < 0) { (...) asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) { (...) if (nmi_watchdog_tick(regs, reason)) return; (...) notrace __kprobes int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) { (...) if (!__get_cpu_var(wd_enabled)) return rc; switch (nmi_watchdog) { case NMI_LOCAL_APIC: rc |= lapic_wd_event(nmi_hz); (...) int lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; --> rdmsrl(wd->perfctr_msr, ctr); and wd->*_msr will be initialized on each processor type specific setup, after enabling NMIs for PMIs. Since the counter was just set, the chances of an performance counter generated NMI is minimal, but any other unknown NMI would trigger the problem. This patch fixes the problem by setting everything up before enabling performance counter generated NMIs and will set wd_enabled using a callback function. Signed-off-by: Aristeu Rozanski Acked-by: Don Zickus Acked-by: Prarit Bhargava Acked-by: Vivek Goyal Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 45 +++++++++++++++++++++++++--------- arch/x86/kernel/nmi.c | 11 +++++++-- 2 files changed, 42 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 62c010063974..6bff382094f5 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -295,13 +295,19 @@ static int setup_k7_watchdog(unsigned nmi_hz) /* setup the timer */ wrmsr(evntsel_msr, evntsel, 0); write_watchdog_counter(perfctr_msr, "K7_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= K7_EVNTSEL_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= K7_EVNTSEL_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } @@ -379,13 +385,19 @@ static int setup_p6_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0",nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= P6_EVNTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); + /* initialize the wd struct before enabling */ wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= P6_EVNTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); + return 1; } @@ -540,12 +552,17 @@ static int setup_p4_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); wrmsr(cccr_msr, cccr_val, 0); write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - cccr_val |= P4_CCCR_ENABLE; - wrmsr(cccr_msr, cccr_val, 0); + wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = cccr_msr; + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + cccr_val |= P4_CCCR_ENABLE; + wrmsr(cccr_msr, cccr_val, 0); return 1; } @@ -661,13 +678,17 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz) wrmsr(evntsel_msr, evntsel, 0); nmi_hz = adjust_for_32bit_ctr(nmi_hz); write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz); - apic_write(APIC_LVTPC, APIC_DM_NMI); - evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(evntsel_msr, evntsel, 0); wd->perfctr_msr = perfctr_msr; wd->evntsel_msr = evntsel_msr; wd->cccr_msr = 0; /* unused */ + + /* ok, everything is initialized, announce that we're set */ + cpu_nmi_set_wd_enabled(); + + apic_write(APIC_LVTPC, APIC_DM_NMI); + evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(evntsel_msr, evntsel, 0); intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); return 1; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index abb78a2cc4ad..2c97f07f1c2c 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -299,6 +299,15 @@ void acpi_nmi_disable(void) on_each_cpu(__acpi_nmi_disable, NULL, 1); } +/* + * This function is called as soon the LAPIC NMI watchdog driver has everything + * in place and it's ready to check if the NMIs belong to the NMI watchdog + */ +void cpu_nmi_set_wd_enabled(void) +{ + __get_cpu_var(wd_enabled) = 1; +} + void setup_apic_nmi_watchdog(void *unused) { if (__get_cpu_var(wd_enabled)) @@ -311,8 +320,6 @@ void setup_apic_nmi_watchdog(void *unused) switch (nmi_watchdog) { case NMI_LOCAL_APIC: - /* enable it before to avoid race with handler */ - __get_cpu_var(wd_enabled) = 1; if (lapic_watchdog_init(nmi_hz) < 0) { __get_cpu_var(wd_enabled) = 0; return; -- cgit v1.2.2 From afa9fdc2f5f8e4d98f3e77bfa204412cbc181346 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 20 Sep 2008 01:23:30 +0900 Subject: iommu: remove fullflush and nofullflush in IOMMU generic option This patch against tip/x86/iommu virtually reverts 2842e5bf3115193f05dc9dac20f940e7abf44c1a. But just reverting the commit breaks AMD IOMMU so this patch also includes some fixes. The above commit adds new two options to x86 IOMMU generic kernel boot options, fullflush and nofullflush. But such change that affects all the IOMMUs needs more discussion (all IOMMU parties need the chance to discuss it): http://lkml.org/lkml/2008/9/19/106 Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 4 ++-- arch/x86/kernel/amd_iommu_init.c | 5 ++++- arch/x86/kernel/pci-dma.c | 13 ------------- arch/x86/kernel/pci-gart_64.c | 13 +++++++++++++ 4 files changed, 19 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 70537d117a96..c19212191c98 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -948,7 +948,7 @@ static dma_addr_t __map_single(struct device *dev, } address += offset; - if (unlikely(dma_dom->need_flush && !iommu_fullflush)) { + if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { iommu_flush_tlb(iommu, dma_dom->domain.id); dma_dom->need_flush = false; } else if (unlikely(iommu_has_npcache(iommu))) @@ -985,7 +985,7 @@ static void __unmap_single(struct amd_iommu *iommu, dma_ops_free_addresses(dma_dom, dma_addr, pages); - if (iommu_fullflush) + if (amd_iommu_unmap_flush) iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); } diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index db0c83af44de..148fcfe22f17 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -122,6 +122,7 @@ LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings we find in ACPI */ unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */ int amd_iommu_isolate; /* if 1, device isolation is enabled */ +bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the system */ @@ -1144,7 +1145,7 @@ int __init amd_iommu_init(void) else printk("disabled\n"); - if (iommu_fullflush) + if (amd_iommu_unmap_flush) printk(KERN_INFO "AMD IOMMU: IO/TLB flush on unmap enabled\n"); else printk(KERN_INFO "AMD IOMMU: Lazy IO/TLB flushing enabled\n"); @@ -1214,6 +1215,8 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "isolate", 7) == 0) amd_iommu_isolate = 1; + if (strncmp(str, "fullflush", 11) == 0) + amd_iommu_unmap_flush = true; } return 1; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index d2f2c0158dc1..0a1408abcc62 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -16,15 +16,6 @@ EXPORT_SYMBOL(dma_ops); static int iommu_sac_force __read_mostly; -/* - * If this is disabled the IOMMU will use an optimized flushing strategy - * of only flushing when an mapping is reused. With it true the GART is - * flushed for every mapping. Problem is that doing the lazy flush seems - * to trigger bugs with some popular PCI cards, in particular 3ware (but - * has been also also seen with Qlogic at least). - */ -int iommu_fullflush; - #ifdef CONFIG_IOMMU_DEBUG int panic_on_overflow __read_mostly = 1; int force_iommu __read_mostly = 1; @@ -180,10 +171,6 @@ static __init int iommu_setup(char *p) } if (!strncmp(p, "nomerge", 7)) iommu_merge = 0; - if (!strncmp(p, "fullflush", 8)) - iommu_fullflush = 1; - if (!strncmp(p, "nofullflush", 11)) - iommu_fullflush = 0; if (!strncmp(p, "forcesac", 8)) iommu_sac_force = 1; if (!strncmp(p, "allowdac", 8)) diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 3dcb1ad86e38..9e390f1bd46a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -45,6 +45,15 @@ static unsigned long iommu_pages; /* .. and in pages */ static u32 *iommu_gatt_base; /* Remapping table */ +/* + * If this is disabled the IOMMU will use an optimized flushing strategy + * of only flushing when an mapping is reused. With it true the GART is + * flushed for every mapping. Problem is that doing the lazy flush seems + * to trigger bugs with some popular PCI cards, in particular 3ware (but + * has been also also seen with Qlogic at least). + */ +int iommu_fullflush = 1; + /* Allocation bitmap for the remapping area: */ static DEFINE_SPINLOCK(iommu_bitmap_lock); /* Guarded by iommu_bitmap_lock: */ @@ -892,6 +901,10 @@ void __init gart_parse_options(char *p) #endif if (isdigit(*p) && get_option(&p, &arg)) iommu_size = arg; + if (!strncmp(p, "fullflush", 8)) + iommu_fullflush = 1; + if (!strncmp(p, "nofullflush", 11)) + iommu_fullflush = 0; if (!strncmp(p, "noagp", 5)) no_agp = 1; if (!strncmp(p, "noaperture", 10)) -- cgit v1.2.2 From 2fd47094f92fa2bdbf99be33294a7b6b97785a70 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Mon, 1 Sep 2008 14:27:03 +0200 Subject: CPUFREQ: powernow-k8: Try to detect old BIOS, not supporting CPU freq on a recent AMD CPUs. Make use of FW_BUG interface to give vendors and users the ability to automatically check for powernow-k8 related BIOS bugs by: dmesg |grep "Firmware Bug" Signed-off-by: Thomas Renninger Signed-off-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 42 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d8..4e0c6abd7ca4 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -45,7 +45,6 @@ #endif #define PFX "powernow-k8: " -#define BFX PFX "BIOS error: " #define VERSION "version 2.20.00" #include "powernow-k8.h" @@ -536,35 +535,40 @@ static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst, u8 for (j = 0; j < data->numps; j++) { if (pst[j].vid > LEAST_VID) { - printk(KERN_ERR PFX "vid %d invalid : 0x%x\n", j, pst[j].vid); + printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n", + j, pst[j].vid); return -EINVAL; } if (pst[j].vid < data->rvo) { /* vid + rvo >= 0 */ - printk(KERN_ERR BFX "0 vid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].vid < maxvid + data->rvo) { /* vid + rvo >= maxvid */ - printk(KERN_ERR BFX "maxvid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (pst[j].fid > MAX_FID) { - printk(KERN_ERR BFX "maxfid exceeded with pstate %d\n", j); + printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate" + " %d\n", j); return -ENODEV; } if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) { /* Only first fid is allowed to be in "low" range */ - printk(KERN_ERR BFX "two low fids - %d : 0x%x\n", j, pst[j].fid); + printk(KERN_ERR FW_BUG PFX "two low fids - %d : " + "0x%x\n", j, pst[j].fid); return -EINVAL; } if (pst[j].fid < lastfid) lastfid = pst[j].fid; } if (lastfid & 1) { - printk(KERN_ERR BFX "lastfid invalid\n"); + printk(KERN_ERR FW_BUG PFX "lastfid invalid\n"); return -EINVAL; } if (lastfid > LO_FID_TABLE_TOP) - printk(KERN_INFO BFX "first fid not from lo freq table\n"); + printk(KERN_INFO FW_BUG PFX "first fid not from lo freq table\n"); return 0; } @@ -672,13 +676,13 @@ static int find_psb_table(struct powernow_k8_data *data) dprintk("table vers: 0x%x\n", psb->tableversion); if (psb->tableversion != PSB_VERSION_1_4) { - printk(KERN_ERR BFX "PSB table is not v1.4\n"); + printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n"); return -ENODEV; } dprintk("flags: 0x%x\n", psb->flags1); if (psb->flags1) { - printk(KERN_ERR BFX "unknown flags\n"); + printk(KERN_ERR FW_BUG PFX "unknown flags\n"); return -ENODEV; } @@ -705,7 +709,7 @@ static int find_psb_table(struct powernow_k8_data *data) } } if (cpst != 1) { - printk(KERN_ERR BFX "numpst must be 1\n"); + printk(KERN_ERR FW_BUG PFX "numpst must be 1\n"); return -ENODEV; } @@ -1130,17 +1134,19 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) "ACPI Processor module before starting this " "driver.\n"); #else - printk(KERN_ERR PFX "Your BIOS does not provide ACPI " - "_PSS objects in a way that Linux understands. " - "Please report this to the Linux ACPI maintainers" - " and complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "Your BIOS does not provide" + " ACPI _PSS objects in a way that Linux " + "understands. Please report this to the Linux " + "ACPI maintainers and complain to your BIOS " + "vendor.\n"); #endif kfree(data); return -ENODEV; } if (pol->cpu != 0) { - printk(KERN_ERR PFX "No ACPI _PSS objects for CPU other than " - "CPU0. Complain to your BIOS vendor.\n"); + printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for " + "CPU other than CPU0. Complain to your BIOS " + "vendor.\n"); kfree(data); return -ENODEV; } @@ -1193,7 +1199,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) /* min/max the cpu is capable of */ if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) { - printk(KERN_ERR PFX "invalid powernow_table\n"); + printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n"); powernow_k8_cpu_exit_acpi(data); kfree(data->powernow_table); kfree(data); -- cgit v1.2.2 From a8b71a2810386a5ac8f43d2095fe3355f0d8db37 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 23 Sep 2008 00:35:33 -0700 Subject: x86: fix macro with bad_bios_dmi_table DMI tables need a blank NULL tail. fixes the crash on Ingo's test box. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d29951c11be0..3ce3edfcc3cd 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -758,8 +758,8 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies, LTD"), }, }, - {} #endif + {} }; /* -- cgit v1.2.2 From 05e12e1c4c09cd35ac9f4e6af1e42b0036375d72 Mon Sep 17 00:00:00 2001 From: Ravikiran G Thirumalai Date: Mon, 22 Sep 2008 22:58:47 -0700 Subject: x86: fix 27-rc crash on vsmp due to paravirt during module load 27-rc fails to boot up if configured to use modules. Turns out vsmp_patch was marked __init, and vsmp_patch being the pvops 'patch' routine for vsmp, a call to vsmp_patch just turns out to execute a code page with series of 0xcc (POISON_FREE_INITMEM -- int3). vsmp_patch has been marked with __init ever since pvops, however, apply_paravirt can be called during module load causing calls to freed memory location. Since apply_paravirt can only be called during init/module load, make vsmp_patch with "__init_or_module" Signed-off-by: Ravikiran Thirumalai Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsmp_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 0c029e8959c7..7766d36983fc 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -61,7 +61,7 @@ static void vsmp_irq_enable(void) native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); } -static unsigned __init vsmp_patch(u8 type, u16 clobbers, void *ibuf, +static unsigned __init_or_module vsmp_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { switch (type) { -- cgit v1.2.2 From 4faac97d44ac27bdbb010a9c3597401a8f89341f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 18:54:29 +0200 Subject: x86: prevent stale state of c1e_mask across CPU offline/online Impact: hang which happens across CPU offline/online on AMD C1E systems. When a CPU goes offline then the corresponding bit in the broadcast mask is cleared. For AMD C1E enabled CPUs we do not reenable the broadcast when the CPU comes online again as we do not clear the corresponding bit in the c1e_mask, which keeps track which CPUs have been switched to broadcast already. So on those !$@#& machines we never switch back to broadcasting after a CPU offline/online cycle. Clear the bit when the CPU plays dead. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 11 ++++++++--- arch/x86/kernel/process_32.c | 1 + arch/x86/kernel/process_64.c | 2 ++ 3 files changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 7fc4d5b0a6a0..2e2247117f6e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -246,6 +246,14 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) return 1; } +static cpumask_t c1e_mask = CPU_MASK_NONE; +static int c1e_detected; + +void c1e_remove_cpu(int cpu) +{ + cpu_clear(cpu, c1e_mask); +} + /* * C1E aware idle routine. We check for C1E active in the interrupt * pending message MSR. If we detect C1E, then we handle it the same @@ -253,9 +261,6 @@ static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) */ static void c1e_idle(void) { - static cpumask_t c1e_mask = CPU_MASK_NONE; - static int c1e_detected; - if (need_resched()) return; diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 3b7a1ddcc0bc..4b3cfdf54216 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -88,6 +88,7 @@ static void cpu_exit_clear(void) cpu_clear(cpu, cpu_callin_map); numa_remove_cpu(cpu); + c1e_remove_cpu(cpu); } /* We don't actually take CPU down, just spin without interrupts. */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71553b664e2a..e12e0e4dd256 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -93,6 +93,8 @@ DECLARE_PER_CPU(int, cpu_state); static inline void play_dead(void) { idle_task_exit(); + c1e_remove_cpu(raw_smp_processor_id()); + mb(); /* Ack it */ __get_cpu_var(cpu_state) = CPU_DEAD; -- cgit v1.2.2 From a8d6829044901a67732904be5f1eacdf8539604f Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 22 Sep 2008 19:02:25 +0200 Subject: x86: prevent C-states hang on AMD C1E enabled machines Impact: System hang when AMD C1E machines switch into C2/C3 AMD C1E enabled systems do not work with normal ACPI C-states even if the BIOS is advertising them. Limit the C-states to C1 for the ACPI processor idle code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 2e2247117f6e..d8c2a299bfe5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -272,6 +272,7 @@ static void c1e_idle(void) c1e_detected = 1; mark_tsc_unstable("TSC halt in C1E"); printk(KERN_INFO "System has C1E enabled\n"); + set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit v1.2.2 From 09bfeea13cea843fb03eaa96b5d891fa0abdcc90 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 18 Sep 2008 21:12:10 +0200 Subject: x86: c1e_idle: don't mark TSC unstable if CPU has invariant TSC Impact: Functional TSC is marked unstable on AMD family 0x10 and 0x11 CPUs. This would be wrong because for those CPUs "invariant TSC" means: "The TSC counts at the same rate in all P-states, all C states, S0, or S1" (See "Processor BIOS and Kernel Developer's Guides" for those CPUs.) [ tglx: Changed C1E to AMD C1E in the printks to avoid confusion with Intel C1E ] Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index d8c2a299bfe5..876e91890777 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -270,8 +270,9 @@ static void c1e_idle(void) rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); if (lo & K8_INTP_C1E_ACTIVE_MASK) { c1e_detected = 1; - mark_tsc_unstable("TSC halt in C1E"); - printk(KERN_INFO "System has C1E enabled\n"); + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + printk(KERN_INFO "System has AMD C1E enabled\n"); set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } -- cgit v1.2.2 From 18dbc9160507dc7df998e00cd1dcd7889557307b Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 23 Sep 2008 12:08:44 +0200 Subject: x86: moved microcode.c to microcode_intel.c Combine both generic and arch-specific parts of microcode into a single module (arch-specific parts are config-dependent). Also while we are at it, move arch-specific parts from microcode.h into their respective arch-specific .c files. Signed-off-by: Dmitry Adamushko Cc: "Peter Oruba" Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 10 +- arch/x86/kernel/Makefile | 8 +- arch/x86/kernel/microcode.c | 505 ------------------------------------- arch/x86/kernel/microcode_amd.c | 72 +++--- arch/x86/kernel/microcode_core.c | 509 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/microcode_intel.c | 79 +++--- 6 files changed, 608 insertions(+), 575 deletions(-) delete mode 100644 arch/x86/kernel/microcode.c create mode 100644 arch/x86/kernel/microcode_core.c (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0e5bf1eddcea..2e6080951350 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -801,7 +801,7 @@ config MICROCODE module will be called microcode. config MICROCODE_INTEL - tristate "Intel microcode patch loading support" + bool "Intel microcode patch loading support" depends on MICROCODE default MICROCODE select FW_LOADER @@ -813,20 +813,14 @@ config MICROCODE_INTEL Intel ingredients for this driver, check: . - This driver is only available as a module: the module - will be called microcode_intel. - config MICROCODE_AMD - tristate "AMD microcode patch loading support" + bool "AMD microcode patch loading support" depends on MICROCODE select FW_LOADER --help--- If you select this option, microcode patch loading support for AMD processors will be enabled. - This driver is only available as a module: the module - will be called microcode_amd. - config MICROCODE_OLD_INTERFACE def_bool y depends on MICROCODE diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index be454f348c3b..f891996f6849 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -51,9 +51,6 @@ obj-$(CONFIG_X86_BIOS_REBOOT) += reboot.o obj-$(CONFIG_MCA) += mca_32.o obj-$(CONFIG_X86_MSR) += msr.o obj-$(CONFIG_X86_CPUID) += cpuid.o -obj-$(CONFIG_MICROCODE) += microcode.o -obj-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o -obj-$(CONFIG_MICROCODE_AMD) += microcode_amd.o obj-$(CONFIG_PCI) += early-quirks.o apm-y := apm_32.o obj-$(CONFIG_APM) += apm.o @@ -101,6 +98,11 @@ scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +microcode-y := microcode_core.o +microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o +microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o +obj-$(CONFIG_MICROCODE) += microcode.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/microcode.c b/arch/x86/kernel/microcode.c deleted file mode 100644 index 0c2634f4fd7c..000000000000 --- a/arch/x86/kernel/microcode.c +++ /dev/null @@ -1,505 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2000-2006 Tigran Aivazian - * 2006 Shaohua Li - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/design/pentium4/manuals/253668.htm - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - * 1.0 16 Feb 2000, Tigran Aivazian - * Initial release. - * 1.01 18 Feb 2000, Tigran Aivazian - * Added read() support + cleanups. - * 1.02 21 Feb 2000, Tigran Aivazian - * Added 'device trimming' support. open(O_WRONLY) zeroes - * and frees the saved copy of applied microcode. - * 1.03 29 Feb 2000, Tigran Aivazian - * Made to use devfs (/dev/cpu/microcode) + cleanups. - * 1.04 06 Jun 2000, Simon Trimmer - * Added misc device support (now uses both devfs and misc). - * Added MICROCODE_IOCFREE ioctl to clear memory. - * 1.05 09 Jun 2000, Simon Trimmer - * Messages for error cases (non Intel & no suitable microcode). - * 1.06 03 Aug 2000, Tigran Aivazian - * Removed ->release(). Removed exclusive open and status bitmap. - * Added microcode_rwsem to serialize read()/write()/ioctl(). - * Removed global kernel lock usage. - * 1.07 07 Sep 2000, Tigran Aivazian - * Write 0 to 0x8B msr and then cpuid before reading revision, - * so that it works even if there were no update done by the - * BIOS. Otherwise, reading from 0x8B gives junk (which happened - * to be 0 on my machine which is why it worked even when I - * disabled update by the BIOS) - * Thanks to Eric W. Biederman for the fix. - * 1.08 11 Dec 2000, Richard Schaal and - * Tigran Aivazian - * Intel Pentium 4 processor support and bugfixes. - * 1.09 30 Oct 2001, Tigran Aivazian - * Bugfix for HT (Hyper-Threading) enabled processors - * whereby processor resources are shared by all logical processors - * in a single CPU package. - * 1.10 28 Feb 2002 Asit K Mallick and - * Tigran Aivazian , - * Serialize updates as required on HT processors due to - * speculative nature of implementation. - * 1.11 22 Mar 2002 Tigran Aivazian - * Fix the panic when writing zero-length microcode chunk. - * 1.12 29 Sep 2003 Nitin Kamble , - * Jun Nakajima - * Support for the microcode updates in the new format. - * 1.13 10 Oct 2003 Tigran Aivazian - * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl - * because we no longer hold a copy of applied microcode - * in kernel memory. - * 1.14 25 Jun 2004 Tigran Aivazian - * Fix sigmatch() macro to handle old CPUs with pf == 0. - * Thanks to Stuart Swales for pointing out this bug. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -MODULE_DESCRIPTION("Microcode Update Driver"); -MODULE_AUTHOR("Tigran Aivazian "); -MODULE_LICENSE("GPL"); - -#define MICROCODE_VERSION "2.00" - -struct microcode_ops *microcode_ops; - -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ -static DEFINE_MUTEX(microcode_mutex); - -struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; -EXPORT_SYMBOL_GPL(ucode_cpu_info); - -#ifdef CONFIG_MICROCODE_OLD_INTERFACE -static int do_microcode_update(const void __user *buf, size_t size) -{ - cpumask_t old; - int error = 0; - int cpu; - - old = current->cpus_allowed; - - for_each_online_cpu(cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!uci->valid) - continue; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); - } -out: - set_cpus_allowed_ptr(current, &old); - return error; -} - -static int microcode_open(struct inode *unused1, struct file *unused2) -{ - cycle_kernel_lock(); - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; -} - -static ssize_t microcode_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - ssize_t ret; - - if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; - } - - get_online_cpus(); - mutex_lock(µcode_mutex); - - ret = do_microcode_update(buf, len); - if (!ret) - ret = (ssize_t)len; - - mutex_unlock(µcode_mutex); - put_online_cpus(); - - return ret; -} - -static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, -}; - -static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, -}; - -static int __init microcode_dev_init(void) -{ - int error; - - error = misc_register(µcode_dev); - if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); - return error; - } - - return 0; -} - -static void microcode_dev_exit(void) -{ - misc_deregister(µcode_dev); -} - -MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); -#else -#define microcode_dev_init() 0 -#define microcode_dev_exit() do { } while (0) -#endif - -/* fake device for request_firmware */ -struct platform_device *microcode_pdev; - -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, - const char *buf, size_t sz) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; - int cpu = dev->id; - - if (end == buf) - return -EINVAL; - if (val == 1) { - cpumask_t old = current->cpus_allowed; - - get_online_cpus(); - if (cpu_online(cpu)) { - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - mutex_lock(µcode_mutex); - if (uci->valid) { - err = microcode_ops->request_microcode_fw(cpu, - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(cpu); - } - mutex_unlock(µcode_mutex); - set_cpus_allowed_ptr(current, &old); - } - put_online_cpus(); - } - if (err) - return err; - return sz; -} - -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); -} - -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; - - return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); -} - -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); - -static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, - NULL -}; - -static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", -}; - -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - mutex_lock(µcode_mutex); - microcode_ops->microcode_fini_cpu(cpu); - uci->valid = 0; - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; -} - -static int microcode_resume_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; - - pr_debug("microcode: CPU%d resumed\n", cpu); - - if (!uci->mc.valid_mc) - return 1; - - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - microcode_fini_cpu(cpu); - return -1; - } - - if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { - microcode_fini_cpu(cpu); - /* Should we look for a new ucode here? */ - return 1; - } - - return 0; -} - -void microcode_update_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - int err = 0; - - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - - mutex_lock(µcode_mutex); - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(cpu); - } else { - collect_cpu_info(cpu); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw(cpu, - µcode_pdev->dev); - } - - if (!err) - microcode_ops->apply_microcode(cpu); - - mutex_unlock(µcode_mutex); -} - -static void microcode_init_cpu(int cpu) -{ - cpumask_t old = current->cpus_allowed; - - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - microcode_update_cpu(cpu); - set_cpus_allowed_ptr(current, &old); -} - -static int mc_sysdev_add(struct sys_device *sys_dev) -{ - int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); - - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); - if (err) - return err; - - microcode_init_cpu(cpu); - return 0; -} - -static int mc_sysdev_remove(struct sys_device *sys_dev) -{ - int cpu = sys_dev->id; - - if (!cpu_online(cpu)) - return 0; - - pr_debug("microcode: CPU%d removed\n", cpu); - microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - return 0; -} - -static int mc_sysdev_resume(struct sys_device *dev) -{ - int cpu = dev->id; - - if (!cpu_online(cpu)) - return 0; - - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(0); - return 0; -} - -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, -}; - -static __cpuinit int -mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; - - sys_dev = get_cpu_sysdev(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - microcode_init_cpu(cpu); - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: - pr_debug("microcode: CPU%d added\n", cpu); - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); - break; - case CPU_DOWN_PREPARE: - case CPU_DOWN_PREPARE_FROZEN: - /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); - pr_debug("microcode: CPU%d removed\n", cpu); - break; - case CPU_DEAD: - case CPU_UP_CANCELED_FROZEN: - /* The CPU refused to come up during a system resume */ - microcode_fini_cpu(cpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata mc_cpu_notifier = { - .notifier_call = mc_cpu_callback, -}; - -int microcode_init(void *opaque, struct module *module) -{ - struct microcode_ops *ops = (struct microcode_ops *)opaque; - int error; - - if (microcode_ops) { - printk(KERN_ERR "microcode: already loaded the other module\n"); - return -EEXIST; - } - - microcode_ops = ops; - - error = microcode_dev_init(); - if (error) - return error; - microcode_pdev = platform_device_register_simple("microcode", -1, - NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); - return PTR_ERR(microcode_pdev); - } - - get_online_cpus(); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - if (error) { - microcode_dev_exit(); - platform_device_unregister(microcode_pdev); - return error; - } - - register_hotcpu_notifier(&mc_cpu_notifier); - - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION - " " - " \n"); - - return 0; -} -EXPORT_SYMBOL_GPL(microcode_init); - -void __exit microcode_exit(void) -{ - microcode_dev_exit(); - - unregister_hotcpu_notifier(&mc_cpu_notifier); - - get_online_cpus(); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); - put_online_cpus(); - - platform_device_unregister(microcode_pdev); - - microcode_ops = NULL; - - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); -} -EXPORT_SYMBOL_GPL(microcode_exit); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 48aec9f48e4f..03ea4e52e87a 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -46,6 +46,35 @@ MODULE_LICENSE("GPL v2"); #define UCODE_EQUIV_CPU_TABLE_TYPE 0x00000000 #define UCODE_UCODE_TYPE 0x00000001 +struct equiv_cpu_entry { + unsigned int installed_cpu; + unsigned int fixed_errata_mask; + unsigned int fixed_errata_compare; + unsigned int equiv_cpu; +}; + +struct microcode_header_amd { + unsigned int data_code; + unsigned int patch_id; + unsigned char mc_patch_data_id[2]; + unsigned char mc_patch_data_len; + unsigned char init_flag; + unsigned int mc_patch_data_checksum; + unsigned int nb_dev_id; + unsigned int sb_dev_id; + unsigned char processor_rev_id[2]; + unsigned char nb_rev_id; + unsigned char sb_rev_id; + unsigned char bios_api_rev; + unsigned char reserved1[3]; + unsigned int match_reg[8]; +}; + +struct microcode_amd { + struct microcode_header_amd hdr; + unsigned int mpb[0]; +}; + #define UCODE_MAX_SIZE (2048) #define DEFAULT_UCODE_DATASIZE (896) #define MC_HEADER_SIZE (sizeof(struct microcode_header_amd)) @@ -189,17 +218,18 @@ static void apply_microcode_amd(int cpu) unsigned int rev; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; + struct microcode_amd *mc_amd = uci->mc; unsigned long addr; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc.mc_amd == NULL) + if (mc_amd == NULL) return; spin_lock_irqsave(µcode_update_lock, flags); - addr = (unsigned long)&uci->mc.mc_amd->hdr.data_code; + addr = (unsigned long)&mc_amd->hdr.data_code; edx = (unsigned int)(((unsigned long)upper_32_bits(addr))); eax = (unsigned int)(((unsigned long)lower_32_bits(addr))); @@ -214,16 +244,16 @@ static void apply_microcode_amd(int cpu) spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ - if (rev != uci->mc.mc_amd->hdr.patch_id) { + if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, - uci->mc.mc_amd->hdr.patch_id, rev); + mc_amd->hdr.patch_id, rev); return; } printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x \n", - cpu_num, uci->cpu_sig.rev, uci->mc.mc_amd->hdr.patch_id); + cpu_num, uci->cpu_sig.rev, mc_amd->hdr.patch_id); uci->cpu_sig.rev = rev; } @@ -355,12 +385,12 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (new_mc) { if (!leftover) { - if (uci->mc.mc_amd) - vfree(uci->mc.mc_amd); - uci->mc.mc_amd = (struct microcode_amd *)new_mc; + if (uci->mc) + vfree(uci->mc); + uci->mc = new_mc; pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, uci->mc.mc_amd->hdr.patch_id, uci->cpu_sig.rev); + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -416,8 +446,8 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc.mc_amd); - uci->mc.mc_amd = NULL; + vfree(uci->mc); + uci->mc = NULL; } static struct microcode_ops microcode_amd_ops = { @@ -428,23 +458,7 @@ static struct microcode_ops microcode_amd_ops = { .microcode_fini_cpu = microcode_fini_cpu_amd, }; -static int __init microcode_amd_module_init(void) +struct microcode_ops * __init init_amd_microcode(void) { - struct cpuinfo_x86 *c = &cpu_data(0); - - equiv_cpu_table = NULL; - if (c->x86_vendor != X86_VENDOR_AMD) { - printk(KERN_ERR "microcode: CPU platform is not AMD-capable\n"); - return -ENODEV; - } - - return microcode_init(µcode_amd_ops, THIS_MODULE); -} - -static void __exit microcode_amd_module_exit(void) -{ - microcode_exit(); + return µcode_amd_ops; } - -module_init(microcode_amd_module_init) -module_exit(microcode_amd_module_exit) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c new file mode 100644 index 000000000000..ff031dbccdf6 --- /dev/null +++ b/arch/x86/kernel/microcode_core.c @@ -0,0 +1,509 @@ +/* + * Intel CPU Microcode Update Driver for Linux + * + * Copyright (C) 2000-2006 Tigran Aivazian + * 2006 Shaohua Li + * + * This driver allows to upgrade microcode on Intel processors + * belonging to IA-32 family - PentiumPro, Pentium II, + * Pentium III, Xeon, Pentium 4, etc. + * + * Reference: Section 8.11 of Volume 3a, IA-32 Intel? Architecture + * Software Developer's Manual + * Order Number 253668 or free download from: + * + * http://developer.intel.com/design/pentium4/manuals/253668.htm + * + * For more information, go to http://www.urbanmyth.org/microcode + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * 1.0 16 Feb 2000, Tigran Aivazian + * Initial release. + * 1.01 18 Feb 2000, Tigran Aivazian + * Added read() support + cleanups. + * 1.02 21 Feb 2000, Tigran Aivazian + * Added 'device trimming' support. open(O_WRONLY) zeroes + * and frees the saved copy of applied microcode. + * 1.03 29 Feb 2000, Tigran Aivazian + * Made to use devfs (/dev/cpu/microcode) + cleanups. + * 1.04 06 Jun 2000, Simon Trimmer + * Added misc device support (now uses both devfs and misc). + * Added MICROCODE_IOCFREE ioctl to clear memory. + * 1.05 09 Jun 2000, Simon Trimmer + * Messages for error cases (non Intel & no suitable microcode). + * 1.06 03 Aug 2000, Tigran Aivazian + * Removed ->release(). Removed exclusive open and status bitmap. + * Added microcode_rwsem to serialize read()/write()/ioctl(). + * Removed global kernel lock usage. + * 1.07 07 Sep 2000, Tigran Aivazian + * Write 0 to 0x8B msr and then cpuid before reading revision, + * so that it works even if there were no update done by the + * BIOS. Otherwise, reading from 0x8B gives junk (which happened + * to be 0 on my machine which is why it worked even when I + * disabled update by the BIOS) + * Thanks to Eric W. Biederman for the fix. + * 1.08 11 Dec 2000, Richard Schaal and + * Tigran Aivazian + * Intel Pentium 4 processor support and bugfixes. + * 1.09 30 Oct 2001, Tigran Aivazian + * Bugfix for HT (Hyper-Threading) enabled processors + * whereby processor resources are shared by all logical processors + * in a single CPU package. + * 1.10 28 Feb 2002 Asit K Mallick and + * Tigran Aivazian , + * Serialize updates as required on HT processors due to + * speculative nature of implementation. + * 1.11 22 Mar 2002 Tigran Aivazian + * Fix the panic when writing zero-length microcode chunk. + * 1.12 29 Sep 2003 Nitin Kamble , + * Jun Nakajima + * Support for the microcode updates in the new format. + * 1.13 10 Oct 2003 Tigran Aivazian + * Removed ->read() method and obsoleted MICROCODE_IOCFREE ioctl + * because we no longer hold a copy of applied microcode + * in kernel memory. + * 1.14 25 Jun 2004 Tigran Aivazian + * Fix sigmatch() macro to handle old CPUs with pf == 0. + * Thanks to Stuart Swales for pointing out this bug. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_DESCRIPTION("Microcode Update Driver"); +MODULE_AUTHOR("Tigran Aivazian "); +MODULE_LICENSE("GPL"); + +#define MICROCODE_VERSION "2.00" + +struct microcode_ops *microcode_ops; + +/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +static DEFINE_MUTEX(microcode_mutex); + +struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; +EXPORT_SYMBOL_GPL(ucode_cpu_info); + +#ifdef CONFIG_MICROCODE_OLD_INTERFACE +static int do_microcode_update(const void __user *buf, size_t size) +{ + cpumask_t old; + int error = 0; + int cpu; + + old = current->cpus_allowed; + + for_each_online_cpu(cpu) { + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!uci->valid) + continue; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + error = microcode_ops->request_microcode_user(cpu, buf, size); + if (error < 0) + goto out; + if (!error) + microcode_ops->apply_microcode(cpu); + } +out: + set_cpus_allowed_ptr(current, &old); + return error; +} + +static int microcode_open(struct inode *unused1, struct file *unused2) +{ + cycle_kernel_lock(); + return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; +} + +static ssize_t microcode_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + ssize_t ret; + + if ((len >> PAGE_SHIFT) > num_physpages) { + printk(KERN_ERR "microcode: too much data (max %ld pages)\n", + num_physpages); + return -EINVAL; + } + + get_online_cpus(); + mutex_lock(µcode_mutex); + + ret = do_microcode_update(buf, len); + if (!ret) + ret = (ssize_t)len; + + mutex_unlock(µcode_mutex); + put_online_cpus(); + + return ret; +} + +static const struct file_operations microcode_fops = { + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, +}; + +static struct miscdevice microcode_dev = { + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, +}; + +static int __init microcode_dev_init(void) +{ + int error; + + error = misc_register(µcode_dev); + if (error) { + printk(KERN_ERR + "microcode: can't misc_register on minor=%d\n", + MICROCODE_MINOR); + return error; + } + + return 0; +} + +static void microcode_dev_exit(void) +{ + misc_deregister(µcode_dev); +} + +MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +#else +#define microcode_dev_init() 0 +#define microcode_dev_exit() do { } while (0) +#endif + +/* fake device for request_firmware */ +struct platform_device *microcode_pdev; + +static ssize_t reload_store(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t sz) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + char *end; + unsigned long val = simple_strtoul(buf, &end, 0); + int err = 0; + int cpu = dev->id; + + if (end == buf) + return -EINVAL; + if (val == 1) { + cpumask_t old = current->cpus_allowed; + + get_online_cpus(); + if (cpu_online(cpu)) { + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + mutex_lock(µcode_mutex); + if (uci->valid) { + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + if (!err) + microcode_ops->apply_microcode(cpu); + } + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); + } + put_online_cpus(); + } + if (err) + return err; + return sz; +} + +static ssize_t version_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); +} + +static ssize_t pf_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; + + return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); +} + +static SYSDEV_ATTR(reload, 0200, NULL, reload_store); +static SYSDEV_ATTR(version, 0400, version_show, NULL); +static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); + +static struct attribute *mc_default_attrs[] = { + &attr_reload.attr, + &attr_version.attr, + &attr_processor_flags.attr, + NULL +}; + +static struct attribute_group mc_attr_group = { + .attrs = mc_default_attrs, + .name = "microcode", +}; + +static void microcode_fini_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + mutex_lock(µcode_mutex); + microcode_ops->microcode_fini_cpu(cpu); + uci->valid = 0; + mutex_unlock(µcode_mutex); +} + +static void collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + memset(uci, 0, sizeof(*uci)); + if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) + uci->valid = 1; +} + +static int microcode_resume_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct cpu_signature nsig; + + pr_debug("microcode: CPU%d resumed\n", cpu); + + if (!uci->mc) + return 1; + + /* + * Let's verify that the 'cached' ucode does belong + * to this cpu (a bit of paranoia): + */ + if (microcode_ops->collect_cpu_info(cpu, &nsig)) { + microcode_fini_cpu(cpu); + return -1; + } + + if (memcmp(&nsig, &uci->cpu_sig, sizeof(nsig))) { + microcode_fini_cpu(cpu); + /* Should we look for a new ucode here? */ + return 1; + } + + return 0; +} + +void microcode_update_cpu(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int err = 0; + + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); + /* + * Check if the system resume is in progress (uci->valid != NULL), + * otherwise just request a firmware: + */ + if (uci->valid) { + err = microcode_resume_cpu(cpu); + } else { + collect_cpu_info(cpu); + if (uci->valid && system_state == SYSTEM_RUNNING) + err = microcode_ops->request_microcode_fw(cpu, + µcode_pdev->dev); + } + + if (!err) + microcode_ops->apply_microcode(cpu); + + mutex_unlock(µcode_mutex); +} + +static void microcode_init_cpu(int cpu) +{ + cpumask_t old = current->cpus_allowed; + + set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + microcode_update_cpu(cpu); + set_cpus_allowed_ptr(current, &old); +} + +static int mc_sysdev_add(struct sys_device *sys_dev) +{ + int err, cpu = sys_dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d added\n", cpu); + memset(uci, 0, sizeof(*uci)); + + err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + if (err) + return err; + + microcode_init_cpu(cpu); + return 0; +} + +static int mc_sysdev_remove(struct sys_device *sys_dev) +{ + int cpu = sys_dev->id; + + if (!cpu_online(cpu)) + return 0; + + pr_debug("microcode: CPU%d removed\n", cpu); + microcode_fini_cpu(cpu); + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + return 0; +} + +static int mc_sysdev_resume(struct sys_device *dev) +{ + int cpu = dev->id; + + if (!cpu_online(cpu)) + return 0; + + /* only CPU 0 will apply ucode here */ + microcode_update_cpu(0); + return 0; +} + +static struct sysdev_driver mc_sysdev_driver = { + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, +}; + +static __cpuinit int +mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +{ + unsigned int cpu = (unsigned long)hcpu; + struct sys_device *sys_dev; + + sys_dev = get_cpu_sysdev(cpu); + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + microcode_init_cpu(cpu); + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + pr_debug("microcode: CPU%d added\n", cpu); + if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + printk(KERN_ERR "microcode: Failed to create the sysfs " + "group for CPU%d\n", cpu); + break; + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + /* Suspend is in progress, only remove the interface */ + sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + pr_debug("microcode: CPU%d removed\n", cpu); + break; + case CPU_DEAD: + case CPU_UP_CANCELED_FROZEN: + /* The CPU refused to come up during a system resume */ + microcode_fini_cpu(cpu); + break; + } + return NOTIFY_OK; +} + +static struct notifier_block __refdata mc_cpu_notifier = { + .notifier_call = mc_cpu_callback, +}; + +static int __init microcode_init(void) +{ + struct cpuinfo_x86 *c = &cpu_data(0); + int error; + + if (c->x86_vendor == X86_VENDOR_INTEL) + microcode_ops = init_intel_microcode(); + else if (c->x86_vendor != X86_VENDOR_AMD) + microcode_ops = init_amd_microcode(); + + if (!microcode_ops) { + printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + return -ENODEV; + } + + error = microcode_dev_init(); + if (error) + return error; + microcode_pdev = platform_device_register_simple("microcode", -1, + NULL, 0); + if (IS_ERR(microcode_pdev)) { + microcode_dev_exit(); + return PTR_ERR(microcode_pdev); + } + + get_online_cpus(); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + if (error) { + microcode_dev_exit(); + platform_device_unregister(microcode_pdev); + return error; + } + + register_hotcpu_notifier(&mc_cpu_notifier); + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION + " " + " \n"); + + return 0; +} + +static void __exit microcode_exit(void) +{ + microcode_dev_exit(); + + unregister_hotcpu_notifier(&mc_cpu_notifier); + + get_online_cpus(); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + put_online_cpus(); + + platform_device_unregister(microcode_pdev); + + microcode_ops = NULL; + + printk(KERN_INFO + "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); +} + +module_init(microcode_init); +module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 48ed3cef58c1..622dc4a21784 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -97,6 +97,38 @@ MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); MODULE_LICENSE("GPL"); +struct microcode_header_intel { + unsigned int hdrver; + unsigned int rev; + unsigned int date; + unsigned int sig; + unsigned int cksum; + unsigned int ldrver; + unsigned int pf; + unsigned int datasize; + unsigned int totalsize; + unsigned int reserved[3]; +}; + +struct microcode_intel { + struct microcode_header_intel hdr; + unsigned int bits[0]; +}; + +/* microcode format is extended from prescott processors */ +struct extended_signature { + unsigned int sig; + unsigned int pf; + unsigned int cksum; +}; + +struct extended_sigtable { + unsigned int count; + unsigned int cksum; + unsigned int reserved[3]; + struct extended_signature sigs[0]; +}; + #define DEFAULT_UCODE_DATASIZE (2000) #define MC_HEADER_SIZE (sizeof(struct microcode_header_intel)) #define DEFAULT_UCODE_TOTALSIZE (DEFAULT_UCODE_DATASIZE + MC_HEADER_SIZE) @@ -284,11 +316,12 @@ static void apply_microcode(int cpu) unsigned int val[2]; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + struct microcode_intel *mc_intel = uci->mc; /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); - if (uci->mc.mc_intel == NULL) + if (mc_intel == NULL) return; /* serialize access to the physical write to MSR 0x79 */ @@ -296,8 +329,8 @@ static void apply_microcode(int cpu) /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, - (unsigned long) uci->mc.mc_intel->bits, - (unsigned long) uci->mc.mc_intel->bits >> 16 >> 16); + (unsigned long) mc_intel->bits, + (unsigned long) mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ @@ -307,7 +340,7 @@ static void apply_microcode(int cpu) rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); spin_unlock_irqrestore(µcode_update_lock, flags); - if (val[1] != uci->mc.mc_intel->hdr.rev) { + if (val[1] != mc_intel->hdr.rev) { printk(KERN_ERR "microcode: CPU%d update from revision " "0x%x to 0x%x failed\n", cpu_num, uci->cpu_sig.rev, val[1]); return; @@ -315,9 +348,9 @@ static void apply_microcode(int cpu) printk(KERN_INFO "microcode: CPU%d updated from revision " "0x%x to 0x%x, date = %04x-%02x-%02x \n", cpu_num, uci->cpu_sig.rev, val[1], - uci->mc.mc_intel->hdr.date & 0xffff, - uci->mc.mc_intel->hdr.date >> 24, - (uci->mc.mc_intel->hdr.date >> 16) & 0xff); + mc_intel->hdr.date & 0xffff, + mc_intel->hdr.date >> 24, + (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; } @@ -367,12 +400,12 @@ static int generic_load_microcode(int cpu, void *data, size_t size, if (new_mc) { if (!leftover) { - if (uci->mc.mc_intel) - vfree(uci->mc.mc_intel); - uci->mc.mc_intel = (struct microcode_intel *)new_mc; + if (uci->mc) + vfree(uci->mc); + uci->mc = (struct microcode_intel *)new_mc; pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", - cpu, uci->mc.mc_intel->hdr.rev, uci->cpu_sig.rev); + cpu, new_rev, uci->cpu_sig.rev); } else vfree(new_mc); } @@ -428,11 +461,11 @@ static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc.mc_intel); - uci->mc.mc_intel = NULL; + vfree(uci->mc); + uci->mc = NULL; } -static struct microcode_ops microcode_intel_ops = { +struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, @@ -440,22 +473,8 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; -static int __init microcode_intel_module_init(void) -{ - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor != X86_VENDOR_INTEL) { - printk(KERN_ERR "microcode: CPU platform is not Intel-capable\n"); - return -ENODEV; - } - - return microcode_init(µcode_intel_ops, THIS_MODULE); -} - -static void __exit microcode_intel_module_exit(void) +struct microcode_ops * __init init_intel_microcode(void) { - microcode_exit(); + return µcode_intel_ops; } -module_init(microcode_intel_module_init) -module_exit(microcode_intel_module_exit) -- cgit v1.2.2 From da654b74bda14c45a7d98c731bf3c1a43b6b74e2 Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Tue, 23 Sep 2008 15:23:52 +0530 Subject: signals: demultiplexing SIGTRAP signal Currently a SIGTRAP can denote any one of below reasons. - Breakpoint hit - H/W debug register hit - Single step - Signal sent through kill() or rasie() Architectures like powerpc/parisc provides infrastructure to demultiplex SIGTRAP signal by passing down the information for receiving SIGTRAP through si_code of siginfot_t structure. Here is an attempt is generalise this infrastructure by extending it to x86 and x86_64 archs. Signed-off-by: Srinivasa DS Cc: Roland McGrath Cc: akpm@linux-foundation.org Cc: paulus@samba.org Cc: linuxppc-dev@ozlabs.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 7 ++++--- arch/x86/kernel/traps_32.c | 4 +++- arch/x86/kernel/traps_64.c | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9e43a48ad6e0..bf45cdf1aaca 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1358,7 +1358,8 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task) #endif } -void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) +void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, + int error_code, int si_code) { struct siginfo info; @@ -1367,7 +1368,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, int error_code) memset(&info, 0, sizeof(info)); info.si_signo = SIGTRAP; - info.si_code = TRAP_BRKPT; + info.si_code = si_code; /* User-mode ip? */ info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; @@ -1454,5 +1455,5 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs) */ if (test_thread_flag(TIF_SINGLESTEP) && tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL)) - send_sigtrap(current, regs, 0); + send_sigtrap(current, regs, 0, TRAP_BRKPT); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index da5a5964fccb..0429c5de5ea9 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -891,6 +891,7 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned int condition; + int si_code; trace_hardirqs_fixup(); @@ -935,8 +936,9 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_TF_reenable; } + si_code = get_si_code((unsigned long)condition); /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code); + send_sigtrap(tsk, regs, error_code, si_code); /* * Disable additional traps. They'll be re-enabled when diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 56d6f1147785..011d8e1fac6e 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -941,7 +941,7 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs, tsk->thread.error_code = error_code; info.si_signo = SIGTRAP; info.si_errno = 0; - info.si_code = TRAP_BRKPT; + info.si_code = get_si_code(condition); info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; force_sig_info(SIGTRAP, &info, tsk); -- cgit v1.2.2 From b6cffde1a20409f9720d5c9aba28d3efd3a4f04e Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 17 Sep 2008 15:05:52 +0200 Subject: x86, microcode rework, v2, renaming Renaming based on patch from Dmitry Adamushko. Made code more readable by renaming define and variables related to microcode _container_file_ header to make it distinguishable from microcode _patch_ header. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 03ea4e52e87a..62058e9c8b4e 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -304,12 +304,12 @@ static void * get_next_ucode(u8 *buf, unsigned int size, static int install_equiv_cpu_table(u8 *buf, int (*get_ucode_data)(void *, const void *, size_t)) { -#define UCODE_HEADER_SIZE 12 - u8 *hdr[UCODE_HEADER_SIZE]; - unsigned int *buf_pos = (unsigned int *)hdr; +#define UCODE_CONTAINER_HEADER_SIZE 12 + u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; + unsigned int *buf_pos = (unsigned int *)container_hdr; unsigned long size; - if (get_ucode_data(&hdr, buf, UCODE_HEADER_SIZE)) + if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) return 0; size = buf_pos[2]; @@ -326,14 +326,14 @@ static int install_equiv_cpu_table(u8 *buf, return 0; } - buf += UCODE_HEADER_SIZE; + buf += UCODE_CONTAINER_HEADER_SIZE; if (get_ucode_data(equiv_cpu_table, buf, size)) { vfree(equiv_cpu_table); return 0; } - return size + UCODE_HEADER_SIZE; /* add header length */ -#undef UCODE_HEADER_SIZE + return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ +#undef UCODE_CONTAINER_HEADER_SIZE } static void free_equiv_cpu_table(void) -- cgit v1.2.2 From d4738792fb86600b6cb7220459d9c47e819b3580 Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 17 Sep 2008 15:39:18 +0200 Subject: x86, microcode rework, v2, renaming cont. Renaming based on patch from Dmitry Adamushko. Further clarification by renaming define and variable related to microcode container file. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 62058e9c8b4e..829415208ff8 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -263,21 +263,20 @@ static void * get_next_ucode(u8 *buf, unsigned int size, unsigned int *mc_size) { unsigned int total_size; -#define UCODE_UNKNOWN_HDR 8 - u8 hdr[UCODE_UNKNOWN_HDR]; +#define UCODE_CONTAINER_SECTION_HDR 8 + u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; void *mc; - if (get_ucode_data(hdr, buf, UCODE_UNKNOWN_HDR)) + if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) return NULL; - if (hdr[0] != UCODE_UCODE_TYPE) { + if (section_hdr[0] != UCODE_UCODE_TYPE) { printk(KERN_ERR "microcode: error! " "Wrong microcode payload type field\n"); return NULL; } - /* FIXME! dimm: Why not by means of get_totalsize(hdr)? */ - total_size = (unsigned long) (hdr[4] + (hdr[5] << 8)); + total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); printk(KERN_INFO "microcode: size %u, total_size %u\n", size, total_size); @@ -290,13 +289,13 @@ static void * get_next_ucode(u8 *buf, unsigned int size, mc = vmalloc(UCODE_MAX_SIZE); if (mc) { memset(mc, 0, UCODE_MAX_SIZE); - if (get_ucode_data(mc, buf + UCODE_UNKNOWN_HDR, total_size)) { + if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, total_size)) { vfree(mc); mc = NULL; } else - *mc_size = total_size + UCODE_UNKNOWN_HDR; + *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR; } -#undef UCODE_UNKNOWN_HDR +#undef UCODE_CONTAINER_SECTION_HDR return mc; } -- cgit v1.2.2 From 1eda81495a49a4ee91d8863b0a441a624375efea Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Tue, 23 Sep 2008 22:40:02 -0400 Subject: x86: prevent stale state of c1e_mask across CPU offline/online, fix Fix build error introduced by commit 4faac97d44ac27 ("x86: prevent stale state of c1e_mask across CPU offline/online"). process_32.c needs to include idle.h to get the prototype for c1e_remove_cpu() Signed-off-by: Marc Dionne Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4b3cfdf54216..31f40b24bf5d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,6 +55,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -- cgit v1.2.2 From 77a9a768b7374cd23d1f400097eede9f1547f508 Mon Sep 17 00:00:00 2001 From: Jeremy Katz Date: Tue, 23 Sep 2008 21:54:00 -0400 Subject: x86: disable apm on the olpc The OLPC doesn't support APM but also doesn't have DMI, so we can't detect and disable it based on DMI data. So, just disable based on machine_is_olpc() Signed-off-by: Jeremy Katz Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 9ee24e6bc4b0..732d1f4e10ee 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -234,6 +234,7 @@ #include #include #include +#include #include #include @@ -2217,7 +2218,7 @@ static int __init apm_init(void) dmi_check_system(apm_dmi_table); - if (apm_info.bios.version == 0 || paravirt_enabled()) { + if (apm_info.bios.version == 0 || paravirt_enabled() || machine_is_olpc()) { printk(KERN_INFO "apm: BIOS not found.\n"); return -ENODEV; } -- cgit v1.2.2 From 5fd933303bd1efacbd0acbe452ba9b889440eb40 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:19:44 -0700 Subject: x86: signal: cosmetic unification of do_signal() Make do_signal() same. Thia patch modifies only comments in signal_64.c. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index bf77d4789a2d..5a5fbc3b1eea 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -410,7 +410,8 @@ static void do_signal(struct pt_regs *regs) signr = get_signal_to_deliver(&info, &ka, regs, NULL); if (signr > 0) { - /* Re-enable any watchpoints before delivering the + /* + * Re-enable any watchpoints before delivering the * signal to user space. The processor register will * have been cleared if the watchpoint triggered * inside the kernel. @@ -418,7 +419,7 @@ static void do_signal(struct pt_regs *regs) if (current->thread.debugreg7) set_debugreg(current->thread.debugreg7, 7); - /* Whee! Actually deliver the signal. */ + /* Whee! Actually deliver the signal. */ if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { /* * A signal was successfully delivered; the saved @@ -441,6 +442,7 @@ static void do_signal(struct pt_regs *regs) regs->ax = regs->orig_ax; regs->ip -= 2; break; + case -ERESTART_RESTARTBLOCK: regs->ax = NR_restart_syscall; regs->ip -= 2; -- cgit v1.2.2 From ee847c54ba7fc09f85f13a5bf18f45ea6c19aa83 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:21:45 -0700 Subject: x86: signal: cosmetic unification of do_notify_resume() Make do_notify_resume() same. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 8 ++++++++ arch/x86/kernel/signal_64.c | 16 ++++++++++++---- 2 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index da3cf3270f83..b94463f264b4 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -663,6 +663,12 @@ static void do_signal(struct pt_regs *regs) void do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs); @@ -672,7 +678,9 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } +#ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 5a5fbc3b1eea..9087752f4109 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -460,14 +460,18 @@ static void do_signal(struct pt_regs *regs) } } -void do_notify_resume(struct pt_regs *regs, void *unused, - __u32 thread_info_flags) +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) { -#ifdef CONFIG_X86_MCE +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) /* notify userspace of pending MCEs */ if (thread_info_flags & _TIF_MCE_NOTIFY) mce_notify_user(); -#endif /* CONFIG_X86_MCE */ +#endif /* CONFIG_X86_64 && CONFIG_X86_MCE */ /* deal with pending signal delivery */ if (thread_info_flags & _TIF_SIGPENDING) @@ -477,6 +481,10 @@ void do_notify_resume(struct pt_regs *regs, void *unused, clear_thread_flag(TIF_NOTIFY_RESUME); tracehook_notify_resume(regs); } + +#ifdef CONFIG_X86_32 + clear_thread_flag(TIF_IRET); +#endif /* CONFIG_X86_32 */ } void signal_fault(struct pt_regs *regs, void __user *frame, char *where) -- cgit v1.2.2 From 86d3237cd1a09136b4fb3a1d73d3c3fd6331cb14 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 23 Sep 2008 17:22:32 -0700 Subject: x86: signal: cosmetic unification of handle_signal() Make handle_signal() same. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 9 +++++++++ arch/x86/kernel/signal_64.c | 2 ++ 2 files changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b94463f264b4..bb05917f232c 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -550,6 +550,15 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; +#ifdef CONFIG_X86_64 + /* + * This has nothing to do with segment registers, + * despite the name. This magic affects uaccess.h + * macros' behavior. Reset it to the normal setting. + */ + set_fs(USER_DS); +#endif + /* * Clear the direction flag as per the ABI for function entry. */ diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 9087752f4109..963236f2c3c1 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -346,12 +346,14 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, if (ret) return ret; +#ifdef CONFIG_X86_64 /* * This has nothing to do with segment registers, * despite the name. This magic affects uaccess.h * macros' behavior. Reset it to the normal setting. */ set_fs(USER_DS); +#endif /* * Clear the direction flag as per the ABI for function entry. -- cgit v1.2.2 From 493cd9122af5bd0b219974a48f0e31da0c29ff7e Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 14:56:44 -0700 Subject: x86: ds.c ptrace.c integer as NULL pointer sparse fixes fix: arch/x86/kernel/ptrace.c:763:29: warning: Using plain integer as NULL pointer arch/x86/kernel/ptrace.c:777:46: warning: Using plain integer as NULL pointer arch/x86/kernel/ptrace.c:1115:45: warning: Using plain integer as NULL pointer arch/x86/kernel/ds.c:482:26: warning: Using plain integer as NULL pointer arch/x86/kernel/ds.c:487:25: warning: Using plain integer as NULL pointer Signed-off-by: Harvey Harrison Acked-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 4 ++-- arch/x86/kernel/ptrace.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ab21c270bfa4..2b69994fd3a8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -479,12 +479,12 @@ static int ds_release(struct task_struct *task, enum ds_qualifier qual) goto out; kfree(context->buffer[qual]); - context->buffer[qual] = 0; + context->buffer[qual] = NULL; current->mm->total_vm -= context->pages[qual]; current->mm->locked_vm -= context->pages[qual]; context->pages[qual] = 0; - context->owner[qual] = 0; + context->owner[qual] = NULL; /* * we put the context twice: diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index ba19bb49bd09..5df6093ac776 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -734,7 +734,7 @@ static int ptrace_bts_config(struct task_struct *child, goto errout; if (cfg.flags & PTRACE_BTS_O_ALLOC) { - ds_ovfl_callback_t ovfl = 0; + ds_ovfl_callback_t ovfl = NULL; unsigned int sig = 0; /* we ignore the error in case we were not tracing child */ @@ -748,7 +748,7 @@ static int ptrace_bts_config(struct task_struct *child, ovfl = ptrace_bts_ovfl; } - error = ds_request_bts(child, /* base = */ 0, cfg.size, ovfl); + error = ds_request_bts(child, /* base = */ NULL, cfg.size, ovfl); if (error < 0) goto errout; @@ -1086,7 +1086,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; case PTRACE_BTS_SIZE: - ret = ds_get_bts_index(child, /* pos = */ 0); + ret = ds_get_bts_index(child, /* pos = */ NULL); break; case PTRACE_BTS_GET: -- cgit v1.2.2 From e51a1ac2dfca9ad869471e88f828281db7e810c0 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 15:20:09 -0700 Subject: x86, olpc: fix endian bug in openfirmware workaround Boardrev is always treated as a u32 everywhere else, no reason to byteswap the 0xc2 value. The only use is to print out if it is a prerelease board, the test being: (olpc_platform_info.boardrev & 0xf) < 8 Which is currently always true as be32_to_cpu(0xc2) & 0xf = 0 but I doubt that was the intention here. The consequences of the bug are pretty minor though (incorrect boardrev displayed in dmesg when ofw support not configured) Also annotate the temporary used to read the boardrev in the ofw case. The confusion was noticed by Sparse: arch/x86/kernel/olpc.c:206:32: warning: cast to restricted __be32 Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/olpc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 3e6672274807..7a13fac63a1f 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -190,12 +190,12 @@ EXPORT_SYMBOL_GPL(olpc_ec_cmd); static void __init platform_detect(void) { size_t propsize; - u32 rev; + __be32 rev; if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, &propsize) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); - rev = 0; + rev = cpu_to_be32(0); } olpc_platform_info.boardrev = be32_to_cpu(rev); } @@ -203,7 +203,7 @@ static void __init platform_detect(void) static void __init platform_detect(void) { /* stopgap until OFW support is added to the kernel */ - olpc_platform_info.boardrev = be32_to_cpu(0xc2); + olpc_platform_info.boardrev = 0xc2; } #endif -- cgit v1.2.2 From 2f9284e4e3be7b0b4d8d0638f9805603069a762d Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Tue, 23 Sep 2008 22:56:35 +0200 Subject: x86, microcode_amd: cleanup, mark request_microcode_user() as unsupported (1) mark mc_size in generic_load_microcode() as unitialized_var to avoid gcc's (false) warning; (2) mark request_microcode_user() as unsupported. The required changes can be added later. Note, we don't break any user-space interfaces here, as there were no kernels with support for AMD-specific ucode update yet. The ucode has to be updated via 'firmware'. Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_amd.c | 36 ++++-------------------------------- 1 file changed, 4 insertions(+), 32 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 829415208ff8..7a1f8eeac2c7 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -120,29 +120,7 @@ static int get_matching_microcode(int cpu, void *mc, int rev) unsigned int equiv_cpu_id = 0x00; unsigned int i = 0; - /* - * FIXME! dimm: do we need this? Why an update via /dev/... is different - * from the one via firmware? - * - * This is a tricky part. We might be called from a write operation - * to the device file instead of the usual process of firmware - * loading. This routine needs to be able to distinguish both - * cases. This is done by checking if there alread is a equivalent - * CPU table installed. If not, we're written through - * /dev/cpu/microcode. - * Since we ignore all checks. The error case in which going through - * firmware loading and that table is not loaded has already been - * checked earlier. - */ BUG_ON(equiv_cpu_table == NULL); -#if 0 - if (equiv_cpu_table == NULL) { - printk(KERN_INFO "microcode: CPU%d microcode update with " - "version 0x%x (current=0x%x)\n", - cpu, mc_header->patch_id, uci->cpu_sig.rev); - goto out; - } -#endif current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { @@ -362,7 +340,7 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover = size - offset; while (leftover) { - unsigned int mc_size; + unsigned int uninitialized_var(mc_size); struct microcode_header_amd *mc_header; mc = get_next_ucode(ucode_ptr, leftover, get_ucode_data, &mc_size); @@ -428,17 +406,11 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int get_ucode_user(void *to, const void *from, size_t n) -{ - return copy_from_user(to, from, n); -} - static int request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - return generic_load_microcode(cpu, (void*)buf, size, &get_ucode_user); + printk(KERN_WARNING "microcode: AMD microcode update via /dev/cpu/microcode" + "is not supported\n"); + return -1; } static void microcode_fini_cpu_amd(int cpu) -- cgit v1.2.2 From 82b078659ed04e1ecdebf8326e189cf76ed361af Mon Sep 17 00:00:00 2001 From: Peter Oruba Date: Wed, 24 Sep 2008 11:50:35 +0200 Subject: x86: microcode patch loader bugfix Corrected CPU vendor check condition for AMD microcode patch loader initialization. Signed-off-by: Peter Oruba Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index ff031dbccdf6..8db2eb55e9e2 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -450,7 +450,7 @@ static int __init microcode_init(void) if (c->x86_vendor == X86_VENDOR_INTEL) microcode_ops = init_intel_microcode(); - else if (c->x86_vendor != X86_VENDOR_AMD) + else if (c->x86_vendor == X86_VENDOR_AMD) microcode_ops = init_amd_microcode(); if (!microcode_ops) { -- cgit v1.2.2 From 4c168eaf7ea39f25a45a3d8c7eebc3fedb633a1d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 24 Sep 2008 11:08:52 +0200 Subject: Revert "Oprofile Multiplexing Patch" Reverting commit 1a960b402a51d80abf54e3f8e4972374ffe5f22d for the main branch. Multiplexing will be tracked on a separate feature branch. Conflicts: arch/x86/oprofile/nmi_int.c --- arch/x86/oprofile/nmi_int.c | 100 +++----------------------------------- arch/x86/oprofile/op_counter.h | 3 +- arch/x86/oprofile/op_model_amd.c | 76 ++++++++++++----------------- arch/x86/oprofile/op_model_p4.c | 4 -- arch/x86/oprofile/op_model_ppro.c | 2 - arch/x86/oprofile/op_x86_model.h | 3 -- 6 files changed, 39 insertions(+), 149 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 4108d02c5292..287513a09819 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -23,18 +23,12 @@ #include "op_counter.h" #include "op_x86_model.h" -DEFINE_PER_CPU(int, switch_index); - static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); static int nmi_start(void); static void nmi_stop(void); -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs); -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs); -static void nmi_cpu_stop(void *dummy); -static void nmi_cpu_start(void *dummy); /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; @@ -87,47 +81,6 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ -static void nmi_cpu_switch(void *dummy) -{ - int cpu = smp_processor_id(); - int si = per_cpu(switch_index, cpu); - struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); - - nmi_cpu_stop(NULL); - nmi_cpu_save_mpx_registers(msrs); - - /* move to next set */ - si += model->num_hardware_counters; - if ((si > model->num_counters) || (counter_config[si].count == 0)) - per_cpu(switch_index, smp_processor_id()) = 0; - else - per_cpu(switch_index, smp_processor_id()) = si; - - nmi_cpu_restore_mpx_registers(msrs); - model->setup_ctrs(msrs); - nmi_cpu_start(NULL); -} - -/* - * Quick check to see if multiplexing is necessary. - * The check should be sufficient since counters are used - * in ordre. - */ -static int nmi_multiplex_on(void) -{ - return counter_config[model->num_hardware_counters].count ? 0 : -EINVAL; -} - -static int nmi_switch_event(void) -{ - if (nmi_multiplex_on() < 0) - return -EINVAL; - - on_each_cpu(nmi_cpu_switch, NULL, 1); - - return 0; -} - static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -191,10 +144,11 @@ static void free_msrs(void) static int allocate_msrs(void) { - int i, success = 1; + int success = 1; size_t controls_size = sizeof(struct op_msr) * model->num_controls; size_t counters_size = sizeof(struct op_msr) * model->num_counters; + int i; for_each_possible_cpu(i) { per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, GFP_KERNEL); @@ -202,8 +156,8 @@ static int allocate_msrs(void) success = 0; break; } - per_cpu(cpu_msrs, i).controls = - kmalloc(controls_size, GFP_KERNEL); + per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, + GFP_KERNEL); if (!per_cpu(cpu_msrs, i).controls) { success = 0; break; @@ -247,8 +201,7 @@ static int nmi_setup(void) return err; } - /* - * We need to serialize save and setup for HT because the subset + /* We need to serialize save and setup for HT because the subset * of msrs are distinct for save and setup operations */ @@ -264,6 +217,7 @@ static int nmi_setup(void) per_cpu(cpu_msrs, 0).controls, sizeof(struct op_msr) * model->num_controls); } + } on_each_cpu(nmi_save_registers, NULL, 1); on_each_cpu(nmi_cpu_setup, NULL, 1); @@ -271,41 +225,7 @@ static int nmi_setup(void) return 0; } -static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) -{ - unsigned int si = __get_cpu_var(switch_index); - unsigned int const nr_ctrs = model->num_hardware_counters; - struct op_msr *counters = &msrs->counters[si]; - unsigned int i; - - for (i = 0; i < nr_ctrs; ++i) { - int offset = i + si; - if (counters[offset].addr) { - rdmsr(counters[offset].addr, - counters[offset].multiplex.low, - counters[offset].multiplex.high); - } - } -} - -static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) -{ - unsigned int si = __get_cpu_var(switch_index); - unsigned int const nr_ctrs = model->num_hardware_counters; - struct op_msr *counters = &msrs->counters[si]; - unsigned int i; - - for (i = 0; i < nr_ctrs; ++i) { - int offset = i + si; - if (counters[offset].addr) { - wrmsr(counters[offset].addr, - counters[offset].multiplex.low, - counters[offset].multiplex.high); - } - } -} - -static void nmi_cpu_restore_registers(struct op_msrs *msrs) +static void nmi_restore_registers(struct op_msrs *msrs) { unsigned int const nr_ctrs = model->num_counters; unsigned int const nr_ctrls = model->num_controls; @@ -345,8 +265,7 @@ static void nmi_cpu_shutdown(void *dummy) apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); apic_write(APIC_LVTPC, per_cpu(saved_lvtpc, cpu)); apic_write(APIC_LVTERR, v); - nmi_cpu_restore_registers(msrs); - __get_cpu_var(switch_index) = 0; + nmi_restore_registers(msrs); } static void nmi_shutdown(void) @@ -409,7 +328,6 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); - counter_config[i].save_count_low = 0; } return 0; @@ -551,14 +469,12 @@ int __init op_nmi_init(struct oprofile_operations *ops) } /* default values, can be overwritten by model */ - __raw_get_cpu_var(switch_index) = 0; ops->create_files = nmi_create_files; ops->setup = nmi_setup; ops->shutdown = nmi_shutdown; ops->start = nmi_start; ops->stop = nmi_stop; ops->cpu_type = cpu_type; - ops->switch_events = nmi_switch_event; if (model->init) ret = model->init(ops); diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 786d6e01cf7f..2880b15c4675 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -10,14 +10,13 @@ #ifndef OP_COUNTER_H #define OP_COUNTER_H -#define OP_MAX_COUNTER 32 +#define OP_MAX_COUNTER 8 /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { unsigned long count; - unsigned long save_count_low; unsigned long enabled; unsigned long event; unsigned long kernel; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index bbf2b68bcc5d..d9faf607b3a6 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include @@ -24,10 +23,8 @@ #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 32 -#define NUM_HARDWARE_COUNTERS 4 -#define NUM_CONTROLS 32 -#define NUM_HARDWARE_CONTROLS 4 +#define NUM_COUNTERS 4 +#define NUM_CONTROLS 4 #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) @@ -51,7 +48,6 @@ #define CTRL_SET_GUEST_ONLY(val, h) (val |= ((h & 1) << 8)) static unsigned long reset_value[NUM_COUNTERS]; -DECLARE_PER_CPU(int, switch_index); #ifdef CONFIG_OPROFILE_IBS @@ -134,17 +130,15 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs) int i; for (i = 0; i < NUM_COUNTERS; i++) { - int hw_counter = i % NUM_HARDWARE_COUNTERS; - if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + hw_counter)) - msrs->counters[i].addr = MSR_K7_PERFCTR0 + hw_counter; + if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) + msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; else msrs->counters[i].addr = 0; } for (i = 0; i < NUM_CONTROLS; i++) { - int hw_control = i % NUM_HARDWARE_CONTROLS; - if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + hw_control)) - msrs->controls[i].addr = MSR_K7_EVNTSEL0 + hw_control; + if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) + msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; else msrs->controls[i].addr = 0; } @@ -156,16 +150,8 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_HARDWARE_CONTROLS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (counter_config[offset].enabled) - reset_value[offset] = counter_config[offset].count; - else - reset_value[offset] = 0; - } - /* clear all counters */ - for (i = 0 ; i < NUM_HARDWARE_CONTROLS; ++i) { + for (i = 0 ; i < NUM_CONTROLS; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -175,31 +161,34 @@ static void op_amd_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { + for (i = 0; i < NUM_COUNTERS; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; CTR_WRITE(1, msrs, i); } /* enable active counters */ - for (i = 0; i < NUM_HARDWARE_COUNTERS; ++i) { - int offset = i + __get_cpu_var(switch_index); - if ((counter_config[offset].enabled) && (CTR_IS_RESERVED(msrs, i))) { - CTR_WRITE(counter_config[offset].count, msrs, i); + for (i = 0; i < NUM_COUNTERS; ++i) { + if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { + reset_value[i] = counter_config[i].count; + + CTR_WRITE(counter_config[i].count, msrs, i); CTRL_READ(low, high, msrs, i); CTRL_CLEAR_LO(low); CTRL_CLEAR_HI(high); CTRL_SET_ENABLE(low); - CTRL_SET_USR(low, counter_config[offset].user); - CTRL_SET_KERN(low, counter_config[offset].kernel); - CTRL_SET_UM(low, counter_config[offset].unit_mask); - CTRL_SET_EVENT_LOW(low, counter_config[offset].event); - CTRL_SET_EVENT_HIGH(high, counter_config[offset].event); + CTRL_SET_USR(low, counter_config[i].user); + CTRL_SET_KERN(low, counter_config[i].kernel); + CTRL_SET_UM(low, counter_config[i].unit_mask); + CTRL_SET_EVENT_LOW(low, counter_config[i].event); + CTRL_SET_EVENT_HIGH(high, counter_config[i].event); CTRL_SET_HOST_ONLY(high, 0); CTRL_SET_GUEST_ONLY(high, 0); CTRL_WRITE(low, high, msrs, i); + } else { + reset_value[i] = 0; } } } @@ -287,14 +276,13 @@ static int op_amd_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (!reset_value[offset]) + for (i = 0 ; i < NUM_COUNTERS; ++i) { + if (!reset_value[i]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { - oprofile_add_sample(regs, offset); - CTR_WRITE(reset_value[offset], msrs, i); + oprofile_add_sample(regs, i); + CTR_WRITE(reset_value[i], msrs, i); } } @@ -310,10 +298,8 @@ static void op_amd_start(struct op_msrs const * const msrs) { unsigned int low, high; int i; - - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - int offset = i + __get_cpu_var(switch_index); - if (reset_value[offset]) { + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (reset_value[i]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); CTRL_WRITE(low, high, msrs, i); @@ -343,8 +329,8 @@ static void op_amd_stop(struct op_msrs const * const msrs) /* Subtle: stop on all counters to avoid race with * setting our pm callback */ - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { - if (!reset_value[i + per_cpu(switch_index, smp_processor_id())]) + for (i = 0 ; i < NUM_COUNTERS ; ++i) { + if (!reset_value[i]) continue; CTRL_READ(low, high, msrs, i); CTRL_SET_INACTIVE(low); @@ -370,11 +356,11 @@ static void op_amd_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_COUNTERS ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_K7_PERFCTR0 + i); } - for (i = 0 ; i < NUM_HARDWARE_COUNTERS ; ++i) { + for (i = 0 ; i < NUM_CONTROLS ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); } @@ -548,8 +534,6 @@ struct op_x86_model_spec const op_amd_spec = { .exit = op_amd_exit, .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .num_hardware_counters = NUM_HARDWARE_COUNTERS, - .num_hardware_controls = NUM_HARDWARE_CONTROLS, .fill_in_addresses = &op_amd_fill_in_addresses, .setup_ctrs = &op_amd_setup_ctrs, .check_ctrs = &op_amd_check_ctrs, diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index cacba61ffbac..43ac5af338d8 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -700,8 +700,6 @@ static void p4_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_p4_ht2_spec = { .num_counters = NUM_COUNTERS_HT2, .num_controls = NUM_CONTROLS_HT2, - .num_hardware_counters = NUM_COUNTERS_HT2, - .num_hardware_controls = NUM_CONTROLS_HT2, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, @@ -714,8 +712,6 @@ struct op_x86_model_spec const op_p4_ht2_spec = { struct op_x86_model_spec const op_p4_spec = { .num_counters = NUM_COUNTERS_NON_HT, .num_controls = NUM_CONTROLS_NON_HT, - .num_hardware_counters = NUM_COUNTERS_NON_HT, - .num_hardware_controls = NUM_CONTROLS_NON_HT, .fill_in_addresses = &p4_fill_in_addresses, .setup_ctrs = &p4_setup_ctrs, .check_ctrs = &p4_check_ctrs, diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index e5811aa480eb..eff431f6c57b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -183,8 +183,6 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { .num_counters = NUM_COUNTERS, .num_controls = NUM_CONTROLS, - .num_hardware_counters = NUM_COUNTERS, - .num_hardware_controls = NUM_CONTROLS, .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index e07ba1076371..05a0261ba0c3 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -19,7 +19,6 @@ struct op_saved_msr { struct op_msr { unsigned long addr; struct op_saved_msr saved; - struct op_saved_msr multiplex; }; struct op_msrs { @@ -35,8 +34,6 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); - unsigned int const num_hardware_counters; - unsigned int const num_hardware_controls; unsigned int const num_counters; unsigned int const num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); -- cgit v1.2.2 From 8d8c13bdb53977319593d9efc4971a4cacc8bd03 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:10:29 -0700 Subject: x86: signal_32.c: introduce signr_convert() Introduce signr_convert(). This function will help unification of setup_rt_frame(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index bb05917f232c..b1bc90f19b9a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -482,18 +482,21 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler: */ +static int signr_convert(int sig) +{ + struct thread_info *info = current_thread_info(); + + if (info->exec_domain && info->exec_domain->signal_invmap && sig < 32) + return info->exec_domain->signal_invmap[sig]; + return sig; +} + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { + int usig = signr_convert(sig); int ret; - int usig; - - usig = current_thread_info()->exec_domain - && current_thread_info()->exec_domain->signal_invmap - && sig < 32 - ? current_thread_info()->exec_domain->signal_invmap[sig] - : sig; /* Set up the stack frame */ if (ka->sa.sa_flags & SA_SIGINFO) -- cgit v1.2.2 From b94fd69827b2104a2a4d9d0bc05a8e908a937ae8 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:12:54 -0700 Subject: x86: signal_64.c: introduce helper function signr_convert() This helper function is for unification of setup_rt_frame(). No effect in binary. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 963236f2c3c1..c5d80024a8f3 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -281,18 +281,24 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* * OK, we're invoking a handler */ +static int signr_convert(int sig) +{ + return sig; +} + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) { + int usig = signr_convert(sig); int ret; #ifdef CONFIG_IA32_EMULATION if (test_thread_flag(TIF_IA32)) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = ia32_setup_rt_frame(sig, ka, info, set, regs); + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = ia32_setup_frame(sig, ka, set, regs); + ret = ia32_setup_frame(usig, ka, set, regs); } else #endif ret = __setup_rt_frame(sig, ka, info, set, regs); -- cgit v1.2.2 From 455edbc423db282bb64dec2d7c8968498ea8e619 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:13:11 -0700 Subject: x86: signal: introduce helper macro is_ia32 Introduce new macro is_ia32 for unification of setup_rt_frame(). No effect in binary, compiler will optimize. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 13 +++++++++---- arch/x86/kernel/signal_64.c | 13 +++++++++---- 2 files changed, 18 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index b1bc90f19b9a..cf62f70cc2a6 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -491,6 +491,8 @@ static int signr_convert(int sig) return sig; } +#define is_ia32 1 + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) @@ -499,10 +501,13 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int ret; /* Set up the stack frame */ - if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(usig, ka, info, set, regs); - else - ret = __setup_frame(usig, ka, set, regs); + if (is_ia32) { + if (ka->sa.sa_flags & SA_SIGINFO) + ret = __setup_rt_frame(usig, ka, info, set, regs); + else + ret = __setup_frame(usig, ka, set, regs); + } else + ret = __setup_rt_frame(sig, ka, info, set, regs); if (ret) { force_sigsegv(sig, current); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index c5d80024a8f3..53f86d95985b 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -286,6 +286,12 @@ static int signr_convert(int sig) return sig; } +#ifdef CONFIG_IA32_EMULATION +#define is_ia32 test_thread_flag(TIF_IA32) +#else +#define is_ia32 0 +#endif + static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, sigset_t *set, struct pt_regs *regs) @@ -293,15 +299,14 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, int usig = signr_convert(sig); int ret; -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) { + /* Set up the stack frame */ + if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else ret = ia32_setup_frame(usig, ka, set, regs); } else -#endif - ret = __setup_rt_frame(sig, ka, info, set, regs); + ret = __setup_rt_frame(sig, ka, info, set, regs); if (ret) { force_sigsegv(sig, current); -- cgit v1.2.2 From 4694d2391221e14fe671602fe34ea7f24f5a561f Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Wed, 24 Sep 2008 19:13:29 -0700 Subject: x86: signal_32.c: introduce macro ia32_setup_frame and ia32_setup_rt_frame Make 32-bit setup_rt_frame() look like 64-bit version for unification. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index cf62f70cc2a6..4337cd510f0a 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -492,6 +492,8 @@ static int signr_convert(int sig) } #define is_ia32 1 +#define ia32_setup_frame __setup_frame +#define ia32_setup_rt_frame __setup_rt_frame static int setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, @@ -503,9 +505,9 @@ setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, /* Set up the stack frame */ if (is_ia32) { if (ka->sa.sa_flags & SA_SIGINFO) - ret = __setup_rt_frame(usig, ka, info, set, regs); + ret = ia32_setup_rt_frame(usig, ka, info, set, regs); else - ret = __setup_frame(usig, ka, set, regs); + ret = ia32_setup_frame(usig, ka, set, regs); } else ret = __setup_rt_frame(sig, ka, info, set, regs); -- cgit v1.2.2 From 9f6ac57729724b58df81ca5dc005326759a806fe Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 24 Sep 2008 20:48:35 +0900 Subject: x86: export pci-nommu's alloc_coherent This patch exports nommu_alloc_coherent (renamed dma_generic_alloc_coherent). GART needs this function. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-dma.c | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/pci-nommu.c | 39 +-------------------------------------- 2 files changed, 32 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a1408abcc62..4e612d20170a 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -134,6 +134,37 @@ unsigned long iommu_num_pages(unsigned long addr, unsigned long len) EXPORT_SYMBOL(iommu_num_pages); #endif +void *dma_generic_alloc_coherent(struct device *dev, size_t size, + dma_addr_t *dma_addr, gfp_t flag) +{ + unsigned long dma_mask; + struct page *page; + dma_addr_t addr; + + dma_mask = dma_alloc_coherent_mask(dev, flag); + + flag |= __GFP_ZERO; +again: + page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); + if (!page) + return NULL; + + addr = page_to_phys(page); + if (!is_buffer_dma_capable(dma_mask, addr, size)) { + __free_pages(page, get_order(size)); + + if (dma_mask < DMA_32BIT_MASK && !(flag & GFP_DMA)) { + flag = (flag & ~GFP_DMA32) | GFP_DMA; + goto again; + } + + return NULL; + } + + *dma_addr = addr; + return page_address(page); +} + /* * See for the iommu kernel parameter * documentation. diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c index 1c1c98a31d57..c70ab5a5d4c8 100644 --- a/arch/x86/kernel/pci-nommu.c +++ b/arch/x86/kernel/pci-nommu.c @@ -72,43 +72,6 @@ static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, return nents; } -static void * -nommu_alloc_coherent(struct device *hwdev, size_t size, - dma_addr_t *dma_addr, gfp_t gfp) -{ - unsigned long dma_mask; - int node; - struct page *page; - dma_addr_t addr; - - dma_mask = dma_alloc_coherent_mask(hwdev, gfp); - - gfp |= __GFP_ZERO; - - node = dev_to_node(hwdev); -again: - page = alloc_pages_node(node, gfp, get_order(size)); - if (!page) - return NULL; - - addr = page_to_phys(page); - if (!is_buffer_dma_capable(dma_mask, addr, size) && !(gfp & GFP_DMA)) { - free_pages((unsigned long)page_address(page), get_order(size)); - gfp |= GFP_DMA; - goto again; - } - - if (check_addr("alloc_coherent", hwdev, addr, size)) { - *dma_addr = addr; - flush_write_buffers(); - return page_address(page); - } - - free_pages((unsigned long)page_address(page), get_order(size)); - - return NULL; -} - static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr) { @@ -116,7 +79,7 @@ static void nommu_free_coherent(struct device *dev, size_t size, void *vaddr, } struct dma_mapping_ops nommu_dma_ops = { - .alloc_coherent = nommu_alloc_coherent, + .alloc_coherent = dma_generic_alloc_coherent, .free_coherent = nommu_free_coherent, .map_single = nommu_map_single, .map_sg = nommu_map_sg, -- cgit v1.2.2 From ecef533ea68b2fb3baaf459beb2f802a240bdb16 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 24 Sep 2008 20:48:36 +0900 Subject: revert "x86: make GART to respect device's dma_mask about virtual mappings" This reverts: commit bee44f294efd8417f5e68553778a6cc957af1547 Author: FUJITA Tomonori Date: Fri Sep 12 19:42:35 2008 +0900 x86: make GART to respect device's dma_mask about virtual mappings I wrote the above commit to fix a GART alloc_coherent regression, that can't handle a device having dma_masks > 24bit < 32bits, introduced by the alloc_coherent rewrite: http://lkml.org/lkml/2008/8/12/200 After the alloc_coherent rewrite, GART alloc_coherent tried to allocate pages with GFP_DMA32. If GART got an address that a device can't access to, GART mapped the address to a virtual I/O address. But GART mapping mechanism didn't take account of dma mask, so GART could use a virtual I/O address that the device can't access to again. Alan pointed out: " This is indeed a specific problem found with things like older AACRAID where control blocks must be below 31bits and the GART is above 0x80000000. " The above commit modified GART mapping mechanism to take care of dma mask. But Andi pointed out, "The GART is somewhere in the 4GB range so you cannot use it to map anything < 4GB. Also GART is pretty small." http://lkml.org/lkml/2008/9/12/43 That means it's possible that GART doesn't have virtual I/O address space that a device can access to. The above commit (to modify GART mapping mechanism to take care of dma mask) can't fix the regression reliably so let's avoid making GART more complicated. We need a solution that always works for dma_masks > 24bit < 32bits. That's how GART worked before the alloc_coherent rewrite. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Acked-by: Alan Cox Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 39 +++++++++++---------------------------- 1 file changed, 11 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 9e390f1bd46a..7e08e466b8ad 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -83,34 +83,23 @@ static unsigned long next_bit; /* protected by iommu_bitmap_lock */ static int need_flush; /* global flush state. set for each gart wrap */ static unsigned long alloc_iommu(struct device *dev, int size, - unsigned long align_mask, u64 dma_mask) + unsigned long align_mask) { unsigned long offset, flags; unsigned long boundary_size; unsigned long base_index; - unsigned long limit; base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), PAGE_SIZE) >> PAGE_SHIFT; boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; - limit = iommu_device_max_index(iommu_pages, - DIV_ROUND_UP(iommu_bus_base, PAGE_SIZE), - dma_mask >> PAGE_SHIFT); - spin_lock_irqsave(&iommu_bitmap_lock, flags); - - if (limit <= next_bit) { - need_flush = 1; - next_bit = 0; - } - - offset = iommu_area_alloc(iommu_gart_bitmap, limit, next_bit, + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, size, base_index, boundary_size, align_mask); - if (offset == -1 && next_bit) { + if (offset == -1) { need_flush = 1; - offset = iommu_area_alloc(iommu_gart_bitmap, limit, 0, + offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, size, base_index, boundary_size, align_mask); } @@ -239,14 +228,12 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) * Caller needs to check if the iommu is needed and flush. */ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, - size_t size, int dir, unsigned long align_mask, - u64 dma_mask) + size_t size, int dir, unsigned long align_mask) { unsigned long npages = iommu_num_pages(phys_mem, size); - unsigned long iommu_page; + unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; - iommu_page = alloc_iommu(dev, npages, align_mask, dma_mask); if (iommu_page == -1) { if (!nonforced_iommu(dev, phys_mem, size)) return phys_mem; @@ -276,7 +263,7 @@ gart_map_single(struct device *dev, phys_addr_t paddr, size_t size, int dir) if (!need_iommu(dev, paddr, size)) return paddr; - bus = dma_map_area(dev, paddr, size, dir, 0, dma_get_mask(dev)); + bus = dma_map_area(dev, paddr, size, dir, 0); flush_gart(); return bus; @@ -327,7 +314,6 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, { struct scatterlist *s; int i; - u64 dma_mask = dma_get_mask(dev); #ifdef CONFIG_IOMMU_DEBUG printk(KERN_DEBUG "dma_map_sg overflow\n"); @@ -337,8 +323,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg, unsigned long addr = sg_phys(s); if (nonforced_iommu(dev, addr, s->length)) { - addr = dma_map_area(dev, addr, s->length, dir, 0, - dma_mask); + addr = dma_map_area(dev, addr, s->length, dir, 0); if (addr == bad_dma_address) { if (i > 0) gart_unmap_sg(dev, sg, i, dir); @@ -360,16 +345,14 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, int nelems, struct scatterlist *sout, unsigned long pages) { - unsigned long iommu_start; - unsigned long iommu_page; + unsigned long iommu_start = alloc_iommu(dev, pages, 0); + unsigned long iommu_page = iommu_start; struct scatterlist *s; int i; - iommu_start = alloc_iommu(dev, pages, 0, dma_get_mask(dev)); if (iommu_start == -1) return -1; - iommu_page = iommu_start; for_each_sg(start, s, nelems, i) { unsigned long pages, addr; unsigned long phys_addr = s->dma_address; @@ -522,7 +505,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, align_mask = (1UL << get_order(size)) - 1; *dma_addr = dma_map_area(dev, paddr, size, DMA_BIDIRECTIONAL, - align_mask, dma_mask); + align_mask); flush_gart(); if (*dma_addr != bad_dma_address) -- cgit v1.2.2 From 1d990882153f36723f9e8717c4401689e64c7a36 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 24 Sep 2008 20:48:37 +0900 Subject: x86: restore old GART alloc_coherent behavior Currently, GART alloc_coherent tries to allocate pages with GFP_DMA32 for a device having dma_masks > 24bit < 32bits. If GART gets an address that a device can't access to, GART try to map the address to a virtual I/O address that the device can access to. But Andi pointed out, "The GART is somewhere in the 4GB range so you cannot use it to map anything < 4GB. Also GART is pretty small." http://lkml.org/lkml/2008/9/12/43 That is, it's possible that GART doesn't have virtual I/O address space that a device can access to. The above behavior doesn't work for a device having dma_masks > 24bit < 32bits. This patch restores old GART alloc_coherent behavior (before the alloc_coherent rewrite). Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 7e08e466b8ad..25c94fb96d74 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -487,31 +487,28 @@ static void * gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr, gfp_t flag) { - void *vaddr; dma_addr_t paddr; unsigned long align_mask; - u64 dma_mask = dma_alloc_coherent_mask(dev, flag); - - vaddr = (void *)__get_free_pages(flag | __GFP_ZERO, get_order(size)); - if (!vaddr) - return NULL; - - paddr = virt_to_phys(vaddr); - if (is_buffer_dma_capable(dma_mask, paddr, size)) { - *dma_addr = paddr; - return vaddr; - } - - align_mask = (1UL << get_order(size)) - 1; - - *dma_addr = dma_map_area(dev, paddr, size, DMA_BIDIRECTIONAL, - align_mask); - flush_gart(); - - if (*dma_addr != bad_dma_address) - return vaddr; - - free_pages((unsigned long)vaddr, get_order(size)); + struct page *page; + + if (force_iommu && !(flag & GFP_DMA)) { + flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); + page = alloc_pages(flag | __GFP_ZERO, get_order(size)); + if (!page) + return NULL; + + align_mask = (1UL << get_order(size)) - 1; + paddr = dma_map_area(dev, page_to_phys(page), size, + DMA_BIDIRECTIONAL, align_mask); + + flush_gart(); + if (paddr != bad_dma_address) { + *dma_addr = paddr; + return page_address(page); + } + __free_pages(page, get_order(size)); + } else + return dma_generic_alloc_coherent(dev, size, dma_addr, flag); return NULL; } -- cgit v1.2.2 From 1615965e54eb94d7bcd298d2163739bd79f602d4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 24 Sep 2008 22:41:10 +0900 Subject: x86 gart: remove unnecessary initialization There is no point to have such initialization in struct dma_mapping_ops. Signed-off-by: FUJITA Tomonori Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 25c94fb96d74..b85a2f9bb340 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -715,12 +715,6 @@ extern int agp_amd64_init(void); static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .unmap_single = gart_unmap_single, - .sync_single_for_cpu = NULL, - .sync_single_for_device = NULL, - .sync_single_range_for_cpu = NULL, - .sync_single_range_for_device = NULL, - .sync_sg_for_cpu = NULL, - .sync_sg_for_device = NULL, .map_sg = gart_map_sg, .unmap_sg = gart_unmap_sg, .alloc_coherent = gart_alloc_coherent, -- cgit v1.2.2 From f6476774f1fe32593d3d71903b1e98514efbf685 Mon Sep 17 00:00:00 2001 From: Bill Nottingham Date: Wed, 24 Sep 2008 14:35:17 -0400 Subject: x86_64: be less annoying on boot Remove mostly useless message on every boot. Signed-off-by: Bill Nottingham Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 9bfc4d72fb2e..11aa501c9f4f 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -112,8 +112,6 @@ void __init x86_64_start_kernel(char * real_mode_data) x86_64_init_pda(); - early_printk("Kernel really alive\n"); - x86_64_start_reservations(real_mode_data); } -- cgit v1.2.2 From 379daf6290814e41f14880094b7b773640df2461 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Thu, 25 Sep 2008 18:43:34 -0700 Subject: IO resources, x86: ioremap sanity check to catch mapping requests exceeding the BAR sizes Go through the iomem resource tree to check if any of the ioremap() requests span more than any slot in the iomem resource tree and do a WARN_ON() if we hit this check. This will raise a red-flag, if some driver is mapping more than what is needed. And hopefully identify possible corruptions much earlier. Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae3..c818b45bd07d 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -149,6 +149,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, if (is_ISA_range(phys_addr, last_addr)) return (__force void __iomem *)phys_to_virt(phys_addr); + /* + * Check if the request spans more than any BAR in the iomem resource + * tree. + */ + WARN_ON(iomem_map_sanity_check(phys_addr, size)); + /* * Don't allow anybody to remap normal RAM that we're using.. */ -- cgit v1.2.2 From d7161a65341556bacb5e6654e133803f46f51063 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 26 Sep 2008 10:36:41 -0500 Subject: kgdb, x86, arm, mips, powerpc: ignore user space single stepping On the x86 arch, user space single step exceptions should be ignored if they occur in the kernel space, such as ptrace stepping through a system call. First check if it is kgdb that is executing a single step, then ensure it is not an accidental traversal into the user space, while in kgdb, any other time the TIF_SINGLESTEP is set, kgdb should ignore the exception. On x86, arm, mips and powerpc, the kgdb_contthread usage was inconsistent with the way single stepping is implemented in the kgdb core. The arch specific stub should always set the kgdb_cpu_doing_single_step correctly if it is single stepping. This allows kgdb to correctly process an instruction steps if ptrace happens to be requesting an instruction step over a system call. Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index f47f0eb886b8..00f7896c9a19 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -378,10 +378,8 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, if (remcomInBuffer[0] == 's') { linux_regs->flags |= X86_EFLAGS_TF; kgdb_single_step = 1; - if (kgdb_contthread) { - atomic_set(&kgdb_cpu_doing_single_step, - raw_smp_processor_id()); - } + atomic_set(&kgdb_cpu_doing_single_step, + raw_smp_processor_id()); } get_debugreg(dr6, 6); @@ -466,9 +464,15 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) case DIE_DEBUG: if (atomic_read(&kgdb_cpu_doing_single_step) == - raw_smp_processor_id() && - user_mode(regs)) - return single_step_cont(regs, args); + raw_smp_processor_id()) { + if (user_mode(regs)) + return single_step_cont(regs, args); + break; + } else if (test_thread_flag(TIF_SINGLESTEP)) + /* This means a user thread is single stepping + * a system call which should be ignored + */ + return NOTIFY_DONE; /* fall through */ default: if (user_mode(regs)) -- cgit v1.2.2 From 703a1edcd1534468fc18f733c03bd91a65c8c6f0 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Fri, 26 Sep 2008 10:36:42 -0500 Subject: kgdb, x86_64: fix PS CS SS registers in gdb serial On x86_64 the gdb serial register structure defines the PS (also known as eflags), CS and SS registers as 4 bytes entities. This patch splits the x86_64 regnames enum into a 32 and 64 version to account for the 32 bit entities in the gdb serial packets. Also the program counter is properly filled in for the sleeping threads. Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 00f7896c9a19..8282a2139681 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -69,6 +69,9 @@ static int gdb_x86vector = -1; */ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = regs->ax; gdb_regs[GDB_BX] = regs->bx; gdb_regs[GDB_CX] = regs->cx; @@ -76,9 +79,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_SI] = regs->si; gdb_regs[GDB_DI] = regs->di; gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_PC] = regs->ip; #ifdef CONFIG_X86_32 + gdb_regs[GDB_PS] = regs->flags; gdb_regs[GDB_DS] = regs->ds; gdb_regs[GDB_ES] = regs->es; gdb_regs[GDB_CS] = regs->cs; @@ -94,6 +97,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) gdb_regs[GDB_R13] = regs->r13; gdb_regs[GDB_R14] = regs->r14; gdb_regs[GDB_R15] = regs->r15; + gdb_regs32[GDB_PS] = regs->flags; + gdb_regs32[GDB_CS] = regs->cs; + gdb_regs32[GDB_SS] = regs->ss; #endif gdb_regs[GDB_SP] = regs->sp; } @@ -112,6 +118,9 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) */ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif gdb_regs[GDB_AX] = 0; gdb_regs[GDB_BX] = 0; gdb_regs[GDB_CX] = 0; @@ -129,8 +138,10 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); - gdb_regs[GDB_PC] = 0; + gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_CS] = __KERNEL_CS; + gdb_regs32[GDB_SS] = __KERNEL_DS; + gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -153,6 +164,9 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) */ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) { +#ifndef CONFIG_X86_32 + u32 *gdb_regs32 = (u32 *)gdb_regs; +#endif regs->ax = gdb_regs[GDB_AX]; regs->bx = gdb_regs[GDB_BX]; regs->cx = gdb_regs[GDB_CX]; @@ -160,9 +174,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->si = gdb_regs[GDB_SI]; regs->di = gdb_regs[GDB_DI]; regs->bp = gdb_regs[GDB_BP]; - regs->flags = gdb_regs[GDB_PS]; regs->ip = gdb_regs[GDB_PC]; #ifdef CONFIG_X86_32 + regs->flags = gdb_regs[GDB_PS]; regs->ds = gdb_regs[GDB_DS]; regs->es = gdb_regs[GDB_ES]; regs->cs = gdb_regs[GDB_CS]; @@ -175,6 +189,9 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) regs->r13 = gdb_regs[GDB_R13]; regs->r14 = gdb_regs[GDB_R14]; regs->r15 = gdb_regs[GDB_R15]; + regs->flags = gdb_regs32[GDB_PS]; + regs->cs = gdb_regs32[GDB_CS]; + regs->ss = gdb_regs32[GDB_SS]; #endif } -- cgit v1.2.2 From 7fc2368d1d0dce7a778beb2fba3acac8fa7a34b6 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 27 Sep 2008 00:30:07 -0700 Subject: x86: don't need to go to chunksize to 4G change back chunksize max to 2g otherwise will get strange layout in 2G ram system like 0 - 4g WB, 2040M - 2048M UC, 2048M - 4G NC instead of 0 - 2g WB, 2040M - 2048M UC Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b117d7f8a564..cc135572882a 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1155,10 +1155,10 @@ struct mtrr_cleanup_result { /* * gran_size: 1M, 2M, ..., 2G - * chunk size: gran_size, ..., 4G - * so we need (2+13)*6 + * chunk size: gran_size, ..., 2G + * so we need (1+12)*6 */ -#define NUM_RESULT 90 +#define NUM_RESULT 78 #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; @@ -1276,7 +1276,7 @@ static int __init mtrr_cleanup(unsigned address_bits) memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { - for (chunk_size = gran_size; chunk_size < (1ULL<<33); + for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { int num_reg; -- cgit v1.2.2 From 2313c2793d290a8cc37c428f8622c53f3fe1d6dc Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 27 Sep 2008 00:30:08 -0700 Subject: x86: mtrr_cleanup optimization, v2 fix hpa's t61 with 4g ram: change layout from (n - 1)*chunksize + chunk_size - NC to n*chunksize - NC Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 79 +++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index cc135572882a..93d575ac61a3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -970,6 +970,8 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, /* try to append some small hole */ range0_basek = state->range_startk; range0_sizek = ALIGN(state->range_sizek, chunk_sizek); + + /* no increase */ if (range0_sizek == state->range_sizek) { if (debug_print) printk(KERN_DEBUG "rangeX: %016lx - %016lx\n", @@ -980,13 +982,13 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, return 0; } - range0_sizek -= chunk_sizek; - if (range0_sizek && sizek) { - while (range0_basek + range0_sizek > (basek + sizek)) { - range0_sizek -= chunk_sizek; - if (!range0_sizek) - break; - } + /* only cut back, when it is not the last */ + if (sizek) { + while (range0_basek + range0_sizek > (basek + sizek)) { + range0_sizek -= chunk_sizek; + if (!range0_sizek) + break; + } } if (range0_sizek) { @@ -1000,46 +1002,39 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, } range_basek = range0_basek + range0_sizek; - range_sizek = chunk_sizek; - if (range_basek + range_sizek > basek && - range_basek + range_sizek <= (basek + sizek)) { - /* one hole */ - second_basek = basek; - second_sizek = range_basek + range_sizek - basek; - } + /* one hole in the middle */ + if (range_basek > basek && range_basek <= (basek + sizek)) + second_sizek = range_basek - basek; - /* if last piece, only could one hole near end */ - if ((second_basek || !basek) && - range_sizek - (state->range_sizek - range0_sizek) - second_sizek < - (chunk_sizek >> 1)) { - /* - * one hole in middle (second_sizek is 0) or at end - * (second_sizek is 0 ) - */ - hole_sizek = range_sizek - (state->range_sizek - range0_sizek) - - second_sizek; - hole_basek = range_basek + range_sizek - hole_sizek - - second_sizek; - } else { - /* fallback for big hole, or several holes */ + if (range0_sizek > state->range_sizek) { + unsigned long hole_basek, hole_sizek; + + /* one hole in middle or at end */ + hole_sizek = range0_sizek - state->range_sizek - second_sizek; + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, + MTRR_TYPE_UNCACHABLE); + } + } else { + /* need to handle left over */ range_sizek = state->range_sizek - range0_sizek; - second_basek = 0; - second_sizek = 0; - } - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, range_sizek, + if (range_sizek) { + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); - if (hole_sizek) { - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, hole_sizek, - MTRR_TYPE_UNCACHABLE); - + } } return second_sizek; -- cgit v1.2.2 From 54d45ff4208836e62536fc40b0141586dbf6641f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 27 Sep 2008 00:30:06 -0700 Subject: x86: add mtrr_cleanup_debug command line add mtrr_cleanup_debug to print out more info about layout Signed-off-by: Yinghai Lu Cc: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 93d575ac61a3..aff46b99ff01 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -836,6 +836,13 @@ static int __init enable_mtrr_cleanup_setup(char *str) } early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +static int __init mtrr_cleanup_debug_setup(char *str) +{ + debug_print = 1; + return 0; +} +early_param("mtrr_cleanup_debug", mtrr_cleanup_debug_setup); + struct var_mtrr_state { unsigned long range_startk; unsigned long range_sizek; @@ -1227,7 +1234,7 @@ static int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; - debug_print = 1; + debug_print++; /* convert ranges to var ranges state */ num_reg = x86_setup_var_mtrrs(range, nr_range, mtrr_chunk_size, mtrr_gran_size); @@ -1263,7 +1270,7 @@ static int __init mtrr_cleanup(unsigned address_bits) } printk(KERN_INFO "invalid mtrr_gran_size or mtrr_chunk_size, " "will find optimal one\n"); - debug_print = 0; + debug_print--; memset(result, 0, sizeof(result[0])); } @@ -1366,8 +1373,9 @@ static int __init mtrr_cleanup(unsigned address_bits) chunk_size <<= 10; gran_size = result[i].gran_sizek; gran_size <<= 10; - debug_print = 1; + debug_print++; x86_setup_var_mtrrs(range, nr_range, chunk_size, gran_size); + debug_print--; set_var_mtrr_all(address_bits); return 1; } -- cgit v1.2.2 From 237a62247c2879331986a300d6ab36ad21264c68 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Sep 2008 12:13:53 +0200 Subject: x86/iommu: make GART driver checkpatch clean Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b85a2f9bb340..aa569db73a2a 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -27,8 +27,8 @@ #include #include #include +#include #include -#include #include #include #include @@ -175,7 +175,8 @@ static void dump_leak(void) iommu_leak_pages); for (i = 0; i < iommu_leak_pages; i += 2) { printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], 0); + printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], + 0); printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); } printk(KERN_DEBUG "\n"); @@ -688,7 +689,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info) if (!error) error = sysdev_register(&device_gart); if (error) - panic("Could not register gart_sysdev -- would corrupt data on next suspend"); + panic("Could not register gart_sysdev -- " + "would corrupt data on next suspend"); flush_gart(); @@ -710,8 +712,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) return -1; } -extern int agp_amd64_init(void); - static struct dma_mapping_ops gart_dma_ops = { .map_single = gart_map_single, .unmap_single = gart_unmap_single, @@ -777,8 +777,8 @@ void __init gart_iommu_init(void) (no_agp && init_k8_gatt(&info) < 0)) { if (max_pfn > MAX_DMA32_PFN) { printk(KERN_WARNING "More than 4GB of memory " - "but GART IOMMU not available.\n" - KERN_WARNING "falling back to iommu=soft.\n"); + "but GART IOMMU not available.\n"); + printk(KERN_WARNING "falling back to iommu=soft.\n"); } return; } @@ -868,7 +868,8 @@ void __init gart_parse_options(char *p) if (!strncmp(p, "leak", 4)) { leak_trace = 1; p += 4; - if (*p == '=') ++p; + if (*p == '=') + ++p; if (isdigit(*p) && get_option(&p, &arg)) iommu_leak_pages = arg; } -- cgit v1.2.2 From 3610f2116e961cdcbd3546a3828470f7aa636212 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Sep 2008 12:13:54 +0200 Subject: x86/iommu: convert GART need_flush to bool Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index aa569db73a2a..aecea068f583 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -80,7 +80,7 @@ AGPEXTERN int agp_memory_reserved; AGPEXTERN __u32 *agp_gatt_table; static unsigned long next_bit; /* protected by iommu_bitmap_lock */ -static int need_flush; /* global flush state. set for each gart wrap */ +static bool need_flush; /* global flush state. set for each gart wrap */ static unsigned long alloc_iommu(struct device *dev, int size, unsigned long align_mask) @@ -98,7 +98,7 @@ static unsigned long alloc_iommu(struct device *dev, int size, offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, next_bit, size, base_index, boundary_size, align_mask); if (offset == -1) { - need_flush = 1; + need_flush = true; offset = iommu_area_alloc(iommu_gart_bitmap, iommu_pages, 0, size, base_index, boundary_size, align_mask); @@ -107,11 +107,11 @@ static unsigned long alloc_iommu(struct device *dev, int size, next_bit = offset+size; if (next_bit >= iommu_pages) { next_bit = 0; - need_flush = 1; + need_flush = true; } } if (iommu_fullflush) - need_flush = 1; + need_flush = true; spin_unlock_irqrestore(&iommu_bitmap_lock, flags); return offset; @@ -136,7 +136,7 @@ static void flush_gart(void) spin_lock_irqsave(&iommu_bitmap_lock, flags); if (need_flush) { k8_flush_garts(); - need_flush = 0; + need_flush = false; } spin_unlock_irqrestore(&iommu_bitmap_lock, flags); } -- cgit v1.2.2 From 0114267be1bebc2e9c913b19579900153583d617 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Thu, 25 Sep 2008 12:42:12 +0200 Subject: x86/iommu: use __GFP_ZERO instead of memset for GART Signed-off-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index aecea068f583..d077116fec1b 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -674,13 +674,13 @@ static __init int init_k8_gatt(struct agp_kern_info *info) info->aper_size = aper_size >> 20; gatt_size = (aper_size >> PAGE_SHIFT) * sizeof(u32); - gatt = (void *)__get_free_pages(GFP_KERNEL, get_order(gatt_size)); + gatt = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(gatt_size)); if (!gatt) panic("Cannot allocate GATT table"); if (set_memory_uc((unsigned long)gatt, gatt_size >> PAGE_SHIFT)) panic("Could not set GART PTEs to uncacheable pages"); - memset(gatt, 0, gatt_size); agp_gatt_table = gatt; enable_gart_translations(); @@ -788,19 +788,16 @@ void __init gart_iommu_init(void) iommu_size = check_iommu_size(info.aper_base, aper_size); iommu_pages = iommu_size >> PAGE_SHIFT; - iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL, + iommu_gart_bitmap = (void *) __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(iommu_pages/8)); if (!iommu_gart_bitmap) panic("Cannot allocate iommu bitmap\n"); - memset(iommu_gart_bitmap, 0, iommu_pages/8); #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL, + iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, get_order(iommu_pages*sizeof(void *))); - if (iommu_leak_tab) - memset(iommu_leak_tab, 0, iommu_pages * 8); - else + if (!iommu_leak_tab) printk(KERN_DEBUG "PCI-DMA: Cannot allocate leak trace area\n"); } -- cgit v1.2.2 From 8f0afaa58e912bbe7d5b0bad9fb024337edf363e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 27 Sep 2008 20:26:06 -0700 Subject: x86: mtrr_cleanup hole size should be less than half of chunk_size, v2 v2: should check with half of range0 size instead of chunk_size So don't have silly big hole. in hpa's case we could auto detect instead of adding mtrr_chunk_size in command line. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 74 ++++++++++++++++++++++++----------------- 1 file changed, 43 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index aff46b99ff01..bccf57f5b615 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -992,22 +992,17 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, /* only cut back, when it is not the last */ if (sizek) { while (range0_basek + range0_sizek > (basek + sizek)) { - range0_sizek -= chunk_sizek; + if (range0_sizek >= chunk_sizek) + range0_sizek -= chunk_sizek; + else + range0_sizek = 0; + if (!range0_sizek) break; } } - if (range0_sizek) { - if (debug_print) - printk(KERN_DEBUG "range0: %016lx - %016lx\n", - range0_basek<<10, - (range0_basek + range0_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range0_basek, - range0_sizek, MTRR_TYPE_WRBACK); - - } - +second_try: range_basek = range0_basek + range0_sizek; /* one hole in the middle */ @@ -1015,33 +1010,50 @@ range_to_mtrr_with_hole(struct var_mtrr_state *state, unsigned long basek, second_sizek = range_basek - basek; if (range0_sizek > state->range_sizek) { - unsigned long hole_basek, hole_sizek; /* one hole in middle or at end */ hole_sizek = range0_sizek - state->range_sizek - second_sizek; - if (hole_sizek) { - hole_basek = range_basek - hole_sizek - second_sizek; - if (debug_print) - printk(KERN_DEBUG "hole: %016lx - %016lx\n", - hole_basek<<10, - (hole_basek + hole_sizek)<<10); - state->reg = range_to_mtrr(state->reg, hole_basek, - hole_sizek, - MTRR_TYPE_UNCACHABLE); + + /* hole size should be less than half of range0 size */ + if (hole_sizek > (range0_sizek >> 1) && + range0_sizek >= chunk_sizek) { + range0_sizek -= chunk_sizek; + second_sizek = 0; + hole_sizek = 0; + + goto second_try; } - } else { + } + + if (range0_sizek) { + if (debug_print) + printk(KERN_DEBUG "range0: %016lx - %016lx\n", + range0_basek<<10, + (range0_basek + range0_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range0_basek, + range0_sizek, MTRR_TYPE_WRBACK); + } + + if (range0_sizek < state->range_sizek) { /* need to handle left over */ range_sizek = state->range_sizek - range0_sizek; - if (range_sizek) { - if (debug_print) - printk(KERN_DEBUG "range: %016lx - %016lx\n", - range_basek<<10, - (range_basek + range_sizek)<<10); - state->reg = range_to_mtrr(state->reg, range_basek, - range_sizek, - MTRR_TYPE_WRBACK); - } + if (debug_print) + printk(KERN_DEBUG "range: %016lx - %016lx\n", + range_basek<<10, + (range_basek + range_sizek)<<10); + state->reg = range_to_mtrr(state->reg, range_basek, + range_sizek, MTRR_TYPE_WRBACK); + } + + if (hole_sizek) { + hole_basek = range_basek - hole_sizek - second_sizek; + if (debug_print) + printk(KERN_DEBUG "hole: %016lx - %016lx\n", + hole_basek<<10, + (hole_basek + hole_sizek)<<10); + state->reg = range_to_mtrr(state->reg, hole_basek, + hole_sizek, MTRR_TYPE_UNCACHABLE); } return second_sizek; -- cgit v1.2.2 From 12544697f12e0ecdcf971075415c7678fae502af Mon Sep 17 00:00:00 2001 From: dcg Date: Sun, 28 Sep 2008 18:49:46 +0200 Subject: x86_64: be less annoying on boot, v2 Honour "quiet" boot parameter in early_printk() calls Signed-off-by: Diego Calleja Signed-off-by: Ingo Molnar --- arch/x86/kernel/head64.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 11aa501c9f4f..d16084f90649 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -108,7 +108,8 @@ void __init x86_64_start_kernel(char * real_mode_data) } load_idt((const struct desc_ptr *)&idt_descr); - early_printk("Kernel alive\n"); + if (console_loglevel == 10) + early_printk("Kernel alive\n"); x86_64_init_pda(); -- cgit v1.2.2 From 73436a1d2501575f9c2e6ddb26889145e23cefd8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Sep 2008 13:39:17 -0700 Subject: x86: mtrr_cleanup safe to get more spare regs now Delay exit to make sure we can actually get the optimal result in as many cases as possible. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index bccf57f5b615..ae0ca97e20bd 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1353,10 +1353,8 @@ static int __init mtrr_cleanup(unsigned address_bits) nr_mtrr_spare_reg = num_var_ranges - 1; num_reg_good = -1; for (i = num_var_ranges - nr_mtrr_spare_reg; i > 0; i--) { - if (!min_loss_pfn[i]) { + if (!min_loss_pfn[i]) num_reg_good = i; - break; - } } index_good = -1; -- cgit v1.2.2 From dd7e52224fd7438d701b4bb9834a9ddc06828210 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Sep 2008 18:54:11 -0700 Subject: x86: mtrr_cleanup prepare to make gran_size to less 1M make the print out right with size < 1M Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 103 ++++++++++++++++++++++++++++++---------- 1 file changed, 79 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index ae0ca97e20bd..b1bd1038c913 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -905,6 +905,27 @@ set_var_mtrr_all(unsigned int address_bits) } } +static unsigned long to_size_factor(unsigned long sizek, char *factorp) +{ + char factor; + unsigned long base = sizek; + + if (base & ((1<<10) - 1)) { + /* not MB alignment */ + factor = 'K'; + } else if (base & ((1<<20) - 1)){ + factor = 'M'; + base >>= 10; + } else { + factor = 'G'; + base >>= 20; + } + + *factorp = factor; + + return base; +} + static unsigned int __init range_to_mtrr(unsigned int reg, unsigned long range_startk, unsigned long range_sizek, unsigned char type) @@ -926,13 +947,21 @@ range_to_mtrr(unsigned int reg, unsigned long range_startk, align = max_align; sizek = 1 << align; - if (debug_print) + if (debug_print) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + start_base = to_size_factor(range_startk, &start_factor), + size_base = to_size_factor(sizek, &size_factor), + printk(KERN_DEBUG "Setting variable MTRR %d, " - "base: %ldMB, range: %ldMB, type %s\n", - reg, range_startk >> 10, sizek >> 10, + "base: %ld%cB, range: %ld%cB, type %s\n", + reg, start_base, start_factor, + size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE)?"UC": ((type == MTRR_TYPE_WRBACK)?"WB":"Other") ); + } save_var_mtrr(reg++, range_startk, sizek, type); range_startk += sizek; range_sizek -= sizek; @@ -1245,6 +1274,8 @@ static int __init mtrr_cleanup(unsigned address_bits) if (mtrr_chunk_size && mtrr_gran_size) { int num_reg; + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; debug_print++; /* convert ranges to var ranges state */ @@ -1270,12 +1301,15 @@ static int __init mtrr_cleanup(unsigned address_bits) result[i].lose_cover_sizek = (range_sums - range_sums_new) << PSHIFT; - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD*":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ldM \n", + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", result[i].num_reg, result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + lose_base, lose_factor); if (!result[i].bad) { set_var_mtrr_all(address_bits); return 1; @@ -1290,14 +1324,25 @@ static int __init mtrr_cleanup(unsigned address_bits) memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { + char gran_factor; + unsigned long gran_base; + + if (debug_print) + gran_base = to_size_factor(gran_size >> 10, &gran_factor); + for (chunk_size = gran_size; chunk_size < (1ULL<<32); chunk_size <<= 1) { int num_reg; - if (debug_print) - printk(KERN_INFO - "\ngran_size: %lldM chunk_size_size: %lldM\n", - gran_size >> 20, chunk_size >> 20); + if (debug_print) { + char chunk_factor; + unsigned long chunk_base; + + chunk_base = to_size_factor(chunk_size>>10, &chunk_factor), + printk(KERN_INFO "\n"); + printk(KERN_INFO "gran_size: %ld%c chunk_size: %ld%c \n", + gran_base, gran_factor, chunk_base, chunk_factor); + } if (i >= NUM_RESULT) continue; @@ -1340,12 +1385,18 @@ static int __init mtrr_cleanup(unsigned address_bits) /* print out all */ for (i = 0; i < NUM_RESULT; i++) { - printk(KERN_INFO "%sgran_size: %ldM \tchunk_size: %ldM \t", - result[i].bad?"*BAD* ":" ", result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %s%ldM\n", - result[i].num_reg, result[i].bad?"-":"", - result[i].lose_cover_sizek >> 10); + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "%sgran_size: %ld%c \tchunk_size: %ld%c \t", + result[i].bad?"*BAD*":" ", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose cover RAM: %s%ld%c\n", + result[i].num_reg, result[i].bad?"-":"", + lose_base, lose_factor); } /* try to find the optimal index */ @@ -1370,14 +1421,18 @@ static int __init mtrr_cleanup(unsigned address_bits) } if (index_good != -1) { + char gran_factor, chunk_factor, lose_factor; + unsigned long gran_base, chunk_base, lose_base; + printk(KERN_INFO "Found optimal setting for mtrr clean up\n"); i = index_good; - printk(KERN_INFO "gran_size: %ldM \tchunk_size: %ldM \t", - result[i].gran_sizek >> 10, - result[i].chunk_sizek >> 10); - printk(KERN_CONT "num_reg: %d \tlose RAM: %ldM\n", - result[i].num_reg, - result[i].lose_cover_sizek >> 10); + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + printk(KERN_INFO "gran_size: %ld%c \tchunk_size: %ld%c \t", + gran_base, gran_factor, chunk_base, chunk_factor); + printk(KERN_CONT "num_reg: %d \tlose RAM: %ld%c\n", + result[i].num_reg, lose_base, lose_factor); /* convert ranges to var ranges state */ chunk_size = result[i].chunk_sizek; chunk_size <<= 10; -- cgit v1.2.2 From 4624065731751a3ace88e5824d8e5654e2d7abd3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 29 Sep 2008 18:54:12 -0700 Subject: x86: mtrr_cleanup try gran_size to less than 1M one have gran < 1M reg00: base=0xd8000000 (3456MB), size= 128MB: uncachable, count=1 reg01: base=0xe0000000 (3584MB), size= 512MB: uncachable, count=1 reg02: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 reg03: base=0x100000000 (4096MB), size= 512MB: write-back, count=1 reg04: base=0x120000000 (4608MB), size= 128MB: write-back, count=1 reg05: base=0xd7f80000 (3455MB), size= 512KB: uncachable, count=1 will get Found optimal setting for mtrr clean up gran_size: 512K chunk_size: 2M num_reg: 7 lose RAM: 0G range0: 0000000000000000 - 00000000d8000000 Setting variable MTRR 0, base: 0GB, range: 2GB, type WB Setting variable MTRR 1, base: 2GB, range: 1GB, type WB Setting variable MTRR 2, base: 3GB, range: 256MB, type WB Setting variable MTRR 3, base: 3328MB, range: 128MB, type WB hole: 00000000d7f00000 - 00000000d7f80000 Setting variable MTRR 4, base: 3455MB, range: 512KB, type UC rangeX: 0000000100000000 - 0000000128000000 Setting variable MTRR 5, base: 4GB, range: 512MB, type WB Setting variable MTRR 6, base: 4608MB, range: 128MB, type WB so start from 64k instead of 1M Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b1bd1038c913..8f1a342b16b7 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1197,11 +1197,11 @@ struct mtrr_cleanup_result { }; /* - * gran_size: 1M, 2M, ..., 2G + * gran_size: 64K, 128K, 256K, 512K, 1M, 2M, ..., 2G * chunk size: gran_size, ..., 2G - * so we need (1+12)*6 + * so we need (1+16)*8 */ -#define NUM_RESULT 78 +#define NUM_RESULT 136 #define PSHIFT (PAGE_SHIFT - 10) static struct mtrr_cleanup_result __initdata result[NUM_RESULT]; @@ -1323,7 +1323,7 @@ static int __init mtrr_cleanup(unsigned address_bits) i = 0; memset(min_loss_pfn, 0xff, sizeof(min_loss_pfn)); memset(result, 0, sizeof(result)); - for (gran_size = (1ULL<<20); gran_size < (1ULL<<32); gran_size <<= 1) { + for (gran_size = (1ULL<<16); gran_size < (1ULL<<32); gran_size <<= 1) { char gran_factor; unsigned long gran_base; -- cgit v1.2.2 From a03352d2c1dcb00970801fb8b800a39acd3103d9 Mon Sep 17 00:00:00 2001 From: Bruce Allan Date: Mon, 29 Sep 2008 20:19:22 -0700 Subject: x86: export set_memory_ro and set_memory_rw Export set_memory_ro() and set_memory_rw() calls for use by drivers that need to have more debug information about who might be writing to memory space. this was initially developed for use while debugging a memory corruption problem with e1000e. Signed-off-by: Bruce Allan Signed-off-by: Jesse Brandeburg Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 43e2f8483e4f..62c1eefcce05 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -906,11 +906,13 @@ int set_memory_ro(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_RW)); } +EXPORT_SYMBOL_GPL(set_memory_ro); int set_memory_rw(unsigned long addr, int numpages) { return change_page_attr_set(addr, numpages, __pgprot(_PAGE_RW)); } +EXPORT_SYMBOL_GPL(set_memory_rw); int set_memory_np(unsigned long addr, int numpages) { -- cgit v1.2.2 From 08115ab4d98cb577a83971ebd57cdfbcc6f50b68 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:24:23 -0400 Subject: xen: make CONFIG_XEN_SAVE_RESTORE depend on CONFIG_XEN Xen options need to depend on XEN. Also, add newline at end of file. Without this patch you need to disable CONFIG_PM in order to disable CPU hotplugging. Signed-off-by: Chuck Ebbert Acked-by Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig index d3e68465ace9..87b9ab166423 100644 --- a/arch/x86/xen/Kconfig +++ b/arch/x86/xen/Kconfig @@ -26,7 +26,7 @@ config XEN_MAX_DOMAIN_MEMORY config XEN_SAVE_RESTORE bool - depends on PM + depends on XEN && PM default y config XEN_DEBUG_FS @@ -35,4 +35,4 @@ config XEN_DEBUG_FS default n help Enable statistics output and various tuning options in debugfs. - Enabling this option may incur a significant performance overhead. \ No newline at end of file + Enabling this option may incur a significant performance overhead. -- cgit v1.2.2 From 3dcd7e269d2223126f6ee9bc893f5a6166e1770d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2EA=2E=20Magall=C3=B3n?= Date: Tue, 30 Sep 2008 10:02:52 +0200 Subject: x86: fix typo in enable_mtrr_cleanup early parameter Correct typo for 'enable_mtrr_cleanup' early boot param name. Signed-off-by: J.A. Magallon Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b117d7f8a564..885c8265e6b5 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -834,7 +834,7 @@ static int __init enable_mtrr_cleanup_setup(char *str) enable_mtrr_cleanup = 1; return 0; } -early_param("enble_mtrr_cleanup", enable_mtrr_cleanup_setup); +early_param("enable_mtrr_cleanup", enable_mtrr_cleanup_setup); struct var_mtrr_state { unsigned long range_startk; -- cgit v1.2.2 From 9b1568458a3ef006361710dc12848aec891883b5 Mon Sep 17 00:00:00 2001 From: Adam Jackson Date: Mon, 29 Sep 2008 14:52:03 -0400 Subject: x86, debug printouts: IOMMU setup failures should not be KERN_ERR The number of BIOSes that have an option to enable the IOMMU, or fix anything about its configuration, is vanishingly small. There's no good reason to punish quiet boot for this. Signed-off-by: Adam Jackson Signed-off-by: Ingo Molnar --- arch/x86/kernel/aperture_64.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 44e21826db11..9a32b37ee2ee 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -455,11 +455,11 @@ out: force_iommu || valid_agp || fallback_aper_force) { - printk(KERN_ERR + printk(KERN_INFO "Your BIOS doesn't leave a aperture memory hole\n"); - printk(KERN_ERR + printk(KERN_INFO "Please enable the IOMMU option in the BIOS setup\n"); - printk(KERN_ERR + printk(KERN_INFO "This costs you %d MB of RAM\n", 32 << fallback_aper_order); -- cgit v1.2.2 From de59985e3a623d4d5d6207f1777398ca0606ab1c Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 30 Sep 2008 11:02:12 -0700 Subject: x86: Fix broken LDT access in VMI After investigating a JRE failure, I found this bug was introduced a long time ago, and had already managed to survive another bugfix which occurred on the same line. The result is a total failure of the JRE due to LDT selectors not working properly. This one took a long time to rear up because LDT usage is not very common, but the bug is quite serious. It got introduced along with another bug, already fixed, by 75b8bb3e56ca09a467fbbe5229bc68627f7445be Signed-off-by: Zachary Amsden Cc: Ingo Molnar Cc: Glauber de Oliveira Costa Cc: Signed-off-by: Linus Torvalds --- arch/x86/kernel/vmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 6ca515d6db54..edfb09f30479 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); + vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, -- cgit v1.2.2 From dc63b52673d71f9d49b9d72d263a9f32df18c3ee Mon Sep 17 00:00:00 2001 From: Zachary Amsden Date: Tue, 30 Sep 2008 11:02:12 -0700 Subject: x86, vmi: fix broken LDT access This one took a long time to rear up because LDT usage is not very common, but the bug is quite serious. It got introduced along with another bug, already fixed, by 75b8bb3e56ca09a467fbbe5229bc68627f7445be After investigating a JRE failure, I found this bug was introduced a long time ago, and had already managed to survive another bugfix which occurred on the same line. The result is a total failure of the JRE due to LDT selectors not working properly. Signed-off-by: Zachary Amsden Cc: Glauber de Oliveira Costa Cc: stable@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmi_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 6ca515d6db54..edfb09f30479 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -235,7 +235,7 @@ static void vmi_write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { u32 *ldt_entry = (u32 *)desc; - vmi_ops.write_idt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); + vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]); } static void vmi_load_sp0(struct tss_struct *tss, -- cgit v1.2.2 From bff0aa4b8f8a1c8492e99c522d044bb98a6ec098 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 30 Sep 2008 20:28:20 -0700 Subject: x86: ia32_signal.c: remove unnecessary cast to u32 __put_user() looks type of the 2nd parameter, so casting the 1st parameter is not necessary. text data bss dec hex filename 6227 0 8 6235 185b ia32_signal.o.new 6227 0 8 6235 185b ia32_signal.o.old Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index e47bed2440ee..690a480c68c5 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -351,21 +351,21 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, savesegment(es, tmp); err |= __put_user(tmp, (unsigned int __user *)&sc->es); - err |= __put_user((u32)regs->di, &sc->di); - err |= __put_user((u32)regs->si, &sc->si); - err |= __put_user((u32)regs->bp, &sc->bp); - err |= __put_user((u32)regs->sp, &sc->sp); - err |= __put_user((u32)regs->bx, &sc->bx); - err |= __put_user((u32)regs->dx, &sc->dx); - err |= __put_user((u32)regs->cx, &sc->cx); - err |= __put_user((u32)regs->ax, &sc->ax); - err |= __put_user((u32)regs->cs, &sc->cs); - err |= __put_user((u32)regs->ss, &sc->ss); + err |= __put_user(regs->di, &sc->di); + err |= __put_user(regs->si, &sc->si); + err |= __put_user(regs->bp, &sc->bp); + err |= __put_user(regs->sp, &sc->sp); + err |= __put_user(regs->bx, &sc->bx); + err |= __put_user(regs->dx, &sc->dx); + err |= __put_user(regs->cx, &sc->cx); + err |= __put_user(regs->ax, &sc->ax); + err |= __put_user(regs->cs, &sc->cs); + err |= __put_user(regs->ss, &sc->ss); err |= __put_user(current->thread.trap_no, &sc->trapno); err |= __put_user(current->thread.error_code, &sc->err); - err |= __put_user((u32)regs->ip, &sc->ip); - err |= __put_user((u32)regs->flags, &sc->flags); - err |= __put_user((u32)regs->sp, &sc->sp_at_signal); + err |= __put_user(regs->ip, &sc->ip); + err |= __put_user(regs->flags, &sc->flags); + err |= __put_user(regs->sp, &sc->sp_at_signal); tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) -- cgit v1.2.2 From 7b9cee16ffb495558c1e3ada55cba906e520006e Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Tue, 30 Sep 2008 20:30:51 -0700 Subject: x86: ia32_signal.c remove unnecessary function calls the below 2 functions are called in save_i387_xstate_ia32() - clear_used_math(); - stts(); Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 690a480c68c5..4bc02b23674b 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -370,12 +370,9 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc, tmp = save_i387_xstate_ia32(fpstate); if (tmp < 0) err = -EFAULT; - else { - clear_used_math(); - stts(); + else err |= __put_user(ptr_to_compat(tmp ? fpstate : NULL), &sc->fpstate); - } /* non-iBCS2 extensions.. */ err |= __put_user(mask, &sc->oldmask); -- cgit v1.2.2 From 2ffb3501f6f356ff80e7149214bc64d3fa9021c4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 30 Sep 2008 16:29:40 -0700 Subject: x86: change MTRR_SANITIZER to def_bool y This option has been added in v2.6.26 as a default-disabled feature and went through several revisions since then. The feature fixes a wide range of MTRR setup problems that BIOSes leave us with: slow system, slow Xorg, slow system when adding lots of RAM, etc., so we want to enable it by default for v2.6.28. See: [Bug 10508] Upgrade to 4GB of RAM messes up MTRRs http://bugzilla.kernel.org/show_bug.cgi?id=10508 and the test results in: http://lkml.org/lkml/2008/9/29/273 1. hpa reg00: base=0xc0000000 (3072MB), size=1024MB: uncachable, count=1 reg01: base=0x13c000000 (5056MB), size= 64MB: uncachable, count=1 reg02: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 reg03: base=0x100000000 (4096MB), size=1024MB: write-back, count=1 reg04: base=0xbf700000 (3063MB), size= 1MB: uncachable, count=1 reg05: base=0xbf800000 (3064MB), size= 8MB: uncachable, count=1 will get Found optimal setting for mtrr clean up gran_size: 1M chunk_size: 128M num_reg: 6 lose RAM: 0M range0: 0000000000000000 - 00000000c0000000 Setting variable MTRR 0, base: 0MB, range: 2048MB, type WB Setting variable MTRR 1, base: 2048MB, range: 1024MB, type WB hole: 00000000bf700000 - 00000000c0000000 Setting variable MTRR 2, base: 3063MB, range: 1MB, type UC Setting variable MTRR 3, base: 3064MB, range: 8MB, type UC range0: 0000000100000000 - 0000000140000000 Setting variable MTRR 4, base: 4096MB, range: 1024MB, type WB hole: 000000013c000000 - 0000000140000000 Setting variable MTRR 5, base: 5056MB, range: 64MB, type UC 2. Dylan Taft reg00: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 reg01: base=0x100000000 (4096MB), size= 512MB: write-back, count=1 reg02: base=0x120000000 (4608MB), size= 256MB: write-back, count=1 reg03: base=0xd0000000 (3328MB), size= 256MB: uncachable, count=1 reg04: base=0xe0000000 (3584MB), size= 512MB: uncachable, count=1 reg05: base=0xc7e00000 (3198MB), size= 2MB: uncachable, count=1 reg06: base=0xc8000000 (3200MB), size= 128MB: uncachable, count=1 will get Found optimal setting for mtrr clean up gran_size: 1M chunk_size: 4M num_reg: 6 lose RAM: 0M range0: 0000000000000000 - 00000000c8000000 Setting variable MTRR 0, base: 0MB, range: 2048MB, type WB Setting variable MTRR 1, base: 2048MB, range: 1024MB, type WB Setting variable MTRR 2, base: 3072MB, range: 128MB, type WB hole: 00000000c7e00000 - 00000000c8000000 Setting variable MTRR 3, base: 3198MB, range: 2MB, type UC rangeX: 0000000100000000 - 0000000130000000 Setting variable MTRR 4, base: 4096MB, range: 512MB, type WB Setting variable MTRR 5, base: 4608MB, range: 256MB, type WB 3. Gabriel reg00: base=0xd0000000 (3328MB), size= 256MB: uncachable, count=1 reg01: base=0xe0000000 (3584MB), size= 512MB: uncachable, count=1 reg02: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 reg03: base=0x100000000 (4096MB), size= 512MB: write-back, count=1 reg04: base=0x120000000 (4608MB), size= 128MB: write-back, count=1 reg05: base=0x128000000 (4736MB), size= 64MB: write-back, count=1 reg06: base=0xcf600000 (3318MB), size= 2MB: uncachable, count=1 will get Found optimal setting for mtrr clean up gran_size: 1M chunk_size: 16M num_reg: 7 lose RAM: 0M range0: 0000000000000000 - 00000000d0000000 Setting variable MTRR 0, base: 0MB, range: 2048MB, type WB Setting variable MTRR 1, base: 2048MB, range: 1024MB, type WB Setting variable MTRR 2, base: 3072MB, range: 256MB, type WB hole: 00000000cf600000 - 00000000cf800000 Setting variable MTRR 3, base: 3318MB, range: 2MB, type UC rangeX: 0000000100000000 - 000000012c000000 Setting variable MTRR 4, base: 4096MB, range: 512MB, type WB Setting variable MTRR 5, base: 4608MB, range: 128MB, type WB Setting variable MTRR 6, base: 4736MB, range: 64MB, type WB 4. Mika Fischer reg00: base=0xc0000000 (3072MB), size=1024MB: uncachable, count=1 reg01: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 reg02: base=0x100000000 (4096MB), size=1024MB: write-back, count=1 reg03: base=0xbf700000 (3063MB), size= 1MB: uncachable, count=1 reg04: base=0xbf800000 (3064MB), size= 8MB: uncachable, count=1 will get Found optimal setting for mtrr clean up gran_size: 1M chunk_size: 16M num_reg: 5 lose RAM: 0M range0: 0000000000000000 - 00000000c0000000 Setting variable MTRR 0, base: 0MB, range: 2048MB, type WB Setting variable MTRR 1, base: 2048MB, range: 1024MB, type WB hole: 00000000bf700000 - 00000000c0000000 Setting variable MTRR 2, base: 3063MB, range: 1MB, type UC Setting variable MTRR 3, base: 3064MB, range: 8MB, type UC rangeX: 0000000100000000 - 0000000140000000 Setting variable MTRR 4, base: 4096MB, range: 1024MB, type WB Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..09f6b7fa29ac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1120,7 +1120,7 @@ config MTRR See for more information. config MTRR_SANITIZER - bool + def_bool y prompt "MTRR cleanup support" depends on MTRR help @@ -1131,7 +1131,7 @@ config MTRR_SANITIZER The largest mtrr entry size for a continous block can be set with mtrr_chunk_size. - If unsure, say N. + If unsure, say Y. config MTRR_SANITIZER_ENABLE_DEFAULT int "MTRR cleanup enable value (0-1)" -- cgit v1.2.2 From 40becd8d5af03ee7935e79c3fccd0d1f380d95b4 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 29 Sep 2008 00:06:36 +0900 Subject: AMD IOMMU: use iommu_device_max_index AMD IOMMU can use iommu_device_max_index() instead of the homegrown function. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index c19212191c98..3b346c6f5514 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -470,10 +470,6 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, * efficient allocator. * ****************************************************************************/ -static unsigned long dma_mask_to_pages(unsigned long mask) -{ - return PAGE_ALIGN(mask) >> PAGE_SHIFT; -} /* * The address allocator core function. @@ -486,14 +482,14 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev, unsigned long align_mask, u64 dma_mask) { - unsigned long limit = dma_mask_to_pages(dma_mask); + unsigned long limit; unsigned long address; - unsigned long size = dom->aperture_size >> PAGE_SHIFT; unsigned long boundary_size; boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1, PAGE_SIZE) >> PAGE_SHIFT; - limit = limit < size ? limit : size; + limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0, + dma_mask >> PAGE_SHIFT); if (dom->next_bit >= limit) { dom->next_bit = 0; -- cgit v1.2.2 From fd1452ebf257317f24e0e285a17a2ec2ce3e6df7 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Thu, 2 Oct 2008 16:56:19 +0300 Subject: x86/microcode: fix sleeping function called from invalid context at kernel/mutex.c Fix the following "sleeping function called from invalid context" bug: ... __might_sleep mutex_lock_nested microcode_update_cpu mc_sysdev_resume __sysdev_resume sysdev_resume device_power_up ... Signed-off-by: Dmitry Adamushko Signed-off-by: Ingo Molnar --- arch/x86/kernel/microcode_core.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 8db2eb55e9e2..936d8d55f230 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -324,10 +324,6 @@ void microcode_update_cpu(int cpu) struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; - /* We should bind the task to the CPU */ - BUG_ON(raw_smp_processor_id() != cpu); - - mutex_lock(µcode_mutex); /* * Check if the system resume is in progress (uci->valid != NULL), * otherwise just request a firmware: @@ -340,11 +336,8 @@ void microcode_update_cpu(int cpu) err = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); } - if (!err) microcode_ops->apply_microcode(cpu); - - mutex_unlock(µcode_mutex); } static void microcode_init_cpu(int cpu) @@ -352,7 +345,13 @@ static void microcode_init_cpu(int cpu) cpumask_t old = current->cpus_allowed; set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); + /* We should bind the task to the CPU */ + BUG_ON(raw_smp_processor_id() != cpu); + + mutex_lock(µcode_mutex); microcode_update_cpu(cpu); + mutex_unlock(µcode_mutex); + set_cpus_allowed_ptr(current, &old); } -- cgit v1.2.2 From 136c82c6f7f12c9bed8bf709f92123077966512c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=2EA=2E=20Magall=C3=B3n?= Date: Fri, 3 Oct 2008 00:32:37 +0200 Subject: x86: mtrr_cleanup try gran_size to less than 1M, cleanup Patch below cleans up formatting, with space for big bases and sizes (64 Gb). Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/if.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c index 84c480bb3715..4c4214690dd1 100644 --- a/arch/x86/kernel/cpu/mtrr/if.c +++ b/arch/x86/kernel/cpu/mtrr/if.c @@ -405,9 +405,9 @@ static int mtrr_seq_show(struct seq_file *seq, void *offset) } /* RED-PEN: base can be > 32bit */ len += seq_printf(seq, - "reg%02i: base=0x%05lx000 (%4luMB), size=%4lu%cB: %s, count=%d\n", + "reg%02i: base=0x%06lx000 (%5luMB), size=%5lu%cB, count=%d: %s\n", i, base, base >> (20 - PAGE_SHIFT), size, factor, - mtrr_attrib_to_str(type), mtrr_usage_table[i]); + mtrr_usage_table[i], mtrr_attrib_to_str(type)); } } return 0; -- cgit v1.2.2 From 834836ee6a3a9deadff9394b23db965a08bcde35 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 2 Oct 2008 15:46:20 -0700 Subject: x86: mtrr_cleanup try gran_size to less than 1M, v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit J.A. Magallón reported: >> Also, on a 64 bit box with 4Gb, it gives this: >> >> cicely:~# cat /proc/mtrr >> reg00: base=0x00000000 ( 0MB), size=4096MB: write-back, count=1 >> reg01: base=0x100000000 (4096MB), size=1024MB: write-back, count=1 >> reg02: base=0x140000000 (5120MB), size= 512MB: write-back, count=1 >> reg03: base=0x160000000 (5632MB), size= 256MB: write-back, count=1 >> reg04: base=0x80000000 (2048MB), size=2048MB: uncachable, count=1 boundary handling has a problem ... fix it. Reported-by: J.A. Magallón Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 8f1a342b16b7..b4a3f0c1a5e3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1044,7 +1044,7 @@ second_try: hole_sizek = range0_sizek - state->range_sizek - second_sizek; /* hole size should be less than half of range0 size */ - if (hole_sizek > (range0_sizek >> 1) && + if (hole_sizek >= (range0_sizek >> 1) && range0_sizek >= chunk_sizek) { range0_sizek -= chunk_sizek; second_sizek = 0; -- cgit v1.2.2 From 8bb39311bf461243f6cade3644764d848516dd23 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 2 Oct 2008 23:26:59 -0700 Subject: x86, debug: mtrr_cleanup print out var mtrr before change it Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/main.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index b4a3f0c1a5e3..406074c46f1e 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1211,13 +1211,14 @@ static unsigned long __initdata min_loss_pfn[RANGE_NUM]; static int __init mtrr_cleanup(unsigned address_bits) { unsigned long extra_remove_base, extra_remove_size; - unsigned long i, base, size, def, dummy; + unsigned long base, size, def, dummy; mtrr_type type; int nr_range, nr_range_new; u64 chunk_size, gran_size; unsigned long range_sums, range_sums_new; int index_good; int num_reg_good; + int i; /* extra one for all 0 */ int num[MTRR_NUM_TYPES + 1]; @@ -1259,6 +1260,28 @@ static int __init mtrr_cleanup(unsigned address_bits) num_var_ranges - num[MTRR_NUM_TYPES]) return 0; + /* print original var MTRRs at first, for debugging: */ + printk(KERN_DEBUG "original variable MTRRs\n"); + for (i = 0; i < num_var_ranges; i++) { + char start_factor = 'K', size_factor = 'K'; + unsigned long start_base, size_base; + + size_base = range_state[i].size_pfn << (PAGE_SHIFT - 10); + if (!size_base) + continue; + + size_base = to_size_factor(size_base, &size_factor), + start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); + start_base = to_size_factor(start_base, &start_factor), + + printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", + i, start_base, start_factor, + size_base, size_factor, + (type == MTRR_TYPE_UNCACHABLE) ? "UC" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ); + } + memset(range, 0, sizeof(range)); extra_remove_size = 0; if (mtrr_tom2) { -- cgit v1.2.2 From db053b86f4b1ec790da2dafe2acb93be76288bb9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 2 Oct 2008 16:41:31 -0700 Subject: xen: clean up x86-64 warnings There are a couple of Xen features which rely on directly accessing per-cpu data via a segment register, which is not yet available on x86-64. In the meantime, just disable direct access to the vcpu info structure; this leaves some of the code as dead, but it will come to life in time, and the warnings are suppressed. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 61 ++++++++--------------------------------------- arch/x86/xen/xen-asm_64.S | 20 +++++++++++++--- 2 files changed, 27 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 8ca2f88bde1e..85692c9f6496 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -112,7 +112,14 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info; * * 0: not available, 1: available */ -static int have_vcpu_info_placement = 1; +static int have_vcpu_info_placement = +#ifdef CONFIG_X86_32 + 1 +#else + 0 +#endif + ; + static void xen_vcpu_setup(int cpu) { @@ -941,6 +948,7 @@ static void *xen_kmap_atomic_pte(struct page *page, enum km_type type) } #endif +#ifdef CONFIG_X86_32 static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) { /* If there's an existing pte, then don't allow _PAGE_RW to be set */ @@ -959,6 +967,7 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) xen_set_pte(ptep, pte); } +#endif static __init void xen_pagetable_setup_start(pgd_t *base) { @@ -1025,7 +1034,6 @@ void xen_setup_vcpu_info_placement(void) /* xen_vcpu_setup managed to place the vcpu_info within the percpu area for all cpus, so make use of it */ -#ifdef CONFIG_X86_32 if (have_vcpu_info_placement) { printk(KERN_INFO "Xen: using vcpu_info placement\n"); @@ -1035,7 +1043,6 @@ void xen_setup_vcpu_info_placement(void) pv_irq_ops.irq_enable = xen_irq_enable_direct; pv_mmu_ops.read_cr2 = xen_read_cr2_direct; } -#endif } static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, @@ -1056,12 +1063,10 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf, goto patch_site switch (type) { -#ifdef CONFIG_X86_32 SITE(pv_irq_ops, irq_enable); SITE(pv_irq_ops, irq_disable); SITE(pv_irq_ops, save_fl); SITE(pv_irq_ops, restore_fl); -#endif /* CONFIG_X86_32 */ #undef SITE patch_site: @@ -1399,48 +1404,11 @@ static void *m2v(phys_addr_t maddr) return __ka(m2p(maddr)); } -#ifdef CONFIG_X86_64 -static void walk(pgd_t *pgd, unsigned long addr) -{ - unsigned l4idx = pgd_index(addr); - unsigned l3idx = pud_index(addr); - unsigned l2idx = pmd_index(addr); - unsigned l1idx = pte_index(addr); - pgd_t l4; - pud_t l3; - pmd_t l2; - pte_t l1; - - xen_raw_printk("walk %p, %lx -> %d %d %d %d\n", - pgd, addr, l4idx, l3idx, l2idx, l1idx); - - l4 = pgd[l4idx]; - xen_raw_printk(" l4: %016lx\n", l4.pgd); - xen_raw_printk(" %016lx\n", pgd_val(l4)); - - l3 = ((pud_t *)(m2v(l4.pgd)))[l3idx]; - xen_raw_printk(" l3: %016lx\n", l3.pud); - xen_raw_printk(" %016lx\n", pud_val(l3)); - - l2 = ((pmd_t *)(m2v(l3.pud)))[l2idx]; - xen_raw_printk(" l2: %016lx\n", l2.pmd); - xen_raw_printk(" %016lx\n", pmd_val(l2)); - - l1 = ((pte_t *)(m2v(l2.pmd)))[l1idx]; - xen_raw_printk(" l1: %016lx\n", l1.pte); - xen_raw_printk(" %016lx\n", pte_val(l1)); -} -#endif - static void set_page_prot(void *addr, pgprot_t prot) { unsigned long pfn = __pa(addr) >> PAGE_SHIFT; pte_t pte = pfn_pte(pfn, prot); - xen_raw_printk("addr=%p pfn=%lx mfn=%lx prot=%016llx pte=%016llx\n", - addr, pfn, get_phys_to_machine(pfn), - pgprot_val(prot), pte.pte); - if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, 0)) BUG(); } @@ -1698,15 +1666,6 @@ asmlinkage void __init xen_start_kernel(void) xen_raw_console_write("about to get started...\n"); -#if 0 - xen_raw_printk("&boot_params=%p __pa(&boot_params)=%lx __va(__pa(&boot_params))=%lx\n", - &boot_params, __pa_symbol(&boot_params), - __va(__pa_symbol(&boot_params))); - - walk(pgd, &boot_params); - walk(pgd, __va(__pa(&boot_params))); -#endif - /* Start the world */ #ifdef CONFIG_X86_32 i386_start_kernel(); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 3b9bda46487a..05794c566e87 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -26,8 +26,15 @@ /* Pseudo-flag used for virtual NMI, which we don't implement yet */ #define XEN_EFLAGS_NMI 0x80000000 -#if 0 -#include +#if 1 +/* + x86-64 does not yet support direct access to percpu variables + via a segment override, so we just need to make sure this code + never gets used + */ +#define BUG ud2a +#define PER_CPU_VAR(var, off) 0xdeadbeef +#endif /* Enable events. This clears the event mask and tests the pending @@ -35,6 +42,8 @@ events, then enter the hypervisor to get them handled. */ ENTRY(xen_irq_enable_direct) + BUG + /* Unmask events */ movb $0, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) @@ -58,6 +67,8 @@ ENDPATCH(xen_irq_enable_direct) non-zero. */ ENTRY(xen_irq_disable_direct) + BUG + movb $1, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) ENDPATCH(xen_irq_disable_direct) ret @@ -74,6 +85,8 @@ ENDPATCH(xen_irq_disable_direct) Xen and x86 use opposite senses (mask vs enable). */ ENTRY(xen_save_fl_direct) + BUG + testb $0xff, PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) setz %ah addb %ah,%ah @@ -91,6 +104,8 @@ ENDPATCH(xen_save_fl_direct) if so. */ ENTRY(xen_restore_fl_direct) + BUG + testb $X86_EFLAGS_IF>>8, %ah setz PER_CPU_VAR(xen_vcpu_info, XEN_vcpu_info_mask) /* Preempt here doesn't matter because that will deal with @@ -133,7 +148,6 @@ check_events: pop %rcx pop %rax ret -#endif ENTRY(xen_adjust_exception_frame) mov 8+0(%rsp),%rcx -- cgit v1.2.2 From a2e8d3dcfd420177aaa0c53aca60a869bad75f4b Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 2 Oct 2008 22:09:20 -0700 Subject: x86: signal: move macros out from restore_sigcontext() move macros, COPY, COPY_SEG*, GET_SEG, out from restore_sigcontext(). x86_64: introduce COPY_SEG_STRICT for cs. Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 38 +++++++++++++++++++++----------------- arch/x86/kernel/signal_64.c | 18 +++++++++++------- 2 files changed, 32 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 4337cd510f0a..545448b7aeba 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -113,6 +113,27 @@ asmlinkage int sys_sigaltstack(unsigned long bx) return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp; \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + +#define GET_SEG(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + loadsegment(seg, tmp); \ +} /* * Do a signal return; undo the signal stack. @@ -126,23 +147,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) err |= __get_user(regs->x, &sc->x) - -#define COPY_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp; } - -#define COPY_SEG_STRICT(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - regs->seg = tmp|3; } - -#define GET_SEG(seg) \ - { unsigned short tmp; \ - err |= __get_user(tmp, &sc->seg); \ - loadsegment(seg, tmp); } - GET_SEG(gs); COPY_SEG(fs); COPY_SEG(es); diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index 53f86d95985b..feff4a91d095 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -52,6 +52,16 @@ sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, return do_sigaltstack(uss, uoss, regs->sp); } +#define COPY(x) { \ + err |= __get_user(regs->x, &sc->x); \ +} + +#define COPY_SEG_STRICT(seg) { \ + unsigned short tmp; \ + err |= __get_user(tmp, &sc->seg); \ + regs->seg = tmp | 3; \ +} + /* * Do a signal return; undo the signal stack. */ @@ -64,8 +74,6 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Always make any pending restarted system calls return -EINTR */ current_thread_info()->restart_block.fn = do_no_restart_syscall; -#define COPY(x) (err |= __get_user(regs->x, &sc->x)) - COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); COPY(dx); COPY(cx); COPY(ip); COPY(r8); @@ -80,11 +88,7 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, /* Kernel saves and restores only the CS segment register on signals, * which is the bare minimum needed to allow mixed 32/64-bit code. * App's signal handler can save/restore other segments if needed. */ - { - unsigned cs; - err |= __get_user(cs, &sc->cs); - regs->cs = cs | 3; /* Force into user mode */ - } + COPY_SEG_STRICT(cs); { unsigned int tmpflags; -- cgit v1.2.2 From 69e13ad56f9e2cd81c4f8bfd6267211c10c14c08 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Thu, 2 Oct 2008 22:18:47 -0700 Subject: x86: signal: remove indent in restore_sigcontext() remove braces and indent for flags and fpstate in restore_sigcontext(). Signed-off-by: Hiroshi Shimamoto Signed-off-by: Ingo Molnar --- arch/x86/kernel/signal_32.c | 21 +++++++-------------- arch/x86/kernel/signal_64.c | 19 +++++++------------ 2 files changed, 14 insertions(+), 26 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal_32.c b/arch/x86/kernel/signal_32.c index 545448b7aeba..d6dd057d0f22 100644 --- a/arch/x86/kernel/signal_32.c +++ b/arch/x86/kernel/signal_32.c @@ -142,6 +142,8 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ @@ -156,21 +158,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY_SEG_STRICT(cs); COPY_SEG_STRICT(ss); - { - unsigned int tmpflags; + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | - (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } - - { - void __user *buf; - - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; diff --git a/arch/x86/kernel/signal_64.c b/arch/x86/kernel/signal_64.c index feff4a91d095..a5c9627f4db9 100644 --- a/arch/x86/kernel/signal_64.c +++ b/arch/x86/kernel/signal_64.c @@ -69,6 +69,8 @@ static int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, unsigned long *pax) { + void __user *buf; + unsigned int tmpflags; unsigned int err = 0; /* Always make any pending restarted system calls return -EINTR */ @@ -90,19 +92,12 @@ restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, * App's signal handler can save/restore other segments if needed. */ COPY_SEG_STRICT(cs); - { - unsigned int tmpflags; - err |= __get_user(tmpflags, &sc->flags); - regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); - regs->orig_ax = -1; /* disable syscall checks */ - } - - { - void __user *buf; + err |= __get_user(tmpflags, &sc->flags); + regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); + regs->orig_ax = -1; /* disable syscall checks */ - err |= __get_user(buf, &sc->fpstate); - err |= restore_i387_xstate(buf); - } + err |= __get_user(buf, &sc->fpstate); + err |= restore_i387_xstate(buf); err |= __get_user(*pax, &sc->ax); return err; -- cgit v1.2.2 From 175e438f7a2de9d94110046be48697969569736a Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Thu, 2 Oct 2008 17:32:06 -0500 Subject: x86: trivial printk fix in efi.c [patch] x86: Trivial printk fix in efi.c The following line is lacking a space between "memdesc" and "doesn't". "Kernel-defined memdescdoesn't match the one from EFI!" Fixed the printk by adding a space. Signed-off-by: Russ Anderson Cc: Russ Anderson Signed-off-by: Ingo Molnar --- arch/x86/kernel/efi.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 06cc8d4254b1..945a31cdd81f 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -414,9 +414,11 @@ void __init efi_init(void) if (memmap.map == NULL) printk(KERN_ERR "Could not map the EFI memory map!\n"); memmap.map_end = memmap.map + (memmap.nr_map * memmap.desc_size); + if (memmap.desc_size != sizeof(efi_memory_desc_t)) - printk(KERN_WARNING "Kernel-defined memdesc" - "doesn't match the one from EFI!\n"); + printk(KERN_WARNING + "Kernel-defined memdesc doesn't match the one from EFI!\n"); + if (add_efi_memmap) do_add_efi_memmap(); -- cgit v1.2.2 From 6cdcdb99cf7c2e1835fc5b471864d21161c3e679 Mon Sep 17 00:00:00 2001 From: Andrey Borzenkov Date: Fri, 3 Oct 2008 21:08:49 +0400 Subject: x86 setup: fix ghost entries under /sys/firmware/edd take 3 Some BIOSes do not indicate error when trying to read from non- existing device. Zero buffer before reading and check that we possibly have valid MBR by looking for MBR magic. This was fixed in different way for edd.S in http://marc.info/?l=linux-kernel&m=114087765422490&w=2, but lost again when edd.S was rewritten in C. Signed-off-by: Andrey Borzenkov < arvidjaar@mail.ru> Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index bf4ae6ff518e..067e28cd3c5f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -43,6 +43,7 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) char *mbrbuf_ptr, *mbrbuf_end; u32 buf_base, mbr_base; extern char _end[]; + u16 mbr_magic; sector_size = ei->params.bytes_per_sector; if (!sector_size) @@ -60,11 +61,15 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) if (mbrbuf_end > (char *)(size_t)boot_params.hdr.heap_end_ptr) return -1; + memset(mbrbuf_ptr, 0, sector_size); if (read_mbr(devno, mbrbuf_ptr)) return -1; *mbrsig = *(u32 *)&mbrbuf_ptr[EDD_MBR_SIG_OFFSET]; - return 0; + mbr_magic = *(u16 *)&mbrbuf_ptr[510]; + + /* check for valid MBR magic */ + return mbr_magic == 0xAA55 ? 0 : -1; } static int get_edd_info(u8 devno, struct edd_info *ei) -- cgit v1.2.2 From 98920dc3d1113b883cbc73e3293446d3525c6042 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 3 Oct 2008 10:22:33 -0700 Subject: Revert "x86: fix ghost EDD devices in /sys again" This reverts commit 464f04c9e9b3b1c4f5ffb89c51d8ba2a2034c846. Obsoleted by commit 6cdcdb99cf7c2e1835fc5b471864d21161c3e679. Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 067e28cd3c5f..1aae8f3e5ca1 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -32,9 +32,7 @@ static int read_mbr(u8 devno, void *buf) : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) : : "esi", "edi", "memory"); - /* Some BIOSes do not set carry flag on error but still return - * error in AH. The condition below is expected to catch both */ - return -!!ax; /* 0 or -1 */ + return -(u8)ax; /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) -- cgit v1.2.2 From cc65f1ec192dc54de57483194502e9fa00934c39 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 3 Oct 2008 13:00:56 -0700 Subject: x86 setup: correct segfault in generation of 32-bit reloc kernel Impact: segfault on build of a 32-bit relocatable kernel When converting arch/x86/boot/compressed/relocs.c to support unlimited sections, the computation of sym_strtab in walk_relocs() was done incorrectly. This causes a segfault for some people when building the relocatable 32-bit kernel. Pointed out by Anonymous . Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/relocs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c index a1310c52fc0c..857e492c571e 100644 --- a/arch/x86/boot/compressed/relocs.c +++ b/arch/x86/boot/compressed/relocs.c @@ -492,7 +492,7 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym)) continue; } sh_symtab = sec_symtab->symtab; - sym_strtab = sec->link->strtab; + sym_strtab = sec_symtab->link->strtab; for (j = 0; j < sec->shdr.sh_size/sizeof(Elf32_Rel); j++) { Elf32_Rel *rel; Elf32_Sym *sym; -- cgit v1.2.2 From d960c9ce47c5360b39e47c064818e91c89df0e21 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Sat, 4 Oct 2008 21:18:51 +0200 Subject: x86 setup: remove IMAGE_OFFSET After commit 968de4f ("i386: Relocatable kernel support") IMAGE_OFFSET wasn't actually used anymore in the (current) X86 build system. Now remove its last traces. Signed-off-by: Paul Bolle Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index cceba1f46365..cd48c7210016 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -72,9 +72,7 @@ KBUILD_CFLAGS := $(LINUXINCLUDE) -g -Os -D_SETUP -D__KERNEL__ \ KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ -$(obj)/zImage: IMAGE_OFFSET := 0x1000 $(obj)/zImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -$(obj)/bzImage: IMAGE_OFFSET := 0x100000 $(obj)/bzImage: ccflags-y := -D__BIG_KERNEL__ $(obj)/bzImage: asflags-y := $(SVGA_MODE) $(RAMDISK) -D__BIG_KERNEL__ $(obj)/bzImage: BUILDFLAGS := -b @@ -117,7 +115,7 @@ $(obj)/setup.bin: $(obj)/setup.elf FORCE $(call if_changed,objcopy) $(obj)/compressed/vmlinux: FORCE - $(Q)$(MAKE) $(build)=$(obj)/compressed IMAGE_OFFSET=$(IMAGE_OFFSET) $@ + $(Q)$(MAKE) $(build)=$(obj)/compressed $@ # Set this if you want to pass append arguments to the zdisk/fdimage/isoimage kernel FDARGS = -- cgit v1.2.2 From 42fde7a05c5d6dc76e518ae12091ea897b1c132b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 19:34:18 -0700 Subject: x86: mtrr_cleanup: print out correct type v2 Print out the correct type when the Write Protected (WP) type is seen. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 406074c46f1e..9086b38fbabe 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1273,12 +1273,14 @@ static int __init mtrr_cleanup(unsigned address_bits) size_base = to_size_factor(size_base, &size_factor), start_base = range_state[i].base_pfn << (PAGE_SHIFT - 10); start_base = to_size_factor(start_base, &start_factor), + type = range_state[i].type; printk(KERN_DEBUG "reg %d, base: %ld%cB, range: %ld%cB, type %s\n", i, start_base, start_factor, size_base, size_factor, (type == MTRR_TYPE_UNCACHABLE) ? "UC" : - ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other") + ((type == MTRR_TYPE_WRPROT) ? "WP" : + ((type == MTRR_TYPE_WRBACK) ? "WB" : "Other")) ); } -- cgit v1.2.2 From 99e1aa17ce434010dd820b583628370cc15f10f3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 14:50:32 -0700 Subject: x86: mtrr_cleanup: first 1M may be covered in var mtrrs The first 1M is don't care when it comes to the variables MTRRs. Cover it as WB as a heuristic approximation; this is generally what we want to minimize the number of registers. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 9086b38fbabe..663e530e08e0 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -1293,6 +1293,15 @@ static int __init mtrr_cleanup(unsigned address_bits) } nr_range = x86_get_mtrr_mem_range(range, 0, extra_remove_base, extra_remove_size); + /* + * [0, 1M) should always be coverred by var mtrr with WB + * and fixed mtrrs should take effective before var mtrr for it + */ + nr_range = add_range_with_merge(range, nr_range, 0, + (1ULL<<(20 - PAGE_SHIFT)) - 1); + /* sort the ranges */ + sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); + range_sums = sum_ranges(range, nr_range); printk(KERN_INFO "total RAM coverred: %ldM\n", range_sums >> (20 - PAGE_SHIFT)); -- cgit v1.2.2 From dd5523552c2897e3fde16fc2fc8f6332addf66ab Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 14:50:33 -0700 Subject: x86: mtrr_cleanup: treat WRPROT as UNCACHEABLE For the purpose of MTRR canonicalization, treat WRPROT as UNCACHEABLE. Signed-off-by: Yinghai Lu Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 663e530e08e0..5994a9f78f3d 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -759,7 +759,8 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range, /* take out UC ranges */ for (i = 0; i < num_var_ranges; i++) { type = range_state[i].type; - if (type != MTRR_TYPE_UNCACHABLE) + if (type != MTRR_TYPE_UNCACHABLE && + type != MTRR_TYPE_WRPROT) continue; size = range_state[i].size_pfn; if (!size) @@ -1248,6 +1249,8 @@ static int __init mtrr_cleanup(unsigned address_bits) continue; if (!size) type = MTRR_NUM_TYPES; + if (type == MTRR_TYPE_WRPROT) + type = MTRR_TYPE_UNCACHABLE; num[type]++; } -- cgit v1.2.2 From d99e90164e6cf2eb85fa94d547d6336f8127a107 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 15:55:12 -0700 Subject: x86: gart iommu have direct mapping when agp is present too move init_memory_mapping() out of init_k8_gatt. for: http://bugzilla.kernel.org/show_bug.cgi?id=11676 2.6.27-rc2 to rc8, apgart fails, iommu=soft works, regression This is needed because we need to map the GART aperture even if the GATT is not initialized. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-gart_64.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 49285f8fd4d5..be33a5442d82 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -626,7 +626,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) struct pci_dev *dev; void *gatt; int i, error; - unsigned long start_pfn, end_pfn; printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); aper_size = aper_base = info->aper_size = 0; @@ -672,12 +671,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info) printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", aper_base, aper_size>>10); - /* need to map that range */ - end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); - if (end_pfn > max_low_pfn_mapped) { - start_pfn = (aper_base>>PAGE_SHIFT); - init_memory_mapping(start_pfn<>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); + if (end_pfn > max_low_pfn_mapped) { + start_pfn = (aper_base>>PAGE_SHIFT); + init_memory_mapping(start_pfn<> PAGE_SHIFT; -- cgit v1.2.2 From 2407390bd20de38740eef87eab4fe3d1deafdbdd Mon Sep 17 00:00:00 2001 From: Michal Januszewski Date: Sun, 5 Oct 2008 12:16:04 +0200 Subject: x86: replace a magic number with a named constant in the VESA boot code Replace a magic number with a named constant in the VESA boot code. Signed-off-by: Michal Januszewski Cc: linux-fbdev-devel@lists.sourceforge.net Signed-off-by: Ingo Molnar --- arch/x86/boot/video-vesa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 401ad998ad08..1e6fe0214c85 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -224,7 +224,7 @@ static void vesa_store_pm_info(void) static void vesa_store_mode_params_graphics(void) { /* Tell the kernel we're in VESA graphics mode */ - boot_params.screen_info.orig_video_isVGA = 0x23; + boot_params.screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; /* Mode parameters */ boot_params.screen_info.vesa_attributes = vminfo.mode_attr; -- cgit v1.2.2 From e84956f92a846246b09b34f2a728329c386d250f Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 6 Oct 2008 11:59:29 +0200 Subject: x86 ACPI: Blacklist two HP machines with buggy BIOSes There is a bug in the BIOSes of some HP boxes with AMD Turions which connects IO-APIC pins with ACPI thermal trip points in such a way that if the state of the IO-APIC is not as expected by the (buggy) BIOS, the thermal trip points are set to insanely low values (usually all of them become 16 degrees Celsius). As a result, thermal throttling kicks in and knock the system down to its shoes. Unfortunately some of the recent IO-APIC changes made the bug show up. To prevent this from happening, blacklist machines that are known to be affected (nx6115 and 6715b in this particular case). This fixes http://bugzilla.kernel.org/show_bug.cgi?id=11516 listed as a regression from 2.6.26. On my box it was caused by: commit 691874fa96d6349a8b60f8ea9c2bae52ece79941 Author: Maciej W. Rozycki Date: Tue May 27 21:19:51 2008 +0100 x86: I/O APIC: timer through 8259A second-chance Signed-off-by: Maciej W. Rozycki Signed-off-by: Ingo Molnar and the whole story is described in this (huge) thread: http://marc.info/?l=linux-kernel&m=121358440508410&w=4 Matthew Garrett told us about that happening on the nx6125: http://marc.info/?l=linux-kernel&m=121396307411930&w=4 and then Maciej analysed the breakage on the basis of a DSDT from the nx6325: http://marc.info/?l=linux-kernel&m=121401068718826&w=4 As far as the Dmitry's and Jason's boxes are concerned, I recognized the symptoms and asked them to verify that the blacklisting helped. It appears that the buggy BIOS code has been copy-pasted to the entire range of machines, for no good reason. Signed-off-by: Rafael J. Wysocki Tested-by: Dmitry Torokhov Tested-by: Jason Vas Dias Signed-off-by: Linus Torvalds --- arch/x86/kernel/acpi/boot.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index bfd10fd211cd..c102af85df9c 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1603,6 +1603,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { * is not connected at all. Force ignoring BIOS IRQ0 pin2 * override in that cases. */ + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP nx6115 laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6115"), + }, + }, { .callback = dmi_ignore_irq0_timer_override, .ident = "HP NX6125 laptop", @@ -1619,6 +1627,14 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq nx6325"), }, }, + { + .callback = dmi_ignore_irq0_timer_override, + .ident = "HP 6715b laptop", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Hewlett-Packard"), + DMI_MATCH(DMI_PRODUCT_NAME, "HP Compaq 6715b"), + }, + }, {} }; -- cgit v1.2.2 From e85ceae9102f6e3c1d707e7ac88fa48d252e9cfa Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 6 Oct 2008 13:50:59 -0500 Subject: kgdb, x86: Avoid invoking kgdb_nmicallback twice per NMI Stress-testing KVM's latest NMI support with kgdbts inside an SMP guest, I came across spurious unhandled NMIs while running the singlestep test. Looking closer at the code path each NMI takes when KGDB is enabled, I noticed that kgdb_nmicallback is called twice per event: One time via DIE_NMI_IPI notification, the second time on DIE_NMI. Removing the first invocation cures the unhandled NMIs here. Signed-off-by: Jan Kiszka Signed-off-by: Jason Wessel --- arch/x86/kernel/kgdb.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 8282a2139681..10435a120d22 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -455,12 +455,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; case DIE_NMI_IPI: - if (atomic_read(&kgdb_active) != -1) { - /* KGDB CPU roundup */ - kgdb_nmicallback(raw_smp_processor_id(), regs); - was_in_debug_nmi[raw_smp_processor_id()] = 1; - touch_nmi_watchdog(); - } + /* Just ignore, we will handle the roundup on DIE_NMI. */ return NOTIFY_DONE; case DIE_NMIUNKNOWN: -- cgit v1.2.2 From 33fb0e4eb53f16af312f9698f974e2e64af39c12 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 7 Oct 2008 00:11:22 +0200 Subject: x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC On some HP nx6... laptops (e.g. nx6325) BIOS reports an IRQ0 override but the SB450 chipset is configured such that timer interrupts goe to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. [ This more generic PCI ID based quirk should alleviate the need for dmi_ignore_irq0_timer_override DMI quirks. ] Signed-off-by: Andreas Herrmann Acked-by: "Maciej W. Rozycki" Tested-by: Dmitry Torokhov Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 48 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 4353cf5e6fac..6b839b147644 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,6 +95,52 @@ static void __init nvidia_bugs(int num, int slot, int func) } +static u32 ati_ixp4x0_rev(int num, int slot, int func) +{ + u32 d; + u8 b; + + b = read_pci_config_byte(num, slot, func, 0xac); + b &= ~(1<<5); + write_pci_config_byte(num, slot, func, 0xac, b); + + d = read_pci_config(num, slot, func, 0x70); + d |= 1<<8; + write_pci_config(num, slot, func, 0x70, d); + + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + return d; +} + +static void __init ati_bugs(int num, int slot, int func) +{ +#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) + u32 d; + u8 b; + + if (acpi_use_timer_override) + return; + + d = ati_ixp4x0_rev(num, slot, func); + if (d < 0x82) + acpi_skip_timer_override = 1; + else { + /* check for IRQ0 interrupt swap */ + outb(0x72, 0xcd6); b = inb(0xcd7); + if (!(b & 0x2)) + acpi_skip_timer_override = 1; + } + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB4X0 revision 0x%x\n", d); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +#endif +} + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -114,6 +160,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, via_bugs }, { PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, {} }; -- cgit v1.2.2 From 8d89adf44cf750e49691ba5b744b2ad77a05e997 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Oct 2008 06:47:52 +0200 Subject: x86: SB450: deprioritize DMI quirks This PCI ID based quick should be a full solution for the IRQ0 override related slowdown problem on SB450 based systems: 33fb0e4: x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC Emit a warning in those cases where the DMI quirk triggers but the PCI ID based quirk didnt. If this warning does not trigger then we can phase out the DMI quirks. Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c102af85df9c..096102d9b24e 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1421,8 +1421,16 @@ static int __init force_acpi_ht(const struct dmi_system_id *d) */ static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) { - pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", d->ident); - acpi_skip_timer_override = 1; + /* + * The ati_ixp4x0_rev() early PCI quirk should have set + * the acpi_skip_timer_override flag already: + */ + if (!acpi_skip_timer_override) { + WARN(1, KERN_ERR "ati_ixp4x0 quirk not complete.\n"); + pr_notice("%s detected: Ignoring BIOS IRQ0 pin2 override\n", + d->ident); + acpi_skip_timer_override = 1; + } return 0; } -- cgit v1.2.2 From f364eadab59b316ea0bd9f9bc01af0ad89065569 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 14:04:27 -0700 Subject: x86: xsave: fix error condition in save_i387_xstate() Actually return failure on error. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/xsave.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index ed5274a5bb0f..448fde96963c 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -121,6 +121,8 @@ int save_i387_xstate(void __user *buf) err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *) (buf + sig_xstate_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; } return 1; -- cgit v1.2.2 From 04944b793e18ece23f63c0252646b310c1845940 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 14:04:28 -0700 Subject: x86: xsave: set FP, SSE bits in the xsave header in the user sigcontext If a processor implementation discern that a processor state component is in its initialized state, it may modify the corresponding bit in the xsave header.xstate_bv as '0'. State in the memory layout setup by 'xsave' will be consistent with the bit values in the header. During signal handling, legacy applications may change the FP/SSE bits in the sigcontext memory layout without touching the FP/SSE header bits in the xsave header. So always set FP/SSE bits in the xsave header while saving the sigcontext state to the user space. During signal return, this will enable the kernel to capture any changes to the FP/SSE bits by the legacy applications which don't touch xsave headers. xsave aware apps can change the xstate_bv in the xsave header aswell as change any contents in the memory layout. xrestor as part of sigreturn will capture all the changes. Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/i387.c | 14 ++++++++++++++ arch/x86/kernel/xsave.c | 25 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 45723f1fe198..1f20608d4ca8 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -468,9 +468,23 @@ static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) static int save_i387_xsave(void __user *buf) { + struct task_struct *tsk = current; struct _fpstate_ia32 __user *fx = buf; int err = 0; + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. + * This will enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware applications can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + if (save_i387_fxsave(fx) < 0) return -1; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 448fde96963c..2f98323716d9 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -114,6 +114,8 @@ int save_i387_xstate(void __user *buf) if (task_thread_info(tsk)->status & TS_XSAVE) { struct _fpstate __user *fx = buf; + struct _xstate __user *x = buf; + u64 xstate_bv; err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, sizeof(struct _fpx_sw_bytes)); @@ -121,6 +123,29 @@ int save_i387_xstate(void __user *buf) err |= __put_user(FP_XSTATE_MAGIC2, (__u32 __user *) (buf + sig_xstate_size - FP_XSTATE_MAGIC2_SIZE)); + + /* + * Read the xstate_bv which we copied (directly from the cpu or + * from the state in task struct) to the user buffers and + * set the FP/SSE bits. + */ + err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); + + /* + * For legacy compatible, we always set FP/SSE bits in the bit + * vector while saving the state to the user context. This will + * enable us capturing any changes(during sigreturn) to + * the FP/SSE bits by the legacy applications which don't touch + * xstate_bv in the xsave header. + * + * xsave aware apps can change the xstate_bv in the xsave + * header as well as change any contents in the memory layout. + * xrestore as part of sigreturn will capture all the changes. + */ + xstate_bv |= XSTATE_FPSSE; + + err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); + if (err) return err; } -- cgit v1.2.2 From eefb47f6a1e855653d275cb90592a3587ea93a09 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 8 Oct 2008 13:01:39 -0700 Subject: xen: use spin_lock_nest_lock when pinning a pagetable When pinning/unpinning a pagetable with split pte locks, we can end up holding multiple pte locks at once (we need to hold the locks while there's a pending batched hypercall affecting the pte page). Because all the pte locks are in the same lock class, lockdep thinks that we're potentially taking a lock recursively. This warning is spurious because we always take the pte locks while holding mm->page_table_lock. lockdep now has spin_lock_nest_lock to express this kind of dominant lock use, so use it here so that lockdep knows what's going on. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/mmu.c | 74 +++++++++++++++++++++++++++++++++++------------------- arch/x86/xen/mmu.h | 3 --- 2 files changed, 48 insertions(+), 29 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 64e58681767e..ae173f6edd8b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -651,9 +651,12 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) * For 64-bit, we must skip the Xen hole in the middle of the address * space, just after the big x86-64 virtual hole. */ -static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), +static int xen_pgd_walk(struct mm_struct *mm, + int (*func)(struct mm_struct *mm, struct page *, + enum pt_level), unsigned long limit) { + pgd_t *pgd = mm->pgd; int flush = 0; unsigned hole_low, hole_high; unsigned pgdidx_limit, pudidx_limit, pmdidx_limit; @@ -698,7 +701,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pud = pud_offset(&pgd[pgdidx], 0); if (PTRS_PER_PUD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pud), PT_PUD); + flush |= (*func)(mm, virt_to_page(pud), PT_PUD); for (pudidx = 0; pudidx < PTRS_PER_PUD; pudidx++) { pmd_t *pmd; @@ -713,7 +716,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), pmd = pmd_offset(&pud[pudidx], 0); if (PTRS_PER_PMD > 1) /* not folded */ - flush |= (*func)(virt_to_page(pmd), PT_PMD); + flush |= (*func)(mm, virt_to_page(pmd), PT_PMD); for (pmdidx = 0; pmdidx < PTRS_PER_PMD; pmdidx++) { struct page *pte; @@ -727,7 +730,7 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), continue; pte = pmd_page(pmd[pmdidx]); - flush |= (*func)(pte, PT_PTE); + flush |= (*func)(mm, pte, PT_PTE); } } } @@ -735,20 +738,20 @@ static int xen_pgd_walk(pgd_t *pgd, int (*func)(struct page *, enum pt_level), out: /* Do the top level last, so that the callbacks can use it as a cue to do final things like tlb flushes. */ - flush |= (*func)(virt_to_page(pgd), PT_PGD); + flush |= (*func)(mm, virt_to_page(pgd), PT_PGD); return flush; } /* If we're using split pte locks, then take the page's lock and return a pointer to it. Otherwise return NULL. */ -static spinlock_t *xen_pte_lock(struct page *page) +static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) { spinlock_t *ptl = NULL; #if USE_SPLIT_PTLOCKS ptl = __pte_lockptr(page); - spin_lock(ptl); + spin_lock_nest_lock(ptl, &mm->page_table_lock); #endif return ptl; @@ -772,7 +775,8 @@ static void xen_do_pin(unsigned level, unsigned long pfn) MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); } -static int xen_pin_page(struct page *page, enum pt_level level) +static int xen_pin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestSetPagePinned(page); int flush; @@ -813,7 +817,7 @@ static int xen_pin_page(struct page *page, enum pt_level level) */ ptl = NULL; if (level == PT_PTE) - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, pfn_pte(pfn, PAGE_KERNEL_RO), @@ -834,11 +838,11 @@ static int xen_pin_page(struct page *page, enum pt_level level) /* This is called just after a mm has been created, but it has not been used yet. We need to make sure that its pagetable is all read-only, and can be pinned. */ -void xen_pgd_pin(pgd_t *pgd) +static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); - if (xen_pgd_walk(pgd, xen_pin_page, USER_LIMIT)) { + if (xen_pgd_walk(mm, xen_pin_page, USER_LIMIT)) { /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); @@ -852,25 +856,35 @@ void xen_pgd_pin(pgd_t *pgd) xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); if (user_pgd) { - xen_pin_page(virt_to_page(user_pgd), PT_PGD); + xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(user_pgd))); } } #else /* CONFIG_X86_32 */ #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is pinnable */ - xen_pin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_pin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif xen_do_pin(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(pgd))); #endif /* CONFIG_X86_64 */ xen_mc_issue(0); } +static void xen_pgd_pin(struct mm_struct *mm) +{ + __xen_pgd_pin(mm, mm->pgd); +} + /* * On save, we need to pin all pagetables to make sure they get their * mfns turned into pfns. Search the list for any unpinned pgds and pin * them (unpinned pgds are not currently in use, probably because the * process is under construction or destruction). + * + * Expected to be called in stop_machine() ("equivalent to taking + * every spinlock in the system"), so the locking doesn't really + * matter all that much. */ void xen_mm_pin_all(void) { @@ -881,7 +895,7 @@ void xen_mm_pin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (!PagePinned(page)) { - xen_pgd_pin((pgd_t *)page_address(page)); + __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); SetPageSavePinned(page); } } @@ -894,7 +908,8 @@ void xen_mm_pin_all(void) * that's before we have page structures to store the bits. So do all * the book-keeping now. */ -static __init int xen_mark_pinned(struct page *page, enum pt_level level) +static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, + enum pt_level level) { SetPagePinned(page); return 0; @@ -902,10 +917,11 @@ static __init int xen_mark_pinned(struct page *page, enum pt_level level) void __init xen_mark_init_mm_pinned(void) { - xen_pgd_walk(init_mm.pgd, xen_mark_pinned, FIXADDR_TOP); + xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } -static int xen_unpin_page(struct page *page, enum pt_level level) +static int xen_unpin_page(struct mm_struct *mm, struct page *page, + enum pt_level level) { unsigned pgfl = TestClearPagePinned(page); @@ -923,7 +939,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) * partially-pinned state. */ if (level == PT_PTE) { - ptl = xen_pte_lock(page); + ptl = xen_pte_lock(page, mm); if (ptl) xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); @@ -945,7 +961,7 @@ static int xen_unpin_page(struct page *page, enum pt_level level) } /* Release a pagetables pages back as normal RW */ -static void xen_pgd_unpin(pgd_t *pgd) +static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) { xen_mc_batch(); @@ -957,21 +973,27 @@ static void xen_pgd_unpin(pgd_t *pgd) if (user_pgd) { xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(user_pgd))); - xen_unpin_page(virt_to_page(user_pgd), PT_PGD); + xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); } } #endif #ifdef CONFIG_X86_PAE /* Need to make sure unshared kernel PMD is unpinned */ - xen_unpin_page(virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), PT_PMD); + xen_unpin_page(mm, virt_to_page(pgd_page(pgd[pgd_index(TASK_SIZE)])), + PT_PMD); #endif - xen_pgd_walk(pgd, xen_unpin_page, USER_LIMIT); + xen_pgd_walk(mm, xen_unpin_page, USER_LIMIT); xen_mc_issue(0); } +static void xen_pgd_unpin(struct mm_struct *mm) +{ + __xen_pgd_unpin(mm, mm->pgd); +} + /* * On resume, undo any pinning done at save, so that the rest of the * kernel doesn't see any unexpected pinned pagetables. @@ -986,7 +1008,7 @@ void xen_mm_unpin_all(void) list_for_each_entry(page, &pgd_list, lru) { if (PageSavePinned(page)) { BUG_ON(!PagePinned(page)); - xen_pgd_unpin((pgd_t *)page_address(page)); + __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); ClearPageSavePinned(page); } } @@ -997,14 +1019,14 @@ void xen_mm_unpin_all(void) void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); - xen_pgd_pin(next->pgd); + xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); - xen_pgd_pin(mm->pgd); + xen_pgd_pin(mm); spin_unlock(&mm->page_table_lock); } @@ -1095,7 +1117,7 @@ void xen_exit_mmap(struct mm_struct *mm) /* pgd may not be pinned in the error exit path of execve */ if (xen_page_pinned(mm->pgd)) - xen_pgd_unpin(mm->pgd); + xen_pgd_unpin(mm); spin_unlock(&mm->page_table_lock); } diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 0f59bd03f9e3..98d71659da5a 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -18,9 +18,6 @@ void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); void xen_exit_mmap(struct mm_struct *mm); -void xen_pgd_pin(pgd_t *pgd); -//void xen_pgd_unpin(pgd_t *pgd); - pteval_t xen_pte_val(pte_t); pmdval_t xen_pmd_val(pmd_t); pgdval_t xen_pgd_val(pgd_t); -- cgit v1.2.2 From 8d5922572038bae9f7b16fcb974eee806727b44c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=A9meth=20M=C3=A1rton?= Date: Thu, 9 Oct 2008 14:59:17 +0200 Subject: [CPUFREQ] correct broken links and email addresses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the no longer working links and email address in the documentation and in source code. Signed-off-by: Márton Németh Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index f1685fb91fbd..b8e05ee4f736 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -171,7 +171,7 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } if (c->x86 != 0xF) { - printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to \n"); + printk(KERN_WARNING PFX "Unknown p4-clockmod-capable CPU. Please send an e-mail to \n"); return 0; } diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c index 15e13c01cc36..3b5f06423e77 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c @@ -26,7 +26,7 @@ #include #define PFX "speedstep-centrino: " -#define MAINTAINER "cpufreq@lists.linux.org.uk" +#define MAINTAINER "cpufreq@vger.kernel.org" #define dprintk(msg...) \ cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg) -- cgit v1.2.2 From 18c6faa9624bf5d9d7da3906a8a1b909dba5899b Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Mon, 28 Jul 2008 13:00:49 +0200 Subject: [CPUFREQ] Coding style fixes to arch/x86/kernel/cpu/cpufreq/elanfreq.c Before: total: 15 errors, 10 warnings, 308 lines checked After: total: 0 errors, 4 warnings, 308 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/elafreq.o.* add1d36c2f077c5aab7682e8642a9f34 /tmp/elafreq.o.after add1d36c2f077c5aab7682e8642a9f34 /tmp/elafreq.o.before paolo@paolo-desktop:~/linux.trees.git$ size /tmp/elafreq.o.* text data bss dec hex filename 934 270 4 1208 4b8 /tmp/elafreq.o.after 934 270 4 1208 4b8 /tmp/elafreq.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/elanfreq.c | 42 +++++++++++++++++----------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c index e4a4bf870e94..fe613c93b366 100644 --- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c +++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c @@ -25,8 +25,8 @@ #include #include -#include -#include +#include +#include #define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */ #define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */ @@ -82,7 +82,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) u8 clockspeed_reg; /* Clock Speed Register */ local_irq_disable(); - outb_p(0x80,REG_CSCIR); + outb_p(0x80, REG_CSCIR); clockspeed_reg = inb_p(REG_CSCDR); local_irq_enable(); @@ -98,10 +98,10 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) } /* 33 MHz is not 32 MHz... */ - if ((clockspeed_reg & 0xE0)==0xA0) + if ((clockspeed_reg & 0xE0) == 0xA0) return 33000; - return ((1<<((clockspeed_reg & 0xE0) >> 5)) * 1000); + return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000; } @@ -117,7 +117,7 @@ static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu) * There is no return value. */ -static void elanfreq_set_cpu_state (unsigned int state) +static void elanfreq_set_cpu_state(unsigned int state) { struct cpufreq_freqs freqs; @@ -144,20 +144,20 @@ static void elanfreq_set_cpu_state (unsigned int state) */ local_irq_disable(); - outb_p(0x40,REG_CSCIR); /* Disable hyperspeed mode */ - outb_p(0x00,REG_CSCDR); + outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */ + outb_p(0x00, REG_CSCDR); local_irq_enable(); /* wait till internal pipelines and */ udelay(1000); /* buffers have cleaned up */ local_irq_disable(); /* now, set the CPU clock speed register (0x80) */ - outb_p(0x80,REG_CSCIR); - outb_p(elan_multiplier[state].val80h,REG_CSCDR); + outb_p(0x80, REG_CSCIR); + outb_p(elan_multiplier[state].val80h, REG_CSCDR); /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */ - outb_p(0x40,REG_CSCIR); - outb_p(elan_multiplier[state].val40h,REG_CSCDR); + outb_p(0x40, REG_CSCIR); + outb_p(elan_multiplier[state].val40h, REG_CSCDR); udelay(10000); local_irq_enable(); @@ -173,12 +173,12 @@ static void elanfreq_set_cpu_state (unsigned int state) * for the hardware supported by the driver. */ -static int elanfreq_verify (struct cpufreq_policy *policy) +static int elanfreq_verify(struct cpufreq_policy *policy) { return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]); } -static int elanfreq_target (struct cpufreq_policy *policy, +static int elanfreq_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -205,7 +205,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) /* capability check */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) + (c->x86 != 4) || (c->x86_model != 10)) return -ENODEV; /* max freq */ @@ -213,7 +213,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) max_freq = elanfreq_get_cpu_frequency(0); /* table init */ - for (i=0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) { if (elanfreq_table[i].frequency > max_freq) elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID; } @@ -224,7 +224,7 @@ static int elanfreq_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu); return 0; @@ -260,7 +260,7 @@ __setup("elanfreq=", elanfreq_setup); #endif -static struct freq_attr* elanfreq_attr[] = { +static struct freq_attr *elanfreq_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -284,9 +284,9 @@ static int __init elanfreq_init(void) /* Test if we have the right hardware */ if ((c->x86_vendor != X86_VENDOR_AMD) || - (c->x86 != 4) || (c->x86_model!=10)) { + (c->x86 != 4) || (c->x86_model != 10)) { printk(KERN_INFO "elanfreq: error: no Elan processor found!\n"); - return -ENODEV; + return -ENODEV; } return cpufreq_register_driver(&elanfreq_driver); } @@ -298,7 +298,7 @@ static void __exit elanfreq_exit(void) } -module_param (max_freq, int, 0444); +module_param(max_freq, int, 0444); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Robert Schwebel , Sven Geggus "); -- cgit v1.2.2 From 8d2d2051e50c4ca0207a42b243369e22f7d90a5c Mon Sep 17 00:00:00 2001 From: Paolo Ciarrocchi Date: Mon, 28 Jul 2008 21:08:16 +0200 Subject: [CPUFREQ] Coding style fixes to arch/x86/kernel/cpu/cpufreq/powernow-k6.c Before: total: 11 errors, 15 warnings, 255 lines checked After: total: 0 errors, 6 warnings, 254 lines checked paolo@paolo-desktop:~/linux.trees.git$ md5sum /tmp/powernow-k6.o.* 476932f5e1ffe365db9d1dfb3f860369 /tmp/powernow-k6.o.after 476932f5e1ffe365db9d1dfb3f860369 /tmp/powernow-k6.o.before Signed-off-by: Paolo Ciarrocchi Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 41 +++++++++++++++---------------- 1 file changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index eb9b62b0830c..b5ced806a316 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -15,12 +15,11 @@ #include #include -#include -#include +#include +#include - -#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long - as it is unused */ +#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long + as it is unused */ static unsigned int busfreq; /* FSB, in 10 kHz */ static unsigned int max_multiplier; @@ -53,7 +52,7 @@ static int powernow_k6_get_cpu_multiplier(void) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -67,9 +66,9 @@ static int powernow_k6_get_cpu_multiplier(void) * * Tries to change the PowerNow! multiplier */ -static void powernow_k6_set_state (unsigned int best_i) +static void powernow_k6_set_state(unsigned int best_i) { - unsigned long outvalue=0, invalue=0; + unsigned long outvalue = 0, invalue = 0; unsigned long msrval; struct cpufreq_freqs freqs; @@ -90,10 +89,10 @@ static void powernow_k6_set_state (unsigned int best_i) msrval = POWERNOW_IOPORT + 0x1; wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */ - invalue=inl(POWERNOW_IOPORT + 0x8); + invalue = inl(POWERNOW_IOPORT + 0x8); invalue = invalue & 0xf; outvalue = outvalue | invalue; - outl(outvalue ,(POWERNOW_IOPORT + 0x8)); + outl(outvalue , (POWERNOW_IOPORT + 0x8)); msrval = POWERNOW_IOPORT + 0x0; wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */ @@ -124,7 +123,7 @@ static int powernow_k6_verify(struct cpufreq_policy *policy) * * sets a new CPUFreq policy */ -static int powernow_k6_target (struct cpufreq_policy *policy, +static int powernow_k6_target(struct cpufreq_policy *policy, unsigned int target_freq, unsigned int relation) { @@ -152,7 +151,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) busfreq = cpu_khz / max_multiplier; /* table init */ - for (i=0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { + for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) { if (clock_ratio[i].index > max_multiplier) clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID; else @@ -165,7 +164,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); if (result) - return (result); + return result; cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu); @@ -176,8 +175,8 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy) static int powernow_k6_cpu_exit(struct cpufreq_policy *policy) { unsigned int i; - for (i=0; i<8; i++) { - if (i==max_multiplier) + for (i = 0; i < 8; i++) { + if (i == max_multiplier) powernow_k6_set_state(i); } cpufreq_frequency_table_put_attr(policy->cpu); @@ -189,7 +188,7 @@ static unsigned int powernow_k6_get(unsigned int cpu) return busfreq * powernow_k6_get_cpu_multiplier(); } -static struct freq_attr* powernow_k6_attr[] = { +static struct freq_attr *powernow_k6_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, NULL, }; @@ -227,7 +226,7 @@ static int __init powernow_k6_init(void) } if (cpufreq_register_driver(&powernow_k6_driver)) { - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); return -EINVAL; } @@ -243,13 +242,13 @@ static int __init powernow_k6_init(void) static void __exit powernow_k6_exit(void) { cpufreq_unregister_driver(&powernow_k6_driver); - release_region (POWERNOW_IOPORT, 16); + release_region(POWERNOW_IOPORT, 16); } -MODULE_AUTHOR ("Arjan van de Ven , Dave Jones , Dominik Brodowski "); -MODULE_DESCRIPTION ("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); -MODULE_LICENSE ("GPL"); +MODULE_AUTHOR("Arjan van de Ven , Dave Jones , Dominik Brodowski "); +MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); +MODULE_LICENSE("GPL"); module_init(powernow_k6_init); module_exit(powernow_k6_exit); -- cgit v1.2.2 From 847aef6ffd85787b62395b64719f8f7c5975bf1b Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Mon, 14 Jul 2008 11:59:44 +0900 Subject: [CPUFREQ] acpi-cpufreq: add error handling for cpufreq_register_driver() error add error handling for cpufreq_register_driver() error Signed-off-by: Akinobu Mita Cc: cpufreq@lists.linux.org.uk Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b835839..1def5b06fa4a 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -785,7 +785,11 @@ static int __init acpi_cpufreq_init(void) if (ret) return ret; - return cpufreq_register_driver(&acpi_cpufreq_driver); + ret = cpufreq_register_driver(&acpi_cpufreq_driver); + if (ret) + free_percpu(acpi_perf_data); + + return ret; } static void __exit acpi_cpufreq_exit(void) @@ -795,8 +799,6 @@ static void __exit acpi_cpufreq_exit(void) cpufreq_unregister_driver(&acpi_cpufreq_driver); free_percpu(acpi_perf_data); - - return; } module_param(acpi_pstate_strict, uint, 0644); -- cgit v1.2.2 From bf0b90e357c883e8efd72954432efe652de74c76 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Mon, 4 Aug 2008 11:59:07 -0700 Subject: [CPUFREQ][1/6] cpufreq: Add cpu number parameter to __cpufreq_driver_getavg() Add a cpu parameter to __cpufreq_driver_getavg(). This is needed for software cpufreq coordination where policy->cpu may not be same as the CPU on which we want to getavg frequency. A follow-on patch will use this parameter to getavg freq from all cpus in policy->cpus. Change since last patch. Fix the offline/online and suspend/resume oops reported by Youquan Song Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 1def5b06fa4a..c24c4a487b7c 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -256,7 +256,8 @@ static u32 get_cur_val(const cpumask_t *mask) * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and * no meaning should be associated with absolute values of these MSRs. */ -static unsigned int get_measured_perf(unsigned int cpu) +static unsigned int get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) { union { struct { @@ -326,7 +327,7 @@ static unsigned int get_measured_perf(unsigned int cpu) #endif - retval = per_cpu(drv_data, cpu)->max_freq * perf_percent / 100; + retval = per_cpu(drv_data, policy->cpu)->max_freq * perf_percent / 100; put_cpu(); set_cpus_allowed_ptr(current, &saved_mask); -- cgit v1.2.2 From 5dc64a3442b98eaa0e3730c35fcf00cf962a93e7 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 10 Oct 2008 11:27:38 +0100 Subject: xen: do not reserve 2 pages of padding between hypervisor and fixmap. When reserving space for the hypervisor the Xen paravirt backend adds an extra two pages (this was carried forward from the 2.6.18-xen tree which had them "for safety"). Depending on various CONFIG options this can cause the boot time fixmaps to span multiple PMDs which is not supported and triggers a WARN in early_ioremap_init(). This was exposed by 2216d199b1430d1c0affb1498a9ebdbd9c0de439 which moved the dmi table parsing earlier. x86: fix CONFIG_X86_RESERVE_LOW_64K=y The bad_bios_dmi_table() quirk never triggered because we do DMI setup too late. Move it a bit earlier. There is no real reason to reserve these two extra pages and the fixmap already incorporates FIX_HOLE which serves the same purpose. None of the other callers of reserve_top_address do this. Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 85692c9f6496..977a54255fb4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1370,7 +1370,7 @@ static void __init xen_reserve_top(void) if (HYPERVISOR_xen_version(XENVER_platform_parameters, &pp) == 0) top = pp.virt_start; - reserve_top_address(-top + 2 * PAGE_SIZE); + reserve_top_address(-top); #endif /* CONFIG_X86_32 */ } -- cgit v1.2.2 From 43603c8df97f246e8be7b9cc92a8f968a85108bd Mon Sep 17 00:00:00 2001 From: Hans Schou Date: Thu, 9 Oct 2008 20:47:24 +0200 Subject: x86, debug: print more information about unknown CPUs Write the name of the unknown vendor_id to output instead of just "unknown". Tag changed to 'vendor_id' as used in /proc/cpuinfo Signed-off-by: Hans Schou Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cef3519b3bbb..84c4040efa80 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -406,7 +406,7 @@ static void __cpuinit get_cpu_vendor(struct cpuinfo_x86 *c) if (!printed) { printed++; - printk(KERN_ERR "CPU: Vendor unknown, using generic init.\n"); + printk(KERN_ERR "CPU: vendor_id '%s' unknown, using generic init.\n", v); printk(KERN_ERR "CPU: Your system may be unstable.\n"); } -- cgit v1.2.2 From b2bc27314664c4d1a2f02e6f4cd0c32e4681d61e Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:36 -0700 Subject: x86, cpa: rename PTE attribute macros for kernel direct mapping in early boot Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 34 +++++++++++++++------------------- arch/x86/kernel/head_64.S | 4 ++-- 2 files changed, 17 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index a7010c3a377a..e835b4eea70b 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -172,10 +172,6 @@ num_subarch_entries = (. - subarch_entries) / 4 * * Note that the stack is not yet set up! */ -#define PTE_ATTR 0x007 /* PRESENT+RW+USER */ -#define PDE_ATTR 0x067 /* PRESENT+RW+USER+DIRTY+ACCESSED */ -#define PGD_ATTR 0x001 /* PRESENT (no other attributes) */ - default_entry: #ifdef CONFIG_X86_PAE @@ -196,9 +192,9 @@ default_entry: movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_pmd), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PMD entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ movl %ecx,(%edx) /* Store PMD entry */ /* Upper half already zero */ addl $8,%edx @@ -215,7 +211,7 @@ default_entry: * End condition: we must map up to and including INIT_MAP_BEYOND_END * bytes beyond the end of our own page tables. */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b 1: @@ -224,7 +220,7 @@ default_entry: movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) #else /* Not PAE */ @@ -233,9 +229,9 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl $pa(pg0), %edi movl %edi, pa(init_pg_tables_start) movl $pa(swapper_pg_dir), %edx - movl $PTE_ATTR, %eax + movl $PTE_IDENT_ATTR, %eax 10: - leal PDE_ATTR(%edi),%ecx /* Create PDE entry */ + leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ movl %ecx,(%edx) /* Store identity PDE entry */ movl %ecx,page_pde_offset(%edx) /* Store kernel PDE entry */ addl $4,%edx @@ -249,7 +245,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); * bytes beyond the end of our own page tables; the +0x007 is * the attribute bits */ - leal (INIT_MAP_BEYOND_END+PTE_ATTR)(%edi),%ebp + leal (INIT_MAP_BEYOND_END+PTE_IDENT_ATTR)(%edi),%ebp cmpl %ebp,%eax jb 10b movl %edi,pa(init_pg_tables_end) @@ -257,7 +253,7 @@ page_pde_offset = (__PAGE_OFFSET >> 20); movl %eax, pa(max_pfn_mapped) /* Do early initialization of the fixmap area */ - movl $pa(swapper_pg_fixmap)+PDE_ATTR,%eax + movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax movl %eax,pa(swapper_pg_dir+0xffc) #endif jmp 3f @@ -634,19 +630,19 @@ ENTRY(empty_zero_page) /* Page-aligned for the benefit of paravirt? */ .align PAGE_SIZE_asm ENTRY(swapper_pg_dir) - .long pa(swapper_pg_pmd+PGD_ATTR),0 /* low identity map */ + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ # if KPMDS == 3 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x2000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 # elif KPMDS == 2 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 - .long pa(swapper_pg_pmd+PGD_ATTR+0x1000),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 # elif KPMDS == 1 .long 0,0 .long 0,0 - .long pa(swapper_pg_pmd+PGD_ATTR),0 + .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 # else # error "Kernel PMDs should be 1, 2 or 3" # endif diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index db3280afe886..26cfdc1d7c7f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -110,7 +110,7 @@ startup_64: movq %rdi, %rax shrq $PMD_SHIFT, %rax andq $(PTRS_PER_PMD - 1), %rax - leaq __PAGE_KERNEL_LARGE_EXEC(%rdi), %rdx + leaq __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx leaq level2_spare_pgt(%rip), %rbx movq %rdx, 0(%rbx, %rax, 8) ident_complete: @@ -374,7 +374,7 @@ NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. * Don't set NX because code runs from these pages. */ - PMDS(0, __PAGE_KERNEL_LARGE_EXEC, PTRS_PER_PMD) + PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) NEXT_PAGE(level2_kernel_pgt) /* -- cgit v1.2.2 From a2699e477b8e6b17d4da64916f766dd5a2576c9c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:38 -0700 Subject: x86, cpa: make the kernel physical mapping initialization a two pass sequence In the first pass, kernel physical mapping will be setup using large or small pages but uses the same PTE attributes as that of the early PTE attributes setup by early boot code in head_[32|64].S After flushing TLB's, we go through the second pass, which setups the direct mapped PTE's with the appropriate attributes (like NX, GLOBAL etc) which are runtime detectable. This two pass mechanism conforms to the TLB app note which says: "Software should not write to a paging-structure entry in a way that would change, for any linear address, both the page size and either the page frame or attributes." Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 65 ++++++++++++++++++++++++++++++--- arch/x86/mm/init_64.c | 99 +++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 144 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index d37f29376b0c..9b5f7d7049d0 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -194,11 +194,30 @@ static void __init kernel_physical_mapping_init(pgd_t *pgd_base, pgd_t *pgd; pmd_t *pmd; pte_t *pte; - unsigned pages_2m = 0, pages_4k = 0; + unsigned pages_2m, pages_4k; + int mapping_iter; + + /* + * First iteration will setup identity mapping using large/small pages + * based on use_pse, with other attributes same as set by + * the early code in head_32.S + * + * Second iteration will setup the appropriate attributes (NX, GLOBAL..) + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + */ + mapping_iter = 1; if (!cpu_has_pse) use_pse = 0; +repeat: + pages_2m = pages_4k = 0; pfn = start_pfn; pgd_idx = pgd_index((pfn<> PAGE_SHIFT, PAGE_KERNEL).pte); + pages++; +repeat_set_pte: set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; - pages++; } - update_page_count(PG_LEVEL_4K, pages); + + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } @@ -318,7 +323,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, { unsigned long pages = 0; unsigned long last_map_addr = end; - unsigned long start = address; int i = pmd_index(address); @@ -341,15 +345,14 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, last_map_addr = phys_pte_update(pmd, address, end); spin_unlock(&init_mm.page_table_lock); + continue; } - /* Count entries we're using from level2_ident_pgt */ - if (start == 0) - pages++; - continue; + goto repeat_set_pte; } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -366,7 +369,8 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); spin_unlock(&init_mm.page_table_lock); } - update_page_count(PG_LEVEL_2M, pages); + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_2M, pages); return last_map_addr; } @@ -405,14 +409,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, } if (pud_val(*pud)) { - if (!pud_large(*pud)) + if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, page_size_mask); - continue; + continue; + } + + goto repeat_set_pte; } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -430,7 +438,9 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, spin_unlock(&init_mm.page_table_lock); } __flush_tlb_all(); - update_page_count(PG_LEVEL_1G, pages); + + if (physical_mapping_iter == 1) + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; } @@ -494,15 +504,54 @@ static void __init init_gbpages(void) direct_gbpages = 0; } +static int is_kernel(unsigned long pfn) +{ + unsigned long pg_addresss = pfn << PAGE_SHIFT; + + if (pg_addresss >= (unsigned long) __pa(_text) && + pg_addresss <= (unsigned long) __pa(_end)) + return 1; + + return 0; +} + static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - unsigned long next, last_map_addr = end; + unsigned long next, last_map_addr; + u64 cached_supported_pte_mask = __supported_pte_mask; + unsigned long cache_start = start; + unsigned long cache_end = end; + + /* + * First iteration will setup identity mapping using large/small pages + * based on page_size_mask, with other attributes same as set by + * the early code in head_64.S + * + * Second iteration will setup the appropriate attributes + * as desired for the kernel identity mapping. + * + * This two pass mechanism conforms to the TLB app note which says: + * + * "Software should not write to a paging-structure entry in a way + * that would change, for any linear address, both the page size + * and either the page frame or attributes." + * + * For now, only difference between very early PTE attributes used in + * head_64.S and here is _PAGE_NX. + */ + BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC) + != _PAGE_NX); + __supported_pte_mask &= ~(_PAGE_NX); + physical_mapping_iter = 1; - start = (unsigned long)__va(start); - end = (unsigned long)__va(end); +repeat: + last_map_addr = cache_end; + + start = (unsigned long)__va(cache_start); + end = (unsigned long)__va(cache_end); for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -514,11 +563,21 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, next = end; if (pgd_val(*pgd)) { + /* + * Static identity mappings will be overwritten + * with run-time mappings. For example, this allows + * the static 0-1GB identity mapping to be mapped + * non-executable with this. + */ + if (is_kernel(pte_pfn(*((pte_t *) pgd)))) + goto realloc; + last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end), page_size_mask); continue; } +realloc: pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); @@ -528,6 +587,16 @@ static unsigned long __init kernel_physical_mapping_init(unsigned long start, pgd_populate(&init_mm, pgd, __va(pud_phys)); spin_unlock(&init_mm.page_table_lock); } + __flush_tlb_all(); + + if (physical_mapping_iter == 1) { + physical_mapping_iter = 2; + /* + * Second iteration will set the actual desired PTE attributes. + */ + __supported_pte_mask = cached_supported_pte_mask; + goto repeat; + } return last_map_addr; } -- cgit v1.2.2 From 0b8fdcbcd287a1fbe66817491e6149841ae25705 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:39 -0700 Subject: x86, cpa: dont use large pages for kernel identity mapping with DEBUG_PAGEALLOC Don't use large pages for kernel identity mapping with DEBUG_PAGEALLOC. This will remove the need to split the large page for the allocated kernel page in the interrupt context. This will simplify cpa code(as we don't do the split any more from the interrupt context). cpa code simplication in the subsequent patches. Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 18 ++++++++++++++---- arch/x86/mm/init_64.c | 26 ++++++++++++++++++++------ 2 files changed, 34 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 9b5f7d7049d0..44ccb028c350 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -777,7 +777,7 @@ void __init setup_bootmem_allocator(void) after_init_bootmem = 1; } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse) { unsigned long puds, pmds, ptes, tables, start; @@ -787,7 +787,7 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += PAGE_ALIGN(pmds * sizeof(pmd_t)); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); @@ -827,12 +827,22 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, pgd_t *pgd_base = swapper_pg_dir; unsigned long start_pfn, end_pfn; unsigned long big_page_start; +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + int use_pse = 0; +#else + int use_pse = cpu_has_pse; +#endif /* * Find space for the kernel direct mapping tables. */ if (!after_init_bootmem) - find_early_table_space(end); + find_early_table_space(end, use_pse); #ifdef CONFIG_X86_PAE set_nx(); @@ -878,7 +888,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, end_pfn = (end>>PMD_SHIFT) << (PMD_SHIFT - PAGE_SHIFT); if (start_pfn < end_pfn) kernel_physical_mapping_init(pgd_base, start_pfn, end_pfn, - cpu_has_pse); + use_pse); /* tail is not big page alignment ? */ start_pfn = end_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1ba945eb6282..9d7587ac1ebc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -456,13 +456,14 @@ phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end, return phys_pud_init(pud, addr, end, page_size_mask); } -static void __init find_early_table_space(unsigned long end) +static void __init find_early_table_space(unsigned long end, int use_pse, + int use_gbpages) { unsigned long puds, pmds, ptes, tables, start; puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; tables = round_up(puds * sizeof(pud_t), PAGE_SIZE); - if (direct_gbpages) { + if (use_gbpages) { unsigned long extra; extra = end - ((end>>PUD_SHIFT) << PUD_SHIFT); pmds = (extra + PMD_SIZE - 1) >> PMD_SHIFT; @@ -470,7 +471,7 @@ static void __init find_early_table_space(unsigned long end) pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT; tables += round_up(pmds * sizeof(pmd_t), PAGE_SIZE); - if (cpu_has_pse) { + if (use_pse) { unsigned long extra; extra = end - ((end>>PMD_SHIFT) << PMD_SHIFT); ptes = (extra + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -640,6 +641,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, struct map_range mr[NR_RANGE_MR]; int nr_range, i; + int use_pse, use_gbpages; printk(KERN_INFO "init_memory_mapping\n"); @@ -653,9 +655,21 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, if (!after_bootmem) init_gbpages(); - if (direct_gbpages) +#ifdef CONFIG_DEBUG_PAGEALLOC + /* + * For CONFIG_DEBUG_PAGEALLOC, identity mapping will use small pages. + * This will simplify cpa(), which otherwise needs to support splitting + * large pages into small in interrupt context, etc. + */ + use_pse = use_gbpages = 0; +#else + use_pse = cpu_has_pse; + use_gbpages = direct_gbpages; +#endif + + if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; - if (cpu_has_pse) + if (use_pse) page_size_mask |= 1 << PG_LEVEL_2M; memset(mr, 0, sizeof(mr)); @@ -716,7 +730,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, (mr[i].page_size_mask & (1< Date: Tue, 23 Sep 2008 14:00:40 -0700 Subject: x86, cpa: no need to check alias for __set_pages_p/__set_pages_np No alias checking needed for setting present/not-present mapping. Otherwise, we may need to break large pages for 64-bit kernel text mappings (this adds to complexity if we want to do this from atomic context especially, for ex: with CONFIG_DEBUG_PAGEALLOC). Let's keep it simple! Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4b6968ba0864..162812b05d28 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1121,7 +1121,13 @@ static int __set_pages_p(struct page *page, int numpages) .mask_clr = __pgprot(0), .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } static int __set_pages_np(struct page *page, int numpages) @@ -1133,7 +1139,13 @@ static int __set_pages_np(struct page *page, int numpages) .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW), .flags = 0}; - return __change_page_attr_set_clr(&cpa, 1); + /* + * No alias checking needed for setting not present flag. otherwise, + * we may need to break large pages for 64-bit kernel text + * mappings (this adds to complexity if we want to do this from + * atomic context especially). Let's keep it simple! + */ + return __change_page_attr_set_clr(&cpa, 0); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -1153,11 +1165,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) /* * The return value is ignored as the calls cannot fail. - * Large pages are kept enabled at boot time, and are - * split up quickly with DEBUG_PAGEALLOC. If a splitup - * fails here (due to temporary memory shortage) no damage - * is done because we just keep the largepage intact up - * to the next attempt when it will likely be split up: + * Large pages for identity mappings are not used at boot time + * and hence no memory allocations during large page split. */ if (enable) __set_pages_p(page, numpages); -- cgit v1.2.2 From 8311eb84bf842d345f543f4c62ca2b6ea26f638c Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:41 -0700 Subject: x86, cpa: remove cpa pool code Interrupt context no longer splits large page in cpa(). So we can do away with cpa memory pool code. Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 - arch/x86/mm/pageattr.c | 157 ++----------------------------------------------- 3 files changed, 5 insertions(+), 155 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 44ccb028c350..74780800e7e7 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -1051,7 +1051,6 @@ void __init mem_init(void) if (boot_cpu_data.wp_works_ok < 0) test_wp_bit(); - cpa_init(); save_pg_dir(); zap_low_mappings(); } diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 9d7587ac1ebc..f54a4d97530f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -889,8 +889,6 @@ void __init mem_init(void) reservedpages << (PAGE_SHIFT-10), datasize >> 10, initsize >> 10); - - cpa_init(); } void free_init_pages(char *what, unsigned long begin, unsigned long end) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 162812b05d28..f5e8663c0f75 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -447,114 +447,17 @@ out_unlock: return do_split; } -static LIST_HEAD(page_pool); -static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed; - -static void cpa_fill_pool(struct page **ret) -{ - gfp_t gfp = GFP_KERNEL; - unsigned long flags; - struct page *p; - - /* - * Avoid recursion (on debug-pagealloc) and also signal - * our priority to get to these pagetables: - */ - if (current->flags & PF_MEMALLOC) - return; - current->flags |= PF_MEMALLOC; - - /* - * Allocate atomically from atomic contexts: - */ - if (in_atomic() || irqs_disabled() || debug_pagealloc) - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - - while (pool_pages < pool_size || (ret && !*ret)) { - p = alloc_pages(gfp, 0); - if (!p) { - pool_failed++; - break; - } - /* - * If the call site needs a page right now, provide it: - */ - if (ret && !*ret) { - *ret = p; - continue; - } - spin_lock_irqsave(&pgd_lock, flags); - list_add(&p->lru, &page_pool); - pool_pages++; - spin_unlock_irqrestore(&pgd_lock, flags); - } - - current->flags &= ~PF_MEMALLOC; -} - -#define SHIFT_MB (20 - PAGE_SHIFT) -#define ROUND_MB_GB ((1 << 10) - 1) -#define SHIFT_MB_GB 10 -#define POOL_PAGES_PER_GB 16 - -void __init cpa_init(void) -{ - struct sysinfo si; - unsigned long gb; - - si_meminfo(&si); - /* - * Calculate the number of pool pages: - * - * Convert totalram (nr of pages) to MiB and round to the next - * GiB. Shift MiB to Gib and multiply the result by - * POOL_PAGES_PER_GB: - */ - if (debug_pagealloc) { - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; - } else { - pool_size = 1; - } - pool_low = pool_size; - - cpa_fill_pool(NULL); - printk(KERN_DEBUG - "CPA: page pool initialized %lu of %lu pages preallocated\n", - pool_pages, pool_size); -} - static int split_large_page(pte_t *kpte, unsigned long address) { unsigned long flags, pfn, pfninc = 1; unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; - struct page *base; + struct page *base = alloc_pages(GFP_KERNEL, 0); + if (!base) + return -ENOMEM; - /* - * Get a page from the pool. The pool list is protected by the - * pgd_lock, which we have to take anyway for the split - * operation: - */ spin_lock_irqsave(&pgd_lock, flags); - if (list_empty(&page_pool)) { - spin_unlock_irqrestore(&pgd_lock, flags); - base = NULL; - cpa_fill_pool(&base); - if (!base) - return -ENOMEM; - spin_lock_irqsave(&pgd_lock, flags); - } else { - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - } - /* * Check for races, another CPU might have split this page * up for us already: @@ -611,11 +514,8 @@ out_unlock: * If we dropped out via the lookup_address check under * pgd_lock then stick the page back into the pool: */ - if (base) { - list_add(&base->lru, &page_pool); - pool_pages++; - } else - pool_used++; + if (base) + __free_page(base); spin_unlock_irqrestore(&pgd_lock, flags); return 0; @@ -899,8 +799,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(NULL); - return ret; } @@ -1178,53 +1076,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * but that can deadlock->flush only current cpu: */ __flush_tlb_all(); - - /* - * Try to refill the page pool here. We can do this only after - * the tlb flush. - */ - cpa_fill_pool(NULL); -} - -#ifdef CONFIG_DEBUG_FS -static int dpa_show(struct seq_file *m, void *v) -{ - seq_puts(m, "DEBUG_PAGEALLOC\n"); - seq_printf(m, "pool_size : %lu\n", pool_size); - seq_printf(m, "pool_pages : %lu\n", pool_pages); - seq_printf(m, "pool_low : %lu\n", pool_low); - seq_printf(m, "pool_used : %lu\n", pool_used); - seq_printf(m, "pool_failed : %lu\n", pool_failed); - - return 0; -} - -static int dpa_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, dpa_show, NULL); } -static const struct file_operations dpa_fops = { - .open = dpa_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static int __init debug_pagealloc_proc_init(void) -{ - struct dentry *de; - - de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, - &dpa_fops); - if (!de) - return -ENOMEM; - - return 0; -} -__initcall(debug_pagealloc_proc_init); -#endif - #ifdef CONFIG_HIBERNATION bool kernel_page_present(struct page *page) -- cgit v1.2.2 From ad5ca55f6bdb47c957b681c7358bb3719ba4ee82 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 23 Sep 2008 14:00:42 -0700 Subject: x86, cpa: srlz cpa(), global flush tlb after splitting big page and before doing cpa Do a global flush tlb after splitting the large page and before we do the actual change page attribute in the PTE. With out this, we violate the TLB application note, which says "The TLBs may contain both ordinary and large-page translations for a 4-KByte range of linear addresses. This may occur if software modifies the paging structures so that the page size used for the address range changes. If the two translations differ with respect to page frame or attributes (e.g., permissions), processor behavior is undefined and may be implementation-specific." And also serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) using cpa_lock. So that we don't allow any other cpu, with stale large tlb entries change the page attribute in parallel to some other cpu splitting a large page entry along with changing the attribute. Signed-off-by: Suresh Siddha Cc: Suresh Siddha Cc: arjan@linux.intel.com Cc: venkatesh.pallipadi@intel.com Cc: jeremy@goop.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f5e8663c0f75..b6374d653d06 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -35,6 +35,14 @@ struct cpa_data { int curpage; }; +/* + * Serialize cpa() (for !DEBUG_PAGEALLOC which uses large identity mappings) + * using cpa_lock. So that we don't allow any other cpu, with stale large tlb + * entries change the page attribute in parallel to some other cpu + * splitting a large page entry along with changing the attribute. + */ +static DEFINE_SPINLOCK(cpa_lock); + #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 @@ -453,7 +461,13 @@ static int split_large_page(pte_t *kpte, unsigned long address) unsigned int i, level; pte_t *pbase, *tmp; pgprot_t ref_prot; - struct page *base = alloc_pages(GFP_KERNEL, 0); + struct page *base; + + if (!debug_pagealloc) + spin_unlock(&cpa_lock); + base = alloc_pages(GFP_KERNEL, 0); + if (!debug_pagealloc) + spin_lock(&cpa_lock); if (!base) return -ENOMEM; @@ -594,7 +608,25 @@ repeat: */ err = split_large_page(kpte, address); if (!err) { - cpa->flags |= CPA_FLUSHTLB; + /* + * Do a global flush tlb after splitting the large page + * and before we do the actual change page attribute in the PTE. + * + * With out this, we violate the TLB application note, that says + * "The TLBs may contain both ordinary and large-page + * translations for a 4-KByte range of linear addresses. This + * may occur if software modifies the paging structures so that + * the page size used for the address range changes. If the two + * translations differ with respect to page frame or attributes + * (e.g., permissions), processor behavior is undefined and may + * be implementation-specific." + * + * We do this global tlb flush inside the cpa_lock, so that we + * don't allow any other cpu, with stale tlb entries change the + * page attribute in parallel, that also falls into the + * just split large page entry. + */ + flush_tlb_all(); goto repeat; } @@ -686,7 +718,11 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (cpa->flags & CPA_ARRAY) cpa->numpages = 1; + if (!debug_pagealloc) + spin_lock(&cpa_lock); ret = __change_page_attr(cpa, checkalias); + if (!debug_pagealloc) + spin_unlock(&cpa_lock); if (ret) return ret; -- cgit v1.2.2 From 9542ada803198e6eba29d3289abb39ea82047b92 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 24 Sep 2008 08:53:33 -0700 Subject: x86: track memtype for RAM in page struct Track the memtype for RAM pages in page struct instead of using the memtype list. This avoids the explosion in the number of entries in memtype list (of the order of 20,000 with AGP) and makes the PAT tracking simpler. We are using PG_arch_1 bit in page->flags. We still use the memtype list for non RAM pages. Signed-off-by: Suresh Siddha Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 19 ++++++++++++ arch/x86/mm/pat.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index d4b6e6a29ae3..d03c461e045e 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -83,6 +83,25 @@ int page_is_ram(unsigned long pagenr) return 0; } +int pagerange_is_ram(unsigned long start, unsigned long end) +{ + int ram_page = 0, not_rampage = 0; + unsigned long page_nr; + + for (page_nr = (start >> PAGE_SHIFT); page_nr < (end >> PAGE_SHIFT); + ++page_nr) { + if (page_is_ram(page_nr)) + ram_page = 1; + else + not_rampage = 1; + + if (ram_page == not_rampage) + return -1; + } + + return ram_page; +} + /* * Fix up the linear direct mapping of the kernel to avoid cache attribute * conflicts. diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index f049b1d6ebdf..aceb6c7c6dba 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -210,6 +210,75 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, static struct memtype *cached_entry; static u64 cached_start; +/* + * RED-PEN: TODO: Add PageReserved() check as well here, + * once we add SetPageReserved() to all the drivers using + * set_memory_* or set_pages_*. + * + * This will help prevent accidentally freeing pages + * before setting the attribute back to WB. + */ + +/* + * For RAM pages, mark the pages as non WB memory type using + * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or + * set_memory_wc() on a RAM page at a time before marking it as WB again. + * This is ok, because only one driver will be owning the page and + * doing set_memory_*() calls. + * + * For now, we use PageNonWB to track that the RAM page is being mapped + * as non WB. In future, we will have to use one more flag + * (or some other mechanism in page_struct) to distinguish between + * UC and WC mapping. + */ +static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, + unsigned long *new_type) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || PageNonWB(page)) + goto out; + + SetPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + ClearPageNonWB(page); + } + + return -EINVAL; +} + +static int free_ram_pages_type(u64 start, u64 end) +{ + struct page *page; + u64 pfn, end_pfn; + + for (pfn = (start >> PAGE_SHIFT); pfn < (end >> PAGE_SHIFT); ++pfn) { + page = pfn_to_page(pfn); + if (page_mapped(page) || !PageNonWB(page)) + goto out; + + ClearPageNonWB(page); + } + return 0; + +out: + end_pfn = pfn; + for (pfn = (start >> PAGE_SHIFT); pfn < end_pfn; ++pfn) { + page = pfn_to_page(pfn); + SetPageNonWB(page); + } + return -EINVAL; +} + /* * req_type typically has one of the: * - _PAGE_CACHE_WB @@ -232,6 +301,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, unsigned long actual_type; struct list_head *where; int err = 0; + int is_range_ram; BUG_ON(start >= end); /* end is exclusive */ @@ -270,6 +340,12 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return reserve_ram_pages_type(start, end, req_type, new_type); + else if (is_range_ram < 0) + return -EINVAL; + new = kmalloc(sizeof(struct memtype), GFP_KERNEL); if (!new) return -ENOMEM; @@ -358,6 +434,7 @@ int free_memtype(u64 start, u64 end) { struct memtype *entry; int err = -EINVAL; + int is_range_ram; if (!pat_enabled) return 0; @@ -366,6 +443,12 @@ int free_memtype(u64 start, u64 end) if (is_ISA_range(start, end - 1)) return 0; + is_range_ram = pagerange_is_ram(start, end); + if (is_range_ram == 1) + return free_ram_pages_type(start, end); + else if (is_range_ram < 0) + return -EINVAL; + spin_lock(&memtype_lock); list_for_each_entry(entry, &memtype_list, nd) { if (entry->start == start && entry->end == end) { -- cgit v1.2.2 From 28dd033f43ca957cd751e02652b36c6fa364ca18 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Mon, 29 Sep 2008 12:13:26 -0700 Subject: x86: fix pagetable init 64-bit breakage Fix _end alignment check - can trigger a crash if _end happens to be on a page boundary. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f54a4d97530f..6116ff0d7416 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -510,7 +510,7 @@ static int is_kernel(unsigned long pfn) unsigned long pg_addresss = pfn << PAGE_SHIFT; if (pg_addresss >= (unsigned long) __pa(_text) && - pg_addresss <= (unsigned long) __pa(_end)) + pg_addresss < (unsigned long) __pa(_end)) return 1; return 0; -- cgit v1.2.2 From ad2cde16a21985cdc4302e4a4b0fc373d666fdf7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 30 Sep 2008 13:20:45 +0200 Subject: x86, pat: cleanups clean up recently added code to be more consistent with other x86 code. Signed-off-by: Ingo Molnar --- arch/x86/mm/pat.c | 67 ++++++++++++++++++++++++++----------------------------- 1 file changed, 32 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index aceb6c7c6dba..738fd0f24958 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -7,24 +7,24 @@ * Loosely based on earlier PAT patchset from Eric Biederman and Andi Kleen. */ -#include +#include +#include +#include #include #include +#include #include -#include -#include -#include -#include -#include +#include #include -#include +#include #include -#include -#include -#include #include +#include #include +#include +#include +#include #include #ifdef CONFIG_X86_PAT @@ -46,6 +46,7 @@ early_param("nopat", nopat); static int debug_enable; + static int __init pat_debug_setup(char *str) { debug_enable = 1; @@ -145,14 +146,14 @@ static char *cattr_name(unsigned long flags) */ struct memtype { - u64 start; - u64 end; - unsigned long type; - struct list_head nd; + u64 start; + u64 end; + unsigned long type; + struct list_head nd; }; static LIST_HEAD(memtype_list); -static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ +static DEFINE_SPINLOCK(memtype_lock); /* protects memtype list */ /* * Does intersection of PAT memory type and MTRR memory type and returns @@ -180,8 +181,8 @@ static unsigned long pat_x_mtrr_type(u64 start, u64 end, unsigned long req_type) return req_type; } -static int chk_conflict(struct memtype *new, struct memtype *entry, - unsigned long *type) +static int +chk_conflict(struct memtype *new, struct memtype *entry, unsigned long *type) { if (new->type != entry->type) { if (type) { @@ -210,15 +211,6 @@ static int chk_conflict(struct memtype *new, struct memtype *entry, static struct memtype *cached_entry; static u64 cached_start; -/* - * RED-PEN: TODO: Add PageReserved() check as well here, - * once we add SetPageReserved() to all the drivers using - * set_memory_* or set_pages_*. - * - * This will help prevent accidentally freeing pages - * before setting the attribute back to WB. - */ - /* * For RAM pages, mark the pages as non WB memory type using * PageNonWB (PG_arch_1). We allow only one set_memory_uc() or @@ -232,7 +224,7 @@ static u64 cached_start; * UC and WC mapping. */ static int reserve_ram_pages_type(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct page *page; u64 pfn, end_pfn; @@ -295,15 +287,15 @@ out: * it will return a negative return value. */ int reserve_memtype(u64 start, u64 end, unsigned long req_type, - unsigned long *new_type) + unsigned long *new_type) { struct memtype *new, *entry; unsigned long actual_type; struct list_head *where; - int err = 0; int is_range_ram; + int err = 0; - BUG_ON(start >= end); /* end is exclusive */ + BUG_ON(start >= end); /* end is exclusive */ if (!pat_enabled) { /* This is identical to page table setting without PAT */ @@ -336,9 +328,10 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, actual_type = _PAGE_CACHE_WB; else actual_type = _PAGE_CACHE_UC_MINUS; - } else + } else { actual_type = pat_x_mtrr_type(start, end, req_type & _PAGE_CACHE_MASK); + } is_range_ram = pagerange_is_ram(start, end); if (is_range_ram == 1) @@ -350,9 +343,9 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, if (!new) return -ENOMEM; - new->start = start; - new->end = end; - new->type = actual_type; + new->start = start; + new->end = end; + new->type = actual_type; if (new_type) *new_type = actual_type; @@ -411,6 +404,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type, start, end, cattr_name(new->type), cattr_name(req_type)); kfree(new); spin_unlock(&memtype_lock); + return err; } @@ -469,6 +463,7 @@ int free_memtype(u64 start, u64 end) } dprintk("free_memtype request 0x%Lx-0x%Lx\n", start, end); + return err; } @@ -575,9 +570,9 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn, void map_devmem(unsigned long pfn, unsigned long size, pgprot_t vma_prot) { + unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); u64 addr = (u64)pfn << PAGE_SHIFT; unsigned long flags; - unsigned long want_flags = (pgprot_val(vma_prot) & _PAGE_CACHE_MASK); reserve_memtype(addr, addr + size, want_flags, &flags); if (flags != want_flags) { @@ -620,6 +615,7 @@ static struct memtype *memtype_get_idx(loff_t pos) } spin_unlock(&memtype_lock); kfree(print_entry); + return NULL; } @@ -650,6 +646,7 @@ static int memtype_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%s @ 0x%Lx-0x%Lx\n", cattr_name(print_entry->type), print_entry->start, print_entry->end); kfree(print_entry); + return 0; } -- cgit v1.2.2 From b27a43c1e90582facad44de67d02bc9e9f900289 Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Tue, 7 Oct 2008 13:58:46 -0700 Subject: x86, cpa: make the kernel physical mapping initialization a two pass sequence, fix Jeremy Fitzhardinge wrote: > I'd noticed that current tip/master hasn't been booting under Xen, and I > just got around to bisecting it down to this change. > > commit 065ae73c5462d42e9761afb76f2b52965ff45bd6 > Author: Suresh Siddha > > x86, cpa: make the kernel physical mapping initialization a two pass sequence > > This patch is causing Xen to fail various pagetable updates because it > ends up remapping pagetables to RW, which Xen explicitly prohibits (as > that would allow guests to make arbitrary changes to pagetables, rather > than have them mediated by the hypervisor). Instead of making init a two pass sequence, to satisfy the Intel's TLB Application note (developer.intel.com/design/processor/applnots/317080.pdf Section 6 page 26), we preserve the original page permissions when fragmenting the large mappings and don't touch the existing memory mapping (which satisfies Xen's requirements). Only open issue is: on a native linux kernel, we will go back to mapping the first 0-1GB kernel identity mapping as executable (because of the static mapping setup in head_64.S). We can fix this in a different patch if needed. Signed-off-by: Suresh Siddha Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 149 +++++++++++++++++++++----------------------------- 1 file changed, 61 insertions(+), 88 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6116ff0d7416..8c7eae490a2c 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -270,10 +270,9 @@ static __ref void unmap_low_page(void *adr) early_iounmap(adr, PAGE_SIZE); } -static int physical_mapping_iter; - static unsigned long __meminit -phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) +phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, + pgprot_t prot) { unsigned pages = 0; unsigned long last_map_addr = end; @@ -291,35 +290,40 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end) break; } + /* + * We will re-use the existing mapping. + * Xen for example has some special requirements, like mapping + * pagetable pages as RO. So assume someone who pre-setup + * these mappings are more intelligent. + */ if (pte_val(*pte)) - goto repeat_set_pte; + continue; if (0) printk(" pte=%p addr=%lx pte=%016lx\n", pte, addr, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL).pte); pages++; -repeat_set_pte: - set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, PAGE_KERNEL)); + set_pte(pte, pfn_pte(addr >> PAGE_SHIFT, prot)); last_map_addr = (addr & PAGE_MASK) + PAGE_SIZE; } - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_4K, pages); + update_page_count(PG_LEVEL_4K, pages); return last_map_addr; } static unsigned long __meminit -phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end) +phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end, + pgprot_t prot) { pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd); - return phys_pte_init(pte, address, end); + return phys_pte_init(pte, address, end, prot); } static unsigned long __meminit phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { unsigned long pages = 0; unsigned long last_map_addr = end; @@ -330,6 +334,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, unsigned long pte_phys; pmd_t *pmd = pmd_page + pmd_index(address); pte_t *pte; + pgprot_t new_prot = prot; if (address >= end) { if (!after_bootmem) { @@ -343,45 +348,58 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); last_map_addr = phys_pte_update(pmd, address, - end); + end, prot); spin_unlock(&init_mm.page_table_lock); continue; } - goto repeat_set_pte; + /* + * If we are ok with PG_LEVEL_2M mapping, then we will + * use the existing mapping, + * + * Otherwise, we will split the large page mapping but + * use the same existing protection bits except for + * large page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_2M)) + continue; + new_prot = pte_pgprot(pte_clrhuge(*(pte_t *)pmd)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); + pfn_pte(address >> PAGE_SHIFT, + __pgprot(pgprot_val(prot) | _PAGE_PSE))); spin_unlock(&init_mm.page_table_lock); last_map_addr = (address & PMD_MASK) + PMD_SIZE; continue; } pte = alloc_low_page(&pte_phys); - last_map_addr = phys_pte_init(pte, address, end); + last_map_addr = phys_pte_init(pte, address, end, new_prot); unmap_low_page(pte); spin_lock(&init_mm.page_table_lock); pmd_populate_kernel(&init_mm, pmd, __va(pte_phys)); spin_unlock(&init_mm.page_table_lock); } - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_2M, pages); + update_page_count(PG_LEVEL_2M, pages); return last_map_addr; } static unsigned long __meminit phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end, - unsigned long page_size_mask) + unsigned long page_size_mask, pgprot_t prot) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long last_map_addr; - last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot); __flush_tlb_all(); return last_map_addr; } @@ -398,6 +416,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, unsigned long pmd_phys; pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; + pgprot_t prot = PAGE_KERNEL; if (addr >= end) break; @@ -411,16 +430,28 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, if (pud_val(*pud)) { if (!pud_large(*pud)) { last_map_addr = phys_pmd_update(pud, addr, end, - page_size_mask); + page_size_mask, prot); continue; } - - goto repeat_set_pte; + /* + * If we are ok with PG_LEVEL_1G mapping, then we will + * use the existing mapping. + * + * Otherwise, we will split the gbpage mapping but use + * the same existing protection bits except for large + * page, so that we don't violate Intel's TLB + * Application note (317080) which says, while changing + * the page sizes, new and old translations should + * not differ with respect to page frame and + * attributes. + */ + if (page_size_mask & (1 << PG_LEVEL_1G)) + continue; + prot = pte_pgprot(pte_clrhuge(*(pte_t *)pud)); } if (page_size_mask & (1<> PAGE_SHIFT, PAGE_KERNEL_LARGE)); @@ -430,7 +461,8 @@ repeat_set_pte: } pmd = alloc_low_page(&pmd_phys); - last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask); + last_map_addr = phys_pmd_init(pmd, addr, end, page_size_mask, + prot); unmap_low_page(pmd); spin_lock(&init_mm.page_table_lock); @@ -439,8 +471,7 @@ repeat_set_pte: } __flush_tlb_all(); - if (physical_mapping_iter == 1) - update_page_count(PG_LEVEL_1G, pages); + update_page_count(PG_LEVEL_1G, pages); return last_map_addr; } @@ -505,54 +536,15 @@ static void __init init_gbpages(void) direct_gbpages = 0; } -static int is_kernel(unsigned long pfn) -{ - unsigned long pg_addresss = pfn << PAGE_SHIFT; - - if (pg_addresss >= (unsigned long) __pa(_text) && - pg_addresss < (unsigned long) __pa(_end)) - return 1; - - return 0; -} - static unsigned long __init kernel_physical_mapping_init(unsigned long start, unsigned long end, unsigned long page_size_mask) { - unsigned long next, last_map_addr; - u64 cached_supported_pte_mask = __supported_pte_mask; - unsigned long cache_start = start; - unsigned long cache_end = end; - - /* - * First iteration will setup identity mapping using large/small pages - * based on page_size_mask, with other attributes same as set by - * the early code in head_64.S - * - * Second iteration will setup the appropriate attributes - * as desired for the kernel identity mapping. - * - * This two pass mechanism conforms to the TLB app note which says: - * - * "Software should not write to a paging-structure entry in a way - * that would change, for any linear address, both the page size - * and either the page frame or attributes." - * - * For now, only difference between very early PTE attributes used in - * head_64.S and here is _PAGE_NX. - */ - BUILD_BUG_ON((__PAGE_KERNEL_LARGE & ~__PAGE_KERNEL_IDENT_LARGE_EXEC) - != _PAGE_NX); - __supported_pte_mask &= ~(_PAGE_NX); - physical_mapping_iter = 1; + unsigned long next, last_map_addr = end; -repeat: - last_map_addr = cache_end; - - start = (unsigned long)__va(cache_start); - end = (unsigned long)__va(cache_end); + start = (unsigned long)__va(start); + end = (unsigned long)__va(end); for (; start < end; start = next) { pgd_t *pgd = pgd_offset_k(start); @@ -564,21 +556,11 @@ repeat: next = end; if (pgd_val(*pgd)) { - /* - * Static identity mappings will be overwritten - * with run-time mappings. For example, this allows - * the static 0-1GB identity mapping to be mapped - * non-executable with this. - */ - if (is_kernel(pte_pfn(*((pte_t *) pgd)))) - goto realloc; - last_map_addr = phys_pud_update(pgd, __pa(start), __pa(end), page_size_mask); continue; } -realloc: pud = alloc_low_page(&pud_phys); last_map_addr = phys_pud_init(pud, __pa(start), __pa(next), page_size_mask); @@ -590,15 +572,6 @@ realloc: } __flush_tlb_all(); - if (physical_mapping_iter == 1) { - physical_mapping_iter = 2; - /* - * Second iteration will set the actual desired PTE attributes. - */ - __supported_pte_mask = cached_supported_pte_mask; - goto repeat; - } - return last_map_addr; } -- cgit v1.2.2 From d0d0f7432c9cbd52cb2f31d499f8292b13a7ecac Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 9 Oct 2008 12:41:50 -0500 Subject: x86: remove magic number from ACPI sleep stack buffer x86_64 SMP suspend to RAM uses a 10k temporary stack for saving the kernel state, but only 4k of it is used. Shrink it to 4k. Signed-off-by: Matt Mackall Acked-by: Pavel Machek Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 426e5d91b63a..29cf3403abef 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -97,7 +97,7 @@ int acpi_save_state_mem(void) #else /* CONFIG_64BIT */ header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP - stack_start.sp = temp_stack + 4096; + stack_start.sp = temp_stack + sizeof(temp_stack); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; -- cgit v1.2.2 From 5000cadcf3188e935dae28c4fc7e24639704ea55 Mon Sep 17 00:00:00 2001 From: Matt Mackall Date: Thu, 9 Oct 2008 11:56:21 -0500 Subject: x86: trim ACPI sleep stack buffer x86_64 SMP suspend to RAM uses a 10k temporary stack for saving the kernel state, but only 4k of it is used. Shrink it to 4k. Signed-off-by: Matt Mackall Signed-off-by: Len Brown --- arch/x86/kernel/acpi/sleep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 29cf3403abef..55d10cbe65b1 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -21,7 +21,7 @@ unsigned long acpi_realmode_flags; static unsigned long acpi_realmode; #if defined(CONFIG_SMP) && defined(CONFIG_64BIT) -static char temp_stack[10240]; +static char temp_stack[4096]; #endif /** -- cgit v1.2.2 From ee297533279a802eac8b1cbea8e65b24b36a1aac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Sep 2008 19:04:31 -0700 Subject: ACPI: don't load acpi_cpufreq if acpi=off Signed-off-by: Yinghai Lu Signed-off-by: Len Brown --- arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index dd097b835839..9943b4c87746 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -779,6 +779,9 @@ static int __init acpi_cpufreq_init(void) { int ret; + if (acpi_disabled) + return 0; + dprintk("acpi_cpufreq_init\n"); ret = acpi_cpufreq_early_init(); -- cgit v1.2.2 From e1e23bb0513520035ec934fa3483507cb6648b7c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 7 Oct 2008 14:15:11 -0700 Subject: x86: avoid dereferencing beyond stack + THREAD_SIZE It's possible for get_wchan() to dereference past task->stack + THREAD_SIZE while iterating through instruction pointers if fp equals the upper boundary, causing a kernel panic. Signed-off-by: David Rientjes Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2a8ccb9238b4..b6b508ea7110 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -754,12 +754,12 @@ unsigned long get_wchan(struct task_struct *p) if (!p || p == current || p->state == TASK_RUNNING) return 0; stack = (unsigned long)task_stack_page(p); - if (p->thread.sp < stack || p->thread.sp > stack+THREAD_SIZE) + if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE) return 0; fp = *(u64 *)(p->thread.sp); do { if (fp < (unsigned long)stack || - fp > (unsigned long)stack+THREAD_SIZE) + fp >= (unsigned long)stack+THREAD_SIZE) return 0; ip = *(u64 *)(fp+8); if (!in_sched_functions(ip)) -- cgit v1.2.2 From c613ec1a7ff3714da11c7c48a13bab03beb5c376 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Fri, 10 Oct 2008 10:46:45 +0100 Subject: x86, early_ioremap: fix fencepost error The x86 implementation of early_ioremap has an off by one error. If we get an object which ends on the first byte of a page we undermap by one page and this causes a crash on boot with the ASUS P5QL whose DMI table happens to fit this alignment. The size computation is currently last_addr = phys_addr + size - 1; npages = (PAGE_ALIGN(last_addr) - phys_addr) (Consider a request for 1 byte at alignment 0...) Closes #11693 Debugging work by Ian Campbell/Felix Geyer Signed-off-by: Alan Cox Cc: Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6ab3196d12b4..10b52309aefd 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -614,7 +614,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) */ offset = phys_addr & ~PAGE_MASK; phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr) - phys_addr; + size = PAGE_ALIGN(last_addr + 1) - phys_addr; /* * Mappings have to fit in the FIX_BTMAP area. -- cgit v1.2.2 From cb58ffc3889f0545628f138f849e759a331b8ddc Mon Sep 17 00:00:00 2001 From: Petr Vandrovec Date: Sun, 12 Oct 2008 10:51:03 +0200 Subject: x86: fix early panic on amd64 due to typo in supported CPU section Do not crash when enumerating supported CPU architectures SECURITY_INIT somehow ended up in x86_cpu_dev.init section. That caused printk in code which prints supported architectures to hit #GP due to non-canonical address being used. Signed-off-by: Petr Vandrovec Cc: thomas.petazzoni@free-electrons.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 201e81a91a95..46e05447405b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -172,8 +172,8 @@ SECTIONS .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { *(.x86_cpu_dev.init) } - SECURITY_INIT __x86_cpu_dev_end = .; + SECURITY_INIT . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { -- cgit v1.2.2 From 325af5fb1418c79953db0954556de048e061d8b6 Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Fri, 8 Aug 2008 15:58:39 -0700 Subject: x86: ioperm user_regset This adds a user_regset type for the x86 io permissions bitmap. This makes it appear in core dumps (when ioperm has been used). It will also make it visible to debuggers in the future. Signed-off-by: Roland McGrath Signed-off-by: H. Peter Anvin [conflict resolutions: Signed-off-by: Ingo Molnar ] --- arch/x86/kernel/ptrace.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e375b658efc3..4e1ef66c2ea4 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -40,7 +40,9 @@ enum x86_regset { REGSET_GENERAL, REGSET_FP, REGSET_XFP, + REGSET_IOPERM64 = REGSET_XFP, REGSET_TLS, + REGSET_IOPERM32, }; /* @@ -555,6 +557,29 @@ static int ptrace_set_debugreg(struct task_struct *child, return 0; } +/* + * These access the current or another (stopped) task's io permission + * bitmap for debugging or core dump. + */ +static int ioperm_active(struct task_struct *target, + const struct user_regset *regset) +{ + return target->thread.io_bitmap_max / regset->size; +} + +static int ioperm_get(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + void *kbuf, void __user *ubuf) +{ + if (!target->thread.io_bitmap_ptr) + return -ENXIO; + + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, + target->thread.io_bitmap_ptr, + 0, IO_BITMAP_BYTES); +} + #ifdef CONFIG_X86_PTRACE_BTS /* * The configuration for a particular BTS hardware implementation. @@ -1385,6 +1410,12 @@ static const struct user_regset x86_64_regsets[] = { .size = sizeof(long), .align = sizeof(long), .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set }, + [REGSET_IOPERM64] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_LONGS, + .size = sizeof(long), .align = sizeof(long), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_64_view = { @@ -1431,6 +1462,12 @@ static const struct user_regset x86_32_regsets[] = { .active = regset_tls_active, .get = regset_tls_get, .set = regset_tls_set }, + [REGSET_IOPERM32] = { + .core_note_type = NT_386_IOPERM, + .n = IO_BITMAP_BYTES / sizeof(u32), + .size = sizeof(u32), .align = sizeof(u32), + .active = ioperm_active, .get = ioperm_get + }, }; static const struct user_regset_view user_x86_32_view = { -- cgit v1.2.2 From 46eaa6702016e3ac9a188172a2c309d6ca1be1cd Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:06:29 +0200 Subject: x86: memory corruption check - cleanup Move the prototypes from the generic kernel.h header to the more appropriate include/asm-x86/bios_ebda.h header file. Also, remove the check from the power management code - this is a pure x86 matter for now. Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 1 + arch/x86/mm/init_64.c | 1 + 2 files changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7e05462ffb11..bbe044dbe014 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index d84d3e91d348..3e10054c5731 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include -- cgit v1.2.2 From 9f482807a6bd7e2aa1ed0d8cfc48463ec4ca3568 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Aug 2008 12:59:32 +0200 Subject: x86, fpu: check __clear_user() return value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/xsave.c: In function ‘save_i387_xstate’: arch/x86/kernel/xsave.c:98: warning: ignoring return value of ‘__clear_user’, declared with attribute warn_unused_result check the return value and act on it. We should not be ignoring faults at this point. Signed-off-by: Ingo Molnar --- arch/x86/kernel/xsave.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 2f98323716d9..9abac8a9d823 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -95,7 +95,9 @@ int save_i387_xstate(void __user *buf) * Start with clearing the user buffer. This will present a * clean context for the bytes not touched by the fxsave/xsave. */ - __clear_user(buf, sig_xstate_size); + err = __clear_user(buf, sig_xstate_size); + if (err) + return err; if (task_thread_info(tsk)->status & TS_XSAVE) err = xsave_user(buf); -- cgit v1.2.2 From 45e96f26f257bd873017c6244a6cafd27f6f5439 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Aug 2008 10:37:14 +0200 Subject: warnings: fix arch/x86/kernel/early_printk.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/early_printk.c:993: warning: ‘enable_debug_console’ defined but not used Eliminate dead code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early_printk.c | 18 ------------------ 1 file changed, 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 02fc5b40c14b..34ad997d3834 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -989,22 +989,4 @@ static int __init setup_early_printk(char *buf) return 0; } -static void __init enable_debug_console(char *buf) -{ -#ifdef DBGP_DEBUG - struct console *old_early_console = NULL; - - if (early_console_initialized && early_console) { - old_early_console = early_console; - unregister_console(early_console); - early_console_initialized = 0; - } - - setup_early_printk(buf); - - if (early_console == old_early_console && old_early_console) - register_console(old_early_console); -#endif -} - early_param("earlyprintk", setup_early_printk); -- cgit v1.2.2 From d562353a4533c4671af683499191707c9a77c406 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:22:22 +0200 Subject: warnings: fix arch/x86/kernel/io_apic_64.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: arch/x86/kernel/io_apic_64.c: In function ‘print_local_APIC’: arch/x86/kernel/io_apic_64.c:1284: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘long unsigned int’ arch/x86/kernel/io_apic_64.c:1285: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘long unsigned int’ We want to print the two halves of 'icr' at 32 bit width. Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index a1bec2969c6a..02063ae042f7 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1281,8 +1281,8 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC ESR: %08x\n", v); icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -- cgit v1.2.2 From 8a66712ba0969aea5580b9e312badfc2490fc614 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:24:53 +0200 Subject: x86, amd-iommu: propagate PCI device enabling error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit propagate an error in enabling the PCI device. Also eliminates this warning: arch/x86/kernel/amd_iommu_init.c: In function ‘init_iommu_one’: arch/x86/kernel/amd_iommu_init.c:726: warning: ignoring return value of ‘pci_enable_device’, declared with attribute warn_unused_result Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_iommu_init.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 148fcfe22f17..4cd8083c58be 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -723,9 +723,7 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) init_iommu_from_acpi(iommu, h); init_iommu_devices(iommu); - pci_enable_device(iommu->dev); - - return 0; + return pci_enable_device(iommu->dev); } /* -- cgit v1.2.2 From b7b3a42533e1e41297fe517533375f9f8f7b92d0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:36:24 +0200 Subject: x86: extend processor type select help text extend the help text of the CONFIG_CPU_SUP_* config options to express what it does and what effects it has. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 63 ++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 56 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f8843c3ae77d..a2ea1488b37e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -430,48 +430,97 @@ config CPU_SUP_INTEL default y bool "Support Intel processors" if PROCESSOR_SELECT help - This enables extended support for Intel processors + This enables detection, tunings and quirks for Intel processors + + You need this enabled if you want your kernel to run on an + Intel CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on an Intel + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT depends on !64BIT help - This enables extended support for Cyrix processors + This enables detection, tunings and quirks for Cyrix processors + + You need this enabled if you want your kernel to run on a + Cyrix CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Cyrix + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_AMD default y bool "Support AMD processors" if PROCESSOR_SELECT help - This enables extended support for AMD processors + This enables detection, tunings and quirks for AMD processors + + You need this enabled if you want your kernel to run on an + AMD CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on an AMD + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_CENTAUR_32 default y bool "Support Centaur processors" if PROCESSOR_SELECT depends on !64BIT help - This enables extended support for Centaur processors + This enables detection, tunings and quirks for Centaur processors + + You need this enabled if you want your kernel to run on a + Centaur CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Centaur + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_CENTAUR_64 default y bool "Support Centaur processors" if PROCESSOR_SELECT depends on 64BIT help - This enables extended support for Centaur processors + This enables detection, tunings and quirks for Centaur processors + + You need this enabled if you want your kernel to run on a + Centaur CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Centaur + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_TRANSMETA_32 default y bool "Support Transmeta processors" if PROCESSOR_SELECT depends on !64BIT help - This enables extended support for Transmeta processors + This enables detection, tunings and quirks for Transmeta processors + + You need this enabled if you want your kernel to run on a + Transmeta CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a Transmeta + CPU might render the kernel unbootable. + + If unsure, say N. config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT depends on !64BIT help - This enables extended support for UMC processors + This enables detection, tunings and quirks for UMC processors + + You need this enabled if you want your kernel to run on a + UMC CPU. Disabling this option on other types of CPUs + makes the kernel a tiny bit smaller. Disabling it on a UMC + CPU might render the kernel unbootable. + + If unsure, say N. config X86_DS bool "Debug Store support" -- cgit v1.2.2 From 1db5fff9aeab18566eb380e354629fdbbe7792f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 12 Oct 2008 15:40:45 +0200 Subject: x86: make processor type select depend on CONFIG_EMBEDDED deselecting one of the CPU type CONFIG_CPU_SUP_* config options can render a kernel unbootable. Make sure this option is only available if CONFIG_EMBEDDED is enabled. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index a2ea1488b37e..c5f101360520 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -420,7 +420,6 @@ config X86_DEBUGCTLMSR depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) menuconfig PROCESSOR_SELECT - default y bool "Supported processor vendors" if EMBEDDED help This lets you choose what x86 vendor support code your kernel -- cgit v1.2.2 From 295286a89107c353b9677bc604361c537fd6a1c0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 13:21:11 +0100 Subject: x86-64: slightly stream-line 32-bit syscall entry code Avoid updating registers or memory twice as well as needlessly loading or copying registers. Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ffc1bb4fed7d..eb4314768bf7 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -39,11 +39,11 @@ .endm /* clobbers %eax */ - .macro CLEAR_RREGS + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax movq %rax,R11(%rsp) movq %rax,R10(%rsp) - movq %rax,R9(%rsp) + movq %\_r9,R9(%rsp) movq %rax,R8(%rsp) .endm @@ -52,11 +52,10 @@ * We don't reload %eax because syscall_trace_enter() returned * the value it wants us to use in the table lookup. */ - .macro LOAD_ARGS32 offset - movl \offset(%rsp),%r11d - movl \offset+8(%rsp),%r10d + .macro LOAD_ARGS32 offset, _r9=0 + .if \_r9 movl \offset+16(%rsp),%r9d - movl \offset+24(%rsp),%r8d + .endif movl \offset+40(%rsp),%ecx movl \offset+48(%rsp),%edx movl \offset+56(%rsp),%esi @@ -145,7 +144,7 @@ ENTRY(ia32_sysenter_target) SAVE_ARGS 0,0,1 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ -1: movl (%rbp),%r9d +1: movl (%rbp),%ebp .section __ex_table,"a" .quad 1b,ia32_badarg .previous @@ -157,7 +156,7 @@ ENTRY(ia32_sysenter_target) cmpl $(IA32_NR_syscalls-1),%eax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP 1 + IA32_ARG_FIXUP sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) @@ -234,20 +233,17 @@ sysexit_audit: #endif sysenter_tracesys: - xchgl %r9d,%ebp #ifdef CONFIG_AUDITSYSCALL testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) jz sysenter_auditsys #endif SAVE_REST CLEAR_RREGS - movq %r9,R9(%rsp) movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ RESTORE_REST - xchgl %ebp,%r9d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -314,9 +310,9 @@ ENTRY(ia32_cstar_target) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) CFI_REMEMBER_STATE jnz cstar_tracesys -cstar_do_call: cmpl $IA32_NR_syscalls-1,%eax ja ia32_badsys +cstar_do_call: IA32_ARG_FIXUP 1 cstar_dispatch: call *ia32_sys_call_table(,%rax,8) @@ -357,15 +353,13 @@ cstar_tracesys: #endif xchgl %r9d,%ebp SAVE_REST - CLEAR_RREGS - movq %r9,R9(%rsp) + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ RESTORE_REST xchgl %ebp,%r9d - movl RSP-ARGOFFSET(%rsp), %r8d cmpl $(IA32_NR_syscalls-1),%eax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ jmp cstar_do_call -- cgit v1.2.2 From 0cefa5b9b0a61b62442c5d0ca00a304c5896b6e9 Mon Sep 17 00:00:00 2001 From: Manfred Spraul Date: Sun, 7 Sep 2008 11:29:58 +0200 Subject: arch/x86/kernel/smpboot.c: Clarify when irq processing begins. Secondary cpus start with local interrupts disabled. start_secondary() first initializes the new cpu, then it enables the local interrupts. (although interrupts are enabled within smp_callin() as well). Right now, the local interrupts are enabled as a side effect of calling ipi_call_lock_irq(). The attached patch clarifies when local interrupts are enabled. Signed-off-by: Manfred Spraul Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 76b6f50978f7..b700c9a10644 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -334,14 +334,17 @@ static void __cpuinit start_secondary(void *unused) * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. */ - ipi_call_lock_irq(); + ipi_call_lock(); lock_vector_lock(); __setup_vector_irq(smp_processor_id()); cpu_set(smp_processor_id(), cpu_online_map); unlock_vector_lock(); - ipi_call_unlock_irq(); + ipi_call_unlock(); per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + /* enable local interrupts */ + local_irq_enable(); + setup_secondary_clock(); wmb(); -- cgit v1.2.2 From 927604c7592473742891dc13e1da09febc06e01b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 9 Sep 2008 23:34:17 -0700 Subject: x86: rename discontig_32.c to numa_32.c name it in line with its purpose. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/Makefile | 6 +- arch/x86/mm/discontig_32.c | 444 --------------------------------------------- arch/x86/mm/numa_32.c | 444 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 445 insertions(+), 449 deletions(-) delete mode 100644 arch/x86/mm/discontig_32.c create mode 100644 arch/x86/mm/numa_32.c (limited to 'arch/x86') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index dfb932dcf136..59f89b434b45 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -13,12 +13,8 @@ obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o -ifeq ($(CONFIG_X86_32),y) -obj-$(CONFIG_NUMA) += discontig_32.o -else -obj-$(CONFIG_NUMA) += numa_64.o +obj-$(CONFIG_NUMA) += numa_$(BITS).o obj-$(CONFIG_K8_NUMA) += k8topology_64.o -endif obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/discontig_32.c b/arch/x86/mm/discontig_32.c deleted file mode 100644 index 847c164725f4..000000000000 --- a/arch/x86/mm/discontig_32.c +++ /dev/null @@ -1,444 +0,0 @@ -/* - * Written by: Patricia Gaughen , IBM Corporation - * August 2002: added remote node KVA remap - Martin J. Bligh - * - * Copyright (C) 2002, IBM Corp. - * - * All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for more - * details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; -EXPORT_SYMBOL(node_data); - -/* - * numa interface - we expect the numa architecture specific code to have - * populated the following initialisation. - * - * 1) node_online_map - the map of all nodes configured (online) in the system - * 2) node_start_pfn - the starting page frame number for a node - * 3) node_end_pfn - the ending page fram number for a node - */ -unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; -unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; - - -#ifdef CONFIG_DISCONTIGMEM -/* - * 4) physnode_map - the mapping between a pfn and owning node - * physnode_map keeps track of the physical memory layout of a generic - * numa node on a 64Mb break (each element of the array will - * represent 64Mb of memory and will be marked by the node id. so, - * if the first gig is on node 0, and the second gig is on node 1 - * physnode_map will contain: - * - * physnode_map[0-15] = 0; - * physnode_map[16-31] = 1; - * physnode_map[32- ] = -1; - */ -s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; -EXPORT_SYMBOL(physnode_map); - -void memory_present(int nid, unsigned long start, unsigned long end) -{ - unsigned long pfn; - - printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", - nid, start, end); - printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); - printk(KERN_DEBUG " "); - for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { - physnode_map[pfn / PAGES_PER_ELEMENT] = nid; - printk(KERN_CONT "%lx ", pfn); - } - printk(KERN_CONT "\n"); -} - -unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, - unsigned long end_pfn) -{ - unsigned long nr_pages = end_pfn - start_pfn; - - if (!nr_pages) - return 0; - - return (nr_pages + 1) * sizeof(struct page); -} -#endif - -extern unsigned long find_max_low_pfn(void); -extern unsigned long highend_pfn, highstart_pfn; - -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -unsigned long node_remap_size[MAX_NUMNODES]; -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -static unsigned long kva_start_pfn; -static unsigned long kva_pages; -/* - * FLAT - support for basic PC memory model with discontig enabled, essentially - * a single node with all available processors in it with a flat - * memory map. - */ -int __init get_memcfg_numa_flat(void) -{ - printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); - - node_start_pfn[0] = 0; - node_end_pfn[0] = max_pfn; - e820_register_active_regions(0, 0, max_pfn); - memory_present(0, 0, max_pfn); - node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); - - /* Indicate there is one node available. */ - nodes_clear(node_online_map); - node_set_online(0); - return 1; -} - -/* - * Find the highest page frame number we have available for the node - */ -static void __init propagate_e820_map_node(int nid) -{ - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - /* - * if a user has given mem=XXXX, then we need to make sure - * that the node _starts_ before that, too, not just ends - */ - if (node_start_pfn[nid] > max_pfn) - node_start_pfn[nid] = max_pfn; - BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); -} - -/* - * Allocate memory for the pg_data_t for this node via a crude pre-bootmem - * method. For node zero take this from the bottom of memory, for - * subsequent nodes place them at node_remap_start_vaddr which contains - * node local data in physically node local memory. See setup_memory() - * for details. - */ -static void __init allocate_pgdat(int nid) -{ - char buf[16]; - - if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) - NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; - else { - unsigned long pgdat_phys; - pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); - memset(buf, 0, sizeof(buf)); - sprintf(buf, "NODE_DATA %d", nid); - reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); - } - printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", - nid, (unsigned long)NODE_DATA(nid)); -} - -/* - * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel - * virtual address space (KVA) is reserved and portions of nodes are mapped - * using it. This is to allow node-local memory to be allocated for - * structures that would normally require ZONE_NORMAL. The memory is - * allocated with alloc_remap() and callers should be prepared to allocate - * from the bootmem allocator instead. - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; -static unsigned long node_remap_offset[MAX_NUMNODES]; - -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) - return 0; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -static void __init remap_numa_kva(void) -{ - void *vaddr; - unsigned long pfn; - int node; - - for_each_online_node(node) { - printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); - for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { - vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) - continue; - if (!node_end_pfn[nid]) - continue; - if (node_end_pfn[nid] > max_pfn) - node_end_pfn[nid] = max_pfn; - - /* ensure the remap includes space for the pgdat. */ - size = node_remap_size[nid] + sizeof(pg_data_t); - - /* convert size to large (pmd size) pages, rounding up */ - size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; - /* now the roundup is correct, convert to PAGE_SIZE pages */ - size = size * PTRS_PER_PTE; - - node_kva_target = round_down(node_end_pfn[nid] - size, - PTRS_PER_PTE); - node_kva_target <<= PAGE_SHIFT; - do { - node_kva_final = find_e820_area(node_kva_target, - ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); - - if (node_kva_final == -1ULL) - panic("Can not get kva ram\n"); - - node_remap_size[nid] = size; - node_remap_offset[nid] = reserve_pages; - reserve_pages += size; - printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" - " node %d at %llx\n", - size, nid, node_kva_final>>PAGE_SHIFT); - - /* - * prevent kva address below max_low_pfn want it on system - * with less memory later. - * layout will be: KVA address , KVA RAM - * - * we are supposed to only record the one less then max_low_pfn - * but we could have some hole in high memory, and it will only - * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide - * to use it as free. - * So reserve_early here, hope we don't run out of that array - */ - reserve_early(node_kva_final, - node_kva_final+(((u64)size)<>PAGE_SHIFT; - remove_active_range(nid, node_remap_start_pfn[nid], - node_remap_start_pfn[nid] + size); - } - printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", - reserve_pages); - return reserve_pages; -} - -static void init_remap_allocator(int nid) -{ - node_remap_start_vaddr[nid] = pfn_to_kaddr( - kva_start_pfn + node_remap_offset[nid]); - node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + - (node_remap_size[nid] * PAGE_SIZE); - node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + - ALIGN(sizeof(pg_data_t), PAGE_SIZE); - - printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, - (ulong) node_remap_start_vaddr[nid], - (ulong) node_remap_end_vaddr[nid]); -} - -void __init initmem_init(unsigned long start_pfn, - unsigned long end_pfn) -{ - int nid; - long kva_target_pfn; - - /* - * When mapping a NUMA machine we allocate the node_mem_map arrays - * from node local memory. They are then mapped directly into KVA - * between zone normal and vmalloc space. Calculate the size of - * this space and use it to adjust the boundary between ZONE_NORMAL - * and ZONE_HIGHMEM. - */ - - get_memcfg_numa(); - - kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); - - kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); - do { - kva_start_pfn = find_e820_area(kva_target_pfn<> PAGE_SHIFT; - kva_target_pfn -= PTRS_PER_PTE; - } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); - - if (kva_start_pfn == -1UL) - panic("Can not get kva space\n"); - - printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", - kva_start_pfn, max_low_pfn); - printk(KERN_INFO "max_pfn = %lx\n", max_pfn); - - /* avoid clash with initrd */ - reserve_early(kva_start_pfn< max_low_pfn) - highstart_pfn = max_low_pfn; - printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", - pages_to_mb(highend_pfn - highstart_pfn)); - num_physpages = highend_pfn; - high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; -#else - num_physpages = max_low_pfn; - high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; -#endif - printk(KERN_NOTICE "%ldMB LOWMEM available.\n", - pages_to_mb(max_low_pfn)); - printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", - max_low_pfn, highstart_pfn); - - printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", - (ulong) pfn_to_kaddr(max_low_pfn)); - for_each_online_node(nid) { - init_remap_allocator(nid); - - allocate_pgdat(nid); - } - remap_numa_kva(); - - printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", - (ulong) pfn_to_kaddr(highstart_pfn)); - for_each_online_node(nid) - propagate_e820_map_node(nid); - - for_each_online_node(nid) - memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); - - NODE_DATA(0)->bdata = &bootmem_node_data[0]; - setup_bootmem_allocator(); -} - -void __init set_highmem_pages_init(void) -{ -#ifdef CONFIG_HIGHMEM - struct zone *zone; - int nid; - - for_each_zone(zone) { - unsigned long zone_start_pfn, zone_end_pfn; - - if (!is_highmem(zone)) - continue; - - zone_start_pfn = zone->zone_start_pfn; - zone_end_pfn = zone_start_pfn + zone->spanned_pages; - - nid = zone_to_nid(zone); - printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", - zone->name, nid, zone_start_pfn, zone_end_pfn); - - add_highpages_with_active_regions(nid, zone_start_pfn, - zone_end_pfn); - } - totalram_pages += totalhigh_pages; -#endif -} - -#ifdef CONFIG_MEMORY_HOTPLUG -static int paddr_to_nid(u64 addr) -{ - int nid; - unsigned long pfn = PFN_DOWN(addr); - - for_each_node(nid) - if (node_start_pfn[nid] <= pfn && - pfn < node_end_pfn[nid]) - return nid; - - return -1; -} - -/* - * This function is used to ask node id BEFORE memmap and mem_section's - * initialization (pfn_to_nid() can't be used yet). - * If _PXM is not defined on ACPI's DSDT, node id must be found by this. - */ -int memory_add_physaddr_to_nid(u64 addr) -{ - int nid = paddr_to_nid(addr); - return (nid >= 0) ? nid : 0; -} - -EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); -#endif - diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c new file mode 100644 index 000000000000..847c164725f4 --- /dev/null +++ b/arch/x86/mm/numa_32.c @@ -0,0 +1,444 @@ +/* + * Written by: Patricia Gaughen , IBM Corporation + * August 2002: added remote node KVA remap - Martin J. Bligh + * + * Copyright (C) 2002, IBM Corp. + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; +EXPORT_SYMBOL(node_data); + +/* + * numa interface - we expect the numa architecture specific code to have + * populated the following initialisation. + * + * 1) node_online_map - the map of all nodes configured (online) in the system + * 2) node_start_pfn - the starting page frame number for a node + * 3) node_end_pfn - the ending page fram number for a node + */ +unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly; +unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly; + + +#ifdef CONFIG_DISCONTIGMEM +/* + * 4) physnode_map - the mapping between a pfn and owning node + * physnode_map keeps track of the physical memory layout of a generic + * numa node on a 64Mb break (each element of the array will + * represent 64Mb of memory and will be marked by the node id. so, + * if the first gig is on node 0, and the second gig is on node 1 + * physnode_map will contain: + * + * physnode_map[0-15] = 0; + * physnode_map[16-31] = 1; + * physnode_map[32- ] = -1; + */ +s8 physnode_map[MAX_ELEMENTS] __read_mostly = { [0 ... (MAX_ELEMENTS - 1)] = -1}; +EXPORT_SYMBOL(physnode_map); + +void memory_present(int nid, unsigned long start, unsigned long end) +{ + unsigned long pfn; + + printk(KERN_INFO "Node: %d, start_pfn: %lx, end_pfn: %lx\n", + nid, start, end); + printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); + printk(KERN_DEBUG " "); + for (pfn = start; pfn < end; pfn += PAGES_PER_ELEMENT) { + physnode_map[pfn / PAGES_PER_ELEMENT] = nid; + printk(KERN_CONT "%lx ", pfn); + } + printk(KERN_CONT "\n"); +} + +unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long nr_pages = end_pfn - start_pfn; + + if (!nr_pages) + return 0; + + return (nr_pages + 1) * sizeof(struct page); +} +#endif + +extern unsigned long find_max_low_pfn(void); +extern unsigned long highend_pfn, highstart_pfn; + +#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) + +unsigned long node_remap_size[MAX_NUMNODES]; +static void *node_remap_start_vaddr[MAX_NUMNODES]; +void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); + +static unsigned long kva_start_pfn; +static unsigned long kva_pages; +/* + * FLAT - support for basic PC memory model with discontig enabled, essentially + * a single node with all available processors in it with a flat + * memory map. + */ +int __init get_memcfg_numa_flat(void) +{ + printk(KERN_DEBUG "NUMA - single node, flat memory mode\n"); + + node_start_pfn[0] = 0; + node_end_pfn[0] = max_pfn; + e820_register_active_regions(0, 0, max_pfn); + memory_present(0, 0, max_pfn); + node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn); + + /* Indicate there is one node available. */ + nodes_clear(node_online_map); + node_set_online(0); + return 1; +} + +/* + * Find the highest page frame number we have available for the node + */ +static void __init propagate_e820_map_node(int nid) +{ + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + /* + * if a user has given mem=XXXX, then we need to make sure + * that the node _starts_ before that, too, not just ends + */ + if (node_start_pfn[nid] > max_pfn) + node_start_pfn[nid] = max_pfn; + BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]); +} + +/* + * Allocate memory for the pg_data_t for this node via a crude pre-bootmem + * method. For node zero take this from the bottom of memory, for + * subsequent nodes place them at node_remap_start_vaddr which contains + * node local data in physically node local memory. See setup_memory() + * for details. + */ +static void __init allocate_pgdat(int nid) +{ + char buf[16]; + + if (node_has_online_mem(nid) && node_remap_start_vaddr[nid]) + NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid]; + else { + unsigned long pgdat_phys; + pgdat_phys = find_e820_area(min_low_pfn<>PAGE_SHIFT)); + memset(buf, 0, sizeof(buf)); + sprintf(buf, "NODE_DATA %d", nid); + reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf); + } + printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n", + nid, (unsigned long)NODE_DATA(nid)); +} + +/* + * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel + * virtual address space (KVA) is reserved and portions of nodes are mapped + * using it. This is to allow node-local memory to be allocated for + * structures that would normally require ZONE_NORMAL. The memory is + * allocated with alloc_remap() and callers should be prepared to allocate + * from the bootmem allocator instead. + */ +static unsigned long node_remap_start_pfn[MAX_NUMNODES]; +static void *node_remap_end_vaddr[MAX_NUMNODES]; +static void *node_remap_alloc_vaddr[MAX_NUMNODES]; +static unsigned long node_remap_offset[MAX_NUMNODES]; + +void *alloc_remap(int nid, unsigned long size) +{ + void *allocation = node_remap_alloc_vaddr[nid]; + + size = ALIGN(size, L1_CACHE_BYTES); + + if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) + return 0; + + node_remap_alloc_vaddr[nid] += size; + memset(allocation, 0, size); + + return allocation; +} + +static void __init remap_numa_kva(void) +{ + void *vaddr; + unsigned long pfn; + int node; + + for_each_online_node(node) { + printk(KERN_DEBUG "remap_numa_kva: node %d\n", node); + for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) { + vaddr = node_remap_start_vaddr[node]+(pfn< max_pfn) + continue; + if (!node_end_pfn[nid]) + continue; + if (node_end_pfn[nid] > max_pfn) + node_end_pfn[nid] = max_pfn; + + /* ensure the remap includes space for the pgdat. */ + size = node_remap_size[nid] + sizeof(pg_data_t); + + /* convert size to large (pmd size) pages, rounding up */ + size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES; + /* now the roundup is correct, convert to PAGE_SIZE pages */ + size = size * PTRS_PER_PTE; + + node_kva_target = round_down(node_end_pfn[nid] - size, + PTRS_PER_PTE); + node_kva_target <<= PAGE_SHIFT; + do { + node_kva_final = find_e820_area(node_kva_target, + ((u64)node_end_pfn[nid])<>PAGE_SHIFT) > (node_start_pfn[nid])); + + if (node_kva_final == -1ULL) + panic("Can not get kva ram\n"); + + node_remap_size[nid] = size; + node_remap_offset[nid] = reserve_pages; + reserve_pages += size; + printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of" + " node %d at %llx\n", + size, nid, node_kva_final>>PAGE_SHIFT); + + /* + * prevent kva address below max_low_pfn want it on system + * with less memory later. + * layout will be: KVA address , KVA RAM + * + * we are supposed to only record the one less then max_low_pfn + * but we could have some hole in high memory, and it will only + * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide + * to use it as free. + * So reserve_early here, hope we don't run out of that array + */ + reserve_early(node_kva_final, + node_kva_final+(((u64)size)<>PAGE_SHIFT; + remove_active_range(nid, node_remap_start_pfn[nid], + node_remap_start_pfn[nid] + size); + } + printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n", + reserve_pages); + return reserve_pages; +} + +static void init_remap_allocator(int nid) +{ + node_remap_start_vaddr[nid] = pfn_to_kaddr( + kva_start_pfn + node_remap_offset[nid]); + node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + + (node_remap_size[nid] * PAGE_SIZE); + node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + + ALIGN(sizeof(pg_data_t), PAGE_SIZE); + + printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, + (ulong) node_remap_start_vaddr[nid], + (ulong) node_remap_end_vaddr[nid]); +} + +void __init initmem_init(unsigned long start_pfn, + unsigned long end_pfn) +{ + int nid; + long kva_target_pfn; + + /* + * When mapping a NUMA machine we allocate the node_mem_map arrays + * from node local memory. They are then mapped directly into KVA + * between zone normal and vmalloc space. Calculate the size of + * this space and use it to adjust the boundary between ZONE_NORMAL + * and ZONE_HIGHMEM. + */ + + get_memcfg_numa(); + + kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); + + kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE); + do { + kva_start_pfn = find_e820_area(kva_target_pfn<> PAGE_SHIFT; + kva_target_pfn -= PTRS_PER_PTE; + } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn); + + if (kva_start_pfn == -1UL) + panic("Can not get kva space\n"); + + printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n", + kva_start_pfn, max_low_pfn); + printk(KERN_INFO "max_pfn = %lx\n", max_pfn); + + /* avoid clash with initrd */ + reserve_early(kva_start_pfn< max_low_pfn) + highstart_pfn = max_low_pfn; + printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", + pages_to_mb(highend_pfn - highstart_pfn)); + num_physpages = highend_pfn; + high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; +#else + num_physpages = max_low_pfn; + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; +#endif + printk(KERN_NOTICE "%ldMB LOWMEM available.\n", + pages_to_mb(max_low_pfn)); + printk(KERN_DEBUG "max_low_pfn = %lx, highstart_pfn = %lx\n", + max_low_pfn, highstart_pfn); + + printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", + (ulong) pfn_to_kaddr(max_low_pfn)); + for_each_online_node(nid) { + init_remap_allocator(nid); + + allocate_pgdat(nid); + } + remap_numa_kva(); + + printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", + (ulong) pfn_to_kaddr(highstart_pfn)); + for_each_online_node(nid) + propagate_e820_map_node(nid); + + for_each_online_node(nid) + memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); + + NODE_DATA(0)->bdata = &bootmem_node_data[0]; + setup_bootmem_allocator(); +} + +void __init set_highmem_pages_init(void) +{ +#ifdef CONFIG_HIGHMEM + struct zone *zone; + int nid; + + for_each_zone(zone) { + unsigned long zone_start_pfn, zone_end_pfn; + + if (!is_highmem(zone)) + continue; + + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone_start_pfn + zone->spanned_pages; + + nid = zone_to_nid(zone); + printk(KERN_INFO "Initializing %s for node %d (%08lx:%08lx)\n", + zone->name, nid, zone_start_pfn, zone_end_pfn); + + add_highpages_with_active_regions(nid, zone_start_pfn, + zone_end_pfn); + } + totalram_pages += totalhigh_pages; +#endif +} + +#ifdef CONFIG_MEMORY_HOTPLUG +static int paddr_to_nid(u64 addr) +{ + int nid; + unsigned long pfn = PFN_DOWN(addr); + + for_each_node(nid) + if (node_start_pfn[nid] <= pfn && + pfn < node_end_pfn[nid]) + return nid; + + return -1; +} + +/* + * This function is used to ask node id BEFORE memmap and mem_section's + * initialization (pfn_to_nid() can't be used yet). + * If _PXM is not defined on ACPI's DSDT, node id must be found by this. + */ +int memory_add_physaddr_to_nid(u64 addr) +{ + int nid = paddr_to_nid(addr); + return (nid >= 0) ? nid : 0; +} + +EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); +#endif + -- cgit v1.2.2 From 762db4347060c5d23e9675846e02f7d95d7f944f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:55 +0200 Subject: i386: remove kprobes' restore_interrupts in favour of conditional_sti x86_64 uses a helper function conditional_sti in traps_64.c which is equal to restore_interrupts in kprobes.h. The only user of restore_interrupts is in traps_32.c. Introduce conditional_sti for i386 and remove restore_interrupts. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0429c5de5ea9..203b863ac2a8 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -84,6 +84,12 @@ static unsigned int code_bytes = 64; static int ignore_nmis; static int die_counter; +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + void printk_address(unsigned long address, int reliable) { #ifdef CONFIG_KALLSYMS @@ -859,7 +865,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * This is an interrupt gate, because kprobes wants interrupts * disabled. Normal trap handlers don't. */ - restore_interrupts(regs); + conditional_sti(regs); do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); } -- cgit v1.2.2 From 61aef7d24909b95f9eddaa78d78902fbea1d7059 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:56 +0200 Subject: i386: prepare to convert exceptions to interrupts There is some macro magic in traps_32.c to construct standard exception dispatch functions. This patch renames the DO_ERROR- like macros to DO_TRAP, and introduces new DO_ERROR ones that conditionally reenable interrupts explicitly, like x86_64. No code changes. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 76 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 203b863ac2a8..fe6dd972068d 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -528,6 +528,56 @@ vm86_trap: return; } +#define DO_TRAP(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ +} + +#define DO_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + if (irq) \ + local_irq_enable(); \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ +} + +#define DO_VM86_TRAP(trapnr, signr, str, name) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ +} + +#define DO_VM86_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr) \ +void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + trace_hardirqs_fixup(); \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ +} + #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ @@ -535,6 +585,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ } @@ -551,6 +602,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ } @@ -560,6 +612,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ } @@ -575,22 +628,23 @@ void do_##name(struct pt_regs *regs, long error_code) \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ + conditional_sti(regs); \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_TRAP_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES -DO_VM86_ERROR(3, SIGTRAP, "int3", int3) +DO_VM86_TRAP(3, SIGTRAP, "int3", int3) #endif -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_ERROR_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) +DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) +DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) +DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) +DO_TRAP(12, SIGBUS, "stack segment", stack_segment) +DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) -- cgit v1.2.2 From 976382dcbe3a2233f48cda5b0840874f43a30825 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:57 +0200 Subject: i386: convert hardware exception 0 to an interrupt gate Handle divide error exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index fe6dd972068d..c18824b9d1ab 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -632,7 +632,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ } -DO_VM86_TRAP_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) #ifndef CONFIG_KPROBES DO_VM86_TRAP(3, SIGTRAP, "int3", int3) #endif @@ -1247,7 +1247,7 @@ void __init trap_init(void) early_iounmap(p, 4); #endif - set_trap_gate(0, ÷_error); + set_intr_gate(0, ÷_error); set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ -- cgit v1.2.2 From b94da1e4b7d396abe10ce4d566f1a5f623602c21 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:58 +0200 Subject: i386: expand exception 3 DO_TRAP macro The int3 exception was already takes as an interrupt and do_int3 does not fit in the new DO_ERROR macro. This patch just expands the DO_TRAP macro and rearranges the code a bit. No functional changes intended. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index c18824b9d1ab..e3c4809f414a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -633,9 +633,6 @@ void do_##name(struct pt_regs *regs, long error_code) \ } DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -#ifndef CONFIG_KPROBES -DO_VM86_TRAP(3, SIGTRAP, "int3", int3) -#endif DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) @@ -907,9 +904,9 @@ void restart_nmi(void) acpi_nmi_enable(); } -#ifdef CONFIG_KPROBES void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KPROBES trace_hardirqs_fixup(); if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -920,10 +917,14 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * disabled. Normal trap handlers don't. */ conditional_sti(regs); +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); } -#endif /* * Our handling of the processor debug registers is non-trivial. -- cgit v1.2.2 From 8d6f9d69bdc1dcf75a3ba436c9b2efb20478619b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:55:59 +0200 Subject: i386: convert hardware exception 4 to an interrupt gate Handle overflow exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index e3c4809f414a..6f685e61325a 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -633,7 +633,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ } DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_VM86_TRAP(4, SIGSEGV, "overflow", overflow) +DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) @@ -1252,7 +1252,7 @@ void __init trap_init(void) set_intr_gate(1, &debug); set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_gate(4, &overflow); /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_trap_gate(5, &bounds); set_trap_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); -- cgit v1.2.2 From 64f644c0b4b8b7818867fb27de077c98c4f0022b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:00 +0200 Subject: i386: convert hardware exception 5 to an interrupt gate Handle bounds exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 6f685e61325a..06149083763e 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -634,7 +634,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_TRAP(5, SIGSEGV, "bounds", bounds) +DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) @@ -1253,7 +1253,7 @@ void __init trap_init(void) set_intr_gate(2, &nmi); set_system_intr_gate(3, &int3); /* int3 can be called from all */ set_system_intr_gate(4, &overflow); /* int4 can be called from all */ - set_trap_gate(5, &bounds); + set_intr_gate(5, &bounds); set_trap_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -- cgit v1.2.2 From 12394cf567d509f6ab5d94544e1f414b35286d9e Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:01 +0200 Subject: i386: convert hardware exception 6 to an interrupt gate Handle invalid opcode exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 06149083763e..39a6101a6123 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -635,7 +635,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_TRAP_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) @@ -1254,7 +1254,7 @@ void __init trap_init(void) set_system_intr_gate(3, &int3); /* int3 can be called from all */ set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_intr_gate(5, &bounds); - set_trap_gate(6, &invalid_op); + set_intr_gate(6, &invalid_op); set_trap_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_trap_gate(9, &coprocessor_segment_overrun); -- cgit v1.2.2 From 7643e9b936b4af31ba4851eb7d5b3a3bfad52502 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:02 +0200 Subject: i386: convert hardware exception 7 to an interrupt gate Handle no coprocessor exception with interrupt initially off. device_not_available in entry_32.S calls either math_state_restore or math_emulate. This patch adds an extra indirection to be able to re-enable interrupts explicitly in traps_32.c Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 15 ++------------- arch/x86/kernel/traps_32.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 109792bc7cfa..5a885855fc88 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -760,20 +760,9 @@ ENTRY(device_not_available) RING0_INT_FRAME pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 - SAVE_ALL - GET_CR0_INTO_EAX - testl $0x4, %eax # EM (math emulation bit) - jne device_not_available_emulate - preempt_stop(CLBR_ANY) - call math_state_restore - jmp ret_from_exception -device_not_available_emulate: - pushl $0 # temporary storage for ORIG_EIP + pushl $do_device_not_available CFI_ADJUST_CFA_OFFSET 4 - call math_emulate - addl $4, %esp - CFI_ADJUST_CFA_OFFSET -4 - jmp ret_from_exception + jmp error_code CFI_ENDPROC END(device_not_available) diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 39a6101a6123..5b15f75d0591 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -46,6 +46,7 @@ #include #endif +#include #include #include #include @@ -1236,6 +1237,17 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ +void __kprobes do_device_not_available(struct pt_regs *regs, long error) +{ + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +} + void __init trap_init(void) { int i; @@ -1255,7 +1267,7 @@ void __init trap_init(void) set_system_intr_gate(4, &overflow); /* int4 can be called from all */ set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); - set_trap_gate(7, &device_not_available); + set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_trap_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); -- cgit v1.2.2 From 51bc1ed6060e96de5099f604071961540880efc2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:03 +0200 Subject: i386: convert hardware exception 9 to an interrupt gate Handle coprocessor segment overrun exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5b15f75d0591..d26f32308dbe 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -637,7 +637,7 @@ DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) -DO_TRAP(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) @@ -1269,7 +1269,7 @@ void __init trap_init(void) set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); - set_trap_gate(9, &coprocessor_segment_overrun); + set_intr_gate(9, &coprocessor_segment_overrun); set_trap_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); -- cgit v1.2.2 From 6bf77bf93938ae6baad9945a0ba57b3225e8e58b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:04 +0200 Subject: i386: convert hardware exception 10 to an interrupt gate Handle invalid TSS exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index d26f32308dbe..069075e19ea7 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -638,7 +638,7 @@ DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_TRAP(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) @@ -1270,7 +1270,7 @@ void __init trap_init(void) set_intr_gate(7, &device_not_available); set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_intr_gate(9, &coprocessor_segment_overrun); - set_trap_gate(10, &invalid_TSS); + set_intr_gate(10, &invalid_TSS); set_trap_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); set_trap_gate(13, &general_protection); -- cgit v1.2.2 From 36d936c7988ac5caf575bc0e2dc618dbfebc6065 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:05 +0200 Subject: i386: convert hardware exception 11 to an interrupt gate Handle segment not present exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 069075e19ea7..6910bbe94f6c 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -639,7 +639,7 @@ DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_TRAP(11, SIGBUS, "segment not present", segment_not_present) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_TRAP(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) @@ -1271,7 +1271,7 @@ void __init trap_init(void) set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); - set_trap_gate(11, &segment_not_present); + set_intr_gate(11, &segment_not_present); set_trap_gate(12, &stack_segment); set_trap_gate(13, &general_protection); set_intr_gate(14, &page_fault); -- cgit v1.2.2 From f5ca81878b42ae7d1b00d0ed5f62bb1a158bfac1 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:06 +0200 Subject: i386: convert hardware exception 12 to an interrupt gate Handle stack segment exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 6910bbe94f6c..16c056ba14ee 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -640,7 +640,7 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -DO_TRAP(12, SIGBUS, "stack segment", stack_segment) +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) @@ -1272,7 +1272,7 @@ void __init trap_init(void) set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_trap_gate(12, &stack_segment); + set_intr_gate(12, &stack_segment); set_trap_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_trap_gate(15, &spurious_interrupt_bug); -- cgit v1.2.2 From c6df0d71bec391e78e0a38109d63154acd69a937 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:07 +0200 Subject: i386: convert hardware exception 13 to an interrupt gate Handle general protection exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 16c056ba14ee..e2598505ef5c 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -652,6 +652,8 @@ do_general_protection(struct pt_regs *regs, long error_code) struct tss_struct *tss; int cpu; + conditional_sti(regs); + cpu = get_cpu(); tss = &per_cpu(init_tss, cpu); thread = ¤t->thread; @@ -1273,7 +1275,7 @@ void __init trap_init(void) set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); set_intr_gate(12, &stack_segment); - set_trap_gate(13, &general_protection); + set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_trap_gate(15, &spurious_interrupt_bug); set_trap_gate(16, &coprocessor_error); -- cgit v1.2.2 From cf81978d5fb32ab75f701690b372e1126b41861f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:08 +0200 Subject: i386: convert hardware exception 15 to an interrupt gate Handle exception 15 with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index e2598505ef5c..0039856b6344 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1164,6 +1164,7 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); #if 0 /* No need to warn about this any longer. */ printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); @@ -1277,7 +1278,7 @@ void __init trap_init(void) set_intr_gate(12, &stack_segment); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); - set_trap_gate(15, &spurious_interrupt_bug); + set_intr_gate(15, &spurious_interrupt_bug); set_trap_gate(16, &coprocessor_error); set_trap_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE -- cgit v1.2.2 From 252d28fe65adc2e51bc71358eae43ba94b74da91 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:09 +0200 Subject: i386: convert hardware exception 16 to an interrupt gate Handle coprocessor error exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0039856b6344..5ab4c3fc7c1d 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1088,6 +1088,7 @@ void math_error(void __user *ip) void do_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); ignore_fpu_irq = 1; math_error((void __user *)regs->ip); } @@ -1279,7 +1280,7 @@ void __init trap_init(void) set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); - set_trap_gate(16, &coprocessor_error); + set_intr_gate(16, &coprocessor_error); set_trap_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE set_trap_gate(18, &machine_check); -- cgit v1.2.2 From 5feedfd401c91e41bbe31ddec93cbe809dc98309 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:10 +0200 Subject: i386: convert hardware exception 17 to an interrupt gate Handle alignment check exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 5ab4c3fc7c1d..86c808b4dafb 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -641,7 +641,7 @@ DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_TRAP_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes @@ -1281,7 +1281,7 @@ void __init trap_init(void) set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); - set_trap_gate(17, &alignment_check); + set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE set_trap_gate(18, &machine_check); #endif -- cgit v1.2.2 From eb642f62082348c33ead53f736a9698953aa517d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:11 +0200 Subject: i386: convert hardware exception 18 to an interrupt gate Handle machine check exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/traps_32.c | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 5a885855fc88..c5fe01bca6c0 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1019,7 +1019,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl machine_check_vector + pushl $do_machine_check CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 86c808b4dafb..54b89f497bac 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -62,6 +62,7 @@ #include #include "mach_traps.h" +#include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); EXPORT_SYMBOL_GPL(used_vectors); @@ -1252,6 +1253,14 @@ void __kprobes do_device_not_available(struct pt_regs *regs, long error) } } +#ifdef CONFIG_X86_MCE +void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); +} +#endif + void __init trap_init(void) { int i; @@ -1283,7 +1292,7 @@ void __init trap_init(void) set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_trap_gate(18, &machine_check); + set_intr_gate(18, &machine_check); #endif set_trap_gate(19, &simd_coprocessor_error); -- cgit v1.2.2 From b939bde2788a7f16741e563ee90f6f3ad38935cf Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:12 +0200 Subject: i386: convert hardware exception 19 to an interrupt gate Handle SIMD coprocessor exception with interrupt initially off. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 54b89f497bac..1bd7960b1357 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -1144,6 +1144,8 @@ static void simd_math_error(void __user *ip) void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { + conditional_sti(regs); + if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -1294,7 +1296,7 @@ void __init trap_init(void) #ifdef CONFIG_X86_MCE set_intr_gate(18, &machine_check); #endif - set_trap_gate(19, &simd_coprocessor_error); + set_intr_gate(19, &simd_coprocessor_error); if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); -- cgit v1.2.2 From f8e0870f589cfa45575dc7333cdf1b8f769e7900 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:13 +0200 Subject: i386: remove temporary DO_TRAP macros, expanding the last one used Only one use of the DO_TRAP macros remains. Expand that one and remove the macros now. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 66 +++++++++++----------------------------------- 1 file changed, 15 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 1bd7960b1357..8fa8485b49ce 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -530,56 +530,6 @@ vm86_trap: return; } -#define DO_TRAP(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_TRAP(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ -} - -#define DO_VM86_TRAP_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ -} - #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ @@ -643,7 +593,6 @@ DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) -DO_TRAP_INFO(32, SIGILL, "iret exception", iret_error, ILL_BADSTK, 0, 1) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -1263,6 +1212,21 @@ void __kprobes do_machine_check(struct pt_regs *regs, long error) } #endif +void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", 0, regs, error_code, &info); +} + void __init trap_init(void) { int i; -- cgit v1.2.2 From 85cea51d7e7b8d3408c8e933d88fa067309395fa Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:14 +0200 Subject: i386: add TRACE_IRQS_OFF to entry_32.S in 'error_code' Many exceptions use the same code path via the label 'error_code' in entry_32.S. At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c5fe01bca6c0..49438e58291d 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -730,6 +730,7 @@ error_code: movl $(__USER_DS), %ecx movl %ecx, %ds movl %ecx, %es + TRACE_IRQS_OFF movl %esp,%eax # pt_regs pointer call *%edi jmp ret_from_exception -- cgit v1.2.2 From 43024a8a5d4c63952687286f3083f7f34d4da2cc Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:15 +0200 Subject: i386: add TRACE_IRQS_OFF for exception 1 (debug) At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 49438e58291d..a278505baa11 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -804,6 +804,7 @@ debug_stack_correct: pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # error code 0 movl %esp,%eax # pt_regs pointer call do_debug -- cgit v1.2.2 From e0c7317557c8fc8eacf611e30c2a80f4e24e47a3 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:16 +0200 Subject: i386: add TRACE_IRQS_OFF for the nmi At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index a278505baa11..2abdc9a9f7bc 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -849,6 +849,7 @@ nmi_stack_correct: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_nmi @@ -889,6 +890,7 @@ nmi_espfix_stack: pushl %eax CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx,%edx # zero error code call do_nmi -- cgit v1.2.2 From a790392faa3a6138b6e90d0fe320a2829652ce22 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:17 +0200 Subject: i386: add TRACE_IRQS_OFF for the exception 3 (int3) At this point interrupts are off, so let's inform the tracing code of that fact before calling into C. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 2abdc9a9f7bc..b21fbfaffe39 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -921,6 +921,7 @@ KPROBE_ENTRY(int3) pushl $-1 # mark this as an int CFI_ADJUST_CFA_OFFSET 4 SAVE_ALL + TRACE_IRQS_OFF xorl %edx,%edx # zero error code movl %esp,%eax # pt_regs pointer call do_int3 -- cgit v1.2.2 From 07bb2f6236f11169fbd8a8916b16715b25fea9b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 9 Sep 2008 21:56:18 +0200 Subject: i386: trace_hardirqs_fixup should now not be necessary: irqs are off. The exception handlers in entry_32.S should now all call TRACE_IRQS_OFF before calling the C code. The calls to trace_hardirqs_fixup should now be unnecessary. Remove them. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 8fa8485b49ce..ab5593650899 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -533,7 +533,6 @@ vm86_trap: #define DO_ERROR(trapnr, signr, str, name) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ @@ -576,7 +575,6 @@ void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ @@ -860,15 +858,9 @@ void restart_nmi(void) void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES - trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - /* - * This is an interrupt gate, because kprobes wants interrupts - * disabled. Normal trap handlers don't. - */ conditional_sti(regs); #else if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) @@ -907,8 +899,6 @@ void __kprobes do_debug(struct pt_regs *regs, long error_code) unsigned int condition; int si_code; - trace_hardirqs_fixup(); - get_debugreg(condition, 6); /* -- cgit v1.2.2 From be43d72835ba610e4af274f2d123b26f66f4f7ed Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:13 -0700 Subject: x86: add _PAGE_IOMAP pte flag for IO mappings Use one of the software-defined PTE bits to indicate that a mapping is intended for an IO address. On native hardware this is irrelevent, since a physical address is a physical address. But in a virtual environment, physical addresses are also virtualized, so there needs to be some way to distinguish between pseudo-physical addresses and actual hardware addresses; _PAGE_IOMAP indicates this intent. By default, __supported_pte_mask masks out _PAGE_IOMAP, so it doesn't even appear in the final pagetable. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/ioremap.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index bbe044dbe014..8396868e82c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -558,7 +558,7 @@ void zap_low_mappings(void) int nx_enabled; -pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL); +pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); #ifdef CONFIG_X86_PAE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3e10054c5731..dec5c775e92b 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -89,7 +89,7 @@ early_param("gbpages", parse_direct_gbpages_on); int after_bootmem; -unsigned long __supported_pte_mask __read_mostly = ~0UL; +pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); static int do_not_nx __cpuinitdata; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8cbeda15cd29..43c3b6896cd6 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -242,16 +242,16 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr, switch (prot_val) { case _PAGE_CACHE_UC: default: - prot = PAGE_KERNEL_NOCACHE; + prot = PAGE_KERNEL_IO_NOCACHE; break; case _PAGE_CACHE_UC_MINUS: - prot = PAGE_KERNEL_UC_MINUS; + prot = PAGE_KERNEL_IO_UC_MINUS; break; case _PAGE_CACHE_WC: - prot = PAGE_KERNEL_WC; + prot = PAGE_KERNEL_IO_WC; break; case _PAGE_CACHE_WB: - prot = PAGE_KERNEL; + prot = PAGE_KERNEL_IO; break; } -- cgit v1.2.2 From 1494177942b23b7094ca291d37e6f6263fa60fdd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:15 -0700 Subject: x86: add early_memremap() early_ioremap() is also used to map normal memory when constructing the linear memory mapping. However, since we sometimes need to be able to distinguish between actual IO mappings and normal memory mappings, add a early_memremap() call, which maps with PAGE_KERNEL (as opposed to PAGE_KERNEL_IO for early_ioremap()), and use it when constructing pagetables. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/ioremap.c | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index dec5c775e92b..5beb89683453 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -313,7 +313,7 @@ static __ref void *alloc_low_page(unsigned long *phys) if (pfn >= table_top) panic("alloc_low_page: ran out of memory"); - adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE); + adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); memset(adr, 0, PAGE_SIZE); *phys = pfn * PAGE_SIZE; return adr; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 43c3b6896cd6..7fb737c6b54f 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -568,12 +568,12 @@ static void __init __early_set_fixmap(enum fixed_addresses idx, } static inline void __init early_set_fixmap(enum fixed_addresses idx, - unsigned long phys) + unsigned long phys, pgprot_t prot) { if (after_paging_init) - set_fixmap(idx, phys); + __set_fixmap(idx, phys, prot); else - __early_set_fixmap(idx, phys, PAGE_KERNEL); + __early_set_fixmap(idx, phys, prot); } static inline void __init early_clear_fixmap(enum fixed_addresses idx) @@ -601,7 +601,7 @@ static int __init check_early_ioremap_leak(void) } late_initcall(check_early_ioremap_leak); -void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; unsigned int nrpages, nesting; @@ -650,7 +650,7 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; idx = idx0; while (nrpages > 0) { - early_set_fixmap(idx, phys_addr); + early_set_fixmap(idx, phys_addr, prot); phys_addr += PAGE_SIZE; --idx; --nrpages; @@ -661,6 +661,18 @@ void __init *early_ioremap(unsigned long phys_addr, unsigned long size) return (void *) (offset + fix_to_virt(idx0)); } +/* Remap an IO device */ +void __init *early_ioremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL_IO); +} + +/* Remap memory */ +void __init *early_memremap(unsigned long phys_addr, unsigned long size) +{ + return __early_ioremap(phys_addr, size, PAGE_KERNEL); +} + void __init early_iounmap(void *addr, unsigned long size) { unsigned long virt_addr; -- cgit v1.2.2 From 88b4c146961f4178f38a8c3e6e54709fa39a3f36 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:16 -0700 Subject: x86: use early_memremap() in setup.c The remappings in setup.c are all just ordinary memory, so use early_memremap() rather than early_ioremap(). Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 21b8e0a59780..de6a9d6d599f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -302,7 +302,7 @@ static void __init relocate_initrd(void) if (clen > MAX_MAP_CHUNK-slop) clen = MAX_MAP_CHUNK-slop; mapaddr = ramdisk_image & PAGE_MASK; - p = early_ioremap(mapaddr, clen+slop); + p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); early_iounmap(p, clen+slop); q += clen; @@ -379,7 +379,7 @@ static void __init parse_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, PAGE_SIZE); + data = early_memremap(pa_data, PAGE_SIZE); switch (data->type) { case SETUP_E820_EXT: parse_e820_ext(data, pa_data); @@ -402,7 +402,7 @@ static void __init e820_reserve_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); e820_update_range(pa_data, sizeof(*data)+data->len, E820_RAM, E820_RESERVED_KERN); found = 1; @@ -428,7 +428,7 @@ static void __init reserve_early_setup_data(void) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { - data = early_ioremap(pa_data, sizeof(*data)); + data = early_memremap(pa_data, sizeof(*data)); sprintf(buf, "setup data %x", data->type); reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); pa_data = data->next; -- cgit v1.2.2 From a32ad4626776f09b30ef98a872a5f6fb64fe6607 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 7 Sep 2008 15:21:17 -0700 Subject: x86-64: don't check for map replacement The check prevents flags on mappings from being changed, which is not desireable. There's no need to check for replacing a mapping, and x86-32 does not do this check. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5beb89683453..7c8bb46e83e4 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -196,9 +196,6 @@ set_pte_vaddr_pud(pud_t *pud_page, unsigned long vaddr, pte_t new_pte) } pte = pte_offset_kernel(pmd, vaddr); - if (!pte_none(*pte) && pte_val(new_pte) && - pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask)) - pte_ERROR(*pte); set_pte(pte, new_pte); /* -- cgit v1.2.2 From a73aaedd95703bd49f4c3f9df06fb7b7373ba905 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:14 -0700 Subject: x86: check dsdt before find oem table for es7000, v2 v2: use __acpi_unmap_table() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/es7000_32.c | 28 +++++++++++++++++++++++----- arch/x86/mach-generic/es7000.c | 20 +++++++++++++++----- 2 files changed, 38 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/es7000_32.c b/arch/x86/kernel/es7000_32.c index 849e5cd485b8..f454c78fcef6 100644 --- a/arch/x86/kernel/es7000_32.c +++ b/arch/x86/kernel/es7000_32.c @@ -109,6 +109,7 @@ struct oem_table { }; extern int find_unisys_acpi_oem_table(unsigned long *oem_addr); +extern void unmap_unisys_acpi_oem_table(unsigned long oem_addr); #endif struct mip_reg { @@ -243,21 +244,38 @@ parse_unisys_oem (char *oemptr) } #ifdef CONFIG_ACPI -int __init -find_unisys_acpi_oem_table(unsigned long *oem_addr) +static unsigned long oem_addrX; +static unsigned long oem_size; +int __init find_unisys_acpi_oem_table(unsigned long *oem_addr) { struct acpi_table_header *header = NULL; int i = 0; - while (ACPI_SUCCESS(acpi_get_table("OEM1", i++, &header))) { + acpi_size tbl_size; + + while (ACPI_SUCCESS(acpi_get_table_with_size("OEM1", i++, &header, &tbl_size))) { if (!memcmp((char *) &header->oem_id, "UNISYS", 6)) { struct oem_table *t = (struct oem_table *)header; - *oem_addr = (unsigned long)__acpi_map_table(t->OEMTableAddr, - t->OEMTableSize); + + oem_addrX = t->OEMTableAddr; + oem_size = t->OEMTableSize; + early_acpi_os_unmap_memory(header, tbl_size); + + *oem_addr = (unsigned long)__acpi_map_table(oem_addrX, + oem_size); return 0; } + early_acpi_os_unmap_memory(header, tbl_size); } return -1; } + +void __init unmap_unisys_acpi_oem_table(unsigned long oem_addr) +{ + if (!oem_addr) + return; + + __acpi_unmap_table((char *)oem_addr, oem_size); +} #endif static void diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 520cca0ee04e..6513d41ea21e 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -47,16 +47,26 @@ static __init int mps_oem_check(struct mp_config_table *mpc, char *oem, /* Hook from generic ACPI tables.c */ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - unsigned long oem_addr; + unsigned long oem_addr = 0; + int check_dsdt; + int ret = 0; + + /* check dsdt at first to avoid clear fix_map for oem_addr */ + check_dsdt = es7000_check_dsdt(); + if (!find_unisys_acpi_oem_table(&oem_addr)) { - if (es7000_check_dsdt()) - return parse_unisys_oem((char *)oem_addr); + if (check_dsdt) + ret = parse_unisys_oem((char *)oem_addr); else { setup_unisys(); - return 1; + ret = 1; } + /* + * we need to unmap it + */ + unmap_unisys_acpi_oem_table(oem_addr); } - return 0; + return ret; } #else static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) -- cgit v1.2.2 From 3e6de5a393661c5cdabe44115e93bcbde6a742fc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 15 Sep 2008 08:26:15 +0200 Subject: x86: print out EBDA/lowmem address it's useful for debugging purposes to know the location of the EBDA. Signed-off-by: Ingo Molnar --- arch/x86/kernel/head.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index 3e66bd364a9d..1dcb0f13897e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -35,6 +35,7 @@ void __init reserve_ebda_region(void) /* start of EBDA area */ ebda_addr = get_bios_ebda(); + printk(KERN_INFO "BIOS EBDA/lowmem at: %08x/%08x\n", ebda_addr, lowmem); /* Fixup: bios puts an EBDA in the top 64K segment */ /* of conventional memory, but does not adjust lowmem. */ -- cgit v1.2.2 From 59ef48a58e59cc27255d526ae3fa60ddcd977208 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 21:58:49 +0400 Subject: x86: smpboot - check if we have ESR register in wakeup_secondary_cpu We should check if we have ESR register before reading from it. Signed-off-by: Cyrill Gorcunov Cc: Yinghai Lu Cc: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index b700c9a10644..a778e221ccfb 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -599,10 +599,12 @@ wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) * Give the other CPU some time to accept the IPI. */ udelay(200); - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - accept_status = (apic_read(APIC_ESR) & 0xEF); + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + accept_status = (apic_read(APIC_ESR) & 0xEF); + } pr_debug("NMI sent.\n"); if (send_status) -- cgit v1.2.2 From 5e72d9e4850c91b6a0f06fa803f7393b55a38aa8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 12 Sep 2008 15:43:04 +0100 Subject: x86-64: fix combining of regions in init_memory_mapping() When nr_range gets decremented, the same slot must be considered for coalescing with its new successor again. The issue is apparently pretty benign to native code, but surfaces as a boot time crash in our forward ported Xen tree (where the page table setup overall works differently than in native). Signed-off-by: Jan Beulich Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 7c8bb46e83e4..b8e461d49412 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -746,7 +746,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, old_start = mr[i].start; memmove(&mr[i], &mr[i+1], (nr_range - 1 - i) * sizeof (struct map_range)); - mr[i].start = old_start; + mr[i--].start = old_start; nr_range--; } -- cgit v1.2.2 From 606ee44dbb72fd48beb47f171d7b9cecf6ade6dd Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 17 Sep 2008 16:48:17 +0100 Subject: x86: make mm/gup.c more virtualization friendly Since pte_flags() is much cheaper than pte_val() in some virtualized environments (namely, Xen), use the former whereever possible. Signed-off-by: Jan Beulich Cc: "Nick Piggin" Signed-off-by: Ingo Molnar --- arch/x86/mm/gup.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 007bb06c7504..4ba373c5b8c8 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -82,7 +82,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, pte_t pte = gup_get_pte(ptep); struct page *page; - if ((pte_val(pte) & (mask | _PAGE_SPECIAL)) != mask) { + if ((pte_flags(pte) & (mask | _PAGE_SPECIAL)) != mask) { pte_unmap(ptep); return 0; } @@ -116,10 +116,10 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) + if ((pte_flags(pte) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pte_pfn(pte))); refs = 0; @@ -173,10 +173,10 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr, mask = _PAGE_PRESENT|_PAGE_USER; if (write) mask |= _PAGE_RW; - if ((pte_val(pte) & mask) != mask) + if ((pte_flags(pte) & mask) != mask) return 0; /* hugepages are never "special" */ - VM_BUG_ON(pte_val(pte) & _PAGE_SPECIAL); + VM_BUG_ON(pte_flags(pte) & _PAGE_SPECIAL); VM_BUG_ON(!pfn_valid(pte_pfn(pte))); refs = 0; -- cgit v1.2.2 From bd32a8cfa8f172bd943655a3663d8005e5c1d83c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 18:41:16 -0700 Subject: x86: cpu don't print duplicated vendor string Some CPUs have vendor string in the middle of model_id instead of beginning Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fb789dd9e691..088fbdb6734e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -797,7 +797,7 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c) else if (c->cpuid_level >= 0) vendor = c->x86_vendor_id; - if (vendor && strncmp(c->x86_model_id, vendor, strlen(vendor))) + if (vendor && !strstr(c->x86_model_id, vendor)) printk(KERN_CONT "%s ", vendor); if (c->x86_model_id[0]) -- cgit v1.2.2 From cf4cfb225ab2b48611cb4f9e8db23c87486416d6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:45:46 -0300 Subject: x86: use user_mode macro Instead of using SEGMENT_IS_KERNEL_CODE, use the "user_mode" macro, which can play the same role. Delete the former, since it now lacks any user. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index bbecf8b6bf96..8abcb7b08694 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -47,7 +47,7 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && SEGMENT_IS_KERNEL_CODE(regs->cs) && + if (!v8086_mode(regs) && !user_mode(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + 4); -- cgit v1.2.2 From 2c44e66843cd50c5ef4f4271fbd63a4f4bf4d083 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:48:07 -0300 Subject: x86: coalesce tests Coalesce v8086_mode and user_mode into a single user_mode_vm() test. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 8abcb7b08694..ca0a4aab12ce 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -47,8 +47,7 @@ unsigned long profile_pc(struct pt_regs *regs) unsigned long pc = instruction_pointer(regs); #ifdef CONFIG_SMP - if (!v8086_mode(regs) && !user_mode(regs) && - in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + 4); #else -- cgit v1.2.2 From 097a0788df71b0f3328c70ab5f4e41c27ee66817 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Thu, 14 Aug 2008 17:33:12 -0300 Subject: x86: set bp field in pt_regs properly Save rbp twice: One is for marking the stack frame, as usual (already there), and the other, to fill pt_regs properly. This is because bx comes right before the last saved register in that structure, and not bp. If the base pointer were in the place bx is today, this would not be needed. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cf3a0b2d0059..25bb3f9b2552 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -667,6 +667,13 @@ END(stub_rt_sigreturn) SAVE_ARGS leaq -ARGOFFSET(%rsp),%rdi # arg1 for handler pushq %rbp + /* + * Save rbp twice: One is for marking the stack frame, as usual, and the + * other, to fill pt_regs properly. This is because bx comes right + * before the last saved register in that structure, and not bp. If the + * base pointer were in the place bx is today, this would not be needed. + */ + movq %rbp, -8(%rsp) CFI_ADJUST_CFA_OFFSET 8 CFI_REL_OFFSET rbp, 0 movq %rsp,%rbp -- cgit v1.2.2 From 3927fa9e4b5d5f346d12aa0531744daef106ebd3 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 13:53:43 -0300 Subject: x86: use frame pointer information on x86_64 profile_pc Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index e3d49c553af2..7fd995edb762 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -34,11 +34,15 @@ unsigned long profile_pc(struct pt_regs *regs) of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ if (!user_mode(regs) && in_lock_functions(pc)) { +#ifdef CONFIG_FRAME_POINTER + return *(unsigned long *)(regs->bp + sizeof(long)); +#else unsigned long *sp = (unsigned long *)regs->sp; if (sp[0] >> 22) return sp[0]; if (sp[1] >> 22) return sp[1]; +#endif } return pc; } -- cgit v1.2.2 From 8de0b8a7eaf274d197698b035090eeb805f62de6 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 14:10:13 -0300 Subject: x86: use user_mode_vm instead of user_mode For x86_64, it does not really matter. But makes the code equal to i386. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 7fd995edb762..0469243ae1bd 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -33,7 +33,7 @@ unsigned long profile_pc(struct pt_regs *regs) /* Assume the lock function has either no stack frame or a copy of flags from PUSHF Eflags always has bits 22 and up cleared unlike kernel addresses. */ - if (!user_mode(regs) && in_lock_functions(pc)) { + if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else -- cgit v1.2.2 From 2ff298372d03b01c9ca8738ee2791227a81928e2 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 14:21:29 -0300 Subject: x86: bind irq0 irq data to cpu0 Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 0469243ae1bd..841938297c2a 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -115,6 +115,7 @@ void __init hpet_time_init(void) if (!hpet_enable()) setup_pit_timer(); + irq0.mask = cpumask_of_cpu(0); setup_irq(0, &irq0); } -- cgit v1.2.2 From 2f97435e57926a5cdc104fe5a7c64932cb62cdda Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:08:39 -0300 Subject: x86: factor out irq initialization for x86_64 Provide apic_intr_init and smp_intr_init functions. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 1f26fd9ec4f4..18968a78ab3d 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -164,22 +164,8 @@ static void __init init_ISA_irqs (void) void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init native_init_IRQ(void) +void __init smp_intr_init(void) { - int i; - - init_ISA_irqs(); - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) - set_intr_gate(vector, interrupt[i]); - } - #ifdef CONFIG_SMP /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -207,6 +193,14 @@ void __init native_init_IRQ(void) /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif +} + +void __init apic_intr_init(void) +{ +#ifdef CONFIG_SMP + smp_intr_init(); +#endif + alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); @@ -216,6 +210,25 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +} + +void __init native_init_IRQ(void) +{ + int i; + + init_ISA_irqs(); + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + if (vector != IA32_SYSCALL_VECTOR) + set_intr_gate(vector, interrupt[i]); + } + + apic_intr_init(); if (!acpi_ioapic) setup_irq(2, &irq2); -- cgit v1.2.2 From 780209af714ba733cd9e2f3526107becfc1969bc Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:12:39 -0300 Subject: x86: make init_ISA_irqs nonstatic Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 18968a78ab3d..b01d9549b3d4 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -135,7 +135,7 @@ DEFINE_PER_CPU(vector_irq_t, vector_irq) = { [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 }; -static void __init init_ISA_irqs (void) +void __init init_ISA_irqs(void) { int i; -- cgit v1.2.2 From 461ebd109569d666d367f774f74002add6444d49 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 15:25:13 -0300 Subject: x86: rename timer_event_interrupt to timer_interrupt Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 841938297c2a..2be67c24909e 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -48,7 +48,7 @@ unsigned long profile_pc(struct pt_regs *regs) } EXPORT_SYMBOL(profile_pc); -static irqreturn_t timer_event_interrupt(int irq, void *dev_id) +irqreturn_t timer_interrupt(int irq, void *dev_id) { add_pda(irq0_irqs, 1); @@ -104,7 +104,7 @@ unsigned long __init calibrate_cpu(void) } static struct irqaction irq0 = { - .handler = timer_event_interrupt, + .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL | IRQF_NOBALANCING, .mask = CPU_MASK_NONE, .name = "timer" -- cgit v1.2.2 From 2c460d0b6813a5d2a7d571b0b561e4e727036dd7 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Fri, 11 Jul 2008 16:06:40 -0300 Subject: x86: replace hardcoded number Replace "4" in time_32.c code by sizeof(long). This way, it can work on x86_64 too. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index ca0a4aab12ce..7215bc6adfcc 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -49,7 +49,7 @@ unsigned long profile_pc(struct pt_regs *regs) #ifdef CONFIG_SMP if (!user_mode_vm(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER - return *(unsigned long *)(regs->bp + 4); + return *(unsigned long *)(regs->bp + sizeof(long)); #else unsigned long *sp = (unsigned long *)®s->sp; -- cgit v1.2.2 From 33c053d0aec344c8b2ad6966d904ebeff64e590b Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 21 Jul 2008 18:42:52 -0300 Subject: x86: wrap MCA_bus test around an ifdef Only test for MCA_bus if support for MCA is compiled in. Also, for x86_64, write the code inside the conditional for consistency with i386. It won't bite us, since it'll probably never select CONFIG_MCA anyway. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/time_32.c | 2 ++ arch/x86/kernel/time_64.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/time_32.c b/arch/x86/kernel/time_32.c index 7215bc6adfcc..77b400f06ea2 100644 --- a/arch/x86/kernel/time_32.c +++ b/arch/x86/kernel/time_32.c @@ -94,6 +94,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) do_timer_interrupt_hook(); +#ifdef CONFIG_MCA if (MCA_bus) { /* The PS/2 uses level-triggered interrupts. You can't turn them off, nor would you want to (any attempt to @@ -107,6 +108,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) u8 irq_v = inb_p( 0x61 ); /* read the current state */ outb_p( irq_v|0x80, 0x61 ); /* reset the IRQ */ } +#endif return IRQ_HANDLED; } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 2be67c24909e..207a7a1d7ac5 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -54,6 +55,13 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) global_clock_event->event_handler(global_clock_event); +#ifdef CONFIG_MCA + if (MCA_bus) { + u8 irq_v = inb_p(0x61); /* read the current state */ + outb_p(irq_v|0x80, 0x61); /* reset the IRQ */ + } +#endif + return IRQ_HANDLED; } -- cgit v1.2.2 From e04d645f326bc8e591bc4ae21c4ab535987a17c1 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 22 Sep 2008 14:35:08 -0300 Subject: x86: move vgetcpu mode probing to cpu detection Take it out of time initialization and move it to cpu detection time. Signed-off-by: Glauber Costa Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 12 ++++++++++++ arch/x86/kernel/time_64.c | 4 ---- 2 files changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 088fbdb6734e..f1af71851919 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -719,12 +719,24 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif } +#ifdef CONFIG_X86_64 +static void vgetcpu_set_mode(void) +{ + if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) + vgetcpu_mode = VGETCPU_RDTSCP; + else + vgetcpu_mode = VGETCPU_LSL; +} +#endif + void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); +#else + vgetcpu_set_mode(); #endif } diff --git a/arch/x86/kernel/time_64.c b/arch/x86/kernel/time_64.c index 207a7a1d7ac5..cb19d650c216 100644 --- a/arch/x86/kernel/time_64.c +++ b/arch/x86/kernel/time_64.c @@ -130,10 +130,6 @@ void __init hpet_time_init(void) void __init time_init(void) { tsc_init(); - if (cpu_has(&boot_cpu_data, X86_FEATURE_RDTSCP)) - vgetcpu_mode = VGETCPU_RDTSCP; - else - vgetcpu_mode = VGETCPU_LSL; late_time_init = choose_time_init(); } -- cgit v1.2.2 From 2e42060c19cb79adacc48beb5e9ec5361df976a2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 23 Sep 2008 15:37:13 -0500 Subject: x86, uv: add early detection of UV system types Portions of the ACPI code needs to know if a system is a UV system prior to genapic initialization. This patch adds a call early_acpi_boot_init() so that the apic type is discovered earlier. V2 of the patch adding fixes from Yinghai Lu. Much cleaner and smaller. Signed-off-by: Jack Steiner Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ arch/x86/mm/srat_64.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index de6a9d6d599f..2255782e8d4b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -998,6 +998,8 @@ void __init setup_arch(char **cmdline_p) */ acpi_boot_table_init(); + early_acpi_boot_init(); + #ifdef CONFIG_ACPI_NUMA /* * Parse SRAT to discover nodes. diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 1b4763e26ea9..51c0a2fc14fe 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -138,7 +138,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa) return; } - if (is_uv_system()) + if (get_uv_system_type() >= UV_X2APIC) apic_id = (pa->apic_id << 8) | pa->local_sapic_eid; else apic_id = pa->apic_id; -- cgit v1.2.2 From 6b11d4ef3e4d6ce83fb177aa50fdf385abb17d1a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:02 +0200 Subject: traps: x86_64: add TRACE_IRQS_OFF in error_entry Add TRACE_IRQS_OFF just before entering the C code. All exceptions are taken via interrupt gates. If irq tracing is enabled, it should be notified as soon as possible. Interrupts are only (conditionally) re-enabled in C code. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 25bb3f9b2552..963b35b81bcc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1065,7 +1065,8 @@ KPROBE_ENTRY(error_entry) je error_kernelspace error_swapgs: SWAPGS -error_sti: +error_sti: + TRACE_IRQS_OFF movq %rdi,RDI(%rsp) CFI_REL_OFFSET rdi,RDI movq %rsp,%rdi -- cgit v1.2.2 From 7e61a7932495e37685e95ec9a59ad08810dec959 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:03 +0200 Subject: traps: x86_64: add TRACE_IRQS_OFF in paranoidentry macro Add TRACE_IRQS_OFF just before entering the C code. All exceptions are taken via interrupt gates. If irq tracing is enabled, it should be notified as soon as possible. Interrupts are only (conditionally) re-enabled in C code. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 963b35b81bcc..977c8ea66028 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -939,6 +939,9 @@ END(spurious_interrupt) .if \ist movq %gs:pda_data_offset, %rbp .endif + .if \irqtrace + TRACE_IRQS_OFF + .endif movq %rsp,%rdi movq ORIG_RAX(%rsp),%rsi movq $-1,ORIG_RAX(%rsp) -- cgit v1.2.2 From 4b986a365253b57d8ab4ed7b796ba0893ff4c05c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:04 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from DO_ERROR_INFO macro All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 9c0ac0cab013..9f487f374c72 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -658,7 +658,6 @@ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ info.si_errno = 0; \ info.si_code = sicode; \ info.si_addr = (void __user *)siaddr; \ - trace_hardirqs_fixup(); \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ return; \ -- cgit v1.2.2 From 8b1c870f19849235b8c7e4dfe2ec6b0d691fa9d7 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:05 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from int3 handler All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 9f487f374c72..7d56c875dd03 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -861,8 +861,6 @@ void restart_nmi(void) /* runs on IST stack. */ asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { - trace_hardirqs_fixup(); - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; -- cgit v1.2.2 From a491503e4d0cb739f409069826e2746e38826099 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:06 +0200 Subject: traps: x86_64: remove trace_hardirqs_fixup from debug handler All exceptions are taken via interrupt gates. TRACE_IRQS_OFF is called just before entering the C code, so the irq state is known to the irq tracer at this point. No need to call trace_hardirqs_fixup. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7d56c875dd03..7d59559c2dff 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -899,8 +899,6 @@ asmlinkage void __kprobes do_debug(struct pt_regs *regs, unsigned long condition; siginfo_t info; - trace_hardirqs_fixup(); - get_debugreg(condition, 6); /* -- cgit v1.2.2 From 69c89b5bf7f253756f3056e84b8603abe1c50f5b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:07 +0200 Subject: traps: x86: remove trace_hardirqs_fixup from pagefault handler The last use of trace_hardirqs_fixup is unnecessary, because the trap is taken with interrupt off on i386 as well as x86_64, and the irq-tracer is notified of this from the assembly code. trace_hardirqs_fixup and trace_hardirqs_fixup_flags are removed from include/asm-x86/irqflags.h as they are no longer used. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a742d753d5b0..3f2b8962cbd0 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -592,11 +592,6 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) unsigned long flags; #endif - /* - * We can fault from pretty much anywhere, with unknown IRQ state. - */ - trace_hardirqs_fixup(); - tsk = current; mm = tsk->mm; prefetchw(&mm->mmap_sem); -- cgit v1.2.2 From 3c1326f8a6d8b9815ca88c95441330f96eef7352 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 26 Sep 2008 14:03:08 +0200 Subject: traps: i386: make do_trap more like x86_64 This patch hardcodes which traps should be forwarded to handle_vm86_trap in do_trap. This allows to remove the vm86 parameter from the i386-version of do_trap, which makes the DO_VM86_ERROR and DO_VM86_ERROR_INFO macros unnecessary. x86_64 part is whitespace only. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 55 ++++++++++++++-------------------------------- arch/x86/kernel/traps_64.c | 2 +- 2 files changed, 17 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index ab5593650899..bf4d71231d40 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -482,13 +482,17 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) } static void __kprobes -do_trap(int trapnr, int signr, char *str, int vm86, struct pt_regs *regs, +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; if (regs->flags & X86_VM_MASK) { - if (vm86) + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) goto vm86_trap; goto trap_signal; } @@ -537,37 +541,10 @@ void do_##name(struct pt_regs *regs, long error_code) \ == NOTIFY_STOP) \ return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, 0, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr, irq) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - if (irq) \ - local_irq_enable(); \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, 0, regs, error_code, &info); \ -} - -#define DO_VM86_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, 1, regs, error_code, NULL); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ } -#define DO_VM86_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ @@ -579,18 +556,18 @@ void do_##name(struct pt_regs *regs, long error_code) \ == NOTIFY_STOP) \ return; \ conditional_sti(regs); \ - do_trap(trapnr, signr, str, 1, regs, error_code, &info); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ } -DO_VM86_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_VM86_ERROR(4, SIGSEGV, "overflow", overflow) -DO_VM86_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip, 0) +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0, 0) +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -868,7 +845,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif - do_trap(3, SIGTRAP, "int3", 1, regs, error_code, NULL); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); } /* @@ -1214,7 +1191,7 @@ void do_iret_error(struct pt_regs *regs, long error_code) if (notify_die(DIE_TRAP, "iret exception", regs, error_code, 32, SIGILL) == NOTIFY_STOP) return; - do_trap(32, SIGILL, "iret exception", 0, regs, error_code, &info); + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); } void __init trap_init(void) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7d59559c2dff..1efd1ea64663 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -640,7 +640,7 @@ kernel_trap: return; } -#define DO_ERROR(trapnr, signr, str, name) \ +#define DO_ERROR(trapnr, signr, str, name) \ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ -- cgit v1.2.2 From 9b658f6f8bd30b91aec527325e398771511ea61b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Sep 2008 22:22:12 -0700 Subject: x86: cleanup, remove extra ifdef also change two functions to static. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_64.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index b01d9549b3d4..5b5be9d43c2a 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -164,7 +164,7 @@ void __init init_ISA_irqs(void) void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); -void __init smp_intr_init(void) +static void __init smp_intr_init(void) { #ifdef CONFIG_SMP /* @@ -195,11 +195,9 @@ void __init smp_intr_init(void) #endif } -void __init apic_intr_init(void) +static void __init apic_intr_init(void) { -#ifdef CONFIG_SMP smp_intr_init(); -#endif alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt); -- cgit v1.2.2 From d2f904bb9a1ba88a58a03612abd8c6c54bdaf73a Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Thu, 25 Sep 2008 07:52:10 -0500 Subject: x86, uv: fix for size of hub mappings Fix the size of the mappings of UV hub registers. Size must be a function of the maximum node number within the SSI. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index ae2ffc8a400c..8cf4ec2a37d9 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -286,12 +286,13 @@ static __init void map_low_mmrs(void) enum map_type {map_wb, map_uc}; -static __init void map_high(char *id, unsigned long base, int shift, enum map_type map_type) +static __init void map_high(char *id, unsigned long base, int shift, + int max_pnode, enum map_type map_type) { unsigned long bytes, paddr; paddr = base << shift; - bytes = (1UL << shift); + bytes = (1UL << shift) * (max_pnode + 1); printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, paddr + bytes); if (map_type == map_uc) @@ -307,7 +308,7 @@ static __init void map_gru_high(int max_pnode) gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); if (gru.s.enable) - map_high("GRU", gru.s.base, shift, map_wb); + map_high("GRU", gru.s.base, shift, max_pnode, map_wb); } static __init void map_config_high(int max_pnode) @@ -317,7 +318,7 @@ static __init void map_config_high(int max_pnode) cfg.v = uv_read_local_mmr(UVH_RH_GAM_CFG_OVERLAY_CONFIG_MMR); if (cfg.s.enable) - map_high("CONFIG", cfg.s.base, shift, map_uc); + map_high("CONFIG", cfg.s.base, shift, max_pnode, map_uc); } static __init void map_mmr_high(int max_pnode) @@ -327,7 +328,7 @@ static __init void map_mmr_high(int max_pnode) mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); if (mmr.s.enable) - map_high("MMR", mmr.s.base, shift, map_uc); + map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); } static __init void map_mmioh_high(int max_pnode) @@ -337,7 +338,7 @@ static __init void map_mmioh_high(int max_pnode) mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); if (mmioh.s.enable) - map_high("MMIOH", mmioh.s.base, shift, map_uc); + map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); } static __init void uv_rtc_init(void) -- cgit v1.2.2 From 69d45dd1c3bb512a9f5f9c464ac625eb707669ec Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Sun, 28 Sep 2008 21:28:15 +0200 Subject: x86: merge winchip-2 and winchip-2a cpu choices The Winchip-2 and Winchip-2A cpu choices select the same options for kernel and compiler. Merge them to save few bytes and reduce confusion. Signed-off-by: Krzysztof Helt Acked-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 27 +++++++++------------------ arch/x86/Makefile_32.cpu | 1 - arch/x86/configs/i386_defconfig | 1 - arch/x86/configs/x86_64_defconfig | 1 - 4 files changed, 9 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index c5f101360520..f4bdc0492d38 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -38,8 +38,7 @@ config M386 - "Crusoe" for the Transmeta Crusoe series. - "Efficeon" for the Transmeta Efficeon series. - "Winchip-C6" for original IDT Winchip. - - "Winchip-2" for IDT Winchip 2. - - "Winchip-2A" for IDT Winchips with 3dNow! capabilities. + - "Winchip-2" for IDT Winchips with 3dNow! capabilities. - "GeodeGX1" for Geode GX1 (Cyrix MediaGX). - "Geode GX/LX" For AMD Geode GX and LX processors. - "CyrixIII/VIA C3" for VIA Cyrix III or VIA C3. @@ -194,19 +193,11 @@ config MWINCHIPC6 treat this chip as a 586TSC with some extended instructions and alignment requirements. -config MWINCHIP2 - bool "Winchip-2" - depends on X86_32 - help - Select this for an IDT Winchip-2. Linux and GCC - treat this chip as a 586TSC with some extended instructions - and alignment requirements. - config MWINCHIP3D - bool "Winchip-2A/Winchip-3" + bool "Winchip-2/Winchip-2A/Winchip-3" depends on X86_32 help - Select this for an IDT Winchip-2A or 3. Linux and GCC + Select this for an IDT Winchip-2, 2A or 3. Linux and GCC treat this chip as a 586TSC with some extended instructions and alignment requirements. Also enable out of order memory stores for this CPU, which can increase performance of some @@ -318,7 +309,7 @@ config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || X86_GENERIC || GENERIC_CPU || MPSC default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MVIAC7 config X86_XADD @@ -360,7 +351,7 @@ config X86_POPAD_OK config X86_ALIGNMENT_16 def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 config X86_INTEL_USERCOPY def_bool y @@ -368,7 +359,7 @@ config X86_INTEL_USERCOPY config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 + depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 config X86_USE_3DNOW def_bool y @@ -376,7 +367,7 @@ config X86_USE_3DNOW config X86_OOSTORE def_bool y - depends on (MWINCHIP3D || MWINCHIP2 || MWINCHIPC6) && MTRR + depends on (MWINCHIP3D || MWINCHIPC6) && MTRR # # P6_NOPs are a relatively minor optimization that require a family >= @@ -396,7 +387,7 @@ config X86_P6_NOP config X86_TSC def_bool y - depends on ((MWINCHIP3D || MWINCHIP2 || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 + depends on ((MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2) && !X86_NUMAQ) || X86_64 config X86_CMPXCHG64 def_bool y @@ -417,7 +408,7 @@ config X86_MINIMUM_CPU_FAMILY config X86_DEBUGCTLMSR def_bool y - depends on !(MK6 || MWINCHIPC6 || MWINCHIP2 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) + depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) menuconfig PROCESSOR_SELECT bool "Supported processor vendors" if EMBEDDED diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu index b72b4f753113..80177ec052f0 100644 --- a/arch/x86/Makefile_32.cpu +++ b/arch/x86/Makefile_32.cpu @@ -28,7 +28,6 @@ cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -cflags-$(CONFIG_MWINCHIP2) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MWINCHIP3D) += $(call cc-option,-march=winchip2,-march=i586) cflags-$(CONFIG_MCYRIXIII) += $(call cc-option,-march=c3,-march=i486) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index ca226ca31288..52d0359719d7 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -213,7 +213,6 @@ CONFIG_M686=y # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 2c4b1c771e28..f0a03d7a7d63 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -210,7 +210,6 @@ CONFIG_X86_PC=y # CONFIG_MCRUSOE is not set # CONFIG_MEFFICEON is not set # CONFIG_MWINCHIPC6 is not set -# CONFIG_MWINCHIP2 is not set # CONFIG_MWINCHIP3D is not set # CONFIG_MGEODEGX1 is not set # CONFIG_MGEODE_LX is not set -- cgit v1.2.2 From 14adf855baefad5ac3b545be23a64e6b61d6b74a Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Mon, 29 Sep 2008 18:29:42 -0400 Subject: x86: move prefill_possible_map calling early, fix, V2 Commit 4a701737 ("x86: move prefill_possible_map calling early, fix") is the wrong fix: prefill_possible_map() needs to be available even when CONFIG_HOTPLUG_CPU is not set. A followon patch will do that. Fix this correctly by making prefill_possible_map() available even when CONFIG_HOTPLUG_CPU is not set. The function is needed so that the number of possible CPUs can be determined. Tested on uniprocessor machine with CPU hotplug disabled. From boot log: Before: NR_CPUS: 512, nr_cpu_ids: 512, nr_node_ids 1 After: NR_CPUS: 512, nr_cpu_ids: 1, nr_node_ids 1 Signed-off-by: Chuck Ebbert Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 62 +++++++++++++++++++++++------------------------ 1 file changed, 31 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index a778e221ccfb..23913785c262 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,39 +1261,8 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -#ifdef CONFIG_HOTPLUG_CPU - -static void remove_siblinginfo(int cpu) -{ - int sibling; - struct cpuinfo_x86 *c = &cpu_data(cpu); - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { - cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); - /*/ - * last thread sibling in this cpu core going down - */ - if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) - cpu_data(sibling).booted_cores--; - } - - for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) - cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); - cpus_clear(per_cpu(cpu_sibling_map, cpu)); - cpus_clear(per_cpu(cpu_core_map, cpu)); - c->phys_proc_id = 0; - c->cpu_core_id = 0; - cpu_clear(cpu, cpu_sibling_setup_map); -} - static int additional_cpus __initdata = -1; -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1340,6 +1309,37 @@ __init void prefill_possible_map(void) nr_cpu_ids = possible; } +#ifdef CONFIG_HOTPLUG_CPU + +static void remove_siblinginfo(int cpu) +{ + int sibling; + struct cpuinfo_x86 *c = &cpu_data(cpu); + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_core_map, cpu)) { + cpu_clear(cpu, per_cpu(cpu_core_map, sibling)); + /*/ + * last thread sibling in this cpu core going down + */ + if (cpus_weight(per_cpu(cpu_sibling_map, cpu)) == 1) + cpu_data(sibling).booted_cores--; + } + + for_each_cpu_mask_nr(sibling, per_cpu(cpu_sibling_map, cpu)) + cpu_clear(cpu, per_cpu(cpu_sibling_map, sibling)); + cpus_clear(per_cpu(cpu_sibling_map, cpu)); + cpus_clear(per_cpu(cpu_core_map, cpu)); + c->phys_proc_id = 0; + c->cpu_core_id = 0; + cpu_clear(cpu, cpu_sibling_setup_map); +} + +static __init int setup_additional_cpus(char *s) +{ + return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; +} +early_param("additional_cpus", setup_additional_cpus); + static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); -- cgit v1.2.2 From 1e0b5d00b230ceffe1bb33284b46b8572e418423 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Mon, 29 Sep 2008 08:45:29 -0500 Subject: x86, UV: new UV genapic functions for x2apic Add functions that use the infrastructure added by the x2apic code. These functions were originally stubbed out since the UV code went into the tree prior to the x2apic code. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 8cf4ec2a37d9..33581d94a90e 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -114,7 +114,7 @@ static void uv_send_IPI_one(int cpu, int vector) unsigned long val, apicid, lapicid; int pnode; - apicid = per_cpu(x86_cpu_to_apicid, cpu); /* ZZZ - cache node-local ? */ + apicid = per_cpu(x86_cpu_to_apicid, cpu); lapicid = apicid & 0x3f; /* ZZZ macro needed */ pnode = uv_apicid_to_pnode(apicid); val = @@ -202,12 +202,10 @@ static unsigned int phys_pkg_id(int index_msb) return uv_read_apic_id() >> index_msb; } -#ifdef ZZZ /* Needs x2apic patch */ static void uv_send_IPI_self(int vector) { apic_write(APIC_SELF_IPI, vector); } -#endif struct genapic apic_x2apic_uv_x = { .name = "UV large system", @@ -215,15 +213,15 @@ struct genapic apic_x2apic_uv_x = { .int_delivery_mode = dest_Fixed, .int_dest_mode = (APIC_DEST_PHYSICAL != 0), .target_cpus = uv_target_cpus, - .vector_allocation_domain = uv_vector_allocation_domain,/* Fixme ZZZ */ + .vector_allocation_domain = uv_vector_allocation_domain, .apic_id_registered = uv_apic_id_registered, .init_apic_ldr = uv_init_apic_ldr, .send_IPI_all = uv_send_IPI_all, .send_IPI_allbutself = uv_send_IPI_allbutself, .send_IPI_mask = uv_send_IPI_mask, - /* ZZZ.send_IPI_self = uv_send_IPI_self, */ + .send_IPI_self = uv_send_IPI_self, .cpu_mask_to_apicid = uv_cpu_mask_to_apicid, - .phys_pkg_id = phys_pkg_id, /* Fixme ZZZ */ + .phys_pkg_id = phys_pkg_id, .get_apic_id = get_apic_id, .set_apic_id = set_apic_id, .apic_id_mask = (0xFFFFFFFFu), -- cgit v1.2.2 From e2ce07c8042975e52df4cec1f41faf15b83f2e42 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 3 Apr 2008 16:40:48 +0300 Subject: x86: __show_registers() and __show_regs() API unification Currently the low-level function to dump user-passed registers on i386 is called __show_registers() whereas on x86-64 it's called __show_regs(). Unify the API to simplify porting of kmemcheck to x86-64. Signed-off-by: Pekka Enberg Acked-by: Vegard Nossum Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 7 +++++-- arch/x86/kernel/traps_32.c | 2 +- arch/x86/kernel/traps_64.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 922c14058f97..0a1302fe6d45 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -123,7 +123,7 @@ void cpu_idle(void) } } -void __show_registers(struct pt_regs *regs, int all) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; unsigned long d0, d1, d2, d3, d6, d7; @@ -189,7 +189,7 @@ void __show_registers(struct pt_regs *regs, int all) void show_regs(struct pt_regs *regs) { - __show_registers(regs, 1); + __show_regs(regs, 1); show_trace(NULL, regs, ®s->sp, regs->bp); } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ca80394ef5b8..cd8c0ed02b7e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -136,7 +136,7 @@ void cpu_idle(void) } /* Prints also some state that isn't saved in the pt_regs */ -void __show_regs(struct pt_regs *regs) +void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; unsigned long d0, d1, d2, d3, d6, d7; @@ -175,6 +175,9 @@ void __show_regs(struct pt_regs *regs) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + if (!all) + return; + cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); @@ -200,7 +203,7 @@ void __show_regs(struct pt_regs *regs) void show_regs(struct pt_regs *regs) { printk(KERN_INFO "CPU %d:", smp_processor_id()); - __show_regs(regs); + __show_regs(regs, 1); show_trace(NULL, regs, (void *)(regs + 1), regs->bp); } diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index bf4d71231d40..4f0831593e32 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -319,7 +319,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_registers(regs, 0); + __show_regs(regs, 0); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", TASK_COMM_LEN, current->comm, task_pid_nr(current), diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 1efd1ea64663..729157ee4c17 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -430,7 +430,7 @@ void show_registers(struct pt_regs *regs) sp = regs->sp; printk("CPU %d ", cpu); - __show_regs(regs); + __show_regs(regs, 1); printk("Process %s (pid: %d, threadinfo %p, task %p)\n", cur->comm, cur->pid, task_thread_info(cur), cur); -- cgit v1.2.2 From 94f6bac1058fd59a8bd472d18c4b77f220d930b0 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Tue, 30 Sep 2008 23:17:51 +0200 Subject: x86: do not allow to optimize flag_is_changeable_p() (rev. 2) The flag_is_changeable_p() is used by has_cpuid_p() which can return different results in the code sequence below: if (!have_cpuid_p()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ if (!have_cpuid_p()) return; Otherwise, the gcc 3.4.6 optimizes these two calls into one which make the code not working correctly. Cyrix cpus have the CPUID instruction enabled before the second call to the have_cpuid_p() but it is not detected due to the gcc optimization. Thus the ARR registers (mtrr like) are not detected on such a cpu. Signed-off-by: Krzysztof Helt Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f1af71851919..25581dcb280e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -124,18 +124,25 @@ static inline int flag_is_changeable_p(u32 flag) { u32 f1, f2; - asm("pushfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "movl %0,%1\n\t" - "xorl %2,%0\n\t" - "pushl %0\n\t" - "popfl\n\t" - "pushfl\n\t" - "popl %0\n\t" - "popfl\n\t" - : "=&r" (f1), "=&r" (f2) - : "ir" (flag)); + /* + * Cyrix and IDT cpus allow disabling of CPUID + * so the code below may return different results + * when it is executed before and after enabling + * the CPUID. Add "volatile" to not allow gcc to + * optimize the subsequent calls to this function. + */ + asm volatile ("pushfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "movl %0,%1\n\t" + "xorl %2,%0\n\t" + "pushl %0\n\t" + "popfl\n\t" + "pushfl\n\t" + "popl %0\n\t" + "popfl\n\t" + : "=&r" (f1), "=&r" (f2) + : "ir" (flag)); return ((f1^f2) & flag) != 0; } -- cgit v1.2.2 From 7f2f49a58283110083a7358d2d98025a11653373 Mon Sep 17 00:00:00 2001 From: Chuck Ebbert Date: Thu, 2 Oct 2008 15:30:07 -0400 Subject: x86: allow number of additional hotplug CPUs to be set at compile time, V2 x86: allow number of additional hotplug CPUs to be set at compile time, V2 The default number of additional CPU IDs for hotplugging is determined by asking ACPI or mptables how many "disabled" CPUs there are in the system, but many systems get this wrong so that e.g. a uniprocessor machine gets an extra CPU allocated and never switches to single CPU mode. And sometimes CPU hotplugging is enabled only for suspend/hibernate anyway, so the additional CPU IDs are not wanted. Allow the number to be set to zero at compile time. Also, force the number of extra CPUs to zero if hotplugging is disabled which allows removing some conditional code. Tested on uniprocessor x86_64 that ACPI claims has a disabled processor, with CPU hotplugging configured. ("After" has the number of additional CPUs set to 0) Before: NR_CPUS: 512, nr_cpu_ids: 2, nr_node_ids 1 After: NR_CPUS: 512, nr_cpu_ids: 1, nr_node_ids 1 [Changed the name of the option and the prompt according to Ingo's suggestion.] Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 18 ++++++++++++++++++ arch/x86/kernel/smpboot.c | 2 +- 2 files changed, 19 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc8351f374fd..adaca6fa927d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1438,6 +1438,24 @@ config HOTPLUG_CPU automatically on SMP systems. ) Say N if you want to disable CPU hotplug. +config HOTPLUG_RESTRICT_TO_BOOTUP_CPUS + def_bool n + prompt "Restrict CPU hotplugging to processors found during boot" if HOTPLUG_CPU + ---help--- + Say no here to use the default, which allows as many CPUs as are marked + "disabled" by ACPI or MPTABLES to be hotplugged after bootup. + + Say yes if you do not want to allow CPUs to be added after booting, for + example if you only need CPU hotplugging enabled for suspend/resume. + + If CPU_HOTPLUG is enabled this value may be overridden at boot time + with the "additional_cpus" kernel parameter. + +config HOTPLUG_ADDITIONAL_CPUS + int + default 0 if !HOTPLUG_CPU || HOTPLUG_RESTRICT_TO_BOOTUP_CPUS + default -1 + config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 23913785c262..857a88bb9195 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,7 +1261,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus __initdata = -1; +static int additional_cpus __initdata = CONFIG_HOTPLUG_ADDITIONAL_CPUS; /* * cpu_possible_map should be static, it cannot change as cpu's -- cgit v1.2.2 From af5c2bd16ac2e5688c3bf46ea1f95112d696d294 Mon Sep 17 00:00:00 2001 From: Vegard Nossum Date: Fri, 3 Oct 2008 17:54:25 +0200 Subject: x86: fix virt_addr_valid() with CONFIG_DEBUG_VIRTUAL=y, v2 virt_addr_valid() calls __pa(), which calls __phys_addr(). With CONFIG_DEBUG_VIRTUAL=y, __phys_addr() will kill the kernel if the address *isn't* valid. That's clearly wrong for virt_addr_valid(). We also incorporate the debugging checks into virt_addr_valid(). Signed-off-by: Vegard Nossum Acked-by: Jiri Slaby Signed-off-by: Ingo Molnar --- arch/x86/kernel/doublefault_32.c | 2 +- arch/x86/mm/ioremap.c | 36 ++++++++++++++++++++++++++++++++++-- 2 files changed, 35 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/doublefault_32.c b/arch/x86/kernel/doublefault_32.c index 395acb12b0d1..b4f14c6c09d9 100644 --- a/arch/x86/kernel/doublefault_32.c +++ b/arch/x86/kernel/doublefault_32.c @@ -66,6 +66,6 @@ struct tss_struct doublefault_tss __cacheline_aligned = { .ds = __USER_DS, .fs = __KERNEL_PERCPU, - .__cr3 = __phys_addr_const((unsigned long)swapper_pg_dir) + .__cr3 = __pa_nodebug(swapper_pg_dir), } }; diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 7fb737c6b54f..8876220d906b 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -45,6 +45,27 @@ unsigned long __phys_addr(unsigned long x) } EXPORT_SYMBOL(__phys_addr); +bool __virt_addr_valid(unsigned long x) +{ + if (x >= __START_KERNEL_map) { + x -= __START_KERNEL_map; + if (x >= KERNEL_IMAGE_SIZE) + return false; + x += phys_base; + } else { + if (x < PAGE_OFFSET) + return false; + x -= PAGE_OFFSET; + if (system_state == SYSTEM_BOOTING ? + x > MAXMEM : !phys_addr_valid(x)) { + return false; + } + } + + return pfn_valid(x >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #else static inline int phys_addr_valid(unsigned long addr) @@ -56,13 +77,24 @@ static inline int phys_addr_valid(unsigned long addr) unsigned long __phys_addr(unsigned long x) { /* VMALLOC_* aren't constants; not available at the boot time */ - VIRTUAL_BUG_ON(x < PAGE_OFFSET || (system_state != SYSTEM_BOOTING && - is_vmalloc_addr((void *)x))); + VIRTUAL_BUG_ON(x < PAGE_OFFSET); + VIRTUAL_BUG_ON(system_state != SYSTEM_BOOTING && + is_vmalloc_addr((void *) x)); return x - PAGE_OFFSET; } EXPORT_SYMBOL(__phys_addr); #endif +bool __virt_addr_valid(unsigned long x) +{ + if (x < PAGE_OFFSET) + return false; + if (system_state != SYSTEM_BOOTING && is_vmalloc_addr((void *) x)) + return false; + return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); +} +EXPORT_SYMBOL(__virt_addr_valid); + #endif int page_is_ram(unsigned long pagenr) -- cgit v1.2.2 From 2bc5f927d489f9e47b6fa71f323b653e8ec81782 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 13:12:14 +0200 Subject: i386: split out dumpstack code from traps_32.c The dumpstack code is logically quite independent from the hardware traps. Split it out into its own file. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/dumpstack_32.c | 422 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_32.c | 404 --------------------------------------- 3 files changed, 423 insertions(+), 405 deletions(-) create mode 100644 arch/x86/kernel/dumpstack_32.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5098585f87ce..d5bde5d8c2b6 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,7 +28,7 @@ obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o -obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o dumpstack_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c new file mode 100644 index 000000000000..c398b27df6cd --- /dev/null +++ b/arch/x86/kernel/dumpstack_32.c @@ -0,0 +1,422 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 24; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ +#ifdef CONFIG_KALLSYMS + unsigned long offset = 0; + unsigned long symsize; + const char *symname; + char *modname; + char *delim = ":"; + char namebuf[KSYM_NAME_LEN]; + char reliab[4] = ""; + + symname = kallsyms_lookup(address, &symsize, &offset, + &modname, namebuf); + if (!symname) { + printk(" [<%08lx>]\n", address); + return; + } + if (!reliable) + strcpy(reliab, "? "); + + if (!modname) + modname = delim = ""; + printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", + address, reliab, delim, modname, delim, symname, offset, symsize); +#else + printk(" [<%08lx>]\n", address); +#endif +} + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size) +{ + void *t = tinfo; + return p > t && p <= t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 4) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + asm("movl %%ebp, %0" : "=r" (bp) :); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + for (;;) { + struct thread_info *context; + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + bp = print_context_stack(context, stack, bp, ops, data); + /* + * Should be after the line below, but somewhere + * in early boot context comes out corrupted and we + * can't reference it: + */ + if (ops->stack(data, "IRQ") < 0) + break; + stack = (unsigned long *)context->previous_esp; + if (!stack) + break; + touch_nmi_watchdog(); + } +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + printk(data); + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s%s\n", (char *)data, msg); +} + +static int print_trace_stack(void *data, char *name) +{ + return 0; +} + +/* + * Print one address/symbol entries per line. + */ +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + printk("%s [<%08lx>] ", (char *)data, addr); + if (!reliable) + printk("? "); + print_symbol("%s\n", addr); + touch_nmi_watchdog(); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + printk("%s =======================\n", log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (kstack_end(stack)) + break; + if (i && ((i % 8) == 0)) + printk("\n%s ", log_lvl); + printk("%08lx ", *stack++); + } + printk("\n%sCall Trace:\n", log_lvl); + + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + printk(" "); + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movl %%ebp, %0" : "=r" (bp):); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + + show_trace(current, NULL, &stack, bp); +} + +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + + print_modules(); + __show_regs(regs, 0); + + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + TASK_COMM_LEN, current->comm, task_pid_nr(current), + current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode_vm(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk("\n" KERN_EMERG "Stack: "); + show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at EIP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad EIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (ip < PAGE_OFFSET) + return 0; + if (probe_kernel_address((unsigned short *)ip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + unsigned long flags; + + oops_enter(); + + if (die_owner != raw_smp_processor_id()) { + console_verbose(); + raw_local_irq_save(flags); + __raw_spin_lock(&die_lock); + die_owner = smp_processor_id(); + die_nest_count = 0; + bust_spinlocks(1); + } else { + raw_local_irq_save(flags); + } + die_nest_count++; + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + bust_spinlocks(0); + die_owner = -1; + add_taint(TAINT_DIE); + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + + if (!regs) + return; + + if (kexec_should_crash(current)) + crash_kexec(regs); + + if (in_interrupt()) + panic("Fatal exception in interrupt"); + + if (panic_on_oops) + panic("Fatal exception"); + + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + unsigned short ss; + unsigned long sp; + + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + /* Executive summary in case the oops scrolled away */ + sp = (unsigned long) (®s->sp); + savesegment(ss, ss); + if (user_mode(regs)) { + sp = regs->sp; + ss = regs->ss & 0xffff; + } + printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); + print_symbol("%s", regs->ip); + printk(" SS:ESP %04x:%08lx\n", ss, sp); + return 0; +} + +/* + * This is gone through when something in the kernel has done something bad + * and is about to be terminated: + */ +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (die_nest_count < 3) { + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + } else { + printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); + } + + oops_end(flags, regs, SIGSEGV); +} + +static int __init kstack_setup(char *s) +{ + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + + return 1; +} +__setup("kstack=", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 4f0831593e32..ce1063c141fc 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -80,11 +80,7 @@ char ignore_fpu_irq; gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; -static unsigned int code_bytes = 64; static int ignore_nmis; -static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -92,388 +88,6 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } -void printk_address(unsigned long address, int reliable) -{ -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); -#endif -} - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) -{ - void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - for (;;) { - struct thread_info *context; - - context = (struct thread_info *) - ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; - stack = (unsigned long *)context->previous_esp; - if (!stack) - break; - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - printk(data); - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s%s\n", (char *)data, msg); -} - -static int print_trace_stack(void *data, char *name) -{ - return 0; -} - -/* - * Print one address/symbol entries per line. - */ -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); - touch_nmi_watchdog(); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); - } - printk("\n%sCall Trace:\n", log_lvl); - - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - printk(" "); - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - - show_trace(current, NULL, &stack, bp); -} - -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - - print_modules(); - __show_regs(regs, 0); - - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", - TASK_COMM_LEN, current->comm, task_pid_nr(current), - current_thread_info(), current, task_thread_info(current)); - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode_vm(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - unsigned long flags; - - oops_enter(); - - if (die_owner != raw_smp_processor_id()) { - console_verbose(); - raw_local_irq_save(flags); - __raw_spin_lock(&die_lock); - die_owner = smp_processor_id(); - die_nest_count = 0; - bust_spinlocks(1); - } else { - raw_local_irq_save(flags); - } - die_nest_count++; - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - bust_spinlocks(0); - die_owner = -1; - add_taint(TAINT_DIE); - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - - if (!regs) - return; - - if (kexec_should_crash(current)) - crash_kexec(regs); - - if (in_interrupt()) - panic("Fatal exception in interrupt"); - - if (panic_on_oops) - panic("Fatal exception"); - - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - unsigned short ss; - unsigned long sp; - - printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - /* Executive summary in case the oops scrolled away */ - sp = (unsigned long) (®s->sp); - savesegment(ss, ss); - if (user_mode(regs)) { - sp = regs->sp; - ss = regs->ss & 0xffff; - } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); - return 0; -} - -/* - * This is gone through when something in the kernel has done something bad - * and is about to be terminated: - */ -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (die_nest_count < 3) { - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - } else { - printk(KERN_EMERG "Recursive die() failure, output suppressed\n"); - } - - oops_end(flags, regs, SIGSEGV); -} - static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -1256,21 +870,3 @@ void __init trap_init(void) trap_init_hook(); } - -static int __init kstack_setup(char *s) -{ - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; -} -__setup("kstack=", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.2 From 6fcbede3fdfbd83d8de97296286f5a9ff5a8f371 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 13:12:15 +0200 Subject: x86_64: split out dumpstack code from traps_64.c The dumpstack code is logically quite independent from the hardware traps. Split it out into its own file. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/dumpstack_64.c | 565 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_64.c | 545 --------------------------------------- 3 files changed, 567 insertions(+), 547 deletions(-) create mode 100644 arch/x86/kernel/dumpstack_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d5bde5d8c2b6..de63fed9fae8 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,12 +23,12 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o +obj-y += traps_$(BITS).o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o obj-$(CONFIG_X86_32) += probe_roms_32.o -obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o dumpstack_32.o +obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c new file mode 100644 index 000000000000..6f1505074db0 --- /dev/null +++ b/arch/x86/kernel/dumpstack_64.c @@ -0,0 +1,565 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +int panic_on_unrecovered_nmi; +int kstack_depth_to_print = 12; +static unsigned int code_bytes = 64; +static int die_counter; + +void printk_address(unsigned long address, int reliable) +{ + printk(" [<%016lx>] %s%pS\n", + address, reliable ? "" : "? ", (void *) address); +} + +static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, + unsigned *usedp, char **idp) +{ + static char ids[][8] = { + [DEBUG_STACK - 1] = "#DB", + [NMI_STACK - 1] = "NMI", + [DOUBLEFAULT_STACK - 1] = "#DF", + [STACKFAULT_STACK - 1] = "#SS", + [MCE_STACK - 1] = "#MC", +#if DEBUG_STKSZ > EXCEPTION_STKSZ + [N_EXCEPTION_STACKS ... + N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" +#endif + }; + unsigned k; + + /* + * Iterate over all exception stacks, and figure out whether + * 'stack' is in one of them: + */ + for (k = 0; k < N_EXCEPTION_STACKS; k++) { + unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + /* + * Is 'stack' above this exception frame's end? + * If yes then skip to the next frame. + */ + if (stack >= end) + continue; + /* + * Is 'stack' above this exception frame's start address? + * If yes then we found the right frame. + */ + if (stack >= end - EXCEPTION_STKSZ) { + /* + * Make sure we only iterate through an exception + * stack once. If it comes up for the second time + * then there's something wrong going on - just + * break out and return NULL: + */ + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; + *idp = ids[k]; + return (unsigned long *)end; + } + /* + * If this is a debug stack, and if it has a larger size than + * the usual exception stacks, then 'stack' might still + * be within the lower portion of the debug stack: + */ +#if DEBUG_STKSZ > EXCEPTION_STKSZ + if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { + unsigned j = N_EXCEPTION_STACKS - 1; + + /* + * Black magic. A large debug stack is composed of + * multiple exception stack entries, which we + * iterate through now. Dont look: + */ + do { + ++j; + end -= EXCEPTION_STKSZ; + ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); + } while (stack < end - EXCEPTION_STKSZ); + if (*usedp & (1U << j)) + break; + *usedp |= 1U << j; + *idp = ids[j]; + return (unsigned long *)end; + } +#endif + } + return NULL; +} + +/* + * x86-64 can have up to three kernel stacks: + * process stack + * interrupt stack + * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack + */ + +static inline int valid_stack_ptr(struct thread_info *tinfo, + void *p, unsigned int size, void *end) +{ + void *t = tinfo; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; +} + +/* The form of the top of the frame on the stack */ +struct stack_frame { + struct stack_frame *next_frame; + unsigned long return_address; +}; + +static inline unsigned long +print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data, + unsigned long *end) +{ + struct stack_frame *frame = (struct stack_frame *)bp; + + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { + unsigned long addr; + + addr = *stack; + if (__kernel_text_address(addr)) { + if ((unsigned long) stack == bp + 8) { + ops->address(data, addr, 1); + frame = frame->next_frame; + bp = (unsigned long) frame; + } else { + ops->address(data, addr, bp == 0); + } + } + stack++; + } + return bp; +} + +void dump_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, + const struct stacktrace_ops *ops, void *data) +{ + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + unsigned used = 0; + struct thread_info *tinfo; + + if (!task) + task = current; + + if (!stack) { + unsigned long dummy; + stack = &dummy; + if (task && task != current) + stack = (unsigned long *)task->thread.sp; + } + +#ifdef CONFIG_FRAME_POINTER + if (!bp) { + if (task == current) { + /* Grab bp right from our regs */ + asm("movq %%rbp, %0" : "=r" (bp) : ); + } else { + /* bp is the last reg pushed by switch_to */ + bp = *(unsigned long *) task->thread.sp; + } + } +#endif + + /* + * Print function call entries in all stacks, starting at the + * current stack address. If the stacks consist of nested + * exceptions + */ + tinfo = task_thread_info(task); + for (;;) { + char *id; + unsigned long *estack_end; + estack_end = in_exception_stack(cpu, (unsigned long)stack, + &used, &id); + + if (estack_end) { + if (ops->stack(data, id) < 0) + break; + + bp = print_context_stack(tinfo, stack, bp, ops, + data, estack_end); + ops->stack(data, ""); + /* + * We link to the next stack via the + * second-to-last pointer (index -2 to end) in the + * exception stack: + */ + stack = (unsigned long *) estack_end[-2]; + continue; + } + if (irqstack_end) { + unsigned long *irqstack; + irqstack = irqstack_end - + (IRQSTACKSIZE - 64) / sizeof(*irqstack); + + if (stack >= irqstack && stack < irqstack_end) { + if (ops->stack(data, "IRQ") < 0) + break; + bp = print_context_stack(tinfo, stack, bp, + ops, data, irqstack_end); + /* + * We link to the next stack (which would be + * the process stack normally) the last + * pointer (index -1 to end) in the IRQ stack: + */ + stack = (unsigned long *) (irqstack_end[-1]); + irqstack_end = NULL; + ops->stack(data, "EOI"); + continue; + } + } + break; + } + + /* + * This handles the process stack: + */ + bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); + put_cpu(); +} +EXPORT_SYMBOL(dump_trace); + +static void +print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + print_symbol(msg, symbol); + printk("\n"); +} + +static void print_trace_warning(void *data, char *msg) +{ + printk("%s\n", msg); +} + +static int print_trace_stack(void *data, char *name) +{ + printk(" <%s> ", name); + return 0; +} + +static void print_trace_address(void *data, unsigned long addr, int reliable) +{ + touch_nmi_watchdog(); + printk_address(addr, reliable); +} + +static const struct stacktrace_ops print_trace_ops = { + .warning = print_trace_warning, + .warning_symbol = print_trace_warning_symbol, + .stack = print_trace_stack, + .address = print_trace_address, +}; + +static void +show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp, char *log_lvl) +{ + printk("Call Trace:\n"); + dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); +} + +void show_trace(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, unsigned long bp) +{ + show_trace_log_lvl(task, regs, stack, bp, ""); +} + +static void +show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, unsigned long bp, char *log_lvl) +{ + unsigned long *stack; + int i; + const int cpu = smp_processor_id(); + unsigned long *irqstack_end = + (unsigned long *) (cpu_pda(cpu)->irqstackptr); + unsigned long *irqstack = + (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); + + /* + * debugging aid: "show_stack(NULL, NULL);" prints the + * back trace for this cpu. + */ + + if (sp == NULL) { + if (task) + sp = (unsigned long *)task->thread.sp; + else + sp = (unsigned long *)&sp; + } + + stack = sp; + for (i = 0; i < kstack_depth_to_print; i++) { + if (stack >= irqstack && stack <= irqstack_end) { + if (stack == irqstack_end) { + stack = (unsigned long *) (irqstack_end[-1]); + printk(" "); + } + } else { + if (((long) stack & (THREAD_SIZE-1)) == 0) + break; + } + if (i && ((i % 4) == 0)) + printk("\n"); + printk(" %016lx", *stack++); + touch_nmi_watchdog(); + } + printk("\n"); + show_trace_log_lvl(task, regs, sp, bp, log_lvl); +} + +void show_stack(struct task_struct *task, unsigned long *sp) +{ + show_stack_log_lvl(task, NULL, sp, 0, ""); +} + +/* + * The architecture-independent dump_stack generator + */ +void dump_stack(void) +{ + unsigned long bp = 0; + unsigned long stack; + +#ifdef CONFIG_FRAME_POINTER + if (!bp) + asm("movq %%rbp, %0" : "=r" (bp) : ); +#endif + + printk("Pid: %d, comm: %.20s %s %s %.*s\n", + current->pid, current->comm, print_tainted(), + init_utsname()->release, + (int)strcspn(init_utsname()->version, " "), + init_utsname()->version); + show_trace(NULL, NULL, &stack, bp); +} +EXPORT_SYMBOL(dump_stack); + +void show_registers(struct pt_regs *regs) +{ + int i; + unsigned long sp; + const int cpu = smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + + sp = regs->sp; + printk("CPU %d ", cpu); + __show_regs(regs, 1); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", + cur->comm, cur->pid, task_thread_info(cur), cur); + + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. + */ + if (!user_mode(regs)) { + unsigned int code_prologue = code_bytes * 43 / 64; + unsigned int code_len = code_bytes; + unsigned char c; + u8 *ip; + + printk("Stack: "); + show_stack_log_lvl(NULL, regs, (unsigned long *)sp, + regs->bp, ""); + + printk(KERN_EMERG "Code: "); + + ip = (u8 *)regs->ip - code_prologue; + if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { + /* try starting at RIP */ + ip = (u8 *)regs->ip; + code_len = code_len - code_prologue + 1; + } + for (i = 0; i < code_len; i++, ip++) { + if (ip < (u8 *)PAGE_OFFSET || + probe_kernel_address(ip, c)) { + printk(" Bad RIP value."); + break; + } + if (ip == (u8 *)regs->ip) + printk("<%02x> ", c); + else + printk("%02x ", c); + } + } + printk("\n"); +} + +int is_valid_bugaddr(unsigned long ip) +{ + unsigned short ud2; + + if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) + return 0; + + return ud2 == 0x0b0f; +} + +static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; +static int die_owner = -1; +static unsigned int die_nest_count; + +unsigned __kprobes long oops_begin(void) +{ + int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + raw_local_irq_save(flags); + cpu = smp_processor_id(); + if (!__raw_spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. should stop eventually */; + else + __raw_spin_lock(&die_lock); + } + die_nest_count++; + die_owner = cpu; + console_verbose(); + bust_spinlocks(1); + return flags; +} + +void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) +{ + die_owner = -1; + bust_spinlocks(0); + die_nest_count--; + if (!die_nest_count) + /* Nest count reaches zero, release the lock. */ + __raw_spin_unlock(&die_lock); + raw_local_irq_restore(flags); + if (!regs) { + oops_exit(); + return; + } + if (in_interrupt()) + panic("Fatal exception in interrupt"); + if (panic_on_oops) + panic("Fatal exception"); + oops_exit(); + do_exit(signr); +} + +int __kprobes __die(const char *str, struct pt_regs *regs, long err) +{ + printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); +#ifdef CONFIG_PREEMPT + printk("PREEMPT "); +#endif +#ifdef CONFIG_SMP + printk("SMP "); +#endif +#ifdef CONFIG_DEBUG_PAGEALLOC + printk("DEBUG_PAGEALLOC"); +#endif + printk("\n"); + if (notify_die(DIE_OOPS, str, regs, err, + current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) + return 1; + + show_registers(regs); + add_taint(TAINT_DIE); + /* Executive summary in case the oops scrolled away */ + printk(KERN_ALERT "RIP "); + printk_address(regs->ip, 1); + printk(" RSP <%016lx>\n", regs->sp); + if (kexec_should_crash(current)) + crash_kexec(regs); + return 0; +} + +void die(const char *str, struct pt_regs *regs, long err) +{ + unsigned long flags = oops_begin(); + + if (!user_mode(regs)) + report_bug(regs->ip, regs); + + if (__die(str, regs, err)) + regs = NULL; + oops_end(flags, regs, SIGSEGV); +} + +notrace __kprobes void +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + unsigned long flags; + + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + flags = oops_begin(); + /* + * We are in trouble anyway, lets at least try + * to get a message out. + */ + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (kexec_should_crash(current)) + crash_kexec(regs); + if (do_panic || panic_on_oops) + panic("Non maskable interrupt"); + oops_end(flags, NULL, SIGBUS); + nmi_exit(); + local_irq_enable(); + do_exit(SIGBUS); +} + +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + +static int __init kstack_setup(char *s) +{ + if (!s) + return -EINVAL; + kstack_depth_to_print = simple_strtoul(s, NULL, 0); + return 0; +} +early_param("kstack", kstack_setup); + +static int __init code_bytes_setup(char *s) +{ + code_bytes = simple_strtoul(s, NULL, 0); + if (code_bytes > 8192) + code_bytes = 8192; + + return 1; +} +__setup("code_bytes=", code_bytes_setup); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 729157ee4c17..1cd61ddd90be 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -54,11 +54,7 @@ #include -int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; -static unsigned int code_bytes = 64; static int ignore_nmis; -static int die_counter; static inline void conditional_sti(struct pt_regs *regs) { @@ -82,518 +78,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -void printk_address(unsigned long address, int reliable) -{ - printk(" [<%016lx>] %s%pS\n", - address, reliable ? "" : "? ", (void *) address); -} - -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) -{ - static char ids[][8] = { - [DEBUG_STACK - 1] = "#DB", - [NMI_STACK - 1] = "NMI", - [DOUBLEFAULT_STACK - 1] = "#DF", - [STACKFAULT_STACK - 1] = "#SS", - [MCE_STACK - 1] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" -#endif - }; - unsigned k; - - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ - for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) - continue; - /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: - */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; - - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - ids[j][4] = '1' + (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = ids[j]; - return (unsigned long *)end; - } -#endif - } - return NULL; -} - -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size, void *end) -{ - void *t = tinfo; - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p > t && p < t + THREAD_SIZE - size; -} - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -static inline unsigned long -print_context_stack(struct thread_info *tinfo, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - unsigned long *end) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { - unsigned long addr; - - addr = *stack; - if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { - ops->address(data, addr, 1); - frame = frame->next_frame; - bp = (unsigned long) frame; - } else { - ops->address(data, addr, bp == 0); - } - } - stack++; - } - return bp; -} - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - const unsigned cpu = get_cpu(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; - unsigned used = 0; - struct thread_info *tinfo; - - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - stack = &dummy; - if (task && task != current) - stack = (unsigned long *)task->thread.sp; - } - -#ifdef CONFIG_FRAME_POINTER - if (!bp) { - if (task == current) { - /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) : ); - } else { - /* bp is the last reg pushed by switch_to */ - bp = *(unsigned long *) task->thread.sp; - } - } -#endif - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - tinfo = task_thread_info(task); - for (;;) { - char *id; - unsigned long *estack_end; - estack_end = in_exception_stack(cpu, (unsigned long)stack, - &used, &id); - - if (estack_end) { - if (ops->stack(data, id) < 0) - break; - - bp = print_context_stack(tinfo, stack, bp, ops, - data, estack_end); - ops->stack(data, ""); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) estack_end[-2]; - continue; - } - if (irqstack_end) { - unsigned long *irqstack; - irqstack = irqstack_end - - (IRQSTACKSIZE - 64) / sizeof(*irqstack); - - if (stack >= irqstack && stack < irqstack_end) { - if (ops->stack(data, "IRQ") < 0) - break; - bp = print_context_stack(tinfo, stack, bp, - ops, data, irqstack_end); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (irqstack_end[-1]); - irqstack_end = NULL; - ops->stack(data, "EOI"); - continue; - } - } - break; - } - - /* - * This handles the process stack: - */ - bp = print_context_stack(tinfo, stack, bp, ops, data, NULL); - put_cpu(); -} -EXPORT_SYMBOL(dump_trace); - -static void -print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) -{ - print_symbol(msg, symbol); - printk("\n"); -} - -static void print_trace_warning(void *data, char *msg) -{ - printk("%s\n", msg); -} - -static int print_trace_stack(void *data, char *name) -{ - printk(" <%s> ", name); - return 0; -} - -static void print_trace_address(void *data, unsigned long addr, int reliable) -{ - touch_nmi_watchdog(); - printk_address(addr, reliable); -} - -static const struct stacktrace_ops print_trace_ops = { - .warning = print_trace_warning, - .warning_symbol = print_trace_warning_symbol, - .stack = print_trace_stack, - .address = print_trace_address, -}; - -static void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("Call Trace:\n"); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); -} - -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - -static void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) -{ - unsigned long *stack; - int i; - const int cpu = smp_processor_id(); - unsigned long *irqstack_end = - (unsigned long *) (cpu_pda(cpu)->irqstackptr); - unsigned long *irqstack = - (unsigned long *) (cpu_pda(cpu)->irqstackptr - IRQSTACKSIZE); - - /* - * debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu. - */ - - if (sp == NULL) { - if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (stack >= irqstack && stack <= irqstack_end) { - if (stack == irqstack_end) { - stack = (unsigned long *) (irqstack_end[-1]); - printk(" "); - } - } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) - break; - } - if (i && ((i % 4) == 0)) - printk("\n"); - printk(" %016lx", *stack++); - touch_nmi_watchdog(); - } - printk("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); -} - -void show_stack(struct task_struct *task, unsigned long *sp) -{ - show_stack_log_lvl(task, NULL, sp, 0, ""); -} - -/* - * The architecture-independent dump_stack generator - */ -void dump_stack(void) -{ - unsigned long bp = 0; - unsigned long stack; - -#ifdef CONFIG_FRAME_POINTER - if (!bp) - asm("movq %%rbp, %0" : "=r" (bp) : ); -#endif - - printk("Pid: %d, comm: %.20s %s %s %.*s\n", - current->pid, current->comm, print_tainted(), - init_utsname()->release, - (int)strcspn(init_utsname()->version, " "), - init_utsname()->version); - show_trace(NULL, NULL, &stack, bp); -} -EXPORT_SYMBOL(dump_stack); - -void show_registers(struct pt_regs *regs) -{ - int i; - unsigned long sp; - const int cpu = smp_processor_id(); - struct task_struct *cur = cpu_pda(cpu)->pcurrent; - - sp = regs->sp; - printk("CPU %d ", cpu); - __show_regs(regs, 1); - printk("Process %s (pid: %d, threadinfo %p, task %p)\n", - cur->comm, cur->pid, task_thread_info(cur), cur); - - /* - * When in-kernel, we also print out the stack and code at the - * time of the fault.. - */ - if (!user_mode(regs)) { - unsigned int code_prologue = code_bytes * 43 / 64; - unsigned int code_len = code_bytes; - unsigned char c; - u8 *ip; - - printk("Stack: "); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); - - printk(KERN_EMERG "Code: "); - - ip = (u8 *)regs->ip - code_prologue; - if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ - ip = (u8 *)regs->ip; - code_len = code_len - code_prologue + 1; - } - for (i = 0; i < code_len; i++, ip++) { - if (ip < (u8 *)PAGE_OFFSET || - probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); - break; - } - if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); - else - printk("%02x ", c); - } - } - printk("\n"); -} - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} - -static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; -static int die_owner = -1; -static unsigned int die_nest_count; - -unsigned __kprobes long oops_begin(void) -{ - int cpu; - unsigned long flags; - - oops_enter(); - - /* racy, but better than risking deadlock. */ - raw_local_irq_save(flags); - cpu = smp_processor_id(); - if (!__raw_spin_trylock(&die_lock)) { - if (cpu == die_owner) - /* nested oops. should stop eventually */; - else - __raw_spin_lock(&die_lock); - } - die_nest_count++; - die_owner = cpu; - console_verbose(); - bust_spinlocks(1); - return flags; -} - -void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) -{ - die_owner = -1; - bust_spinlocks(0); - die_nest_count--; - if (!die_nest_count) - /* Nest count reaches zero, release the lock. */ - __raw_spin_unlock(&die_lock); - raw_local_irq_restore(flags); - if (!regs) { - oops_exit(); - return; - } - if (panic_on_oops) - panic("Fatal exception"); - oops_exit(); - do_exit(signr); -} - -int __kprobes __die(const char *str, struct pt_regs *regs, long err) -{ - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); -#ifdef CONFIG_PREEMPT - printk("PREEMPT "); -#endif -#ifdef CONFIG_SMP - printk("SMP "); -#endif -#ifdef CONFIG_DEBUG_PAGEALLOC - printk("DEBUG_PAGEALLOC"); -#endif - printk("\n"); - if (notify_die(DIE_OOPS, str, regs, err, - current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) - return 1; - - show_registers(regs); - add_taint(TAINT_DIE); - /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip, 1); - printk(" RSP <%016lx>\n", regs->sp); - if (kexec_should_crash(current)) - crash_kexec(regs); - return 0; -} - -void die(const char *str, struct pt_regs *regs, long err) -{ - unsigned long flags = oops_begin(); - - if (!user_mode(regs)) - report_bug(regs->ip, regs); - - if (__die(str, regs, err)) - regs = NULL; - oops_end(flags, regs, SIGSEGV); -} - -notrace __kprobes void -die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - unsigned long flags; - - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - flags = oops_begin(); - /* - * We are in trouble anyway, lets at least try - * to get a message out. - */ - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (kexec_should_crash(current)) - crash_kexec(regs); - if (do_panic || panic_on_oops) - panic("Non maskable interrupt"); - oops_end(flags, NULL, SIGBUS); - nmi_exit(); - local_irq_enable(); - do_exit(SIGBUS); -} - static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -1178,32 +662,3 @@ void __init trap_init(void) */ cpu_init(); } - -static int __init oops_setup(char *s) -{ - if (!s) - return -EINVAL; - if (!strcmp(s, "panic")) - panic_on_oops = 1; - return 0; -} -early_param("oops", oops_setup); - -static int __init kstack_setup(char *s) -{ - if (!s) - return -EINVAL; - kstack_depth_to_print = simple_strtoul(s, NULL, 0); - return 0; -} -early_param("kstack", kstack_setup); - -static int __init code_bytes_setup(char *s) -{ - code_bytes = simple_strtoul(s, NULL, 0); - if (code_bytes > 8192) - code_bytes = 8192; - - return 1; -} -__setup("code_bytes=", code_bytes_setup); -- cgit v1.2.2 From a28680b4b821a262fd3b5e57a28c148b5f9e662a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:34 +0200 Subject: x86, traps: split out math_error and simd_math_error Split out math_error from do_coprocessor_error and simd_math_error from do_simd_coprocessor_error, like on i386. While at it, add the "error_code" parameter to do_coprocessor_error, do_simd_coprocessor_error and do_spurious_interrupt_bug. This does not change the generated code, but brings the declarations in line with all the other trap handlers. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 1cd61ddd90be..b295ebf0cb3f 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -452,18 +452,12 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -asmlinkage void do_coprocessor_error(struct pt_regs *regs) +void math_error(void __user *ip) { - void __user *ip = (void __user *)(regs->ip); struct task_struct *task; siginfo_t info; unsigned short cwd, swd; - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; - /* * Save the info for the exception handler and clear the error. */ @@ -516,23 +510,26 @@ asmlinkage void do_coprocessor_error(struct pt_regs *regs) force_sig_info(SIGFPE, &info, task); } +asmlinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; + math_error((void __user *)regs->ip); +} + asmlinkage void bad_intr(void) { printk("bad interrupt"); } -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) +static void simd_math_error(void __user *ip) { - void __user *ip = (void __user *)(regs->ip); struct task_struct *task; siginfo_t info; unsigned short mxcsr; - conditional_sti(regs); - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - /* * Save the info for the exception handler and clear the error. */ @@ -575,7 +572,16 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs) +asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +} + +asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { } -- cgit v1.2.2 From ae82157b3d8bb4902f76b56c7450a945288786ac Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:35 +0200 Subject: x86, traps, i386: factor out lazy io-bitmap copy x86_64 does not do the lazy io-bitmap dance. Putting it in its own function makes i386's do_general_protection look much more like x86_64's. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 76 ++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index ce1063c141fc..0206c915748c 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -95,6 +95,47 @@ die_if_kernel(const char *str, struct pt_regs *regs, long err) die(str, regs, err); } +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -187,44 +228,13 @@ void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; conditional_sti(regs); - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - /* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS - * and we set the offset field correctly. Then we let the CPU to - * restart the faulting instruction. - */ - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ return; } - put_cpu(); if (regs->flags & X86_VM_MASK) goto gp_in_vm86; -- cgit v1.2.2 From e407d62088b7f61f38e1086062650c75a4f2757a Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:36 +0200 Subject: x86, traps: introduce dotraplinkage Mark the exception handlers with "dotraplinkage" to hide the calling convention differences between i386 and x86_64. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/traps_32.c | 28 ++++++++++++++++------------ arch/x86/kernel/traps_64.c | 31 +++++++++++++++++++------------ 3 files changed, 36 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 977c8ea66028..1db6ce4314e1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1243,7 +1243,7 @@ ENTRY(simd_coprocessor_error) END(simd_coprocessor_error) ENTRY(device_not_available) - zeroentry math_state_restore + zeroentry do_device_not_available END(device_not_available) /* runs on exception stack */ diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 0206c915748c..7eb1203c909d 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -190,7 +190,7 @@ vm86_trap: } #define DO_ERROR(trapnr, signr, str, name) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -200,7 +200,7 @@ void do_##name(struct pt_regs *regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -224,7 +224,7 @@ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR(12, SIGBUS, "stack segment", stack_segment) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) -void __kprobes +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; @@ -428,7 +428,8 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) reassert_nmi(); } -notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) { int cpu; @@ -456,7 +457,7 @@ void restart_nmi(void) acpi_nmi_enable(); } -void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -494,7 +495,7 @@ void __kprobes do_int3(struct pt_regs *regs, long error_code) * find every occurrence of the TF bit that could be saved away even * by user code) */ -void __kprobes do_debug(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned int condition; @@ -627,7 +628,7 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); ignore_fpu_irq = 1; @@ -682,7 +683,8 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); @@ -706,7 +708,8 @@ void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, current); } -void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { conditional_sti(regs); #if 0 @@ -784,7 +787,8 @@ asmlinkage void math_emulate(long arg) #endif /* CONFIG_MATH_EMULATION */ -void __kprobes do_device_not_available(struct pt_regs *regs, long error) +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) { if (read_cr0() & X86_CR0_EM) { conditional_sti(regs); @@ -796,14 +800,14 @@ void __kprobes do_device_not_available(struct pt_regs *regs, long error) } #ifdef CONFIG_X86_MCE -void __kprobes do_machine_check(struct pt_regs *regs, long error) +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) { conditional_sti(regs); machine_check_vector(regs, error); } #endif -void do_iret_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; local_irq_enable(); diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index b295ebf0cb3f..be73323ca952 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -125,7 +125,7 @@ kernel_trap: } #define DO_ERROR(trapnr, signr, str, name) \ -asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ == NOTIFY_STOP) \ @@ -135,7 +135,7 @@ asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ } #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -asmlinkage void do_##name(struct pt_regs *regs, long error_code) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ { \ siginfo_t info; \ info.si_signo = signr; \ @@ -159,7 +159,7 @@ DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) /* Runs on IST stack */ -asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { if (notify_die(DIE_TRAP, "stack segment", regs, error_code, 12, SIGBUS) == NOTIFY_STOP) @@ -169,7 +169,7 @@ asmlinkage void do_stack_segment(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); } -asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; @@ -186,7 +186,7 @@ asmlinkage void do_double_fault(struct pt_regs *regs, long error_code) die(str, regs, error_code); } -asmlinkage void __kprobes +dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; @@ -317,7 +317,7 @@ asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) io_check_error(reason, regs); } -asmlinkage notrace __kprobes void +dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); @@ -343,7 +343,7 @@ void restart_nmi(void) } /* runs on IST stack. */ -asmlinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) @@ -376,8 +376,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) } /* runs on IST stack. */ -asmlinkage void __kprobes do_debug(struct pt_regs *regs, - unsigned long error_code) +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned long condition; @@ -510,7 +509,7 @@ void math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); if (!user_mode(regs) && @@ -572,7 +571,8 @@ static void simd_math_error(void __user *ip) force_sig_info(SIGFPE, &info, task); } -asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); if (!user_mode(regs) && @@ -581,7 +581,8 @@ asmlinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) simd_math_error((void __user *)regs->ip); } -asmlinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { } @@ -633,6 +634,12 @@ asmlinkage void math_state_restore(void) } EXPORT_SYMBOL_GPL(math_state_restore); +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ + math_state_restore(); +} + void __init trap_init(void) { set_intr_gate(0, ÷_error); -- cgit v1.2.2 From 3d2a71a596bd9c761c8487a2178e95f8a61da083 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Tue, 30 Sep 2008 18:41:37 +0200 Subject: x86, traps: converge do_debug handlers Make the x86_64-version and the i386-version of do_debug more similar. - introduce preempt_conditional_sti/cli to i386. The preempt-count is now elevated during the trap handler, like on x86_64. It does not run on a separate stack, however. - replace an open-coded "send_sigtrap" - copy some comments Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 30 +++++++++++++++++++++--------- arch/x86/kernel/traps_64.c | 17 +++++++++-------- 2 files changed, 30 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 7eb1203c909d..953172af6e51 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -88,6 +88,20 @@ static inline void conditional_sti(struct pt_regs *regs) local_irq_enable(); } +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); +} + static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -498,7 +512,7 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - unsigned int condition; + unsigned long condition; int si_code; get_debugreg(condition, 6); @@ -512,9 +526,9 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); + preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { @@ -533,16 +547,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) * kernel space (but re-enable TF when returning to user mode). */ if (condition & DR_STEP) { - /* - * We already checked v86 mode above, so we can - * check for kernel mode by just checking the CPL - * of CS. - */ if (!user_mode(regs)) goto clear_TF_reenable; } - si_code = get_si_code((unsigned long)condition); + si_code = get_si_code(condition); /* Ok, finally something we can handle */ send_sigtrap(tsk, regs, error_code, si_code); @@ -552,15 +561,18 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) */ clear_dr7: set_debugreg(0, 7); + preempt_conditional_cli(regs); return; debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); return; clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); return; } diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index be73323ca952..a851eca6d04c 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -380,7 +380,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; unsigned long condition; - siginfo_t info; + int si_code; get_debugreg(condition, 6); @@ -394,6 +394,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); /* Mask out spurious debug traps due to lazy DR7 setting */ @@ -402,6 +403,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } + /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; /* @@ -413,15 +415,14 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_TF_reenable; } + si_code = get_si_code(condition); /* Ok, finally something we can handle */ - tsk->thread.trap_no = 1; - tsk->thread.error_code = error_code; - info.si_signo = SIGTRAP; - info.si_errno = 0; - info.si_code = get_si_code(condition); - info.si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; - force_sig_info(SIGTRAP, &info, tsk); + send_sigtrap(tsk, regs, error_code, si_code); + /* + * Disable additional traps. They'll be re-enabled when + * the signal is delivered. + */ clear_dr7: set_debugreg(0, 7); preempt_conditional_cli(regs); -- cgit v1.2.2 From 699d2937d45d9dabc1772d0d07501ccc43885c23 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:32 +0200 Subject: traps: x86: converge trap_init functions - set_system_gate on i386 is really set_system_trap_gate - set_system_gate on x86_64 is really set_system_intr_gate - ist=0 means no special stack switch is done: - introduce STACKFAULT_STACK, DOUBLEFAULT_STACK, NMI_STACK, DEBUG_STACK and MCE_STACK as on x86_64. - use the _ist variants with XXX_STACK set to zero - remove set_system_gate Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar traps: x86: correct copy/paste bug: a trap is a GATE_TRAP Fix copy/paste/forgot-to-edit bug in desc.h. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 16 +++++++++------- arch/x86/kernel/traps_64.c | 6 +++--- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 953172af6e51..2c7ea3827713 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -847,10 +847,12 @@ void __init trap_init(void) #endif set_intr_gate(0, ÷_error); - set_intr_gate(1, &debug); - set_intr_gate(2, &nmi); - set_system_intr_gate(3, &int3); /* int3 can be called from all */ - set_system_intr_gate(4, &overflow); /* int4 can be called from all */ + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); @@ -858,14 +860,14 @@ void __init trap_init(void) set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); - set_intr_gate(12, &stack_segment); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); set_intr_gate(13, &general_protection); set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); #ifdef CONFIG_X86_MCE - set_intr_gate(18, &machine_check); + set_intr_gate_ist(18, &machine_check, MCE_STACK); #endif set_intr_gate(19, &simd_coprocessor_error); @@ -881,7 +883,7 @@ void __init trap_init(void) printk("done.\n"); } - set_system_gate(SYSCALL_VECTOR, &system_call); + set_system_trap_gate(SYSCALL_VECTOR, &system_call); /* Reserve all the builtin and the syscall vector: */ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index a851eca6d04c..ea091dfe0cd3 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -647,9 +647,9 @@ void __init trap_init(void) set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); /* int3 can be called from all */ - set_system_gate_ist(3, &int3, DEBUG_STACK); + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); /* int4 can be called from all */ - set_system_gate(4, &overflow); + set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); @@ -669,7 +669,7 @@ void __init trap_init(void) set_intr_gate(19, &simd_coprocessor_error); #ifdef CONFIG_IA32_EMULATION - set_system_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif /* * Should be a barrier for any external CPU state: -- cgit v1.2.2 From 091d30c8f7744f43b0bb507fd30ceb95f9ff9e1b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 23:16:12 +0200 Subject: traps: x86_64: make math_state_restore more like i386 - rename variable me -> tsk - get thread and tsk like i386 - expand used_math() - copy comment Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index ea091dfe0cd3..00406c99aee4 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -604,14 +604,15 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) */ asmlinkage void math_state_restore(void) { - struct task_struct *me = current; + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; - if (!used_math()) { + if (!tsk_used_math(tsk)) { local_irq_enable(); /* * does a slab alloc which can sleep */ - if (init_fpu(me)) { + if (init_fpu(tsk)) { /* * ran out of memory! */ @@ -625,13 +626,13 @@ asmlinkage void math_state_restore(void) /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ - if (unlikely(restore_fpu_checking(me))) { + if (unlikely(restore_fpu_checking(tsk))) { stts(); - force_sig(SIGSEGV, me); + force_sig(SIGSEGV, tsk); return; } - task_thread_info(me)->status |= TS_USEDFPU; - me->fpu_counter++; + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); -- cgit v1.2.2 From 4915a35e35a037254550a2ba9f367a812bc37d40 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:34 +0200 Subject: traps: i386: use preempt_conditional_sti/cli in do_int3 Use preempt_conditional_sti/cli in do_int3, like on x86_64. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 2c7ea3827713..67953bbe193d 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -477,14 +477,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; - conditional_sti(regs); #else if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; #endif + preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); } /* -- cgit v1.2.2 From 1c9af8a9f448abfe13f17fa76b7ca72b588a1edb Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:35 +0200 Subject: traps: x86_64: make io_check_error equal to the one on i386 Make io_check_error equal to the one on i386. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 00406c99aee4..7853f488cd6c 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -252,13 +252,19 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void io_check_error(unsigned char reason, struct pt_regs *regs) { - printk("NMI: IOCK error (debug interrupt?)\n"); + unsigned long i; + + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); show_registers(regs); /* Re-enable the IOCK line, wait for a few seconds */ reason = (reason & 0xf) | 8; outb(reason, 0x61); - mdelay(2000); + + i = 2000; + while (--i) + udelay(1000); + reason &= ~8; outb(reason, 0x61); } -- cgit v1.2.2 From 7970479c4881e156a0c07c1a7fdc564c8e3b2bfc Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:36 +0200 Subject: traps: i386: expand clear_mem_error and remove from mach_traps.h This is the last user of clear_mem_error, which is defined only on i386. Expand the inline function and remove it from include/asm-x86/mach-default/mach_traps.h Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 67953bbe193d..01e7ca8c7661 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -313,7 +313,8 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); /* Clear and disable the memory parity error line. */ - clear_mem_error(reason); + reason = (reason & 0xf) | 4; + outb(reason, 0x61); } static notrace __kprobes void -- cgit v1.2.2 From a5ae2330a5a6e7948866715a845ad2e8900bd4c2 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:37 +0200 Subject: traps: x86_64: use task_pid_nr(tsk) instead of tsk->pid in do_general_protection Use task_pid_nr(tsk) instead of tsk->pid in do_general_protection. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 7853f488cd6c..71c2629997b3 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -204,7 +204,7 @@ do_general_protection(struct pt_regs *regs, long error_code) printk_ratelimit()) { printk(KERN_INFO "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, + tsk->comm, task_pid_nr(tsk), regs->ip, regs->sp, error_code); print_vma_addr(" in ", regs->ip); printk("\n"); -- cgit v1.2.2 From c1d518c8422ff7d3f377958771b265753028579c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 23:17:11 +0200 Subject: traps: x86: various noop-changes preparing for unification of traps_xx.c - reordering include files - whitespace changes - comment changes - removed unused bad_intr() - make default_do_nmi static Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 28 +++++++++++--------- arch/x86/kernel/traps_64.c | 65 ++++++++++++++++++++++++++++++---------------- 2 files changed, 58 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 01e7ca8c7661..076739863d24 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -7,13 +7,11 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'asm.s'. + * Handle hardware traps and faults. */ #include #include #include -#include #include #include #include @@ -32,6 +30,8 @@ #include #include #include +#include +#include #ifdef CONFIG_EISA #include @@ -46,22 +46,26 @@ #include #endif -#include -#include #include #include +#include #include #include #include #include +#include #include #include + +#include + +#include +#include #include #include #include #include -#include "mach_traps.h" #include "cpu/mcheck/mce.h" DECLARE_BITMAP(used_vectors, NR_VECTORS); @@ -340,7 +344,8 @@ io_check_error(unsigned char reason, struct pt_regs *regs) static notrace __kprobes void unknown_nmi_error(unsigned char reason, struct pt_regs *regs) { - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) return; #ifdef CONFIG_MCA /* @@ -446,13 +451,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { - int cpu; - nmi_enter(); - cpu = smp_processor_id(); - - ++nmi_count(cpu); + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } if (!ignore_nmis) default_do_nmi(regs); @@ -472,6 +473,7 @@ void restart_nmi(void) acpi_nmi_enable(); } +/* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { #ifdef CONFIG_KPROBES @@ -510,6 +512,8 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) * about restoring all the debug state, and ptrace doesn't have to * find every occurrence of the TF bit that could be saved away even * by user code) + * + * May run on IST stack. */ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 71c2629997b3..22fe62a24edb 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -7,10 +7,8 @@ */ /* - * 'Traps.c' handles hardware traps and faults after we have saved some - * state in 'entry.S'. + * Handle hardware traps and faults. */ -#include #include #include #include @@ -41,18 +39,21 @@ #include #include +#include #include #include #include #include +#include #include #include + +#include + #include #include #include -#include -#include static int ignore_nmis; @@ -73,8 +74,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) { if (regs->flags & X86_EFLAGS_IF) local_irq_disable(); - /* Make sure to not schedule here because we could be running - on an exception stack. */ dec_preempt_count(); } @@ -228,9 +227,12 @@ gp_in_kernel: static notrace __kprobes void mem_parity_error(unsigned char reason, struct pt_regs *regs) { - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); #if defined(CONFIG_EDAC) if (edac_handler_set()) { @@ -275,19 +277,18 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; - printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", - reason); - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -/* Runs on IST stack. This code must keep interrupts off all the time. - Nested NMIs are prevented by the CPU. */ -asmlinkage notrace __kprobes void default_do_nmi(struct pt_regs *regs) +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; int cpu; @@ -348,7 +349,7 @@ void restart_nmi(void) acpi_nmi_enable(); } -/* runs on IST stack. */ +/* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) @@ -381,7 +382,30 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) return regs; } -/* runs on IST stack. */ +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; @@ -525,11 +549,6 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) math_error((void __user *)regs->ip); } -asmlinkage void bad_intr(void) -{ - printk("bad interrupt"); -} - static void simd_math_error(void __user *ip) { struct task_struct *task; -- cgit v1.2.2 From 081f75bbdc86de53537e1b5aca01de72bd2fea6b Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:39 +0200 Subject: traps: x86: make traps_32.c and traps_64.c equal Use CONFIG_X86_64/CONFIG_X86_32 to condtionally compile the parts needed for x86_64 or i386 only. Runs a small userspace for a number of minimal configurations and boots the defconfigs. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps_32.c | 173 ++++++++++++++++++++- arch/x86/kernel/traps_64.c | 368 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 536 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c index 076739863d24..ffb131f74f78 100644 --- a/arch/x86/kernel/traps_32.c +++ b/arch/x86/kernel/traps_32.c @@ -48,7 +48,6 @@ #include #include -#include #include #include #include @@ -59,6 +58,11 @@ #include +#ifdef CONFIG_X86_64 +#include +#include +#include +#else #include #include #include @@ -83,6 +87,7 @@ char ignore_fpu_irq; */ gate_desc idt_table[256] __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif static int ignore_nmis; @@ -106,6 +111,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } +#ifdef CONFIG_X86_32 static inline void die_if_kernel(const char *str, struct pt_regs *regs, long err) { @@ -153,6 +159,7 @@ static int lazy_iobitmap_copy(void) return 0; } +#endif static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, @@ -160,6 +167,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) { /* * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. @@ -169,11 +177,14 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, goto vm86_trap; goto trap_signal; } +#endif if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -186,6 +197,18 @@ trap_signal: tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + if (info) force_sig_info(signr, info, tsk); else @@ -200,11 +223,13 @@ kernel_trap: } return; +#ifdef CONFIG_X86_32 vm86_trap: if (handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, trapnr)) goto trap_signal; return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ @@ -239,9 +264,41 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) { @@ -249,6 +306,7 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); +#ifdef CONFIG_X86_32 if (lazy_iobitmap_copy()) { /* restart the faulting instruction */ return; @@ -256,6 +314,7 @@ do_general_protection(struct pt_regs *regs, long error_code) if (regs->flags & X86_VM_MASK) goto gp_in_vm86; +#endif tsk = current; if (!user_mode(regs)) @@ -277,10 +336,12 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 gp_in_vm86: local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); return; +#endif gp_in_kernel: if (fixup_exception(regs)) @@ -368,6 +429,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } +#ifdef CONFIG_X86_32 static DEFINE_SPINLOCK(nmi_print_lock); void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) @@ -402,6 +464,7 @@ void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGSEGV); } +#endif static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { @@ -441,11 +504,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 /* * Reassert NMI in case it became active meanwhile * as it's edge-triggered: */ reassert_nmi(); +#endif } dotraplinkage notrace __kprobes void @@ -453,7 +518,11 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); +#ifdef CONFIG_X86_32 { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -491,6 +560,29 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); } +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +#endif + /* * Our handling of the processor debug registers is non-trivial. * We do not clear them on entry and exit from the kernel. Therefore @@ -542,8 +634,10 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 if (regs->flags & X86_VM_MASK) goto debug_vm86; +#endif /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -570,10 +664,12 @@ clear_dr7: preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 debug_vm86: handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); return; +#endif clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); @@ -582,6 +678,20 @@ clear_TF_reenable: return; } +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous @@ -618,7 +728,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -649,7 +761,15 @@ void math_error(void __user *ip) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + math_error((void __user *)regs->ip); } @@ -706,6 +826,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); +#ifdef CONFIG_X86_32 if (cpu_has_xmm) { /* Handle SIMD FPU exceptions on PIII+ processors. */ ignore_fpu_irq = 1; @@ -724,6 +845,12 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) current->thread.error_code = error_code; die_if_kernel("cache flush denied", regs, error_code); force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif } dotraplinkage void @@ -736,6 +863,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) #endif } +#ifdef CONFIG_X86_32 unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) { struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); @@ -754,6 +882,15 @@ unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) return new_kesp; } +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif /* * 'math_state_restore()' saves the current math information in the @@ -786,14 +923,24 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); #ifndef CONFIG_MATH_EMULATION - asmlinkage void math_emulate(long arg) { printk(KERN_EMERG @@ -802,12 +949,12 @@ asmlinkage void math_emulate(long arg) force_sig(SIGFPE, current); schedule(); } - #endif /* CONFIG_MATH_EMULATION */ dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error) { +#ifdef CONFIG_X86_32 if (read_cr0() & X86_CR0_EM) { conditional_sti(regs); math_emulate(0); @@ -815,8 +962,12 @@ do_device_not_available(struct pt_regs *regs, long error) math_state_restore(); /* interrupts still off */ conditional_sti(regs); } +#else + math_state_restore(); +#endif } +#ifdef CONFIG_X86_32 #ifdef CONFIG_X86_MCE dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) { @@ -839,10 +990,13 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) return; do_trap(32, SIGILL, "iret exception", regs, error_code, &info); } +#endif void __init trap_init(void) { +#ifdef CONFIG_X86_32 int i; +#endif #ifdef CONFIG_EISA void __iomem *p = early_ioremap(0x0FFFD9, 4); @@ -862,7 +1016,11 @@ void __init trap_init(void) set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); @@ -877,6 +1035,11 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 if (cpu_has_fxsr) { printk(KERN_INFO "Enabling fast FPU save and restore... "); set_in_cr4(X86_CR4_OSFXSR); @@ -896,11 +1059,13 @@ void __init trap_init(void) set_bit(i, used_vectors); set_bit(SYSCALL_VECTOR, used_vectors); - +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); +#ifdef CONFIG_X86_32 trap_init_hook(); +#endif } diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c index 22fe62a24edb..60ecc855ab81 100644 --- a/arch/x86/kernel/traps_64.c +++ b/arch/x86/kernel/traps_64.c @@ -33,13 +33,21 @@ #include #include +#ifdef CONFIG_EISA +#include +#include +#endif + +#ifdef CONFIG_MCA +#include +#endif + #if defined(CONFIG_EDAC) #include #endif #include #include -#include #include #include #include @@ -50,10 +58,35 @@ #include +#ifdef CONFIG_X86_64 #include #include #include +#else +#include +#include +#include +#include +#include + +#include "cpu/mcheck/mce.h" +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. + */ +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif static int ignore_nmis; @@ -77,15 +110,80 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} +#endif + static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) { struct task_struct *tsk = current; +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) + goto vm86_trap; + goto trap_signal; + } +#endif + if (!user_mode(regs)) goto kernel_trap; +#ifdef CONFIG_X86_32 +trap_signal: +#endif /* * We want error_code and trap_no set for userspace faults and * kernelspace faults which result in die(), but not @@ -98,6 +196,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; +#ifdef CONFIG_X86_64 if (show_unhandled_signals && unhandled_signal(tsk, signr) && printk_ratelimit()) { printk(KERN_INFO @@ -107,6 +206,7 @@ do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, print_vma_addr(" in ", regs->ip); printk("\n"); } +#endif if (info) force_sig_info(signr, info, tsk); @@ -121,6 +221,14 @@ kernel_trap: die(str, regs, error_code); } return; + +#ifdef CONFIG_X86_32 +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; +#endif } #define DO_ERROR(trapnr, signr, str, name) \ @@ -155,8 +263,12 @@ DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) +#ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) { @@ -184,6 +296,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) for (;;) die(str, regs, error_code); } +#endif dotraplinkage void __kprobes do_general_protection(struct pt_regs *regs, long error_code) @@ -192,6 +305,16 @@ do_general_protection(struct pt_regs *regs, long error_code) conditional_sti(regs); +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ + return; + } + + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; +#endif + tsk = current; if (!user_mode(regs)) goto gp_in_kernel; @@ -212,6 +335,13 @@ do_general_protection(struct pt_regs *regs, long error_code) force_sig(SIGSEGV, tsk); return; +#ifdef CONFIG_X86_32 +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; +#endif + gp_in_kernel: if (fixup_exception(regs)) return; @@ -277,6 +407,16 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", reason, smp_processor_id()); @@ -288,6 +428,43 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } +#ifdef CONFIG_X86_32 +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} +#endif + static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; @@ -303,6 +480,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; +#ifdef CONFIG_X86_LOCAL_APIC /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -311,6 +489,9 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) return; if (!do_nmi_callback(regs, cpu)) unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif return; } @@ -322,6 +503,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) mem_parity_error(reason, regs); if (reason & 0x40) io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif } dotraplinkage notrace __kprobes void @@ -329,7 +517,11 @@ do_nmi(struct pt_regs *regs, long error_code) { nmi_enter(); +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else add_pda(__nmi_count, 1); +#endif if (!ignore_nmis) default_do_nmi(regs); @@ -352,15 +544,22 @@ void restart_nmi(void) /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); } +#ifdef CONFIG_X86_64 /* Help handler running on IST stack to switch back to user stack for scheduling or signal handling. The actual stack switch is done in entry.S */ @@ -381,6 +580,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) *regs = *eregs; return regs; } +#endif /* * Our handling of the processor debug registers is non-trivial. @@ -433,6 +633,11 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) goto clear_dr7; } +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) + goto debug_vm86; +#endif + /* Save debug status register where ptrace can see it */ tsk->thread.debugreg6 = condition; @@ -458,6 +663,13 @@ clear_dr7: preempt_conditional_cli(regs); return; +#ifdef CONFIG_X86_32 +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); + return; +#endif + clear_TF_reenable: set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; @@ -465,6 +677,7 @@ clear_TF_reenable: return; } +#ifdef CONFIG_X86_64 static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) { if (fixup_exception(regs)) @@ -476,6 +689,7 @@ static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) die(str, regs, 0); return 0; } +#endif /* * Note that we play around with the 'TS' bit in an attempt to get @@ -513,6 +727,9 @@ void math_error(void __user *ip) swd = get_fpu_swd(task); switch (swd & ~cwd & 0x3f) { case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 + return; +#endif default: /* Multiple exceptions */ break; case 0x001: /* Invalid Op */ @@ -543,9 +760,15 @@ void math_error(void __user *ip) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 + ignore_fpu_irq = 1; +#else if (!user_mode(regs) && kernel_math_error(regs, "kernel x87 math error", 16)) return; +#endif + math_error((void __user *)regs->ip); } @@ -601,17 +824,64 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->ip); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); +#else if (!user_mode(regs) && kernel_math_error(regs, "kernel simd math error", 19)) return; simd_math_error((void __user *)regs->ip); +#endif } dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) { + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. */ + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif } +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +{ + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + + return new_kesp; +} +#else asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) { } @@ -619,6 +889,7 @@ asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) { } +#endif /* * 'math_state_restore()' saves the current math information in the @@ -626,6 +897,9 @@ asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) * * Careful.. There are problems with IBM-designed IRQ13 behaviour. * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). */ asmlinkage void math_state_restore(void) { @@ -648,6 +922,9 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 + restore_fpu(tsk); +#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -656,19 +933,78 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } +#endif thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } EXPORT_SYMBOL_GPL(math_state_restore); +#ifndef CONFIG_MATH_EMULATION +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); + schedule(); +} +#endif /* CONFIG_MATH_EMULATION */ + dotraplinkage void __kprobes do_device_not_available(struct pt_regs *regs, long error) { +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_MCE +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); } +#endif + +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif void __init trap_init(void) { +#ifdef CONFIG_X86_32 + int i; +#endif + +#ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + early_iounmap(p, 4); +#endif + set_intr_gate(0, ÷_error); set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); @@ -679,7 +1015,11 @@ void __init trap_init(void) set_intr_gate(5, &bounds); set_intr_gate(6, &invalid_op); set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif set_intr_gate(9, &coprocessor_segment_overrun); set_intr_gate(10, &invalid_TSS); set_intr_gate(11, &segment_not_present); @@ -697,8 +1037,34 @@ void __init trap_init(void) #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); #endif + +#ifdef CONFIG_X86_32 + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + + set_system_trap_gate(SYSCALL_VECTOR, &system_call); + + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + + set_bit(SYSCALL_VECTOR, used_vectors); +#endif /* * Should be a barrier for any external CPU state: */ cpu_init(); + +#ifdef CONFIG_X86_32 + trap_init_hook(); +#endif } -- cgit v1.2.2 From 8728861b4fead8119a1b7bb856a387320859cd98 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Fri, 3 Oct 2008 22:00:40 +0200 Subject: traps: x86: finalize unification of traps.c traps_32.c and traps_64.c are now equal. Move one to traps.c, delete the other one and change the Makefile Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/traps.c | 1071 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps_32.c | 1071 -------------------------------------------- arch/x86/kernel/traps_64.c | 1070 ------------------------------------------- 4 files changed, 1072 insertions(+), 2142 deletions(-) create mode 100644 arch/x86/kernel/traps.c delete mode 100644 arch/x86/kernel/traps_32.c delete mode 100644 arch/x86/kernel/traps_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index de63fed9fae8..0d41f0343dc0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps_$(BITS).o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c new file mode 100644 index 000000000000..ffb131f74f78 --- /dev/null +++ b/arch/x86/kernel/traps.c @@ -0,0 +1,1071 @@ +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs + * + * Pentium III FXSR, SSE support + * Gareth Hughes , May 2000 + */ + +/* + * Handle hardware traps and faults. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef CONFIG_EISA +#include +#include +#endif + +#ifdef CONFIG_MCA +#include +#endif + +#if defined(CONFIG_EDAC) +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef CONFIG_X86_64 +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include + +#include "cpu/mcheck/mce.h" + +DECLARE_BITMAP(used_vectors, NR_VECTORS); +EXPORT_SYMBOL_GPL(used_vectors); + +asmlinkage int system_call(void); + +/* Do we ignore FPU interrupts ? */ +char ignore_fpu_irq; + +/* + * The IDT has to be page-aligned to simplify the Pentium + * F0 0F bug workaround.. We have a special link segment + * for this. + */ +gate_desc idt_table[256] + __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; +#endif + +static int ignore_nmis; + +static inline void conditional_sti(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_sti(struct pt_regs *regs) +{ + inc_preempt_count(); + if (regs->flags & X86_EFLAGS_IF) + local_irq_enable(); +} + +static inline void preempt_conditional_cli(struct pt_regs *regs) +{ + if (regs->flags & X86_EFLAGS_IF) + local_irq_disable(); + dec_preempt_count(); +} + +#ifdef CONFIG_X86_32 +static inline void +die_if_kernel(const char *str, struct pt_regs *regs, long err) +{ + if (!user_mode_vm(regs)) + die(str, regs, err); +} + +/* + * Perform the lazy TSS's I/O bitmap copy. If the TSS has an + * invalid offset set (the LAZY one) and the faulting thread has + * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, + * we set the offset field correctly and return 1. + */ +static int lazy_iobitmap_copy(void) +{ + struct thread_struct *thread; + struct tss_struct *tss; + int cpu; + + cpu = get_cpu(); + tss = &per_cpu(init_tss, cpu); + thread = ¤t->thread; + + if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && + thread->io_bitmap_ptr) { + memcpy(tss->io_bitmap, thread->io_bitmap_ptr, + thread->io_bitmap_max); + /* + * If the previously set map was extending to higher ports + * than the current one, pad extra space with 0xff (no access). + */ + if (thread->io_bitmap_max < tss->io_bitmap_max) { + memset((char *) tss->io_bitmap + + thread->io_bitmap_max, 0xff, + tss->io_bitmap_max - thread->io_bitmap_max); + } + tss->io_bitmap_max = thread->io_bitmap_max; + tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; + tss->io_bitmap_owner = thread; + put_cpu(); + + return 1; + } + put_cpu(); + + return 0; +} +#endif + +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, + long error_code, siginfo_t *info) +{ + struct task_struct *tsk = current; + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) { + /* + * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. + * On nmi (interrupt 2), do_trap should not be called. + */ + if (trapnr < 6) + goto vm86_trap; + goto trap_signal; + } +#endif + + if (!user_mode(regs)) + goto kernel_trap; + +#ifdef CONFIG_X86_32 +trap_signal: +#endif + /* + * We want error_code and trap_no set for userspace faults and + * kernelspace faults which result in die(), but not + * kernelspace faults which are fixed up. die() gives the + * process no chance to handle the signal and notice the + * kernel fault information, so that won't result in polluting + * the information about previously queued, but not yet + * delivered, faults. See also do_general_protection below. + */ + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + +#ifdef CONFIG_X86_64 + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] trap %s ip:%lx sp:%lx error:%lx", + tsk->comm, tsk->pid, str, + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } +#endif + + if (info) + force_sig_info(signr, info, tsk); + else + force_sig(signr, tsk); + return; + +kernel_trap: + if (!fixup_exception(regs)) { + tsk->thread.error_code = error_code; + tsk->thread.trap_no = trapnr; + die(str, regs, error_code); + } + return; + +#ifdef CONFIG_X86_32 +vm86_trap: + if (handle_vm86_trap((struct kernel_vm86_regs *) regs, + error_code, trapnr)) + goto trap_signal; + return; +#endif +} + +#define DO_ERROR(trapnr, signr, str, name) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, NULL); \ +} + +#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ +dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ +{ \ + siginfo_t info; \ + info.si_signo = signr; \ + info.si_errno = 0; \ + info.si_code = sicode; \ + info.si_addr = (void __user *)siaddr; \ + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ + == NOTIFY_STOP) \ + return; \ + conditional_sti(regs); \ + do_trap(trapnr, signr, str, regs, error_code, &info); \ +} + +DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) +DO_ERROR(4, SIGSEGV, "overflow", overflow) +DO_ERROR(5, SIGSEGV, "bounds", bounds) +DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) +DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) +DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) +DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) +#ifdef CONFIG_X86_32 +DO_ERROR(12, SIGBUS, "stack segment", stack_segment) +#endif +DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) + +#ifdef CONFIG_X86_64 +/* Runs on IST stack */ +dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) +{ + if (notify_die(DIE_TRAP, "stack segment", regs, error_code, + 12, SIGBUS) == NOTIFY_STOP) + return; + preempt_conditional_sti(regs); + do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) +{ + static const char str[] = "double fault"; + struct task_struct *tsk = current; + + /* Return not checked because double check cannot be ignored */ + notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 8; + + /* This is always a kernel trap and never fixable (and thus must + never return). */ + for (;;) + die(str, regs, error_code); +} +#endif + +dotraplinkage void __kprobes +do_general_protection(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk; + + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (lazy_iobitmap_copy()) { + /* restart the faulting instruction */ + return; + } + + if (regs->flags & X86_VM_MASK) + goto gp_in_vm86; +#endif + + tsk = current; + if (!user_mode(regs)) + goto gp_in_kernel; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) { + printk(KERN_INFO + "%s[%d] general protection ip:%lx sp:%lx error:%lx", + tsk->comm, task_pid_nr(tsk), + regs->ip, regs->sp, error_code); + print_vma_addr(" in ", regs->ip); + printk("\n"); + } + + force_sig(SIGSEGV, tsk); + return; + +#ifdef CONFIG_X86_32 +gp_in_vm86: + local_irq_enable(); + handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); + return; +#endif + +gp_in_kernel: + if (fixup_exception(regs)) + return; + + tsk->thread.error_code = error_code; + tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; + die("general protection fault", regs, error_code); +} + +static notrace __kprobes void +mem_parity_error(unsigned char reason, struct pt_regs *regs) +{ + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG + "You have some hardware problem, likely on the PCI bus.\n"); + +#if defined(CONFIG_EDAC) + if (edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); + + /* Clear and disable the memory parity error line. */ + reason = (reason & 0xf) | 4; + outb(reason, 0x61); +} + +static notrace __kprobes void +io_check_error(unsigned char reason, struct pt_regs *regs) +{ + unsigned long i; + + printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); + show_registers(regs); + + /* Re-enable the IOCK line, wait for a few seconds */ + reason = (reason & 0xf) | 8; + outb(reason, 0x61); + + i = 2000; + while (--i) + udelay(1000); + + reason &= ~8; + outb(reason, 0x61); +} + +static notrace __kprobes void +unknown_nmi_error(unsigned char reason, struct pt_regs *regs) +{ + if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == + NOTIFY_STOP) + return; +#ifdef CONFIG_MCA + /* + * Might actually be able to figure out what the guilty party + * is: + */ + if (MCA_bus) { + mca_handle_nmi(); + return; + } +#endif + printk(KERN_EMERG + "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", + reason, smp_processor_id()); + + printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); + if (panic_on_unrecovered_nmi) + panic("NMI: Not continuing"); + + printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); +} + +#ifdef CONFIG_X86_32 +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} +#endif + +static notrace __kprobes void default_do_nmi(struct pt_regs *regs) +{ + unsigned char reason = 0; + int cpu; + + cpu = smp_processor_id(); + + /* Only the BSP gets external NMIs from the system. */ + if (!cpu) + reason = get_nmi_reason(); + + if (!(reason & 0xc0)) { + if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Ok, so this is none of the documented NMI sources, + * so it must be the NMI watchdog. + */ + if (nmi_watchdog_tick(regs, reason)) + return; + if (!do_nmi_callback(regs, cpu)) + unknown_nmi_error(reason, regs); +#else + unknown_nmi_error(reason, regs); +#endif + + return; + } + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) + return; + + /* AK: following checks seem to be broken on modern chipsets. FIXME */ + if (reason & 0x80) + mem_parity_error(reason, regs); + if (reason & 0x40) + io_check_error(reason, regs); +#ifdef CONFIG_X86_32 + /* + * Reassert NMI in case it became active meanwhile + * as it's edge-triggered: + */ + reassert_nmi(); +#endif +} + +dotraplinkage notrace __kprobes void +do_nmi(struct pt_regs *regs, long error_code) +{ + nmi_enter(); + +#ifdef CONFIG_X86_32 + { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } +#else + add_pda(__nmi_count, 1); +#endif + + if (!ignore_nmis) + default_do_nmi(regs); + + nmi_exit(); +} + +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + +/* May run on IST stack. */ +dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) +{ +#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#else + if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif + + preempt_conditional_sti(regs); + do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); + preempt_conditional_cli(regs); +} + +#ifdef CONFIG_X86_64 +/* Help handler running on IST stack to switch back to user stack + for scheduling or signal handling. The actual stack switch is done in + entry.S */ +asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) +{ + struct pt_regs *regs = eregs; + /* Did already sync */ + if (eregs == (struct pt_regs *)eregs->sp) + ; + /* Exception from user space */ + else if (user_mode(eregs)) + regs = task_pt_regs(current); + /* Exception from kernel and interrupts are enabled. Move to + kernel process stack. */ + else if (eregs->flags & X86_EFLAGS_IF) + regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); + if (eregs != regs) + *regs = *eregs; + return regs; +} +#endif + +/* + * Our handling of the processor debug registers is non-trivial. + * We do not clear them on entry and exit from the kernel. Therefore + * it is possible to get a watchpoint trap here from inside the kernel. + * However, the code in ./ptrace.c has ensured that the user can + * only set watchpoints on userspace addresses. Therefore the in-kernel + * watchpoint trap can only occur in code which is reading/writing + * from user space. Such code must not hold kernel locks (since it + * can equally take a page fault), therefore it is safe to call + * force_sig_info even though that claims and releases locks. + * + * Code in ./signal.c ensures that the debug control register + * is restored before we deliver any signal, and therefore that + * user code runs with the correct debug control register even though + * we clear it here. + * + * Being careful here means that we don't have to be as careful in a + * lot of more complicated places (task switching can be a bit lazy + * about restoring all the debug state, and ptrace doesn't have to + * find every occurrence of the TF bit that could be saved away even + * by user code) + * + * May run on IST stack. + */ +dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) +{ + struct task_struct *tsk = current; + unsigned long condition; + int si_code; + + get_debugreg(condition, 6); + + /* + * The processor cleared BTF, so don't mark that we need it set. + */ + clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); + tsk->thread.debugctlmsr = 0; + + if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, + SIGTRAP) == NOTIFY_STOP) + return; + + /* It's safe to allow irq's after DR6 has been saved */ + preempt_conditional_sti(regs); + + /* Mask out spurious debug traps due to lazy DR7 setting */ + if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { + if (!tsk->thread.debugreg7) + goto clear_dr7; + } + +#ifdef CONFIG_X86_32 + if (regs->flags & X86_VM_MASK) + goto debug_vm86; +#endif + + /* Save debug status register where ptrace can see it */ + tsk->thread.debugreg6 = condition; + + /* + * Single-stepping through TF: make sure we ignore any events in + * kernel space (but re-enable TF when returning to user mode). + */ + if (condition & DR_STEP) { + if (!user_mode(regs)) + goto clear_TF_reenable; + } + + si_code = get_si_code(condition); + /* Ok, finally something we can handle */ + send_sigtrap(tsk, regs, error_code, si_code); + + /* + * Disable additional traps. They'll be re-enabled when + * the signal is delivered. + */ +clear_dr7: + set_debugreg(0, 7); + preempt_conditional_cli(regs); + return; + +#ifdef CONFIG_X86_32 +debug_vm86: + handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); + preempt_conditional_cli(regs); + return; +#endif + +clear_TF_reenable: + set_tsk_thread_flag(tsk, TIF_SINGLESTEP); + regs->flags &= ~X86_EFLAGS_TF; + preempt_conditional_cli(regs); + return; +} + +#ifdef CONFIG_X86_64 +static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) +{ + if (fixup_exception(regs)) + return 1; + + notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); + /* Illegal floating point operation in the kernel */ + current->thread.trap_no = trapnr; + die(str, regs, 0); + return 0; +} +#endif + +/* + * Note that we play around with the 'TS' bit in an attempt to get + * the correct behaviour even in the presence of the asynchronous + * IRQ13 behaviour + */ +void math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short cwd, swd; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 16; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); + switch (swd & ~cwd & 0x3f) { + case 0x000: /* No unmasked exception */ +#ifdef CONFIG_X86_32 + return; +#endif + default: /* Multiple exceptions */ + break; + case 0x001: /* Invalid Op */ + /* + * swd & 0x240 == 0x040: Stack Underflow + * swd & 0x240 == 0x240: Stack Overflow + * User must clear the SF bit (0x40) if set + */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + ignore_fpu_irq = 1; +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel x87 math error", 16)) + return; +#endif + + math_error((void __user *)regs->ip); +} + +static void simd_math_error(void __user *ip) +{ + struct task_struct *task; + siginfo_t info; + unsigned short mxcsr; + + /* + * Save the info for the exception handler and clear the error. + */ + task = current; + save_init_fpu(task); + task->thread.trap_no = 19; + task->thread.error_code = 0; + info.si_signo = SIGFPE; + info.si_errno = 0; + info.si_code = __SI_FAULT; + info.si_addr = ip; + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + mxcsr = get_fpu_mxcsr(task); + switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { + case 0x000: + default: + break; + case 0x001: /* Invalid Op */ + info.si_code = FPE_FLTINV; + break; + case 0x002: /* Denormalize */ + case 0x010: /* Underflow */ + info.si_code = FPE_FLTUND; + break; + case 0x004: /* Zero Divide */ + info.si_code = FPE_FLTDIV; + break; + case 0x008: /* Overflow */ + info.si_code = FPE_FLTOVF; + break; + case 0x020: /* Precision */ + info.si_code = FPE_FLTRES; + break; + } + force_sig_info(SIGFPE, &info, task); +} + +dotraplinkage void +do_simd_coprocessor_error(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); + +#ifdef CONFIG_X86_32 + if (cpu_has_xmm) { + /* Handle SIMD FPU exceptions on PIII+ processors. */ + ignore_fpu_irq = 1; + simd_math_error((void __user *)regs->ip); + return; + } + /* + * Handle strange cache flush from user space exception + * in all other cases. This is undocumented behaviour. + */ + if (regs->flags & X86_VM_MASK) { + handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); + return; + } + current->thread.trap_no = 19; + current->thread.error_code = error_code; + die_if_kernel("cache flush denied", regs, error_code); + force_sig(SIGSEGV, current); +#else + if (!user_mode(regs) && + kernel_math_error(regs, "kernel simd math error", 19)) + return; + simd_math_error((void __user *)regs->ip); +#endif +} + +dotraplinkage void +do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) +{ + conditional_sti(regs); +#if 0 + /* No need to warn about this any longer. */ + printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); +#endif +} + +#ifdef CONFIG_X86_32 +unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) +{ + struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; + __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; + + /* Set up base for espfix segment */ + desc &= 0x00f0ff0000000000ULL; + desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | + ((((__u64)base) << 32) & 0xff00000000000000ULL) | + ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | + (lim_pages & 0xffff); + *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; + + return new_kesp; +} +#else +asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) +{ +} + +asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) +{ +} +#endif + +/* + * 'math_state_restore()' saves the current math information in the + * old math state array, and gets the new ones from the current task + * + * Careful.. There are problems with IBM-designed IRQ13 behaviour. + * Don't touch unless you *really* know how it works. + * + * Must be called with kernel preemption disabled (in this case, + * local interrupts are disabled at the call-site in entry.S). + */ +asmlinkage void math_state_restore(void) +{ + struct thread_info *thread = current_thread_info(); + struct task_struct *tsk = thread->task; + + if (!tsk_used_math(tsk)) { + local_irq_enable(); + /* + * does a slab alloc which can sleep + */ + if (init_fpu(tsk)) { + /* + * ran out of memory! + */ + do_group_exit(SIGKILL); + return; + } + local_irq_disable(); + } + + clts(); /* Allow maths ops (or we recurse) */ +#ifdef CONFIG_X86_32 + restore_fpu(tsk); +#else + /* + * Paranoid restore. send a SIGSEGV if we fail to restore the state. + */ + if (unlikely(restore_fpu_checking(tsk))) { + stts(); + force_sig(SIGSEGV, tsk); + return; + } +#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ + tsk->fpu_counter++; +} +EXPORT_SYMBOL_GPL(math_state_restore); + +#ifndef CONFIG_MATH_EMULATION +asmlinkage void math_emulate(long arg) +{ + printk(KERN_EMERG + "math-emulation not enabled and no coprocessor found.\n"); + printk(KERN_EMERG "killing %s.\n", current->comm); + force_sig(SIGFPE, current); + schedule(); +} +#endif /* CONFIG_MATH_EMULATION */ + +dotraplinkage void __kprobes +do_device_not_available(struct pt_regs *regs, long error) +{ +#ifdef CONFIG_X86_32 + if (read_cr0() & X86_CR0_EM) { + conditional_sti(regs); + math_emulate(0); + } else { + math_state_restore(); /* interrupts still off */ + conditional_sti(regs); + } +#else + math_state_restore(); +#endif +} + +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_MCE +dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) +{ + conditional_sti(regs); + machine_check_vector(regs, error); +} +#endif + +dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) +{ + siginfo_t info; + local_irq_enable(); + + info.si_signo = SIGILL; + info.si_errno = 0; + info.si_code = ILL_BADSTK; + info.si_addr = 0; + if (notify_die(DIE_TRAP, "iret exception", + regs, error_code, 32, SIGILL) == NOTIFY_STOP) + return; + do_trap(32, SIGILL, "iret exception", regs, error_code, &info); +} +#endif + +void __init trap_init(void) +{ +#ifdef CONFIG_X86_32 + int i; +#endif + +#ifdef CONFIG_EISA + void __iomem *p = early_ioremap(0x0FFFD9, 4); + + if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) + EISA_bus = 1; + early_iounmap(p, 4); +#endif + + set_intr_gate(0, ÷_error); + set_intr_gate_ist(1, &debug, DEBUG_STACK); + set_intr_gate_ist(2, &nmi, NMI_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + /* int4 can be called from all */ + set_system_intr_gate(4, &overflow); + set_intr_gate(5, &bounds); + set_intr_gate(6, &invalid_op); + set_intr_gate(7, &device_not_available); +#ifdef CONFIG_X86_32 + set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); +#else + set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); +#endif + set_intr_gate(9, &coprocessor_segment_overrun); + set_intr_gate(10, &invalid_TSS); + set_intr_gate(11, &segment_not_present); + set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); + set_intr_gate(13, &general_protection); + set_intr_gate(14, &page_fault); + set_intr_gate(15, &spurious_interrupt_bug); + set_intr_gate(16, &coprocessor_error); + set_intr_gate(17, &alignment_check); +#ifdef CONFIG_X86_MCE + set_intr_gate_ist(18, &machine_check, MCE_STACK); +#endif + set_intr_gate(19, &simd_coprocessor_error); + +#ifdef CONFIG_IA32_EMULATION + set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +#endif + +#ifdef CONFIG_X86_32 + if (cpu_has_fxsr) { + printk(KERN_INFO "Enabling fast FPU save and restore... "); + set_in_cr4(X86_CR4_OSFXSR); + printk("done.\n"); + } + if (cpu_has_xmm) { + printk(KERN_INFO + "Enabling unmasked SIMD FPU exception support... "); + set_in_cr4(X86_CR4_OSXMMEXCPT); + printk("done.\n"); + } + + set_system_trap_gate(SYSCALL_VECTOR, &system_call); + + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + + set_bit(SYSCALL_VECTOR, used_vectors); +#endif + /* + * Should be a barrier for any external CPU state: + */ + cpu_init(); + +#ifdef CONFIG_X86_32 + trap_init_hook(); +#endif +} diff --git a/arch/x86/kernel/traps_32.c b/arch/x86/kernel/traps_32.c deleted file mode 100644 index ffb131f74f78..000000000000 --- a/arch/x86/kernel/traps_32.c +++ /dev/null @@ -1,1071 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * Handle hardware traps and faults. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_EISA -#include -#include -#endif - -#ifdef CONFIG_MCA -#include -#endif - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_X86_64 -#include -#include -#include -#else -#include -#include -#include -#include -#include -#include - -#include "cpu/mcheck/mce.h" - -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -#endif - -static int ignore_nmis; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - dec_preempt_count(); -} - -#ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); - - return 0; -} -#endif - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { - /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. - * On nmi (interrupt 2), do_trap should not be called. - */ - if (trapnr < 6) - goto vm86_trap; - goto trap_signal; - } -#endif - - if (!user_mode(regs)) - goto kernel_trap; - -#ifdef CONFIG_X86_32 -trap_signal: -#endif - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - -#ifdef CONFIG_X86_64 - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } -#endif - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif -} - -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} -#endif - -dotraplinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; -#endif - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - - i = 2000; - while (--i) - udelay(1000); - - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); -#endif - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -#endif -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ -#ifdef CONFIG_KPROBES - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -#ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. - * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - * - * May run on IST stack. - */ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - int si_code; - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -#ifdef CONFIG_X86_32 -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -#ifdef CONFIG_X86_64 -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} -#endif - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; -#endif - - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - simd_math_error((void __user *)regs->ip); -#endif -} - -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#else -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). - */ -asmlinkage void math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } -#endif - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -#ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) -{ -#ifdef CONFIG_X86_32 - if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); - } -#else - math_state_restore(); -#endif -} - -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) -{ - siginfo_t info; - local_irq_enable(); - - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = 0; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); -} -#endif - -void __init trap_init(void) -{ -#ifdef CONFIG_X86_32 - int i; -#endif - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - -#ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - - set_bit(SYSCALL_VECTOR, used_vectors); -#endif - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); - -#ifdef CONFIG_X86_32 - trap_init_hook(); -#endif -} diff --git a/arch/x86/kernel/traps_64.c b/arch/x86/kernel/traps_64.c deleted file mode 100644 index 60ecc855ab81..000000000000 --- a/arch/x86/kernel/traps_64.c +++ /dev/null @@ -1,1070 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - * - * Pentium III FXSR, SSE support - * Gareth Hughes , May 2000 - */ - -/* - * Handle hardware traps and faults. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef CONFIG_EISA -#include -#include -#endif - -#ifdef CONFIG_MCA -#include -#endif - -#if defined(CONFIG_EDAC) -#include -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#ifdef CONFIG_X86_64 -#include -#include -#include -#else -#include -#include -#include -#include -#include - -#include "cpu/mcheck/mce.h" - -DECLARE_BITMAP(used_vectors, NR_VECTORS); -EXPORT_SYMBOL_GPL(used_vectors); - -asmlinkage int system_call(void); - -/* Do we ignore FPU interrupts ? */ -char ignore_fpu_irq; - -/* - * The IDT has to be page-aligned to simplify the Pentium - * F0 0F bug workaround.. We have a special link segment - * for this. - */ -gate_desc idt_table[256] - __attribute__((__section__(".data.idt"))) = { { { { 0, 0 } } }, }; -#endif - -static int ignore_nmis; - -static inline void conditional_sti(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_sti(struct pt_regs *regs) -{ - inc_preempt_count(); - if (regs->flags & X86_EFLAGS_IF) - local_irq_enable(); -} - -static inline void preempt_conditional_cli(struct pt_regs *regs) -{ - if (regs->flags & X86_EFLAGS_IF) - local_irq_disable(); - dec_preempt_count(); -} - -#ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} - -/* - * Perform the lazy TSS's I/O bitmap copy. If the TSS has an - * invalid offset set (the LAZY one) and the faulting thread has - * a valid I/O bitmap pointer, we copy the I/O bitmap in the TSS, - * we set the offset field correctly and return 1. - */ -static int lazy_iobitmap_copy(void) -{ - struct thread_struct *thread; - struct tss_struct *tss; - int cpu; - - cpu = get_cpu(); - tss = &per_cpu(init_tss, cpu); - thread = ¤t->thread; - - if (tss->x86_tss.io_bitmap_base == INVALID_IO_BITMAP_OFFSET_LAZY && - thread->io_bitmap_ptr) { - memcpy(tss->io_bitmap, thread->io_bitmap_ptr, - thread->io_bitmap_max); - /* - * If the previously set map was extending to higher ports - * than the current one, pad extra space with 0xff (no access). - */ - if (thread->io_bitmap_max < tss->io_bitmap_max) { - memset((char *) tss->io_bitmap + - thread->io_bitmap_max, 0xff, - tss->io_bitmap_max - thread->io_bitmap_max); - } - tss->io_bitmap_max = thread->io_bitmap_max; - tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; - tss->io_bitmap_owner = thread; - put_cpu(); - - return 1; - } - put_cpu(); - - return 0; -} -#endif - -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, - long error_code, siginfo_t *info) -{ - struct task_struct *tsk = current; - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { - /* - * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. - * On nmi (interrupt 2), do_trap should not be called. - */ - if (trapnr < 6) - goto vm86_trap; - goto trap_signal; - } -#endif - - if (!user_mode(regs)) - goto kernel_trap; - -#ifdef CONFIG_X86_32 -trap_signal: -#endif - /* - * We want error_code and trap_no set for userspace faults and - * kernelspace faults which result in die(), but not - * kernelspace faults which are fixed up. die() gives the - * process no chance to handle the signal and notice the - * kernel fault information, so that won't result in polluting - * the information about previously queued, but not yet - * delivered, faults. See also do_general_protection below. - */ - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - -#ifdef CONFIG_X86_64 - if (show_unhandled_signals && unhandled_signal(tsk, signr) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] trap %s ip:%lx sp:%lx error:%lx", - tsk->comm, tsk->pid, str, - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } -#endif - - if (info) - force_sig_info(signr, info, tsk); - else - force_sig(signr, tsk); - return; - -kernel_trap: - if (!fixup_exception(regs)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_no = trapnr; - die(str, regs, error_code); - } - return; - -#ifdef CONFIG_X86_32 -vm86_trap: - if (handle_vm86_trap((struct kernel_vm86_regs *) regs, - error_code, trapnr)) - goto trap_signal; - return; -#endif -} - -#define DO_ERROR(trapnr, signr, str, name) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, NULL); \ -} - -#define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr) \ -dotraplinkage void do_##name(struct pt_regs *regs, long error_code) \ -{ \ - siginfo_t info; \ - info.si_signo = signr; \ - info.si_errno = 0; \ - info.si_code = sicode; \ - info.si_addr = (void __user *)siaddr; \ - if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) \ - == NOTIFY_STOP) \ - return; \ - conditional_sti(regs); \ - do_trap(trapnr, signr, str, regs, error_code, &info); \ -} - -DO_ERROR_INFO(0, SIGFPE, "divide error", divide_error, FPE_INTDIV, regs->ip) -DO_ERROR(4, SIGSEGV, "overflow", overflow) -DO_ERROR(5, SIGSEGV, "bounds", bounds) -DO_ERROR_INFO(6, SIGILL, "invalid opcode", invalid_op, ILL_ILLOPN, regs->ip) -DO_ERROR(9, SIGFPE, "coprocessor segment overrun", coprocessor_segment_overrun) -DO_ERROR(10, SIGSEGV, "invalid TSS", invalid_TSS) -DO_ERROR(11, SIGBUS, "segment not present", segment_not_present) -#ifdef CONFIG_X86_32 -DO_ERROR(12, SIGBUS, "stack segment", stack_segment) -#endif -DO_ERROR_INFO(17, SIGBUS, "alignment check", alignment_check, BUS_ADRALN, 0) - -#ifdef CONFIG_X86_64 -/* Runs on IST stack */ -dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code) -{ - if (notify_die(DIE_TRAP, "stack segment", regs, error_code, - 12, SIGBUS) == NOTIFY_STOP) - return; - preempt_conditional_sti(regs); - do_trap(12, SIGBUS, "stack segment", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) -{ - static const char str[] = "double fault"; - struct task_struct *tsk = current; - - /* Return not checked because double check cannot be ignored */ - notify_die(DIE_TRAP, str, regs, error_code, 8, SIGSEGV); - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 8; - - /* This is always a kernel trap and never fixable (and thus must - never return). */ - for (;;) - die(str, regs, error_code); -} -#endif - -dotraplinkage void __kprobes -do_general_protection(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk; - - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (lazy_iobitmap_copy()) { - /* restart the faulting instruction */ - return; - } - - if (regs->flags & X86_VM_MASK) - goto gp_in_vm86; -#endif - - tsk = current; - if (!user_mode(regs)) - goto gp_in_kernel; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - - if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && - printk_ratelimit()) { - printk(KERN_INFO - "%s[%d] general protection ip:%lx sp:%lx error:%lx", - tsk->comm, task_pid_nr(tsk), - regs->ip, regs->sp, error_code); - print_vma_addr(" in ", regs->ip); - printk("\n"); - } - - force_sig(SIGSEGV, tsk); - return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: - local_irq_enable(); - handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - return; -#endif - -gp_in_kernel: - if (fixup_exception(regs)) - return; - - tsk->thread.error_code = error_code; - tsk->thread.trap_no = 13; - if (notify_die(DIE_GPF, "general protection fault", regs, - error_code, 13, SIGSEGV) == NOTIFY_STOP) - return; - die("general protection fault", regs, error_code); -} - -static notrace __kprobes void -mem_parity_error(unsigned char reason, struct pt_regs *regs) -{ - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG - "You have some hardware problem, likely on the PCI bus.\n"); - -#if defined(CONFIG_EDAC) - if (edac_handler_set()) { - edac_atomic_assert_error(); - return; - } -#endif - - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); - - /* Clear and disable the memory parity error line. */ - reason = (reason & 0xf) | 4; - outb(reason, 0x61); -} - -static notrace __kprobes void -io_check_error(unsigned char reason, struct pt_regs *regs) -{ - unsigned long i; - - printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); - show_registers(regs); - - /* Re-enable the IOCK line, wait for a few seconds */ - reason = (reason & 0xf) | 8; - outb(reason, 0x61); - - i = 2000; - while (--i) - udelay(1000); - - reason &= ~8; - outb(reason, 0x61); -} - -static notrace __kprobes void -unknown_nmi_error(unsigned char reason, struct pt_regs *regs) -{ - if (notify_die(DIE_NMIUNKNOWN, "nmi", regs, reason, 2, SIGINT) == - NOTIFY_STOP) - return; -#ifdef CONFIG_MCA - /* - * Might actually be able to figure out what the guilty party - * is: - */ - if (MCA_bus) { - mca_handle_nmi(); - return; - } -#endif - printk(KERN_EMERG - "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", - reason, smp_processor_id()); - - printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); - if (panic_on_unrecovered_nmi) - panic("NMI: Not continuing"); - - printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); -} - -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - -static notrace __kprobes void default_do_nmi(struct pt_regs *regs) -{ - unsigned char reason = 0; - int cpu; - - cpu = smp_processor_id(); - - /* Only the BSP gets external NMIs from the system. */ - if (!cpu) - reason = get_nmi_reason(); - - if (!(reason & 0xc0)) { - if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) - return; -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Ok, so this is none of the documented NMI sources, - * so it must be the NMI watchdog. - */ - if (nmi_watchdog_tick(regs, reason)) - return; - if (!do_nmi_callback(regs, cpu)) - unknown_nmi_error(reason, regs); -#else - unknown_nmi_error(reason, regs); -#endif - - return; - } - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) - return; - - /* AK: following checks seem to be broken on modern chipsets. FIXME */ - if (reason & 0x80) - mem_parity_error(reason, regs); - if (reason & 0x40) - io_check_error(reason, regs); -#ifdef CONFIG_X86_32 - /* - * Reassert NMI in case it became active meanwhile - * as it's edge-triggered: - */ - reassert_nmi(); -#endif -} - -dotraplinkage notrace __kprobes void -do_nmi(struct pt_regs *regs, long error_code) -{ - nmi_enter(); - -#ifdef CONFIG_X86_32 - { int cpu; cpu = smp_processor_id(); ++nmi_count(cpu); } -#else - add_pda(__nmi_count, 1); -#endif - - if (!ignore_nmis) - default_do_nmi(regs); - - nmi_exit(); -} - -void stop_nmi(void) -{ - acpi_nmi_disable(); - ignore_nmis++; -} - -void restart_nmi(void) -{ - ignore_nmis--; - acpi_nmi_enable(); -} - -/* May run on IST stack. */ -dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) -{ -#ifdef CONFIG_KPROBES - if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif - - preempt_conditional_sti(regs); - do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); - preempt_conditional_cli(regs); -} - -#ifdef CONFIG_X86_64 -/* Help handler running on IST stack to switch back to user stack - for scheduling or signal handling. The actual stack switch is done in - entry.S */ -asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) -{ - struct pt_regs *regs = eregs; - /* Did already sync */ - if (eregs == (struct pt_regs *)eregs->sp) - ; - /* Exception from user space */ - else if (user_mode(eregs)) - regs = task_pt_regs(current); - /* Exception from kernel and interrupts are enabled. Move to - kernel process stack. */ - else if (eregs->flags & X86_EFLAGS_IF) - regs = (struct pt_regs *)(eregs->sp -= sizeof(struct pt_regs)); - if (eregs != regs) - *regs = *eregs; - return regs; -} -#endif - -/* - * Our handling of the processor debug registers is non-trivial. - * We do not clear them on entry and exit from the kernel. Therefore - * it is possible to get a watchpoint trap here from inside the kernel. - * However, the code in ./ptrace.c has ensured that the user can - * only set watchpoints on userspace addresses. Therefore the in-kernel - * watchpoint trap can only occur in code which is reading/writing - * from user space. Such code must not hold kernel locks (since it - * can equally take a page fault), therefore it is safe to call - * force_sig_info even though that claims and releases locks. - * - * Code in ./signal.c ensures that the debug control register - * is restored before we deliver any signal, and therefore that - * user code runs with the correct debug control register even though - * we clear it here. - * - * Being careful here means that we don't have to be as careful in a - * lot of more complicated places (task switching can be a bit lazy - * about restoring all the debug state, and ptrace doesn't have to - * find every occurrence of the TF bit that could be saved away even - * by user code) - * - * May run on IST stack. - */ -dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) -{ - struct task_struct *tsk = current; - unsigned long condition; - int si_code; - - get_debugreg(condition, 6); - - /* - * The processor cleared BTF, so don't mark that we need it set. - */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; - - if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, - SIGTRAP) == NOTIFY_STOP) - return; - - /* It's safe to allow irq's after DR6 has been saved */ - preempt_conditional_sti(regs); - - /* Mask out spurious debug traps due to lazy DR7 setting */ - if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { - if (!tsk->thread.debugreg7) - goto clear_dr7; - } - -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) - goto debug_vm86; -#endif - - /* Save debug status register where ptrace can see it */ - tsk->thread.debugreg6 = condition; - - /* - * Single-stepping through TF: make sure we ignore any events in - * kernel space (but re-enable TF when returning to user mode). - */ - if (condition & DR_STEP) { - if (!user_mode(regs)) - goto clear_TF_reenable; - } - - si_code = get_si_code(condition); - /* Ok, finally something we can handle */ - send_sigtrap(tsk, regs, error_code, si_code); - - /* - * Disable additional traps. They'll be re-enabled when - * the signal is delivered. - */ -clear_dr7: - set_debugreg(0, 7); - preempt_conditional_cli(regs); - return; - -#ifdef CONFIG_X86_32 -debug_vm86: - handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); - preempt_conditional_cli(regs); - return; -#endif - -clear_TF_reenable: - set_tsk_thread_flag(tsk, TIF_SINGLESTEP); - regs->flags &= ~X86_EFLAGS_TF; - preempt_conditional_cli(regs); - return; -} - -#ifdef CONFIG_X86_64 -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} -#endif - -/* - * Note that we play around with the 'TS' bit in an attempt to get - * the correct behaviour even in the presence of the asynchronous - * IRQ13 behaviour - */ -void math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short cwd, swd; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); - switch (swd & ~cwd & 0x3f) { - case 0x000: /* No unmasked exception */ -#ifdef CONFIG_X86_32 - return; -#endif - default: /* Multiple exceptions */ - break; - case 0x001: /* Invalid Op */ - /* - * swd & 0x240 == 0x040: Stack Underflow - * swd & 0x240 == 0x240: Stack Overflow - * User must clear the SF bit (0x40) if set - */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - ignore_fpu_irq = 1; -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; -#endif - - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); -} - -dotraplinkage void -do_simd_coprocessor_error(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - simd_math_error((void __user *)regs->ip); -#endif -} - -dotraplinkage void -do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) -{ - conditional_sti(regs); -#if 0 - /* No need to warn about this any longer. */ - printk(KERN_INFO "Ignoring P6 Local APIC Spurious Interrupt Bug...\n"); -#endif -} - -#ifdef CONFIG_X86_32 -unsigned long patch_espfix_desc(unsigned long uesp, unsigned long kesp) -{ - struct desc_struct *gdt = get_cpu_gdt_table(smp_processor_id()); - unsigned long base = (kesp - uesp) & -THREAD_SIZE; - unsigned long new_kesp = kesp - base; - unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; - __u64 desc = *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS]; - - /* Set up base for espfix segment */ - desc &= 0x00f0ff0000000000ULL; - desc |= ((((__u64)base) << 16) & 0x000000ffffff0000ULL) | - ((((__u64)base) << 32) & 0xff00000000000000ULL) | - ((((__u64)lim_pages) << 32) & 0x000f000000000000ULL) | - (lim_pages & 0xffff); - *(__u64 *)&gdt[GDT_ENTRY_ESPFIX_SS] = desc; - - return new_kesp; -} -#else -asmlinkage void __attribute__((weak)) smp_thermal_interrupt(void) -{ -} - -asmlinkage void __attribute__((weak)) mce_threshold_interrupt(void) -{ -} -#endif - -/* - * 'math_state_restore()' saves the current math information in the - * old math state array, and gets the new ones from the current task - * - * Careful.. There are problems with IBM-designed IRQ13 behaviour. - * Don't touch unless you *really* know how it works. - * - * Must be called with kernel preemption disabled (in this case, - * local interrupts are disabled at the call-site in entry.S). - */ -asmlinkage void math_state_restore(void) -{ - struct thread_info *thread = current_thread_info(); - struct task_struct *tsk = thread->task; - - if (!tsk_used_math(tsk)) { - local_irq_enable(); - /* - * does a slab alloc which can sleep - */ - if (init_fpu(tsk)) { - /* - * ran out of memory! - */ - do_group_exit(SIGKILL); - return; - } - local_irq_disable(); - } - - clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else - /* - * Paranoid restore. send a SIGSEGV if we fail to restore the state. - */ - if (unlikely(restore_fpu_checking(tsk))) { - stts(); - force_sig(SIGSEGV, tsk); - return; - } -#endif - thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ - tsk->fpu_counter++; -} -EXPORT_SYMBOL_GPL(math_state_restore); - -#ifndef CONFIG_MATH_EMULATION -asmlinkage void math_emulate(long arg) -{ - printk(KERN_EMERG - "math-emulation not enabled and no coprocessor found.\n"); - printk(KERN_EMERG "killing %s.\n", current->comm); - force_sig(SIGFPE, current); - schedule(); -} -#endif /* CONFIG_MATH_EMULATION */ - -dotraplinkage void __kprobes -do_device_not_available(struct pt_regs *regs, long error) -{ -#ifdef CONFIG_X86_32 - if (read_cr0() & X86_CR0_EM) { - conditional_sti(regs); - math_emulate(0); - } else { - math_state_restore(); /* interrupts still off */ - conditional_sti(regs); - } -#else - math_state_restore(); -#endif -} - -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - -dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) -{ - siginfo_t info; - local_irq_enable(); - - info.si_signo = SIGILL; - info.si_errno = 0; - info.si_code = ILL_BADSTK; - info.si_addr = 0; - if (notify_die(DIE_TRAP, "iret exception", - regs, error_code, 32, SIGILL) == NOTIFY_STOP) - return; - do_trap(32, SIGILL, "iret exception", regs, error_code, &info); -} -#endif - -void __init trap_init(void) -{ -#ifdef CONFIG_X86_32 - int i; -#endif - -#ifdef CONFIG_EISA - void __iomem *p = early_ioremap(0x0FFFD9, 4); - - if (readl(p) == 'E' + ('I'<<8) + ('S'<<16) + ('A'<<24)) - EISA_bus = 1; - early_iounmap(p, 4); -#endif - - set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); - set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); - /* int4 can be called from all */ - set_system_intr_gate(4, &overflow); - set_intr_gate(5, &bounds); - set_intr_gate(6, &invalid_op); - set_intr_gate(7, &device_not_available); -#ifdef CONFIG_X86_32 - set_task_gate(8, GDT_ENTRY_DOUBLEFAULT_TSS); -#else - set_intr_gate_ist(8, &double_fault, DOUBLEFAULT_STACK); -#endif - set_intr_gate(9, &coprocessor_segment_overrun); - set_intr_gate(10, &invalid_TSS); - set_intr_gate(11, &segment_not_present); - set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); - set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); - set_intr_gate(15, &spurious_interrupt_bug); - set_intr_gate(16, &coprocessor_error); - set_intr_gate(17, &alignment_check); -#ifdef CONFIG_X86_MCE - set_intr_gate_ist(18, &machine_check, MCE_STACK); -#endif - set_intr_gate(19, &simd_coprocessor_error); - -#ifdef CONFIG_IA32_EMULATION - set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); -#endif - -#ifdef CONFIG_X86_32 - if (cpu_has_fxsr) { - printk(KERN_INFO "Enabling fast FPU save and restore... "); - set_in_cr4(X86_CR4_OSFXSR); - printk("done.\n"); - } - if (cpu_has_xmm) { - printk(KERN_INFO - "Enabling unmasked SIMD FPU exception support... "); - set_in_cr4(X86_CR4_OSXMMEXCPT); - printk("done.\n"); - } - - set_system_trap_gate(SYSCALL_VECTOR, &system_call); - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - - set_bit(SYSCALL_VECTOR, used_vectors); -#endif - /* - * Should be a barrier for any external CPU state: - */ - cpu_init(); - -#ifdef CONFIG_X86_32 - trap_init_hook(); -#endif -} -- cgit v1.2.2 From dd6e4eba1c03c9562fced21736453396c045f869 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:40 +0200 Subject: dumpstack: x86: move die_nmi to dumpstack_32.c For some reason die_nmi is still defined in traps.c for i386, but is found in dumpstack_64.c for x86_64. Move it to dumpstack_32.c Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/traps.c | 37 ------------------------------------- 2 files changed, 36 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c398b27df6cd..dc9ca7ee1c47 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -403,6 +403,42 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, SIGSEGV); } +static DEFINE_SPINLOCK(nmi_print_lock); + +void notrace __kprobes +die_nmi(char *str, struct pt_regs *regs, int do_panic) +{ + if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) + return; + + spin_lock(&nmi_print_lock); + /* + * We are in trouble anyway, lets at least try + * to get a message out: + */ + bust_spinlocks(1); + printk(KERN_EMERG "%s", str); + printk(" on CPU%d, ip %08lx, registers:\n", + smp_processor_id(), regs->ip); + show_registers(regs); + if (do_panic) + panic("Non maskable interrupt"); + console_silent(); + spin_unlock(&nmi_print_lock); + bust_spinlocks(0); + + /* + * If we are in kernel we are probably nested up pretty bad + * and might aswell get out now while we still can: + */ + if (!user_mode_vm(regs)) { + current->thread.trap_no = 2; + crash_kexec(regs); + } + + do_exit(SIGSEGV); +} + static int __init kstack_setup(char *s) { kstack_depth_to_print = simple_strtoul(s, NULL, 0); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ffb131f74f78..e062974cce34 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -429,43 +429,6 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); } -#ifdef CONFIG_X86_32 -static DEFINE_SPINLOCK(nmi_print_lock); - -void notrace __kprobes die_nmi(char *str, struct pt_regs *regs, int do_panic) -{ - if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP) - return; - - spin_lock(&nmi_print_lock); - /* - * We are in trouble anyway, lets at least try - * to get a message out: - */ - bust_spinlocks(1); - printk(KERN_EMERG "%s", str); - printk(" on CPU%d, ip %08lx, registers:\n", - smp_processor_id(), regs->ip); - show_registers(regs); - if (do_panic) - panic("Non maskable interrupt"); - console_silent(); - spin_unlock(&nmi_print_lock); - bust_spinlocks(0); - - /* - * If we are in kernel we are probably nested up pretty bad - * and might aswell get out now while we still can: - */ - if (!user_mode_vm(regs)) { - current->thread.trap_no = 2; - crash_kexec(regs); - } - - do_exit(SIGSEGV); -} -#endif - static notrace __kprobes void default_do_nmi(struct pt_regs *regs) { unsigned char reason = 0; -- cgit v1.2.2 From 161827903bdc124655f4cd976b9f0a5ac6ebf21c Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:41 +0200 Subject: dumpstack: x86: make printk_address equal - x86_64: use %p to print an address - make i386-version the same as the above The result should be the same on x86_64; on i386 the output only changes if CONFIG_KALLSYMS is turned off, in which case the address is printed twice. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 27 ++------------------------- arch/x86/kernel/dumpstack_64.c | 4 ++-- 2 files changed, 4 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index dc9ca7ee1c47..4e67a005c7a1 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -23,31 +23,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { -#ifdef CONFIG_KALLSYMS - unsigned long offset = 0; - unsigned long symsize; - const char *symname; - char *modname; - char *delim = ":"; - char namebuf[KSYM_NAME_LEN]; - char reliab[4] = ""; - - symname = kallsyms_lookup(address, &symsize, &offset, - &modname, namebuf); - if (!symname) { - printk(" [<%08lx>]\n", address); - return; - } - if (!reliable) - strcpy(reliab, "? "); - - if (!modname) - modname = delim = ""; - printk(" [<%08lx>] %s%s%s%s%s+0x%lx/0x%lx\n", - address, reliab, delim, modname, delim, symname, offset, symsize); -#else - printk(" [<%08lx>]\n", address); -#endif + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); } static inline int valid_stack_ptr(struct thread_info *tinfo, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 6f1505074db0..563554c90686 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -23,8 +23,8 @@ static int die_counter; void printk_address(unsigned long address, int reliable) { - printk(" [<%016lx>] %s%pS\n", - address, reliable ? "" : "? ", (void *) address); + printk(" [<%p>] %s%pS\n", (void *) address, + reliable ? "" : "? ", (void *) address); } static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, -- cgit v1.2.2 From 3a18512db00e0eedca86e3db4d2e81f8fe0b1774 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:42 +0200 Subject: dumpstack: x86: add "end" parameter to valid_stack_ptr and print_context_stack - Add "end" parameter to valid_stack_ptr and print_context_stack - use sizeof(long) as the size of a word on the stack Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 19 +++++++++++++------ arch/x86/kernel/dumpstack_64.c | 2 +- 2 files changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 4e67a005c7a1..466d55dfeb87 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -28,10 +28,16 @@ void printk_address(unsigned long address, int reliable) } static inline int valid_stack_ptr(struct thread_info *tinfo, - void *p, unsigned int size) + void *p, unsigned int size, void *end) { void *t = tinfo; - return p > t && p <= t + THREAD_SIZE - size; + if (end) { + if (p < end && p >= (end-THREAD_SIZE)) + return 1; + else + return 0; + } + return p > t && p < t + THREAD_SIZE - size; } /* The form of the top of the frame on the stack */ @@ -43,16 +49,17 @@ struct stack_frame { static inline unsigned long print_context_stack(struct thread_info *tinfo, unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) + const struct stacktrace_ops *ops, void *data, + unsigned long *end) { struct stack_frame *frame = (struct stack_frame *)bp; - while (valid_stack_ptr(tinfo, stack, sizeof(*stack))) { + while (valid_stack_ptr(tinfo, stack, sizeof(*stack), end)) { unsigned long addr; addr = *stack; if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 4) { + if ((unsigned long) stack == bp + sizeof(long)) { ops->address(data, addr, 1); frame = frame->next_frame; bp = (unsigned long) frame; @@ -96,7 +103,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); - bp = print_context_stack(context, stack, bp, ops, data); + bp = print_context_stack(context, stack, bp, ops, data, NULL); /* * Should be after the line below, but somewhere * in early boot context comes out corrupted and we diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 563554c90686..361afa8864b4 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -141,7 +141,7 @@ print_context_stack(struct thread_info *tinfo, addr = *stack; if (__kernel_text_address(addr)) { - if ((unsigned long) stack == bp + 8) { + if ((unsigned long) stack == bp + sizeof(long)) { ops->address(data, addr, 1); frame = frame->next_frame; bp = (unsigned long) frame; -- cgit v1.2.2 From 2ac53721f37c79acddaf60f6ff232f56b7abddba Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:43 +0200 Subject: dumptrace: x86: consistently include loglevel, print stack switch - i386 and x86_64: always printk the 'data' parameter - i386: announce stack switch (irq -> normal) - i386: check if there is a stack switch before announcing it There is a warning that 'context' might come out corrupt in early boot. If this is true it should be fixed, not worked around. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 17 ++++++----------- arch/x86/kernel/dumpstack_64.c | 9 +++++++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 466d55dfeb87..517b507bc56a 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -104,16 +104,12 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, context = (struct thread_info *) ((unsigned long)stack & (~(THREAD_SIZE - 1))); bp = print_context_stack(context, stack, bp, ops, data, NULL); - /* - * Should be after the line below, but somewhere - * in early boot context comes out corrupted and we - * can't reference it: - */ - if (ops->stack(data, "IRQ") < 0) - break; + stack = (unsigned long *)context->previous_esp; if (!stack) break; + if (ops->stack(data, "IRQ") < 0) + break; touch_nmi_watchdog(); } } @@ -134,6 +130,7 @@ static void print_trace_warning(void *data, char *msg) static int print_trace_stack(void *data, char *name) { + printk("%s <%s> ", (char *)data, name); return 0; } @@ -142,11 +139,9 @@ static int print_trace_stack(void *data, char *name) */ static void print_trace_address(void *data, unsigned long addr, int reliable) { - printk("%s [<%08lx>] ", (char *)data, addr); - if (!reliable) - printk("? "); - print_symbol("%s\n", addr); touch_nmi_watchdog(); + printk(data); + printk_address(addr, reliable); } static const struct stacktrace_ops print_trace_ops = { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 361afa8864b4..521c833cdc3b 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -247,24 +247,29 @@ EXPORT_SYMBOL(dump_trace); static void print_trace_warning_symbol(void *data, char *msg, unsigned long symbol) { + printk(data); print_symbol(msg, symbol); printk("\n"); } static void print_trace_warning(void *data, char *msg) { - printk("%s\n", msg); + printk("%s%s\n", (char *)data, msg); } static int print_trace_stack(void *data, char *name) { - printk(" <%s> ", name); + printk("%s <%s> ", (char *)data, name); return 0; } +/* + * Print one address/symbol entries per line. + */ static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); + printk(data); printk_address(addr, reliable); } -- cgit v1.2.2 From ca0a816403c53411bb6b6fb8bf60cef30695b09d Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:44 +0200 Subject: dumpstack: x86: use log_lvl and unify trace formatting - x86: Write log_lvl strings if available - start raw stack dumps on new line - i386: Remove extra indentation for raw stack dumps Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 18 +++++++++--------- arch/x86/kernel/dumpstack_64.c | 8 ++++---- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 517b507bc56a..09bc3330d557 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -155,8 +155,8 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) { + printk("%sCall Trace:\n", log_lvl); dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); - printk("%s =======================\n", log_lvl); } void show_trace(struct task_struct *task, struct pt_regs *regs, @@ -184,17 +184,16 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; if (i && ((i % 8) == 0)) - printk("\n%s ", log_lvl); - printk("%08lx ", *stack++); + printk("\n%s", log_lvl); + printk(" %08lx", *stack++); + touch_nmi_watchdog(); } - printk("\n%sCall Trace:\n", log_lvl); - + printk("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); } void show_stack(struct task_struct *task, unsigned long *sp) { - printk(" "); show_stack_log_lvl(task, NULL, sp, 0, ""); } @@ -229,7 +228,7 @@ void show_registers(struct pt_regs *regs) print_modules(); __show_regs(regs, 0); - printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), current_thread_info(), current, task_thread_info(current)); /* @@ -242,8 +241,9 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk("\n" KERN_EMERG "Stack: "); - show_stack_log_lvl(NULL, regs, ®s->sp, 0, KERN_EMERG); + printk(KERN_EMERG "Stack:\n"); + show_stack_log_lvl(NULL, regs, ®s->sp, + 0, KERN_EMERG); printk(KERN_EMERG "Code: "); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 521c833cdc3b..7fd32944ceac 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -284,7 +284,7 @@ static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, char *log_lvl) { - printk("Call Trace:\n"); + printk("%sCall Trace:\n", log_lvl); dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); } @@ -330,7 +330,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, break; } if (i && ((i % 4) == 0)) - printk("\n"); + printk("\n%s", log_lvl); printk(" %016lx", *stack++); touch_nmi_watchdog(); } @@ -388,9 +388,9 @@ void show_registers(struct pt_regs *regs) unsigned char c; u8 *ip; - printk("Stack: "); + printk(KERN_EMERG "Stack:\n"); show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - regs->bp, ""); + regs->bp, KERN_EMERG); printk(KERN_EMERG "Code: "); -- cgit v1.2.2 From 802a67de0cbd1ef10df80ad48b840e2103b13e52 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:45 +0200 Subject: dumpstack: i386: make kstack= an early boot-param and add oops=panic - make kstack= and early_param - add oops=panic, setting panic_on_oops Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 09bc3330d557..9a8920abe4ba 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -418,13 +418,24 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) do_exit(SIGSEGV); } +static int __init oops_setup(char *s) +{ + if (!s) + return -EINVAL; + if (!strcmp(s, "panic")) + panic_on_oops = 1; + return 0; +} +early_param("oops", oops_setup); + static int __init kstack_setup(char *s) { + if (!s) + return -EINVAL; kstack_depth_to_print = simple_strtoul(s, NULL, 0); - - return 1; + return 0; } -__setup("kstack=", kstack_setup); +early_param("kstack", kstack_setup); static int __init code_bytes_setup(char *s) { -- cgit v1.2.2 From 8a541665b9063c5006b370d4488cf9f6beb6083f Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sat, 4 Oct 2008 23:12:46 +0200 Subject: dumpstack: x86: various small unification steps - define STACKSLOTS_PER_LINE and use it - define get_bp macro to hide the %%ebp/%%rbp difference - i386: check task==NULL in dump_trace, like x86_64 - i386: show_trace(NULL, ...) uses current automatically - x86_64: use [#%d] for die_counter, like i386 - whitespace and comments Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 23 +++++++++++------------ arch/x86/kernel/dumpstack_64.c | 15 +++++++++------ 2 files changed, 20 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 9a8920abe4ba..201ee359a1a9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,11 @@ #include +#define STACKSLOTS_PER_LINE 8 +#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) + int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 24; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static unsigned int code_bytes = 64; static int die_counter; @@ -82,7 +85,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!stack) { unsigned long dummy; stack = &dummy; - if (task != current) + if (task && task != current) stack = (unsigned long *)task->thread.sp; } @@ -90,7 +93,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm("movl %%ebp, %0" : "=r" (bp) :); + get_bp(bp); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -167,7 +170,7 @@ void show_trace(struct task_struct *task, struct pt_regs *regs, static void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) + unsigned long *sp, unsigned long bp, char *log_lvl) { unsigned long *stack; int i; @@ -183,7 +186,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; - if (i && ((i % 8) == 0)) + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) printk("\n%s", log_lvl); printk(" %08lx", *stack++); touch_nmi_watchdog(); @@ -207,7 +210,7 @@ void dump_stack(void) #ifdef CONFIG_FRAME_POINTER if (!bp) - asm("movl %%ebp, %0" : "=r" (bp):); + get_bp(bp); #endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", @@ -215,8 +218,7 @@ void dump_stack(void) init_utsname()->release, (int)strcspn(init_utsname()->version, " "), init_utsname()->version); - - show_trace(current, NULL, &stack, bp); + show_trace(NULL, NULL, &stack, bp); } EXPORT_SYMBOL(dump_stack); @@ -249,7 +251,7 @@ void show_registers(struct pt_regs *regs) ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at EIP */ + /* try starting at IP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } @@ -317,13 +319,10 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) if (kexec_should_crash(current)) crash_kexec(regs); - if (in_interrupt()) panic("Fatal exception in interrupt"); - if (panic_on_oops) panic("Fatal exception"); - oops_exit(); do_exit(signr); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 7fd32944ceac..13379a988292 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,8 +16,11 @@ #include +#define STACKSLOTS_PER_LINE 4 +#define get_bp(bp) asm("movl %%rbp, %0" : "=r" (bp) :) + int panic_on_unrecovered_nmi; -int kstack_depth_to_print = 12; +int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static unsigned int code_bytes = 64; static int die_counter; @@ -177,7 +180,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, if (!bp) { if (task == current) { /* Grab bp right from our regs */ - asm("movq %%rbp, %0" : "=r" (bp) : ); + get_bp(bp); } else { /* bp is the last reg pushed by switch_to */ bp = *(unsigned long *) task->thread.sp; @@ -329,7 +332,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (((long) stack & (THREAD_SIZE-1)) == 0) break; } - if (i && ((i % 4) == 0)) + if (i && ((i % STACKSLOTS_PER_LINE) == 0)) printk("\n%s", log_lvl); printk(" %016lx", *stack++); touch_nmi_watchdog(); @@ -353,7 +356,7 @@ void dump_stack(void) #ifdef CONFIG_FRAME_POINTER if (!bp) - asm("movq %%rbp, %0" : "=r" (bp) : ); + get_bp(bp); #endif printk("Pid: %d, comm: %.20s %s %s %.*s\n", @@ -396,7 +399,7 @@ void show_registers(struct pt_regs *regs) ip = (u8 *)regs->ip - code_prologue; if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - /* try starting at RIP */ + /* try starting at IP */ ip = (u8 *)regs->ip; code_len = code_len - code_prologue + 1; } @@ -475,7 +478,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) int __kprobes __die(const char *str, struct pt_regs *regs, long err) { - printk(KERN_EMERG "%s: %04lx [%u] ", str, err & 0xffff, ++die_counter); + printk(KERN_EMERG "%s: %04lx [#%d] ", str, err & 0xffff, ++die_counter); #ifdef CONFIG_PREEMPT printk("PREEMPT "); #endif -- cgit v1.2.2 From 649c6653fa94ec8f3ea32b19c97b790ec4e8e4ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Oct 2008 16:52:24 +0200 Subject: x86: improve UP kernel when CPU-hotplug and SMP is enabled num_possible_cpus() can be > 1 when disabled CPUs have been accounted. Disabled CPUs are not in the cpu_present_map, so we can use num_present_cpus() as a safe indicator to switch to UP alternatives. Reported-by: Chuck Ebbert Signed-off-by: Thomas Gleixner Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index fb04e49776ba..a84ac7b570e6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -444,7 +444,7 @@ void __init alternative_instructions(void) _text, _etext); /* Only switch to UP mode if we don't immediately boot others */ - if (num_possible_cpus() == 1 || setup_max_cpus <= 1) + if (num_present_cpus() == 1 || setup_max_cpus <= 1) alternatives_smp_switch(0); } #endif -- cgit v1.2.2 From b807305059c28fb8197496c944bfaa6b372a40ad Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 5 Oct 2008 17:12:36 +0200 Subject: x86: remove additional_cpus configurability additional_cpus= parameter is dangerous and broken: for example if we boot additional_cpus=-2 on a stock dual-core system it will crash the box on bootup. So reduce the maze of code a bit by removingthe user-configurability angle. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 18 ------------------ arch/x86/kernel/smpboot.c | 8 +------- 2 files changed, 1 insertion(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index adaca6fa927d..fc8351f374fd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1438,24 +1438,6 @@ config HOTPLUG_CPU automatically on SMP systems. ) Say N if you want to disable CPU hotplug. -config HOTPLUG_RESTRICT_TO_BOOTUP_CPUS - def_bool n - prompt "Restrict CPU hotplugging to processors found during boot" if HOTPLUG_CPU - ---help--- - Say no here to use the default, which allows as many CPUs as are marked - "disabled" by ACPI or MPTABLES to be hotplugged after bootup. - - Say yes if you do not want to allow CPUs to be added after booting, for - example if you only need CPU hotplugging enabled for suspend/resume. - - If CPU_HOTPLUG is enabled this value may be overridden at boot time - with the "additional_cpus" kernel parameter. - -config HOTPLUG_ADDITIONAL_CPUS - int - default 0 if !HOTPLUG_CPU || HOTPLUG_RESTRICT_TO_BOOTUP_CPUS - default -1 - config COMPAT_VDSO def_bool y prompt "Compat VDSO support" diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 857a88bb9195..8dd201c31329 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,7 +1261,7 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus __initdata = CONFIG_HOTPLUG_ADDITIONAL_CPUS; +static int additional_cpus = -1; /* * cpu_possible_map should be static, it cannot change as cpu's @@ -1334,12 +1334,6 @@ static void remove_siblinginfo(int cpu) cpu_clear(cpu, cpu_sibling_setup_map); } -static __init int setup_additional_cpus(char *s) -{ - return s && get_option(&s, &additional_cpus) ? 0 : -EINVAL; -} -early_param("additional_cpus", setup_additional_cpus); - static void __ref remove_cpu_from_maps(int cpu) { cpu_clear(cpu, cpu_online_map); -- cgit v1.2.2 From cb48bb59995d2d14a0c732835c80bbcfb354de31 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Oct 2008 17:51:52 +0200 Subject: x86: remove additional_cpus remove remainder of additional_cpus logic. We now just listen to the disabled_cpus value like we did for years. disabled_cpus is always >= 0 so no need for an extra check. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8dd201c31329..8c3aca7cb343 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1261,8 +1261,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus) check_nmi_watchdog(); } -static int additional_cpus = -1; - /* * cpu_possible_map should be static, it cannot change as cpu's * are onlined, or offlined. The reason is per-cpu data-structures @@ -1282,21 +1280,13 @@ static int additional_cpus = -1; */ __init void prefill_possible_map(void) { - int i; - int possible; + int i, possible; /* no processor from mptable or madt */ if (!num_processors) num_processors = 1; - if (additional_cpus == -1) { - if (disabled_cpus > 0) - additional_cpus = disabled_cpus; - else - additional_cpus = 0; - } - - possible = num_processors + additional_cpus; + possible = num_processors + disabled_cpus; if (possible > NR_CPUS) possible = NR_CPUS; -- cgit v1.2.2 From 6a2ae2d9f9212de47c16e23080162aada882c8b6 Mon Sep 17 00:00:00 2001 From: Alexander van Heukelum Date: Sun, 5 Oct 2008 12:39:36 +0200 Subject: dumpstack: x86: various small unification steps, fix After "dumpstack: x86: various small unification steps", the assembler gives the following compile error. The error is in dumpstack_64.c. {standard input}: Assembler messages: {standard input}:720: Error: Incorrect register `%rbx' used with `l' suffix {standard input}:1340: Error: Incorrect register `%r12' used with `l' suffix Indeed the suffix in get_bp() was wrong. Signed-off-by: Alexander van Heukelum Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 13379a988292..086cc8118e39 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -17,7 +17,7 @@ #include #define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movl %%rbp, %0" : "=r" (bp) :) +#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) int panic_on_unrecovered_nmi; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; -- cgit v1.2.2 From 79aa10dd9fb0f896ce358314fdba20606c038864 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 29 Aug 2008 12:50:38 +0100 Subject: x86: adjust dependencies for CONFIG_X86_CMOV Signed-off-by: Jan Beulich Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index f4bdc0492d38..0b7c4a3f0651 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -397,7 +397,7 @@ config X86_CMPXCHG64 # generates cmov. config X86_CMOV def_bool y - depends on (MK7 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || X86_64) + depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64) config X86_MINIMUM_CPU_FAMILY int -- cgit v1.2.2 From c1a2f4b10852ce68e70f7e4c53600c36cc63ea45 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 14 Sep 2008 02:33:12 -0700 Subject: x86: change early_ioremap to use slots instead of nesting so we could remove the requirement that one needs to call early_iounmap() in exactly reverse order of early_ioremap(). Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/ioremap.c | 77 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 57 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 8876220d906b..e4c43ec71b29 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -616,16 +616,22 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx) __early_set_fixmap(idx, 0, __pgprot(0)); } - -static int __initdata early_ioremap_nested; - +static void *prev_map[FIX_BTMAPS_SLOTS] __initdata; +static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; static int __init check_early_ioremap_leak(void) { - if (!early_ioremap_nested) + int count = 0; + int i; + + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) + if (prev_map[i]) + count++; + + if (!count) return 0; WARN(1, KERN_WARNING "Debug warning: early ioremap leak of %d areas detected.\n", - early_ioremap_nested); + count); printk(KERN_WARNING "please boot with early_ioremap_debug and report the dmesg.\n"); @@ -636,15 +642,30 @@ late_initcall(check_early_ioremap_leak); static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, pgprot_t prot) { unsigned long offset, last_addr; - unsigned int nrpages, nesting; + unsigned int nrpages; enum fixed_addresses idx0, idx; + int i, slot; WARN_ON(system_state != SYSTEM_BOOTING); - nesting = early_ioremap_nested; + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (!prev_map[i]) { + slot = i; + break; + } + } + + if (slot < 0) { + printk(KERN_INFO "early_iomap(%08lx, %08lx) not found slot\n", + phys_addr, size); + WARN_ON(1); + return NULL; + } + if (early_ioremap_debug) { printk(KERN_INFO "early_ioremap(%08lx, %08lx) [%d] => ", - phys_addr, size, nesting); + phys_addr, size, slot); dump_stack(); } @@ -655,11 +676,7 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, return NULL; } - if (nesting >= FIX_BTMAPS_NESTING) { - WARN_ON(1); - return NULL; - } - early_ioremap_nested++; + prev_size[slot] = size; /* * Mappings have to be page-aligned */ @@ -679,7 +696,7 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, /* * Ok, go for it.. */ - idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx0 = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; idx = idx0; while (nrpages > 0) { early_set_fixmap(idx, phys_addr, prot); @@ -690,7 +707,8 @@ static void __init *__early_ioremap(unsigned long phys_addr, unsigned long size, if (early_ioremap_debug) printk(KERN_CONT "%08lx + %08lx\n", offset, fix_to_virt(idx0)); - return (void *) (offset + fix_to_virt(idx0)); + prev_map[slot] = (void *) (offset + fix_to_virt(idx0)); + return prev_map[slot]; } /* Remap an IO device */ @@ -711,15 +729,33 @@ void __init early_iounmap(void *addr, unsigned long size) unsigned long offset; unsigned int nrpages; enum fixed_addresses idx; - int nesting; + int i, slot; + + slot = -1; + for (i = 0; i < FIX_BTMAPS_SLOTS; i++) { + if (prev_map[i] == addr) { + slot = i; + break; + } + } - nesting = --early_ioremap_nested; - if (WARN_ON(nesting < 0)) + if (slot < 0) { + printk(KERN_INFO "early_iounmap(%p, %08lx) not found slot\n", + addr, size); + WARN_ON(1); + return; + } + + if (prev_size[slot] != size) { + printk(KERN_INFO "early_iounmap(%p, %08lx) [%d] size not consistent %08lx\n", + addr, size, slot, prev_size[slot]); + WARN_ON(1); return; + } if (early_ioremap_debug) { printk(KERN_INFO "early_iounmap(%p, %08lx) [%d]\n", addr, - size, nesting); + size, slot); dump_stack(); } @@ -731,12 +767,13 @@ void __init early_iounmap(void *addr, unsigned long size) offset = virt_addr & ~PAGE_MASK; nrpages = PAGE_ALIGN(offset + size - 1) >> PAGE_SHIFT; - idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*nesting; + idx = FIX_BTMAP_BEGIN - NR_FIX_BTMAPS*slot; while (nrpages > 0) { early_clear_fixmap(idx); --idx; --nrpages; } + prev_map[slot] = 0; } void __this_fixmap_does_not_exist(void) -- cgit v1.2.2 From 891cffbd6bcba26409869c19c07ecd4bfc0c2460 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 12 Oct 2008 13:16:12 -0700 Subject: x86/mm: do not trigger a kernel warning if user-space disables interrupts and generates a page fault Arjan reported a spike in the following bug pattern in v2.6.27: http://www.kerneloops.org/searchweek.php?search=lock_page which happens because hwclock started triggering warnings due to a (correct) might_sleep() check in the MM code. The warning occurs because hwclock uses this dubious sequence of code to run "atomic" code: static unsigned long atomic(const char *name, unsigned long (*op)(unsigned long), unsigned long arg) { unsigned long v; __asm__ volatile ("cli"); v = (*op)(arg); __asm__ volatile ("sti"); return v; } Then it pagefaults in that "atomic" section, triggering the warning. There is no way the kernel could provide "atomicity" in this path, a page fault is a cannot-continue machine event so the kernel has to wait for the page to be filled in. Even if it was just a minor fault we'd have to take locks and might have to spend quite a bit of time with interrupts disabled - not nice to irq latencies in general. So instead just enable interrupts in the pagefault path unconditionally if we come from user-space, and handle the fault. Also, while touching this code, unify some trivial parts of the x86 VM paths at the same time. Signed-off-by: Linus Torvalds Reported-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a742d753d5b0..ac2ad781da00 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -645,24 +645,23 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) } -#ifdef CONFIG_X86_32 - /* It's safe to allow irq's after cr2 has been saved and the vmalloc - fault has been handled. */ - if (regs->flags & (X86_EFLAGS_IF | X86_VM_MASK)) - local_irq_enable(); - /* - * If we're in an interrupt, have no user context or are running in an - * atomic region then we must not take the fault. + * It's safe to allow irq's after cr2 has been saved and the + * vmalloc fault has been handled. + * + * User-mode registers count as a user access even for any + * potential system fault or CPU buglet. */ - if (in_atomic() || !mm) - goto bad_area_nosemaphore; -#else /* CONFIG_X86_64 */ - if (likely(regs->flags & X86_EFLAGS_IF)) + if (user_mode_vm(regs)) { + local_irq_enable(); + error_code |= PF_USER; + } else if (regs->flags & X86_EFLAGS_IF) local_irq_enable(); +#ifdef CONFIG_X86_64 if (unlikely(error_code & PF_RSVD)) pgtable_bad(address, regs, error_code); +#endif /* * If we're in an interrupt, have no user context or are running in an @@ -671,14 +670,7 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(in_atomic() || !mm)) goto bad_area_nosemaphore; - /* - * User-mode registers count as a user access even for any - * potential system fault or CPU buglet. - */ - if (user_mode_vm(regs)) - error_code |= PF_USER; again: -#endif /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an -- cgit v1.2.2 From 3a1dfe6eefe483589c99c909202ffe1a20d589b5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 13 Oct 2008 17:49:02 +0200 Subject: x86/mm: unify init task OOM handling Linus noticed that the "again:" versus "survive:" OOM logic for the init task was arbitrarily different. The 64-bit codepath is the better one, because it correctly re-lookups the vma after having dropped the ->mmap_sem. Signed-off-by: Ingo Molnar Acked-by: Linus Torvalds --- arch/x86/mm/fault.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ac2ad781da00..8bc5956e1af4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -671,7 +671,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code) goto bad_area_nosemaphore; again: - /* When running in the kernel we expect faults to occur only to + /* + * When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an * erroneous fault occurring in a code path which already holds mmap_sem @@ -734,9 +735,6 @@ good_area: goto bad_area; } -#ifdef CONFIG_X86_32 -survive: -#endif /* * If for any reason at all we couldn't handle the fault, * make sure we exit gracefully rather than endlessly redo @@ -871,12 +869,11 @@ out_of_memory: up_read(&mm->mmap_sem); if (is_global_init(tsk)) { yield(); -#ifdef CONFIG_X86_32 - down_read(&mm->mmap_sem); - goto survive; -#else + /* + * Re-lookup the vma - in theory the vma tree might + * have changed: + */ goto again; -#endif } printk("VM: killing process %s\n", tsk->comm); -- cgit v1.2.2 From 5d4488027d9cf3161c71566dfabb116bf69ab4d9 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 18 Aug 2008 14:49:47 +0200 Subject: oprofile: drop const in num counters field allow to modify it at runtime Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_x86_model.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c3..3d3b85d3c257 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -34,8 +34,8 @@ struct pt_regs; struct op_x86_model_spec { int (*init)(struct oprofile_operations *ops); void (*exit)(void); - unsigned int const num_counters; - unsigned int const num_controls; + unsigned int num_counters; + unsigned int num_controls; void (*fill_in_addresses)(struct op_msrs * const msrs); void (*setup_ctrs)(struct op_msrs const * const msrs); int (*check_ctrs)(struct pt_regs * const regs, -- cgit v1.2.2 From f645f6406463a01869c50844befc76d528971690 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 20 Aug 2008 17:22:02 +0200 Subject: oprofile: Don't report Nehalem as core_2 This essentially reverts Linus' earlier 4b9f12a3779c548b68bc9af7d94030868ad3aa1b commit. Nehalem is not core_2, so it shouldn't be reported as such. However with the earlier arch perfmon patch it will fall back to arch perfmon mode now, so there is no need to fake it as core_2. The only drawback is that Linus will need to patch the arch perfmon support into his oprofile binary now, but I think he can do that. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 57f6c9088081..1059f3fe6b1d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -420,9 +420,6 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; - case 26: - *cpu_type = "i386/core_2"; - break; default: /* Unknown */ return 0; -- cgit v1.2.2 From b99170288421c79f0c2efa8b33e26e65f4bb7fb8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 18 Aug 2008 14:50:31 +0200 Subject: oprofile: Implement Intel architectural perfmon support Newer Intel CPUs (Core1+) have support for architectural events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. The advantage of this is that it can be done without knowing about the specific CPU, because the CPU describes by itself what performance events are supported. This is only a fallback because only a limited set of 6 events are supported. This allows to do profiling on Nehalem and on Atom systems (later not tested) This patch implements support for that in oprofile's Intel Family 6 profiling module. It also has the advantage of supporting an arbitary number of events now as reported by the CPU. Also allow arbitary counter widths >32bit while we're at it. Requires a patched oprofile userland to support the new architecture. v2: update for latest oprofile tree remove force_arch_perfmon Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 23 ++++++--- arch/x86/oprofile/op_model_ppro.c | 104 ++++++++++++++++++++++++++++++-------- arch/x86/oprofile/op_x86_model.h | 3 ++ 3 files changed, 102 insertions(+), 28 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1059f3fe6b1d..12d6f85084f1 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -429,6 +429,16 @@ static int __init ppro_init(char **cpu_type) return 1; } +static int __init arch_perfmon_init(char **cpu_type) +{ + if (!cpu_has_arch_perfmon) + return 0; + *cpu_type = "i386/arch_perfmon"; + model = &op_arch_perfmon_spec; + arch_perfmon_setup_counters(); + return 1; +} + /* in order to get sysfs right */ static int using_nmi; @@ -436,7 +446,7 @@ int __init op_nmi_init(struct oprofile_operations *ops) { __u8 vendor = boot_cpu_data.x86_vendor; __u8 family = boot_cpu_data.x86; - char *cpu_type; + char *cpu_type = NULL; int ret = 0; if (!cpu_has_apic) @@ -474,19 +484,20 @@ int __init op_nmi_init(struct oprofile_operations *ops) switch (family) { /* Pentium IV */ case 0xf: - if (!p4_init(&cpu_type)) - return -ENODEV; + p4_init(&cpu_type); break; /* A P6-class processor */ case 6: - if (!ppro_init(&cpu_type)) - return -ENODEV; + ppro_init(&cpu_type); break; default: - return -ENODEV; + break; } + + if (!cpu_type && !arch_perfmon_init(&cpu_type)) + return -ENODEV; break; default: diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57b..12e207a67f1b 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -1,32 +1,34 @@ /* * @file op_model_ppro.h - * pentium pro / P6 model-specific MSR operations + * Family 6 perfmon and architectural perfmon MSR operations * * @remark Copyright 2002 OProfile authors + * @remark Copyright 2008 Intel Corporation * @remark Read the file COPYING * * @author John Levon * @author Philippe Elie * @author Graydon Hoare + * @author Andi Kleen */ #include +#include #include #include #include #include +#include #include "op_x86_model.h" #include "op_counter.h" -#define NUM_COUNTERS 2 -#define NUM_CONTROLS 2 +static int num_counters = 2; +static int counter_width = 32; #define CTR_IS_RESERVED(msrs, c) (msrs->counters[(c)].addr ? 1 : 0) #define CTR_READ(l, h, msrs, c) do {rdmsr(msrs->counters[(c)].addr, (l), (h)); } while (0) -#define CTR_32BIT_WRITE(l, msrs, c) \ - do {wrmsr(msrs->counters[(c)].addr, -(u32)(l), 0); } while (0) -#define CTR_OVERFLOWED(n) (!((n) & (1U<<31))) +#define CTR_OVERFLOWED(n) (!((n) & (1U<<(counter_width-1)))) #define CTRL_IS_RESERVED(msrs, c) (msrs->controls[(c)].addr ? 1 : 0) #define CTRL_READ(l, h, msrs, c) do {rdmsr((msrs->controls[(c)].addr), (l), (h)); } while (0) @@ -40,20 +42,20 @@ #define CTRL_SET_UM(val, m) (val |= (m << 8)) #define CTRL_SET_EVENT(val, e) (val |= e) -static unsigned long reset_value[NUM_COUNTERS]; +static u64 *reset_value; static void ppro_fill_in_addresses(struct op_msrs * const msrs) { int i; - for (i = 0; i < NUM_COUNTERS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; else msrs->counters[i].addr = 0; } - for (i = 0; i < NUM_CONTROLS; i++) { + for (i = 0; i < num_counters; i++) { if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; else @@ -67,8 +69,22 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) unsigned int low, high; int i; + if (!reset_value) { + reset_value = kmalloc(sizeof(unsigned) * num_counters, + GFP_ATOMIC); + if (!reset_value) + return; + } + + if (cpu_has_arch_perfmon) { + union cpuid10_eax eax; + eax.full = cpuid_eax(0xa); + if (counter_width < eax.split.bit_width) + counter_width = eax.split.bit_width; + } + /* clear all counters */ - for (i = 0 ; i < NUM_CONTROLS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (unlikely(!CTRL_IS_RESERVED(msrs, i))) continue; CTRL_READ(low, high, msrs, i); @@ -77,18 +93,18 @@ static void ppro_setup_ctrs(struct op_msrs const * const msrs) } /* avoid a false detection of ctr overflows in NMI handler */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (unlikely(!CTR_IS_RESERVED(msrs, i))) continue; - CTR_32BIT_WRITE(1, msrs, i); + wrmsrl(msrs->counters[i].addr, -1LL); } /* enable active counters */ - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if ((counter_config[i].enabled) && (CTR_IS_RESERVED(msrs, i))) { reset_value[i] = counter_config[i].count; - CTR_32BIT_WRITE(counter_config[i].count, msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); CTRL_READ(low, high, msrs, i); CTRL_CLEAR(low); @@ -111,13 +127,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, unsigned int low, high; int i; - for (i = 0 ; i < NUM_COUNTERS; ++i) { + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; CTR_READ(low, high, msrs, i); if (CTR_OVERFLOWED(low)) { oprofile_add_sample(regs, i); - CTR_32BIT_WRITE(reset_value[i], msrs, i); + wrmsrl(msrs->counters[i].addr, -reset_value[i]); } } @@ -141,7 +157,7 @@ static void ppro_start(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (reset_value[i]) { CTRL_READ(low, high, msrs, i); CTRL_SET_ACTIVE(low); @@ -156,7 +172,7 @@ static void ppro_stop(struct op_msrs const * const msrs) unsigned int low, high; int i; - for (i = 0; i < NUM_COUNTERS; ++i) { + for (i = 0; i < num_counters; ++i) { if (!reset_value[i]) continue; CTRL_READ(low, high, msrs, i); @@ -169,21 +185,65 @@ static void ppro_shutdown(struct op_msrs const * const msrs) { int i; - for (i = 0 ; i < NUM_COUNTERS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTR_IS_RESERVED(msrs, i)) release_perfctr_nmi(MSR_P6_PERFCTR0 + i); } - for (i = 0 ; i < NUM_CONTROLS ; ++i) { + for (i = 0 ; i < num_counters ; ++i) { if (CTRL_IS_RESERVED(msrs, i)) release_evntsel_nmi(MSR_P6_EVNTSEL0 + i); } + if (reset_value) { + kfree(reset_value); + reset_value = NULL; + } } struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, + .num_counters = 2, + .num_controls = 2, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown +}; + +/* + * Architectural performance monitoring. + * + * Newer Intel CPUs (Core1+) have support for architectural + * events described in CPUID 0xA. See the IA32 SDM Vol3b.18 for details. + * The advantage of this is that it can be done without knowing about + * the specific CPU. + */ + +void arch_perfmon_setup_counters(void) +{ + union cpuid10_eax eax; + + eax.full = cpuid_eax(0xa); + + /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ + if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && + current_cpu_data.x86_model == 15) { + eax.split.version_id = 2; + eax.split.num_counters = 2; + eax.split.bit_width = 40; + } + + num_counters = eax.split.num_counters; + + op_arch_perfmon_spec.num_counters = num_counters; + op_arch_perfmon_spec.num_controls = num_counters; +} + +struct op_x86_model_spec op_arch_perfmon_spec = { + /* num_counters/num_controls filled in at runtime */ .fill_in_addresses = &ppro_fill_in_addresses, + /* user space does the cpuid check for available events */ .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, .start = &ppro_start, diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 3d3b85d3c257..0b601893a4df 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -49,5 +49,8 @@ extern struct op_x86_model_spec const op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; +extern struct op_x86_model_spec op_arch_perfmon_spec; + +extern void arch_perfmon_setup_counters(void); #endif /* OP_X86_MODEL_H */ -- cgit v1.2.2 From 59512900baab03c5629f2ff5efad1d5d4e682ece Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 29 Sep 2008 22:23:33 +0200 Subject: oprofile: discover counters for op ppro too Discover number of counters for all family 6 models even when not in arch perfmon mode. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_ppro.c | 8 +++++--- arch/x86/oprofile/op_x86_model.h | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 12e207a67f1b..f5a226823e94 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -200,9 +200,9 @@ static void ppro_shutdown(struct op_msrs const * const msrs) } -struct op_x86_model_spec const op_ppro_spec = { - .num_counters = 2, - .num_controls = 2, +struct op_x86_model_spec op_ppro_spec = { + .num_counters = 2, /* can be overriden */ + .num_controls = 2, /* dito */ .fill_in_addresses = &ppro_fill_in_addresses, .setup_ctrs = &ppro_setup_ctrs, .check_ctrs = &ppro_check_ctrs, @@ -238,6 +238,8 @@ void arch_perfmon_setup_counters(void) op_arch_perfmon_spec.num_counters = num_counters; op_arch_perfmon_spec.num_controls = num_counters; + op_ppro_spec.num_counters = num_counters; + op_ppro_spec.num_controls = num_counters; } struct op_x86_model_spec op_arch_perfmon_spec = { diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 0b601893a4df..596de7a5559d 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -45,7 +45,7 @@ struct op_x86_model_spec { void (*shutdown)(struct op_msrs const * const msrs); }; -extern struct op_x86_model_spec const op_ppro_spec; +extern struct op_x86_model_spec op_ppro_spec; extern struct op_x86_model_spec const op_p4_spec; extern struct op_x86_model_spec const op_p4_ht2_spec; extern struct op_x86_model_spec const op_amd_spec; -- cgit v1.2.2 From 611b1597680dd4a57896bcca1af0484be463c55e Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Mon, 21 Jul 2008 18:49:56 +0300 Subject: x86: fix mmiotrace 8-bit register decoding When SIL, DIL, BPL or SPL registers were used in MMIO, the datum was extracted from AH, BH, CH, or DH, which are incorrect. Signed-off-by: Pekka Paalanen Cc: "Vegard Nossum" Cc: "Steven Rostedt" Cc: proski@gnu.org Cc: "Pekka Enberg" Signed-off-by: Ingo Molnar --- arch/x86/mm/pf_in.c | 121 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 84 insertions(+), 37 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index efa1911e20ca..df3d5c861cda 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -79,25 +79,34 @@ static unsigned int mw32[] = { 0xC7 }; static unsigned int mw64[] = { 0x89, 0x8B }; #endif /* not __i386__ */ -static int skip_prefix(unsigned char *addr, int *shorted, int *enlarged, - int *rexr) +struct prefix_bits { + unsigned shorted:1; + unsigned enlarged:1; + unsigned rexr:1; + unsigned rex:1; +}; + +static int skip_prefix(unsigned char *addr, struct prefix_bits *prf) { int i; unsigned char *p = addr; - *shorted = 0; - *enlarged = 0; - *rexr = 0; + prf->shorted = 0; + prf->enlarged = 0; + prf->rexr = 0; + prf->rex = 0; restart: for (i = 0; i < ARRAY_SIZE(prefix_codes); i++) { if (*p == prefix_codes[i]) { if (*p == 0x66) - *shorted = 1; + prf->shorted = 1; #ifdef __amd64__ if ((*p & 0xf8) == 0x48) - *enlarged = 1; + prf->enlarged = 1; if ((*p & 0xf4) == 0x44) - *rexr = 1; + prf->rexr = 1; + if ((*p & 0xf0) == 0x40) + prf->rex = 1; #endif p++; goto restart; @@ -135,12 +144,12 @@ enum reason_type get_ins_type(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int shorted, enlarged, rexr; + struct prefix_bits prf; int i; enum reason_type rv = OTHERS; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); CHECK_OP_TYPE(opcode, reg_rop, REG_READ); @@ -156,10 +165,11 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(rw8); i++) @@ -168,7 +178,7 @@ static unsigned int get_ins_reg_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(rw32); i++) if (rw32[i] == opcode) - return (shorted ? 2 : (enlarged ? 8 : 4)); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -178,10 +188,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) { unsigned int opcode; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(mw8); i++) @@ -194,11 +205,11 @@ unsigned int get_ins_mem_width(unsigned long ins_addr) for (i = 0; i < ARRAY_SIZE(mw32); i++) if (mw32[i] == opcode) - return shorted ? 2 : 4; + return prf.shorted ? 2 : 4; for (i = 0; i < ARRAY_SIZE(mw64); i++) if (mw64[i] == opcode) - return shorted ? 2 : (enlarged ? 8 : 4); + return prf.shorted ? 2 : (prf.enlarged ? 8 : 4); printk(KERN_ERR "mmiotrace: Unknown opcode 0x%02x\n", opcode); return 0; @@ -238,7 +249,7 @@ enum { #endif }; -static unsigned char *get_reg_w8(int no, struct pt_regs *regs) +static unsigned char *get_reg_w8(int no, int rex, struct pt_regs *regs) { unsigned char *rv = NULL; @@ -255,18 +266,6 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) case arg_DL: rv = (unsigned char *)®s->dx; break; - case arg_AH: - rv = 1 + (unsigned char *)®s->ax; - break; - case arg_BH: - rv = 1 + (unsigned char *)®s->bx; - break; - case arg_CH: - rv = 1 + (unsigned char *)®s->cx; - break; - case arg_DH: - rv = 1 + (unsigned char *)®s->dx; - break; #ifdef __amd64__ case arg_R8: rv = (unsigned char *)®s->r8; @@ -294,9 +293,55 @@ static unsigned char *get_reg_w8(int no, struct pt_regs *regs) break; #endif default: - printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); break; } + + if (rv) + return rv; + + if (rex) { + /* + * If REX prefix exists, access low bytes of SI etc. + * instead of AH etc. + */ + switch (no) { + case arg_SI: + rv = (unsigned char *)®s->si; + break; + case arg_DI: + rv = (unsigned char *)®s->di; + break; + case arg_BP: + rv = (unsigned char *)®s->bp; + break; + case arg_SP: + rv = (unsigned char *)®s->sp; + break; + default: + break; + } + } else { + switch (no) { + case arg_AH: + rv = 1 + (unsigned char *)®s->ax; + break; + case arg_BH: + rv = 1 + (unsigned char *)®s->bx; + break; + case arg_CH: + rv = 1 + (unsigned char *)®s->cx; + break; + case arg_DH: + rv = 1 + (unsigned char *)®s->dx; + break; + default: + break; + } + } + + if (!rv) + printk(KERN_ERR "mmiotrace: Error reg no# %d\n", no); + return rv; } @@ -368,11 +413,12 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char mod_rm; int reg; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) if (reg_rop[i] == opcode) { @@ -392,10 +438,10 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) do_work: mod_rm = *p; - reg = ((mod_rm >> 3) & 0x7) | (rexr << 3); + reg = ((mod_rm >> 3) & 0x7) | (prf.rexr << 3); switch (get_ins_reg_width(ins_addr)) { case 1: - return *get_reg_w8(reg, regs); + return *get_reg_w8(reg, prf.rex, regs); case 2: return *(unsigned short *)get_reg_w32(reg, regs); @@ -422,11 +468,12 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char mod_rm; unsigned char mod; unsigned char *p; - int i, shorted, enlarged, rexr; + struct prefix_bits prf; + int i; unsigned long rv; p = (unsigned char *)ins_addr; - p += skip_prefix(p, &shorted, &enlarged, &rexr); + p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) if (imm_wop[i] == opcode) { -- cgit v1.2.2 From 8b1fa1d7b22f386747c7b78b918d4c680c16066f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 29 Jul 2008 12:36:02 +0200 Subject: ftrace: mark lapic_wd_event() notrace it can be called in the NMI path: [ 0.645999] calling ftrace_dynamic_init+0x0/0xd6 [ 0.647521] ------------[ cut here ]------------ [ 0.647521] WARNING: at kernel/trace/ftrace.c:348 ftrace_record_ip+0x4e/0x252() [ 0.647521] Modules linked in: [ 0.647521] Pid: 15, comm: kstop1 Not tainted 2.6.27-rc1-tip #22686 [ 0.647521] [ 0.647521] Call Trace: [ 0.647521] [] warn_on_slowpath+0x5d/0x84 [ 0.647521] [] ? lapic_wd_event+0xb/0x5c [ 0.647521] [] ftrace_record_ip+0x4e/0x252 [ 0.647521] [] mcount_call+0x5/0x31 [ 0.647521] [] ? lapic_wd_event+0x10/0x5c [ 0.647521] [] nmi_watchdog_tick+0x19d/0x1ad [ 0.647521] [] default_do_nmi+0x75/0x1e3 [ 0.647521] [] do_nmi+0x5d/0x94 [ 0.647521] [] nmi+0xa2/0xc2 [ 0.647521] [] ? check_bytes_and_report+0x11/0xcc [ 0.647521] <> [] ? mcount_call+0x5/0x31 [ 0.647521] [] check_object+0x61/0x1b0 [ 0.647521] [] __slab_free+0x169/0x2ae [ 0.647521] [] ? __cleanup_sighand+0x25/0x27 [ 0.647521] [] ? __cleanup_sighand+0x25/0x27 [ 0.647521] [] kmem_cache_free+0x85/0xb9 [ 0.647521] [] __cleanup_sighand+0x25/0x27 [ 0.647521] [] release_task+0x256/0x339 [ 0.647521] [] do_exit+0x764/0x7ef [ 0.647521] [] __xchg+0x0/0x38 [ 0.647521] [] ? stop_cpu+0x0/0xb2 [ 0.647521] [] ? stop_cpu+0x0/0xb2 [ 0.647521] [] kthread+0x4e/0x7b [ 0.647521] [] child_rip+0xa/0x11 [ 0.647521] [] ? restore_args+0x0/0x30 [ 0.647521] [] ? native_load_tls+0x14/0x2e [ 0.647521] [] ? kthread+0x0/0x7b [ 0.647521] [] ? child_rip+0x0/0x11 [ 0.647521] [ 0.647521] ---[ end trace 4eaa2a86a8e2da22 ]--- [ 0.672032] initcall ftrace_dynamic_init+0x0/0xd6 returned 0 after 19 msecs also mark it no-kprobes while at it. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perfctr-watchdog.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 6bff382094f5..9abd48b22674 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -17,6 +17,8 @@ #include #include #include +#include + #include #include @@ -336,7 +338,8 @@ static void single_msr_unreserve(void) release_perfctr_nmi(wd_ops->perfctr); } -static void single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes +single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* start the cycle over again */ write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz); @@ -401,7 +404,7 @@ static int setup_p6_watchdog(unsigned nmi_hz) return 1; } -static void p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { /* * P6 based Pentium M need to re-unmask @@ -605,7 +608,7 @@ static void p4_unreserve(void) release_perfctr_nmi(MSR_P4_IQ_PERFCTR0); } -static void p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) +static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz) { unsigned dummy; /* @@ -784,7 +787,7 @@ unsigned lapic_adjust_nmi_hz(unsigned hz) return hz; } -int lapic_wd_event(unsigned nmi_hz) +int __kprobes lapic_wd_event(unsigned nmi_hz) { struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 ctr; -- cgit v1.2.2 From e4b2b8866121bd06d5f3d9de0f79a04339ffa252 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:11 -0400 Subject: ftrace: enable using mcount recording on x86 Enable the use of the __mcount_loc infrastructure on x86_64 and i386. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index fc8351f374fd..5a34c5427a07 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -25,6 +25,7 @@ config X86 select HAVE_KPROBES select ARCH_WANT_OPTIONAL_GPIOLIB select HAVE_KRETPROBES + select HAVE_FTRACE_MCOUNT_RECORD select HAVE_DYNAMIC_FTRACE select HAVE_FTRACE select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64) -- cgit v1.2.2 From 0a37605c2261a06d8cafc62dee11374ad676c8c4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 15:45:12 -0400 Subject: ftrace: x86 mcount stub x86 now sets up the mcount locations through the build and no longer needs to record the ip when the function is executed. This patch changes the initial mcount to simply return. There's no need to do any other work. If the ftrace start up test fails, the original mcount will be what everything will use, so having this as fast as possible is a good thing. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 14 -------------- arch/x86/kernel/entry_64.S | 26 -------------------------- arch/x86/kernel/ftrace.c | 14 ++------------ 3 files changed, 2 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b21fbfaffe39..4e4269c73bb7 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1153,20 +1153,6 @@ ENDPROC(xen_failsafe_callback) #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - subl $MCOUNT_INSN_SIZE, %eax - -.globl mcount_call -mcount_call: - call ftrace_stub - - popl %edx - popl %ecx - popl %eax - ret END(mcount) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1db6ce4314e1..09e7145484c5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -64,32 +64,6 @@ #ifdef CONFIG_FTRACE #ifdef CONFIG_DYNAMIC_FTRACE ENTRY(mcount) - - subq $0x38, %rsp - movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - - movq 0x38(%rsp), %rdi - subq $MCOUNT_INSN_SIZE, %rdi - -.globl mcount_call -mcount_call: - call ftrace_stub - - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx - movq (%rsp), %rax - addq $0x38, %rsp - retq END(mcount) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ab115cd15fdf..96aadbfedcc6 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -112,18 +112,8 @@ notrace int ftrace_update_ftrace_func(ftrace_func_t func) notrace int ftrace_mcount_set(unsigned long *data) { - unsigned long ip = (long)(&mcount_call); - unsigned long *addr = data; - unsigned char old[MCOUNT_INSN_SIZE], *new; - - /* - * Replace the mcount stub with a pointer to the - * ip recorder function. - */ - memcpy(old, &mcount_call, MCOUNT_INSN_SIZE); - new = ftrace_call_replace(ip, *addr); - *addr = ftrace_modify_code(ip, old, new); - + /* mcount is initialized as a nop */ + *data = 0; return 0; } -- cgit v1.2.2 From 732f3ca7d4ba3c1be8d051d52302ef441ee7748b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Aug 2008 18:05:05 -0400 Subject: ftrace: use only 5 byte nops for x86 Mathieu Desnoyers revealed a bug in the original code. The nop that is used to relpace the mcount caller can be a two part nop. This runs the risk where a process can be preempted after executing the first nop, but before the second part of the nop. The ftrace code calls kstop_machine to keep multiple CPUs from executing code that is being modified, but it does not protect against a task preempting in the middle of a two part nop. If the above preemption happens and the tracer is enabled, after the kstop_machine runs, all those nops will be calls to the trace function. If the preempted process that was preempted between the two nops is executed again, it will execute half of the call to the trace function, and this might crash the system. This patch instead uses what both the latest Intel and AMD spec suggests. That is the P6_NOP5 sequence of "0x0f 0x1f 0x44 0x00 0x00". Note, some older CPUs and QEMU might fault on this nop, so this nop is executed with fault handling first. If it detects a fault, it will then use the code "0x66 0x66 0x66 0x66 0x90". If that faults, it will then default to a simple "jmp 1f; .byte 0x00 0x00 0x00; 1:". The jmp is not optimal but will do if the first two can not be executed. TODO: Examine the cpuid to determine the nop to use. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 68 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 96aadbfedcc6..4151c91254e8 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -16,8 +16,8 @@ #include #include -#include #include +#include /* Long is fine, even if it is only 4 bytes ;-) */ @@ -119,13 +119,67 @@ notrace int ftrace_mcount_set(unsigned long *data) int __init ftrace_dyn_arch_init(void *data) { - const unsigned char *const *noptable = find_nop_table(); - - /* This is running in kstop_machine */ - - ftrace_mcount_set(data); + extern const unsigned char ftrace_test_p6nop[]; + extern const unsigned char ftrace_test_nop5[]; + extern const unsigned char ftrace_test_jmp[]; + int faulted = 0; - ftrace_nop = (unsigned long *)noptable[MCOUNT_INSN_SIZE]; + /* + * There is no good nop for all x86 archs. + * We will default to using the P6_NOP5, but first we + * will test to make sure that the nop will actually + * work on this CPU. If it faults, we will then + * go to a lesser efficient 5 byte nop. If that fails + * we then just use a jmp as our nop. This isn't the most + * efficient nop, but we can not use a multi part nop + * since we would then risk being preempted in the middle + * of that nop, and if we enabled tracing then, it might + * cause a system crash. + * + * TODO: check the cpuid to determine the best nop. + */ + asm volatile ( + "jmp ftrace_test_jmp\n" + /* This code needs to stay around */ + ".section .text, \"ax\"\n" + "ftrace_test_jmp:" + "jmp ftrace_test_p6nop\n" + ".byte 0x00,0x00,0x00\n" /* 2 byte jmp + 3 bytes */ + "ftrace_test_p6nop:" + P6_NOP5 + "jmp 1f\n" + "ftrace_test_nop5:" + ".byte 0x66,0x66,0x66,0x66,0x90\n" + "jmp 1f\n" + ".previous\n" + "1:" + ".section .fixup, \"ax\"\n" + "2: movl $1, %0\n" + " jmp ftrace_test_nop5\n" + "3: movl $2, %0\n" + " jmp 1b\n" + ".previous\n" + _ASM_EXTABLE(ftrace_test_p6nop, 2b) + _ASM_EXTABLE(ftrace_test_nop5, 3b) + : "=r"(faulted) : "0" (faulted)); + + switch (faulted) { + case 0: + pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); + ftrace_nop = (unsigned long *)ftrace_test_p6nop; + break; + case 1: + pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); + ftrace_nop = (unsigned long *)ftrace_test_nop5; + break; + case 2: + pr_info("ftrace: converting mcount calls to jmp 1f\n"); + ftrace_nop = (unsigned long *)ftrace_test_jmp; + break; + } + + /* The return code is retured via data */ + *(unsigned long *)data = 0; return 0; } -- cgit v1.2.2 From 6f93fc076a464bfe24e8d4c5fea3f6ca5bdb264d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 20 Aug 2008 12:55:07 -0400 Subject: ftrace: x86 use copy to and from user functions The modification of code is performed either by kstop_machine, before SMP starts, or on module code before the module is executed. There is no reason to do the modifications from assembly. The copy to and from user functions are sufficient and produces cleaner and easier to read code. Thanks to Benjamin Herrenschmidt for suggesting the idea. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 4151c91254e8..082d99642622 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -60,11 +61,7 @@ notrace int ftrace_modify_code(unsigned long ip, unsigned char *old_code, unsigned char *new_code) { - unsigned replaced; - unsigned old = *(unsigned *)old_code; /* 4 bytes */ - unsigned new = *(unsigned *)new_code; /* 4 bytes */ - unsigned char newch = new_code[4]; - int faulted = 0; + unsigned char replaced[MCOUNT_INSN_SIZE]; /* * Note: Due to modules and __init, code can @@ -72,29 +69,20 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * as well as code changing. * * No real locking needed, this code is run through - * kstop_machine. + * kstop_machine, or before SMP starts. */ - asm volatile ( - "1: lock\n" - " cmpxchg %3, (%2)\n" - " jnz 2f\n" - " movb %b4, 4(%2)\n" - "2:\n" - ".section .fixup, \"ax\"\n" - "3: movl $1, %0\n" - " jmp 2b\n" - ".previous\n" - _ASM_EXTABLE(1b, 3b) - : "=r"(faulted), "=a"(replaced) - : "r"(ip), "r"(new), "c"(newch), - "0"(faulted), "a"(old) - : "memory"); - sync_core(); + if (__copy_from_user(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) + return 1; - if (replaced != old && replaced != new) - faulted = 2; + if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) + return 2; - return faulted; + WARN_ON_ONCE(__copy_to_user((char __user *)ip, new_code, + MCOUNT_INSN_SIZE)); + + sync_core(); + + return 0; } notrace int ftrace_update_ftrace_func(ftrace_func_t func) -- cgit v1.2.2 From bbe5c7830c6dbde58726d44ec0337bc8b2d95d37 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 21:54:16 +0300 Subject: x86 mmiotrace: fix a rare memory leak Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 635b50e85581..754bd1eaf4f6 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -307,8 +307,10 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size, map.map_id = trace->id; spin_lock_irq(&trace_lock); - if (!is_enabled()) + if (!is_enabled()) { + kfree(trace); goto not_enabled; + } mmio_trace_mapping(&map); list_add_tail(&trace->list, &trace_list); -- cgit v1.2.2 From 9e57fb35d711331a9b1410c5c56ebeb3733428a0 Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:00:34 +0300 Subject: x86 mmiotrace: implement mmiotrace_printk() Offer mmiotrace users a function to inject markers from inside the kernel. This depends on the trace_vprintk() patch. Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 19 ++++++++++++++++++- arch/x86/mm/testmmiotrace.c | 4 ++++ 2 files changed, 22 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 754bd1eaf4f6..5e2e2e72ee80 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -75,7 +75,7 @@ static LIST_HEAD(trace_list); /* struct remap_trace */ * and trace_lock. * - Routines depending on is_enabled() must take trace_lock. * - trace_list users must hold trace_lock. - * - is_enabled() guarantees that mmio_trace_record is allowed. + * - is_enabled() guarantees that mmio_trace_{rw,mapping} are allowed. * - pre/post callbacks assume the effect of is_enabled() being true. */ @@ -379,6 +379,23 @@ void mmiotrace_iounmap(volatile void __iomem *addr) iounmap_trace_core(addr); } +int mmiotrace_printk(const char *fmt, ...) +{ + int ret = 0; + va_list args; + unsigned long flags; + va_start(args, fmt); + + spin_lock_irqsave(&trace_lock, flags); + if (is_enabled()) + ret = mmio_trace_printk(fmt, args); + spin_unlock_irqrestore(&trace_lock, flags); + + va_end(args); + return ret; +} +EXPORT_SYMBOL(mmiotrace_printk); + static void clear_trace_list(void) { struct remap_trace *trace; diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c index d877c5b423ef..ab50a8d7402c 100644 --- a/arch/x86/mm/testmmiotrace.c +++ b/arch/x86/mm/testmmiotrace.c @@ -3,6 +3,7 @@ */ #include #include +#include #define MODULE_NAME "testmmiotrace" @@ -13,6 +14,7 @@ MODULE_PARM_DESC(mmio_address, "Start address of the mapping of 16 kB."); static void do_write_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Write test.\n"); for (i = 0; i < 256; i++) iowrite8(i, p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -24,6 +26,7 @@ static void do_write_test(void __iomem *p) static void do_read_test(void __iomem *p) { unsigned int i; + mmiotrace_printk("Read test.\n"); for (i = 0; i < 256; i++) ioread8(p + i); for (i = 1024; i < (5 * 1024); i += 2) @@ -39,6 +42,7 @@ static void do_test(void) pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); return; } + mmiotrace_printk("ioremap returned %p.\n", p); do_write_test(p); do_read_test(p); iounmap(p); -- cgit v1.2.2 From 4427414170a63331a9cc36b9598502c5cdfe453b Mon Sep 17 00:00:00 2001 From: Pekka Paalanen Date: Tue, 16 Sep 2008 22:03:56 +0300 Subject: mmiotrace: remove left-over marker cruft Signed-off-by: Pekka Paalanen Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/mm/mmio-mod.c | 64 -------------------------------------------------- 1 file changed, 64 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index 5e2e2e72ee80..2c4baa88f2cb 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -56,13 +56,6 @@ struct remap_trace { static DEFINE_PER_CPU(struct trap_reason, pf_reason); static DEFINE_PER_CPU(struct mmiotrace_rw, cpu_trace); -#if 0 /* XXX: no way gather this info anymore */ -/* Access to this is not per-cpu. */ -static DEFINE_PER_CPU(atomic_t, dropped); -#endif - -static struct dentry *marker_file; - static DEFINE_MUTEX(mmiotrace_mutex); static DEFINE_SPINLOCK(trace_lock); static atomic_t mmiotrace_enabled; @@ -97,44 +90,6 @@ static bool is_enabled(void) return atomic_read(&mmiotrace_enabled); } -#if 0 /* XXX: needs rewrite */ -/* - * Write callback for the debugfs entry: - * Read a marker and write it to the mmio trace log - */ -static ssize_t write_marker(struct file *file, const char __user *buffer, - size_t count, loff_t *ppos) -{ - char *event = NULL; - struct mm_io_header *headp; - ssize_t len = (count > 65535) ? 65535 : count; - - event = kzalloc(sizeof(*headp) + len, GFP_KERNEL); - if (!event) - return -ENOMEM; - - headp = (struct mm_io_header *)event; - headp->type = MMIO_MAGIC | (MMIO_MARKER << MMIO_OPCODE_SHIFT); - headp->data_len = len; - - if (copy_from_user(event + sizeof(*headp), buffer, len)) { - kfree(event); - return -EFAULT; - } - - spin_lock_irq(&trace_lock); -#if 0 /* XXX: convert this to use tracing */ - if (is_enabled()) - relay_write(chan, event, sizeof(*headp) + len); - else -#endif - len = -EINVAL; - spin_unlock_irq(&trace_lock); - kfree(event); - return len; -} -#endif - static void print_pte(unsigned long address) { unsigned int level; @@ -481,26 +436,12 @@ static void leave_uniprocessor(void) } #endif -#if 0 /* XXX: out of order */ -static struct file_operations fops_marker = { - .owner = THIS_MODULE, - .write = write_marker -}; -#endif - void enable_mmiotrace(void) { mutex_lock(&mmiotrace_mutex); if (is_enabled()) goto out; -#if 0 /* XXX: tracing does not support text entries */ - marker_file = debugfs_create_file("marker", 0660, dir, NULL, - &fops_marker); - if (!marker_file) - pr_err(NAME "marker file creation failed.\n"); -#endif - if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); enter_uniprocessor(); @@ -525,11 +466,6 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); - if (marker_file) { - debugfs_remove(marker_file); - marker_file = NULL; - } - pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); -- cgit v1.2.2 From 37a52f5ef120b93734bb2461744512b55695f69c Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 15:05:46 -0700 Subject: x86: suppress trivial sparse signedness warnings Could just as easily change the three casts to cast to the correct type...this patch changes the type of ftrace_nop instead. Supresses sparse warnings: arch/x86/kernel/ftrace.c:157:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:157:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:157:14: got unsigned long * arch/x86/kernel/ftrace.c:161:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:161:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:161:14: got unsigned long * arch/x86/kernel/ftrace.c:165:14: warning: incorrect type in assignment (different signedness) arch/x86/kernel/ftrace.c:165:14: expected long *static [toplevel] ftrace_nop arch/x86/kernel/ftrace.c:165:14: got unsigned long * Signed-off-by: Harvey Harrison Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 082d99642622..66d900248fc2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -22,7 +22,7 @@ /* Long is fine, even if it is only 4 bytes ;-) */ -static long *ftrace_nop; +static unsigned long *ftrace_nop; union ftrace_code_union { char code[MCOUNT_INSN_SIZE]; -- cgit v1.2.2 From ac2b86fdef5b44f194eefaa6b7b6aea9423d1bc2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fr=C3=A9d=C3=A9ric=20Weisbecker?= Date: Wed, 24 Sep 2008 16:31:56 +0100 Subject: x86/ftrace: use uaccess in atomic context With latest -tip I get this bug: [ 49.439988] in_atomic():0, irqs_disabled():1 [ 49.440118] INFO: lockdep is turned off. [ 49.440118] Pid: 2814, comm: modprobe Tainted: G W 2.6.27-rc7 #4 [ 49.440118] [] __might_sleep+0xe1/0x120 [ 49.440118] [] ftrace_modify_code+0x2a/0xd0 [ 49.440118] [] ? ftrace_test_p6nop+0x0/0xa [ 49.440118] [] __ftrace_update_code+0xfe/0x2f0 [ 49.440118] [] ? ftrace_test_p6nop+0x0/0xa [ 49.440118] [] ftrace_convert_nops+0x50/0x80 [ 49.440118] [] ftrace_init_module+0x16/0x20 [ 49.440118] [] load_module+0x185b/0x1d30 [ 49.440118] [] ? find_get_page+0x0/0xf0 [ 49.440118] [] ? sprintf+0x0/0x30 [ 49.440118] [] ? mutex_lock_interruptible_nested+0x1f2/0x350 [ 49.440118] [] sys_init_module+0x53/0x1b0 [ 49.440118] [] ? do_page_fault+0x0/0x740 [ 49.440118] [] syscall_call+0x7/0xb [ 49.440118] ======================= It is because ftrace_modify_code() calls copy_to_user and copy_from_user. These functions have been inserted after guessing that there couldn't be any race condition but copy_[to/from]_user might sleep and __ftrace_update_code is called with local_irq_saved. These function have been inserted since this commit: d5e92e8978fd2574e415dc2792c5eb592978243d: "ftrace: x86 use copy from user function" Signed-off-by: Frederic Weisbecker Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 66d900248fc2..222507e8157b 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -71,13 +71,13 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code, * No real locking needed, this code is run through * kstop_machine, or before SMP starts. */ - if (__copy_from_user(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) + if (__copy_from_user_inatomic(replaced, (char __user *)ip, MCOUNT_INSN_SIZE)) return 1; if (memcmp(replaced, old_code, MCOUNT_INSN_SIZE) != 0) return 2; - WARN_ON_ONCE(__copy_to_user((char __user *)ip, new_code, + WARN_ON_ONCE(__copy_to_user_inatomic((char __user *)ip, new_code, MCOUNT_INSN_SIZE)); sync_core(); -- cgit v1.2.2 From 8b27386a9ce9c7f0f8702cff7565a46802ad57d1 Mon Sep 17 00:00:00 2001 From: Anders Kaseorg Date: Thu, 9 Oct 2008 22:19:08 -0400 Subject: ftrace: make ftrace_test_p6nop disassembler-friendly Commit 4c3dc21b136f8cb4b72afee16c3ba7e961656c0b in tip introduced the 5-byte NOP ftrace_test_p6nop: jmp . + 5 .byte 0x00, 0x00, 0x00 This is not friendly to disassemblers because an odd number of 0x00s ends in the middle of an instruction boundary. This changes the 0x00s to 1-byte NOPs (0x90). Signed-off-by: Anders Kaseorg Acked-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/ftrace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 222507e8157b..d073d981a730 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -132,7 +132,9 @@ int __init ftrace_dyn_arch_init(void *data) ".section .text, \"ax\"\n" "ftrace_test_jmp:" "jmp ftrace_test_p6nop\n" - ".byte 0x00,0x00,0x00\n" /* 2 byte jmp + 3 bytes */ + "nop\n" + "nop\n" + "nop\n" /* 2 byte jmp + 3 bytes */ "ftrace_test_p6nop:" P6_NOP5 "jmp 1f\n" @@ -161,7 +163,7 @@ int __init ftrace_dyn_arch_init(void *data) ftrace_nop = (unsigned long *)ftrace_test_nop5; break; case 2: - pr_info("ftrace: converting mcount calls to jmp 1f\n"); + pr_info("ftrace: converting mcount calls to jmp . + 5\n"); ftrace_nop = (unsigned long *)ftrace_test_jmp; break; } -- cgit v1.2.2 From 758a7f7bb86b520aadc484f23da85e547b3bf3d8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Tue, 14 Oct 2008 17:01:03 -0600 Subject: x86: register a platform RTC device if PNP doesn't describe it Most if not all x86 platforms have an RTC device, but sometimes the RTC is not exposed as a PNP0b00/PNP0b01/PNP0b02 device in PNPBIOS or ACPI: http://bugzilla.kernel.org/show_bug.cgi?id=11580 https://bugzilla.redhat.com/show_bug.cgi?id=451188 It's best if we can discover the RTC via PNP because then we know which flavor of device it is, where it lives, and which IRQ it uses. But if we can't, we should register a platform device using the compiled-in RTC_PORT/RTC_IRQ resource assumptions. Signed-off-by: Bjorn Helgaas Acked-by: Rafael J. Wysocki Acked-by: David Brownell Reported-by: Rik Theys Reported-by: shr_msn@yahoo.com.tw Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 05191bbc68b8..0a23b5795b25 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -223,11 +223,25 @@ static struct platform_device rtc_device = { static __init int add_rtc_cmos(void) { #ifdef CONFIG_PNP - if (!pnp_platform_devices) - platform_device_register(&rtc_device); -#else + static const char *ids[] __initconst = + { "PNP0b00", "PNP0b01", "PNP0b02", }; + struct pnp_dev *dev; + struct pnp_id *id; + int i; + + pnp_for_each_dev(dev) { + for (id = dev->id; id; id = id->next) { + for (i = 0; i < ARRAY_SIZE(ids); i++) { + if (compare_pnp_id(id, ids[i]) != 0) + return 0; + } + } + } +#endif + platform_device_register(&rtc_device); -#endif /* CONFIG_PNP */ + dev_info(&rtc_device.dev, + "registered platform RTC device (no PNP device found)\n"); return 0; } device_initcall(add_rtc_cmos); -- cgit v1.2.2 From ca60dfbb69afb549e33527cbf676e4daf8febfb5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Tue, 24 Jun 2008 17:02:38 +0800 Subject: KVM: VMX: Rename misnamed msr bits MSR_IA32_FEATURE_LOCKED is just a bit in fact, which shouldn't be prefixed with MSR_. So is MSR_IA32_FEATURE_VMXON_ENABLED. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7041cc52b562..81dac721ec31 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1031,9 +1031,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - == MSR_IA32_FEATURE_CONTROL_LOCKED; + return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + == IA32_FEATURE_CONTROL_LOCKED_BIT; /* locked but not enabled */ } @@ -1045,14 +1045,14 @@ static void hardware_enable(void *garbage) INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) - != (MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED)) + if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + != (IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - MSR_IA32_FEATURE_CONTROL_LOCKED | - MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED); + IA32_FEATURE_CONTROL_LOCKED_BIT | + IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 17e25995b65b..86059f439cb4 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,8 +331,8 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define MSR_IA32_FEATURE_CONTROL_LOCKED 0x1 -#define MSR_IA32_FEATURE_CONTROL_VMXON_ENABLED 0x4 +#define IA32_FEATURE_CONTROL_LOCKED_BIT 0x1 +#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT 0x4 #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 -- cgit v1.2.2 From 5fdbf9765b7ba6a45100851154768de703d51e76 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 27 Jun 2008 14:58:02 -0300 Subject: KVM: x86: accessors for guest registers As suggested by Avi, introduce accessors to read/write guest registers. This simplifies the ->cache_regs/->decache_regs interface, and improves register caching which is important for VMX, where the cost of vmcs_read/vmcs_write is significant. [avi: fix warnings] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/kvm_cache_regs.h | 32 +++++ arch/x86/kvm/lapic.c | 4 +- arch/x86/kvm/svm.c | 56 ++++----- arch/x86/kvm/vmx.c | 103 ++++++++-------- arch/x86/kvm/x86.c | 268 ++++++++++++++++++++++-------------------- arch/x86/kvm/x86_emulate.c | 19 +-- 6 files changed, 254 insertions(+), 228 deletions(-) create mode 100644 arch/x86/kvm/kvm_cache_regs.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h new file mode 100644 index 000000000000..1ff819dce7d3 --- /dev/null +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -0,0 +1,32 @@ +#ifndef ASM_KVM_CACHE_REGS_H +#define ASM_KVM_CACHE_REGS_H + +static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, + enum kvm_reg reg) +{ + if (!test_bit(reg, (unsigned long *)&vcpu->arch.regs_avail)) + kvm_x86_ops->cache_reg(vcpu, reg); + + return vcpu->arch.regs[reg]; +} + +static inline void kvm_register_write(struct kvm_vcpu *vcpu, + enum kvm_reg reg, + unsigned long val) +{ + vcpu->arch.regs[reg] = val; + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_dirty); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); +} + +static inline unsigned long kvm_rip_read(struct kvm_vcpu *vcpu) +{ + return kvm_register_read(vcpu, VCPU_REGS_RIP); +} + +static inline void kvm_rip_write(struct kvm_vcpu *vcpu, unsigned long val) +{ + kvm_register_write(vcpu, VCPU_REGS_RIP, val); +} + +#endif diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 73f43de69f67..9fde0ac24268 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -32,6 +32,7 @@ #include #include #include +#include "kvm_cache_regs.h" #include "irq.h" #define PRId64 "d" @@ -558,8 +559,7 @@ static void __report_tpr_access(struct kvm_lapic *apic, bool write) struct kvm_run *run = vcpu->run; set_bit(KVM_REQ_REPORT_TPR_ACCESS, &vcpu->requests); - kvm_x86_ops->cache_regs(vcpu); - run->tpr_access.rip = vcpu->arch.rip; + run->tpr_access.rip = kvm_rip_read(vcpu); run->tpr_access.is_write = write; } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8233b86c778c..54b0bf33e21e 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -18,6 +18,7 @@ #include "kvm_svm.h" #include "irq.h" #include "mmu.h" +#include "kvm_cache_regs.h" #include #include @@ -236,13 +237,11 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) printk(KERN_DEBUG "%s: NOP\n", __func__); return; } - if (svm->next_rip - svm->vmcb->save.rip > MAX_INST_SIZE) - printk(KERN_ERR "%s: ip 0x%llx next 0x%llx\n", - __func__, - svm->vmcb->save.rip, - svm->next_rip); + if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE) + printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n", + __func__, kvm_rip_read(vcpu), svm->next_rip); - vcpu->arch.rip = svm->vmcb->save.rip = svm->next_rip; + kvm_rip_write(vcpu, svm->next_rip); svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK; vcpu->arch.interrupt_window_open = 1; @@ -581,6 +580,7 @@ static void init_vmcb(struct vcpu_svm *svm) save->dr7 = 0x400; save->rflags = 2; save->rip = 0x0000fff0; + svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; /* * cr0 val on cpu init should be 0x60000010, we enable cpu @@ -615,10 +615,12 @@ static int svm_vcpu_reset(struct kvm_vcpu *vcpu) init_vmcb(svm); if (vcpu->vcpu_id != 0) { - svm->vmcb->save.rip = 0; + kvm_rip_write(vcpu, 0); svm->vmcb->save.cs.base = svm->vcpu.arch.sipi_vector << 12; svm->vmcb->save.cs.selector = svm->vcpu.arch.sipi_vector << 8; } + vcpu->arch.regs_avail = ~0; + vcpu->arch.regs_dirty = ~0; return 0; } @@ -721,23 +723,6 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) rdtscll(vcpu->arch.host_tsc); } -static void svm_cache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - - vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; - vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; - vcpu->arch.rip = svm->vmcb->save.rip; -} - -static void svm_decache_regs(struct kvm_vcpu *vcpu) -{ - struct vcpu_svm *svm = to_svm(vcpu); - svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; - svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - svm->vmcb->save.rip = vcpu->arch.rip; -} - static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1139,14 +1124,14 @@ static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; skip_emulated_instruction(&svm->vcpu); return kvm_emulate_halt(&svm->vcpu); } static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 3; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; skip_emulated_instruction(&svm->vcpu); kvm_emulate_hypercall(&svm->vcpu); return 1; @@ -1178,7 +1163,7 @@ static int task_switch_interception(struct vcpu_svm *svm, static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; kvm_emulate_cpuid(&svm->vcpu); return 1; } @@ -1273,9 +1258,9 @@ static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) KVMTRACE_3D(MSR_READ, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->vmcb->save.rax = data & 0xffffffff; + svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; svm->vcpu.arch.regs[VCPU_REGS_RDX] = data >> 32; - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; skip_emulated_instruction(&svm->vcpu); } return 1; @@ -1359,13 +1344,13 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data) static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; - u64 data = (svm->vmcb->save.rax & -1u) + u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); KVMTRACE_3D(MSR_WRITE, &svm->vcpu, ecx, (u32)data, (u32)(data >> 32), handler); - svm->next_rip = svm->vmcb->save.rip + 2; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; if (svm_set_msr(&svm->vcpu, ecx, data)) kvm_inject_gp(&svm->vcpu, 0); else @@ -1723,6 +1708,10 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) u16 gs_selector; u16 ldt_selector; + svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; + svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; + svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; + pre_svm_run(svm); sync_lapic_to_cr8(vcpu); @@ -1858,6 +1847,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) load_db_regs(svm->host_db_regs); vcpu->arch.cr2 = svm->vmcb->save.cr2; + vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; + vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; + vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; write_dr6(svm->host_dr6); write_dr7(svm->host_dr7); @@ -1977,8 +1969,6 @@ static struct kvm_x86_ops svm_x86_ops = { .set_gdt = svm_set_gdt, .get_dr = svm_get_dr, .set_dr = svm_set_dr, - .cache_regs = svm_cache_regs, - .decache_regs = svm_decache_regs, .get_rflags = svm_get_rflags, .set_rflags = svm_set_rflags, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81dac721ec31..5a3a0326c277 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -26,6 +26,7 @@ #include #include #include +#include "kvm_cache_regs.h" #include #include @@ -715,9 +716,9 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) unsigned long rip; u32 interruptibility; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN); - vmcs_writel(GUEST_RIP, rip); + kvm_rip_write(vcpu, rip); /* * We emulated an instruction, so temporary interrupt blocking @@ -947,24 +948,19 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) return ret; } -/* - * Sync the rsp and rip registers into the vcpu structure. This allows - * registers to be accessed by indexing vcpu->arch.regs. - */ -static void vcpu_load_rsp_rip(struct kvm_vcpu *vcpu) -{ - vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); - vcpu->arch.rip = vmcs_readl(GUEST_RIP); -} - -/* - * Syncs rsp and rip back into the vmcs. Should be called after possible - * modification. - */ -static void vcpu_put_rsp_rip(struct kvm_vcpu *vcpu) +static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) { - vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); - vmcs_writel(GUEST_RIP, vcpu->arch.rip); + __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail); + switch (reg) { + case VCPU_REGS_RSP: + vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP); + break; + case VCPU_REGS_RIP: + vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(GUEST_RIP); + break; + default: + break; + } } static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) @@ -2019,6 +2015,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) u64 msr; int ret; + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); down_read(&vcpu->kvm->slots_lock); if (!init_rmode(vmx->vcpu.kvm)) { ret = -ENOMEM; @@ -2072,10 +2069,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_RFLAGS, 0x02); if (vmx->vcpu.vcpu_id == 0) - vmcs_writel(GUEST_RIP, 0xfff0); + kvm_rip_write(vcpu, 0xfff0); else - vmcs_writel(GUEST_RIP, 0); - vmcs_writel(GUEST_RSP, 0); + kvm_rip_write(vcpu, 0); + kvm_register_write(vcpu, VCPU_REGS_RSP, 0); /* todo: dr0 = dr1 = dr2 = dr3 = 0; dr6 = 0xffff0ff0 */ vmcs_writel(GUEST_DR7, 0x400); @@ -2139,11 +2136,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; - vmx->rmode.irq.rip = vmcs_readl(GUEST_RIP); + vmx->rmode.irq.rip = kvm_rip_read(vcpu); vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip - 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); return; } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, @@ -2288,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } error_code = 0; - rip = vmcs_readl(GUEST_RIP); + rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -2386,27 +2383,25 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) reg = (exit_qualification >> 8) & 15; switch ((exit_qualification >> 4) & 3) { case 0: /* mov to cr */ - KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), handler); + KVMTRACE_3D(CR_WRITE, vcpu, (u32)cr, + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), + handler); switch (cr) { case 0: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr0(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr0(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 3: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr3(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr3(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 4: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr4(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr4(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - kvm_set_cr8(vcpu, vcpu->arch.regs[reg]); + kvm_set_cr8(vcpu, kvm_register_read(vcpu, reg)); skip_emulated_instruction(vcpu); if (irqchip_in_kernel(vcpu->kvm)) return 1; @@ -2415,7 +2410,6 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) }; break; case 2: /* clts */ - vcpu_load_rsp_rip(vcpu); vmx_fpu_deactivate(vcpu); vcpu->arch.cr0 &= ~X86_CR0_TS; vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0); @@ -2426,21 +2420,17 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) case 1: /*mov from cr*/ switch (cr) { case 3: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = vcpu->arch.cr3; - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, vcpu->arch.cr3); KVMTRACE_3D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], - (u32)((u64)vcpu->arch.regs[reg] >> 32), + (u32)kvm_register_read(vcpu, reg), + (u32)((u64)kvm_register_read(vcpu, reg) >> 32), handler); skip_emulated_instruction(vcpu); return 1; case 8: - vcpu_load_rsp_rip(vcpu); - vcpu->arch.regs[reg] = kvm_get_cr8(vcpu); - vcpu_put_rsp_rip(vcpu); + kvm_register_write(vcpu, reg, kvm_get_cr8(vcpu)); KVMTRACE_2D(CR_READ, vcpu, (u32)cr, - (u32)vcpu->arch.regs[reg], handler); + (u32)kvm_register_read(vcpu, reg), handler); skip_emulated_instruction(vcpu); return 1; } @@ -2472,7 +2462,6 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) exit_qualification = vmcs_readl(EXIT_QUALIFICATION); dr = exit_qualification & 7; reg = (exit_qualification >> 8) & 15; - vcpu_load_rsp_rip(vcpu); if (exit_qualification & 16) { /* mov from dr */ switch (dr) { @@ -2485,12 +2474,11 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) default: val = 0; } - vcpu->arch.regs[reg] = val; + kvm_register_write(vcpu, reg, val); KVMTRACE_2D(DR_READ, vcpu, (u32)dr, (u32)val, handler); } else { /* mov to dr */ } - vcpu_put_rsp_rip(vcpu); skip_emulated_instruction(vcpu); return 1; } @@ -2735,8 +2723,8 @@ static int kvm_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 vectoring_info = vmx->idt_vectoring_info; - KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)vmcs_readl(GUEST_RIP), - (u32)((u64)vmcs_readl(GUEST_RIP) >> 32), entryexit); + KVMTRACE_3D(VMEXIT, vcpu, exit_reason, (u32)kvm_rip_read(vcpu), + (u32)((u64)kvm_rip_read(vcpu) >> 32), entryexit); /* Access CR3 don't cause VMExit in paging mode, so we need * to sync with guest real CR3. */ @@ -2922,9 +2910,9 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) static void fixup_rmode_irq(struct vcpu_vmx *vmx) { vmx->rmode.irq.pending = 0; - if (vmcs_readl(GUEST_RIP) + 1 != vmx->rmode.irq.rip) + if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) return; - vmcs_writel(GUEST_RIP, vmx->rmode.irq.rip); + kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; @@ -2941,6 +2929,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); + if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) + vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]); + /* * Loading guest fpu may have cleared host cr0.ts */ @@ -3061,6 +3054,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif ); + vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); + vcpu->arch.regs_dirty = 0; + vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); if (vmx->rmode.irq.pending) fixup_rmode_irq(vmx); @@ -3224,8 +3220,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .set_idt = vmx_set_idt, .get_gdt = vmx_get_gdt, .set_gdt = vmx_set_gdt, - .cache_regs = vcpu_load_rsp_rip, - .decache_regs = vcpu_put_rsp_rip, + .cache_reg = vmx_cache_reg, .get_rflags = vmx_get_rflags, .set_rflags = vmx_set_rflags, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb3..2f0696bc7d2f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -19,6 +19,7 @@ #include "mmu.h" #include "i8254.h" #include "tss.h" +#include "kvm_cache_regs.h" #include #include @@ -61,6 +62,7 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); struct kvm_x86_ops *kvm_x86_ops; +EXPORT_SYMBOL_GPL(kvm_x86_ops); struct kvm_stats_debugfs_item debugfs_entries[] = { { "pf_fixed", VCPU_STAT(pf_fixed) }, @@ -2080,7 +2082,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) { u8 opcodes[4]; - unsigned long rip = vcpu->arch.rip; + unsigned long rip = kvm_rip_read(vcpu); unsigned long rip_linear; if (!printk_ratelimit()) @@ -2102,6 +2104,14 @@ static struct x86_emulate_ops emulate_ops = { .cmpxchg_emulated = emulator_cmpxchg_emulated, }; +static void cache_all_regs(struct kvm_vcpu *vcpu) +{ + kvm_register_read(vcpu, VCPU_REGS_RAX); + kvm_register_read(vcpu, VCPU_REGS_RSP); + kvm_register_read(vcpu, VCPU_REGS_RIP); + vcpu->arch.regs_dirty = ~0; +} + int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, unsigned long cr2, @@ -2112,7 +2122,13 @@ int emulate_instruction(struct kvm_vcpu *vcpu, struct decode_cache *c; vcpu->arch.mmio_fault_cr2 = cr2; - kvm_x86_ops->cache_regs(vcpu); + /* + * TODO: fix x86_emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ + cache_all_regs(vcpu); vcpu->mmio_is_write = 0; vcpu->arch.pio.string = 0; @@ -2172,7 +2188,6 @@ int emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DO_MMIO; } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (vcpu->mmio_is_write) { @@ -2225,20 +2240,19 @@ int complete_pio(struct kvm_vcpu *vcpu) struct kvm_pio_request *io = &vcpu->arch.pio; long delta; int r; - - kvm_x86_ops->cache_regs(vcpu); + unsigned long val; if (!io->string) { - if (io->in) - memcpy(&vcpu->arch.regs[VCPU_REGS_RAX], vcpu->arch.pio_data, - io->size); + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(&val, vcpu->arch.pio_data, io->size); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + } } else { if (io->in) { r = pio_copy_data(vcpu); - if (r) { - kvm_x86_ops->cache_regs(vcpu); + if (r) return r; - } } delta = 1; @@ -2248,19 +2262,24 @@ int complete_pio(struct kvm_vcpu *vcpu) * The size of the register should really depend on * current address size. */ - vcpu->arch.regs[VCPU_REGS_RCX] -= delta; + val = kvm_register_read(vcpu, VCPU_REGS_RCX); + val -= delta; + kvm_register_write(vcpu, VCPU_REGS_RCX, val); } if (io->down) delta = -delta; delta *= io->size; - if (io->in) - vcpu->arch.regs[VCPU_REGS_RDI] += delta; - else - vcpu->arch.regs[VCPU_REGS_RSI] += delta; + if (io->in) { + val = kvm_register_read(vcpu, VCPU_REGS_RDI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RDI, val); + } else { + val = kvm_register_read(vcpu, VCPU_REGS_RSI); + val += delta; + kvm_register_write(vcpu, VCPU_REGS_RSI, val); + } } - kvm_x86_ops->decache_regs(vcpu); - io->count -= io->cur_count; io->cur_count = 0; @@ -2313,6 +2332,7 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, int size, unsigned port) { struct kvm_io_device *pio_dev; + unsigned long val; vcpu->run->exit_reason = KVM_EXIT_IO; vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; @@ -2333,8 +2353,8 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, KVMTRACE_2D(IO_WRITE, vcpu, vcpu->run->io.port, (u32)size, handler); - kvm_x86_ops->cache_regs(vcpu); - memcpy(vcpu->arch.pio_data, &vcpu->arch.regs[VCPU_REGS_RAX], 4); + val = kvm_register_read(vcpu, VCPU_REGS_RAX); + memcpy(vcpu->arch.pio_data, &val, 4); kvm_x86_ops->skip_emulated_instruction(vcpu); @@ -2519,13 +2539,11 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) unsigned long nr, a0, a1, a2, a3, ret; int r = 1; - kvm_x86_ops->cache_regs(vcpu); - - nr = vcpu->arch.regs[VCPU_REGS_RAX]; - a0 = vcpu->arch.regs[VCPU_REGS_RBX]; - a1 = vcpu->arch.regs[VCPU_REGS_RCX]; - a2 = vcpu->arch.regs[VCPU_REGS_RDX]; - a3 = vcpu->arch.regs[VCPU_REGS_RSI]; + nr = kvm_register_read(vcpu, VCPU_REGS_RAX); + a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); + a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); + a2 = kvm_register_read(vcpu, VCPU_REGS_RDX); + a3 = kvm_register_read(vcpu, VCPU_REGS_RSI); KVMTRACE_1D(VMMCALL, vcpu, (u32)nr, handler); @@ -2548,8 +2566,7 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) ret = -KVM_ENOSYS; break; } - vcpu->arch.regs[VCPU_REGS_RAX] = ret; - kvm_x86_ops->decache_regs(vcpu); + kvm_register_write(vcpu, VCPU_REGS_RAX, ret); ++vcpu->stat.hypercalls; return r; } @@ -2559,6 +2576,7 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) { char instruction[3]; int ret = 0; + unsigned long rip = kvm_rip_read(vcpu); /* @@ -2568,9 +2586,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) */ kvm_mmu_zap_all(vcpu->kvm); - kvm_x86_ops->cache_regs(vcpu); kvm_x86_ops->patch_hypercall(vcpu, instruction); - if (emulator_write_emulated(vcpu->arch.rip, instruction, 3, vcpu) + if (emulator_write_emulated(rip, instruction, 3, vcpu) != X86EMUL_CONTINUE) ret = -EFAULT; @@ -2700,13 +2717,12 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) u32 function, index; struct kvm_cpuid_entry2 *e, *best; - kvm_x86_ops->cache_regs(vcpu); - function = vcpu->arch.regs[VCPU_REGS_RAX]; - index = vcpu->arch.regs[VCPU_REGS_RCX]; - vcpu->arch.regs[VCPU_REGS_RAX] = 0; - vcpu->arch.regs[VCPU_REGS_RBX] = 0; - vcpu->arch.regs[VCPU_REGS_RCX] = 0; - vcpu->arch.regs[VCPU_REGS_RDX] = 0; + function = kvm_register_read(vcpu, VCPU_REGS_RAX); + index = kvm_register_read(vcpu, VCPU_REGS_RCX); + kvm_register_write(vcpu, VCPU_REGS_RAX, 0); + kvm_register_write(vcpu, VCPU_REGS_RBX, 0); + kvm_register_write(vcpu, VCPU_REGS_RCX, 0); + kvm_register_write(vcpu, VCPU_REGS_RDX, 0); best = NULL; for (i = 0; i < vcpu->arch.cpuid_nent; ++i) { e = &vcpu->arch.cpuid_entries[i]; @@ -2724,18 +2740,17 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) best = e; } if (best) { - vcpu->arch.regs[VCPU_REGS_RAX] = best->eax; - vcpu->arch.regs[VCPU_REGS_RBX] = best->ebx; - vcpu->arch.regs[VCPU_REGS_RCX] = best->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = best->edx; + kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); + kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); + kvm_register_write(vcpu, VCPU_REGS_RCX, best->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, best->edx); } - kvm_x86_ops->decache_regs(vcpu); kvm_x86_ops->skip_emulated_instruction(vcpu); KVMTRACE_5D(CPUID, vcpu, function, - (u32)vcpu->arch.regs[VCPU_REGS_RAX], - (u32)vcpu->arch.regs[VCPU_REGS_RBX], - (u32)vcpu->arch.regs[VCPU_REGS_RCX], - (u32)vcpu->arch.regs[VCPU_REGS_RDX], handler); + (u32)kvm_register_read(vcpu, VCPU_REGS_RAX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RBX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RCX), + (u32)kvm_register_read(vcpu, VCPU_REGS_RDX), handler); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); @@ -2917,8 +2932,8 @@ again: * Profile KVM exit RIPs: */ if (unlikely(prof_on == KVM_PROFILING)) { - kvm_x86_ops->cache_regs(vcpu); - profile_hit(KVM_PROFILING, (void *)vcpu->arch.rip); + unsigned long rip = kvm_rip_read(vcpu); + profile_hit(KVM_PROFILING, (void *)rip); } if (vcpu->arch.exception.pending && kvm_x86_ops->exception_injected(vcpu)) @@ -2999,11 +3014,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } #endif - if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) { - kvm_x86_ops->cache_regs(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = kvm_run->hypercall.ret; - kvm_x86_ops->decache_regs(vcpu); - } + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) + kvm_register_write(vcpu, VCPU_REGS_RAX, + kvm_run->hypercall.ret); r = __vcpu_run(vcpu, kvm_run); @@ -3019,28 +3032,26 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - kvm_x86_ops->cache_regs(vcpu); - - regs->rax = vcpu->arch.regs[VCPU_REGS_RAX]; - regs->rbx = vcpu->arch.regs[VCPU_REGS_RBX]; - regs->rcx = vcpu->arch.regs[VCPU_REGS_RCX]; - regs->rdx = vcpu->arch.regs[VCPU_REGS_RDX]; - regs->rsi = vcpu->arch.regs[VCPU_REGS_RSI]; - regs->rdi = vcpu->arch.regs[VCPU_REGS_RDI]; - regs->rsp = vcpu->arch.regs[VCPU_REGS_RSP]; - regs->rbp = vcpu->arch.regs[VCPU_REGS_RBP]; + regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); + regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); + regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); + regs->rdx = kvm_register_read(vcpu, VCPU_REGS_RDX); + regs->rsi = kvm_register_read(vcpu, VCPU_REGS_RSI); + regs->rdi = kvm_register_read(vcpu, VCPU_REGS_RDI); + regs->rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); + regs->rbp = kvm_register_read(vcpu, VCPU_REGS_RBP); #ifdef CONFIG_X86_64 - regs->r8 = vcpu->arch.regs[VCPU_REGS_R8]; - regs->r9 = vcpu->arch.regs[VCPU_REGS_R9]; - regs->r10 = vcpu->arch.regs[VCPU_REGS_R10]; - regs->r11 = vcpu->arch.regs[VCPU_REGS_R11]; - regs->r12 = vcpu->arch.regs[VCPU_REGS_R12]; - regs->r13 = vcpu->arch.regs[VCPU_REGS_R13]; - regs->r14 = vcpu->arch.regs[VCPU_REGS_R14]; - regs->r15 = vcpu->arch.regs[VCPU_REGS_R15]; + regs->r8 = kvm_register_read(vcpu, VCPU_REGS_R8); + regs->r9 = kvm_register_read(vcpu, VCPU_REGS_R9); + regs->r10 = kvm_register_read(vcpu, VCPU_REGS_R10); + regs->r11 = kvm_register_read(vcpu, VCPU_REGS_R11); + regs->r12 = kvm_register_read(vcpu, VCPU_REGS_R12); + regs->r13 = kvm_register_read(vcpu, VCPU_REGS_R13); + regs->r14 = kvm_register_read(vcpu, VCPU_REGS_R14); + regs->r15 = kvm_register_read(vcpu, VCPU_REGS_R15); #endif - regs->rip = vcpu->arch.rip; + regs->rip = kvm_rip_read(vcpu); regs->rflags = kvm_x86_ops->get_rflags(vcpu); /* @@ -3058,29 +3069,29 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { vcpu_load(vcpu); - vcpu->arch.regs[VCPU_REGS_RAX] = regs->rax; - vcpu->arch.regs[VCPU_REGS_RBX] = regs->rbx; - vcpu->arch.regs[VCPU_REGS_RCX] = regs->rcx; - vcpu->arch.regs[VCPU_REGS_RDX] = regs->rdx; - vcpu->arch.regs[VCPU_REGS_RSI] = regs->rsi; - vcpu->arch.regs[VCPU_REGS_RDI] = regs->rdi; - vcpu->arch.regs[VCPU_REGS_RSP] = regs->rsp; - vcpu->arch.regs[VCPU_REGS_RBP] = regs->rbp; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); + kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); + kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); + kvm_register_write(vcpu, VCPU_REGS_RDX, regs->rdx); + kvm_register_write(vcpu, VCPU_REGS_RSI, regs->rsi); + kvm_register_write(vcpu, VCPU_REGS_RDI, regs->rdi); + kvm_register_write(vcpu, VCPU_REGS_RSP, regs->rsp); + kvm_register_write(vcpu, VCPU_REGS_RBP, regs->rbp); #ifdef CONFIG_X86_64 - vcpu->arch.regs[VCPU_REGS_R8] = regs->r8; - vcpu->arch.regs[VCPU_REGS_R9] = regs->r9; - vcpu->arch.regs[VCPU_REGS_R10] = regs->r10; - vcpu->arch.regs[VCPU_REGS_R11] = regs->r11; - vcpu->arch.regs[VCPU_REGS_R12] = regs->r12; - vcpu->arch.regs[VCPU_REGS_R13] = regs->r13; - vcpu->arch.regs[VCPU_REGS_R14] = regs->r14; - vcpu->arch.regs[VCPU_REGS_R15] = regs->r15; + kvm_register_write(vcpu, VCPU_REGS_R8, regs->r8); + kvm_register_write(vcpu, VCPU_REGS_R9, regs->r9); + kvm_register_write(vcpu, VCPU_REGS_R10, regs->r10); + kvm_register_write(vcpu, VCPU_REGS_R11, regs->r11); + kvm_register_write(vcpu, VCPU_REGS_R12, regs->r12); + kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); + kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); + kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); + #endif - vcpu->arch.rip = regs->rip; + kvm_rip_write(vcpu, regs->rip); kvm_x86_ops->set_rflags(vcpu, regs->rflags); - kvm_x86_ops->decache_regs(vcpu); vcpu->arch.exception.pending = false; @@ -3316,17 +3327,16 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu, struct tss_segment_32 *tss) { tss->cr3 = vcpu->arch.cr3; - tss->eip = vcpu->arch.rip; + tss->eip = kvm_rip_read(vcpu); tss->eflags = kvm_x86_ops->get_rflags(vcpu); - tss->eax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->ecx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->edx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->ebx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->esp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->ebp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->esi = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->edi = vcpu->arch.regs[VCPU_REGS_RDI]; - + tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->ebx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->esp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->ebp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->esi = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->edi = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); @@ -3342,17 +3352,17 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, { kvm_set_cr3(vcpu, tss->cr3); - vcpu->arch.rip = tss->eip; + kvm_rip_write(vcpu, tss->eip); kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->eax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->ecx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->edx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->ebx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->esp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->ebp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->esi; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->edi; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->edx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->ebx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->esp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->ebp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) return 1; @@ -3380,16 +3390,16 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu, static void save_state_to_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - tss->ip = vcpu->arch.rip; + tss->ip = kvm_rip_read(vcpu); tss->flag = kvm_x86_ops->get_rflags(vcpu); - tss->ax = vcpu->arch.regs[VCPU_REGS_RAX]; - tss->cx = vcpu->arch.regs[VCPU_REGS_RCX]; - tss->dx = vcpu->arch.regs[VCPU_REGS_RDX]; - tss->bx = vcpu->arch.regs[VCPU_REGS_RBX]; - tss->sp = vcpu->arch.regs[VCPU_REGS_RSP]; - tss->bp = vcpu->arch.regs[VCPU_REGS_RBP]; - tss->si = vcpu->arch.regs[VCPU_REGS_RSI]; - tss->di = vcpu->arch.regs[VCPU_REGS_RDI]; + tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); + tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); + tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); + tss->bx = kvm_register_read(vcpu, VCPU_REGS_RBX); + tss->sp = kvm_register_read(vcpu, VCPU_REGS_RSP); + tss->bp = kvm_register_read(vcpu, VCPU_REGS_RBP); + tss->si = kvm_register_read(vcpu, VCPU_REGS_RSI); + tss->di = kvm_register_read(vcpu, VCPU_REGS_RDI); tss->es = get_segment_selector(vcpu, VCPU_SREG_ES); tss->cs = get_segment_selector(vcpu, VCPU_SREG_CS); @@ -3402,16 +3412,16 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu, static int load_state_from_tss16(struct kvm_vcpu *vcpu, struct tss_segment_16 *tss) { - vcpu->arch.rip = tss->ip; + kvm_rip_write(vcpu, tss->ip); kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); - vcpu->arch.regs[VCPU_REGS_RAX] = tss->ax; - vcpu->arch.regs[VCPU_REGS_RCX] = tss->cx; - vcpu->arch.regs[VCPU_REGS_RDX] = tss->dx; - vcpu->arch.regs[VCPU_REGS_RBX] = tss->bx; - vcpu->arch.regs[VCPU_REGS_RSP] = tss->sp; - vcpu->arch.regs[VCPU_REGS_RBP] = tss->bp; - vcpu->arch.regs[VCPU_REGS_RSI] = tss->si; - vcpu->arch.regs[VCPU_REGS_RDI] = tss->di; + kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); + kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); + kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); + kvm_register_write(vcpu, VCPU_REGS_RBX, tss->bx); + kvm_register_write(vcpu, VCPU_REGS_RSP, tss->sp); + kvm_register_write(vcpu, VCPU_REGS_RBP, tss->bp); + kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); + kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) return 1; @@ -3534,7 +3544,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) } kvm_x86_ops->skip_emulated_instruction(vcpu); - kvm_x86_ops->cache_regs(vcpu); if (nseg_desc.type & 8) ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_base, @@ -3559,7 +3568,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason) tr_seg.type = 11; kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); out: - kvm_x86_ops->decache_regs(vcpu); return ret; } EXPORT_SYMBOL_GPL(kvm_task_switch); diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index f2f90468f8b1..d5da7f14d536 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -26,6 +26,7 @@ #define DPRINTF(_f, _a ...) printf(_f , ## _a) #else #include +#include "kvm_cache_regs.h" #define DPRINTF(x...) do {} while (0) #endif #include @@ -839,7 +840,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) /* Shadow copy of register state. Committed on successful emulation. */ memset(c, 0, sizeof(struct decode_cache)); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); @@ -1267,7 +1268,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (c->regs[VCPU_REGS_RCX] == 0) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } /* The second termination condition only applies for REPE @@ -1281,17 +1282,17 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) (c->b == 0xae) || (c->b == 0xaf)) { if ((c->rep_prefix == REPE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == 0)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } if ((c->rep_prefix == REPNE_PREFIX) && ((ctxt->eflags & EFLG_ZF) == EFLG_ZF)) { - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); goto done; } } c->regs[VCPU_REGS_RCX]--; - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } if (c->src.type == OP_MEM) { @@ -1768,7 +1769,7 @@ writeback: /* Commit shadow register state. */ memcpy(ctxt->vcpu->arch.regs, c->regs, sizeof c->regs); - ctxt->vcpu->arch.rip = c->eip; + kvm_rip_write(ctxt->vcpu, c->eip); done: if (rc == X86EMUL_UNHANDLEABLE) { @@ -1793,7 +1794,7 @@ twobyte_insn: goto done; /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); /* Disable writeback. */ c->dst.type = OP_NONE; break; @@ -1889,7 +1890,7 @@ twobyte_insn: rc = kvm_set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } rc = X86EMUL_CONTINUE; c->dst.type = OP_NONE; @@ -1899,7 +1900,7 @@ twobyte_insn: rc = kvm_get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data); if (rc) { kvm_inject_gp(ctxt->vcpu, 0); - c->eip = ctxt->vcpu->arch.rip; + c->eip = kvm_rip_read(ctxt->vcpu); } else { c->regs[VCPU_REGS_RAX] = (u32)msr_data; c->regs[VCPU_REGS_RDX] = msr_data >> 32; -- cgit v1.2.2 From 867767a365ee74a3adcfaba27075eefb66b14bfd Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Fri, 27 Jun 2008 15:55:02 +0300 Subject: KVM: Introduce kvm_set_irq to inject interrupts in guests This function injects an interrupt into the guest given the kvm struct, the (guest) irq number and the interrupt level. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.c | 11 +++++++++++ arch/x86/kvm/irq.h | 2 ++ 2 files changed, 13 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 76d736b5f664..0d9e55275af1 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -100,3 +100,14 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); } + +/* This should be called with the kvm->lock mutex held */ +void kvm_set_irq(struct kvm *kvm, int irq, int level) +{ + /* Not possible to detect if the guest uses the PIC or the + * IOAPIC. So set the bit in both. The guest will ignore + * writes to the unused one. + */ + kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); + kvm_pic_set_irq(pic_irqchip(kvm), irq, level); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 7ca47cbb48bb..07ff2aef0c13 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -82,6 +82,8 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); +void kvm_set_irq(struct kvm *kvm, int irq, int level); + void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -- cgit v1.2.2 From 31aa2b44afd5e73365221b1de66f6081e4616f33 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 11 Jul 2008 17:59:46 +0300 Subject: KVM: MMU: Separate the code for unlinking a shadow page from its parents Place into own function, in preparation for further cleanups. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3da2508eb22a..81016a3a6fd3 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -991,11 +991,10 @@ static void kvm_mmu_reset_last_pte_updated(struct kvm *kvm) kvm->vcpus[i]->arch.last_pte_updated = NULL; } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) { u64 *parent_pte; - ++kvm->stat.mmu_shadow_zapped; while (sp->multimapped || sp->parent_pte) { if (!sp->multimapped) parent_pte = sp->parent_pte; @@ -1010,7 +1009,13 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_mmu_put_page(sp, parent_pte); set_shadow_pte(parent_pte, shadow_trap_nonpresent_pte); } +} + +static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); + kvm_mmu_unlink_parents(kvm, sp); if (!sp->root_count) { if (!sp->role.metaphysical && !sp->role.invalid) unaccount_shadowed(kvm, sp->gfn); -- cgit v1.2.2 From 5b5c6a5a60801effb559e787a947885d9850a7da Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 11 Jul 2008 18:07:26 +0300 Subject: KVM: MMU: Simplify kvm_mmu_zap_page() The twisty maze of conditionals can be reduced. [joerg: fix tlb flushing] Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 81016a3a6fd3..c3afbfe6b0c1 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -955,7 +955,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, rmap_remove(kvm, &pt[i]); pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); return; } @@ -974,7 +973,6 @@ static void kvm_mmu_page_unlink_children(struct kvm *kvm, } pt[i] = shadow_trap_nonpresent_pte; } - kvm_flush_remote_tlbs(kvm); } static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte) @@ -1016,18 +1014,16 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); + kvm_flush_remote_tlbs(kvm); + if (!sp->role.invalid && !sp->role.metaphysical) + unaccount_shadowed(kvm, sp->gfn); if (!sp->root_count) { - if (!sp->role.metaphysical && !sp->role.invalid) - unaccount_shadowed(kvm, sp->gfn); hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); } else { - int invalid = sp->role.invalid; - list_move(&sp->link, &kvm->arch.active_mmu_pages); sp->role.invalid = 1; + list_move(&sp->link, &kvm->arch.active_mmu_pages); kvm_reload_remote_mmus(kvm); - if (!sp->role.metaphysical && !invalid) - unaccount_shadowed(kvm, sp->gfn); } kvm_mmu_reset_last_pte_updated(kvm); } @@ -1842,7 +1838,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; hlist_for_each_entry_safe(sp, node, n, bucket, hash_link) { - if (sp->gfn != gfn || sp->role.metaphysical) + if (sp->gfn != gfn || sp->role.metaphysical || sp->role.invalid) continue; pte_size = sp->role.glevels == PT32_ROOT_LEVEL ? 4 : 8; misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1); -- cgit v1.2.2 From cf393f75661f4b17451377b353833eb5502a9688 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 1 Jul 2008 16:20:21 +0300 Subject: KVM: Move NMI IRET fault processing to new vmx_complete_interrupts() Currently most interrupt exit processing is handled on the entry path, which is confusing. Move the NMI IRET fault processing to a new function, vmx_complete_interrupts(), which is called on the vmexit path. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5a3a0326c277..2adfacb00336 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2817,6 +2817,27 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) enable_irq_window(vcpu); } +static void vmx_complete_interrupts(struct vcpu_vmx *vmx) +{ + u32 exit_intr_info; + bool unblock_nmi; + u8 vector; + + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + if (cpu_has_virtual_nmis()) { + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 25.7.1.2 + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + */ + if (unblock_nmi && vector != DF_VECTOR) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + } +} + static void vmx_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2873,23 +2894,12 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } if (cpu_has_virtual_nmis()) { - /* - * SDM 3: 25.7.1.2 - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - */ - if ((exit_intr_info_field & INTR_INFO_UNBLOCK_NMI) && - (exit_intr_info_field & INTR_INFO_VECTOR_MASK) != 8) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) | - GUEST_INTR_STATE_NMI); - else if (vcpu->arch.nmi_pending) { + if (vcpu->arch.nmi_pending) { if (vmx_nmi_enabled(vcpu)) vmx_inject_nmi(vcpu); enable_intr_window(vcpu); return; } - } if (!kvm_cpu_has_interrupt(vcpu)) return; @@ -3076,6 +3086,8 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) KVMTRACE_0D(NMI, vcpu, handler); asm("int $2"); } + + vmx_complete_interrupts(vmx); } static void vmx_free_vmcs(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 668f612fa0d8d4120ec5dc0725d7e1ca3152a954 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 2 Jul 2008 09:28:55 +0300 Subject: KVM: VMX: Move nmi injection failure processing to vm exit path Instead of processing nmi injection failure in the vm entry path, move it to the vm exit path (vm_complete_interrupts()). This separates nmi injection from nmi post-processing, and moves the nmi state from the VT state into vcpu state (new variable nmi_injected specifying an injection in progress). Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2adfacb00336..ce13b53d21c4 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2151,7 +2151,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) { vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vcpu->arch.nmi_pending = 0; } static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) @@ -2820,8 +2819,11 @@ static void enable_intr_window(struct kvm_vcpu *vcpu) static void vmx_complete_interrupts(struct vcpu_vmx *vmx) { u32 exit_intr_info; + u32 idt_vectoring_info; bool unblock_nmi; u8 vector; + int type; + bool idtv_info_valid; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (cpu_has_virtual_nmis()) { @@ -2836,18 +2838,34 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); } + + idt_vectoring_info = vmx->idt_vectoring_info; + idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; + vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; + type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; + if (vmx->vcpu.arch.nmi_injected) { + /* + * SDM 3: 25.7.1.2 + * Clear bit "block by NMI" before VM entry if a NMI delivery + * faulted. + */ + if (idtv_info_valid && type == INTR_TYPE_NMI_INTR) + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->vcpu.arch.nmi_injected = false; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field, exit_intr_info_field; + u32 idtv_info_field, intr_info_field; int vector; update_tpr_threshold(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - exit_intr_info_field = vmcs_read32(VM_EXIT_INTR_INFO); idtv_info_field = vmx->idt_vectoring_info; if (intr_info_field & INTR_INFO_VALID_MASK) { if (idtv_info_field & INTR_INFO_VALID_MASK) { @@ -2871,17 +2889,6 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); - /* - * SDM 3: 25.7.1.2 - * Clear bit "block by NMI" before VM entry if a NMI delivery - * faulted. - */ - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_NMI_INTR && cpu_has_virtual_nmis()) - vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & - ~GUEST_INTR_STATE_NMI); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field & ~INTR_INFO_RESVD_BITS_MASK); vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, @@ -2894,9 +2901,17 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } if (cpu_has_virtual_nmis()) { - if (vcpu->arch.nmi_pending) { - if (vmx_nmi_enabled(vcpu)) - vmx_inject_nmi(vcpu); + if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { + if (vmx_nmi_enabled(vcpu)) { + vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_injected = true; + } else { + enable_intr_window(vcpu); + return; + } + } + if (vcpu->arch.nmi_injected) { + vmx_inject_nmi(vcpu); enable_intr_window(vcpu); return; } -- cgit v1.2.2 From 26eef70c3e8c76e73dff2579c792fc7355f8a291 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 14:59:22 +0300 Subject: KVM: Clear exception queue before emulating an instruction If we're emulating an instruction, either it will succeed, in which case any previously queued exception will be spurious, or we will requeue the same exception. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 ++ arch/x86/kvm/x86.h | 11 +++++++++++ 2 files changed, 13 insertions(+) create mode 100644 arch/x86/kvm/x86.h (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2f0696bc7d2f..5620df2685db 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -20,6 +20,7 @@ #include "i8254.h" #include "tss.h" #include "kvm_cache_regs.h" +#include "x86.h" #include #include @@ -2121,6 +2122,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu, int r; struct decode_cache *c; + kvm_clear_exception_queue(vcpu); vcpu->arch.mmio_fault_cr2 = cr2; /* * TODO: fix x86_emulate.c to use guest_read/write_register diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h new file mode 100644 index 000000000000..c666649c4bb2 --- /dev/null +++ b/arch/x86/kvm/x86.h @@ -0,0 +1,11 @@ +#ifndef ARCH_X86_KVM_X86_H +#define ARCH_X86_KVM_X86_H + +#include + +static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.exception.pending = false; +} + +#endif -- cgit v1.2.2 From 35920a356957eea9fd1f9da043f93469e8d72eab Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 14:50:12 +0300 Subject: KVM: VMX: Fix pending exception processing The vmx code assumes that IDT-Vectoring can only be set when an exception is injected due to the exception in question. That's not true, however: if the exception is injected correctly, and later another exception occurs but its delivery is blocked due to a fault, then we will incorrectly assume the first exception was not delivered. Fix by unconditionally dequeuing the pending exception, and requeuing it (or the second exception) if we see it in the IDT-Vectoring field. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ce13b53d21c4..ed4fe8e72ad0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -27,6 +27,7 @@ #include #include #include "kvm_cache_regs.h" +#include "x86.h" #include #include @@ -744,9 +745,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, static bool vmx_exception_injected(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - - return !(vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + return false; } /* @@ -2824,6 +2823,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) u8 vector; int type; bool idtv_info_valid; + u32 error; exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); if (cpu_has_virtual_nmis()) { @@ -2855,6 +2855,15 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) else vmx->vcpu.arch.nmi_injected = false; } + kvm_clear_exception_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXCEPTION) { + if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { + error = vmcs_read32(IDT_VECTORING_ERROR_CODE); + kvm_queue_exception_e(&vmx->vcpu, vector, error); + } else + kvm_queue_exception(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From 937a7eaef9f08342958d17055a350982b7bd92cb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 15:17:01 +0300 Subject: KVM: Add a pending interrupt queue Similar to the exception queue, this hold interrupts that have been accepted by the virtual processor core but not yet injected. Not yet used. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c666649c4bb2..6a4be78a7384 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,4 +8,15 @@ static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) vcpu->arch.exception.pending = false; } +static inline void kvm_queue_interrupt(struct kvm_vcpu *vcpu, u8 vector) +{ + vcpu->arch.interrupt.pending = true; + vcpu->arch.interrupt.nr = vector; +} + +static inline void kvm_clear_interrupt_queue(struct kvm_vcpu *vcpu) +{ + vcpu->arch.interrupt.pending = false; +} + #endif -- cgit v1.2.2 From f7d9238f5dc9306fd3bf1653c2939eef72f94d06 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 3 Jul 2008 16:14:28 +0300 Subject: KVM: VMX: Move interrupt post-processing to vmx_complete_interrupts() Instead of looking at failed injections in the vm entry path, move processing to the exit path in vmx_complete_interrupts(). This simplifes the logic and removes any state that is hidden in vmx registers. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 77 ++++++++++++++---------------------------------------- 1 file changed, 20 insertions(+), 57 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ed4fe8e72ad0..e8fed9b8756e 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1002,17 +1002,9 @@ static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_debug_guest *dbg) static int vmx_get_irq(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field; - - idtv_info_field = vmx->idt_vectoring_info; - if (idtv_info_field & INTR_INFO_VALID_MASK) { - if (is_external_interrupt(idtv_info_field)) - return idtv_info_field & VECTORING_INFO_VECTOR_MASK; - else - printk(KERN_DEBUG "pending exception: not handled yet\n"); - } - return -1; + if (!vcpu->arch.interrupt.pending) + return -1; + return vcpu->arch.interrupt.nr; } static __init int cpu_has_kvm_support(void) @@ -2293,7 +2285,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) cr2 = vmcs_readl(EXIT_QUALIFICATION); KVMTRACE_3D(PAGE_FAULT, vcpu, error_code, (u32)cr2, (u32)((u64)cr2 >> 32), handler); - if (vect_info & VECTORING_INFO_VALID_MASK) + if (vcpu->arch.interrupt.pending || vcpu->arch.exception.pending) kvm_mmu_unprotect_page_virt(vcpu, cr2); return kvm_mmu_page_fault(vcpu, cr2, error_code); } @@ -2864,51 +2856,20 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) kvm_queue_exception(&vmx->vcpu, vector); vmx->idt_vectoring_info = 0; } + kvm_clear_interrupt_queue(&vmx->vcpu); + if (idtv_info_valid && type == INTR_TYPE_EXT_INTR) { + kvm_queue_interrupt(&vmx->vcpu, vector); + vmx->idt_vectoring_info = 0; + } } static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - u32 idtv_info_field, intr_info_field; - int vector; + u32 intr_info_field; update_tpr_threshold(vcpu); intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); - idtv_info_field = vmx->idt_vectoring_info; - if (intr_info_field & INTR_INFO_VALID_MASK) { - if (idtv_info_field & INTR_INFO_VALID_MASK) { - /* TODO: fault when IDT_Vectoring */ - if (printk_ratelimit()) - printk(KERN_ERR "Fault when IDT_Vectoring\n"); - } - enable_intr_window(vcpu); - return; - } - if (unlikely(idtv_info_field & INTR_INFO_VALID_MASK)) { - if ((idtv_info_field & VECTORING_INFO_TYPE_MASK) - == INTR_TYPE_EXT_INTR - && vcpu->arch.rmode.active) { - u8 vect = idtv_info_field & VECTORING_INFO_VECTOR_MASK; - - vmx_inject_irq(vcpu, vect); - enable_intr_window(vcpu); - return; - } - - KVMTRACE_1D(REDELIVER_EVT, vcpu, idtv_info_field, handler); - - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, idtv_info_field - & ~INTR_INFO_RESVD_BITS_MASK); - vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, - vmcs_read32(VM_EXIT_INSTRUCTION_LEN)); - - if (unlikely(idtv_info_field & INTR_INFO_DELIVER_CODE_MASK)) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, - vmcs_read32(IDT_VECTORING_ERROR_CODE)); - enable_intr_window(vcpu); - return; - } if (cpu_has_virtual_nmis()) { if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vmx_nmi_enabled(vcpu)) { @@ -2925,14 +2886,16 @@ static void vmx_intr_assist(struct kvm_vcpu *vcpu) return; } } - if (!kvm_cpu_has_interrupt(vcpu)) - return; - if (vmx_irq_enabled(vcpu)) { - vector = kvm_cpu_get_interrupt(vcpu); - vmx_inject_irq(vcpu, vector); - kvm_timer_intr_post(vcpu, vector); - } else - enable_irq_window(vcpu); + if (!vcpu->arch.interrupt.pending && kvm_cpu_has_interrupt(vcpu)) { + if (vmx_irq_enabled(vcpu)) + kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu)); + else + enable_irq_window(vcpu); + } + if (vcpu->arch.interrupt.pending) { + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + kvm_timer_intr_post(vcpu, vcpu->arch.interrupt.nr); + } } /* -- cgit v1.2.2 From 60bd83a125030878665684f353c7d36fd54f09fd Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sat, 12 Jul 2008 16:02:08 +0300 Subject: KVM: VMX: Remove redundant check in handle_rmode_exception Since checking for vcpu->arch.rmode.active is already done whenever we call handle_rmode_exception(), checking it inside the function is redundant. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e8fed9b8756e..9591ca73191c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2224,9 +2224,6 @@ static void kvm_guest_debug_pre(struct kvm_vcpu *vcpu) static int handle_rmode_exception(struct kvm_vcpu *vcpu, int vec, u32 err_code) { - if (!vcpu->arch.rmode.active) - return 0; - /* * Instruction with address size override prefix opcode 0x67 * Cause the #SS fault with 0 error code in VM86 mode. -- cgit v1.2.2 From 7edd0ce05892831c77fa4cebe24a6056d33336d5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 7 Jul 2008 14:45:39 +0300 Subject: KVM: Consolidate PIC isr clearing into a function Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index c31164e8aa46..55e179ad98ef 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -30,6 +30,11 @@ #include +static void pic_clear_isr(struct kvm_kpic_state *s, int irq) +{ + s->isr &= ~(1 << irq); +} + /* * set irq level. If an edge is detected, then the IRR is set to 1 */ @@ -141,11 +146,12 @@ void kvm_pic_set_irq(void *opaque, int irq, int level) */ static inline void pic_intack(struct kvm_kpic_state *s, int irq) { + s->isr |= 1 << irq; if (s->auto_eoi) { if (s->rotate_on_auto_eoi) s->priority_add = (irq + 1) & 7; - } else - s->isr |= (1 << irq); + pic_clear_isr(s, irq); + } /* * We don't clear a level sensitive interrupt here */ @@ -243,7 +249,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) priority = get_priority(s, s->isr); if (priority != 8) { irq = (priority + s->priority_add) & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); if (cmd == 5) s->priority_add = (irq + 1) & 7; pic_update_irq(s->pics_state); @@ -251,7 +257,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 3: irq = val & 7; - s->isr &= ~(1 << irq); + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; case 6: @@ -260,8 +266,8 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) break; case 7: irq = val & 7; - s->isr &= ~(1 << irq); s->priority_add = (irq + 1) & 7; + pic_clear_isr(s, irq); pic_update_irq(s->pics_state); break; default: @@ -303,7 +309,7 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) s->pics_state->pics[0].irr &= ~(1 << 2); } s->irr &= ~(1 << ret); - s->isr &= ~(1 << ret); + pic_clear_isr(s, ret); if (addr1 >> 7 || ret != 2) pic_update_irq(s->pics_state); } else { -- cgit v1.2.2 From 19bd8afdc4e6fbb47e4841f8d771f8cb29916d9f Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 13 Jul 2008 13:40:55 +0200 Subject: KVM: Consolidate XX_VECTOR defines Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 4 ---- arch/x86/kvm/vmx.c | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 54b0bf33e21e..743c98d135c4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -36,10 +36,6 @@ MODULE_LICENSE("GPL"); #define IOPM_ALLOC_ORDER 2 #define MSRPM_ALLOC_ORDER 1 -#define DB_VECTOR 1 -#define UD_VECTOR 6 -#define GP_VECTOR 13 - #define DR7_GD_MASK (1 << 13) #define DR6_BD_MASK (1 << 13) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9591ca73191c..5c9dac567a74 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -470,7 +470,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) if (!vcpu->fpu_active) eb |= 1u << NM_VECTOR; if (vcpu->guest_debug.enabled) - eb |= 1u << 1; + eb |= 1u << DB_VECTOR; if (vcpu->arch.rmode.active) eb = ~0; if (vm_need_ept()) -- cgit v1.2.2 From 77ab6db0a1c403387b403e9351ab3f5ae1df83e6 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Mon, 14 Jul 2008 12:28:51 +0200 Subject: KVM: VMX: Reinject real mode exception As we execute real mode guests in VM86 mode, exception have to be reinjected appropriately when the guest triggered them. For this purpose the patch adopts the real-mode injection pattern used in vmx_inject_irq to vmx_queue_exception, additionally taking care that the IP is set correctly for #BP exceptions. Furthermore it extends handle_rmode_exception to reinject all those exceptions that can be raised in real mode. This fixes the execution of himem.exe from FreeDOS and also makes its debug.com work properly. Note that guest debugging in real mode is broken now. This has to be fixed by the scheduled debugging infrastructure rework (will be done once base patches for QEMU have been accepted). Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 41 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5c9dac567a74..2879880076d0 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -735,12 +735,30 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, bool has_error_code, u32 error_code) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + + if (has_error_code) + vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); + + if (vcpu->arch.rmode.active) { + vmx->rmode.irq.pending = true; + vmx->rmode.irq.vector = nr; + vmx->rmode.irq.rip = kvm_rip_read(vcpu); + if (nr == BP_VECTOR) + vmx->rmode.irq.rip++; + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, + nr | INTR_TYPE_SOFT_INTR + | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) + | INTR_INFO_VALID_MASK); + vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1); + kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1); + return; + } + vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, nr | INTR_TYPE_EXCEPTION | (has_error_code ? INTR_INFO_DELIVER_CODE_MASK : 0) | INTR_INFO_VALID_MASK); - if (has_error_code) - vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code); } static bool vmx_exception_injected(struct kvm_vcpu *vcpu) @@ -2231,6 +2249,25 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu, if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) return 1; + /* + * Forward all other exceptions that are valid in real mode. + * FIXME: Breaks guest debugging in real mode, needs to be fixed with + * the required debugging infrastructure rework. + */ + switch (vec) { + case DE_VECTOR: + case DB_VECTOR: + case BP_VECTOR: + case OF_VECTOR: + case BR_VECTOR: + case UD_VECTOR: + case DF_VECTOR: + case SS_VECTOR: + case GP_VECTOR: + case MF_VECTOR: + kvm_queue_exception(vcpu, vec); + return 1; + } return 0; } -- cgit v1.2.2 From c801949ddf0a51074937f488a52072825ed50174 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 Jul 2008 14:44:59 +0300 Subject: KVM: VMX: Unify register save/restore across 32 and 64 bit hosts Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 90 ++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 54 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2879880076d0..fc8996b1043b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2955,6 +2955,14 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx) | vmx->rmode.irq.vector; } +#ifdef CONFIG_X86_64 +#define R "r" +#define Q "q" +#else +#define R "e" +#define Q "l" +#endif + static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -2972,26 +2980,21 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) asm( /* Store host registers */ -#ifdef CONFIG_X86_64 - "push %%rdx; push %%rbp;" - "push %%rcx \n\t" -#else - "push %%edx; push %%ebp;" - "push %%ecx \n\t" -#endif + "push %%"R"dx; push %%"R"bp;" + "push %%"R"cx \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ + "mov %c[cr2](%0), %%"R"ax \n\t" + "mov %%"R"ax, %%cr2 \n\t" + "mov %c[rax](%0), %%"R"ax \n\t" + "mov %c[rbx](%0), %%"R"bx \n\t" + "mov %c[rdx](%0), %%"R"dx \n\t" + "mov %c[rsi](%0), %%"R"si \n\t" + "mov %c[rdi](%0), %%"R"di \n\t" + "mov %c[rbp](%0), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "mov %c[cr2](%0), %%rax \n\t" - "mov %%rax, %%cr2 \n\t" - "mov %c[rax](%0), %%rax \n\t" - "mov %c[rbx](%0), %%rbx \n\t" - "mov %c[rdx](%0), %%rdx \n\t" - "mov %c[rsi](%0), %%rsi \n\t" - "mov %c[rdi](%0), %%rdi \n\t" - "mov %c[rbp](%0), %%rbp \n\t" "mov %c[r8](%0), %%r8 \n\t" "mov %c[r9](%0), %%r9 \n\t" "mov %c[r10](%0), %%r10 \n\t" @@ -3000,18 +3003,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%0), %%r13 \n\t" "mov %c[r14](%0), %%r14 \n\t" "mov %c[r15](%0), %%r15 \n\t" - "mov %c[rcx](%0), %%rcx \n\t" /* kills %0 (rcx) */ -#else - "mov %c[cr2](%0), %%eax \n\t" - "mov %%eax, %%cr2 \n\t" - "mov %c[rax](%0), %%eax \n\t" - "mov %c[rbx](%0), %%ebx \n\t" - "mov %c[rdx](%0), %%edx \n\t" - "mov %c[rsi](%0), %%esi \n\t" - "mov %c[rdi](%0), %%edi \n\t" - "mov %c[rbp](%0), %%ebp \n\t" - "mov %c[rcx](%0), %%ecx \n\t" /* kills %0 (ecx) */ #endif + "mov %c[rcx](%0), %%"R"cx \n\t" /* kills %0 (ecx) */ + /* Enter guest mode */ "jne .Llaunched \n\t" __ex(ASM_VMX_VMLAUNCH) "\n\t" @@ -3019,15 +3013,15 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" ".Lkvm_vmx_return: " /* Save guest registers, load host registers, keep flags */ + "xchg %0, (%%"R"sp) \n\t" + "mov %%"R"ax, %c[rax](%0) \n\t" + "mov %%"R"bx, %c[rbx](%0) \n\t" + "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" + "mov %%"R"dx, %c[rdx](%0) \n\t" + "mov %%"R"si, %c[rsi](%0) \n\t" + "mov %%"R"di, %c[rdi](%0) \n\t" + "mov %%"R"bp, %c[rbp](%0) \n\t" #ifdef CONFIG_X86_64 - "xchg %0, (%%rsp) \n\t" - "mov %%rax, %c[rax](%0) \n\t" - "mov %%rbx, %c[rbx](%0) \n\t" - "pushq (%%rsp); popq %c[rcx](%0) \n\t" - "mov %%rdx, %c[rdx](%0) \n\t" - "mov %%rsi, %c[rsi](%0) \n\t" - "mov %%rdi, %c[rdi](%0) \n\t" - "mov %%rbp, %c[rbp](%0) \n\t" "mov %%r8, %c[r8](%0) \n\t" "mov %%r9, %c[r9](%0) \n\t" "mov %%r10, %c[r10](%0) \n\t" @@ -3036,24 +3030,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" - "mov %%cr2, %%rax \n\t" - "mov %%rax, %c[cr2](%0) \n\t" - - "pop %%rbp; pop %%rbp; pop %%rdx \n\t" -#else - "xchg %0, (%%esp) \n\t" - "mov %%eax, %c[rax](%0) \n\t" - "mov %%ebx, %c[rbx](%0) \n\t" - "pushl (%%esp); popl %c[rcx](%0) \n\t" - "mov %%edx, %c[rdx](%0) \n\t" - "mov %%esi, %c[rsi](%0) \n\t" - "mov %%edi, %c[rdi](%0) \n\t" - "mov %%ebp, %c[rbp](%0) \n\t" - "mov %%cr2, %%eax \n\t" - "mov %%eax, %c[cr2](%0) \n\t" - - "pop %%ebp; pop %%ebp; pop %%edx \n\t" #endif + "mov %%cr2, %%"R"ax \n\t" + "mov %%"R"ax, %c[cr2](%0) \n\t" + + "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" "setbe %c[fail](%0) \n\t" : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), @@ -3077,11 +3058,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) #endif [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) : "cc", "memory" + , R"bx", R"di", R"si" #ifdef CONFIG_X86_64 - , "rbx", "rdi", "rsi" , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" -#else - , "ebx", "edi", "rsi" #endif ); @@ -3111,6 +3090,9 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) vmx_complete_interrupts(vmx); } +#undef R +#undef Q + static void vmx_free_vmcs(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); -- cgit v1.2.2 From 80e31d4f61f69d0e480ae092cda0e590d6a30aeb Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 14 Jul 2008 14:44:59 +0300 Subject: KVM: SVM: Unify register save/restore across 32 and 64 bit hosts Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 78 ++++++++++++++++++------------------------------------ 1 file changed, 26 insertions(+), 52 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 743c98d135c4..179c2e00d0f9 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1697,6 +1697,12 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; } +#ifdef CONFIG_X86_64 +#define R "r" +#else +#define R "e" +#endif + static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1735,19 +1741,14 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) local_irq_enable(); asm volatile ( + "push %%"R"bp; \n\t" + "mov %c[rbx](%[svm]), %%"R"bx \n\t" + "mov %c[rcx](%[svm]), %%"R"cx \n\t" + "mov %c[rdx](%[svm]), %%"R"dx \n\t" + "mov %c[rsi](%[svm]), %%"R"si \n\t" + "mov %c[rdi](%[svm]), %%"R"di \n\t" + "mov %c[rbp](%[svm]), %%"R"bp \n\t" #ifdef CONFIG_X86_64 - "push %%rbp; \n\t" -#else - "push %%ebp; \n\t" -#endif - -#ifdef CONFIG_X86_64 - "mov %c[rbx](%[svm]), %%rbx \n\t" - "mov %c[rcx](%[svm]), %%rcx \n\t" - "mov %c[rdx](%[svm]), %%rdx \n\t" - "mov %c[rsi](%[svm]), %%rsi \n\t" - "mov %c[rdi](%[svm]), %%rdi \n\t" - "mov %c[rbp](%[svm]), %%rbp \n\t" "mov %c[r8](%[svm]), %%r8 \n\t" "mov %c[r9](%[svm]), %%r9 \n\t" "mov %c[r10](%[svm]), %%r10 \n\t" @@ -1756,41 +1757,24 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %c[r13](%[svm]), %%r13 \n\t" "mov %c[r14](%[svm]), %%r14 \n\t" "mov %c[r15](%[svm]), %%r15 \n\t" -#else - "mov %c[rbx](%[svm]), %%ebx \n\t" - "mov %c[rcx](%[svm]), %%ecx \n\t" - "mov %c[rdx](%[svm]), %%edx \n\t" - "mov %c[rsi](%[svm]), %%esi \n\t" - "mov %c[rdi](%[svm]), %%edi \n\t" - "mov %c[rbp](%[svm]), %%ebp \n\t" #endif -#ifdef CONFIG_X86_64 /* Enter guest mode */ - "push %%rax \n\t" - "mov %c[vmcb](%[svm]), %%rax \n\t" + "push %%"R"ax \n\t" + "mov %c[vmcb](%[svm]), %%"R"ax \n\t" __ex(SVM_VMLOAD) "\n\t" __ex(SVM_VMRUN) "\n\t" __ex(SVM_VMSAVE) "\n\t" - "pop %%rax \n\t" -#else - /* Enter guest mode */ - "push %%eax \n\t" - "mov %c[vmcb](%[svm]), %%eax \n\t" - __ex(SVM_VMLOAD) "\n\t" - __ex(SVM_VMRUN) "\n\t" - __ex(SVM_VMSAVE) "\n\t" - "pop %%eax \n\t" -#endif + "pop %%"R"ax \n\t" /* Save guest registers, load host registers */ + "mov %%"R"bx, %c[rbx](%[svm]) \n\t" + "mov %%"R"cx, %c[rcx](%[svm]) \n\t" + "mov %%"R"dx, %c[rdx](%[svm]) \n\t" + "mov %%"R"si, %c[rsi](%[svm]) \n\t" + "mov %%"R"di, %c[rdi](%[svm]) \n\t" + "mov %%"R"bp, %c[rbp](%[svm]) \n\t" #ifdef CONFIG_X86_64 - "mov %%rbx, %c[rbx](%[svm]) \n\t" - "mov %%rcx, %c[rcx](%[svm]) \n\t" - "mov %%rdx, %c[rdx](%[svm]) \n\t" - "mov %%rsi, %c[rsi](%[svm]) \n\t" - "mov %%rdi, %c[rdi](%[svm]) \n\t" - "mov %%rbp, %c[rbp](%[svm]) \n\t" "mov %%r8, %c[r8](%[svm]) \n\t" "mov %%r9, %c[r9](%[svm]) \n\t" "mov %%r10, %c[r10](%[svm]) \n\t" @@ -1799,18 +1783,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" - - "pop %%rbp; \n\t" -#else - "mov %%ebx, %c[rbx](%[svm]) \n\t" - "mov %%ecx, %c[rcx](%[svm]) \n\t" - "mov %%edx, %c[rdx](%[svm]) \n\t" - "mov %%esi, %c[rsi](%[svm]) \n\t" - "mov %%edi, %c[rdi](%[svm]) \n\t" - "mov %%ebp, %c[rbp](%[svm]) \n\t" - - "pop %%ebp; \n\t" #endif + "pop %%"R"bp" : : [svm]"a"(svm), [vmcb]"i"(offsetof(struct vcpu_svm, vmcb_pa)), @@ -1831,11 +1805,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) [r15]"i"(offsetof(struct vcpu_svm, vcpu.arch.regs[VCPU_REGS_R15])) #endif : "cc", "memory" + , R"bx", R"cx", R"dx", R"si", R"di" #ifdef CONFIG_X86_64 - , "rbx", "rcx", "rdx", "rsi", "rdi" , "r8", "r9", "r10", "r11" , "r12", "r13", "r14", "r15" -#else - , "ebx", "ecx", "edx" , "esi", "edi" #endif ); @@ -1867,6 +1839,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) svm->next_rip = 0; } +#undef R + static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root) { struct vcpu_svm *svm = to_svm(vcpu); -- cgit v1.2.2 From 313dbd49dc239205b96da79fba09f7637cf84f3c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 17 Jul 2008 18:04:30 +0300 Subject: KVM: VMX: Avoid vmwrite(HOST_RSP) when possible Usually HOST_RSP retains its value across guest entries. Take advantage of this and avoid a vmwrite() when this is so. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index fc8996b1043b..ddb49e34697b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -58,6 +58,7 @@ struct vmcs { struct vcpu_vmx { struct kvm_vcpu vcpu; struct list_head local_vcpus_link; + unsigned long host_rsp; int launched; u8 fail; u32 idt_vectoring_info; @@ -2982,7 +2983,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Store host registers */ "push %%"R"dx; push %%"R"bp;" "push %%"R"cx \n\t" + "cmp %%"R"sp, %c[host_rsp](%0) \n\t" + "je 1f \n\t" + "mov %%"R"sp, %c[host_rsp](%0) \n\t" __ex(ASM_VMX_VMWRITE_RSP_RDX) "\n\t" + "1: \n\t" /* Check if vmlaunch of vmresume is needed */ "cmpl $0, %c[launched](%0) \n\t" /* Load guest registers. Don't clobber flags. */ @@ -3039,6 +3044,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) : : "c"(vmx), "d"((unsigned long)HOST_RSP), [launched]"i"(offsetof(struct vcpu_vmx, launched)), [fail]"i"(offsetof(struct vcpu_vmx, fail)), + [host_rsp]"i"(offsetof(struct vcpu_vmx, host_rsp)), [rax]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RAX])), [rbx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RBX])), [rcx]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_RCX])), -- cgit v1.2.2 From b5e2fec0ebc3fcaff954092bb69444a67a904c0a Mon Sep 17 00:00:00 2001 From: Alexander Graf Date: Tue, 22 Jul 2008 08:00:45 +0200 Subject: KVM: Ignore DEBUGCTL MSRs with no effect Netware writes to DEBUGCTL and reads from the DEBUGCTL and LAST*IP MSRs without further checks and is really confused to receive a #GP during that. To make it happy we should just make them stubs, which is exactly what SVM already does. Writes to DEBUGCTL that are vendor-specific are resembled to behave as if the virtual CPU does not know them. Signed-off-by: Alexander Graf Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5620df2685db..94a216562f10 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -665,6 +665,18 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) pr_unimpl(vcpu, "%s: MSR_IA32_MCG_CTL 0x%llx, nop\n", __func__, data); break; + case MSR_IA32_DEBUGCTLMSR: + if (!data) { + /* We support the non-activated case already */ + break; + } else if (data & ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_BTF)) { + /* Values other than LBR and BTF are vendor-specific, + thus reserved and should throw a #GP */ + return 1; + } + pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n", + __func__, data); + break; case MSR_IA32_UCODE_REV: case MSR_IA32_UCODE_WRITE: break; @@ -757,6 +769,11 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+16: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: + case MSR_IA32_DEBUGCTLMSR: + case MSR_IA32_LASTBRANCHFROMIP: + case MSR_IA32_LASTBRANCHTOIP: + case MSR_IA32_LASTINTFROMIP: + case MSR_IA32_LASTINTTOIP: data = 0; break; case MSR_MTRRcap: -- cgit v1.2.2 From 564f15378f04921d5749f27ec53d5e68a6d1d446 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sat, 26 Jul 2008 17:00:59 -0300 Subject: KVM: Add irq ack notifier list This can be used by kvm subsystems that are interested in when interrupts are acked, for example time drift compensation. Signed-off-by: Avi Kivity --- arch/x86/kvm/irq.c | 22 ++++++++++++++++++++++ arch/x86/kvm/irq.h | 5 +++++ 2 files changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 0d9e55275af1..90911958d853 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -111,3 +111,25 @@ void kvm_set_irq(struct kvm *kvm, int irq, int level) kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); kvm_pic_set_irq(pic_irqchip(kvm), irq, level); } + +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) +{ + struct kvm_irq_ack_notifier *kian; + struct hlist_node *n; + + hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) + if (kian->gsi == gsi) + kian->irq_acked(kian); +} + +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); +} + +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian) +{ + hlist_del(&kian->link); +} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 07ff2aef0c13..95fe718e3abc 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -83,6 +83,11 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); void kvm_set_irq(struct kvm *kvm, int irq, int level); +void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); +void kvm_register_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); +void kvm_unregister_irq_ack_notifier(struct kvm *kvm, + struct kvm_irq_ack_notifier *kian); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); -- cgit v1.2.2 From f52447261bc8c21dfd4635196e32d2da1352f589 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 26 Jul 2008 17:01:00 -0300 Subject: KVM: irq ack notification Based on a patch from: Ben-Ami Yassour which was based on a patch from: Amit Shah Notify IRQ acking on PIC/APIC emulation. The previous patch missed two things: - Edge triggered interrupts on IOAPIC - PIC reset with IRR/ISR set should be equivalent to ack (LAPIC probably needs something similar). Signed-off-by: Marcelo Tosatti CC: Amit Shah CC: Ben-Ami Yassour Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 12 +++++++++++- arch/x86/kvm/irq.c | 2 +- arch/x86/kvm/irq.h | 3 ++- arch/x86/kvm/lapic.c | 7 +++++-- 4 files changed, 19 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 55e179ad98ef..de704995b819 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -159,9 +159,10 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) s->irr &= ~(1 << irq); } -int kvm_pic_read_irq(struct kvm_pic *s) +int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; + struct kvm_pic *s = pic_irqchip(kvm); irq = pic_get_irq(&s->pics[0]); if (irq >= 0) { @@ -187,12 +188,21 @@ int kvm_pic_read_irq(struct kvm_pic *s) intno = s->pics[0].irq_base + irq; } pic_update_irq(s); + kvm_notify_acked_irq(kvm, irq); return intno; } void kvm_pic_reset(struct kvm_kpic_state *s) { + int irq; + struct kvm *kvm = s->pics_state->irq_request_opaque; + + for (irq = 0; irq < PIC_NUM_PINS; irq++) { + if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) || + s->isr & (1 << irq))) + kvm_notify_acked_irq(kvm, irq); + } s->last_irr = 0; s->irr = 0; s->imr = 0; diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 90911958d853..3c508afaa285 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -72,7 +72,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v) if (kvm_apic_accept_pic_intr(v)) { s = pic_irqchip(v->kvm); s->output = 0; /* PIC */ - vector = kvm_pic_read_irq(s); + vector = kvm_pic_read_irq(v->kvm); } } return vector; diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 95fe718e3abc..479a3d2d5614 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -63,11 +63,12 @@ struct kvm_pic { void *irq_request_opaque; int output; /* intr from master PIC */ struct kvm_io_device dev; + void (*ack_notifier)(void *opaque, int irq); }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_pic_set_irq(void *opaque, int irq, int level); -int kvm_pic_read_irq(struct kvm_pic *s); +int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 9fde0ac24268..be94f93a73f6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -439,7 +439,7 @@ struct kvm_vcpu *kvm_get_lowest_prio_vcpu(struct kvm *kvm, u8 vector, static void apic_set_eoi(struct kvm_lapic *apic) { int vector = apic_find_highest_isr(apic); - + int trigger_mode; /* * Not every write EOI will has corresponding ISR, * one example is when Kernel check timer on setup_IO_APIC @@ -451,7 +451,10 @@ static void apic_set_eoi(struct kvm_lapic *apic) apic_update_ppr(apic); if (apic_test_and_clear_vector(vector, apic->regs + APIC_TMR)) - kvm_ioapic_update_eoi(apic->vcpu->kvm, vector); + trigger_mode = IOAPIC_LEVEL_TRIG; + else + trigger_mode = IOAPIC_EDGE_TRIG; + kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); } static void apic_send_ipi(struct kvm_lapic *apic) -- cgit v1.2.2 From 3cf57fed216e2c1b6fdfeccb792650bab72a350a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Sat, 26 Jul 2008 17:01:01 -0300 Subject: KVM: PIT: fix injection logic and count The PIT injection logic is problematic under the following cases: 1) If there is a higher priority vector to be delivered by the time kvm_pit_timer_intr_post is invoked ps->inject_pending won't be set. This opens the possibility for missing many PIT event injections (say if guest executes hlt at this point). 2) ps->inject_pending is racy with more than two vcpus. Since there's no locking around read/dec of pt->pending, two vcpu's can inject two interrupts for a single pt->pending count. Fix 1 by using an irq ack notifier: only reinject when the previous irq has been acked. Fix 2 with appropriate locking around manipulation of pending count and irq_ack by the injection / ack paths. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 70 ++++++++++++++++++++++++++-------------------------- arch/x86/kvm/i8254.h | 7 +++--- arch/x86/kvm/irq.c | 1 - 3 files changed, 38 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c0f7872a9124..7d04dd3ef857 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -207,6 +207,8 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); pt->scheduled = ktime_to_ns(pt->timer.expires); + if (pt->period) + ps->channels[0].count_load_time = pt->timer.expires; return (pt->period == 0 ? 0 : 1); } @@ -215,12 +217,22 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) { struct kvm_pit *pit = vcpu->kvm->arch.vpit; - if (pit && vcpu->vcpu_id == 0 && pit->pit_state.inject_pending) + if (pit && vcpu->vcpu_id == 0 && pit->pit_state.irq_ack) return atomic_read(&pit->pit_state.pit_timer.pending); - return 0; } +void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, + irq_ack_notifier); + spin_lock(&ps->inject_lock); + if (atomic_dec_return(&ps->pit_timer.pending) < 0) + WARN_ON(1); + ps->irq_ack = 1; + spin_unlock(&ps->inject_lock); +} + static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) { struct kvm_kpit_state *ps; @@ -255,8 +267,9 @@ static void destroy_pit_timer(struct kvm_kpit_timer *pt) hrtimer_cancel(&pt->timer); } -static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) +static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period) { + struct kvm_kpit_timer *pt = &ps->pit_timer; s64 interval; interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); @@ -268,6 +281,7 @@ static void create_pit_timer(struct kvm_kpit_timer *pt, u32 val, int is_period) pt->period = (is_period == 0) ? 0 : interval; pt->timer.function = pit_timer_fn; atomic_set(&pt->pending, 0); + ps->irq_ack = 1; hrtimer_start(&pt->timer, ktime_add_ns(ktime_get(), interval), HRTIMER_MODE_ABS); @@ -302,11 +316,11 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val) case 1: /* FIXME: enhance mode 4 precision */ case 4: - create_pit_timer(&ps->pit_timer, val, 0); + create_pit_timer(ps, val, 0); break; case 2: case 3: - create_pit_timer(&ps->pit_timer, val, 1); + create_pit_timer(ps, val, 1); break; default: destroy_pit_timer(&ps->pit_timer); @@ -520,7 +534,7 @@ void kvm_pit_reset(struct kvm_pit *pit) mutex_unlock(&pit->pit_state.lock); atomic_set(&pit->pit_state.pit_timer.pending, 0); - pit->pit_state.inject_pending = 1; + pit->pit_state.irq_ack = 1; } struct kvm_pit *kvm_create_pit(struct kvm *kvm) @@ -534,6 +548,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) mutex_init(&pit->pit_state.lock); mutex_lock(&pit->pit_state.lock); + spin_lock_init(&pit->pit_state.inject_lock); /* Initialize PIO device */ pit->dev.read = pit_ioport_read; @@ -555,6 +570,9 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm) pit_state->pit = pit; hrtimer_init(&pit_state->pit_timer.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + pit_state->irq_ack_notifier.gsi = 0; + pit_state->irq_ack_notifier.irq_acked = kvm_pit_ack_irq; + kvm_register_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier); mutex_unlock(&pit->pit_state.lock); kvm_pit_reset(pit); @@ -592,37 +610,19 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu) struct kvm_kpit_state *ps; if (vcpu && pit) { + int inject = 0; ps = &pit->pit_state; - /* Try to inject pending interrupts when: - * 1. Pending exists - * 2. Last interrupt was accepted or waited for too long time*/ - if (atomic_read(&ps->pit_timer.pending) && - (ps->inject_pending || - (jiffies - ps->last_injected_time - >= KVM_MAX_PIT_INTR_INTERVAL))) { - ps->inject_pending = 0; - __inject_pit_timer_intr(kvm); - ps->last_injected_time = jiffies; - } - } -} - -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec) -{ - struct kvm_arch *arch = &vcpu->kvm->arch; - struct kvm_kpit_state *ps; - - if (vcpu && arch->vpit) { - ps = &arch->vpit->pit_state; - if (atomic_read(&ps->pit_timer.pending) && - (((arch->vpic->pics[0].imr & 1) == 0 && - arch->vpic->pics[0].irq_base == vec) || - (arch->vioapic->redirtbl[0].fields.vector == vec && - arch->vioapic->redirtbl[0].fields.mask != 1))) { - ps->inject_pending = 1; - atomic_dec(&ps->pit_timer.pending); - ps->channels[0].count_load_time = ktime_get(); + /* Try to inject pending interrupts when + * last one has been acked. + */ + spin_lock(&ps->inject_lock); + if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { + ps->irq_ack = 0; + inject = 1; } + spin_unlock(&ps->inject_lock); + if (inject) + __inject_pit_timer_intr(kvm); } } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index db25c2a6c8c4..e436d4983aa1 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -8,7 +8,6 @@ struct kvm_kpit_timer { int irq; s64 period; /* unit: ns */ s64 scheduled; - ktime_t last_update; atomic_t pending; }; @@ -34,8 +33,9 @@ struct kvm_kpit_state { u32 speaker_data_on; struct mutex lock; struct kvm_pit *pit; - bool inject_pending; /* if inject pending interrupts */ - unsigned long last_injected_time; + spinlock_t inject_lock; + unsigned long irq_ack; + struct kvm_irq_ack_notifier irq_ack_notifier; }; struct kvm_pit { @@ -54,7 +54,6 @@ struct kvm_pit { #define KVM_PIT_CHANNEL_MASK 0x3 void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); -void kvm_pit_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val); struct kvm_pit *kvm_create_pit(struct kvm *kvm); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 3c508afaa285..8c1b9c5def78 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -90,7 +90,6 @@ EXPORT_SYMBOL_GPL(kvm_inject_pending_timer_irqs); void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec) { kvm_apic_timer_intr_post(vcpu, vec); - kvm_pit_timer_intr_post(vcpu, vec); /* TODO: PIT, RTC etc. */ } EXPORT_SYMBOL_GPL(kvm_timer_intr_post); -- cgit v1.2.2 From 3807f345b2c610336c17c7624a0d496a38df75a0 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Jul 2008 11:47:52 -0300 Subject: x86: paravirt: factor out cpu_khz to common code KVM intends to use paravirt code to calibrate khz. Xen current code will do just fine. So as a first step, factor out code to pvclock.c. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kernel/pvclock.c | 12 ++++++++++++ arch/x86/xen/time.c | 11 ++--------- 2 files changed, 14 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 05fbe9a0325a..1c54b5fb7aed 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -97,6 +97,18 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, return dst->version; } +unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) +{ + u64 tsc_khz = 1000000ULL << 32; + + do_div(tsc_khz, src->tsc_to_system_mul); + if (src->tsc_shift < 0) + tsc_khz <<= -src->tsc_shift; + else + tsc_khz >>= src->tsc_shift; + return tsc_khz; +} + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 004ba86326ae..c9f7cda48ed7 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -198,17 +198,10 @@ unsigned long long xen_sched_clock(void) /* Get the TSC speed from Xen */ unsigned long xen_tsc_khz(void) { - u64 xen_khz = 1000000ULL << 32; - const struct pvclock_vcpu_time_info *info = + struct pvclock_vcpu_time_info *info = &HYPERVISOR_shared_info->vcpu_info[0].time; - do_div(xen_khz, info->tsc_to_system_mul); - if (info->tsc_shift < 0) - xen_khz <<= -info->tsc_shift; - else - xen_khz >>= info->tsc_shift; - - return xen_khz; + return pvclock_tsc_khz(info); } cycle_t xen_clocksource_read(void) -- cgit v1.2.2 From 0293615f3fb9886b6b23800c121be293bb7483e9 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Mon, 28 Jul 2008 11:47:53 -0300 Subject: x86: KVM guest: use paravirt function to calculate cpu khz We're currently facing timing problems in guests that do calibration under heavy load, and then the load vanishes. This means we'll have a much lower lpj than we actually should, and delays end up taking less time than they should, which is a nasty bug. Solution is to pass on the lpj value from host to guest, and have it preset. Signed-off-by: Glauber Costa Signed-off-by: Avi Kivity --- arch/x86/kernel/kvmclock.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index d02def06ca91..774ac4991568 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -78,6 +78,34 @@ static cycle_t kvm_clock_read(void) return ret; } +/* + * If we don't do that, there is the possibility that the guest + * will calibrate under heavy load - thus, getting a lower lpj - + * and execute the delays themselves without load. This is wrong, + * because no delay loop can finish beforehand. + * Any heuristics is subject to fail, because ultimately, a large + * poll of guests can be running and trouble each other. So we preset + * lpj here + */ +static unsigned long kvm_get_tsc_khz(void) +{ + return preset_lpj; +} + +static void kvm_get_preset_lpj(void) +{ + struct pvclock_vcpu_time_info *src; + unsigned long khz; + u64 lpj; + + src = &per_cpu(hv_clock, 0); + khz = pvclock_tsc_khz(src); + + lpj = ((u64)khz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; +} + static struct clocksource kvm_clock = { .name = "kvm-clock", .read = kvm_clock_read, @@ -153,6 +181,7 @@ void __init kvmclock_init(void) pv_time_ops.get_wallclock = kvm_get_wallclock; pv_time_ops.set_wallclock = kvm_set_wallclock; pv_time_ops.sched_clock = kvm_clock_read; + pv_time_ops.get_tsc_khz = kvm_get_tsc_khz; #ifdef CONFIG_X86_LOCAL_APIC pv_apic_ops.setup_secondary_clock = kvm_setup_secondary_clock; #endif @@ -163,6 +192,7 @@ void __init kvmclock_init(void) #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; #endif + kvm_get_preset_lpj(); clocksource_register(&kvm_clock); } } -- cgit v1.2.2 From 4d5c5d0fe89c921336b95f5e7e4f529a9df92f53 Mon Sep 17 00:00:00 2001 From: Ben-Ami Yassour Date: Mon, 28 Jul 2008 19:26:26 +0300 Subject: KVM: pci device assignment Based on a patch from: Amit Shah This patch adds support for handling PCI devices that are assigned to the guest. The device to be assigned to the guest is registered in the host kernel and interrupt delivery is handled. If a device is already assigned, or the device driver for it is still loaded on the host, the device assignment is failed by conveying a -EBUSY reply to the userspace. Devices that share their interrupt line are not supported at the moment. By itself, this patch will not make devices work within the guest. The VT-d extension is required to enable the device to perform DMA. Another alternative is PVDMA. Signed-off-by: Amit Shah Signed-off-by: Ben-Ami Yassour Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 243 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 243 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 94a216562f10..a97157cc42ae 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4,10 +4,14 @@ * derived from drivers/kvm/kvm_main.c * * Copyright (C) 2006 Qumranet, Inc. + * Copyright (C) 2008 Qumranet, Inc. + * Copyright IBM Corporation, 2008 * * Authors: * Avi Kivity * Yaniv Kamay + * Amit Shah + * Ben-Ami Yassour * * This work is licensed under the terms of the GNU GPL, version 2. See * the COPYING file in the top-level directory. @@ -23,8 +27,10 @@ #include "x86.h" #include +#include #include #include +#include #include #include #include @@ -98,6 +104,219 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; +struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, + int assigned_dev_id) +{ + struct list_head *ptr; + struct kvm_assigned_dev_kernel *match; + + list_for_each(ptr, head) { + match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); + if (match->assigned_dev_id == assigned_dev_id) + return match; + } + return NULL; +} + +static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) +{ + struct kvm_assigned_dev_kernel *assigned_dev; + + assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, + interrupt_work); + + /* This is taken to safely inject irq inside the guest. When + * the interrupt injection (or the ioapic code) uses a + * finer-grained lock, update this + */ + mutex_lock(&assigned_dev->kvm->lock); + kvm_set_irq(assigned_dev->kvm, + assigned_dev->guest_irq, 1); + mutex_unlock(&assigned_dev->kvm->lock); + kvm_put_kvm(assigned_dev->kvm); +} + +/* FIXME: Implement the OR logic needed to make shared interrupts on + * this line behave properly + */ +static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) +{ + struct kvm_assigned_dev_kernel *assigned_dev = + (struct kvm_assigned_dev_kernel *) dev_id; + + kvm_get_kvm(assigned_dev->kvm); + schedule_work(&assigned_dev->interrupt_work); + disable_irq_nosync(irq); + return IRQ_HANDLED; +} + +/* Ack the irq line for an assigned device */ +static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) +{ + struct kvm_assigned_dev_kernel *dev; + + if (kian->gsi == -1) + return; + + dev = container_of(kian, struct kvm_assigned_dev_kernel, + ack_notifier); + kvm_set_irq(dev->kvm, dev->guest_irq, 0); + enable_irq(dev->host_irq); +} + +static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, + struct kvm_assigned_irq + *assigned_irq) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_irq->assigned_dev_id); + if (!match) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + + if (match->irq_requested) { + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + mutex_unlock(&kvm->lock); + return 0; + } + + INIT_WORK(&match->interrupt_work, + kvm_assigned_dev_interrupt_work_handler); + + if (irqchip_in_kernel(kvm)) { + if (assigned_irq->host_irq) + match->host_irq = assigned_irq->host_irq; + else + match->host_irq = match->dev->irq; + match->guest_irq = assigned_irq->guest_irq; + match->ack_notifier.gsi = assigned_irq->guest_irq; + match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; + kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); + + /* Even though this is PCI, we don't want to use shared + * interrupts. Sharing host devices with guest-assigned devices + * on the same interrupt line is not a happy situation: there + * are going to be long delays in accepting, acking, etc. + */ + if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, + "kvm_assigned_device", (void *)match)) { + printk(KERN_INFO "%s: couldn't allocate irq for pv " + "device\n", __func__); + r = -EIO; + goto out; + } + } + + match->irq_requested = true; +out: + mutex_unlock(&kvm->lock); + return r; +} + +static int kvm_vm_ioctl_assign_device(struct kvm *kvm, + struct kvm_assigned_pci_dev *assigned_dev) +{ + int r = 0; + struct kvm_assigned_dev_kernel *match; + struct pci_dev *dev; + + mutex_lock(&kvm->lock); + + match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, + assigned_dev->assigned_dev_id); + if (match) { + /* device already assigned */ + r = -EINVAL; + goto out; + } + + match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); + if (match == NULL) { + printk(KERN_INFO "%s: Couldn't allocate memory\n", + __func__); + r = -ENOMEM; + goto out; + } + dev = pci_get_bus_and_slot(assigned_dev->busnr, + assigned_dev->devfn); + if (!dev) { + printk(KERN_INFO "%s: host device not found\n", __func__); + r = -EINVAL; + goto out_free; + } + if (pci_enable_device(dev)) { + printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); + r = -EBUSY; + goto out_put; + } + r = pci_request_regions(dev, "kvm_assigned_device"); + if (r) { + printk(KERN_INFO "%s: Could not get access to device regions\n", + __func__); + goto out_disable; + } + match->assigned_dev_id = assigned_dev->assigned_dev_id; + match->host_busnr = assigned_dev->busnr; + match->host_devfn = assigned_dev->devfn; + match->dev = dev; + + match->kvm = kvm; + + list_add(&match->list, &kvm->arch.assigned_dev_head); + +out: + mutex_unlock(&kvm->lock); + return r; +out_disable: + pci_disable_device(dev); +out_put: + pci_dev_put(dev); +out_free: + kfree(match); + mutex_unlock(&kvm->lock); + return r; +} + +static void kvm_free_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) { + free_irq(assigned_dev->host_irq, + (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, + &assigned_dev-> + ack_notifier); + } + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); + } +} unsigned long segment_base(u16 selector) { @@ -1766,6 +1985,28 @@ long kvm_arch_vm_ioctl(struct file *filp, r = 0; break; } + case KVM_ASSIGN_PCI_DEVICE: { + struct kvm_assigned_pci_dev assigned_dev; + + r = -EFAULT; + if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) + goto out; + r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); + if (r) + goto out; + break; + } + case KVM_ASSIGN_IRQ: { + struct kvm_assigned_irq assigned_irq; + + r = -EFAULT; + if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) + goto out; + r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); + if (r) + goto out; + break; + } case KVM_GET_PIT: { struct kvm_pit_state ps; r = -EFAULT; @@ -3945,6 +4186,7 @@ struct kvm *kvm_arch_create_vm(void) return ERR_PTR(-ENOMEM); INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); + INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); return kvm; } @@ -3977,6 +4219,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_free_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); -- cgit v1.2.2 From f0d662759a2465babdba1160749c446648c9d159 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:45 -0700 Subject: KVM: Reduce kvm stack usage in kvm_arch_vm_ioctl() On my machine with gcc 3.4, kvm uses ~2k of stack in a few select functions. This is mostly because gcc fails to notice that the different case: statements could have their stack usage combined. It overflows very nicely if interrupts happen during one of these large uses. This patch uses two methods for reducing stack usage. 1. dynamically allocate large objects instead of putting on the stack. 2. Use a union{} member for all of the case variables. This tricks gcc into combining them all into a single stack allocation. (There's also a comment on this) Signed-off-by: Dave Hansen Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 72 ++++++++++++++++++++++++++++++++++-------------------- 1 file changed, 45 insertions(+), 27 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a97157cc42ae..87d434228fe2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1869,6 +1869,15 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; void __user *argp = (void __user *)arg; int r = -EINVAL; + /* + * This union makes it completely explicit to gcc-3.x + * that these two variables' stack usage should be + * combined, not added together. + */ + union { + struct kvm_pit_state ps; + struct kvm_memory_alias alias; + } u; switch (ioctl) { case KVM_SET_TSS_ADDR: @@ -1900,17 +1909,14 @@ long kvm_arch_vm_ioctl(struct file *filp, case KVM_GET_NR_MMU_PAGES: r = kvm_vm_ioctl_get_nr_mmu_pages(kvm); break; - case KVM_SET_MEMORY_ALIAS: { - struct kvm_memory_alias alias; - + case KVM_SET_MEMORY_ALIAS: r = -EFAULT; - if (copy_from_user(&alias, argp, sizeof alias)) + if (copy_from_user(&u.alias, argp, sizeof(struct kvm_memory_alias))) goto out; - r = kvm_vm_ioctl_set_memory_alias(kvm, &alias); + r = kvm_vm_ioctl_set_memory_alias(kvm, &u.alias); if (r) goto out; break; - } case KVM_CREATE_IRQCHIP: r = -ENOMEM; kvm->arch.vpic = kvm_create_pic(kvm); @@ -1952,37 +1958,51 @@ long kvm_arch_vm_ioctl(struct file *filp, } case KVM_GET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto get_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_get_irqchip(kvm, &chip); + goto get_irqchip_out; + r = kvm_vm_ioctl_get_irqchip(kvm, chip); if (r) - goto out; + goto get_irqchip_out; r = -EFAULT; - if (copy_to_user(argp, &chip, sizeof chip)) - goto out; + if (copy_to_user(argp, chip, sizeof *chip)) + goto get_irqchip_out; r = 0; + get_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_SET_IRQCHIP: { /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ - struct kvm_irqchip chip; + struct kvm_irqchip *chip = kmalloc(sizeof(*chip), GFP_KERNEL); - r = -EFAULT; - if (copy_from_user(&chip, argp, sizeof chip)) + r = -ENOMEM; + if (!chip) goto out; + r = -EFAULT; + if (copy_from_user(chip, argp, sizeof *chip)) + goto set_irqchip_out; r = -ENXIO; if (!irqchip_in_kernel(kvm)) - goto out; - r = kvm_vm_ioctl_set_irqchip(kvm, &chip); + goto set_irqchip_out; + r = kvm_vm_ioctl_set_irqchip(kvm, chip); if (r) - goto out; + goto set_irqchip_out; r = 0; + set_irqchip_out: + kfree(chip); + if (r) + goto out; break; } case KVM_ASSIGN_PCI_DEVICE: { @@ -2008,31 +2028,29 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } case KVM_GET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_get_pit(kvm, &ps); + r = kvm_vm_ioctl_get_pit(kvm, &u.ps); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &ps, sizeof ps)) + if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state))) goto out; r = 0; break; } case KVM_SET_PIT: { - struct kvm_pit_state ps; r = -EFAULT; - if (copy_from_user(&ps, argp, sizeof ps)) + if (copy_from_user(&u.ps, argp, sizeof u.ps)) goto out; r = -ENXIO; if (!kvm->arch.vpit) goto out; - r = kvm_vm_ioctl_set_pit(kvm, &ps); + r = kvm_vm_ioctl_set_pit(kvm, &u.ps); if (r) goto out; r = 0; -- cgit v1.2.2 From b772ff362ec6b821c8a5227a3355e263f917bfad Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:47 -0700 Subject: KVM: Reduce stack usage in kvm_arch_vcpu_ioctl() [sheng: fix KVM_GET_LAPIC using wrong size] Signed-off-by: Dave Hansen Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 87d434228fe2..f1b0223c4088 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1542,28 +1542,33 @@ long kvm_arch_vcpu_ioctl(struct file *filp, struct kvm_vcpu *vcpu = filp->private_data; void __user *argp = (void __user *)arg; int r; + struct kvm_lapic_state *lapic = NULL; switch (ioctl) { case KVM_GET_LAPIC: { - struct kvm_lapic_state lapic; + lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); - memset(&lapic, 0, sizeof lapic); - r = kvm_vcpu_ioctl_get_lapic(vcpu, &lapic); + r = -ENOMEM; + if (!lapic) + goto out; + r = kvm_vcpu_ioctl_get_lapic(vcpu, lapic); if (r) goto out; r = -EFAULT; - if (copy_to_user(argp, &lapic, sizeof lapic)) + if (copy_to_user(argp, lapic, sizeof(struct kvm_lapic_state))) goto out; r = 0; break; } case KVM_SET_LAPIC: { - struct kvm_lapic_state lapic; - + lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); + r = -ENOMEM; + if (!lapic) + goto out; r = -EFAULT; - if (copy_from_user(&lapic, argp, sizeof lapic)) + if (copy_from_user(lapic, argp, sizeof(struct kvm_lapic_state))) goto out; - r = kvm_vcpu_ioctl_set_lapic(vcpu, &lapic);; + r = kvm_vcpu_ioctl_set_lapic(vcpu, lapic); if (r) goto out; r = 0; @@ -1661,6 +1666,8 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = -EINVAL; } out: + if (lapic) + kfree(lapic); return r; } -- cgit v1.2.2 From 6ad18fba05228fb1d47cdbc0339fe8b3fca1ca26 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Mon, 11 Aug 2008 10:01:49 -0700 Subject: KVM: Reduce stack usage in kvm_pv_mmu_op() We're in a hot path. We can't use kmalloc() because it might impact performance. So, we just stick the buffer that we need into the kvm_vcpu_arch structure. This is used very often, so it is not really a waste. We also have to move the buffer structure's definition to the arch-specific x86 kvm header. Signed-off-by: Dave Hansen Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c3afbfe6b0c1..171bcea1be21 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,13 +135,6 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) -struct kvm_pv_mmu_op_buffer { - void *ptr; - unsigned len; - unsigned processed; - char buf[512] __aligned(sizeof(long)); -}; - struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; @@ -2292,18 +2285,18 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, gpa_t addr, unsigned long *ret) { int r; - struct kvm_pv_mmu_op_buffer buffer; + struct kvm_pv_mmu_op_buffer *buffer = &vcpu->arch.mmu_op_buffer; - buffer.ptr = buffer.buf; - buffer.len = min_t(unsigned long, bytes, sizeof buffer.buf); - buffer.processed = 0; + buffer->ptr = buffer->buf; + buffer->len = min_t(unsigned long, bytes, sizeof buffer->buf); + buffer->processed = 0; - r = kvm_read_guest(vcpu->kvm, addr, buffer.buf, buffer.len); + r = kvm_read_guest(vcpu->kvm, addr, buffer->buf, buffer->len); if (r) goto out; - while (buffer.len) { - r = kvm_pv_mmu_op_one(vcpu, &buffer); + while (buffer->len) { + r = kvm_pv_mmu_op_one(vcpu, buffer); if (r < 0) goto out; if (r == 0) @@ -2312,7 +2305,7 @@ int kvm_pv_mmu_op(struct kvm_vcpu *vcpu, unsigned long bytes, r = 1; out: - *ret = buffer.processed; + *ret = buffer->processed; return r; } -- cgit v1.2.2 From 464d17c8b747deb77d1bf8c14cc4f28aab2a4952 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Wed, 13 Aug 2008 14:10:33 +0800 Subject: KVM: VMX: Clean up magic number 0x66 in init_rmode_tss Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ddb49e34697b..229e2d06fa2a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1732,7 +1732,8 @@ static int init_rmode_tss(struct kvm *kvm) if (r < 0) goto out; data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE; - r = kvm_write_guest_page(kvm, fn++, &data, 0x66, sizeof(u16)); + r = kvm_write_guest_page(kvm, fn++, &data, + TSS_IOPB_BASE_OFFSET, sizeof(u16)); if (r < 0) goto out; r = kvm_clear_guest_page(kvm, fn++, 0, PAGE_SIZE); -- cgit v1.2.2 From 29415c37f043d1d54dcf356601d738ff6633b72b Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Fri, 1 Aug 2008 20:09:13 -0300 Subject: KVM: set debug registers after "schedulable" section The vcpu thread can be preempted after the guest_debug_pre() callback, resulting in invalid debug registers on the new vcpu. Move it inside the non-preemptable section. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f1b0223c4088..4a033757a19e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3113,10 +3113,6 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) down_read(&vcpu->kvm->slots_lock); vapic_enter(vcpu); -preempted: - if (vcpu->guest_debug.enabled) - kvm_x86_ops->guest_debug_pre(vcpu); - again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) @@ -3170,6 +3166,9 @@ again: goto out; } + if (vcpu->guest_debug.enabled) + kvm_x86_ops->guest_debug_pre(vcpu); + vcpu->guest_mode = 1; /* * Make sure that guest_mode assignment won't happen after @@ -3244,7 +3243,7 @@ out: if (r > 0) { kvm_resched(vcpu); down_read(&vcpu->kvm->slots_lock); - goto preempted; + goto again; } post_kvm_run_save(vcpu, kvm_run); -- cgit v1.2.2 From ecfc79c700b02c5ad1ccae58718015caa84824be Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 14 Aug 2008 11:13:16 +0300 Subject: KVM: VMX: Use interrupt queue for !irqchip_in_kernel Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 229e2d06fa2a..81db7d48ab80 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2173,7 +2173,7 @@ static void kvm_do_inject_irq(struct kvm_vcpu *vcpu) clear_bit(bit_index, &vcpu->arch.irq_pending[word_index]); if (!vcpu->arch.irq_pending[word_index]) clear_bit(word_index, &vcpu->arch.irq_summary); - vmx_inject_irq(vcpu, irq); + kvm_queue_interrupt(vcpu, irq); } @@ -2187,13 +2187,12 @@ static void do_interrupt_requests(struct kvm_vcpu *vcpu, (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3) == 0); if (vcpu->arch.interrupt_window_open && - vcpu->arch.irq_summary && - !(vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) & INTR_INFO_VALID_MASK)) - /* - * If interrupts enabled, and not blocked by sti or mov ss. Good. - */ + vcpu->arch.irq_summary && !vcpu->arch.interrupt.pending) kvm_do_inject_irq(vcpu); + if (vcpu->arch.interrupt_window_open && vcpu->arch.interrupt.pending) + vmx_inject_irq(vcpu, vcpu->arch.interrupt.nr); + cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); if (!vcpu->arch.interrupt_window_open && (vcpu->arch.irq_summary || kvm_run->request_interrupt_window)) -- cgit v1.2.2 From 85428ac7c39ab5fff23b5d14ccb32941e9401285 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Thu, 14 Aug 2008 20:53:25 -0300 Subject: KVM: fix i8259 reset irq acking The irq ack during pic reset has three problems: - Ignores slave/master PIC, using gsi 0-8 for both. - Generates an ACK even if the APIC is in control. - Depends upon IMR being clear, which is broken if the irq was masked at the time it was generated. The last one causes the BIOS to hang after the first reboot of Windows installation, since PIT interrupts stop. [avi: fix check whether pic interrupts are seen by cpu] Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index de704995b819..71e3eeeccae8 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -195,13 +195,19 @@ int kvm_pic_read_irq(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s) { - int irq; + int irq, irqbase; struct kvm *kvm = s->pics_state->irq_request_opaque; + struct kvm_vcpu *vcpu0 = kvm->vcpus[0]; - for (irq = 0; irq < PIC_NUM_PINS; irq++) { - if (!(s->imr & (1 << irq)) && (s->irr & (1 << irq) || - s->isr & (1 << irq))) - kvm_notify_acked_irq(kvm, irq); + if (s == &s->pics_state->pics[0]) + irqbase = 0; + else + irqbase = 8; + + for (irq = 0; irq < PIC_NUM_PINS/2; irq++) { + if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0)) + if (s->irr & (1 << irq) || s->isr & (1 << irq)) + kvm_notify_acked_irq(kvm, irq+irqbase); } s->last_irr = 0; s->irr = 0; -- cgit v1.2.2 From dc7404cea34ef997dfe89ca94d16358e9d29c8d8 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Sun, 17 Aug 2008 16:03:46 +0300 Subject: KVM: Handle spurious acks for PIT interrupts Spurious acks can be generated, for example if the PIC is being reset. Handle those acks gracefully rather than flooding the log with warnings. Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 7d04dd3ef857..c842060c6c04 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -228,7 +228,7 @@ void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) irq_ack_notifier); spin_lock(&ps->inject_lock); if (atomic_dec_return(&ps->pit_timer.pending) < 0) - WARN_ON(1); + atomic_inc(&ps->pit_timer.pending); ps->irq_ack = 1; spin_unlock(&ps->inject_lock); } -- cgit v1.2.2 From 6762b7299aa115e11815decd1fd982d015f09615 Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Wed, 13 Aug 2008 16:22:37 +0300 Subject: KVM: Device assignment: Check for privileges before assigning irq Even though we don't share irqs at the moment, we should ensure regular user processes don't try to allocate system resources. We check for capability to access IO devices (CAP_SYS_RAWIO) before we request_irq on behalf of the guest. Noticed by Avi. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a033757a19e..fffdf4f69c5a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -191,6 +191,11 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, kvm_assigned_dev_interrupt_work_handler); if (irqchip_in_kernel(kvm)) { + if (!capable(CAP_SYS_RAWIO)) { + return -EPERM; + goto out; + } + if (assigned_irq->host_irq) match->host_irq = assigned_irq->host_irq; else -- cgit v1.2.2 From 648dfaa7df2d3692db4e63dcb18dccb275d9c5a7 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:38:32 +0300 Subject: KVM: VMX: Add Guest State Validity Checks This patch adds functions to check whether guest state is VMX compliant. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 180 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 81db7d48ab80..e889b768c751 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1721,6 +1721,186 @@ static void vmx_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt) vmcs_writel(GUEST_GDTR_BASE, dt->base); } +static bool rmode_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + u32 ar; + + vmx_get_segment(vcpu, &var, seg); + ar = vmx_segment_access_rights(&var); + + if (var.base != (var.selector << 4)) + return false; + if (var.limit != 0xffff) + return false; + if (ar != 0xf3) + return false; + + return true; +} + +static bool code_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs; + unsigned int cs_rpl; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + cs_rpl = cs.selector & SELECTOR_RPL_MASK; + + if (~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_ACCESSES_MASK)) + return false; + if (!cs.s) + return false; + if (!(~cs.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK))) { + if (cs.dpl > cs_rpl) + return false; + } else if (cs.type & AR_TYPE_CODE_MASK) { + if (cs.dpl != cs_rpl) + return false; + } + if (!cs.present) + return false; + + /* TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure */ + return true; +} + +static bool stack_segment_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ss; + unsigned int ss_rpl; + + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + ss_rpl = ss.selector & SELECTOR_RPL_MASK; + + if ((ss.type != 3) || (ss.type != 7)) + return false; + if (!ss.s) + return false; + if (ss.dpl != ss_rpl) /* DPL != RPL */ + return false; + if (!ss.present) + return false; + + return true; +} + +static bool data_segment_valid(struct kvm_vcpu *vcpu, int seg) +{ + struct kvm_segment var; + unsigned int rpl; + + vmx_get_segment(vcpu, &var, seg); + rpl = var.selector & SELECTOR_RPL_MASK; + + if (!var.s) + return false; + if (!var.present) + return false; + if (~var.type & (AR_TYPE_CODE_MASK|AR_TYPE_WRITEABLE_MASK)) { + if (var.dpl < rpl) /* DPL < RPL */ + return false; + } + + /* TODO: Add other members to kvm_segment_field to allow checking for other access + * rights flags + */ + return true; +} + +static bool tr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment tr; + + vmx_get_segment(vcpu, &tr, VCPU_SREG_TR); + + if (tr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if ((tr.type != 3) || (tr.type != 11)) /* TODO: Check if guest is in IA32e mode */ + return false; + if (!tr.present) + return false; + + return true; +} + +static bool ldtr_valid(struct kvm_vcpu *vcpu) +{ + struct kvm_segment ldtr; + + vmx_get_segment(vcpu, &ldtr, VCPU_SREG_LDTR); + + if (ldtr.selector & SELECTOR_TI_MASK) /* TI = 1 */ + return false; + if (ldtr.type != 2) + return false; + if (!ldtr.present) + return false; + + return true; +} + +static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu) +{ + struct kvm_segment cs, ss; + + vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); + vmx_get_segment(vcpu, &ss, VCPU_SREG_SS); + + return ((cs.selector & SELECTOR_RPL_MASK) == + (ss.selector & SELECTOR_RPL_MASK)); +} + +/* + * Check if guest state is valid. Returns true if valid, false if + * not. + * We assume that registers are always usable + */ +static bool guest_state_valid(struct kvm_vcpu *vcpu) +{ + /* real mode guest state checks */ + if (!(vcpu->arch.cr0 & X86_CR0_PE)) { + if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!rmode_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + } else { + /* protected mode guest state checks */ + if (!cs_ss_rpl_check(vcpu)) + return false; + if (!code_segment_valid(vcpu)) + return false; + if (!stack_segment_valid(vcpu)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_DS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_ES)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_FS)) + return false; + if (!data_segment_valid(vcpu, VCPU_SREG_GS)) + return false; + if (!tr_valid(vcpu)) + return false; + if (!ldtr_valid(vcpu)) + return false; + } + /* TODO: + * - Add checks on RIP + * - Add checks on RFLAGS + */ + + return true; +} + static int init_rmode_tss(struct kvm *kvm) { gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; -- cgit v1.2.2 From 04fa4d32117b1a7773290fd59a97cf90cfc2a0d4 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:39:48 +0300 Subject: KVM: VMX: Add module parameter and emulation flag. The patch adds the module parameter required to enable emulating invalid guest state, as well as the emulation_required flag used to drive emulation whenever needed. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e889b768c751..7c5f611e1a94 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -49,6 +49,9 @@ module_param(flexpriority_enabled, bool, 0); static int enable_ept = 1; module_param(enable_ept, bool, 0); +static int emulate_invalid_guest_state = 0; +module_param(emulate_invalid_guest_state, bool, 0); + struct vmcs { u32 revision_id; u32 abort; @@ -86,6 +89,7 @@ struct vcpu_vmx { } irq; } rmode; int vpid; + bool emulation_required; }; static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) -- cgit v1.2.2 From ea953ef0ca84e778187905177e2a789a1974837b Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:47:05 +0300 Subject: KVM: VMX: Add invalid guest state handler This adds the invalid guest state handler function which invokes the x86 emulator until getting the guest to a VMX-friendly state. [avi: leave atomic context if scheduling] [guillaume: return to atomic context correctly] Signed-off-by: Laurent Vivier Signed-off-by: Guillaume Thouvenin Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7c5f611e1a94..eae1f2c64f97 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2892,6 +2892,43 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, + struct kvm_run *kvm_run) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + int err; + + preempt_enable(); + local_irq_enable(); + + while (!guest_state_valid(vcpu)) { + err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); + + switch (err) { + case EMULATE_DONE: + break; + case EMULATE_DO_MMIO: + kvm_report_emulation_failure(vcpu, "mmio"); + /* TODO: Handle MMIO */ + return; + default: + kvm_report_emulation_failure(vcpu, "emulation failure"); + return; + } + + if (signal_pending(current)) + break; + if (need_resched()) + schedule(); + } + + local_irq_disable(); + preempt_disable(); + + /* Guest state should be valid now, no more emulation should be needed */ + vmx->emulation_required = 0; +} + /* * The exit handlers return 1 if the exit was handled fully and guest execution * may resume. Otherwise they set the kvm_run parameter to indicate what needs -- cgit v1.2.2 From a89a8fb93ba63d4ad39db97c90997c409c87ccb9 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sun, 17 Aug 2008 16:42:16 +0300 Subject: KVM: VMX: Modify mode switching and vmentry functions This patch modifies mode switching and vmentry function in order to drive invalid guest state emulation. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index eae1f2c64f97..9840f37925a2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1298,7 +1298,9 @@ static void fix_pmode_dataseg(int seg, struct kvm_save_segment *save) static void enter_pmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 0; vmcs_writel(GUEST_TR_BASE, vcpu->arch.rmode.tr.base); @@ -1315,6 +1317,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu) update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + return; + fix_pmode_dataseg(VCPU_SREG_ES, &vcpu->arch.rmode.es); fix_pmode_dataseg(VCPU_SREG_DS, &vcpu->arch.rmode.ds); fix_pmode_dataseg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); @@ -1355,7 +1360,9 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save) static void enter_rmode(struct kvm_vcpu *vcpu) { unsigned long flags; + struct vcpu_vmx *vmx = to_vmx(vcpu); + vmx->emulation_required = 1; vcpu->arch.rmode.active = 1; vcpu->arch.rmode.tr.base = vmcs_readl(GUEST_TR_BASE); @@ -1377,6 +1384,9 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vmcs_writel(GUEST_CR4, vmcs_readl(GUEST_CR4) | X86_CR4_VME); update_exception_bitmap(vcpu); + if (emulate_invalid_guest_state) + goto continue_rmode; + vmcs_write16(GUEST_SS_SELECTOR, vmcs_readl(GUEST_SS_BASE) >> 4); vmcs_write32(GUEST_SS_LIMIT, 0xffff); vmcs_write32(GUEST_SS_AR_BYTES, 0xf3); @@ -1392,6 +1402,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_GS, &vcpu->arch.rmode.gs); fix_rmode_seg(VCPU_SREG_FS, &vcpu->arch.rmode.fs); +continue_rmode: kvm_mmu_reset_context(vcpu); init_rmode(vcpu->kvm); } @@ -2317,6 +2328,9 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) ret = 0; + /* HACK: Don't enable emulation on guest boot/reset */ + vmx->emulation_required = 0; + out: up_read(&vcpu->kvm->slots_lock); return ret; @@ -3190,6 +3204,12 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) struct vcpu_vmx *vmx = to_vmx(vcpu); u32 intr_info; + /* Handle invalid guest state instead of entering VMX */ + if (vmx->emulation_required && emulate_invalid_guest_state) { + handle_invalid_guest_state(vcpu, kvm_run); + return; + } + if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty)) -- cgit v1.2.2 From 94c935a1ee7a9c134f0ebed656384186619d05cb Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 18 Aug 2008 13:11:46 +0300 Subject: KVM: SVM: Fix typo Fix typo in as-yet unused macro definition. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 179c2e00d0f9..be86c096385b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -44,7 +44,7 @@ MODULE_LICENSE("GPL"); #define SVM_FEATURE_NPT (1 << 0) #define SVM_FEATURE_LBRV (1 << 1) -#define SVM_DEATURE_SVML (1 << 2) +#define SVM_FEATURE_SVML (1 << 2) #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) -- cgit v1.2.2 From 29c8fa32c5d1e2d26d53ad9467b3a13130014cdf Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Mon, 18 Aug 2008 15:07:05 +0300 Subject: KVM: Use kvm_set_irq to inject interrupts ... instead of using the pic and ioapic variants Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 6 ++---- arch/x86/kvm/x86.c | 8 +------- 2 files changed, 3 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index c842060c6c04..fdaa0f00e478 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -596,10 +596,8 @@ void kvm_free_pit(struct kvm *kvm) static void __inject_pit_timer_intr(struct kvm *kvm) { mutex_lock(&kvm->lock); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 1); - kvm_ioapic_set_irq(kvm->arch.vioapic, 0, 0); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 1); - kvm_pic_set_irq(pic_irqchip(kvm), 0, 0); + kvm_set_irq(kvm, 0, 1); + kvm_set_irq(kvm, 0, 0); mutex_unlock(&kvm->lock); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index fffdf4f69c5a..5b3c8821b191 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1956,13 +1956,7 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; if (irqchip_in_kernel(kvm)) { mutex_lock(&kvm->lock); - if (irq_event.irq < 16) - kvm_pic_set_irq(pic_irqchip(kvm), - irq_event.irq, - irq_event.level); - kvm_ioapic_set_irq(kvm->arch.vioapic, - irq_event.irq, - irq_event.level); + kvm_set_irq(kvm, irq_event.irq, irq_event.level); mutex_unlock(&kvm->lock); r = 0; } -- cgit v1.2.2 From ee032c993edd34e0bdf64dab06a55d0e08a4eeb9 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Mon, 11 Aug 2008 16:54:20 -0700 Subject: KVM: make irq ack notifier functions static sparse says: arch/x86/kvm/x86.c:107:32: warning: symbol 'kvm_find_assigned_dev' was not declared. Should it be static? arch/x86/kvm/i8254.c:225:6: warning: symbol 'kvm_pit_ack_irq' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 2 +- arch/x86/kvm/x86.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index fdaa0f00e478..4cb443026ec4 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -222,7 +222,7 @@ int pit_has_pending_timer(struct kvm_vcpu *vcpu) return 0; } -void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) +static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) { struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, irq_ack_notifier); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 5b3c8821b191..22edd95712ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -104,7 +104,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, +static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, int assigned_dev_id) { struct list_head *ptr; -- cgit v1.2.2 From 5706be0dafd6f42852f85fbae292301dcad4ccec Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:07:31 +0300 Subject: KVM: VMX: Change cs reset state to be a data segment Real mode cs is a data segment, not a code segment. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 9840f37925a2..6aa305ace79f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2239,6 +2239,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) fx_init(&vmx->vcpu); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode * insists on having GUEST_CS_BASE == GUEST_CS_SELECTOR << 4. Sigh. @@ -2250,8 +2251,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_CS_SELECTOR, vmx->vcpu.arch.sipi_vector << 8); vmcs_writel(GUEST_CS_BASE, vmx->vcpu.arch.sipi_vector << 12); } - vmcs_write32(GUEST_CS_LIMIT, 0xffff); - vmcs_write32(GUEST_CS_AR_BYTES, 0x9b); seg_setup(VCPU_SREG_DS); seg_setup(VCPU_SREG_ES); -- cgit v1.2.2 From a16b20da879430fdf245ed45461ed40ffef8db3c Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:48:27 +0300 Subject: KVM: VMX: Change segment dpl at reset to 3 This is more emulation friendly, if not 100% correct. Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6aa305ace79f..71e57ae1cab7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1991,7 +1991,7 @@ static void seg_setup(int seg) vmcs_write16(sf->selector, 0); vmcs_writel(sf->base, 0); vmcs_write32(sf->limit, 0xffff); - vmcs_write32(sf->ar_bytes, 0x93); + vmcs_write32(sf->ar_bytes, 0xf3); } static int alloc_apic_access_page(struct kvm *kvm) -- cgit v1.2.2 From f4bbd9aaaae23007e4d79536d35a30cbbb11d407 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 20 Aug 2008 15:51:42 +0300 Subject: KVM: Load real mode segments correctly Real mode segments to not reference the GDT or LDT; they simply compute base = selector * 16. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 22edd95712ee..bfc7c332c5d7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3588,11 +3588,33 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } +int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +{ + struct kvm_segment segvar = { + .base = selector << 4, + .limit = 0xffff, + .selector = selector, + .type = 3, + .present = 1, + .dpl = 3, + .db = 0, + .s = 1, + .l = 0, + .g = 0, + .avl = 0, + .unusable = 0, + }; + kvm_x86_ops->set_segment(vcpu, &segvar, seg); + return 0; +} + int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int type_bits, int seg) { struct kvm_segment kvm_seg; + if (!(vcpu->arch.cr0 & X86_CR0_PE)) + return kvm_load_realmode_segment(vcpu, selector, seg); if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg)) return 1; kvm_seg.type |= type_bits; -- cgit v1.2.2 From 41afa025878bc31c9c4e18415fba2435fe035376 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Mon, 18 Aug 2008 21:25:01 -0400 Subject: KVM: x86 emulator: remove duplicate SrcImm Signed-off-by: Roel Kluin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index d5da7f14d536..5d6c1444b611 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -269,7 +269,7 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group3*8] = - DstMem | SrcImm | ModRM | SrcImm, 0, + DstMem | SrcImm | ModRM, 0, DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = -- cgit v1.2.2 From 6eb06cb2863a2ff5704b501f1699216180e790b5 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Aug 2008 17:41:39 +0300 Subject: KVM: x86 emulator: remove bad ByteOp specifier from NEG descriptor Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 5d6c1444b611..ae30435ad331 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -270,7 +270,7 @@ static u16 group_table[] = { 0, 0, 0, 0, [Group3*8] = DstMem | SrcImm | ModRM, 0, - DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, 0, 0, [Group4*8] = ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, -- cgit v1.2.2 From 135f8c2b078533cc74e75f696e73d47304a61125 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 21 Aug 2008 17:49:56 +0300 Subject: KVM: MMU: Move SHADOW_PT_INDEX to mmu.c It is not specific to the paging mode, so can be made global (and reusable). Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ arch/x86/kvm/paging_tmpl.h | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 171bcea1be21..51d4cd7ae4f9 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -135,6 +135,8 @@ module_param(dbg, bool, 0644); #define ACC_USER_MASK PT_USER_MASK #define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) +#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) + struct kvm_rmap_desc { u64 *shadow_ptes[RMAP_EXT]; struct kvm_rmap_desc *more; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 4a814bff21f2..ebb26a09d311 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -29,7 +29,6 @@ #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT64_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level) #define PT_LEVEL_BITS PT64_LEVEL_BITS #ifdef CONFIG_X86_64 @@ -46,7 +45,6 @@ #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK #define PT_INDEX(addr, level) PT32_INDEX(addr, level) - #define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 @@ -504,7 +502,6 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -#undef SHADOW_PT_INDEX #undef PT_LEVEL_MASK #undef PT_DIR_BASE_ADDR_MASK #undef PT_LEVEL_BITS -- cgit v1.2.2 From 6e37d3dc3e358dbf907f8b96a51282966934124b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:14:17 +0300 Subject: KVM: MMU: Unify direct map 4K and large page paths The two paths are equivalent except for one argument, which is already available. Merge the two codepaths. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 51d4cd7ae4f9..3ee856f6812d 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1240,15 +1240,10 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, ASSERT(VALID_PAGE(table_addr)); table = __va(table_addr); - if (level == 1) { + if (level == 1 || (largepage && level == 2)) { mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 0, gfn, pfn, false); - return pt_write; - } - - if (largepage && level == 2) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, 1, gfn, pfn, false); + 0, write, 1, &pt_write, largepage, + gfn, pfn, false); return pt_write; } -- cgit v1.2.2 From 6c41f428b72afe5a581b967590c12538db31d399 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 26 Aug 2008 16:16:08 +0300 Subject: KVM: MMU: Infer shadow root level in direct_map() In all cases the shadow root level is available in mmu.shadow_root_level, so there is no need to pass it as a parameter. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 3ee856f6812d..72f739aa8623 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1227,11 +1227,11 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) } static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn, - int level) + int largepage, gfn_t gfn, pfn_t pfn) { hpa_t table_addr = vcpu->arch.mmu.root_hpa; int pt_write = 0; + int level = vcpu->arch.mmu.shadow_root_level; for (; ; level--) { u32 index = PT64_INDEX(v, level); @@ -1299,8 +1299,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) if (mmu_notifier_retry(vcpu, mmu_seq)) goto out_unlock; kvm_mmu_free_some_pages(vcpu); - r = __direct_map(vcpu, v, write, largepage, gfn, pfn, - PT32E_ROOT_LEVEL); + r = __direct_map(vcpu, v, write, largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); @@ -1455,7 +1454,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, goto out_unlock; kvm_mmu_free_some_pages(vcpu); r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, - largepage, gfn, pfn, kvm_x86_ops->get_tdp_level()); + largepage, gfn, pfn); spin_unlock(&vcpu->kvm->mmu_lock); return r; -- cgit v1.2.2 From 3d000db5688c8beff6319fb9ff4b98dcac32f798 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:24:38 +0300 Subject: KVM: MMU: Add generic shadow walker We currently walk the shadow page tables in two places: direct map (for real mode and two dimensional paging) and paging mode shadow. Since we anticipate requiring a third walk (for invlpg), it makes sense to have a generic facility for shadow walk. This patch adds such a shadow walker, walks the page tables and calls a method for every spte encountered. The method can examine the spte, modify it, or even instantiate it. The walk can be aborted by returning nonzero from the method. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 72f739aa8623..8b95cf748b53 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -142,6 +142,11 @@ struct kvm_rmap_desc { struct kvm_rmap_desc *more; }; +struct kvm_shadow_walk { + int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, + gva_t addr, u64 *spte, int level); +}; + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -935,6 +940,35 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, return sp; } +static int walk_shadow(struct kvm_shadow_walk *walker, + struct kvm_vcpu *vcpu, gva_t addr) +{ + hpa_t shadow_addr; + int level; + int r; + u64 *sptep; + unsigned index; + + shadow_addr = vcpu->arch.mmu.root_hpa; + level = vcpu->arch.mmu.shadow_root_level; + if (level == PT32E_ROOT_LEVEL) { + shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; + shadow_addr &= PT64_BASE_ADDR_MASK; + --level; + } + + while (level >= PT_PAGE_TABLE_LEVEL) { + index = SHADOW_PT_INDEX(addr, level); + sptep = ((u64 *)__va(shadow_addr)) + index; + r = walker->entry(walker, vcpu, addr, sptep, level); + if (r) + return r; + shadow_addr = *sptep & PT64_BASE_ADDR_MASK; + --level; + } + return 0; +} + static void kvm_mmu_page_unlink_children(struct kvm *kvm, struct kvm_mmu_page *sp) { -- cgit v1.2.2 From 140754bc80e1cdbf2d14cdb10d900da1f7718e7b Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:28:04 +0300 Subject: KVM: MMU: Convert direct maps to use the generic shadow walker Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 93 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8b95cf748b53..a1ca4ff9c116 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1260,49 +1260,66 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu) { } -static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, - int largepage, gfn_t gfn, pfn_t pfn) -{ - hpa_t table_addr = vcpu->arch.mmu.root_hpa; - int pt_write = 0; - int level = vcpu->arch.mmu.shadow_root_level; - - for (; ; level--) { - u32 index = PT64_INDEX(v, level); - u64 *table; +struct direct_shadow_walk { + struct kvm_shadow_walk walker; + pfn_t pfn; + int write; + int largepage; + int pt_write; +}; - ASSERT(VALID_PAGE(table_addr)); - table = __va(table_addr); +static int direct_map_entry(struct kvm_shadow_walk *_walk, + struct kvm_vcpu *vcpu, + gva_t addr, u64 *sptep, int level) +{ + struct direct_shadow_walk *walk = + container_of(_walk, struct direct_shadow_walk, walker); + struct kvm_mmu_page *sp; + gfn_t pseudo_gfn; + gfn_t gfn = addr >> PAGE_SHIFT; + + if (level == PT_PAGE_TABLE_LEVEL + || (walk->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, + 0, walk->write, 1, &walk->pt_write, + walk->largepage, gfn, walk->pfn, false); + return 1; + } - if (level == 1 || (largepage && level == 2)) { - mmu_set_spte(vcpu, &table[index], ACC_ALL, ACC_ALL, - 0, write, 1, &pt_write, largepage, - gfn, pfn, false); - return pt_write; + if (*sptep == shadow_trap_nonpresent_pte) { + pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1, + 1, ACC_ALL, sptep); + if (!sp) { + pgprintk("nonpaging_map: ENOMEM\n"); + kvm_release_pfn_clean(walk->pfn); + return -ENOMEM; } - if (table[index] == shadow_trap_nonpresent_pte) { - struct kvm_mmu_page *new_table; - gfn_t pseudo_gfn; - - pseudo_gfn = (v & PT64_DIR_BASE_ADDR_MASK) - >> PAGE_SHIFT; - new_table = kvm_mmu_get_page(vcpu, pseudo_gfn, - v, level - 1, - 1, ACC_ALL, &table[index]); - if (!new_table) { - pgprintk("nonpaging_map: ENOMEM\n"); - kvm_release_pfn_clean(pfn); - return -ENOMEM; - } - - set_shadow_pte(&table[index], - __pa(new_table->spt) - | PT_PRESENT_MASK | PT_WRITABLE_MASK - | shadow_user_mask | shadow_x_mask); - } - table_addr = table[index] & PT64_BASE_ADDR_MASK; + set_shadow_pte(sptep, + __pa(sp->spt) + | PT_PRESENT_MASK | PT_WRITABLE_MASK + | shadow_user_mask | shadow_x_mask); } + return 0; +} + +static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, + int largepage, gfn_t gfn, pfn_t pfn) +{ + int r; + struct direct_shadow_walk walker = { + .walker = { .entry = direct_map_entry, }, + .pfn = pfn, + .largepage = largepage, + .write = write, + .pt_write = 0, + }; + + r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT); + if (r < 0) + return r; + return walker.pt_write; } static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) -- cgit v1.2.2 From abb9e0b8e33e58ac8e04e03b680c46435bc312fd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Fri, 22 Aug 2008 19:11:39 +0300 Subject: KVM: MMU: Convert the paging mode shadow walk to use the generic walker Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 158 ++++++++++++++++++++++++--------------------- 1 file changed, 86 insertions(+), 72 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index ebb26a09d311..b7064e1e1e17 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -25,6 +25,7 @@ #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 + #define shadow_walker shadow_walker64 #define FNAME(name) paging##64_##name #define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT64_DIR_BASE_ADDR_MASK @@ -41,6 +42,7 @@ #elif PTTYPE == 32 #define pt_element_t u32 #define guest_walker guest_walker32 + #define shadow_walker shadow_walker32 #define FNAME(name) paging##32_##name #define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK #define PT_DIR_BASE_ADDR_MASK PT32_DIR_BASE_ADDR_MASK @@ -71,6 +73,17 @@ struct guest_walker { u32 error_code; }; +struct shadow_walker { + struct kvm_shadow_walk walker; + struct guest_walker *guest_walker; + int user_fault; + int write_fault; + int largepage; + int *ptwrite; + pfn_t pfn; + u64 *sptep; +}; + static gfn_t gpte_to_gfn(pt_element_t gpte) { return (gpte & PT_BASE_ADDR_MASK) >> PAGE_SHIFT; @@ -272,86 +285,86 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, /* * Fetch a shadow pte for a specific level in the paging hierarchy. */ -static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, - struct guest_walker *walker, - int user_fault, int write_fault, int largepage, - int *ptwrite, pfn_t pfn) +static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, gva_t addr, + u64 *sptep, int level) { - hpa_t shadow_addr; - int level; - u64 *shadow_ent; - unsigned access = walker->pt_access; - - if (!is_present_pte(walker->ptes[walker->level - 1])) - return NULL; - - shadow_addr = vcpu->arch.mmu.root_hpa; - level = vcpu->arch.mmu.shadow_root_level; - if (level == PT32E_ROOT_LEVEL) { - shadow_addr = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; - shadow_addr &= PT64_BASE_ADDR_MASK; - --level; + struct shadow_walker *sw = + container_of(_sw, struct shadow_walker, walker); + struct guest_walker *gw = sw->guest_walker; + unsigned access = gw->pt_access; + struct kvm_mmu_page *shadow_page; + u64 spte; + int metaphysical; + gfn_t table_gfn; + int r; + pt_element_t curr_pte; + + if (level == PT_PAGE_TABLE_LEVEL + || (sw->largepage && level == PT_DIRECTORY_LEVEL)) { + mmu_set_spte(vcpu, sptep, access, gw->pte_access & access, + sw->user_fault, sw->write_fault, + gw->ptes[gw->level-1] & PT_DIRTY_MASK, + sw->ptwrite, sw->largepage, gw->gfn, sw->pfn, + false); + sw->sptep = sptep; + return 1; } - for (; ; level--) { - u32 index = SHADOW_PT_INDEX(addr, level); - struct kvm_mmu_page *shadow_page; - u64 shadow_pte; - int metaphysical; - gfn_t table_gfn; - - shadow_ent = ((u64 *)__va(shadow_addr)) + index; - if (level == PT_PAGE_TABLE_LEVEL) - break; - - if (largepage && level == PT_DIRECTORY_LEVEL) - break; - - if (is_shadow_present_pte(*shadow_ent) - && !is_large_pte(*shadow_ent)) { - shadow_addr = *shadow_ent & PT64_BASE_ADDR_MASK; - continue; - } + if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) + return 0; - if (is_large_pte(*shadow_ent)) - rmap_remove(vcpu->kvm, shadow_ent); - - if (level - 1 == PT_PAGE_TABLE_LEVEL - && walker->level == PT_DIRECTORY_LEVEL) { - metaphysical = 1; - if (!is_dirty_pte(walker->ptes[level - 1])) - access &= ~ACC_WRITE_MASK; - table_gfn = gpte_to_gfn(walker->ptes[level - 1]); - } else { - metaphysical = 0; - table_gfn = walker->table_gfn[level - 2]; - } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, - metaphysical, access, - shadow_ent); - if (!metaphysical) { - int r; - pt_element_t curr_pte; - r = kvm_read_guest_atomic(vcpu->kvm, - walker->pte_gpa[level - 2], - &curr_pte, sizeof(curr_pte)); - if (r || curr_pte != walker->ptes[level - 2]) { - kvm_release_pfn_clean(pfn); - return NULL; - } + if (is_large_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + + if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { + metaphysical = 1; + if (!is_dirty_pte(gw->ptes[level - 1])) + access &= ~ACC_WRITE_MASK; + table_gfn = gpte_to_gfn(gw->ptes[level - 1]); + } else { + metaphysical = 0; + table_gfn = gw->table_gfn[level - 2]; + } + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + metaphysical, access, sptep); + if (!metaphysical) { + r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], + &curr_pte, sizeof(curr_pte)); + if (r || curr_pte != gw->ptes[level - 2]) { + kvm_release_pfn_clean(sw->pfn); + sw->sptep = NULL; + return 1; } - shadow_addr = __pa(shadow_page->spt); - shadow_pte = shadow_addr | PT_PRESENT_MASK | PT_ACCESSED_MASK - | PT_WRITABLE_MASK | PT_USER_MASK; - set_shadow_pte(shadow_ent, shadow_pte); } - mmu_set_spte(vcpu, shadow_ent, access, walker->pte_access & access, - user_fault, write_fault, - walker->ptes[walker->level-1] & PT_DIRTY_MASK, - ptwrite, largepage, walker->gfn, pfn, false); + spte = __pa(shadow_page->spt) | PT_PRESENT_MASK | PT_ACCESSED_MASK + | PT_WRITABLE_MASK | PT_USER_MASK; + *sptep = spte; + return 0; +} + +static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, + struct guest_walker *guest_walker, + int user_fault, int write_fault, int largepage, + int *ptwrite, pfn_t pfn) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_walk_entry), }, + .guest_walker = guest_walker, + .user_fault = user_fault, + .write_fault = write_fault, + .largepage = largepage, + .ptwrite = ptwrite, + .pfn = pfn, + }; + + if (!is_present_pte(guest_walker->ptes[guest_walker->level - 1])) + return NULL; + + walk_shadow(&walker.walker, vcpu, addr); - return shadow_ent; + return walker.sptep; } /* @@ -499,6 +512,7 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, #undef pt_element_t #undef guest_walker +#undef shadow_walker #undef FNAME #undef PT_BASE_ADDR_MASK #undef PT_INDEX -- cgit v1.2.2 From acee3c04e8208c17aad1baff99baa68d71640a19 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Tue, 26 Aug 2008 17:22:47 +0300 Subject: KVM: Allocate guest memory as MAP_PRIVATE, not MAP_SHARED There is no reason to share internal memory slots with fork()ed instances. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bfc7c332c5d7..675d010995a2 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4296,7 +4296,7 @@ int kvm_arch_set_memory_region(struct kvm *kvm, userspace_addr = do_mmap(NULL, 0, npages * PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, + MAP_PRIVATE | MAP_ANONYMOUS, 0); up_write(¤t->mm->mmap_sem); -- cgit v1.2.2 From a5e2e82b8b62acd24a44b851e6bb4fd0793ead01 Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Wed, 27 Aug 2008 05:02:56 +0300 Subject: KVM: x86 emulator: Add mov r, imm instructions (opcodes 0xb0-0xbf) The emulator only supported one instance of mov r, imm instruction (opcode 0xb8), this adds the rest of these instructions. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index ae30435ad331..66e0bd6c628b 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -154,9 +154,16 @@ static u16 opcode_table[256] = { 0, 0, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | Mov | String, ImplicitOps | Mov | String, ByteOp | ImplicitOps | String, ImplicitOps | String, - /* 0xB0 - 0xBF */ - 0, 0, 0, 0, 0, 0, 0, 0, - DstReg | SrcImm | Mov, 0, 0, 0, 0, 0, 0, 0, + /* 0xB0 - 0xB7 */ + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov, + /* 0xB8 - 0xBF */ + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, + DstReg | SrcImm | Mov, DstReg | SrcImm | Mov, /* 0xC0 - 0xC7 */ ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM, 0, ImplicitOps | Stack, 0, 0, @@ -1660,7 +1667,7 @@ special_insn: case 0xae ... 0xaf: /* scas */ DPRINTF("Urk! I don't handle SCAS.\n"); goto cannot_emulate; - case 0xb8: /* mov r, imm */ + case 0xb0 ... 0xbf: /* mov r, imm */ goto mov; case 0xc0 ... 0xc1: emulate_grp2(ctxt); -- cgit v1.2.2 From bc2d429979451d69d0985c5dbdf908cace2831cc Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:30:56 +0300 Subject: KVM: MMU: Account for npt/ept/realmode page faults Now that two-dimensional paging is becoming common, account for tdp page faults. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a1ca4ff9c116..a24da8f2ee91 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1283,6 +1283,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, mmu_set_spte(vcpu, sptep, ACC_ALL, ACC_ALL, 0, walk->write, 1, &walk->pt_write, walk->largepage, gfn, walk->pfn, false); + ++vcpu->stat.pf_fixed; return 1; } -- cgit v1.2.2 From 2245a28fe2e6fdb1bdabc4dcde1ea3a5c37e2a9e Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:32:24 +0300 Subject: KVM: MMU: Add locking around kvm_mmu_slot_remove_write_access() It was generally safe due to slots_lock being held for write, but it wasn't very nice. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index a24da8f2ee91..5052acdc0a77 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2097,6 +2097,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) { struct kvm_mmu_page *sp; + spin_lock(&kvm->mmu_lock); list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link) { int i; u64 *pt; @@ -2110,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + spin_unlock(&kvm->mmu_lock); } void kvm_mmu_zap_all(struct kvm *kvm) -- cgit v1.2.2 From 171d595d3b3254b9a952af8d1f6965d2e85dcbaa Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 16:40:51 +0300 Subject: KVM: MMU: Flush tlbs after clearing write permission when accessing dirty log Otherwise, the cpu may allow writes to the tracked pages, and we lose some display bits or fail to migrate correctly. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5052acdc0a77..853a2889b202 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2111,6 +2111,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) if (pt[i] & PT_WRITABLE_MASK) pt[i] &= ~PT_WRITABLE_MASK; } + kvm_flush_remote_tlbs(kvm); spin_unlock(&kvm->mmu_lock); } -- cgit v1.2.2 From 3201b5d9f0f7ef392886cd76dcd2c69186d9d5cd Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 27 Aug 2008 20:01:04 +0300 Subject: KVM: MMU: Fix setting the accessed bit on non-speculative sptes The accessed bit was accidentally turned on in a random flag word, rather than, the spte itself, which was lucky, since it used the non-EPT compatible PT_ACCESSED_MASK. Fix by turning the bit on in the spte and changing it to use the portable accessed mask. Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 853a2889b202..866d7133cad8 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1192,7 +1192,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, */ spte = shadow_base_present_pte | shadow_dirty_mask; if (!speculative) - pte_access |= PT_ACCESSED_MASK; + spte |= shadow_accessed_mask; if (!dirty) pte_access &= ~ACC_WRITE_MASK; if (pte_access & ACC_EXEC_MASK) -- cgit v1.2.2 From 48d150394999c542b2e828f03da226842950d46a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 28 Aug 2008 18:27:15 +0300 Subject: KVM: SVM: No need to unprotect memory during event injection when using npt No memory is protected anyway. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index be86c096385b..60228888d1b5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1021,7 +1021,7 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) if (npt_enabled) svm_flush_tlb(&svm->vcpu); - if (event_injection) + if (!npt_enabled && event_injection) kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); } -- cgit v1.2.2 From a89c1ad270ca7ad0eec2667bc754362ce7b142be Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 29 Aug 2008 11:52:07 +0200 Subject: KVM: add MC5_MISC msr read support Currently KVM implements MC0-MC4_MISC read support. When booting Linux this results in KVM warnings in the kernel log when the guest tries to read MC5_MISC. Fix this warnings with this patch. Signed-off-by: Joerg Roedel Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 675d010995a2..e3b89662cf6d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -991,6 +991,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_IA32_MC0_MISC+8: case MSR_IA32_MC0_MISC+12: case MSR_IA32_MC0_MISC+16: + case MSR_IA32_MC0_MISC+20: case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: -- cgit v1.2.2 From fb4616f43148c5b3f3e453a47657572d1bda39ee Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 1 Sep 2008 04:52:24 +0300 Subject: KVM: x86 emulator: Add std and cld instructions (opcodes 0xfc-0xfd) This adds the std and cld instructions to the emulator. Encountered while running the BIOS with invalid guest state emulation enabled. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 66e0bd6c628b..944f1f4d4be4 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -187,7 +187,7 @@ static u16 opcode_table[256] = { ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, /* 0xF8 - 0xFF */ ImplicitOps, 0, ImplicitOps, ImplicitOps, - 0, 0, Group | Group4, Group | Group5, + ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, }; static u16 twobyte_table[256] = { @@ -1762,6 +1762,14 @@ special_insn: ctxt->eflags |= X86_EFLAGS_IF; c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xfc: /* cld */ + ctxt->eflags &= ~EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; + case 0xfd: /* std */ + ctxt->eflags |= EFLG_DF; + c->dst.type = OP_NONE; /* Disable writeback. */ + break; case 0xfe ... 0xff: /* Grp4/Grp5 */ rc = emulate_grp45(ctxt, ops); if (rc != 0) -- cgit v1.2.2 From d40a1ee4859c673677c9811ae84475c4051baca5 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Mon, 1 Sep 2008 19:41:20 +0800 Subject: KVM: MMU: Modify kvm_shadow_walk.entry to accept u64 addr EPT is 4 level by default in 32pae(48 bits), but the addr parameter of kvm_shadow_walk->entry() only accept unsigned long as virtual address, which is 32bit in 32pae. This result in SHADOW_PT_INDEX() overflow when try to fetch level 4 index. Fix it by extend kvm_shadow_walk->entry() to accept 64bit addr in parameter. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++----- arch/x86/kvm/paging_tmpl.h | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 866d7133cad8..bce3e25ec79b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -144,7 +144,7 @@ struct kvm_rmap_desc { struct kvm_shadow_walk { int (*entry)(struct kvm_shadow_walk *walk, struct kvm_vcpu *vcpu, - gva_t addr, u64 *spte, int level); + u64 addr, u64 *spte, int level); }; static struct kmem_cache *pte_chain_cache; @@ -941,7 +941,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, } static int walk_shadow(struct kvm_shadow_walk *walker, - struct kvm_vcpu *vcpu, gva_t addr) + struct kvm_vcpu *vcpu, u64 addr) { hpa_t shadow_addr; int level; @@ -1270,7 +1270,7 @@ struct direct_shadow_walk { static int direct_map_entry(struct kvm_shadow_walk *_walk, struct kvm_vcpu *vcpu, - gva_t addr, u64 *sptep, int level) + u64 addr, u64 *sptep, int level) { struct direct_shadow_walk *walk = container_of(_walk, struct direct_shadow_walk, walker); @@ -1289,7 +1289,7 @@ static int direct_map_entry(struct kvm_shadow_walk *_walk, if (*sptep == shadow_trap_nonpresent_pte) { pseudo_gfn = (addr & PT64_DIR_BASE_ADDR_MASK) >> PAGE_SHIFT; - sp = kvm_mmu_get_page(vcpu, pseudo_gfn, addr, level - 1, + sp = kvm_mmu_get_page(vcpu, pseudo_gfn, (gva_t)addr, level - 1, 1, ACC_ALL, sptep); if (!sp) { pgprintk("nonpaging_map: ENOMEM\n"); @@ -1317,7 +1317,7 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, .pt_write = 0, }; - r = walk_shadow(&walker.walker, vcpu, (gva_t)gfn << PAGE_SHIFT); + r = walk_shadow(&walker.walker, vcpu, gfn << PAGE_SHIFT); if (r < 0) return r; return walker.pt_write; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b7064e1e1e17..b671f61be41e 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -286,7 +286,7 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *page, * Fetch a shadow pte for a specific level in the paging hierarchy. */ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, - struct kvm_vcpu *vcpu, gva_t addr, + struct kvm_vcpu *vcpu, u64 addr, u64 *sptep, int level) { struct shadow_walker *sw = @@ -326,7 +326,7 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, metaphysical = 0; table_gfn = gw->table_gfn[level - 2]; } - shadow_page = kvm_mmu_get_page(vcpu, table_gfn, addr, level-1, + shadow_page = kvm_mmu_get_page(vcpu, table_gfn, (gva_t)addr, level-1, metaphysical, access, sptep); if (!metaphysical) { r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 2], -- cgit v1.2.2 From fa89a81766e33343fa8f0fe0e371819bf94a17a1 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 1 Sep 2008 15:57:51 +0300 Subject: KVM: Add statistics for guest irq injections These can help show whether a guest is making progress or not. Signed-off-by: Avi Kivity --- arch/x86/kvm/svm.c | 1 + arch/x86/kvm/vmx.c | 1 + arch/x86/kvm/x86.c | 1 + 3 files changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 60228888d1b5..9b54550fa4d2 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1519,6 +1519,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq) KVMTRACE_1D(INJ_VIRQ, &svm->vcpu, (u32)irq, handler); + ++svm->vcpu.stat.irq_injections; control = &svm->vmcb->control; control->int_vector = irq; control->int_ctl &= ~V_INTR_PRIO_MASK; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 71e57ae1cab7..e7e8c86f1b7d 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2341,6 +2341,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu, int irq) KVMTRACE_1D(INJ_VIRQ, vcpu, (u32)irq, handler); + ++vcpu->stat.irq_injections; if (vcpu->arch.rmode.active) { vmx->rmode.irq.pending = true; vmx->rmode.irq.vector = irq; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e3b89662cf6d..3f3cb7107c03 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -92,6 +92,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpu_reload", VCPU_STAT(fpu_reload) }, { "insn_emulation", VCPU_STAT(insn_emulation) }, { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) }, + { "irq_injections", VCPU_STAT(irq_injections) }, { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) }, { "mmu_pte_write", VM_STAT(mmu_pte_write) }, { "mmu_pte_updated", VM_STAT(mmu_pte_updated) }, -- cgit v1.2.2 From a6a3034cb979b1fa3948d8e1e91b2387fc66b89b Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Sat, 6 Sep 2008 17:22:29 +0300 Subject: KVM: x86 emulator: Add in/out instructions (opcodes 0xe4-0xe7, 0xec-0xef) The patch adds in/out instructions to the x86 emulator. The instruction was encountered while running the BIOS while using the invalid guest state emulation patch. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 35 +++++++++++++++++++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 944f1f4d4be4..3ac2f1485223 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -177,11 +177,14 @@ static u16 opcode_table[256] = { /* 0xD8 - 0xDF */ 0, 0, 0, 0, 0, 0, 0, 0, /* 0xE0 - 0xE7 */ - 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xE8 - 0xEF */ ImplicitOps | Stack, SrcImm | ImplicitOps, ImplicitOps, SrcImmByte | ImplicitOps, - 0, 0, 0, 0, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, + SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, /* 0xF0 - 0xF7 */ 0, 0, 0, 0, ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, @@ -1259,6 +1262,8 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) u64 msr_data; unsigned long saved_eip = 0; struct decode_cache *c = &ctxt->decode; + unsigned int port; + int io_dir_in; int rc = 0; /* Shadow copy of register state. Committed on successful emulation. @@ -1687,6 +1692,16 @@ special_insn: c->src.val = c->regs[VCPU_REGS_RCX]; emulate_grp2(ctxt); break; + case 0xe4: /* inb */ + case 0xe5: /* in */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 1; + goto do_io; + case 0xe6: /* outb */ + case 0xe7: /* out */ + port = insn_fetch(u8, 1, c->eip); + io_dir_in = 0; + goto do_io; case 0xe8: /* call (near) */ { long int rel; switch (c->op_bytes) { @@ -1737,6 +1752,22 @@ special_insn: jmp_rel(c, c->src.val); c->dst.type = OP_NONE; /* Disable writeback. */ break; + case 0xec: /* in al,dx */ + case 0xed: /* in (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 1; + goto do_io; + case 0xee: /* out al,dx */ + case 0xef: /* out (e/r)ax,dx */ + port = c->regs[VCPU_REGS_RDX]; + io_dir_in = 0; + do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, + (c->d & ByteOp) ? 1 : c->op_bytes, + port) != 0) { + c->eip = saved_eip; + goto cannot_emulate; + } + return 0; case 0xf4: /* hlt */ ctxt->vcpu->arch.halt_request = 1; break; -- cgit v1.2.2 From d76901750ab9f71091d33ef3d2b5909d8a9a4ad4 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Mon, 8 Sep 2008 15:23:48 -0300 Subject: KVM: x86: do not execute halted vcpus Offline or uninitialized vcpu's can be executed if requested to perform userspace work. Follow Avi's suggestion to handle halted vcpu's in the main loop, simplifying kvm_emulate_halt(). Introduce a new vcpu->requests bit to indicate events that promote state from halted to running. Also standardize vcpu wake sites. Signed-off-by: Marcelo Tosatti redhat.com> Signed-off-by: Avi Kivity --- arch/x86/kvm/i8254.c | 5 ++- arch/x86/kvm/lapic.c | 16 +++------ arch/x86/kvm/x86.c | 100 ++++++++++++++++++++++++++++----------------------- 3 files changed, 61 insertions(+), 60 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 4cb443026ec4..634132a9a512 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -200,10 +200,9 @@ static int __pit_timer_fn(struct kvm_kpit_state *ps) if (!atomic_inc_and_test(&pt->pending)) set_bit(KVM_REQ_PENDING_TIMER, &vcpu0->requests); - if (vcpu0 && waitqueue_active(&vcpu0->wq)) { - vcpu0->arch.mp_state = KVM_MP_STATE_RUNNABLE; + + if (vcpu0 && waitqueue_active(&vcpu0->wq)) wake_up_interruptible(&vcpu0->wq); - } pt->timer.expires = ktime_add_ns(pt->timer.expires, pt->period); pt->scheduled = ktime_to_ns(pt->timer.expires); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index be94f93a73f6..fd00f698692f 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -339,13 +339,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, } else apic_clear_vector(vector, apic->regs + APIC_TMR); - if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) - kvm_vcpu_kick(vcpu); - else if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) { - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); - } + kvm_vcpu_kick(vcpu); result = (orig_irr == 0); break; @@ -384,8 +378,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; - if (waitqueue_active(&vcpu->wq)) - wake_up_interruptible(&vcpu->wq); + kvm_vcpu_kick(vcpu); } break; @@ -950,10 +943,9 @@ static int __apic_timer_fn(struct kvm_lapic *apic) if(!atomic_inc_and_test(&apic->timer.pending)) set_bit(KVM_REQ_PENDING_TIMER, &apic->vcpu->requests); - if (waitqueue_active(q)) { - apic->vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + if (waitqueue_active(q)) wake_up_interruptible(q); - } + if (apic_lvtt_period(apic)) { result = 1; apic->timer.dev.expires = ktime_add_ns( diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3f3cb7107c03..bf98d40b21ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2798,11 +2798,6 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu) KVMTRACE_0D(HLT, vcpu, handler); if (irqchip_in_kernel(vcpu->kvm)) { vcpu->arch.mp_state = KVM_MP_STATE_HALTED; - up_read(&vcpu->kvm->slots_lock); - kvm_vcpu_block(vcpu); - down_read(&vcpu->kvm->slots_lock); - if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) - return -EINTR; return 1; } else { vcpu->run->exit_reason = KVM_EXIT_HLT; @@ -3097,24 +3092,10 @@ static void vapic_exit(struct kvm_vcpu *vcpu) up_read(&vcpu->kvm->slots_lock); } -static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; - if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - pr_debug("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); - kvm_lapic_reset(vcpu); - r = kvm_x86_ops->vcpu_reset(vcpu); - if (r) - return r; - vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; - } - - down_read(&vcpu->kvm->slots_lock); - vapic_enter(vcpu); - -again: if (vcpu->requests) if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) kvm_mmu_unload(vcpu); @@ -3151,22 +3132,13 @@ again: local_irq_disable(); - if (vcpu->requests || need_resched()) { + if (vcpu->requests || need_resched() || signal_pending(current)) { local_irq_enable(); preempt_enable(); r = 1; goto out; } - if (signal_pending(current)) { - local_irq_enable(); - preempt_enable(); - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.signal_exits; - goto out; - } - if (vcpu->guest_debug.enabled) kvm_x86_ops->guest_debug_pre(vcpu); @@ -3227,26 +3199,63 @@ again: kvm_lapic_sync_from_vapic(vcpu); r = kvm_x86_ops->handle_exit(kvm_run, vcpu); +out: + return r; +} - if (r > 0) { - if (dm_request_for_irq_injection(vcpu, kvm_run)) { - r = -EINTR; - kvm_run->exit_reason = KVM_EXIT_INTR; - ++vcpu->stat.request_irq_exits; - goto out; - } - if (!need_resched()) - goto again; +static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + int r; + + if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { + printk("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); + kvm_lapic_reset(vcpu); + r = kvm_x86_ops->vcpu_reset(vcpu); + if (r) + return r; + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; } -out: - up_read(&vcpu->kvm->slots_lock); - if (r > 0) { - kvm_resched(vcpu); - down_read(&vcpu->kvm->slots_lock); - goto again; + down_read(&vcpu->kvm->slots_lock); + vapic_enter(vcpu); + + r = 1; + while (r > 0) { + if (kvm_arch_vcpu_runnable(vcpu)) + r = vcpu_enter_guest(vcpu, kvm_run); + else { + up_read(&vcpu->kvm->slots_lock); + kvm_vcpu_block(vcpu); + down_read(&vcpu->kvm->slots_lock); + if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) + if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) + vcpu->arch.mp_state = + KVM_MP_STATE_RUNNABLE; + if (vcpu->arch.mp_state != KVM_MP_STATE_RUNNABLE) + r = -EINTR; + } + + if (r > 0) { + if (dm_request_for_irq_injection(vcpu, kvm_run)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.request_irq_exits; + } + if (signal_pending(current)) { + r = -EINTR; + kvm_run->exit_reason = KVM_EXIT_INTR; + ++vcpu->stat.signal_exits; + } + if (need_resched()) { + up_read(&vcpu->kvm->slots_lock); + kvm_resched(vcpu); + down_read(&vcpu->kvm->slots_lock); + } + } } + up_read(&vcpu->kvm->slots_lock); post_kvm_run_save(vcpu, kvm_run); vapic_exit(vcpu); @@ -3266,6 +3275,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); r = -EAGAIN; goto out; } -- cgit v1.2.2 From d19292e457a7c1b7f6c12bccbfdfd53630de1cee Mon Sep 17 00:00:00 2001 From: Mohammed Gamal Date: Mon, 8 Sep 2008 21:47:19 +0300 Subject: KVM: x86 emulator: Add call near absolute instruction (opcode 0xff/2) Add call near absolute instruction. Signed-off-by: Mohammed Gamal Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 3ac2f1485223..0630d2198763 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -286,7 +286,8 @@ static u16 group_table[] = { ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM, 0, 0, 0, 0, 0, 0, [Group5*8] = - DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, 0, 0, + DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, @@ -1162,6 +1163,14 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, case 1: /* dec */ emulate_1op("dec", c->dst, ctxt->eflags); break; + case 2: /* call near abs */ { + long int old_eip; + old_eip = c->eip; + c->eip = c->src.val; + c->src.val = old_eip; + emulate_push(ctxt); + break; + } case 4: /* jmp abs */ c->eip = c->src.val; break; -- cgit v1.2.2 From 9c3e4aab5ae27bf8589d61c7874e213832b4b7d2 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 10 Sep 2008 16:40:55 -0300 Subject: KVM: x86: unhalt vcpu0 on reset Since "KVM: x86: do not execute halted vcpus", HLT by vcpu0 before system reset by the IO thread will hang the guest. Mark vcpu as runnable in such case. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bf98d40b21ec..2134f3e0a516 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3959,6 +3959,12 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR); kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR); + /* Older userspace won't unhalt the vcpu on reset. */ + if (vcpu->vcpu_id == 0 && kvm_rip_read(vcpu) == 0xfff0 && + sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && + !(vcpu->arch.cr0 & X86_CR0_PE)) + vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; + vcpu_put(vcpu); return 0; -- cgit v1.2.2 From 4b92fe0c9d0376eb105c88d49b77595e8e5b1548 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Thu, 11 Sep 2008 12:58:00 +0200 Subject: KVM: VMX: Cleanup stalled INTR_INFO read Commit 1c0f4f5011829dac96347b5f84ba37c2252e1e08 left a useless access of VM_ENTRY_INTR_INFO_FIELD in vmx_intr_assist behind. Clean this up. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e7e8c86f1b7d..f8e615fc8744 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3135,11 +3135,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx) static void vmx_intr_assist(struct kvm_vcpu *vcpu) { - u32 intr_info_field; - update_tpr_threshold(vcpu); - intr_info_field = vmcs_read32(VM_ENTRY_INTR_INFO_FIELD); if (cpu_has_virtual_nmis()) { if (vcpu->arch.nmi_pending && !vcpu->arch.nmi_injected) { if (vmx_nmi_enabled(vcpu)) { -- cgit v1.2.2 From ef46f18ea010359f7536afd3f56a92c9c83ac09a Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Thu, 11 Sep 2008 19:47:13 +0300 Subject: KVM: x86 emulator: fix jmp r/m64 instruction jmp r/m64 doesn't require the rex.w prefix to indicate the operand size is 64 bits. Set the Stack attribute (even though it doesn't involve the stack, really) to indicate this. Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 0630d2198763..0c120c4c9c0f 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -288,7 +288,7 @@ static u16 group_table[] = { [Group5*8] = DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM, SrcMem | ModRM | Stack, 0, - SrcMem | ModRM, 0, SrcMem | ModRM | Stack, 0, + SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, [Group7*8] = 0, 0, ModRM | SrcMem, ModRM | SrcMem, SrcNone | ModRM | DstMem | Mov, 0, -- cgit v1.2.2 From 9ea542facbd0fd12a53e169953a3fdd6d0364532 Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Sep 2008 15:27:49 +0800 Subject: KVM: VMX: Rename IA32_FEATURE_CONTROL bits Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 18 +++++++++--------- arch/x86/kvm/vmx.h | 4 ++-- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8e615fc8744..046a91b5a4ba 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1041,9 +1041,9 @@ static __init int vmx_disabled_by_bios(void) u64 msr; rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); - return (msr & (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) - == IA32_FEATURE_CONTROL_LOCKED_BIT; + return (msr & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + == FEATURE_CONTROL_LOCKED; /* locked but not enabled */ } @@ -1055,14 +1055,14 @@ static void hardware_enable(void *garbage) INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); rdmsrl(MSR_IA32_FEATURE_CONTROL, old); - if ((old & (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) - != (IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT)) + if ((old & (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) + != (FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED)) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | - IA32_FEATURE_CONTROL_LOCKED_BIT | - IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT); + FEATURE_CONTROL_LOCKED | + FEATURE_CONTROL_VMXON_ENABLED); write_cr4(read_cr4() | X86_CR4_VMXE); /* FIXME: not cpu hotplug safe */ asm volatile (ASM_VMX_VMXON_RAX : : "a"(&phys_addr), "m"(phys_addr) diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 86059f439cb4..44cfab706443 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,8 +331,8 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define IA32_FEATURE_CONTROL_LOCKED_BIT 0x1 -#define IA32_FEATURE_CONTROL_VMXON_ENABLED_BIT 0x4 +#define FEATURE_CONTROL_LOCKED (1<<0) +#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 -- cgit v1.2.2 From defed7ed926eca9f178a632df05baa866abe754e Mon Sep 17 00:00:00 2001 From: Sheng Yang Date: Thu, 11 Sep 2008 15:27:50 +0800 Subject: x86: Move FEATURE_CONTROL bits to msr-index.h For MSR_IA32_FEATURE_CONTROL is already there. Signed-off-by: Sheng Yang Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.h | 3 --- 1 file changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.h b/arch/x86/kvm/vmx.h index 44cfab706443..3e010d21fdd7 100644 --- a/arch/x86/kvm/vmx.h +++ b/arch/x86/kvm/vmx.h @@ -331,9 +331,6 @@ enum vmcs_field { #define AR_RESERVD_MASK 0xfffe0f00 -#define FEATURE_CONTROL_LOCKED (1<<0) -#define FEATURE_CONTROL_VMXON_ENABLED (1<<2) - #define APIC_ACCESS_PAGE_PRIVATE_MEMSLOT 9 #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT 10 -- cgit v1.2.2 From 9c9fddd0e784346a6d0ce82ed20d9ad21f3c8a2c Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:50:25 +0200 Subject: KVM: x86 emulator: Add DstAcc operand type Add DstAcc operand type. That means that there are 4 bits now for DstMask. "In the good old days cpus would have only one register that was able to fully participate in arithmetic operations, typically called A for Accumulator. The x86 retains this tradition by having special, shorter encodings for the A register (like the cmp opcode), and even some instructions that only operate on A (like mul). SrcAcc and DstAcc would accommodate these instructions by decoding A into the corresponding 'struct operand'." -- Avi Kivity Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 50 +++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 0c120c4c9c0f..4390ec8c47a6 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -47,25 +47,26 @@ #define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ #define DstReg (2<<1) /* Register operand. */ #define DstMem (3<<1) /* Memory operand. */ -#define DstMask (3<<1) +#define DstAcc (4<<1) /* Destination Accumulator */ +#define DstMask (7<<1) /* Source operand type. */ -#define SrcNone (0<<3) /* No source operand. */ -#define SrcImplicit (0<<3) /* Source operand is implicit in the opcode. */ -#define SrcReg (1<<3) /* Register operand. */ -#define SrcMem (2<<3) /* Memory operand. */ -#define SrcMem16 (3<<3) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<3) /* Memory operand (32-bit). */ -#define SrcImm (5<<3) /* Immediate operand. */ -#define SrcImmByte (6<<3) /* 8-bit sign-extended immediate operand. */ -#define SrcMask (7<<3) +#define SrcNone (0<<4) /* No source operand. */ +#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ +#define SrcReg (1<<4) /* Register operand. */ +#define SrcMem (2<<4) /* Memory operand. */ +#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ +#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ +#define SrcImm (5<<4) /* Immediate operand. */ +#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ +#define SrcMask (7<<4) /* Generic ModRM decode. */ -#define ModRM (1<<6) +#define ModRM (1<<7) /* Destination is only written; never read. */ -#define Mov (1<<7) -#define BitOp (1<<8) -#define MemAbs (1<<9) /* Memory operand is absolute displacement */ -#define String (1<<10) /* String instruction (rep capable) */ -#define Stack (1<<11) /* Stack instruction (push/pop) */ +#define Mov (1<<8) +#define BitOp (1<<9) +#define MemAbs (1<<10) /* Memory operand is absolute displacement */ +#define String (1<<12) /* String instruction (rep capable) */ +#define Stack (1<<13) /* Stack instruction (push/pop) */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ #define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ #define GroupMask 0xff /* Group number stored in bits 0:7 */ @@ -1060,6 +1061,23 @@ done_prefixes: } c->dst.type = OP_MEM; break; + case DstAcc: + c->dst.type = OP_REG; + c->dst.bytes = c->op_bytes; + c->dst.ptr = &c->regs[VCPU_REGS_RAX]; + switch (c->op_bytes) { + case 1: + c->dst.val = *(u8 *)c->dst.ptr; + break; + case 2: + c->dst.val = *(u16 *)c->dst.ptr; + break; + case 4: + c->dst.val = *(u32 *)c->dst.ptr; + break; + } + c->dst.orig_val = c->dst.val; + break; } if (c->rip_relative) -- cgit v1.2.2 From 8a9fee67fba585b4c4731a749367e1751ebf416c Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:51:15 +0200 Subject: KVM: x86 emulator: Add cmp al, imm and cmp ax, imm instructions (ocodes 3c, 3d) Add decode entries for these opcodes; execution is already implemented. Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 4390ec8c47a6..2b43208a38bc 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -108,7 +108,8 @@ static u16 opcode_table[256] = { /* 0x38 - 0x3F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - 0, 0, 0, 0, + ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, + 0, 0, /* 0x40 - 0x47 */ DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, /* 0x48 - 0x4F */ -- cgit v1.2.2 From aa3a816b6d0bd59e1a9c548cc7d2dd829f26534f Mon Sep 17 00:00:00 2001 From: Guillaume Thouvenin Date: Fri, 12 Sep 2008 13:52:18 +0200 Subject: KVM: x86 emulator: Use DstAcc for 'and' For instruction 'and al,imm' we use DstAcc instead of doing the emulation directly into the instruction's opcode. Signed-off-by: Guillaume Thouvenin Signed-off-by: Avi Kivity --- arch/x86/kvm/x86_emulate.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86_emulate.c b/arch/x86/kvm/x86_emulate.c index 2b43208a38bc..ea051173b0da 100644 --- a/arch/x86/kvm/x86_emulate.c +++ b/arch/x86/kvm/x86_emulate.c @@ -96,7 +96,7 @@ static u16 opcode_table[256] = { /* 0x20 - 0x27 */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, - SrcImmByte, SrcImm, 0, 0, + DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, /* 0x28 - 0x2F */ ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, @@ -1392,27 +1392,10 @@ special_insn: sbb: /* sbb */ emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); break; - case 0x20 ... 0x23: + case 0x20 ... 0x25: and: /* and */ emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); break; - case 0x24: /* and al imm8 */ - c->dst.type = OP_REG; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - c->dst.val = *(u8 *)c->dst.ptr; - c->dst.bytes = 1; - c->dst.orig_val = c->dst.val; - goto and; - case 0x25: /* and ax imm16, or eax imm32 */ - c->dst.type = OP_REG; - c->dst.bytes = c->op_bytes; - c->dst.ptr = &c->regs[VCPU_REGS_RAX]; - if (c->op_bytes == 2) - c->dst.val = *(u16 *)c->dst.ptr; - else - c->dst.val = *(u32 *)c->dst.ptr; - c->dst.orig_val = c->dst.val; - goto and; case 0x28 ... 0x2d: sub: /* sub */ emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); -- cgit v1.2.2 From 62c476c7c7f25a5b245b9902a935636e6316e58c Mon Sep 17 00:00:00 2001 From: Ben-Ami Yassour Date: Sun, 14 Sep 2008 03:48:28 +0300 Subject: KVM: Device Assignment with VT-d Based on a patch by: Kay, Allen M This patch enables PCI device assignment based on VT-d support. When a device is assigned to the guest, the guest memory is pinned and the mapping is updated in the VT-d IOMMU. [Amit: Expose KVM_CAP_IOMMU so we can check if an IOMMU is present and also control enable/disable from userspace] Signed-off-by: Kay, Allen M Signed-off-by: Weidong Han Signed-off-by: Ben-Ami Yassour Signed-off-by: Amit Shah Acked-by: Mark Gross Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 3 + arch/x86/kvm/vtd.c | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 14 ++++ 3 files changed, 215 insertions(+) create mode 100644 arch/x86/kvm/vtd.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index d0e940bb6f40..3072b17447ab 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -12,6 +12,9 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o +ifeq ($(CONFIG_DMAR),y) +kvm-objs += vtd.o +endif obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c new file mode 100644 index 000000000000..667bf3fb64bf --- /dev/null +++ b/arch/x86/kvm/vtd.c @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2006, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 Temple + * Place - Suite 330, Boston, MA 02111-1307 USA. + * + * Copyright (C) 2006-2008 Intel Corporation + * Copyright IBM Corporation, 2008 + * Author: Allen M. Kay + * Author: Weidong Han + * Author: Ben-Ami Yassour + */ + +#include +#include +#include +#include +#include + +static int kvm_iommu_unmap_memslots(struct kvm *kvm); +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages); + +int kvm_iommu_map_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + int i, r; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + r = -EINVAL; + for (i = 0; i < npages; i++) { + /* check if already mapped */ + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + if (pfn && !is_mmio_pfn(pfn)) + continue; + + pfn = gfn_to_pfn(kvm, gfn); + if (!is_mmio_pfn(pfn)) { + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_DEBUG "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); + goto unmap_pages; + } + } else { + printk(KERN_DEBUG "kvm_iommu_map_page:" + "invalid pfn=%lx\n", pfn); + goto unmap_pages; + } + gfn++; + } + return 0; + +unmap_pages: + kvm_iommu_put_pages(kvm, base_gfn, i); + return r; +} + +static int kvm_iommu_map_memslots(struct kvm *kvm) +{ + int i, r; + + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + if (r) + break; + } + up_read(&kvm->slots_lock); + return r; +} + +int kvm_iommu_map_guest(struct kvm *kvm, + struct kvm_assigned_dev_kernel *assigned_dev) +{ + struct pci_dev *pdev = NULL; + int r; + + if (!intel_iommu_found()) { + printk(KERN_ERR "%s: intel iommu not found\n", __func__); + return -ENODEV; + } + + printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", + assigned_dev->host_busnr, + PCI_SLOT(assigned_dev->host_devfn), + PCI_FUNC(assigned_dev->host_devfn)); + + pdev = assigned_dev->dev; + + if (pdev == NULL) { + if (kvm->arch.intel_iommu_domain) { + intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); + kvm->arch.intel_iommu_domain = NULL; + } + return -ENODEV; + } + + kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); + if (!kvm->arch.intel_iommu_domain) + return -ENODEV; + + r = kvm_iommu_map_memslots(kvm); + if (r) + goto out_unmap; + + intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, + pdev->bus->number, pdev->devfn); + + r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, + pdev); + if (r) { + printk(KERN_ERR "Domain context map for %s failed", + pci_name(pdev)); + goto out_unmap; + } + return 0; + +out_unmap: + kvm_iommu_unmap_memslots(kvm); + return r; +} + +static void kvm_iommu_put_pages(struct kvm *kvm, + gfn_t base_gfn, unsigned long npages) +{ + gfn_t gfn = base_gfn; + pfn_t pfn; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + int i; + + for (i = 0; i < npages; i++) { + pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, + gfn_to_gpa(gfn)); + kvm_release_pfn_clean(pfn); + gfn++; + } +} + +static int kvm_iommu_unmap_memslots(struct kvm *kvm) +{ + int i; + down_read(&kvm->slots_lock); + for (i = 0; i < kvm->nmemslots; i++) { + kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, + kvm->memslots[i].npages); + } + up_read(&kvm->slots_lock); + + return 0; +} + +int kvm_iommu_unmap_guest(struct kvm *kvm) +{ + struct kvm_assigned_dev_kernel *entry; + struct dmar_domain *domain = kvm->arch.intel_iommu_domain; + + /* check if iommu exists and in use */ + if (!domain) + return 0; + + list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { + printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", + entry->host_busnr, + PCI_SLOT(entry->host_devfn), + PCI_FUNC(entry->host_devfn)); + + /* detach kvm dmar domain */ + intel_iommu_detach_dev(domain, entry->host_busnr, + entry->host_devfn); + } + kvm_iommu_unmap_memslots(kvm); + intel_iommu_domain_exit(domain); + return 0; +} diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2134f3e0a516..c8a2793626ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -277,9 +278,18 @@ static int kvm_vm_ioctl_assign_device(struct kvm *kvm, list_add(&match->list, &kvm->arch.assigned_dev_head); + if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { + r = kvm_iommu_map_guest(kvm, match); + if (r) + goto out_list_del; + } + out: mutex_unlock(&kvm->lock); return r; +out_list_del: + list_del(&match->list); + pci_release_regions(dev); out_disable: pci_disable_device(dev); out_put: @@ -1147,6 +1157,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_PV_MMU: r = !tdp_enabled; break; + case KVM_CAP_IOMMU: + r = intel_iommu_found(); + break; default: r = 0; break; @@ -4282,6 +4295,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { + kvm_iommu_unmap_guest(kvm); kvm_free_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); -- cgit v1.2.2 From bfadaded0dc323a1cf3f08b5068f12955b54cbaa Mon Sep 17 00:00:00 2001 From: Amit Shah Date: Tue, 16 Sep 2008 18:04:28 +0300 Subject: KVM: Device Assignment: Free device structures if IRQ allocation fails When an IRQ allocation fails, we free up the device structures and disable the device so that we can unregister the device in the userspace and not expose it to the guest at all. Signed-off-by: Amit Shah Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 86 ++++++++++++++++++++++++++++-------------------------- 1 file changed, 45 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c8a2793626ec..61eddbeabeb4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -166,6 +166,43 @@ static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) enable_irq(dev->host_irq); } +static void kvm_free_assigned_device(struct kvm *kvm, + struct kvm_assigned_dev_kernel + *assigned_dev) +{ + if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) + free_irq(assigned_dev->host_irq, (void *)assigned_dev); + + kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); + + if (cancel_work_sync(&assigned_dev->interrupt_work)) + /* We had pending work. That means we will have to take + * care of kvm_put_kvm. + */ + kvm_put_kvm(kvm); + + pci_release_regions(assigned_dev->dev); + pci_disable_device(assigned_dev->dev); + pci_dev_put(assigned_dev->dev); + + list_del(&assigned_dev->list); + kfree(assigned_dev); +} + +static void kvm_free_all_assigned_devices(struct kvm *kvm) +{ + struct list_head *ptr, *ptr2; + struct kvm_assigned_dev_kernel *assigned_dev; + + list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { + assigned_dev = list_entry(ptr, + struct kvm_assigned_dev_kernel, + list); + + kvm_free_assigned_device(kvm, assigned_dev); + } +} + static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, struct kvm_assigned_irq *assigned_irq) @@ -194,8 +231,8 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, if (irqchip_in_kernel(kvm)) { if (!capable(CAP_SYS_RAWIO)) { - return -EPERM; - goto out; + r = -EPERM; + goto out_release; } if (assigned_irq->host_irq) @@ -214,17 +251,18 @@ static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, */ if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, "kvm_assigned_device", (void *)match)) { - printk(KERN_INFO "%s: couldn't allocate irq for pv " - "device\n", __func__); r = -EIO; - goto out; + goto out_release; } } match->irq_requested = true; -out: mutex_unlock(&kvm->lock); return r; +out_release: + mutex_unlock(&kvm->lock); + kvm_free_assigned_device(kvm, match); + return r; } static int kvm_vm_ioctl_assign_device(struct kvm *kvm, @@ -300,40 +338,6 @@ out_free: return r; } -static void kvm_free_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) { - free_irq(assigned_dev->host_irq, - (void *)assigned_dev); - - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev-> - ack_notifier); - } - - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); - } -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -4296,7 +4300,7 @@ static void kvm_free_vcpus(struct kvm *kvm) void kvm_arch_destroy_vm(struct kvm *kvm) { kvm_iommu_unmap_guest(kvm); - kvm_free_assigned_devices(kvm); + kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); kfree(kvm->arch.vpic); kfree(kvm->arch.vioapic); -- cgit v1.2.2 From 4c2155ce81c193788082d4b8cdbc26d79edebc58 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 16 Sep 2008 20:54:47 -0300 Subject: KVM: switch to get_user_pages_fast Convert gfn_to_pfn to use get_user_pages_fast, which can do lockless pagetable lookups on x86. Kernel compilation on 4-way guest is 3.7% faster on VMX. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 23 +++++++++-------------- arch/x86/kvm/paging_tmpl.h | 8 +------- arch/x86/kvm/vmx.c | 4 ---- arch/x86/kvm/x86.c | 6 ------ 4 files changed, 10 insertions(+), 31 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bce3e25ec79b..5779a2323e23 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -405,16 +405,19 @@ static int host_largepage_backed(struct kvm *kvm, gfn_t gfn) { struct vm_area_struct *vma; unsigned long addr; + int ret = 0; addr = gfn_to_hva(kvm, gfn); if (kvm_is_error_hva(addr)) - return 0; + return ret; + down_read(¤t->mm->mmap_sem); vma = find_vma(current->mm, addr); if (vma && is_vm_hugetlb_page(vma)) - return 1; + ret = 1; + up_read(¤t->mm->mmap_sem); - return 0; + return ret; } static int is_largepage_backed(struct kvm_vcpu *vcpu, gfn_t large_gfn) @@ -1140,9 +1143,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) if (gpa == UNMAPPED_GVA) return NULL; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); return page; } @@ -1330,16 +1331,14 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) pfn_t pfn; unsigned long mmu_seq; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { @@ -1488,15 +1487,13 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, if (r) return r; - down_read(¤t->mm->mmap_sem); if (is_largepage_backed(vcpu, gfn & ~(KVM_PAGES_PER_HPAGE-1))) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); largepage = 1; } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); return 1; @@ -1809,15 +1806,13 @@ static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, return; gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT; - down_read(¤t->mm->mmap_sem); if (is_large_pte(gpte) && is_largepage_backed(vcpu, gfn)) { gfn &= ~(KVM_PAGES_PER_HPAGE-1); vcpu->arch.update_pte.largepage = 1; } vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, gfn); - up_read(¤t->mm->mmap_sem); if (is_error_pfn(pfn)) { kvm_release_pfn_clean(pfn); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b671f61be41e..6dd08e096e24 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -102,14 +102,10 @@ static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, pt_element_t *table; struct page *page; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(kvm, table_gfn); - up_read(¤t->mm->mmap_sem); table = kmap_atomic(page, KM_USER0); - ret = CMPXCHG(&table[index], orig_pte, new_pte); - kunmap_atomic(table, KM_USER0); kvm_release_page_dirty(page); @@ -418,7 +414,6 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, return 0; } - down_read(¤t->mm->mmap_sem); if (walker.level == PT_DIRECTORY_LEVEL) { gfn_t large_gfn; large_gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE-1); @@ -428,9 +423,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, } } mmu_seq = vcpu->kvm->mmu_notifier_seq; - /* implicit mb(), we'll read before PT lock is unlocked */ + smp_rmb(); pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); - up_read(¤t->mm->mmap_sem); /* mmio */ if (is_error_pfn(pfn)) { diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 046a91b5a4ba..025bf4011abc 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2010,9 +2010,7 @@ static int alloc_apic_access_page(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; @@ -2034,10 +2032,8 @@ static int alloc_identity_pagetable(struct kvm *kvm) if (r) goto out; - down_read(¤t->mm->mmap_sem); kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, VMX_EPT_IDENTITY_PAGETABLE_ADDR >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); out: up_write(&kvm->slots_lock); return r; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 61eddbeabeb4..108f07267e87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -946,10 +946,8 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) /* ...but clean it before doing the actual write */ vcpu->arch.time_offset = data & ~(PAGE_MASK | 1); - down_read(¤t->mm->mmap_sem); vcpu->arch.time_page = gfn_to_page(vcpu->kvm, data >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); if (is_error_page(vcpu->arch.time_page)) { kvm_release_page_clean(vcpu->arch.time_page); @@ -2322,9 +2320,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, val = *(u64 *)new; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, gpa >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); kaddr = kmap_atomic(page, KM_USER0); set_64bit((u64 *)(kaddr + offset_in_page(gpa)), val); @@ -3089,9 +3085,7 @@ static void vapic_enter(struct kvm_vcpu *vcpu) if (!apic || !apic->vapic_addr) return; - down_read(¤t->mm->mmap_sem); page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); - up_read(¤t->mm->mmap_sem); vcpu->arch.apic->vapic_page = page; } -- cgit v1.2.2 From 2259e3a7a6089007839cd4bbf7c9867190c67238 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Fri, 22 Aug 2008 13:29:17 -0700 Subject: KVM: x86.c make kvm_load_realmode_segment static Noticed by sparse: arch/x86/kvm/x86.c:3591:5: warning: symbol 'kvm_load_realmode_segment' was not declared. Should it be static? Signed-off-by: Harvey Harrison Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 108f07267e87..1b738cb02831 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3611,7 +3611,7 @@ static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu, return 0; } -int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) +static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) { struct kvm_segment segvar = { .base = selector << 4, -- cgit v1.2.2 From af2152f5457448bd90cb019c108e0a85e716fdbe Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Mon, 22 Sep 2008 14:28:53 +0300 Subject: KVM: don't enter guest after SIPI was received by a CPU The vcpu should process pending SIPI message before entering guest mode again. kvm_arch_vcpu_runnable() returns true if the vcpu is in SIPI state, so we can't call it here. Signed-off-by: Gleb Natapov Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1b738cb02831..08edeabf15e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3233,7 +3233,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) r = 1; while (r > 0) { - if (kvm_arch_vcpu_runnable(vcpu)) + if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) r = vcpu_enter_guest(vcpu, kvm_run); else { up_read(&vcpu->kvm->slots_lock); -- cgit v1.2.2 From a08546001c2b0f584ffc81987340943a7d6d6acb Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Tue, 23 Sep 2008 11:01:45 -0700 Subject: x86: pvclock: fix shadowed variable warning arch/x86/kernel/pvclock.c:102:6: warning: symbol 'tsc_khz' shadows an earlier one include/asm/tsc.h:18:21: originally declared here Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Avi Kivity --- arch/x86/kernel/pvclock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 1c54b5fb7aed..4f9c55f3a7c0 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -99,14 +99,14 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) { - u64 tsc_khz = 1000000ULL << 32; + u64 pv_tsc_khz = 1000000ULL << 32; - do_div(tsc_khz, src->tsc_to_system_mul); + do_div(pv_tsc_khz, src->tsc_to_system_mul); if (src->tsc_shift < 0) - tsc_khz <<= -src->tsc_shift; + pv_tsc_khz <<= -src->tsc_shift; else - tsc_khz >>= src->tsc_shift; - return tsc_khz; + pv_tsc_khz >>= src->tsc_shift; + return pv_tsc_khz; } cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) -- cgit v1.2.2 From 93a423e7045cf3cf69f960ff307edda1afcd7b41 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:29 -0300 Subject: KVM: MMU: flush remote TLBs on large->normal entry overwrite It is necessary to flush all TLB's when a large spte entry is overwritten with a normal page directory pointer. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/paging_tmpl.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 6dd08e096e24..e9fbaa44d444 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -310,8 +310,11 @@ static int FNAME(shadow_walk_entry)(struct kvm_shadow_walk *_sw, if (is_shadow_present_pte(*sptep) && !is_large_pte(*sptep)) return 0; - if (is_large_pte(*sptep)) + if (is_large_pte(*sptep)) { + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + kvm_flush_remote_tlbs(vcpu->kvm); rmap_remove(vcpu->kvm, sptep); + } if (level == PT_DIRECTORY_LEVEL && gw->level == PT_DIRECTORY_LEVEL) { metaphysical = 1; -- cgit v1.2.2 From 1e73f9dd885957bf0c7bb5e63b350d5aeb06b726 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:30 -0300 Subject: KVM: MMU: split mmu_set_spte Split the spte entry creation code into a new set_spte function. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 101 ++++++++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 44 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5779a2323e23..9ad4cc553893 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1148,44 +1148,13 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } -static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, - unsigned pt_access, unsigned pte_access, - int user_fault, int write_fault, int dirty, - int *ptwrite, int largepage, gfn_t gfn, - pfn_t pfn, bool speculative) +static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pte_access, int user_fault, + int write_fault, int dirty, int largepage, + gfn_t gfn, pfn_t pfn, bool speculative) { u64 spte; - int was_rmapped = 0; - int was_writeble = is_writeble_pte(*shadow_pte); - - pgprintk("%s: spte %llx access %x write_fault %d" - " user_fault %d gfn %lx\n", - __func__, *shadow_pte, pt_access, - write_fault, user_fault, gfn); - - if (is_rmap_pte(*shadow_pte)) { - /* - * If we overwrite a PTE page pointer with a 2MB PMD, unlink - * the parent of the now unreachable PTE. - */ - if (largepage && !is_large_pte(*shadow_pte)) { - struct kvm_mmu_page *child; - u64 pte = *shadow_pte; - - child = page_header(pte & PT64_BASE_ADDR_MASK); - mmu_page_remove_parent_pte(child, shadow_pte); - } else if (pfn != spte_to_pfn(*shadow_pte)) { - pgprintk("hfn old %lx new %lx\n", - spte_to_pfn(*shadow_pte), pfn); - rmap_remove(vcpu->kvm, shadow_pte); - } else { - if (largepage) - was_rmapped = is_large_pte(*shadow_pte); - else - was_rmapped = 1; - } - } - + int ret = 0; /* * We don't set the accessed bit, since we sometimes want to see * whether the guest actually used the pte (in order to detect @@ -1218,26 +1187,70 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); + ret = 1; pte_access &= ~ACC_WRITE_MASK; if (is_writeble_pte(spte)) { spte &= ~PT_WRITABLE_MASK; kvm_x86_ops->tlb_flush(vcpu); } - if (write_fault) - *ptwrite = 1; } } if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); - pgprintk("%s: setting spte %llx\n", __func__, spte); - pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", - (spte&PT_PAGE_SIZE_MASK)? "2MB" : "4kB", - (spte&PT_WRITABLE_MASK)?"RW":"R", gfn, spte, shadow_pte); set_shadow_pte(shadow_pte, spte); - if (!was_rmapped && (spte & PT_PAGE_SIZE_MASK) - && (spte & PT_PRESENT_MASK)) + return ret; +} + + +static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, + unsigned pt_access, unsigned pte_access, + int user_fault, int write_fault, int dirty, + int *ptwrite, int largepage, gfn_t gfn, + pfn_t pfn, bool speculative) +{ + int was_rmapped = 0; + int was_writeble = is_writeble_pte(*shadow_pte); + + pgprintk("%s: spte %llx access %x write_fault %d" + " user_fault %d gfn %lx\n", + __func__, *shadow_pte, pt_access, + write_fault, user_fault, gfn); + + if (is_rmap_pte(*shadow_pte)) { + /* + * If we overwrite a PTE page pointer with a 2MB PMD, unlink + * the parent of the now unreachable PTE. + */ + if (largepage && !is_large_pte(*shadow_pte)) { + struct kvm_mmu_page *child; + u64 pte = *shadow_pte; + + child = page_header(pte & PT64_BASE_ADDR_MASK); + mmu_page_remove_parent_pte(child, shadow_pte); + } else if (pfn != spte_to_pfn(*shadow_pte)) { + pgprintk("hfn old %lx new %lx\n", + spte_to_pfn(*shadow_pte), pfn); + rmap_remove(vcpu->kvm, shadow_pte); + } else { + if (largepage) + was_rmapped = is_large_pte(*shadow_pte); + else + was_rmapped = 1; + } + } + if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, + dirty, largepage, gfn, pfn, speculative)) + if (write_fault) + *ptwrite = 1; + + pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); + pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", + is_large_pte(*shadow_pte)? "2MB" : "4kB", + is_present_pte(*shadow_pte)?"RW":"R", gfn, + *shadow_pte, shadow_pte); + if (!was_rmapped && is_large_pte(*shadow_pte)) ++vcpu->kvm->stat.lpages; page_header_update_slot(vcpu->kvm, shadow_pte, gfn); -- cgit v1.2.2 From a378b4e64c0fef2d9e53214db167878b7673a7a3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:31 -0300 Subject: KVM: MMU: move local TLB flush to mmu_set_spte Since the sync page path can collapse flushes. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9ad4cc553893..23752ef0839c 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1189,10 +1189,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, __func__, gfn); ret = 1; pte_access &= ~ACC_WRITE_MASK; - if (is_writeble_pte(spte)) { + if (is_writeble_pte(spte)) spte &= ~PT_WRITABLE_MASK; - kvm_x86_ops->tlb_flush(vcpu); - } } } @@ -1241,9 +1239,11 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative)) + dirty, largepage, gfn, pfn, speculative)) { if (write_fault) *ptwrite = 1; + kvm_x86_ops->tlb_flush(vcpu); + } pgprintk("%s: setting spte %llx\n", __func__, *shadow_pte); pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", -- cgit v1.2.2 From 38187c830cab84daecb41169948467f1f19317e3 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:32 -0300 Subject: KVM: MMU: do not write-protect large mappings There is not much point in write protecting large mappings. This can only happen when a page is shadowed during the window between is_largepage_backed and mmu_lock acquision. Zap the entry instead, so the next pagefault will find a shadowed page via is_largepage_backed and fallback to 4k translations. Simplifies out of sync shadow. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 23752ef0839c..731e6fe9cb07 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1180,11 +1180,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, || (write_fault && !is_write_protection(vcpu) && !user_fault)) { struct kvm_mmu_page *shadow; + if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { + ret = 1; + spte = shadow_trap_nonpresent_pte; + goto set_pte; + } + spte |= PT_WRITABLE_MASK; shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow || - (largepage && has_wrprotected_page(vcpu->kvm, gfn))) { + if (shadow) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); ret = 1; @@ -1197,6 +1202,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if (pte_access & ACC_WRITE_MASK) mark_page_dirty(vcpu->kvm, gfn); +set_pte: set_shadow_pte(shadow_pte, spte); return ret; } -- cgit v1.2.2 From e8bc217aef67d41d767ede6e7a7eb10f1d47c86c Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:33 -0300 Subject: KVM: MMU: mode specific sync_page Examine guest pagetable and bring the shadow back in sync. Caller is responsible for local TLB flush before re-entering guest mode. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 10 +++++++++ arch/x86/kvm/paging_tmpl.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 731e6fe9cb07..90f01169c8f0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -871,6 +871,12 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, sp->spt[i] = shadow_trap_nonpresent_pte; } +static int nonpaging_sync_page(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + return 1; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -1547,6 +1553,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = nonpaging_gva_to_gpa; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1594,6 +1601,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->page_fault = paging64_page_fault; context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; + context->sync_page = paging64_sync_page; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1615,6 +1623,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->gva_to_gpa = paging32_gva_to_gpa; context->free = paging_free; context->prefetch_page = paging32_prefetch_page; + context->sync_page = paging32_sync_page; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1634,6 +1643,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->page_fault = tdp_page_fault; context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; + context->sync_page = nonpaging_sync_page; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index e9fbaa44d444..776fb6d2fd81 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -507,6 +507,60 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, } } +/* + * Using the cached information from sp->gfns is safe because: + * - The spte has a reference to the struct page, so the pfn for a given gfn + * can't change unless all sptes pointing to it are nuked first. + * - Alias changes zap the entire shadow cache. + */ +static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + int i, offset, nr_present; + + offset = nr_present = 0; + + if (PTTYPE == 32) + offset = sp->role.quadrant << PT64_LEVEL_BITS; + + for (i = 0; i < PT64_ENT_PER_PAGE; i++) { + unsigned pte_access; + pt_element_t gpte; + gpa_t pte_gpa; + gfn_t gfn = sp->gfns[i]; + + if (!is_shadow_present_pte(sp->spt[i])) + continue; + + pte_gpa = gfn_to_gpa(sp->gfn); + pte_gpa += (i+offset) * sizeof(pt_element_t); + + if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte, + sizeof(pt_element_t))) + return -EINVAL; + + if (gpte_to_gfn(gpte) != gfn || !is_present_pte(gpte) || + !(gpte & PT_ACCESSED_MASK)) { + u64 nonpresent; + + rmap_remove(vcpu->kvm, &sp->spt[i]); + if (is_present_pte(gpte)) + nonpresent = shadow_trap_nonpresent_pte; + else + nonpresent = shadow_notrap_nonpresent_pte; + set_shadow_pte(&sp->spt[i], nonpresent); + continue; + } + + nr_present++; + pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); + set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, + is_dirty_pte(gpte), 0, gfn, + spte_to_pfn(sp->spt[i]), true); + } + + return !nr_present; +} + #undef pt_element_t #undef guest_walker #undef shadow_walker -- cgit v1.2.2 From 0ba73cdadb8ac172f396df7e23c4a9cebd59b550 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:34 -0300 Subject: KVM: MMU: sync roots on mmu reload Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kvm/x86.c | 1 + 2 files changed, 37 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 90f01169c8f0..9d8c4bb68a81 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1471,6 +1471,41 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ +} + +static void mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + int i; + struct kvm_mmu_page *sp; + + if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) + return; + if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { + hpa_t root = vcpu->arch.mmu.root_hpa; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + return; + } + for (i = 0; i < 4; ++i) { + hpa_t root = vcpu->arch.mmu.pae_root[i]; + + if (root) { + root &= PT64_BASE_ADDR_MASK; + sp = page_header(root); + mmu_sync_children(vcpu, sp); + } + } +} + +void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->kvm->mmu_lock); + mmu_sync_roots(vcpu); + spin_unlock(&vcpu->kvm->mmu_lock); +} + static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) { return vaddr; @@ -1715,6 +1750,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu) spin_lock(&vcpu->kvm->mmu_lock); kvm_mmu_free_some_pages(vcpu); mmu_alloc_roots(vcpu); + mmu_sync_roots(vcpu); spin_unlock(&vcpu->kvm->mmu_lock); kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); kvm_mmu_flush_tlb(vcpu); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 08edeabf15e6..88e6d9abbd2b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -594,6 +594,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4); void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) { if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { + kvm_mmu_sync_roots(vcpu); kvm_mmu_flush_tlb(vcpu); return; } -- cgit v1.2.2 From a7052897b3bcd568a9f5bfaa558957039e7e7ec0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:35 -0300 Subject: KVM: x86: trap invlpg With pages out of sync invlpg needs to be trapped. For now simply nuke the entry. Untested on AMD. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 18 ++++++++++++++++++ arch/x86/kvm/paging_tmpl.h | 25 +++++++++++++++++++++++++ arch/x86/kvm/svm.c | 13 +++++++++++-- arch/x86/kvm/vmx.c | 19 ++++++++++++++++--- arch/x86/kvm/x86.c | 1 + 5 files changed, 71 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 9d8c4bb68a81..e89af1df4fcd 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -877,6 +877,10 @@ static int nonpaging_sync_page(struct kvm_vcpu *vcpu, return 1; } +static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -1589,6 +1593,7 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->root_level = 0; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1637,6 +1642,7 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) context->gva_to_gpa = paging64_gva_to_gpa; context->prefetch_page = paging64_prefetch_page; context->sync_page = paging64_sync_page; + context->invlpg = paging64_invlpg; context->free = paging_free; context->root_level = level; context->shadow_root_level = level; @@ -1659,6 +1665,7 @@ static int paging32_init_context(struct kvm_vcpu *vcpu) context->free = paging_free; context->prefetch_page = paging32_prefetch_page; context->sync_page = paging32_sync_page; + context->invlpg = paging32_invlpg; context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; @@ -1679,6 +1686,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->free = nonpaging_free; context->prefetch_page = nonpaging_prefetch_page; context->sync_page = nonpaging_sync_page; + context->invlpg = nonpaging_invlpg; context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->root_hpa = INVALID_PAGE; @@ -2071,6 +2079,16 @@ out: } EXPORT_SYMBOL_GPL(kvm_mmu_page_fault); +void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva) +{ + spin_lock(&vcpu->kvm->mmu_lock); + vcpu->arch.mmu.invlpg(vcpu, gva); + spin_unlock(&vcpu->kvm->mmu_lock); + kvm_mmu_flush_tlb(vcpu); + ++vcpu->stat.invlpg; +} +EXPORT_SYMBOL_GPL(kvm_mmu_invlpg); + void kvm_enable_tdp(void) { tdp_enabled = true; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 776fb6d2fd81..dc169e8148b1 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -461,6 +461,31 @@ out_unlock: return 0; } +static int FNAME(shadow_invlpg_entry)(struct kvm_shadow_walk *_sw, + struct kvm_vcpu *vcpu, u64 addr, + u64 *sptep, int level) +{ + + if (level == PT_PAGE_TABLE_LEVEL) { + if (is_shadow_present_pte(*sptep)) + rmap_remove(vcpu->kvm, sptep); + set_shadow_pte(sptep, shadow_trap_nonpresent_pte); + return 1; + } + if (!is_shadow_present_pte(*sptep)) + return 1; + return 0; +} + +static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) +{ + struct shadow_walker walker = { + .walker = { .entry = FNAME(shadow_invlpg_entry), }, + }; + + walk_shadow(&walker.walker, vcpu, gva); +} + static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) { struct guest_walker walker; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 9b54550fa4d2..9c4ce657d963 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -525,6 +525,7 @@ static void init_vmcb(struct vcpu_svm *svm) (1ULL << INTERCEPT_CPUID) | (1ULL << INTERCEPT_INVD) | (1ULL << INTERCEPT_HLT) | + (1ULL << INTERCEPT_INVLPG) | (1ULL << INTERCEPT_INVLPGA) | (1ULL << INTERCEPT_IOIO_PROT) | (1ULL << INTERCEPT_MSR_PROT) | @@ -589,7 +590,8 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - control->intercept &= ~(1ULL << INTERCEPT_TASK_SWITCH); + control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | + (1ULL << INTERCEPT_INVLPG)); control->intercept_exceptions &= ~(1 << PF_VECTOR); control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| INTERCEPT_CR3_MASK); @@ -1164,6 +1166,13 @@ static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) return 1; } +static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) +{ + if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) + pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); + return 1; +} + static int emulate_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) { @@ -1417,7 +1426,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm, [SVM_EXIT_CPUID] = cpuid_interception, [SVM_EXIT_INVD] = emulate_on_interception, [SVM_EXIT_HLT] = halt_interception, - [SVM_EXIT_INVLPG] = emulate_on_interception, + [SVM_EXIT_INVLPG] = invlpg_interception, [SVM_EXIT_INVLPGA] = invalid_op_interception, [SVM_EXIT_IOIO] = io_interception, [SVM_EXIT_MSR] = msr_interception, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 025bf4011abc..4556cc3715bb 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1130,7 +1130,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_CR3_STORE_EXITING | CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | - CPU_BASED_USE_TSC_OFFSETING; + CPU_BASED_USE_TSC_OFFSETING | + CPU_BASED_INVLPG_EXITING; opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -1159,9 +1160,11 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) _cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW; #endif if (_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) { - /* CR3 accesses don't need to cause VM Exits when EPT enabled */ + /* CR3 accesses and invlpg don't need to cause VM Exits when EPT + enabled */ min &= ~(CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); + CPU_BASED_CR3_STORE_EXITING | + CPU_BASED_INVLPG_EXITING); if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PROCBASED_CTLS, &_cpu_based_exec_control) < 0) return -EIO; @@ -2790,6 +2793,15 @@ static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) return 1; } +static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) +{ + u64 exit_qualification = vmcs_read64(EXIT_QUALIFICATION); + + kvm_mmu_invlpg(vcpu, exit_qualification); + skip_emulated_instruction(vcpu); + return 1; +} + static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { skip_emulated_instruction(vcpu); @@ -2958,6 +2970,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, [EXIT_REASON_MSR_WRITE] = handle_wrmsr, [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, [EXIT_REASON_HLT] = handle_halt, + [EXIT_REASON_INVLPG] = handle_invlpg, [EXIT_REASON_VMCALL] = handle_vmcall, [EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold, [EXIT_REASON_APIC_ACCESS] = handle_apic_access, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 88e6d9abbd2b..efee85ba07e5 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2341,6 +2341,7 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) { + kvm_mmu_invlpg(vcpu, address); return X86EMUL_CONTINUE; } -- cgit v1.2.2 From ad8cfbe3fffdc09704f0808fde3934855620d545 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:36 -0300 Subject: KVM: MMU: mmu_parent_walk Introduce a function to walk all parents of a given page, invoking a handler. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index e89af1df4fcd..b82abee78f17 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,8 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); + static struct kmem_cache *pte_chain_cache; static struct kmem_cache *rmap_desc_cache; static struct kmem_cache *mmu_page_header_cache; @@ -862,6 +864,31 @@ static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp, BUG(); } + +static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, + mmu_parent_walk_fn fn) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + struct kvm_mmu_page *parent_sp; + int i; + + if (!sp->multimapped && sp->parent_pte) { + parent_sp = page_header(__pa(sp->parent_pte)); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + return; + } + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + parent_sp = page_header(__pa(pte_chain->parent_ptes[i])); + fn(vcpu, parent_sp); + mmu_parent_walk(vcpu, parent_sp, fn); + } +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { -- cgit v1.2.2 From 0738541396be165995c7f2387746eb0b47024fec Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:37 -0300 Subject: KVM: MMU: awareness of new kvm_mmu_zap_page behaviour kvm_mmu_zap_page will soon zap the unsynced children of a page. Restart list walk in such case. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index b82abee78f17..c9b4b902527b 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1078,7 +1078,7 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } -static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { ++kvm->stat.mmu_shadow_zapped; kvm_mmu_page_unlink_children(kvm, sp); @@ -1095,6 +1095,7 @@ static void kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); + return 0; } /* @@ -1147,8 +1148,9 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) if (sp->gfn == gfn && !sp->role.metaphysical) { pgprintk("%s: gfn %lx role %x\n", __func__, gfn, sp->role.word); - kvm_mmu_zap_page(kvm, sp); r = 1; + if (kvm_mmu_zap_page(kvm, sp)) + n = bucket->first; } return r; } @@ -1992,7 +1994,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, */ pgprintk("misaligned: gpa %llx bytes %d role %x\n", gpa, bytes, sp->role.word); - kvm_mmu_zap_page(vcpu->kvm, sp); + if (kvm_mmu_zap_page(vcpu->kvm, sp)) + n = bucket->first; ++vcpu->kvm->stat.mmu_flooded; continue; } @@ -2226,7 +2229,9 @@ void kvm_mmu_zap_all(struct kvm *kvm) spin_lock(&kvm->mmu_lock); list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) - kvm_mmu_zap_page(kvm, sp); + if (kvm_mmu_zap_page(kvm, sp)) + node = container_of(kvm->arch.active_mmu_pages.next, + struct kvm_mmu_page, link); spin_unlock(&kvm->mmu_lock); kvm_flush_remote_tlbs(kvm); -- cgit v1.2.2 From 6844dec6948679d084f054235fee19ba4e3a3096 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:38 -0300 Subject: KVM: MMU: mmu_convert_notrap helper Need to convert shadow_notrap_nonpresent -> shadow_trap_nonpresent when unsyncing pages. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index c9b4b902527b..57c7580e7f98 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1173,6 +1173,20 @@ static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) __set_bit(slot, &sp->slot_bitmap); } +static void mmu_convert_notrap(struct kvm_mmu_page *sp) +{ + int i; + u64 *pt = sp->spt; + + if (shadow_trap_nonpresent_pte == shadow_notrap_nonpresent_pte) + return; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + if (pt[i] == shadow_notrap_nonpresent_pte) + set_shadow_pte(&pt[i], shadow_trap_nonpresent_pte); + } +} + struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) { struct page *page; -- cgit v1.2.2 From 4731d4c7a07769cf2926c327177b97bb8c68cafc Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:39 -0300 Subject: KVM: MMU: out of sync shadow core Allow guest pagetables to go out of sync. Instead of emulating write accesses to guest pagetables, or unshadowing them, we un-write-protect the page table and allow the guest to modify it at will. We rely on invlpg executions to synchronize individual ptes, and will synchronize the entire pagetable on tlb flushes. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 210 +++++++++++++++++++++++++++++++++++++++++---- arch/x86/kvm/paging_tmpl.h | 2 +- arch/x86/kvm/x86.c | 3 + 3 files changed, 197 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 57c7580e7f98..d88659ae7778 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -147,6 +147,10 @@ struct kvm_shadow_walk { u64 addr, u64 *spte, int level); }; +struct kvm_unsync_walk { + int (*entry) (struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk); +}; + typedef int (*mmu_parent_walk_fn) (struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp); static struct kmem_cache *pte_chain_cache; @@ -654,8 +658,6 @@ static void rmap_write_protect(struct kvm *kvm, u64 gfn) if (write_protected) kvm_flush_remote_tlbs(kvm); - - account_shadowed(kvm, gfn); } static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp) @@ -908,6 +910,41 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +static int mmu_unsync_walk(struct kvm_mmu_page *sp, + struct kvm_unsync_walk *walker) +{ + int i, ret; + + if (!sp->unsync_children) + return 0; + + for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + u64 ent = sp->spt[i]; + + if (is_shadow_present_pte(ent)) { + struct kvm_mmu_page *child; + child = page_header(ent & PT64_BASE_ADDR_MASK); + + if (child->unsync_children) { + ret = mmu_unsync_walk(child, walker); + if (ret) + return ret; + } + + if (child->unsync) { + ret = walker->entry(child, walker); + if (ret) + return ret; + } + } + } + + if (i == PT64_ENT_PER_PAGE) + sp->unsync_children = 0; + + return 0; +} + static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) { unsigned index; @@ -928,6 +965,59 @@ static struct kvm_mmu_page *kvm_mmu_lookup_page(struct kvm *kvm, gfn_t gfn) return NULL; } +static void kvm_unlink_unsync_page(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + WARN_ON(!sp->unsync); + sp->unsync = 0; + --kvm->stat.mmu_unsync; +} + +static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp); + +static int kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + if (sp->role.glevels != vcpu->arch.mmu.root_level) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + rmap_write_protect(vcpu->kvm, sp->gfn); + if (vcpu->arch.mmu.sync_page(vcpu, sp)) { + kvm_mmu_zap_page(vcpu->kvm, sp); + return 1; + } + + kvm_mmu_flush_tlb(vcpu); + kvm_unlink_unsync_page(vcpu->kvm, sp); + return 0; +} + +struct sync_walker { + struct kvm_vcpu *vcpu; + struct kvm_unsync_walk walker; +}; + +static int mmu_sync_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct sync_walker *sync_walk = container_of(walk, struct sync_walker, + walker); + struct kvm_vcpu *vcpu = sync_walk->vcpu; + + kvm_sync_page(vcpu, sp); + return (need_resched() || spin_needbreak(&vcpu->kvm->mmu_lock)); +} + +static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + struct sync_walker walker = { + .walker = { .entry = mmu_sync_fn, }, + .vcpu = vcpu, + }; + + while (mmu_unsync_walk(sp, &walker.walker)) + cond_resched_lock(&vcpu->kvm->mmu_lock); +} + static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn_t gfn, gva_t gaddr, @@ -941,7 +1031,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, unsigned quadrant; struct hlist_head *bucket; struct kvm_mmu_page *sp; - struct hlist_node *node; + struct hlist_node *node, *tmp; role.word = 0; role.glevels = vcpu->arch.mmu.root_level; @@ -957,8 +1047,18 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, gfn, role.word); index = kvm_page_table_hashfn(gfn); bucket = &vcpu->kvm->arch.mmu_page_hash[index]; - hlist_for_each_entry(sp, node, bucket, hash_link) - if (sp->gfn == gfn && sp->role.word == role.word) { + hlist_for_each_entry_safe(sp, node, tmp, bucket, hash_link) + if (sp->gfn == gfn) { + if (sp->unsync) + if (kvm_sync_page(vcpu, sp)) + continue; + + if (sp->role.word != role.word) + continue; + + if (sp->unsync_children) + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + mmu_page_add_parent_pte(vcpu, sp, parent_pte); pgprintk("%s: found\n", __func__); return sp; @@ -971,8 +1071,10 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, sp->gfn = gfn; sp->role = role; hlist_add_head(&sp->hash_link, bucket); - if (!metaphysical) + if (!metaphysical) { rmap_write_protect(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, gfn); + } if (shadow_trap_nonpresent_pte != shadow_notrap_nonpresent_pte) vcpu->arch.mmu.prefetch_page(vcpu, sp); else @@ -1078,14 +1180,47 @@ static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp) } } +struct zap_walker { + struct kvm_unsync_walk walker; + struct kvm *kvm; + int zapped; +}; + +static int mmu_zap_fn(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walk) +{ + struct zap_walker *zap_walk = container_of(walk, struct zap_walker, + walker); + kvm_mmu_zap_page(zap_walk->kvm, sp); + zap_walk->zapped = 1; + return 0; +} + +static int mmu_zap_unsync_children(struct kvm *kvm, struct kvm_mmu_page *sp) +{ + struct zap_walker walker = { + .walker = { .entry = mmu_zap_fn, }, + .kvm = kvm, + .zapped = 0, + }; + + if (sp->role.level == PT_PAGE_TABLE_LEVEL) + return 0; + mmu_unsync_walk(sp, &walker.walker); + return walker.zapped; +} + static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) { + int ret; ++kvm->stat.mmu_shadow_zapped; + ret = mmu_zap_unsync_children(kvm, sp); kvm_mmu_page_unlink_children(kvm, sp); kvm_mmu_unlink_parents(kvm, sp); kvm_flush_remote_tlbs(kvm); if (!sp->role.invalid && !sp->role.metaphysical) unaccount_shadowed(kvm, sp->gfn); + if (sp->unsync) + kvm_unlink_unsync_page(kvm, sp); if (!sp->root_count) { hlist_del(&sp->hash_link); kvm_mmu_free_page(kvm, sp); @@ -1095,7 +1230,7 @@ static int kvm_mmu_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp) kvm_reload_remote_mmus(kvm); } kvm_mmu_reset_last_pte_updated(kvm); - return 0; + return ret; } /* @@ -1201,10 +1336,58 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + return 1; +} + +static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + unsigned index; + struct hlist_head *bucket; + struct kvm_mmu_page *s; + struct hlist_node *node, *n; + + index = kvm_page_table_hashfn(sp->gfn); + bucket = &vcpu->kvm->arch.mmu_page_hash[index]; + /* don't unsync if pagetable is shadowed with multiple roles */ + hlist_for_each_entry_safe(s, node, n, bucket, hash_link) { + if (s->gfn != sp->gfn || s->role.metaphysical) + continue; + if (s->role.word != sp->role.word) + return 1; + } + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + ++vcpu->kvm->stat.mmu_unsync; + sp->unsync = 1; + mmu_convert_notrap(sp); + return 0; +} + +static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, + bool can_unsync) +{ + struct kvm_mmu_page *shadow; + + shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); + if (shadow) { + if (shadow->role.level != PT_PAGE_TABLE_LEVEL) + return 1; + if (shadow->unsync) + return 0; + if (can_unsync) + return kvm_unsync_page(vcpu, shadow); + return 1; + } + return 0; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pte_access, int user_fault, int write_fault, int dirty, int largepage, - gfn_t gfn, pfn_t pfn, bool speculative) + gfn_t gfn, pfn_t pfn, bool speculative, + bool can_unsync) { u64 spte; int ret = 0; @@ -1231,7 +1414,6 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, if ((pte_access & ACC_WRITE_MASK) || (write_fault && !is_write_protection(vcpu) && !user_fault)) { - struct kvm_mmu_page *shadow; if (largepage && has_wrprotected_page(vcpu->kvm, gfn)) { ret = 1; @@ -1241,8 +1423,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, spte |= PT_WRITABLE_MASK; - shadow = kvm_mmu_lookup_page(vcpu->kvm, gfn); - if (shadow) { + if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { pgprintk("%s: found shadow page for %lx, marking ro\n", __func__, gfn); ret = 1; @@ -1260,7 +1441,6 @@ set_pte: return ret; } - static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, unsigned pt_access, unsigned pte_access, int user_fault, int write_fault, int dirty, @@ -1298,7 +1478,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *shadow_pte, } } if (set_spte(vcpu, shadow_pte, pte_access, user_fault, write_fault, - dirty, largepage, gfn, pfn, speculative)) { + dirty, largepage, gfn, pfn, speculative, true)) { if (write_fault) *ptwrite = 1; kvm_x86_ops->tlb_flush(vcpu); @@ -1518,10 +1698,6 @@ static void mmu_alloc_roots(struct kvm_vcpu *vcpu) vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); } -static void mmu_sync_children(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ -} - static void mmu_sync_roots(struct kvm_vcpu *vcpu) { int i; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index dc169e8148b1..613ec9aa674a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -580,7 +580,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, is_dirty_pte(gpte), 0, gfn, - spte_to_pfn(sp->spt[i]), true); + spte_to_pfn(sp->spt[i]), true, false); } return !nr_present; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index efee85ba07e5..1c5864ac0837 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -101,6 +101,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "mmu_flooded", VM_STAT(mmu_flooded) }, { "mmu_recycled", VM_STAT(mmu_recycled) }, { "mmu_cache_miss", VM_STAT(mmu_cache_miss) }, + { "mmu_unsync", VM_STAT(mmu_unsync) }, { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, { "largepages", VM_STAT(lpages) }, { NULL } @@ -3120,6 +3121,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (vcpu->requests) { if (test_and_clear_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests)) __kvm_migrate_timers(vcpu); + if (test_and_clear_bit(KVM_REQ_MMU_SYNC, &vcpu->requests)) + kvm_mmu_sync_roots(vcpu); if (test_and_clear_bit(KVM_REQ_TLB_FLUSH, &vcpu->requests)) kvm_x86_ops->tlb_flush(vcpu); if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, -- cgit v1.2.2 From 0074ff63ebc195701062ca46e0d82fcea0fa3a0a Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:40 -0300 Subject: KVM: MMU: speed up mmu_unsync_walk Cache the unsynced children information in a per-page bitmap. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 72 +++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d88659ae7778..cb391d629af2 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -891,6 +891,52 @@ static void mmu_parent_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, } } +static void kvm_mmu_update_unsync_bitmap(u64 *spte) +{ + unsigned int index; + struct kvm_mmu_page *sp = page_header(__pa(spte)); + + index = spte - sp->spt; + __set_bit(index, sp->unsync_child_bitmap); + sp->unsync_children = 1; +} + +static void kvm_mmu_update_parents_unsync(struct kvm_mmu_page *sp) +{ + struct kvm_pte_chain *pte_chain; + struct hlist_node *node; + int i; + + if (!sp->parent_pte) + return; + + if (!sp->multimapped) { + kvm_mmu_update_unsync_bitmap(sp->parent_pte); + return; + } + + hlist_for_each_entry(pte_chain, node, &sp->parent_ptes, link) + for (i = 0; i < NR_PTE_CHAIN_ENTRIES; ++i) { + if (!pte_chain->parent_ptes[i]) + break; + kvm_mmu_update_unsync_bitmap(pte_chain->parent_ptes[i]); + } +} + +static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) +{ + sp->unsync_children = 1; + kvm_mmu_update_parents_unsync(sp); + return 1; +} + +static void kvm_mmu_mark_parents_unsync(struct kvm_vcpu *vcpu, + struct kvm_mmu_page *sp) +{ + mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_update_parents_unsync(sp); +} + static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { @@ -910,6 +956,11 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) { } +#define for_each_unsync_children(bitmap, idx) \ + for (idx = find_first_bit(bitmap, 512); \ + idx < 512; \ + idx = find_next_bit(bitmap, 512, idx+1)) + static int mmu_unsync_walk(struct kvm_mmu_page *sp, struct kvm_unsync_walk *walker) { @@ -918,7 +969,7 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, if (!sp->unsync_children) return 0; - for (i = 0; i < PT64_ENT_PER_PAGE; ++i) { + for_each_unsync_children(sp->unsync_child_bitmap, i) { u64 ent = sp->spt[i]; if (is_shadow_present_pte(ent)) { @@ -929,17 +980,19 @@ static int mmu_unsync_walk(struct kvm_mmu_page *sp, ret = mmu_unsync_walk(child, walker); if (ret) return ret; + __clear_bit(i, sp->unsync_child_bitmap); } if (child->unsync) { ret = walker->entry(child, walker); + __clear_bit(i, sp->unsync_child_bitmap); if (ret) return ret; } } } - if (i == PT64_ENT_PER_PAGE) + if (find_first_bit(sp->unsync_child_bitmap, 512) == 512) sp->unsync_children = 0; return 0; @@ -1056,10 +1109,11 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, if (sp->role.word != role.word) continue; - if (sp->unsync_children) - set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); - mmu_page_add_parent_pte(vcpu, sp, parent_pte); + if (sp->unsync_children) { + set_bit(KVM_REQ_MMU_SYNC, &vcpu->requests); + kvm_mmu_mark_parents_unsync(vcpu, sp); + } pgprintk("%s: found\n", __func__); return sp; } @@ -1336,12 +1390,6 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva) return page; } -static int unsync_walk_fn(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) -{ - sp->unsync_children = 1; - return 1; -} - static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { unsigned index; @@ -1358,7 +1406,7 @@ static int kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) if (s->role.word != sp->role.word) return 1; } - mmu_parent_walk(vcpu, sp, unsync_walk_fn); + kvm_mmu_mark_parents_unsync(vcpu, sp); ++vcpu->kvm->stat.mmu_unsync; sp->unsync = 1; mmu_convert_notrap(sp); -- cgit v1.2.2 From 582801a95d2f2ceab841779e1dec0e11dfec44c0 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 23 Sep 2008 13:18:41 -0300 Subject: KVM: MMU: add "oos_shadow" parameter to disable oos Subject says it all. Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/mmu.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index cb391d629af2..99c239c5c0ac 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -70,6 +70,9 @@ static int dbg = 0; module_param(dbg, bool, 0644); #endif +static int oos_shadow = 1; +module_param(oos_shadow, bool, 0644); + #ifndef MMU_DEBUG #define ASSERT(x) do { } while (0) #else @@ -1424,7 +1427,7 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 1; if (shadow->unsync) return 0; - if (can_unsync) + if (can_unsync && oos_shadow) return kvm_unsync_page(vcpu, shadow); return 1; } -- cgit v1.2.2 From e48258009d941891fca35348986b8d280caf31cd Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Wed, 24 Sep 2008 20:28:34 -0300 Subject: KVM: PIC: enhance IPI avoidance The PIC code makes little effort to avoid kvm_vcpu_kick(), resulting in unnecessary guest exits in some conditions. For example, if the timer interrupt is routed through the IOAPIC, IRR for IRQ 0 will get set but not cleared, since the APIC is handling the acks. This means that everytime an interrupt < 16 is triggered, the priority logic will find IRQ0 pending and send an IPI to vcpu0 (in case IRQ0 is not masked, which is Linux's case). Introduce a new variable isr_ack to represent the IRQ's for which the guest has been signalled / cleared the ISR. Use it to avoid more than one IPI per trigger-ack cycle, in addition to the avoidance when ISR is set in get_priority(). Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/i8259.c | 17 +++++++++++++++-- arch/x86/kvm/irq.h | 2 ++ arch/x86/kvm/x86.c | 1 + 3 files changed, 18 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 71e3eeeccae8..17e41e165f1a 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -33,6 +33,14 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq) { s->isr &= ~(1 << irq); + s->isr_ack |= (1 << irq); +} + +void kvm_pic_clear_isr_ack(struct kvm *kvm) +{ + struct kvm_pic *s = pic_irqchip(kvm); + s->pics[0].isr_ack = 0xff; + s->pics[1].isr_ack = 0xff; } /* @@ -213,6 +221,7 @@ void kvm_pic_reset(struct kvm_kpic_state *s) s->irr = 0; s->imr = 0; s->isr = 0; + s->isr_ack = 0xff; s->priority_add = 0; s->irq_base = 0; s->read_reg_select = 0; @@ -444,10 +453,14 @@ static void pic_irq_request(void *opaque, int level) { struct kvm *kvm = opaque; struct kvm_vcpu *vcpu = kvm->vcpus[0]; + struct kvm_pic *s = pic_irqchip(kvm); + int irq = pic_get_irq(&s->pics[0]); - pic_irqchip(kvm)->output = level; - if (vcpu) + s->output = level; + if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) { + s->pics[0].isr_ack &= ~(1 << irq); kvm_vcpu_kick(vcpu); + } } struct kvm_pic *kvm_create_pic(struct kvm *kvm) diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 479a3d2d5614..4748532fd1da 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -42,6 +42,7 @@ struct kvm_kpic_state { u8 irr; /* interrupt request register */ u8 imr; /* interrupt mask register */ u8 isr; /* interrupt service register */ + u8 isr_ack; /* interrupt ack detection */ u8 priority_add; /* highest irq priority */ u8 irq_base; u8 read_reg_select; @@ -70,6 +71,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); +void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 1c5864ac0837..4cfdd1b6df0b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3963,6 +3963,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, pr_debug("Set back pending irq %d\n", pending_vec); } + kvm_pic_clear_isr_ack(vcpu->kvm); } kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); -- cgit v1.2.2 From e5fcfc821a467bd0827635db8fd39ab1213987e5 Mon Sep 17 00:00:00 2001 From: Weidong Han Date: Thu, 25 Sep 2008 23:32:02 +0800 Subject: KVM: Device Assignment: Map mmio pages into VT-d page table Assigned device could DMA to mmio pages, so also need to map mmio pages into VT-d page table. Signed-off-by: Weidong Han Signed-off-by: Avi Kivity --- arch/x86/kvm/vtd.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c index 667bf3fb64bf..a770874f3a3a 100644 --- a/arch/x86/kvm/vtd.c +++ b/arch/x86/kvm/vtd.c @@ -36,37 +36,30 @@ int kvm_iommu_map_pages(struct kvm *kvm, { gfn_t gfn = base_gfn; pfn_t pfn; - int i, r; + int i, r = 0; struct dmar_domain *domain = kvm->arch.intel_iommu_domain; /* check if iommu exists and in use */ if (!domain) return 0; - r = -EINVAL; for (i = 0; i < npages; i++) { /* check if already mapped */ pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, gfn_to_gpa(gfn)); - if (pfn && !is_mmio_pfn(pfn)) + if (pfn) continue; pfn = gfn_to_pfn(kvm, gfn); - if (!is_mmio_pfn(pfn)) { - r = intel_iommu_page_mapping(domain, - gfn_to_gpa(gfn), - pfn_to_hpa(pfn), - PAGE_SIZE, - DMA_PTE_READ | - DMA_PTE_WRITE); - if (r) { - printk(KERN_DEBUG "kvm_iommu_map_pages:" - "iommu failed to map pfn=%lx\n", pfn); - goto unmap_pages; - } - } else { - printk(KERN_DEBUG "kvm_iommu_map_page:" - "invalid pfn=%lx\n", pfn); + r = intel_iommu_page_mapping(domain, + gfn_to_gpa(gfn), + pfn_to_hpa(pfn), + PAGE_SIZE, + DMA_PTE_READ | + DMA_PTE_WRITE); + if (r) { + printk(KERN_ERR "kvm_iommu_map_pages:" + "iommu failed to map pfn=%lx\n", pfn); goto unmap_pages; } gfn++; -- cgit v1.2.2 From 1b10bf31a5de5b76e2e9c2937878a45c5ae2be37 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Tue, 30 Sep 2008 10:41:06 +0200 Subject: KVM: x86: Silence various LAPIC-related host kernel messages KVM-x86 dumps a lot of debug messages that have no meaning for normal operation: - INIT de-assertion is ignored - SIPIs are sent and received - APIC writes are unaligned or < 4 byte long (Windows Server 2003 triggers this on SMP) Degrade them to true debug messages, keeping the host kernel log clean for real problems. Signed-off-by: Jan Kiszka Signed-off-by: Avi Kivity --- arch/x86/kvm/lapic.c | 16 +++++++--------- arch/x86/kvm/x86.c | 4 ++-- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index fd00f698692f..6571926bfd33 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -365,16 +365,14 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_vcpu_kick(vcpu); } else { - printk(KERN_DEBUG - "Ignoring de-assert INIT to vcpu %d\n", - vcpu->vcpu_id); + apic_debug("Ignoring de-assert INIT to vcpu %d\n", + vcpu->vcpu_id); } - break; case APIC_DM_STARTUP: - printk(KERN_DEBUG "SIPI to vcpu %d vector 0x%02x\n", - vcpu->vcpu_id, vector); + apic_debug("SIPI to vcpu %d vector 0x%02x\n", + vcpu->vcpu_id, vector); if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { vcpu->arch.sipi_vector = vector; vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; @@ -679,9 +677,9 @@ static void apic_mmio_write(struct kvm_io_device *this, * Refer SDM 8.4.1 */ if (len != 4 || alignment) { - if (printk_ratelimit()) - printk(KERN_ERR "apic write: bad size=%d %lx\n", - len, (long)address); + /* Don't shout loud, $infamous_os would cause only noise. */ + apic_debug("apic write: bad size=%d %lx\n", + len, (long)address); return; } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4cfdd1b6df0b..d6d7123d2644 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3224,8 +3224,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) int r; if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { - printk("vcpu %d received sipi with vector # %x\n", - vcpu->vcpu_id, vcpu->arch.sipi_vector); + pr_debug("vcpu %d received sipi with vector # %x\n", + vcpu->vcpu_id, vcpu->arch.sipi_vector); kvm_lapic_reset(vcpu); r = kvm_x86_ops->vcpu_reset(vcpu); if (r) -- cgit v1.2.2 From 83dbc83a0d7c88c919d769177bd1924a46c9c034 Mon Sep 17 00:00:00 2001 From: Marcelo Tosatti Date: Tue, 7 Oct 2008 17:01:27 -0300 Subject: KVM: VMX: enable invlpg exiting if EPT is disabled Manually disabling EPT via module option fails to re-enable INVLPG exiting. Reported-by: Gleb Natapov Signed-off-by: Marcelo Tosatti Signed-off-by: Avi Kivity --- arch/x86/kvm/vmx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4556cc3715bb..2643b430d83a 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2118,7 +2118,8 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx) } if (!vm_need_ept()) exec_control |= CPU_BASED_CR3_STORE_EXITING | - CPU_BASED_CR3_LOAD_EXITING; + CPU_BASED_CR3_LOAD_EXITING | + CPU_BASED_INVLPG_EXITING; vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control); if (cpu_has_secondary_exec_ctrls()) { -- cgit v1.2.2 From 371c01b28e4049d1fbf60a9631cdad98f7cae030 Mon Sep 17 00:00:00 2001 From: Zhang xiantao Date: Thu, 11 Sep 2008 13:19:32 +0800 Subject: KVM: Device Assignment: Move vtd.c from arch/x86/kvm/ to virt/kvm/ Preparation for kvm/ia64 VT-d support. Signed-off-by: Zhang xiantao Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 6 +- arch/x86/kvm/vtd.c | 191 -------------------------------------------------- 2 files changed, 3 insertions(+), 194 deletions(-) delete mode 100644 arch/x86/kvm/vtd.c (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3072b17447ab..7dce593b9c66 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -7,14 +7,14 @@ common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif +ifeq ($(CONFIG_DMAR),y) +common-objs += $(addprefix ../../../virt/kvm/, vtd.o) +endif EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm kvm-objs := $(common-objs) x86.o mmu.o x86_emulate.o i8259.o irq.o lapic.o \ i8254.o -ifeq ($(CONFIG_DMAR),y) -kvm-objs += vtd.o -endif obj-$(CONFIG_KVM) += kvm.o kvm-intel-objs = vmx.o obj-$(CONFIG_KVM_INTEL) += kvm-intel.o diff --git a/arch/x86/kvm/vtd.c b/arch/x86/kvm/vtd.c deleted file mode 100644 index a770874f3a3a..000000000000 --- a/arch/x86/kvm/vtd.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Author: Allen M. Kay - * Author: Weidong Han - * Author: Ben-Ami Yassour - */ - -#include -#include -#include -#include -#include - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -int kvm_iommu_map_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - gfn_t gfn = base_gfn; - pfn_t pfn; - int i, r = 0; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - for (i = 0; i < npages; i++) { - /* check if already mapped */ - pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, - gfn_to_gpa(gfn)); - if (pfn) - continue; - - pfn = gfn_to_pfn(kvm, gfn); - r = intel_iommu_page_mapping(domain, - gfn_to_gpa(gfn), - pfn_to_hpa(pfn), - PAGE_SIZE, - DMA_PTE_READ | - DMA_PTE_WRITE); - if (r) { - printk(KERN_ERR "kvm_iommu_map_pages:" - "iommu failed to map pfn=%lx\n", pfn); - goto unmap_pages; - } - gfn++; - } - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, base_gfn, i); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int i, r; - - down_read(&kvm->slots_lock); - for (i = 0; i < kvm->nmemslots; i++) { - r = kvm_iommu_map_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); - if (r) - break; - } - up_read(&kvm->slots_lock); - return r; -} - -int kvm_iommu_map_guest(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - struct pci_dev *pdev = NULL; - int r; - - if (!intel_iommu_found()) { - printk(KERN_ERR "%s: intel iommu not found\n", __func__); - return -ENODEV; - } - - printk(KERN_DEBUG "VT-d direct map: host bdf = %x:%x:%x\n", - assigned_dev->host_busnr, - PCI_SLOT(assigned_dev->host_devfn), - PCI_FUNC(assigned_dev->host_devfn)); - - pdev = assigned_dev->dev; - - if (pdev == NULL) { - if (kvm->arch.intel_iommu_domain) { - intel_iommu_domain_exit(kvm->arch.intel_iommu_domain); - kvm->arch.intel_iommu_domain = NULL; - } - return -ENODEV; - } - - kvm->arch.intel_iommu_domain = intel_iommu_domain_alloc(pdev); - if (!kvm->arch.intel_iommu_domain) - return -ENODEV; - - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - - intel_iommu_detach_dev(kvm->arch.intel_iommu_domain, - pdev->bus->number, pdev->devfn); - - r = intel_iommu_context_mapping(kvm->arch.intel_iommu_domain, - pdev); - if (r) { - printk(KERN_ERR "Domain context map for %s failed", - pci_name(pdev)); - goto out_unmap; - } - return 0; - -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - gfn_t gfn = base_gfn; - pfn_t pfn; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - int i; - - for (i = 0; i < npages; i++) { - pfn = (pfn_t)intel_iommu_iova_to_pfn(domain, - gfn_to_gpa(gfn)); - kvm_release_pfn_clean(pfn); - gfn++; - } -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int i; - down_read(&kvm->slots_lock); - for (i = 0; i < kvm->nmemslots; i++) { - kvm_iommu_put_pages(kvm, kvm->memslots[i].base_gfn, - kvm->memslots[i].npages); - } - up_read(&kvm->slots_lock); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *entry; - struct dmar_domain *domain = kvm->arch.intel_iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - list_for_each_entry(entry, &kvm->arch.assigned_dev_head, list) { - printk(KERN_DEBUG "VT-d unmap: host bdf = %x:%x:%x\n", - entry->host_busnr, - PCI_SLOT(entry->host_devfn), - PCI_FUNC(entry->host_devfn)); - - /* detach kvm dmar domain */ - intel_iommu_detach_dev(domain, entry->host_busnr, - entry->host_devfn); - } - kvm_iommu_unmap_memslots(kvm); - intel_iommu_domain_exit(domain); - return 0; -} -- cgit v1.2.2 From 8a98f6648a2b0756d8f26d6c13332f5526355fec Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 6 Oct 2008 13:47:38 +0800 Subject: KVM: Move device assignment logic to common code To share with other archs, this patch moves device assignment logic to common parts. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/x86/kvm/x86.c | 255 ----------------------------------------------------- 1 file changed, 255 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index d6d7123d2644..f8bde01ba8e6 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -107,238 +106,6 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { NULL } }; -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct list_head *ptr; - struct kvm_assigned_dev_kernel *match; - - list_for_each(ptr, head) { - match = list_entry(ptr, struct kvm_assigned_dev_kernel, list); - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static void kvm_assigned_dev_interrupt_work_handler(struct work_struct *work) -{ - struct kvm_assigned_dev_kernel *assigned_dev; - - assigned_dev = container_of(work, struct kvm_assigned_dev_kernel, - interrupt_work); - - /* This is taken to safely inject irq inside the guest. When - * the interrupt injection (or the ioapic code) uses a - * finer-grained lock, update this - */ - mutex_lock(&assigned_dev->kvm->lock); - kvm_set_irq(assigned_dev->kvm, - assigned_dev->guest_irq, 1); - mutex_unlock(&assigned_dev->kvm->lock); - kvm_put_kvm(assigned_dev->kvm); -} - -/* FIXME: Implement the OR logic needed to make shared interrupts on - * this line behave properly - */ -static irqreturn_t kvm_assigned_dev_intr(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = - (struct kvm_assigned_dev_kernel *) dev_id; - - kvm_get_kvm(assigned_dev->kvm); - schedule_work(&assigned_dev->interrupt_work); - disable_irq_nosync(irq); - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev; - - if (kian->gsi == -1) - return; - - dev = container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - kvm_set_irq(dev->kvm, dev->guest_irq, 0); - enable_irq(dev->host_irq); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - if (irqchip_in_kernel(kvm) && assigned_dev->irq_requested) - free_irq(assigned_dev->host_irq, (void *)assigned_dev); - - kvm_unregister_irq_ack_notifier(kvm, &assigned_dev->ack_notifier); - - if (cancel_work_sync(&assigned_dev->interrupt_work)) - /* We had pending work. That means we will have to take - * care of kvm_put_kvm. - */ - kvm_put_kvm(kvm); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -static void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct list_head *ptr, *ptr2; - struct kvm_assigned_dev_kernel *assigned_dev; - - list_for_each_safe(ptr, ptr2, &kvm->arch.assigned_dev_head) { - assigned_dev = list_entry(ptr, - struct kvm_assigned_dev_kernel, - list); - - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) { - mutex_unlock(&kvm->lock); - return -EINVAL; - } - - if (match->irq_requested) { - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - mutex_unlock(&kvm->lock); - return 0; - } - - INIT_WORK(&match->interrupt_work, - kvm_assigned_dev_interrupt_work_handler); - - if (irqchip_in_kernel(kvm)) { - if (!capable(CAP_SYS_RAWIO)) { - r = -EPERM; - goto out_release; - } - - if (assigned_irq->host_irq) - match->host_irq = assigned_irq->host_irq; - else - match->host_irq = match->dev->irq; - match->guest_irq = assigned_irq->guest_irq; - match->ack_notifier.gsi = assigned_irq->guest_irq; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - kvm_register_irq_ack_notifier(kvm, &match->ack_notifier); - - /* Even though this is PCI, we don't want to use shared - * interrupts. Sharing host devices with guest-assigned devices - * on the same interrupt line is not a happy situation: there - * are going to be long delays in accepting, acking, etc. - */ - if (request_irq(match->host_irq, kvm_assigned_dev_intr, 0, - "kvm_assigned_device", (void *)match)) { - r = -EIO; - goto out_release; - } - } - - match->irq_requested = true; - mutex_unlock(&kvm->lock); - return r; -out_release: - mutex_unlock(&kvm->lock); - kvm_free_assigned_device(kvm, match); - return r; -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EINVAL; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_bus_and_slot(assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->dev = dev; - - match->kvm = kvm; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU) { - r = kvm_iommu_map_guest(kvm, match); - if (r) - goto out_list_del; - } - -out: - mutex_unlock(&kvm->lock); - return r; -out_list_del: - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - mutex_unlock(&kvm->lock); - return r; -} - unsigned long segment_base(u16 selector) { struct descriptor_table gdt; @@ -2030,28 +1797,6 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; break; } - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } case KVM_GET_PIT: { r = -EFAULT; if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state))) -- cgit v1.2.2 From 3de42dc094ecd313dc7d551e007a134b52f8663d Mon Sep 17 00:00:00 2001 From: Xiantao Zhang Date: Mon, 6 Oct 2008 13:48:45 +0800 Subject: KVM: Separate irq ack notification out of arch/x86/kvm/irq.c Moving irq ack notification logic as common, and make it shared with ia64 side. Signed-off-by: Xiantao Zhang Signed-off-by: Avi Kivity --- arch/x86/kvm/Makefile | 2 +- arch/x86/kvm/irq.c | 33 --------------------------------- arch/x86/kvm/irq.h | 8 -------- 3 files changed, 1 insertion(+), 42 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 7dce593b9c66..c02343594b4d 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -3,7 +3,7 @@ # common-objs = $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ - coalesced_mmio.o) + coalesced_mmio.o irq_comm.o) ifeq ($(CONFIG_KVM_TRACE),y) common-objs += $(addprefix ../../../virt/kvm/, kvm_trace.o) endif diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 8c1b9c5def78..c019b8edcdb7 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -99,36 +99,3 @@ void __kvm_migrate_timers(struct kvm_vcpu *vcpu) __kvm_migrate_apic_timer(vcpu); __kvm_migrate_pit_timer(vcpu); } - -/* This should be called with the kvm->lock mutex held */ -void kvm_set_irq(struct kvm *kvm, int irq, int level) -{ - /* Not possible to detect if the guest uses the PIC or the - * IOAPIC. So set the bit in both. The guest will ignore - * writes to the unused one. - */ - kvm_ioapic_set_irq(kvm->arch.vioapic, irq, level); - kvm_pic_set_irq(pic_irqchip(kvm), irq, level); -} - -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi) -{ - struct kvm_irq_ack_notifier *kian; - struct hlist_node *n; - - hlist_for_each_entry(kian, n, &kvm->arch.irq_ack_notifier_list, link) - if (kian->gsi == gsi) - kian->irq_acked(kian); -} - -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - hlist_add_head(&kian->link, &kvm->arch.irq_ack_notifier_list); -} - -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian) -{ - hlist_del(&kian->link); -} diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 4748532fd1da..f17c8f5bbf31 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -68,7 +68,6 @@ struct kvm_pic { }; struct kvm_pic *kvm_create_pic(struct kvm *kvm); -void kvm_pic_set_irq(void *opaque, int irq, int level); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); void kvm_pic_clear_isr_ack(struct kvm *kvm); @@ -85,13 +84,6 @@ static inline int irqchip_in_kernel(struct kvm *kvm) void kvm_pic_reset(struct kvm_kpic_state *s); -void kvm_set_irq(struct kvm *kvm, int irq, int level); -void kvm_notify_acked_irq(struct kvm *kvm, unsigned gsi); -void kvm_register_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); -void kvm_unregister_irq_ack_notifier(struct kvm *kvm, - struct kvm_irq_ack_notifier *kian); - void kvm_timer_intr_post(struct kvm_vcpu *vcpu, int vec); void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); -- cgit v1.2.2 From e2fee2761ad1df2d29b9d502a3cefc87a17b32ca Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Jul 2008 17:36:20 +0200 Subject: OProfile: Rework oprofile_add_ibs_sample() function Code looks much more cleaner now. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index d9faf607b3a6..54632e0d6bf4 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -67,8 +67,9 @@ static unsigned long reset_value[NUM_COUNTERS]; /* The function interface needs to be fixed, something like add data. Should then be added to linux/oprofile.h. */ -extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, u8 code); +extern void +oprofile_add_ibs_sample(struct pt_regs *const regs, + unsigned int * const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ -- cgit v1.2.2 From 2d55a478827f3eed2ee9701605fdeb9cac2d78dc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Jul 2008 17:56:05 +0200 Subject: OProfile: Rework string handling in setup_ibs_files() Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 54632e0d6bf4..0657c56a66b5 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -473,7 +473,6 @@ static int (*create_arch_files)(struct super_block * sb, struct dentry * root); static int setup_ibs_files(struct super_block * sb, struct dentry * root) { - char buf[12]; struct dentry *dir; int ret = 0; @@ -495,22 +494,22 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) ibs_config.max_cnt_op = 250000; ibs_config.op_enabled = 0; ibs_config.dispatched_ops = 1; - snprintf(buf, sizeof(buf), "ibs_fetch"); - dir = oprofilefs_mkdir(sb, root, buf); - oprofilefs_create_ulong(sb, dir, "rand_enable", - &ibs_config.rand_en); + + dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.fetch_enabled); + &ibs_config.fetch_enabled); oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_fetch); - snprintf(buf, sizeof(buf), "ibs_uops"); - dir = oprofilefs_mkdir(sb, root, buf); + &ibs_config.max_cnt_fetch); + oprofilefs_create_ulong(sb, dir, "rand_enable", + &ibs_config.rand_en); + + dir = oprofilefs_mkdir(sb, root, "ibs_uops"); oprofilefs_create_ulong(sb, dir, "enable", - &ibs_config.op_enabled); + &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", - &ibs_config.max_cnt_op); + &ibs_config.max_cnt_op); oprofilefs_create_ulong(sb, dir, "dispatched_ops", - &ibs_config.dispatched_ops); + &ibs_config.dispatched_ops); return 0; } -- cgit v1.2.2 From ccd755c2d90dd9b9729ba5975f7c92bf206ddcf7 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 29 Jul 2008 16:57:10 +0200 Subject: OProfile: Rename IBS sysfs dir into "ibs_op" The new name is now more close to those used in the spec. Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 0657c56a66b5..23ce63f2762c 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -503,7 +503,7 @@ static int setup_ibs_files(struct super_block * sb, struct dentry * root) oprofilefs_create_ulong(sb, dir, "rand_enable", &ibs_config.rand_en); - dir = oprofilefs_mkdir(sb, root, "ibs_uops"); + dir = oprofilefs_mkdir(sb, root, "ibs_op"); oprofilefs_create_ulong(sb, dir, "enable", &ibs_config.op_enabled); oprofilefs_create_ulong(sb, dir, "max_count", -- cgit v1.2.2 From c92960fccb9f32a1d6110f6dcfe483ed96c62beb Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 17:12:36 +0200 Subject: oprofile: whitespace fixes Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 20 ++++++++++---------- arch/x86/oprofile/op_model_p4.c | 32 ++++++++++++++++---------------- arch/x86/oprofile/op_model_ppro.c | 16 ++++++++-------- 3 files changed, 34 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 23ce63f2762c..b9a810b33261 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -530,14 +530,14 @@ static void op_amd_exit(void) #endif struct op_x86_model_spec const op_amd_spec = { - .init = op_amd_init, - .exit = op_amd_exit, - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &op_amd_fill_in_addresses, - .setup_ctrs = &op_amd_setup_ctrs, - .check_ctrs = &op_amd_check_ctrs, - .start = &op_amd_start, - .stop = &op_amd_stop, - .shutdown = &op_amd_shutdown + .init = op_amd_init, + .exit = op_amd_exit, + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &op_amd_fill_in_addresses, + .setup_ctrs = &op_amd_setup_ctrs, + .check_ctrs = &op_amd_check_ctrs, + .start = &op_amd_start, + .stop = &op_amd_stop, + .shutdown = &op_amd_shutdown }; diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c index 43ac5af338d8..4c4a51c90bc2 100644 --- a/arch/x86/oprofile/op_model_p4.c +++ b/arch/x86/oprofile/op_model_p4.c @@ -698,24 +698,24 @@ static void p4_shutdown(struct op_msrs const * const msrs) #ifdef CONFIG_SMP struct op_x86_model_spec const op_p4_ht2_spec = { - .num_counters = NUM_COUNTERS_HT2, - .num_controls = NUM_CONTROLS_HT2, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_HT2, + .num_controls = NUM_CONTROLS_HT2, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; #endif struct op_x86_model_spec const op_p4_spec = { - .num_counters = NUM_COUNTERS_NON_HT, - .num_controls = NUM_CONTROLS_NON_HT, - .fill_in_addresses = &p4_fill_in_addresses, - .setup_ctrs = &p4_setup_ctrs, - .check_ctrs = &p4_check_ctrs, - .start = &p4_start, - .stop = &p4_stop, - .shutdown = &p4_shutdown + .num_counters = NUM_COUNTERS_NON_HT, + .num_controls = NUM_CONTROLS_NON_HT, + .fill_in_addresses = &p4_fill_in_addresses, + .setup_ctrs = &p4_setup_ctrs, + .check_ctrs = &p4_check_ctrs, + .start = &p4_start, + .stop = &p4_stop, + .shutdown = &p4_shutdown }; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index eff431f6c57b..c665bac4a143 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -181,12 +181,12 @@ static void ppro_shutdown(struct op_msrs const * const msrs) struct op_x86_model_spec const op_ppro_spec = { - .num_counters = NUM_COUNTERS, - .num_controls = NUM_CONTROLS, - .fill_in_addresses = &ppro_fill_in_addresses, - .setup_ctrs = &ppro_setup_ctrs, - .check_ctrs = &ppro_check_ctrs, - .start = &ppro_start, - .stop = &ppro_stop, - .shutdown = &ppro_shutdown + .num_counters = NUM_COUNTERS, + .num_controls = NUM_CONTROLS, + .fill_in_addresses = &ppro_fill_in_addresses, + .setup_ctrs = &ppro_setup_ctrs, + .check_ctrs = &ppro_check_ctrs, + .start = &ppro_start, + .stop = &ppro_stop, + .shutdown = &ppro_shutdown }; -- cgit v1.2.2 From 25ad2913cae9c9e3ed28075caeb2eefccd636f4f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 17:12:36 +0200 Subject: oprofile: more whitespace fixes Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 2 +- arch/x86/oprofile/op_model_amd.c | 6 +++--- arch/x86/oprofile/op_x86_model.h | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index e2095cba409f..36e324139f77 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -53,7 +53,7 @@ struct frame_head { } __attribute__((packed)); static struct frame_head * -dump_user_backtrace(struct frame_head * head) +dump_user_backtrace(struct frame_head *head) { struct frame_head bufhead[2]; diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index b9a810b33261..a2e83afbe3e1 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -69,7 +69,7 @@ static unsigned long reset_value[NUM_COUNTERS]; data. Should then be added to linux/oprofile.h. */ extern void oprofile_add_ibs_sample(struct pt_regs *const regs, - unsigned int * const ibs_sample, int ibs_code); + unsigned int *const ibs_sample, int ibs_code); struct ibs_fetch_sample { /* MSRC001_1031 IBS Fetch Linear Address Register */ @@ -469,9 +469,9 @@ static void clear_ibs_nmi(void) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } -static int (*create_arch_files)(struct super_block * sb, struct dentry * root); +static int (*create_arch_files)(struct super_block *sb, struct dentry *root); -static int setup_ibs_files(struct super_block * sb, struct dentry * root) +static int setup_ibs_files(struct super_block *sb, struct dentry *root) { struct dentry *dir; int ret = 0; diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h index 05a0261ba0c3..24ccdebf3ac1 100644 --- a/arch/x86/oprofile/op_x86_model.h +++ b/arch/x86/oprofile/op_x86_model.h @@ -22,8 +22,8 @@ struct op_msr { }; struct op_msrs { - struct op_msr * counters; - struct op_msr * controls; + struct op_msr *counters; + struct op_msr *controls; }; struct pt_regs; -- cgit v1.2.2 From 69046d430417c30f48867fc52e892c9050b3a29b Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 5 Sep 2008 12:17:40 +0200 Subject: x86/oprofile: reordering functions in nmi_int.c No functional changes. The intension is to remove static function declarations. Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 147 +++++++++++++++++++++----------------------- 1 file changed, 71 insertions(+), 76 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 57f6c9088081..370d832f398d 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -28,85 +28,9 @@ static struct op_x86_model_spec const *model; static DEFINE_PER_CPU(struct op_msrs, cpu_msrs); static DEFINE_PER_CPU(unsigned long, saved_lvtpc); -static int nmi_start(void); -static void nmi_stop(void); -static void nmi_cpu_start(void *dummy); -static void nmi_cpu_stop(void *dummy); - /* 0 == registered but off, 1 == registered and on */ static int nmi_enabled = 0; -#ifdef CONFIG_SMP -static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - switch (action) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block oprofile_cpu_nb = { - .notifier_call = oprofile_cpu_notifier -}; -#endif - -#ifdef CONFIG_PM - -static int nmi_suspend(struct sys_device *dev, pm_message_t state) -{ - /* Only one CPU left, just stop that one */ - if (nmi_enabled == 1) - nmi_cpu_stop(NULL); - return 0; -} - -static int nmi_resume(struct sys_device *dev) -{ - if (nmi_enabled == 1) - nmi_cpu_start(NULL); - return 0; -} - -static struct sysdev_class oprofile_sysclass = { - .name = "oprofile", - .resume = nmi_resume, - .suspend = nmi_suspend, -}; - -static struct sys_device device_oprofile = { - .id = 0, - .cls = &oprofile_sysclass, -}; - -static int __init init_sysfs(void) -{ - int error; - - error = sysdev_class_register(&oprofile_sysclass); - if (!error) - error = sysdev_register(&device_oprofile); - return error; -} - -static void exit_sysfs(void) -{ - sysdev_unregister(&device_oprofile); - sysdev_class_unregister(&oprofile_sysclass); -} - -#else -#define init_sysfs() do { } while (0) -#define exit_sysfs() do { } while (0) -#endif /* CONFIG_PM */ - static int profile_exceptions_notify(struct notifier_block *self, unsigned long val, void *data) { @@ -361,6 +285,77 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root) return 0; } +#ifdef CONFIG_SMP +static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, + void *data) +{ + int cpu = (unsigned long)data; + switch (action) { + case CPU_DOWN_FAILED: + case CPU_ONLINE: + smp_call_function_single(cpu, nmi_cpu_start, NULL, 0); + break; + case CPU_DOWN_PREPARE: + smp_call_function_single(cpu, nmi_cpu_stop, NULL, 1); + break; + } + return NOTIFY_DONE; +} + +static struct notifier_block oprofile_cpu_nb = { + .notifier_call = oprofile_cpu_notifier +}; +#endif + +#ifdef CONFIG_PM + +static int nmi_suspend(struct sys_device *dev, pm_message_t state) +{ + /* Only one CPU left, just stop that one */ + if (nmi_enabled == 1) + nmi_cpu_stop(NULL); + return 0; +} + +static int nmi_resume(struct sys_device *dev) +{ + if (nmi_enabled == 1) + nmi_cpu_start(NULL); + return 0; +} + +static struct sysdev_class oprofile_sysclass = { + .name = "oprofile", + .resume = nmi_resume, + .suspend = nmi_suspend, +}; + +static struct sys_device device_oprofile = { + .id = 0, + .cls = &oprofile_sysclass, +}; + +static int __init init_sysfs(void) +{ + int error; + + error = sysdev_class_register(&oprofile_sysclass); + if (!error) + error = sysdev_register(&device_oprofile); + return error; +} + +static void exit_sysfs(void) +{ + sysdev_unregister(&device_oprofile); + sysdev_class_unregister(&oprofile_sysclass); +} + +#else +#define init_sysfs() do { } while (0) +#define exit_sysfs() do { } while (0) +#endif /* CONFIG_PM */ + static int p4force; module_param(p4force, int, 0); -- cgit v1.2.2 From 5f87dfb79f829339508a5d989b8252eb30842587 Mon Sep 17 00:00:00 2001 From: Suravee Suthikulpanit Date: Wed, 15 Oct 2008 08:15:51 -0500 Subject: x86/oprofile: add the logic for enabling additional IBS bits This patch adds the logic for enabling additional IBS control bits : * IBS-Fetch IbsRandEn bit (bit 57) * IBS-Op IbsOpCntCtl bit (bit 19) Signed-off-by: Suravee Suthikulpanit Signed-off-by: Robert Richter --- arch/x86/oprofile/op_model_amd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index a2e83afbe3e1..509513760a6e 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -310,12 +310,15 @@ static void op_amd_start(struct op_msrs const * const msrs) #ifdef CONFIG_OPROFILE_IBS if (ibs_allowed && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; - high = IBS_FETCH_HIGH_ENABLE; + high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + + IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } if (ibs_allowed && ibs_config.op_enabled) { - low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + IBS_OP_LOW_ENABLE; + low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + + IBS_OP_LOW_ENABLE; high = 0; wrmsr(MSR_AMD64_IBSOPCTL, low, high); } -- cgit v1.2.2 From 1339c367a842a9fc34b8fcfa1abadb07b11848ad Mon Sep 17 00:00:00 2001 From: Pavel Vasilyev Date: Wed, 15 Oct 2008 17:33:48 -0400 Subject: fix CONFIG_MMCONFIG=n build warning arch/x86/kernel/acpi/boot.c:100: warning: 'acpi_mcfg_64bit_base_addr' defined but not used http://bugzilla.kernel.org/show_bug.cgi?id=11743 Signed-off-by: Pavel Vasilyev Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index c102af85df9c..0c2742f8c4da 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -97,7 +97,6 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #warning ACPI uses CMPXCHG, i486 and later hardware #endif -static int acpi_mcfg_64bit_base_addr __initdata = FALSE; /* -------------------------------------------------------------------------- Boot-time Configuration @@ -156,6 +155,9 @@ char *__init __acpi_map_table(unsigned long phys, unsigned long size) } #ifdef CONFIG_PCI_MMCONFIG + +static int acpi_mcfg_64bit_base_addr __initdata = FALSE; + /* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ struct acpi_mcfg_allocation *pci_mmcfg_config; int pci_mmcfg_config_num; -- cgit v1.2.2 From fe648be019119722ec0ac54e3a4b2e5bf5168589 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:41 -0700 Subject: x86: add after_bootmem flag for 32bit to prepare to use dyn_array support etc. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8396868e82c5..91343c2694b4 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -66,6 +66,7 @@ static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; +int after_bootmem; static __init void *alloc_low_page(unsigned long *phys) { @@ -988,6 +989,8 @@ void __init mem_init(void) set_highmem_pages_init(); + after_bootmem = 1; + codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; -- cgit v1.2.2 From 1f3fcd4b1adc972d5c6a34cfed98931c46575b49 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:44 -0700 Subject: add per_cpu_dyn_array support allow dyn-array in per_cpu area, allocated dynamically. usage: | /* in .h */ | struct kernel_stat { | struct cpu_usage_stat cpustat; | unsigned int *irqs; | }; | | /* in .c */ | DEFINE_PER_CPU(struct kernel_stat, kstat); | | DEFINE_PER_CPU_DYN_ARRAY_ADDR(per_cpu__kstat_irqs, per_cpu__kstat.irqs, sizeof(unsigned int), nr_irqs, sizeof(unsigned long), NULL); after setup_percpu()/per_cpu_alloc_dyn_array(), the dyn_array in per_cpu area is ready to use. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0e67f72d9316..13ba7a83808d 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size = PERCPU_ENOUGH_ROOM; + ssize_t size, old_size; char *ptr; int cpu; @@ -148,7 +148,8 @@ void __init setup_per_cpu_areas(void) setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ - size = PERCPU_ENOUGH_ROOM; + old_size = PERCPU_ENOUGH_ROOM; + size = old_size + per_cpu_dyn_array_size(); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -176,6 +177,8 @@ void __init setup_per_cpu_areas(void) per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); + per_cpu_alloc_dyn_array(cpu, ptr + old_size); + } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", -- cgit v1.2.2 From 1f8ff037a871690c762d267d8a052529d3102fc9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:45 -0700 Subject: x86: alloc dyn_array all together so could spare some memory with small alignment in bootmem also tighten the alignment checking, and make print out less debug info. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 13ba7a83808d..2b7dab699e83 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,26 +140,31 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size; + ssize_t size, old_size, da_size; char *ptr; int cpu; + unsigned long align = 1; /* Setup cpu_pda map */ setup_cpu_pda_map(); /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - size = old_size + per_cpu_dyn_array_size(); + da_size = per_cpu_dyn_array_size(&align); + align = max_t(unsigned long, PAGE_SIZE, align); + size = roundup(old_size + da_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); for_each_possible_cpu(cpu) { #ifndef CONFIG_NEED_MULTIPLE_NODES - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); #else int node = early_cpu_to_node(cpu); if (!node_online(node) || !NODE_DATA(node)) { - ptr = alloc_bootmem_pages(size); + ptr = __alloc_bootmem(size, align, + __pa(MAX_DMA_ADDRESS)); printk(KERN_INFO "cpu %d has no node %d or node-local memory\n", cpu, node); @@ -168,7 +173,8 @@ void __init setup_per_cpu_areas(void) cpu, __pa(ptr)); } else { - ptr = alloc_bootmem_pages_node(NODE_DATA(node), size); + ptr = __alloc_bootmem_node(NODE_DATA(node), size, align, + __pa(MAX_DMA_ADDRESS)); if (ptr) printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n", cpu, node, __pa(ptr)); -- cgit v1.2.2 From 6da55c3e8da88e8a7cb6452160776ad6706798ad Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:46 -0700 Subject: x86: enable dyn_array support Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/vmlinux_32.lds.S | 1 + arch/x86/kernel/vmlinux_64.lds.S | 3 +++ 3 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d573..42f98009d752 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,6 +33,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS + select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index a9b8560adbc2..c36007ab3940 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,6 +145,7 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } + DYN_ARRAY_INIT(8) SECURITY_INIT . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 46e05447405b..30973dbac8c2 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -173,6 +173,9 @@ SECTIONS *(.x86_cpu_dev.init) } __x86_cpu_dev_end = .; + + DYN_ARRAY_INIT(8) + SECURITY_INIT . = ALIGN(8); -- cgit v1.2.2 From 0799e432acfda879eaeef9622426bfa1434f3786 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:49:48 -0700 Subject: x86: use nr_irqs also add first_free_entry and pin_map_size, which were NR_IRQS derived constants. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 26 ++++++++++++++------------ arch/x86/kernel/io_apic_64.c | 33 +++++++++++++++++---------------- arch/x86/kernel/irq_32.c | 8 ++++---- arch/x86/kernel/irq_64.c | 8 ++++---- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 2 +- 6 files changed, 41 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index e710289f673e..d382990244f0 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -70,6 +70,7 @@ int timer_through_8259 __initdata; */ int sis_apic_bug = -1; +int first_free_entry = NR_IRQS; /* * # of IRQ routing registers */ @@ -100,6 +101,8 @@ static int disable_timer_pin_1 __initdata; #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int pin_map_size = PIN_MAP_SIZE; + /* * This is performance-critical, we want to do it O(1) * @@ -213,7 +216,6 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; while (entry->next) @@ -222,7 +224,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) if (entry->pin != -1) { entry->next = first_free_entry; entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) + if (++first_free_entry >= pin_map_size) panic("io_apic.c: whoops"); } entry->apic = apic; @@ -457,7 +459,7 @@ static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) int i, j; for_each_online_cpu(i) { - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { if (!irq_desc[j].action) continue; /* Is it a significant load ? */ @@ -492,7 +494,7 @@ static void do_irq_balance(void) if (!cpu_online(i)) continue; package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { unsigned long value_now, delta; /* Is this an active IRQ or balancing disabled ? */ if (!irq_desc[j].action || irq_balancing_disabled(j)) @@ -587,7 +589,7 @@ tryanotherirq: */ move_this_load = 0; selected_irq = -1; - for (j = 0; j < NR_IRQS; j++) { + for (j = 0; j < nr_irqs; j++) { /* Is this an active IRQ? */ if (!irq_desc[j].action) continue; @@ -664,7 +666,7 @@ static int balanced_irq(void *unused) long time_remaining = balanced_irq_interval; /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < NR_IRQS ; i++) { + for (i = 0 ; i < nr_irqs ; i++) { irq_desc[i].pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -712,8 +714,8 @@ static int __init balanced_irq_init(void) physical_balance = 1; for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * NR_IRQS, GFP_KERNEL); + irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); + irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { printk(KERN_ERR "balanced_irq_init: out of memory"); goto failed; @@ -1441,7 +1443,7 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; @@ -1621,7 +1623,7 @@ static void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { + for (i = 0; i < pin_map_size; i++) { irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } @@ -2005,7 +2007,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { + for (irq = 0; irq < nr_irqs ; irq++) { if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { /* * Hmm.. We don't have an entry for this, @@ -2449,7 +2451,7 @@ int create_irq(void) irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; if (irq_vector[new] != 0) diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 02063ae042f7..448384c7c1e8 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -132,6 +132,7 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); #define MAX_PLUS_SHARED_IRQS NR_IRQS #define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) +int pin_map_size = PIN_MAP_SIZE; /* * This is performance-critical, we want to do it O(1) * @@ -224,7 +225,7 @@ static inline void io_apic_sync(unsigned int apic) int pin; \ struct irq_pin_list *entry = irq_2_pin + irq; \ \ - BUG_ON(irq >= NR_IRQS); \ + BUG_ON(irq >= nr_irqs); \ for (;;) { \ unsigned int reg; \ pin = entry->pin; \ @@ -301,7 +302,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) int apic, pin; struct irq_pin_list *entry = irq_2_pin + irq; - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); for (;;) { unsigned int reg; apic = entry->apic; @@ -358,19 +359,19 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ +int first_free_entry = NR_IRQS; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - static int first_free_entry = NR_IRQS; struct irq_pin_list *entry = irq_2_pin + irq; - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); while (entry->next) entry = irq_2_pin + entry->next; if (entry->pin != -1) { entry->next = first_free_entry; entry = irq_2_pin + entry->next; - if (++first_free_entry >= PIN_MAP_SIZE) + if (++first_free_entry >= pin_map_size) panic("io_apic.c: ran out of irq_2_pin entries!"); } entry->apic = apic; @@ -634,7 +635,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= NR_IRQS); + BUG_ON(best_guess >= nr_irqs); return best_guess; } @@ -766,7 +767,7 @@ static int pin_2_irq(int idx, int apic, int pin) irq += nr_ioapic_registers[i++]; irq += pin; } - BUG_ON(irq >= NR_IRQS); + BUG_ON(irq >= nr_irqs); return irq; } @@ -801,7 +802,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)irq >= nr_irqs); cfg = &irq_cfg[irq]; /* Only try and allocate irqs on cpus that are present */ @@ -875,7 +876,7 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= NR_IRQS); + BUG_ON((unsigned)irq >= nr_irqs); cfg = &irq_cfg[irq]; BUG_ON(!cfg->vector); @@ -895,7 +896,7 @@ void __setup_vector_irq(int cpu) int irq, vector; /* Mark the inuse vectors */ - for (irq = 0; irq < NR_IRQS; ++irq) { + for (irq = 0; irq < nr_irqs; ++irq) { if (!cpu_isset(cpu, irq_cfg[irq].domain)) continue; vector = irq_cfg[irq].vector; @@ -1193,7 +1194,7 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { struct irq_pin_list *entry = irq_2_pin + i; if (entry->pin < 0) continue; @@ -1366,7 +1367,7 @@ void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < PIN_MAP_SIZE; i++) { + for (i = 0; i < pin_map_size; i++) { irq_2_pin[i].pin = -1; irq_2_pin[i].next = 0; } @@ -1658,7 +1659,7 @@ static void ir_irq_migration(struct work_struct *work) { int irq; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { struct irq_desc *desc = irq_desc + irq; if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1707,7 +1708,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= NR_IRQS) + if (irq >= nr_irqs) continue; desc = irq_desc + irq; @@ -1865,7 +1866,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < NR_IRQS ; irq++) { + for (irq = 0; irq < nr_irqs ; irq++) { if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { /* * Hmm.. We don't have an entry for this, @@ -2279,7 +2280,7 @@ int create_irq(void) irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); - for (new = (NR_IRQS - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; if (irq_cfg[new].vector != 0) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b71e02d42f4f..4c7ffb32854c 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -226,7 +226,7 @@ unsigned int do_IRQ(struct pt_regs *regs) int overflow, irq = ~regs->orig_ax; struct irq_desc *desc = irq_desc + irq; - if (unlikely((unsigned)irq >= NR_IRQS)) { + if (unlikely((unsigned)irq >= nr_irqs)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __func__, irq); BUG(); @@ -271,7 +271,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < NR_IRQS) { + if (i < nr_irqs) { unsigned any_count = 0; spin_lock_irqsave(&irq_desc[i].lock, flags); @@ -303,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { + } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); @@ -396,7 +396,7 @@ void fixup_irqs(cpumask_t map) unsigned int irq; static int warned; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; if (irq == 2) continue; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f065fe9071b9..e1f0839430d2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -81,7 +81,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < NR_IRQS) { + if (i < nr_irqs) { unsigned any_count = 0; spin_lock_irqsave(&irq_desc[i].lock, flags); @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == NR_IRQS) { + } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); @@ -201,7 +201,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < NR_IRQS)) + if (likely(irq < nr_irqs)) generic_handle_irq(irq); else { if (!disable_apic) @@ -224,7 +224,7 @@ void fixup_irqs(cpumask_t map) unsigned int irq; static int warned; - for (irq = 0; irq < NR_IRQS; irq++) { + for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9200a1e2752d..65c1c9507707 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -100,7 +100,7 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= NR_IRQS) + if (i >= nr_irqs) break; /* SYSCALL_VECTOR was reserved in trap_init. */ if (!test_bit(vector, used_vectors)) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 5b5be9d43c2a..165c5d9b0d1a 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,7 +142,7 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < NR_IRQS; i++) { + for (i = 0; i < nr_irqs; i++) { irq_desc[i].status = IRQ_DISABLED; irq_desc[i].action = NULL; irq_desc[i].depth = 1; -- cgit v1.2.2 From 301e619020dd67bde7e7e64bb9ffb7f30d26c979 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:02 -0700 Subject: x86: use dyn_array in io_apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 54 +++++++++++++++++++++++++++++++++++--------- arch/x86/kernel/io_apic_64.c | 28 +++++++++++++++++------ arch/x86/kernel/setup.c | 6 +++++ 3 files changed, 70 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index d382990244f0..7f2bcc3dad82 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -70,7 +70,7 @@ int timer_through_8259 __initdata; */ int sis_apic_bug = -1; -int first_free_entry = NR_IRQS; +int first_free_entry; /* * # of IRQ routing registers */ @@ -98,10 +98,7 @@ static int disable_timer_pin_1 __initdata; * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) - -int pin_map_size = PIN_MAP_SIZE; +int pin_map_size; /* * This is performance-critical, we want to do it O(1) @@ -112,7 +109,9 @@ int pin_map_size = PIN_MAP_SIZE; static struct irq_pin_list { int apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; +} *irq_2_pin; + +DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, 16, NULL); struct io_apic { unsigned int index; @@ -403,9 +402,28 @@ static struct irq_cpu_info { #define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) -static cpumask_t balance_irq_affinity[NR_IRQS] = { - [0 ... NR_IRQS-1] = CPU_MASK_ALL -}; +static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL; + +static cpumask_t *balance_irq_affinity; + + +static void __init irq_affinity_init_work(void *data) +{ + struct dyn_array *da = data; + + int i; + struct balance_irq_affinity *affinity; + + affinity = *da->name; + + for (i = 0; i < *da->nr; i++) + memcpy(&affinity[i], &balance_irq_affinity_init, + sizeof(struct balance_irq_affinity)); + +} + +DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work); + void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) { @@ -1170,14 +1188,28 @@ static inline int IO_APIC_irq_trigger(int irq) } /* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector[NR_IRQ_VECTORS] __read_mostly = { FIRST_DEVICE_VECTOR , 0 }; +static u8 irq_vector_init_first __initdata = FIRST_DEVICE_VECTOR; +static u8 *irq_vector; + +static void __init irq_vector_init_work(void *data) +{ + struct dyn_array *da = data; + + u8 *irq_vec; + + irq_vec = *da->name; + + irq_vec[0] = irq_vector_init_first; +} + +DEFINE_DYN_ARRAY(irq_vector, sizeof(u8), nr_irqs, PAGE_SIZE, irq_vector_init_work); static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, current_offset; int vector, offset; - BUG_ON((unsigned)irq >= NR_IRQ_VECTORS); + BUG_ON((unsigned)irq >= nr_irqs); if (irq_vector[irq] > 0) return irq_vector[irq]; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 448384c7c1e8..93a3ffabfe6a 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -66,7 +66,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { +static struct irq_cfg irq_cfg_legacy[] __initdata = { [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, @@ -85,6 +85,17 @@ static struct irq_cfg irq_cfg[NR_IRQS] __read_mostly = { [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; +static struct irq_cfg *irq_cfg; + +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + + memcpy(*da->name, irq_cfg_legacy, sizeof(irq_cfg_legacy)); +} + +DEFINE_DYN_ARRAY(irq_cfg, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); + static int assign_irq_vector(int irq, cpumask_t mask); int first_system_vector = 0xfe; @@ -129,10 +140,9 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); * Rough estimation of how many shared IRQs there are, can * be changed anytime. */ -#define MAX_PLUS_SHARED_IRQS NR_IRQS -#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) -int pin_map_size = PIN_MAP_SIZE; +int pin_map_size; + /* * This is performance-critical, we want to do it O(1) * @@ -141,8 +151,12 @@ int pin_map_size = PIN_MAP_SIZE; */ static struct irq_pin_list { - short apic, pin, next; -} irq_2_pin[PIN_MAP_SIZE]; + short apic, pin; + int next; +} *irq_2_pin; + +DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, sizeof(struct irq_pin_list), NULL); + struct io_apic { unsigned int index; @@ -359,7 +373,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -int first_free_entry = NR_IRQS; +int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry = irq_2_pin + irq; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2255782e8d4b..558ec26b08e2 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1067,9 +1067,15 @@ void __init setup_arch(char **cmdline_p) #endif prefill_possible_map(); + #ifdef CONFIG_X86_64 + /* need to wait for nr_cpu_ids settle down */ + if (nr_irqs == NR_IRQS) + nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif + pin_map_size = nr_irqs * 2; + first_free_entry = nr_irqs; init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.2 From a84488c213a8cfc29200344a6fb6357d48c8ed85 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 Aug 2008 20:50:31 -0700 Subject: irq: sparse irqs, fix #3 fix non-APIC UP build: arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `pin_map_size' arch/x86/kernel/built-in.o: In function `setup_arch': : undefined reference to `first_free_entry' Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 558ec26b08e2..02e3a6697977 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1074,8 +1074,10 @@ void __init setup_arch(char **cmdline_p) nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif +#ifdef CONFIG_X86_IO_APIC pin_map_size = nr_irqs * 2; first_free_entry = nr_irqs; +#endif init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.2 From 71f521bbaf375b685aeea20c6b0ed8600cd6edfe Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:03 -0700 Subject: x86, irq: get nr_irqs from madt Until now, NR_IRQS was derived from black magic defines that had to be "large enough" to both accomodate NR_CPUS and MAX_NR_IO_APICs. This resulted in a way too large irq_desc[] array on most x86 systems. Especially with larger CPU masks, the size of irq_desc can spiral out of control quickly. So be smarter about it and use precise allocation instead: determine the default maximum possible IRQ number from the ACPI MADT. Use a minimum limit of at least 32 IRQs for broken BIOSes. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index eb875cdc7367..3e9d163fd92f 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -957,6 +957,29 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } +int get_nr_irqs_via_madt(void) +{ + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) { + if (mp_ioapic_routing[idx].gsi_end > nr) + nr = mp_ioapic_routing[idx].gsi_end; + } + + nr++; + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ? */ + if (nr < 32) + nr = 32; + + return nr; + +} + static void assign_to_mp_irq(struct mp_config_intsrc *m, struct mp_config_intsrc *mp_irq) { @@ -1254,9 +1277,12 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } + + nr_irqs = get_nr_irqs_via_madt(); + count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing interrupt source overrides entry\n"); @@ -1276,7 +1302,7 @@ static int __init acpi_parse_madt_ioapic_entries(void) count = acpi_table_parse_madt(ACPI_MADT_TYPE_NMI_SOURCE, acpi_parse_nmi_src, - NR_IRQ_VECTORS); + nr_irqs); if (count < 0) { printk(KERN_ERR PREFIX "Error parsing NMI SRC entry\n"); /* TBD: Cleanup to allow fallback to MPS */ -- cgit v1.2.2 From 08678b0841267c1d00d771fe01548d86043d065e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:05 -0700 Subject: generic: sparse irqs: use irq_desc() together with dyn_array, instead of irq_desc[] add CONFIG_HAVE_SPARSE_IRQ to for use condensed array. Get rid of irq_desc[] array assumptions. Preallocate 32 irq_desc, and irq_desc() will try to get more. ( No change in functionality is expected anywhere, except the odd build failure where we missed a code site or where a crossing commit itroduces new irq_desc[] usage. ) v2: according to Eric, change get_irq_desc() to irq_desc() Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/kernel/io_apic_32.c | 46 +++++++++++++++++------ arch/x86/kernel/io_apic_64.c | 75 ++++++++++++++++++++++++------------- arch/x86/kernel/irq_32.c | 24 +++++++----- arch/x86/kernel/irq_64.c | 35 +++++++++-------- arch/x86/kernel/irqinit_64.c | 10 +++-- arch/x86/kernel/visws_quirks.c | 30 ++++++++------- arch/x86/mach-voyager/voyager_smp.c | 4 +- 8 files changed, 142 insertions(+), 83 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 42f98009d752..1004888e9b13 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,6 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY + select HAVE_SPARSE_IRQ if X86_64 config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 7f2bcc3dad82..c2160cfdec9b 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry = irq_2_pin + irq; unsigned int apicid_value; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) @@ -365,7 +366,8 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = irq_2_pin + entry->next; } - irq_desc[irq].affinity = cpumask; + desc = irq_to_desc(irq); + desc->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -475,10 +477,12 @@ static inline void balance_irq(int cpu, int irq) static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) { int i, j; + struct irq_desc *desc; for_each_online_cpu(i) { for (j = 0; j < nr_irqs; j++) { - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; /* Is it a significant load ? */ if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < @@ -505,6 +509,7 @@ static void do_irq_balance(void) unsigned long tmp_cpu_irq; unsigned long imbalance = 0; cpumask_t allowed_mask, target_cpu_mask, tmp; + struct irq_desc *desc; for_each_possible_cpu(i) { int package_index; @@ -515,7 +520,8 @@ static void do_irq_balance(void) for (j = 0; j < nr_irqs; j++) { unsigned long value_now, delta; /* Is this an active IRQ or balancing disabled ? */ - if (!irq_desc[j].action || irq_balancing_disabled(j)) + desc = irq_to_desc(j); + if (!desc->action || irq_balancing_disabled(j)) continue; if (package_index == i) IRQ_DELTA(package_index, j) = 0; @@ -609,7 +615,8 @@ tryanotherirq: selected_irq = -1; for (j = 0; j < nr_irqs; j++) { /* Is this an active IRQ? */ - if (!irq_desc[j].action) + desc = irq_to_desc(j); + if (!desc->action) continue; if (imbalance <= IRQ_DELTA(max_loaded, j)) continue; @@ -682,10 +689,12 @@ static int balanced_irq(void *unused) int i; unsigned long prev_balance_time = jiffies; long time_remaining = balanced_irq_interval; + struct irq_desc *desc; /* push everything to CPU 0 to give us a starting point. */ for (i = 0 ; i < nr_irqs ; i++) { - irq_desc[i].pending_mask = cpumask_of_cpu(0); + desc = irq_to_desc(i); + desc->pending_mask = cpumask_of_cpu(0); set_pending_irq(i, cpumask_of_cpu(0)); } @@ -1254,13 +1263,16 @@ static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); } else { - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } @@ -2027,6 +2039,7 @@ static struct irq_chip ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -2048,9 +2061,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -2089,7 +2104,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq, int vector) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); set_intr_gate(vector, interrupt[irq]); @@ -2556,6 +2574,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2575,7 +2594,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2649,6 +2669,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2659,7 +2680,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 93a3ffabfe6a..cab5a25d81b1 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -345,6 +345,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -361,9 +362,10 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } #endif @@ -933,14 +935,17 @@ static struct irq_chip ir_ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) { + struct irq_desc *desc; + + desc = irq_to_desc(irq); if (trigger) - irq_desc[irq].status |= IRQ_LEVEL; + desc->status |= IRQ_LEVEL; else - irq_desc[irq].status &= ~IRQ_LEVEL; + desc->status &= ~IRQ_LEVEL; #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - irq_desc[irq].status |= IRQ_MOVE_PCNTXT; + desc->status |= IRQ_MOVE_PCNTXT; if (trigger) set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, handle_fasteoi_irq, @@ -1596,10 +1601,10 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); static void migrate_ioapic_irq(int irq, cpumask_t mask) { struct irq_cfg *cfg = irq_cfg + irq; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; - int modify_ioapic_rte = desc->status & IRQ_LEVEL; + int modify_ioapic_rte; unsigned int dest; unsigned long flags; @@ -1616,6 +1621,8 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; if (modify_ioapic_rte) { spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); @@ -1637,12 +1644,13 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc->affinity = mask; } static int migrate_irq_remapped_level(int irq) { int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); mask_IO_APIC_irq(irq); @@ -1658,11 +1666,11 @@ static int migrate_irq_remapped_level(int irq) } /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, irq_desc[irq].pending_mask); + migrate_ioapic_irq(irq, desc->pending_mask); ret = 0; - irq_desc[irq].status &= ~IRQ_MOVE_PENDING; - cpus_clear(irq_desc[irq].pending_mask); + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); unmask: unmask_IO_APIC_irq(irq); @@ -1674,7 +1682,7 @@ static void ir_irq_migration(struct work_struct *work) int irq; for (irq = 0; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; @@ -1686,8 +1694,7 @@ static void ir_irq_migration(struct work_struct *work) continue; } - desc->chip->set_affinity(irq, - irq_desc[irq].pending_mask); + desc->chip->set_affinity(irq, desc->pending_mask); spin_unlock_irqrestore(&desc->lock, flags); } } @@ -1698,9 +1705,11 @@ static void ir_irq_migration(struct work_struct *work) */ static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - if (irq_desc[irq].status & IRQ_LEVEL) { - irq_desc[irq].status |= IRQ_MOVE_PENDING; - irq_desc[irq].pending_mask = mask; + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; migrate_irq_remapped_level(irq); return; } @@ -1725,7 +1734,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) if (irq >= nr_irqs) continue; - desc = irq_desc + irq; + desc = irq_to_desc(irq); cfg = irq_cfg + irq; spin_lock(&desc->lock); if (!cfg->move_cleanup_count) @@ -1791,7 +1800,7 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ /* If we are moving the irq we need to mask it */ - if (unlikely(irq_desc[irq].status & IRQ_MOVE_PENDING)) { + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { do_unmask_irq = 1; mask_IO_APIC_irq(irq); } @@ -1868,6 +1877,7 @@ static struct irq_chip ir_ioapic_chip __read_mostly = { static inline void init_IO_APIC_traps(void) { int irq; + struct irq_desc *desc; /* * NOTE! The local APIC isn't very good at handling @@ -1889,9 +1899,11 @@ static inline void init_IO_APIC_traps(void) */ if (irq < 16) make_8259A_irq(irq); - else + else { + desc = irq_to_desc(irq); /* Strange. Oh, well.. */ - irq_desc[irq].chip = &no_irq_chip; + desc->chip = &no_irq_chip; + } } } } @@ -1926,7 +1938,10 @@ static struct irq_chip lapic_chip __read_mostly = { static void lapic_register_intr(int irq) { - irq_desc[irq].status &= ~IRQ_LEVEL; + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); } @@ -2402,6 +2417,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2421,7 +2437,8 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #ifdef CONFIG_INTR_REMAP @@ -2435,6 +2452,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2469,7 +2487,8 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) cfg->move_in_progress = 0; } - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif #endif /* CONFIG_SMP */ @@ -2543,7 +2562,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #ifdef CONFIG_INTR_REMAP if (irq_remapped(irq)) { - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); /* * irq migration in process context */ @@ -2655,6 +2674,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2674,7 +2694,8 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); dmar_msi_write(irq, &msg); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2731,6 +2752,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) struct irq_cfg *cfg = irq_cfg + irq; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2743,7 +2765,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_desc[irq].affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 4c7ffb32854c..ede513be517d 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,7 +224,7 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_desc + irq; + struct irq_desc *desc = irq_to_desc(irq); if (unlikely((unsigned)irq >= nr_irqs)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", @@ -273,15 +273,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -291,8 +292,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -302,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -398,17 +399,20 @@ void fixup_irqs(cpumask_t map) for (irq = 0; irq < nr_irqs; irq++) { cpumask_t mask; + struct irq_desc *desc; + if (irq == 2) continue; - cpus_and(mask, irq_desc[irq].affinity, map); + desc = irq_to_desc(irq); + cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); mask = map; } - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); - else if (irq_desc[irq].action && !(warned++)) + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); + else if (desc->action && !(warned++)) printk("Cannot set affinity for irq %i\n", irq); } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e1f0839430d2..738eb65a924e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,15 +83,16 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; + struct irq_desc *desc = irq_to_desc(i); - spin_lock_irqsave(&irq_desc[i].lock, flags); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP any_count = kstat_irqs(i); #else for_each_online_cpu(j) any_count |= kstat_cpu(j).irqs[i]; #endif - action = irq_desc[i].action; + action = desc->action; if (!action && !any_count) goto skip; seq_printf(p, "%3d: ",i); @@ -101,8 +102,8 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); #endif - seq_printf(p, " %8s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); if (action) { seq_printf(p, " %s", action->name); @@ -111,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) } seq_putc(p, '\n'); skip: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); + spin_unlock_irqrestore(&desc->lock, flags); } else if (i == nr_irqs) { seq_printf(p, "NMI: "); for_each_online_cpu(j) @@ -228,37 +229,39 @@ void fixup_irqs(cpumask_t map) cpumask_t mask; int break_affinity = 0; int set_affinity = 1; + struct irq_desc *desc; if (irq == 2) continue; + desc = irq_to_desc(irq); /* interrupt's are disabled at this point */ - spin_lock(&irq_desc[irq].lock); + spin_lock(&desc->lock); if (!irq_has_action(irq) || - cpus_equal(irq_desc[irq].affinity, map)) { - spin_unlock(&irq_desc[irq].lock); + cpus_equal(desc->affinity, map)) { + spin_unlock(&desc->lock); continue; } - cpus_and(mask, irq_desc[irq].affinity, map); + cpus_and(mask, desc->affinity, map); if (cpus_empty(mask)) { break_affinity = 1; mask = map; } - if (irq_desc[irq].chip->mask) - irq_desc[irq].chip->mask(irq); + if (desc->chip->mask) + desc->chip->mask(irq); - if (irq_desc[irq].chip->set_affinity) - irq_desc[irq].chip->set_affinity(irq, mask); + if (desc->chip->set_affinity) + desc->chip->set_affinity(irq, mask); else if (!(warned++)) set_affinity = 0; - if (irq_desc[irq].chip->unmask) - irq_desc[irq].chip->unmask(irq); + if (desc->chip->unmask) + desc->chip->unmask(irq); - spin_unlock(&irq_desc[irq].lock); + spin_unlock(&desc->lock); if (break_affinity && set_affinity) printk("Broke affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 165c5d9b0d1a..0744b49b4d12 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,9 +143,11 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < nr_irqs; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = NULL; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; if (i < 16) { /* @@ -157,7 +159,7 @@ void __init init_ISA_irqs(void) /* * 'high' PCI IRQs filled in on demand */ - irq_desc[i].chip = &no_irq_chip; + desc->chip = &no_irq_chip; } } } diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 61a97e616f70..9d85ab384435 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -484,10 +484,11 @@ static void disable_cobalt_irq(unsigned int irq) static unsigned int startup_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if ((irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) - irq_desc[irq].status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); + if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) + desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING); enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); return 0; @@ -506,9 +507,10 @@ static void ack_cobalt_irq(unsigned int irq) static void end_cobalt_irq(unsigned int irq) { unsigned long flags; + struct irq_desc *desc = irq_to_desc(irq); spin_lock_irqsave(&cobalt_lock, flags); - if (!(irq_desc[irq].status & (IRQ_DISABLED | IRQ_INPROGRESS))) + if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS))) enable_cobalt_irq(irq); spin_unlock_irqrestore(&cobalt_lock, flags); } @@ -626,7 +628,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) spin_unlock_irqrestore(&i8259A_lock, flags); - desc = irq_desc + realirq; + desc = irq_to_desc(realirq); /* * handle this 'virtual interrupt' as a Cobalt one now. @@ -662,27 +664,29 @@ void init_VISWS_APIC_irqs(void) int i; for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { - irq_desc[i].status = IRQ_DISABLED; - irq_desc[i].action = 0; - irq_desc[i].depth = 1; + struct irq_desc *desc = irq_to_desc(i); + + desc->status = IRQ_DISABLED; + desc->action = 0; + desc->depth = 1; if (i == 0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE0) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_IDE1) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } else if (i == CO_IRQ_8259) { - irq_desc[i].chip = &piix4_master_irq_type; + desc->chip = &piix4_master_irq_type; } else if (i < CO_IRQ_APIC0) { - irq_desc[i].chip = &piix4_virtual_irq_type; + desc->chip = &piix4_virtual_irq_type; } else if (IS_CO_APIC(i)) { - irq_desc[i].chip = &cobalt_irq_type; + desc->chip = &cobalt_irq_type; } } diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 199a5f4a873c..0f6e8a6523ae 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1483,7 +1483,7 @@ static void disable_local_vic_irq(unsigned int irq) * the interrupt off to another CPU */ static void before_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); __u8 cpu = smp_processor_id(); _raw_spin_lock(&vic_irq_lock); @@ -1518,7 +1518,7 @@ static void before_handle_vic_irq(unsigned int irq) /* Finish the VIC interrupt: basically mask */ static void after_handle_vic_irq(unsigned int irq) { - irq_desc_t *desc = irq_desc + irq; + irq_desc_t *desc = irq_to_desc(irq); _raw_spin_lock(&vic_irq_lock); { -- cgit v1.2.2 From 3ac2de48ed3c998df7f366e039c97eedb27e7c3d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:06 -0700 Subject: x86: add irq_cfg in io_apic_64.c preallocate size is 32, and if it is not enough, irq_cfg will more via alloc_bootmem() or kzalloc(). (depending on how early we are in system setup) v2: fix typo about size of init_one_irq_cfg ... should use sizeof(struct irq_cfg) v3: according to Eric, change get_irq_cfg() to irq_cfg() v4: squash add irq_cfg_alloc in Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 209 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 169 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index cab5a25d81b1..858c37a31a2f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -57,7 +57,11 @@ #define __apicdebuginit(type) static type __init +struct irq_cfg; + struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; cpumask_t domain; cpumask_t old_domain; unsigned move_cleanup_count; @@ -67,34 +71,132 @@ struct irq_cfg { /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -static struct irq_cfg *irq_cfg; +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} static void __init init_work(void *data) { struct dyn_array *da = data; + struct irq_cfg *cfg; + int i; - memcpy(*da->name, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + cfg = *da->name; + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + + i = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; } -DEFINE_DYN_ARRAY(irq_cfg, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +static struct irq_cfg *irq_cfgx; +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + struct irq_cfg *cfg; + + BUG_ON(irq == -1U); + + cfg = &irq_cfgx[0]; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + if (cfg->irq == -1U) + return NULL; + + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + BUG_ON(irq == -1U); + + cfg_pri = cfg = &irq_cfgx[0]; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + if (cfg->irq == -1U) { + cfg->irq = irq; + return cfg; + } + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + if (after_bootmem) + cfg = kzalloc(sizeof(struct irq_cfg)*nr_irq_cfg, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(sizeof(struct irq_cfg)*nr_irq_cfg, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + cfg->irq = irq; + cfg_pri->next = cfg; + + return cfg; +} static int assign_irq_vector(int irq, cpumask_t mask); @@ -341,7 +443,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; unsigned int dest; cpumask_t tmp; @@ -381,6 +483,8 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) struct irq_pin_list *entry = irq_2_pin + irq; BUG_ON(irq >= nr_irqs); + irq_cfg_alloc(irq); + while (entry->next) entry = irq_2_pin + entry->next; @@ -819,7 +923,7 @@ static int __assign_irq_vector(int irq, cpumask_t mask) struct irq_cfg *cfg; BUG_ON((unsigned)irq >= nr_irqs); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ cpus_and(mask, mask, cpu_online_map); @@ -893,7 +997,7 @@ static void __clear_irq_vector(int irq) int cpu, vector; BUG_ON((unsigned)irq >= nr_irqs); - cfg = &irq_cfg[irq]; + cfg = irq_cfg(irq); BUG_ON(!cfg->vector); vector = cfg->vector; @@ -913,17 +1017,23 @@ void __setup_vector_irq(int cpu) /* Mark the inuse vectors */ for (irq = 0; irq < nr_irqs; ++irq) { - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + struct irq_cfg *cfg = irq_cfg(irq); + + if (!cpu_isset(cpu, cfg->domain)) continue; - vector = irq_cfg[irq].vector; + vector = cfg->vector; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { + struct irq_cfg *cfg; + irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; - if (!cpu_isset(cpu, irq_cfg[irq].domain)) + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; } } @@ -1029,13 +1139,15 @@ static int setup_ioapic_entry(int apic, int irq, static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, int trigger, int polarity) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; cpumask_t mask; if (!IO_APIC_IRQ(irq)) return; + cfg = irq_cfg(irq); + mask = TARGET_CPUS; if (assign_irq_vector(irq, mask)) return; @@ -1553,7 +1665,7 @@ static unsigned int startup_ioapic_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { - struct irq_cfg *cfg = &irq_cfg[irq]; + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; spin_lock_irqsave(&vector_lock, flags); @@ -1600,7 +1712,7 @@ static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); */ static void migrate_ioapic_irq(int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct irq_desc *desc; cpumask_t tmp, cleanup_mask; struct irte irte; @@ -1618,6 +1730,7 @@ static void migrate_ioapic_irq(int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -1735,7 +1848,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) continue; desc = irq_to_desc(irq); - cfg = irq_cfg + irq; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) goto unlock; @@ -1754,7 +1867,7 @@ unlock: static void irq_complete_move(unsigned int irq) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg = irq_cfg(irq); unsigned vector, me; if (likely(!cfg->move_in_progress)) @@ -1891,7 +2004,10 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg[irq].vector) { + struct irq_cfg *cfg; + + cfg = irq_cfg(irq); + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2028,7 +2144,7 @@ static inline void __init unlock_ExtINT_logic(void) */ static inline void __init check_timer(void) { - struct irq_cfg *cfg = irq_cfg + 0; + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; int no_pin1 = 0; @@ -2306,14 +2422,19 @@ int create_irq(void) int irq; int new; unsigned long flags; + struct irq_cfg *cfg_new; irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg[new].vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; @@ -2346,7 +2467,7 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; unsigned dest; cpumask_t tmp; @@ -2356,6 +2477,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms if (err) return err; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2413,7 +2535,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; @@ -2426,6 +2548,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2448,7 +2571,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) */ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp, cleanup_mask; struct irte irte; @@ -2464,6 +2587,7 @@ static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2670,7 +2794,7 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; @@ -2683,6 +2807,7 @@ static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2749,7 +2874,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; struct irq_desc *desc; @@ -2761,6 +2886,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) if (assign_irq_vector(irq, mask)) return; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); @@ -2783,7 +2909,7 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - struct irq_cfg *cfg = irq_cfg + irq; + struct irq_cfg *cfg; int err; cpumask_t tmp; @@ -2793,6 +2919,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) struct ht_irq_msg msg; unsigned dest; + cfg = irq_cfg(irq); cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); @@ -2891,6 +3018,7 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; if (skip_ioapic_setup == 1) return; @@ -2906,7 +3034,8 @@ void __init setup_ioapic_dest(void) * when you have too many devices, because at that time only boot * cpu is online. */ - if (!irq_cfg[irq].vector) + cfg = irq_cfg(irq); + if (!cfg->vector) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); -- cgit v1.2.2 From e5a53714acfc7b5f868d07d27c5f02cb00b118db Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:07 -0700 Subject: x86: put irq_2_pin pointer into irq_cfg preallocate 32 irq_2_pin entries, and use get_one_free_irq_2_pin() to get one more and link to irq_cfg if needed. so don't waste one where no irq is enabled. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 161 ++++++++++++++++++++++++++++++++----------- 1 file changed, 120 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 858c37a31a2f..51ef7eb75f2e 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -58,10 +58,11 @@ #define __apicdebuginit(type) static type __init struct irq_cfg; - +struct irq_pin_list; struct irq_cfg { unsigned int irq; struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; unsigned move_cleanup_count; @@ -252,13 +253,66 @@ int pin_map_size; * between pins and IRQs. */ -static struct irq_pin_list { - short apic, pin; - int next; -} *irq_2_pin; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); -DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, sizeof(struct irq_pin_list), NULL); + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} struct io_apic { unsigned int index; @@ -300,16 +354,17 @@ static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); spin_lock_irqsave(&ioapic_lock, flags); - entry = irq_2_pin + irq; + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; int pin; - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); /* Is the remote IRR bit set? */ if (reg & IO_APIC_REDIR_REMOTE_IRR) { @@ -318,7 +373,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) } if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } spin_unlock_irqrestore(&ioapic_lock, flags); @@ -339,21 +394,24 @@ static inline void io_apic_sync(unsigned int apic) \ { \ int pin; \ - struct irq_pin_list *entry = irq_2_pin + irq; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ \ BUG_ON(irq >= nr_irqs); \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ for (;;) { \ unsigned int reg; \ - pin = entry->pin; \ - if (pin == -1) \ + if (!entry) \ break; \ + pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ io_apic_modify(entry->apic, reg); \ FINAL; \ if (!entry->next) \ break; \ - entry = irq_2_pin + entry->next; \ + entry = entry->next; \ } \ } @@ -416,15 +474,20 @@ static void ioapic_mask_entry(int apic, int pin) static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) { int apic, pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; BUG_ON(irq >= nr_irqs); + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { unsigned int reg; + + if (!entry) + break; + apic = entry->apic; pin = entry->pin; - if (pin == -1) - break; /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. @@ -437,7 +500,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) io_apic_modify(apic, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } @@ -480,22 +543,35 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; BUG_ON(irq >= nr_irqs); - irq_cfg_alloc(irq); + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= pin_map_size) - panic("io_apic.c: ran out of irq_2_pin entries!"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* @@ -505,17 +581,24 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); } @@ -1326,15 +1409,16 @@ __apicdebuginit(void) print_IO_APIC(void) } printk(KERN_DEBUG "IRQ to pin mappings:\n"); for (i = 0; i < nr_irqs; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + struct irq_cfg *cfg = irq_cfg(i); + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1495,14 +1579,9 @@ void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; - for (i = 0; i < pin_map_size; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } - /* * The number of IO-APIC IRQ registers (== #pins): */ -- cgit v1.2.2 From 7f95ec9e4c12fd067febfd57532da1166d75d858 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:09 -0700 Subject: x86: move kstat_irqs from kstat to irq_desc based on Eric's patch ... together mold it with dyn_array for irq_desc, will allcate kstat_irqs for nr_irq_desc alltogether if needed. -- at that point nr_cpus is known already. v2: make sure system without generic_hardirqs works they don't have irq_desc v3: fix merging v4: [mingo@elte.hu] fix typo [ mingo@elte.hu ] irq: build fix fix: arch/x86/xen/spinlock.c: In function 'xen_spin_lock_slow': arch/x86/xen/spinlock.c:90: error: 'struct kernel_stat' has no member named 'irqs' Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 2 +- arch/x86/kernel/irq_32.c | 4 ++-- arch/x86/kernel/irq_64.c | 4 ++-- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/xen/spinlock.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index c2160cfdec9b..204884b1415a 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -526,7 +526,7 @@ static void do_irq_balance(void) if (package_index == i) IRQ_DELTA(package_index, j) = 0; /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_cpu(i).irqs[j]; + value_now = (unsigned long) kstat_irqs_cpu(j, i); /* Determine the activity per processor per IRQ */ delta = value_now - LAST_CPU_IRQ(i, j); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index ede513be517d..576c5df6cad8 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -280,7 +280,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -290,7 +290,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 738eb65a924e..4a0a4eb44dcb 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -90,7 +90,7 @@ int show_interrupts(struct seq_file *p, void *v) any_count = kstat_irqs(i); #else for_each_online_cpu(j) - any_count |= kstat_cpu(j).irqs[i]; + any_count |= kstat_irqs_cpu(i, j); #endif action = desc->action; if (!action && !any_count) @@ -100,7 +100,7 @@ int show_interrupts(struct seq_file *p, void *v) seq_printf(p, "%10u ", kstat_irqs(i)); #else for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]); + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif seq_printf(p, " %8s", desc->chip->name); seq_printf(p, "-%-8s", desc->name); diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 9d85ab384435..817aa55a1209 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_cpu(smp_processor_id()).irqs[realirq]++; + kstat_irqs_this_cpu(desc)++; if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index dd71e3a021cd..bb6bc721b13d 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_this_cpu.irqs[irq]++; + kstat_irqs_this_cpu(irq_to_desc(irq))++; out: raw_local_irq_restore(flags); -- cgit v1.2.2 From 2c6927a38f65b53b62f86158fba29a068c4e8b6a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:11 -0700 Subject: irq: replace loop with nr_irqs with for_each_irq_desc There are a handful of loops that go from 0 to nr_irqs and use get_irq_desc() on them. These would allocate all the irq_desc entries, regardless of the need for them. Use the smarter for_each_irq_desc() iterator that will only iterate over the present ones. v2: make sure arch without GENERIC_HARDIRQS work too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 6 +++--- arch/x86/kernel/irq_64.c | 5 ++--- arch/x86/kernel/irqinit_64.c | 17 +++++------------ 3 files changed, 10 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 51ef7eb75f2e..708be9724daf 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1871,10 +1871,10 @@ unmask: static void ir_irq_migration(struct work_struct *work) { - int irq; + unsigned int irq; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { - struct irq_desc *desc = irq_to_desc(irq); + for_each_irq_desc(irq, desc) { if (desc->status & IRQ_MOVE_PENDING) { unsigned long flags; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a0a4eb44dcb..b3cf55e325f5 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -224,17 +224,16 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; int break_affinity = 0; int set_affinity = 1; - struct irq_desc *desc; if (irq == 2) continue; - desc = irq_to_desc(irq); /* interrupt's are disabled at this point */ spin_lock(&desc->lock); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 0744b49b4d12..cd9f42d028d9 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -142,25 +142,18 @@ void __init init_ISA_irqs(void) init_bsp_APIC(); init_8259A(0); - for (i = 0; i < nr_irqs; i++) { + for (i = 0; i < 16; i++) { struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; desc->depth = 1; - if (i < 16) { - /* - * 16 old-style INTA-cycle interrupts: - */ - set_irq_chip_and_handler_name(i, &i8259A_chip, + /* + * 16 old-style INTA-cycle interrupts: + */ + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); - } else { - /* - * 'high' PCI IRQs filled in on demand - */ - desc->chip = &no_irq_chip; - } } } -- cgit v1.2.2 From 46b8214d12c274bd4265aae482ab7ffe69d94818 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:13 -0700 Subject: x86, ioapic: replace loop with nr_irqs with for_each_irq_icfg Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 708be9724daf..60d60061659c 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -129,6 +129,9 @@ static void __init init_work(void *data) cfg[i-1].next = &cfg[i]; } +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg && cfg->irq != -1U; cfg = cfg->next) + static struct irq_cfg *irq_cfgx; DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -1097,20 +1100,18 @@ void __setup_vector_irq(int cpu) /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ int irq, vector; + struct irq_cfg *cfg; /* Mark the inuse vectors */ - for (irq = 0; irq < nr_irqs; ++irq) { - struct irq_cfg *cfg = irq_cfg(irq); - + for_each_irq_cfg(cfg) { if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; + irq = cfg->irq; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ for (vector = 0; vector < NR_VECTORS; ++vector) { - struct irq_cfg *cfg; - irq = per_cpu(vector_irq, cpu)[vector]; if (irq < 0) continue; @@ -1340,6 +1341,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; unsigned long flags; + struct irq_cfg *cfg; if (apic_verbosity == APIC_QUIET) return; @@ -1408,12 +1410,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < nr_irqs; i++) { - struct irq_cfg *cfg = irq_cfg(i); + for_each_irq_cfg(cfg) { struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", cfg->irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -2070,6 +2071,7 @@ static inline void init_IO_APIC_traps(void) { int irq; struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -2082,10 +2084,8 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < nr_irqs ; irq++) { - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); + for_each_irq_cfg(cfg) { + irq = cfg->irq; if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, -- cgit v1.2.2 From 7d94f7ca401dd7f445fda9a971a48aa5427b3e55 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:14 -0700 Subject: irq: remove >= nr_irqs checking with config_have_sparse_irq remove irq limit checks - nr_irqs is dynamic and we expand anytime. v2: fix checking about result irq_cfg_without_new, so could use msi again v3: use irq_desc_without_new to check irq is valid Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 9 --------- arch/x86/kernel/irq_64.c | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 60d60061659c..1b8cccb5ba25 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -400,7 +400,6 @@ static inline void io_apic_sync(unsigned int apic) struct irq_cfg *cfg; \ struct irq_pin_list *entry; \ \ - BUG_ON(irq >= nr_irqs); \ cfg = irq_cfg(irq); \ entry = cfg->irq_2_pin; \ for (;;) { \ @@ -480,7 +479,6 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) struct irq_cfg *cfg; struct irq_pin_list *entry; - BUG_ON(irq >= nr_irqs); cfg = irq_cfg(irq); entry = cfg->irq_2_pin; for (;;) { @@ -549,7 +547,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) struct irq_cfg *cfg; struct irq_pin_list *entry; - BUG_ON(irq >= nr_irqs); /* first time to refer irq_cfg, so with new */ cfg = irq_cfg_alloc(irq); entry = cfg->irq_2_pin; @@ -841,7 +838,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) best_guess = irq; } } - BUG_ON(best_guess >= nr_irqs); return best_guess; } @@ -973,7 +969,6 @@ static int pin_2_irq(int idx, int apic, int pin) irq += nr_ioapic_registers[i++]; irq += pin; } - BUG_ON(irq >= nr_irqs); return irq; } @@ -1008,7 +1003,6 @@ static int __assign_irq_vector(int irq, cpumask_t mask) int cpu; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= nr_irqs); cfg = irq_cfg(irq); /* Only try and allocate irqs on cpus that are present */ @@ -1082,7 +1076,6 @@ static void __clear_irq_vector(int irq) cpumask_t mask; int cpu, vector; - BUG_ON((unsigned)irq >= nr_irqs); cfg = irq_cfg(irq); BUG_ON(!cfg->vector); @@ -1924,8 +1917,6 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) struct irq_desc *desc; struct irq_cfg *cfg; irq = __get_cpu_var(vector_irq)[vector]; - if (irq >= nr_irqs) - continue; desc = irq_to_desc(irq); cfg = irq_cfg(irq); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index b3cf55e325f5..a3e36336d914 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -202,7 +202,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(irq < nr_irqs)) + if (likely(__irq_to_desc(irq))) generic_handle_irq(irq); else { if (!disable_apic) -- cgit v1.2.2 From 46926b67fc663d357a1a8174328998a9e49da0b8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:15 -0700 Subject: generic: add irq_desc in function in parameter So we could remove some duplicated calling to irq_desc v2: make sure irq_desc in init/main.c is not used without generic_hardirqs Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index a3e36336d914..f58b995b30ee 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -189,6 +189,7 @@ u64 arch_irq_stat(void) asmlinkage unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs = set_irq_regs(regs); + struct irq_desc *desc; /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; @@ -202,8 +203,9 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - if (likely(__irq_to_desc(irq))) - generic_handle_irq(irq); + desc = __irq_to_desc(irq); + if (likely(desc)) + generic_handle_irq_desc(irq, desc); else { if (!disable_apic) ack_APIC_irq(); -- cgit v1.2.2 From 1d5f6b36c4736af1dac396d6267eb53dcc8c0021 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:16 -0700 Subject: x86: check with without_new in show_interrupts so we don't get new one that we don't need it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f58b995b30ee..f337f87c1e16 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,7 +83,10 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); + struct irq_desc *desc = __irq_to_desc(i); + + if (!desc) + return 0; spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP -- cgit v1.2.2 From cb5bc83225a86ca53bbb889ed8439e4fd6cf44ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:17 -0700 Subject: x86_64: rename irq_desc/irq_desc_alloc change names: irq_desc() ==> irq_desc_alloc __irq_desc() ==> irq_desc Also split a few of the uses in lowlevel x86 code. v2: need to check if desc is null in smp_irq_move_cleanup Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 10 +++++++++- arch/x86/kernel/irq_64.c | 4 ++-- arch/x86/kernel/irqinit_64.c | 3 ++- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 1b8cccb5ba25..a054db9ef190 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1124,7 +1124,12 @@ static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + if (trigger) desc->status |= IRQ_LEVEL; else @@ -1919,6 +1924,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) irq = __get_cpu_var(vector_irq)[vector]; desc = irq_to_desc(irq); + if (!desc) + continue; + cfg = irq_cfg(irq); spin_lock(&desc->lock); if (!cfg->move_cleanup_count) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index f337f87c1e16..5d5976e0311a 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -83,7 +83,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i < nr_irqs) { unsigned any_count = 0; - struct irq_desc *desc = __irq_to_desc(i); + struct irq_desc *desc = irq_to_desc(i); if (!desc) return 0; @@ -206,7 +206,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) stack_overflow_check(regs); #endif - desc = __irq_to_desc(irq); + desc = irq_to_desc(irq); if (likely(desc)) generic_handle_irq_desc(irq, desc); else { diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index cd9f42d028d9..d17fbc26d96f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -143,7 +143,8 @@ void __init init_ISA_irqs(void) init_8259A(0); for (i = 0; i < 16; i++) { - struct irq_desc *desc = irq_to_desc(i); + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc_alloc(i); desc->status = IRQ_DISABLED; desc->action = NULL; -- cgit v1.2.2 From a2f9f43858db64cb8b45c4f6746d7a52b80d4dcb Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:19 -0700 Subject: x86_64: separate irq_cfgx from irq_cfgx_free so later don't need to compare with -1U Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 92 ++++++++++++++++++++++++++++---------------- 1 file changed, 59 insertions(+), 33 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index a054db9ef190..8ab7ae01773f 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -111,44 +111,44 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); } +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; static void __init init_work(void *data) { struct dyn_array *da = data; struct irq_cfg *cfg; + int legacy_count; int i; cfg = *da->name; memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - i = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (; i < *da->nr; i++) + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); for (i = 1; i < *da->nr; i++) cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; } #define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg && cfg->irq != -1U; cfg = cfg->next) + for (cfg = irq_cfgx; cfg; cfg = cfg->next) -static struct irq_cfg *irq_cfgx; DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); static struct irq_cfg *irq_cfg(unsigned int irq) { struct irq_cfg *cfg; - BUG_ON(irq == -1U); - - cfg = &irq_cfgx[0]; + cfg = irq_cfgx; while (cfg) { if (cfg->irq == irq) return cfg; - if (cfg->irq == -1U) - return NULL; - cfg = cfg->next; } @@ -161,44 +161,70 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) int i; int count = 0; - BUG_ON(irq == -1U); - - cfg_pri = cfg = &irq_cfgx[0]; + cfg_pri = cfg = irq_cfgx; while (cfg) { if (cfg->irq == irq) return cfg; - if (cfg->irq == -1U) { - cfg->irq = irq; - return cfg; - } cfg_pri = cfg; cfg = cfg->next; count++; } - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - if (after_bootmem) - cfg = kzalloc(sizeof(struct irq_cfg)*nr_irq_cfg, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(sizeof(struct irq_cfg)*nr_irq_cfg, PAGE_SIZE, 0); + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); - cfg->irq = irq; - cfg_pri->next = cfg; + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif return cfg; } -- cgit v1.2.2 From 52b17329d6d0a4824b89206803a430915031ff23 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:20 -0700 Subject: x86_64: make /proc/interrupts work with dyn irq_desc loop with irq_desc list Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_64.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 5d5976e0311a..7bd841a9c640 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -70,9 +70,27 @@ static inline void stack_overflow_check(struct pt_regs *regs) int show_interrupts(struct seq_file *p, void *v) { - int i = *(loff_t *) v, j; + int i, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -81,12 +99,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); - - if (!desc) - return 0; spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -116,7 +130,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); @@ -155,6 +171,7 @@ skip: seq_printf(p, " Spurious interrupts\n"); seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); } + return 0; } -- cgit v1.2.2 From 6d50bc26836e16a9589e0b128d527c29e30d722a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:22 -0700 Subject: x86: use 28 bits irq NR for pci msi/msix and ht also print out irq no in /proc/interrups and /proc/stat in hex, so could tell bus/dev/func. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 64 ++++++++++++++++++++++++++++++++++---------- arch/x86/kernel/irq_64.c | 2 +- 2 files changed, 51 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 8ab7ae01773f..b0d4abc55a11 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -2520,17 +2520,21 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq; - int new; + unsigned int irq; + unsigned int new; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; +#ifndef CONFIG_HAVE_SPARSE_IRQ + irq_want = nr_irqs - 1; +#endif + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2545,12 +2549,24 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2803,13 +2819,29 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) return 0; } +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; + unsigned int irq; + int ret; + unsigned int irq_want; - irq = create_irq(); - if (irq < 0) - return irq; + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) @@ -2836,18 +2868,22 @@ error: int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - int irq, ret, sub_handle; + unsigned int irq; + int ret, sub_handle; struct msi_desc *desc; + unsigned int irq_want; + #ifdef CONFIG_INTR_REMAP struct intel_iommu *iommu = 0; int index = 0; #endif + irq_want = build_irq_for_pci_dev(dev) + 0x100; sub_handle = 0; list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq(); - if (irq < 0) - return irq; + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; #ifdef CONFIG_INTR_REMAP if (!intr_remapping_enabled) goto no_ir; diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 7bd841a9c640..348a11168c2b 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else -- cgit v1.2.2 From 8b8e8c1bf7275eca859fe551dfa484134eaf013b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:23 -0700 Subject: x86: remove irqbalance in kernel for 32 bit This has been deprecated for years, the user space irqbalanced utility works better with numa, has configurable policies, etc... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 8 - arch/x86/configs/i386_defconfig | 1 - arch/x86/kernel/io_apic_32.c | 402 ---------------------------------------- arch/x86/kernel/quirks.c | 3 - 4 files changed, 414 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1004888e9b13..3e0eaaa1a339 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1254,14 +1254,6 @@ config EFI resultant kernel should continue to boot on existing non-EFI platforms. -config IRQBALANCE - def_bool y - prompt "Enable kernel irq balancing" - depends on X86_32 && SMP && X86_IO_APIC - help - The default yes will allow the kernel to do irq load balancing. - Saying no will keep the kernel from doing irq load balancing. - config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 52d0359719d7..13b8c86ae985 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -287,7 +287,6 @@ CONFIG_MTRR=y # CONFIG_MTRR_SANITIZER is not set CONFIG_X86_PAT=y CONFIG_EFI=y -# CONFIG_IRQBALANCE is not set CONFIG_SECCOMP=y # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 204884b1415a..668edf226067 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -371,408 +371,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) spin_unlock_irqrestore(&ioapic_lock, flags); } -#if defined(CONFIG_IRQBALANCE) -# include /* kernel_thread() */ -# include /* kstat */ -# include /* kmalloc() */ -# include - -#define IRQBALANCE_CHECK_ARCH -999 -#define MAX_BALANCED_IRQ_INTERVAL (5*HZ) -#define MIN_BALANCED_IRQ_INTERVAL (HZ/2) -#define BALANCED_IRQ_MORE_DELTA (HZ/10) -#define BALANCED_IRQ_LESS_DELTA (HZ) - -static int irqbalance_disabled __read_mostly = IRQBALANCE_CHECK_ARCH; -static int physical_balance __read_mostly; -static long balanced_irq_interval __read_mostly = MAX_BALANCED_IRQ_INTERVAL; - -static struct irq_cpu_info { - unsigned long *last_irq; - unsigned long *irq_delta; - unsigned long irq; -} irq_cpu_data[NR_CPUS]; - -#define CPU_IRQ(cpu) (irq_cpu_data[cpu].irq) -#define LAST_CPU_IRQ(cpu, irq) (irq_cpu_data[cpu].last_irq[irq]) -#define IRQ_DELTA(cpu, irq) (irq_cpu_data[cpu].irq_delta[irq]) - -#define IDLE_ENOUGH(cpu,now) \ - (idle_cpu(cpu) && ((now) - per_cpu(irq_stat, (cpu)).idle_timestamp > 1)) - -#define IRQ_ALLOWED(cpu, allowed_mask) cpu_isset(cpu, allowed_mask) - -#define CPU_TO_PACKAGEINDEX(i) (first_cpu(per_cpu(cpu_sibling_map, i))) - -static cpumask_t balance_irq_affinity_init __initdata = CPU_MASK_ALL; - -static cpumask_t *balance_irq_affinity; - - -static void __init irq_affinity_init_work(void *data) -{ - struct dyn_array *da = data; - - int i; - struct balance_irq_affinity *affinity; - - affinity = *da->name; - - for (i = 0; i < *da->nr; i++) - memcpy(&affinity[i], &balance_irq_affinity_init, - sizeof(struct balance_irq_affinity)); - -} - -DEFINE_DYN_ARRAY(balance_irq_affinity, sizeof(struct balance_irq_affinity), nr_irqs, PAGE_SIZE, irq_affinity_init_work); - - -void set_balance_irq_affinity(unsigned int irq, cpumask_t mask) -{ - balance_irq_affinity[irq] = mask; -} - -static unsigned long move(int curr_cpu, cpumask_t allowed_mask, - unsigned long now, int direction) -{ - int search_idle = 1; - int cpu = curr_cpu; - - goto inside; - - do { - if (unlikely(cpu == curr_cpu)) - search_idle = 0; -inside: - if (direction == 1) { - cpu++; - if (cpu >= NR_CPUS) - cpu = 0; - } else { - cpu--; - if (cpu == -1) - cpu = NR_CPUS-1; - } - } while (!cpu_online(cpu) || !IRQ_ALLOWED(cpu, allowed_mask) || - (search_idle && !IDLE_ENOUGH(cpu, now))); - - return cpu; -} - -static inline void balance_irq(int cpu, int irq) -{ - unsigned long now = jiffies; - cpumask_t allowed_mask; - unsigned int new_cpu; - - if (irqbalance_disabled) - return; - - cpus_and(allowed_mask, cpu_online_map, balance_irq_affinity[irq]); - new_cpu = move(cpu, allowed_mask, now, 1); - if (cpu != new_cpu) - set_pending_irq(irq, cpumask_of_cpu(new_cpu)); -} - -static inline void rotate_irqs_among_cpus(unsigned long useful_load_threshold) -{ - int i, j; - struct irq_desc *desc; - - for_each_online_cpu(i) { - for (j = 0; j < nr_irqs; j++) { - desc = irq_to_desc(j); - if (!desc->action) - continue; - /* Is it a significant load ? */ - if (IRQ_DELTA(CPU_TO_PACKAGEINDEX(i), j) < - useful_load_threshold) - continue; - balance_irq(i, j); - } - } - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; -} - -static void do_irq_balance(void) -{ - int i, j; - unsigned long max_cpu_irq = 0, min_cpu_irq = (~0); - unsigned long move_this_load = 0; - int max_loaded = 0, min_loaded = 0; - int load; - unsigned long useful_load_threshold = balanced_irq_interval + 10; - int selected_irq; - int tmp_loaded, first_attempt = 1; - unsigned long tmp_cpu_irq; - unsigned long imbalance = 0; - cpumask_t allowed_mask, target_cpu_mask, tmp; - struct irq_desc *desc; - - for_each_possible_cpu(i) { - int package_index; - CPU_IRQ(i) = 0; - if (!cpu_online(i)) - continue; - package_index = CPU_TO_PACKAGEINDEX(i); - for (j = 0; j < nr_irqs; j++) { - unsigned long value_now, delta; - /* Is this an active IRQ or balancing disabled ? */ - desc = irq_to_desc(j); - if (!desc->action || irq_balancing_disabled(j)) - continue; - if (package_index == i) - IRQ_DELTA(package_index, j) = 0; - /* Determine the total count per processor per IRQ */ - value_now = (unsigned long) kstat_irqs_cpu(j, i); - - /* Determine the activity per processor per IRQ */ - delta = value_now - LAST_CPU_IRQ(i, j); - - /* Update last_cpu_irq[][] for the next time */ - LAST_CPU_IRQ(i, j) = value_now; - - /* Ignore IRQs whose rate is less than the clock */ - if (delta < useful_load_threshold) - continue; - /* update the load for the processor or package total */ - IRQ_DELTA(package_index, j) += delta; - - /* Keep track of the higher numbered sibling as well */ - if (i != package_index) - CPU_IRQ(i) += delta; - /* - * We have sibling A and sibling B in the package - * - * cpu_irq[A] = load for cpu A + load for cpu B - * cpu_irq[B] = load for cpu B - */ - CPU_IRQ(package_index) += delta; - } - } - /* Find the least loaded processor package */ - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (min_cpu_irq > CPU_IRQ(i)) { - min_cpu_irq = CPU_IRQ(i); - min_loaded = i; - } - } - max_cpu_irq = ULONG_MAX; - -tryanothercpu: - /* - * Look for heaviest loaded processor. - * We may come back to get the next heaviest loaded processor. - * Skip processors with trivial loads. - */ - tmp_cpu_irq = 0; - tmp_loaded = -1; - for_each_online_cpu(i) { - if (i != CPU_TO_PACKAGEINDEX(i)) - continue; - if (max_cpu_irq <= CPU_IRQ(i)) - continue; - if (tmp_cpu_irq < CPU_IRQ(i)) { - tmp_cpu_irq = CPU_IRQ(i); - tmp_loaded = i; - } - } - - if (tmp_loaded == -1) { - /* - * In the case of small number of heavy interrupt sources, - * loading some of the cpus too much. We use Ingo's original - * approach to rotate them around. - */ - if (!first_attempt && imbalance >= useful_load_threshold) { - rotate_irqs_among_cpus(useful_load_threshold); - return; - } - goto not_worth_the_effort; - } - - first_attempt = 0; /* heaviest search */ - max_cpu_irq = tmp_cpu_irq; /* load */ - max_loaded = tmp_loaded; /* processor */ - imbalance = (max_cpu_irq - min_cpu_irq) / 2; - - /* - * if imbalance is less than approx 10% of max load, then - * observe diminishing returns action. - quit - */ - if (imbalance < (max_cpu_irq >> 3)) - goto not_worth_the_effort; - -tryanotherirq: - /* if we select an IRQ to move that can't go where we want, then - * see if there is another one to try. - */ - move_this_load = 0; - selected_irq = -1; - for (j = 0; j < nr_irqs; j++) { - /* Is this an active IRQ? */ - desc = irq_to_desc(j); - if (!desc->action) - continue; - if (imbalance <= IRQ_DELTA(max_loaded, j)) - continue; - /* Try to find the IRQ that is closest to the imbalance - * without going over. - */ - if (move_this_load < IRQ_DELTA(max_loaded, j)) { - move_this_load = IRQ_DELTA(max_loaded, j); - selected_irq = j; - } - } - if (selected_irq == -1) - goto tryanothercpu; - - imbalance = move_this_load; - - /* For physical_balance case, we accumulated both load - * values in the one of the siblings cpu_irq[], - * to use the same code for physical and logical processors - * as much as possible. - * - * NOTE: the cpu_irq[] array holds the sum of the load for - * sibling A and sibling B in the slot for the lowest numbered - * sibling (A), _AND_ the load for sibling B in the slot for - * the higher numbered sibling. - * - * We seek the least loaded sibling by making the comparison - * (A+B)/2 vs B - */ - load = CPU_IRQ(min_loaded) >> 1; - for_each_cpu_mask(j, per_cpu(cpu_sibling_map, min_loaded)) { - if (load > CPU_IRQ(j)) { - /* This won't change cpu_sibling_map[min_loaded] */ - load = CPU_IRQ(j); - min_loaded = j; - } - } - - cpus_and(allowed_mask, - cpu_online_map, - balance_irq_affinity[selected_irq]); - target_cpu_mask = cpumask_of_cpu(min_loaded); - cpus_and(tmp, target_cpu_mask, allowed_mask); - - if (!cpus_empty(tmp)) { - /* mark for change destination */ - set_pending_irq(selected_irq, cpumask_of_cpu(min_loaded)); - - /* Since we made a change, come back sooner to - * check for more variation. - */ - balanced_irq_interval = max((long)MIN_BALANCED_IRQ_INTERVAL, - balanced_irq_interval - BALANCED_IRQ_LESS_DELTA); - return; - } - goto tryanotherirq; - -not_worth_the_effort: - /* - * if we did not find an IRQ to move, then adjust the time interval - * upward - */ - balanced_irq_interval = min((long)MAX_BALANCED_IRQ_INTERVAL, - balanced_irq_interval + BALANCED_IRQ_MORE_DELTA); - return; -} - -static int balanced_irq(void *unused) -{ - int i; - unsigned long prev_balance_time = jiffies; - long time_remaining = balanced_irq_interval; - struct irq_desc *desc; - - /* push everything to CPU 0 to give us a starting point. */ - for (i = 0 ; i < nr_irqs ; i++) { - desc = irq_to_desc(i); - desc->pending_mask = cpumask_of_cpu(0); - set_pending_irq(i, cpumask_of_cpu(0)); - } - - set_freezable(); - for ( ; ; ) { - time_remaining = schedule_timeout_interruptible(time_remaining); - try_to_freeze(); - if (time_after(jiffies, - prev_balance_time+balanced_irq_interval)) { - preempt_disable(); - do_irq_balance(); - prev_balance_time = jiffies; - time_remaining = balanced_irq_interval; - preempt_enable(); - } - } - return 0; -} - -static int __init balanced_irq_init(void) -{ - int i; - struct cpuinfo_x86 *c; - cpumask_t tmp; - - cpus_shift_right(tmp, cpu_online_map, 2); - c = &boot_cpu_data; - /* When not overwritten by the command line ask subarchitecture. */ - if (irqbalance_disabled == IRQBALANCE_CHECK_ARCH) - irqbalance_disabled = NO_BALANCE_IRQ; - if (irqbalance_disabled) - return 0; - - /* disable irqbalance completely if there is only one processor online */ - if (num_online_cpus() < 2) { - irqbalance_disabled = 1; - return 0; - } - /* - * Enable physical balance only if more than 1 physical processor - * is present - */ - if (smp_num_siblings > 1 && !cpus_empty(tmp)) - physical_balance = 1; - - for_each_online_cpu(i) { - irq_cpu_data[i].irq_delta = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - irq_cpu_data[i].last_irq = kzalloc(sizeof(unsigned long) * nr_irqs, GFP_KERNEL); - if (irq_cpu_data[i].irq_delta == NULL || irq_cpu_data[i].last_irq == NULL) { - printk(KERN_ERR "balanced_irq_init: out of memory"); - goto failed; - } - } - - printk(KERN_INFO "Starting balanced_irq\n"); - if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) - return 0; - printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); -failed: - for_each_possible_cpu(i) { - kfree(irq_cpu_data[i].irq_delta); - irq_cpu_data[i].irq_delta = NULL; - kfree(irq_cpu_data[i].last_irq); - irq_cpu_data[i].last_irq = NULL; - } - return 0; -} - -int __devinit irqbalance_disable(char *str) -{ - irqbalance_disabled = 1; - return 1; -} - -__setup("noirqbalance", irqbalance_disable); - -late_initcall(balanced_irq_init); -#endif /* CONFIG_IRQBALANCE */ #endif /* CONFIG_SMP */ #ifndef CONFIG_SMP diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index f6a11b9b1f98..67465ed89310 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -35,9 +35,6 @@ static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) if (!(word & (1 << 13))) { dev_info(&dev->dev, "Intel E7520/7320/7525 detected; " "disabling irq balancing and affinity\n"); -#ifdef CONFIG_IRQBALANCE - irqbalance_disable(""); -#endif noirqdebug_setup(""); #ifdef CONFIG_PROC_FS no_irq_affinity = 1; -- cgit v1.2.2 From a1420f395d7721895c05ba3dedf150294d2c0e4d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:24 -0700 Subject: x86: add irq_cfg for 32bit it only contains vector ... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 71 ++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 668edf226067..033ad953bee3 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -94,6 +94,43 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; +struct irq_cfg { + u8 vector; +}; + + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfg_legacy[] __initdata = { + [0] = { .vector = FIRST_DEVICE_VECTOR, }, +}; + +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; + + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + + BUG_ON(legacy_count > nr_irqs); + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); +} + +static struct irq_cfg *irq_cfgx; +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return &irq_cfgx[irq]; +} + /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -794,22 +831,6 @@ static inline int IO_APIC_irq_trigger(int irq) return 0; } -/* irq_vectors is indexed by the sum of all RTEs in all I/O APICs. */ -static u8 irq_vector_init_first __initdata = FIRST_DEVICE_VECTOR; -static u8 *irq_vector; - -static void __init irq_vector_init_work(void *data) -{ - struct dyn_array *da = data; - - u8 *irq_vec; - - irq_vec = *da->name; - - irq_vec[0] = irq_vector_init_first; -} - -DEFINE_DYN_ARRAY(irq_vector, sizeof(u8), nr_irqs, PAGE_SIZE, irq_vector_init_work); static int __assign_irq_vector(int irq) { @@ -818,8 +839,8 @@ static int __assign_irq_vector(int irq) BUG_ON((unsigned)irq >= nr_irqs); - if (irq_vector[irq] > 0) - return irq_vector[irq]; + if (irq_cfg(irq)->vector > 0) + return irq_cfg(irq)->vector; vector = current_vector; offset = current_offset; @@ -836,7 +857,7 @@ next: current_vector = vector; current_offset = offset; - irq_vector[irq] = vector; + irq_cfg(irq)->vector = vector; return vector; } @@ -1598,7 +1619,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) * operation to prevent an edge-triggered interrupt escaping meanwhile. * The idea is from Manfred Spraul. --macro */ - i = irq_vector[irq]; + i = irq_cfg(irq)->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -1615,7 +1636,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_vector[irq]); + send_IPI_self(irq_cfg(irq)->vector); return 1; } @@ -1651,7 +1672,7 @@ static inline void init_IO_APIC_traps(void) * 0x80, because int 0x80 is hm, kind of importantish. ;) */ for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_vector[irq]) { + if (IO_APIC_IRQ(irq) && !irq_cfg(irq)->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2102,7 +2123,7 @@ int create_irq(void) for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_vector[new] != 0) + if (irq_cfg(new)->vector != 0) continue; vector = __assign_irq_vector(new); if (likely(vector > 0)) @@ -2125,8 +2146,8 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_vector[irq], used_vectors); - irq_vector[irq] = 0; + clear_bit(irq_cfg(irq)->vector, used_vectors); + irq_cfg(irq)->vector = 0; spin_unlock_irqrestore(&vector_lock, flags); } -- cgit v1.2.2 From da51a821314dd0ec7126f2bf9a62117146fbb6b9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:25 -0700 Subject: x86: make 32bit use irq_cfg_alloc, etc Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 180 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 160 insertions(+), 20 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 033ad953bee3..5a83d7f5b147 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -94,41 +94,172 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; +struct irq_cfg; + struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; u8 vector; }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .vector = FIRST_DEVICE_VECTOR, }, + [0] = { .irq = 0, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .vector = IRQ15_VECTOR, }, }; +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} + +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; static void __init init_work(void *data) { - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; - cfg = *da->name; + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); - BUG_ON(legacy_count > nr_irqs); + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; } -static struct irq_cfg *irq_cfgx; -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg; cfg = cfg->next) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); static struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq >= nr_irqs) - return NULL; + struct irq_cfg *cfg; + + cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; - return &irq_cfgx[irq]; + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + cfg_pri = cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); + +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif + return cfg; } /* @@ -254,6 +385,7 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_pin_list *entry = irq_2_pin + irq; + irq_cfg_alloc(irq); while (entry->next) entry = irq_2_pin + entry->next; @@ -836,11 +968,13 @@ static int __assign_irq_vector(int irq) { static int current_vector = FIRST_DEVICE_VECTOR, current_offset; int vector, offset; + struct irq_cfg *cfg; BUG_ON((unsigned)irq >= nr_irqs); - if (irq_cfg(irq)->vector > 0) - return irq_cfg(irq)->vector; + cfg = irq_cfg(irq); + if (cfg->vector > 0) + return cfg->vector; vector = current_vector; offset = current_offset; @@ -857,7 +991,7 @@ next: current_vector = vector; current_offset = offset; - irq_cfg(irq)->vector = vector; + cfg->vector = vector; return vector; } @@ -1659,6 +1793,7 @@ static inline void init_IO_APIC_traps(void) { int irq; struct irq_desc *desc; + struct irq_cfg *cfg; /* * NOTE! The local APIC isn't very good at handling @@ -1671,8 +1806,9 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for (irq = 0; irq < nr_irqs ; irq++) { - if (IO_APIC_IRQ(irq) && !irq_cfg(irq)->vector) { + for_each_irq_cfg(cfg) { + irq = cfg->irq; + if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, * so default to an old-fashioned 8259 @@ -2117,14 +2253,18 @@ int create_irq(void) /* Allocate an unused irq */ int irq, new, vector = 0; unsigned long flags; + struct irq_cfg *cfg_new; irq = -ENOSPC; spin_lock_irqsave(&vector_lock, flags); for (new = (nr_irqs - 1); new >= 0; new--) { if (platform_legacy_irq(new)) continue; - if (irq_cfg(new)->vector != 0) + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) continue; + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); vector = __assign_irq_vector(new); if (likely(vector > 0)) irq = new; -- cgit v1.2.2 From 0f978f4505e96227a89b3c9447552aca983c6b57 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:26 -0700 Subject: x86: make 32bit to use irq_2_pin in irq_cfg so it is more like 64 bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 152 +++++++++++++++++++++++++++++++++---------- 1 file changed, 117 insertions(+), 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 5a83d7f5b147..59d2e8a273e3 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -95,10 +95,11 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); static int disable_timer_pin_1 __initdata; struct irq_cfg; - +struct irq_pin_list; struct irq_cfg { unsigned int irq; struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; u8 vector; }; @@ -275,11 +276,66 @@ int pin_map_size; * between pins and IRQs. */ -static struct irq_pin_list { - int apic, pin, next; -} *irq_2_pin; +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); -DEFINE_DYN_ARRAY(irq_2_pin, sizeof(struct irq_pin_list), pin_map_size, 16, NULL); + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} struct io_apic { unsigned int index; @@ -383,20 +439,34 @@ static void ioapic_mask_entry(int apic, int pin) */ static void add_pin_to_irq(unsigned int irq, int apic, int pin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } - irq_cfg_alloc(irq); - while (entry->next) - entry = irq_2_pin + entry->next; + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; - if (entry->pin != -1) { - entry->next = first_free_entry; - entry = irq_2_pin + entry->next; - if (++first_free_entry >= pin_map_size) - panic("io_apic.c: whoops"); + entry = entry->next; } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; entry->apic = apic; entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* @@ -406,35 +476,45 @@ static void __init replace_pin_at_irq(unsigned int irq, int oldapic, int oldpin, int newapic, int newpin) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; - while (1) { + while (entry) { if (entry->apic == oldapic && entry->pin == oldpin) { entry->apic = newapic; entry->pin = newpin; - } - if (!entry->next) + replaced = 1; + /* every one is different, right? */ break; - entry = irq_2_pin + entry->next; + } + entry = entry->next; } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); } static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) { - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_cfg *cfg; + struct irq_pin_list *entry; unsigned int pin, reg; + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; for (;;) { - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; reg = io_apic_read(entry->apic, 0x10 + pin*2); reg &= ~disable; reg |= enable; io_apic_modify(entry->apic, 0x10 + pin*2, reg); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } } @@ -509,13 +589,18 @@ static void clear_IO_APIC(void) #ifdef CONFIG_SMP static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) { + struct irq_cfg *cfg; unsigned long flags; int pin; - struct irq_pin_list *entry = irq_2_pin + irq; + struct irq_pin_list *entry; unsigned int apicid_value; cpumask_t tmp; struct irq_desc *desc; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + cpus_and(tmp, cpumask, cpu_online_map); if (cpus_empty(tmp)) tmp = TARGET_CPUS; @@ -527,13 +612,13 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) apicid_value = apicid_value << 24; spin_lock_irqsave(&ioapic_lock, flags); for (;;) { - pin = entry->pin; - if (pin == -1) + if (!entry) break; + pin = entry->pin; io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } desc = irq_to_desc(irq); desc->affinity = cpumask; @@ -1152,6 +1237,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_02 reg_02; union IO_APIC_reg_03 reg_03; unsigned long flags; + struct irq_cfg *cfg; if (apic_verbosity == APIC_QUIET) return; @@ -1240,16 +1326,16 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for (i = 0; i < nr_irqs; i++) { - struct irq_pin_list *entry = irq_2_pin + i; - if (entry->pin < 0) + for_each_irq_cfg(cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) continue; printk(KERN_DEBUG "IRQ%d ", i); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) break; - entry = irq_2_pin + entry->next; + entry = entry->next; } printk("\n"); } @@ -1420,10 +1506,6 @@ static void __init enable_IO_APIC(void) int i, apic; unsigned long flags; - for (i = 0; i < pin_map_size; i++) { - irq_2_pin[i].pin = -1; - irq_2_pin[i].next = 0; - } if (!pirqs_enabled) for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] = -1; -- cgit v1.2.2 From 199751d715bba5b469ea22adadc68a4166bfa4f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:27 -0700 Subject: x86: make 32 bit to use sparse_irq but actually irq still needs to be less than NR_IRQS, because interrupt[NR_IRQS] in entry.S. need to enable per_cpu vector... Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/io_apic_32.c | 63 ++++++++++++++++++++++++++++++-------------- arch/x86/kernel/irq_32.c | 37 +++++++++++++++++++------- arch/x86/kernel/irqinit_32.c | 7 +++++ 4 files changed, 79 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 3e0eaaa1a339..b157e94637bf 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,7 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ if X86_64 + select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 59d2e8a273e3..66c0a91362a7 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -595,7 +595,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) struct irq_pin_list *entry; unsigned int apicid_value; cpumask_t tmp; - struct irq_desc *desc; cfg = irq_cfg(irq); @@ -620,8 +619,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) break; entry = entry->next; } - desc = irq_to_desc(irq); - desc->affinity = cpumask; + irq_to_desc(irq)->affinity = cpumask; spin_unlock_irqrestore(&ioapic_lock, flags); } @@ -1055,8 +1053,6 @@ static int __assign_irq_vector(int irq) int vector, offset; struct irq_cfg *cfg; - BUG_ON((unsigned)irq >= nr_irqs); - cfg = irq_cfg(irq); if (cfg->vector > 0) return cfg->vector; @@ -1103,7 +1099,12 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) { struct irq_desc *desc; - desc = irq_to_desc(irq); + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) { desc->status |= IRQ_LEVEL; @@ -2330,16 +2331,19 @@ device_initcall(ioapic_init_sysfs); /* * Dynamic irq allocate and deallocation */ -int create_irq(void) +unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - int irq, new, vector = 0; + unsigned int irq, new, vector = 0; unsigned long flags; struct irq_cfg *cfg_new; - irq = -ENOSPC; + /* only can use bus/dev/fn.. when per_cpu vector is used */ + irq_want = nr_irqs - 1; + + irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new >= 0; new--) { + for (new = (nr_irqs - 1); new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2354,13 +2358,18 @@ int create_irq(void) } spin_unlock_irqrestore(&vector_lock, flags); - if (irq >= 0) { + if (irq > 0) { set_intr_gate(vector, interrupt[irq]); dynamic_irq_init(irq); } return irq; } +int create_irq(void) +{ + return create_irq_nr(nr_irqs - 1); +} + void destroy_irq(unsigned int irq) { unsigned long flags; @@ -2415,7 +2424,6 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; int vector; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2435,8 +2443,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif /* CONFIG_SMP */ @@ -2455,13 +2462,31 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { struct msi_msg msg; int irq, ret; - irq = create_irq(); - if (irq < 0) - return irq; + + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + + if (irq == 0) + return -1; ret = msi_compose_msg(dev, irq, &msg); if (ret < 0) { @@ -2510,7 +2535,6 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { unsigned int dest; cpumask_t tmp; - struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2521,8 +2545,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(mask); target_ht_irq(irq, dest); - desc = irq_to_desc(irq); - desc->affinity = mask; + irq_to_desc(irq)->affinity = mask; } #endif diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 576c5df6cad8..0a57e39159a8 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -224,9 +224,10 @@ unsigned int do_IRQ(struct pt_regs *regs) struct pt_regs *old_regs; /* high bit used in ret_from_ code */ int overflow, irq = ~regs->orig_ax; - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; - if (unlikely((unsigned)irq >= nr_irqs)) { + desc = irq_to_desc(irq); + if (unlikely(!desc)) { printk(KERN_EMERG "%s: cannot handle IRQ %d\n", __func__, irq); BUG(); @@ -263,6 +264,24 @@ int show_interrupts(struct seq_file *p, void *v) int i = *(loff_t *) v, j; struct irqaction * action; unsigned long flags; + unsigned int entries; + struct irq_desc *desc; + int tail = 0; + +#ifdef CONFIG_HAVE_SPARSE_IRQ + desc = (struct irq_desc *)v; + entries = -1U; + i = desc->irq; + if (!desc->next) + tail = 1; +#else + entries = nr_irqs - 1; + i = *(loff_t *) v; + if (i == nr_irqs) + tail = 1; + else + desc = irq_to_desc(i); +#endif if (i == 0) { seq_printf(p, " "); @@ -271,9 +290,8 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i < nr_irqs) { + if (i <= entries) { unsigned any_count = 0; - struct irq_desc *desc = irq_to_desc(i); spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP @@ -285,7 +303,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%3d: ",i); + seq_printf(p, "%#x: ",i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else @@ -304,7 +322,9 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); skip: spin_unlock_irqrestore(&desc->lock, flags); - } else if (i == nr_irqs) { + } + + if (tail) { seq_printf(p, "NMI: "); for_each_online_cpu(j) seq_printf(p, "%10u ", nmi_count(j)); @@ -396,15 +416,14 @@ void fixup_irqs(cpumask_t map) { unsigned int irq; static int warned; + struct irq_desc *desc; - for (irq = 0; irq < nr_irqs; irq++) { + for_each_irq_desc(irq, desc) { cpumask_t mask; - struct irq_desc *desc; if (irq == 2) continue; - desc = irq_to_desc(irq); cpus_and(mask, desc->affinity, map); if (any_online_cpu(mask) == NR_CPUS) { printk("Breaking affinity for irq %i\n", irq); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 65c1c9507707..ded09ac2642e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -69,6 +69,13 @@ void __init init_ISA_irqs (void) * 16 old-style INTA-cycle interrupts: */ for (i = 0; i < 16; i++) { + /* first time call this irq_desc */ + struct irq_desc *desc = irq_to_desc_alloc(i); + + desc->status = IRQ_DISABLED; + desc->action = NULL; + desc->depth = 1; + set_irq_chip_and_handler_name(i, &i8259A_chip, handle_level_irq, "XT"); } -- cgit v1.2.2 From 497c9a195db918d3f035e8cb3021e5d4d035516e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:28 -0700 Subject: x86: make 32bit support per_cpu vector so we can merge io_apic_32.c and io_apic_64.c v2: Use cpu_online_map as target cpus for bigsmp, just like 64-bit is doing. Also remove some unused TARGET_CPUS macro. v3: need to check if desc is null in smp_irq_move_cleanup also migration needs to reset vector too, so copy __target_IO_APIC_irq from 64bit. (the duplication will go away once the two files are unified.) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/io_apic_32.c | 719 ++++++++++++++++++++++++++--------------- arch/x86/kernel/irq_32.c | 18 +- arch/x86/kernel/irqinit_32.c | 41 ++- arch/x86/lguest/boot.c | 2 +- arch/x86/mach-generic/bigsmp.c | 4 + arch/x86/mach-generic/es7000.c | 14 + arch/x86/mach-generic/numaq.c | 14 + arch/x86/mach-generic/summit.c | 14 + 9 files changed, 547 insertions(+), 281 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index b21fbfaffe39..4d82171d0f9c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -629,7 +629,7 @@ ENTRY(interrupt) ENTRY(irq_entries_start) RING0_INT_FRAME vector=0 -.rept NR_IRQS +.rept NR_VECTORS ALIGN .if vector CFI_ADJUST_CFA_OFFSET -4 diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 66c0a91362a7..ea33d3c74970 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -60,7 +61,7 @@ atomic_t irq_mis_count; static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; static DEFINE_SPINLOCK(ioapic_lock); -DEFINE_SPINLOCK(vector_lock); +static DEFINE_SPINLOCK(vector_lock); int timer_through_8259 __initdata; @@ -100,28 +101,32 @@ struct irq_cfg { unsigned int irq; struct irq_cfg *next; struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; u8 vector; + u8 move_in_progress : 1; }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .vector = IRQ15_VECTOR, }, + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; static struct irq_cfg irq_cfg_init = { .irq = -1U, }; @@ -263,6 +268,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } +static int assign_irq_vector(int irq, cpumask_t mask); /* * Rough estimation of how many shared IRQs there are, can * be changed anytime. @@ -432,6 +438,65 @@ static void ioapic_mask_entry(int apic, int pin) spin_unlock_irqrestore(&ioapic_lock, flags); } +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; + io_apic_write(apic, 0x11 + pin*2, dest); + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin *2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + + cfg = irq_cfg(irq); + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + + dest = cpu_mask_to_apicid(tmp); + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + irq_to_desc(irq)->affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#endif /* CONFIG_SMP */ + /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are * shared ISA-space IRQs, so we have to support them. We are super @@ -586,45 +651,6 @@ static void clear_IO_APIC(void) clear_IO_APIC_pin(apic, pin); } -#ifdef CONFIG_SMP -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t cpumask) -{ - struct irq_cfg *cfg; - unsigned long flags; - int pin; - struct irq_pin_list *entry; - unsigned int apicid_value; - cpumask_t tmp; - - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - - cpus_and(tmp, cpumask, cpu_online_map); - if (cpus_empty(tmp)) - tmp = TARGET_CPUS; - - cpus_and(cpumask, tmp, CPU_MASK_ALL); - - apicid_value = cpu_mask_to_apicid(cpumask); - /* Prepare to do the io_apic_write */ - apicid_value = apicid_value << 24; - spin_lock_irqsave(&ioapic_lock, flags); - for (;;) { - if (!entry) - break; - pin = entry->pin; - io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value); - if (!entry->next) - break; - entry = entry->next; - } - irq_to_desc(irq)->affinity = cpumask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#endif /* CONFIG_SMP */ - #ifndef CONFIG_SMP void send_IPI_self(int vector) { @@ -789,32 +815,6 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - #if defined(CONFIG_EISA) || defined(CONFIG_MCA) /* * EISA Edge/Level control register, ELCR @@ -1046,47 +1046,138 @@ static inline int IO_APIC_irq_trigger(int irq) return 0; } +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} -static int __assign_irq_vector(int irq) +void unlock_vector_lock(void) { - static int current_vector = FIRST_DEVICE_VECTOR, current_offset; - int vector, offset; - struct irq_cfg *cfg; + spin_unlock(&vector_lock); +} - cfg = irq_cfg(irq); - if (cfg->vector > 0) - return cfg->vector; +static int __assign_irq_vector(int irq, cpumask_t mask) +{ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (vector == current_vector) - return -ENOSPC; - if (test_and_set_bit(vector, used_vectors)) - goto next; + cfg = irq_cfg(irq); - current_vector = vector; - current_offset = offset; - cfg->vector = vector; + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); - return vector; -} + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; -static int assign_irq_vector(int irq) -{ + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } + + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; + + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; + if (vector == SYSCALL_VECTOR) + goto next; + + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + } + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; +} + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int err; unsigned long flags; - int vector; spin_lock_irqsave(&vector_lock, flags); - vector = __assign_irq_vector(irq); + err = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return vector; + return err; +} + +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + cfg = irq_cfg(irq); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpus_clear(cfg->domain); +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + struct irq_cfg *cfg; + + /* Mark the inuse vectors */ + for_each_irq_cfg(cfg) { + if (!cpu_isset(cpu, cfg->domain)) + continue; + vector = cfg->vector; + irq = cfg->irq; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } } static struct irq_chip ioapic_chip; @@ -1095,7 +1186,7 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 -static void ioapic_register_intr(int irq, int vector, unsigned long trigger) +static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1115,79 +1206,109 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); } - set_intr_gate(vector, interrupt[irq]); } -static void __init setup_IO_APIC_irqs(void) +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) { + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest.logical.logical_dest = destination; + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + + return 0; +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg; struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + cfg = irq_cfg(irq); + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(mask, cfg->domain, mask); + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + irq, trigger, polarity); + + + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ - entry.dest.logical.logical_dest = - cpu_mask_to_apicid(TARGET_CPUS); - - idx = find_irq_entry(apic, pin, mp_INT); + idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - " IO-APIC (apicid-pin) %d-%d", - mp_ioapics[apic].mp_apicid, - pin); + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); first_notcon = 0; } else - apic_printk(APIC_VERBOSE, ", %d-%d", - mp_ioapics[apic].mp_apicid, pin); + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); continue; } - if (!first_notcon) { apic_printk(APIC_VERBOSE, " not connected.\n"); first_notcon = 1; } - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - } - irq = pin_2_irq(idx, apic, pin); - /* - * skip adding the timer int on secondary nodes, which causes - * a small but painful rift in the time-space continuum - */ - if (multi_timer_check(apic, irq)) - continue; - else - add_pin_to_irq(irq, apic, pin); - if (!apic && !IO_APIC_IRQ(irq)) - continue; + if (multi_timer_check(apic, irq)) + continue; - if (IO_APIC_IRQ(irq)) { - vector = assign_irq_vector(irq); - entry.vector = vector; - ioapic_register_intr(irq, vector, IOAPIC_AUTO); + add_pin_to_irq(irq, apic, pin); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); } } @@ -1221,7 +1342,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - ioapic_register_intr(0, vector, IOAPIC_EDGE); + ioapic_register_intr(0, IOAPIC_EDGE); /* * Add it to the IO-APIC irq-routing table: @@ -1805,8 +1926,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +static void irq_complete_move(unsigned int irq); static void ack_ioapic_irq(unsigned int irq) { + irq_complete_move(irq); move_native_irq(irq); ack_APIC_irq(); } @@ -1816,6 +1939,7 @@ static void ack_ioapic_quirk_irq(unsigned int irq) unsigned long v; int i; + irq_complete_move(irq); move_native_irq(irq); /* * It appears there is an erratum which affects at least version 0x11 @@ -1858,6 +1982,64 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } +#ifdef CONFIG_SMP +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, @@ -1940,7 +2122,7 @@ static struct irq_chip lapic_chip __read_mostly = { .ack = ack_lapic_irq, }; -static void lapic_register_intr(int irq, int vector) +static void lapic_register_intr(int irq) { struct irq_desc *desc; @@ -1948,7 +2130,6 @@ static void lapic_register_intr(int irq, int vector) desc->status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, "edge"); - set_intr_gate(vector, interrupt[irq]); } static void __init setup_nmi(void) @@ -2036,9 +2217,9 @@ static inline void __init unlock_ExtINT_logic(void) */ static inline void __init check_timer(void) { + struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; int no_pin1 = 0; - int vector; unsigned int ver; unsigned long flags; @@ -2051,8 +2232,7 @@ static inline void __init check_timer(void) * get/set the timer IRQ vector: */ disable_8259A_irq(0); - vector = assign_irq_vector(0); - set_intr_gate(vector, interrupt[0]); + assign_irq_vector(0, TARGET_CPUS); /* * As IRQ0 is to be enabled in the 8259A, the virtual @@ -2074,7 +2254,7 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " "apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + cfg->vector, apic1, pin1, apic2, pin2); /* * Some BIOS writers are clueless and report the ExtINTA @@ -2098,7 +2278,7 @@ static inline void __init check_timer(void) */ if (no_pin1) { add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, vector); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); if (timer_irq_works()) { @@ -2123,7 +2303,7 @@ static inline void __init check_timer(void) * legacy devices should be connected to IO APIC #0 */ replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, vector); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); unmask_IO_APIC_irq(0); enable_8259A_irq(0); if (timer_irq_works()) { @@ -2154,8 +2334,8 @@ static inline void __init check_timer(void) apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); - lapic_register_intr(0, vector); - apic_write(APIC_LVT0, APIC_DM_FIXED | vector); /* Fixed mode */ + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ enable_8259A_irq(0); if (timer_irq_works()) { @@ -2163,7 +2343,7 @@ static inline void __init check_timer(void) goto out; } disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | vector); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); apic_printk(APIC_QUIET, KERN_INFO @@ -2207,12 +2387,6 @@ out: void __init setup_IO_APIC(void) { - int i; - - /* Reserve all the system vectors. */ - for (i = first_system_vector; i < NR_VECTORS; i++) - set_bit(i, used_vectors); - enable_IO_APIC(); io_apic_irqs = ~PIC_IRQS; @@ -2334,12 +2508,14 @@ device_initcall(ioapic_init_sysfs); unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - unsigned int irq, new, vector = 0; + unsigned int irq, new; unsigned long flags; struct irq_cfg *cfg_new; +#ifndef CONFIG_HAVE_SPARSE_IRQ /* only can use bus/dev/fn.. when per_cpu vector is used */ irq_want = nr_irqs - 1; +#endif irq = 0; spin_lock_irqsave(&vector_lock, flags); @@ -2351,15 +2527,13 @@ unsigned int create_irq_nr(unsigned int irq_want) continue; if (!cfg_new) cfg_new = irq_cfg_alloc(new); - vector = __assign_irq_vector(new); - if (likely(vector > 0)) + if (__assign_irq_vector(new, TARGET_CPUS) == 0) irq = new; break; } spin_unlock_irqrestore(&vector_lock, flags); if (irq > 0) { - set_intr_gate(vector, interrupt[irq]); dynamic_irq_init(irq); } return irq; @@ -2377,8 +2551,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - clear_bit(irq_cfg(irq)->vector, used_vectors); - irq_cfg(irq)->vector = 0; + __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); } @@ -2388,57 +2561,65 @@ void destroy_irq(unsigned int irq) #ifdef CONFIG_PCI_MSI static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) { - int vector; + struct irq_cfg *cfg; + int err; unsigned dest; + cpumask_t tmp; - vector = assign_irq_vector(irq); - if (vector >= 0) { - dest = cpu_mask_to_apicid(TARGET_CPUS); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? -MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (err) + return err; - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? -MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(vector); - } - return vector; + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + + return err; } #ifdef CONFIG_SMP static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) { + struct irq_cfg *cfg; struct msi_msg msg; unsigned int dest; cpumask_t tmp; - int vector; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) - tmp = TARGET_CPUS; + return; - vector = assign_irq_vector(irq); - if (vector < 0) + if (assign_irq_vector(irq, mask)) return; - dest = cpu_mask_to_apicid(mask); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); read_msi_msg(irq, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(vector); + msg.data |= MSI_DATA_VECTOR(cfg->vector); msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; msg.address_lo |= MSI_ADDR_DEST_ID(dest); @@ -2517,15 +2698,15 @@ void arch_teardown_msi_irq(unsigned int irq) #ifdef CONFIG_SMP -static void target_ht_irq(unsigned int irq, unsigned int dest) +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { struct ht_irq_msg msg; fetch_ht_irq_msg(irq, &msg); - msg.address_lo &= ~(HT_IRQ_LOW_DEST_ID_MASK); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - msg.address_lo |= HT_IRQ_LOW_DEST_ID(dest); + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); write_ht_irq_msg(irq, &msg); @@ -2533,18 +2714,22 @@ static void target_ht_irq(unsigned int irq, unsigned int dest) static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) { + struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) - tmp = TARGET_CPUS; + return; - cpus_and(mask, tmp, CPU_MASK_ALL); + if (assign_irq_vector(irq, mask)) + return; - dest = cpu_mask_to_apicid(mask); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); - target_ht_irq(irq, dest); + target_ht_irq(irq, dest, cfg->vector); irq_to_desc(irq)->affinity = mask; } #endif @@ -2562,16 +2747,18 @@ static struct irq_chip ht_irq_chip = { int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) { - int vector; + struct irq_cfg *cfg; + int err; + cpumask_t tmp; - vector = assign_irq_vector(irq); - if (vector >= 0) { + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if ( !err) { struct ht_irq_msg msg; unsigned dest; - cpumask_t tmp; - cpus_clear(tmp); - cpu_set(vector >> 8, tmp); + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); @@ -2579,7 +2766,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) msg.address_lo = HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(vector) | + HT_IRQ_LOW_VECTOR(cfg->vector) | ((INT_DEST_MODE == 0) ? HT_IRQ_LOW_DM_PHYSICAL : HT_IRQ_LOW_DM_LOGICAL) | @@ -2594,7 +2781,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); } - return vector; + return err; } #endif /* CONFIG_HT_IRQ */ @@ -2705,50 +2892,21 @@ int __init io_apic_get_redir_entries(int ioapic) } -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int edge_level, int active_high_low) +int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int polarity) { - struct IO_APIC_route_entry entry; - if (!IO_APIC_IRQ(irq)) { printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } - /* - * Generate a PCI IRQ routing entry and program the IOAPIC accordingly. - * Note that we mask (disable) IRQs now -- these get enabled when the - * corresponding device driver registers for this IRQ. - */ - - memset(&entry, 0, sizeof(entry)); - - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.trigger = edge_level; - entry.polarity = active_high_low; - entry.mask = 1; - /* * IRQs < 16 are already in the irq_2_pin[] map */ if (irq >= 16) add_pin_to_irq(irq, ioapic, pin); - entry.vector = assign_irq_vector(irq); - - apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry " - "(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic, - mp_ioapics[ioapic].mp_apicid, pin, entry.vector, irq, - edge_level, active_high_low); - - ioapic_register_intr(irq, entry.vector, edge_level); - - if (!ioapic && (irq < 16)) - disable_8259A_irq(irq); - - ioapic_write_entry(ioapic, pin, entry); + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); return 0; } @@ -2774,6 +2932,47 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) #endif /* CONFIG_ACPI */ +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; + struct irq_desc *desc; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + cfg = irq_cfg(irq); + if (!cfg->vector) + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); + else { + desc = irq_to_desc(irq); + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + } + + } +} +#endif + static int __init parse_disable_timer_pin_1(char *arg) { disable_timer_pin_1 = 1; diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0a57e39159a8..b51ffdcfa31a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -223,21 +223,25 @@ unsigned int do_IRQ(struct pt_regs *regs) { struct pt_regs *old_regs; /* high bit used in ret_from_ code */ - int overflow, irq = ~regs->orig_ax; + int overflow; + unsigned vector = ~regs->orig_ax; struct irq_desc *desc; + unsigned irq; - desc = irq_to_desc(irq); - if (unlikely(!desc)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d\n", - __func__, irq); - BUG(); - } old_regs = set_irq_regs(regs); irq_enter(); + irq = __get_cpu_var(vector_irq)[vector]; overflow = check_stack_overflow(); + desc = irq_to_desc(irq); + if (unlikely(!desc)) { + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x\n", + __func__, irq, vector); + BUG(); + } + if (!execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ded09ac2642e..9092103a18eb 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -90,6 +90,27 @@ static struct irqaction irq2 = { .name = "cascade", }; +DEFINE_PER_CPU(vector_irq_t, vector_irq) = { + [0 ... IRQ0_VECTOR - 1] = -1, + [IRQ0_VECTOR] = 0, + [IRQ1_VECTOR] = 1, + [IRQ2_VECTOR] = 2, + [IRQ3_VECTOR] = 3, + [IRQ4_VECTOR] = 4, + [IRQ5_VECTOR] = 5, + [IRQ6_VECTOR] = 6, + [IRQ7_VECTOR] = 7, + [IRQ8_VECTOR] = 8, + [IRQ9_VECTOR] = 9, + [IRQ10_VECTOR] = 10, + [IRQ11_VECTOR] = 11, + [IRQ12_VECTOR] = 12, + [IRQ13_VECTOR] = 13, + [IRQ14_VECTOR] = 14, + [IRQ15_VECTOR] = 15, + [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1 +}; + /* Overridden in paravirt.c */ void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); @@ -105,22 +126,14 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { - int vector = FIRST_EXTERNAL_VECTOR + i; - if (i >= nr_irqs) - break; + for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { /* SYSCALL_VECTOR was reserved in trap_init. */ - if (!test_bit(vector, used_vectors)) - set_intr_gate(vector, interrupt[i]); + if (i != SYSCALL_VECTOR) + set_intr_gate(i, interrupt[i]); } -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) - /* - * IRQ0 must be given a fixed assignment and initialized, - * because it's used before the IO-APIC is set up. - */ - set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); +#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper * IPI, driven by wakeup. @@ -135,6 +148,9 @@ void __init native_init_IRQ(void) /* IPI for single call function */ set_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + + /* Low priority IPI to cleanup after moving an irq */ + set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); #endif #ifdef CONFIG_X86_LOCAL_APIC @@ -168,3 +184,4 @@ void __init native_init_IRQ(void) irq_ctx_init(smp_processor_id()); } + diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 65f0b8a47bed..48ee4f9435f4 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -582,7 +582,7 @@ static void __init lguest_init_IRQ(void) for (i = 0; i < LGUEST_IRQS; i++) { int vector = FIRST_EXTERNAL_VECTOR + i; if (vector != SYSCALL_VECTOR) { - set_intr_gate(vector, interrupt[i]); + set_intr_gate(vector, interrupt[vector]); set_irq_chip_and_handler_name(i, &lguest_irq_controller, handle_level_irq, "level"); diff --git a/arch/x86/mach-generic/bigsmp.c b/arch/x86/mach-generic/bigsmp.c index df37fc9d6a26..3c3b471ea496 100644 --- a/arch/x86/mach-generic/bigsmp.c +++ b/arch/x86/mach-generic/bigsmp.c @@ -41,6 +41,10 @@ static const struct dmi_system_id bigsmp_dmi_table[] = { { } }; +static cpumask_t vector_allocation_domain(int cpu) +{ + return cpumask_of_cpu(cpu); +} static int probe_bigsmp(void) { diff --git a/arch/x86/mach-generic/es7000.c b/arch/x86/mach-generic/es7000.c index 6513d41ea21e..28459cab3ddb 100644 --- a/arch/x86/mach-generic/es7000.c +++ b/arch/x86/mach-generic/es7000.c @@ -75,4 +75,18 @@ static int __init acpi_madt_oem_check(char *oem_id, char *oem_table_id) } #endif +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic __initdata_refok apic_es7000 = APIC_INIT("es7000", probe_es7000); diff --git a/arch/x86/mach-generic/numaq.c b/arch/x86/mach-generic/numaq.c index 8cf58394975e..71a309b122e6 100644 --- a/arch/x86/mach-generic/numaq.c +++ b/arch/x86/mach-generic/numaq.c @@ -38,4 +38,18 @@ static int acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_numaq = APIC_INIT("NUMAQ", probe_numaq); diff --git a/arch/x86/mach-generic/summit.c b/arch/x86/mach-generic/summit.c index 6ad6b67a723d..6272b5e69da6 100644 --- a/arch/x86/mach-generic/summit.c +++ b/arch/x86/mach-generic/summit.c @@ -23,4 +23,18 @@ static int probe_summit(void) return 0; } +static cpumask_t vector_allocation_domain(int cpu) +{ + /* Careful. Some cpus do not strictly honor the set of cpus + * specified in the interrupt destination when using lowest + * priority interrupt delivery mode. + * + * In particular there was a hyperthreading cpu observed to + * deliver interrupts to the wrong hyperthread when only one + * hyperthread was specified in the interrupt desitination. + */ + cpumask_t domain = { { [0] = APIC_ALL_CPUS, } }; + return domain; +} + struct genapic apic_summit = APIC_INIT("summit", probe_summit); -- cgit v1.2.2 From 7a959cff725872ce9c3a534f10724d7bb2cb3c4a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:32 -0700 Subject: x86: add debug info for 32bit sparse_irq so could figure out bugs where we get an interrupt, but vector_irq is not initialized yet. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 8 ++++++-- arch/x86/kernel/io_apic_64.c | 2 ++ arch/x86/kernel/irq_32.c | 4 ++-- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index ea33d3c74970..3001924bdd36 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1114,8 +1114,12 @@ next: cfg->move_in_progress = 1; cfg->old_domain = cfg->domain; } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; + printk(KERN_DEBUG "assign_irq_vector: irq %d vector %#x cpu ", irq, vector); + for_each_cpu_mask_nr(new_cpu, new_mask) { + per_cpu(vector_irq, new_cpu)[vector] = irq; + printk(KERN_CONT " %d ", new_cpu); + } + printk(KERN_CONT "\n"); cfg->vector = vector; cfg->domain = domain; return 0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b0d4abc55a11..30d2e3811313 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -1394,6 +1394,8 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b51ffdcfa31a..cc929f2f84f1 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -237,8 +237,8 @@ unsigned int do_IRQ(struct pt_regs *regs) desc = irq_to_desc(irq); if (unlikely(!desc)) { - printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x\n", - __func__, irq, vector); + printk(KERN_EMERG "%s: cannot handle IRQ %d vector %#x cpu %d\n", + __func__, irq, vector, smp_processor_id()); BUG(); } -- cgit v1.2.2 From d83e94acd95789829804fd9e442bd18975f4dc89 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:33 -0700 Subject: x86, io-apic: remove union about dest for log/phy let user decide the meaning of the bits. This unifies the 32-bit and 64-bit io-apic code a bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3001924bdd36..353e586822af 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1224,7 +1224,7 @@ static int setup_ioapic_entry(int apic, int irq, entry->delivery_mode = INT_DELIVERY_MODE; entry->dest_mode = INT_DEST_MODE; - entry->dest.logical.logical_dest = destination; + entry->dest = destination; entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; @@ -1336,7 +1336,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, */ entry.dest_mode = INT_DEST_MODE; entry.mask = 1; /* mask IRQ now */ - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); entry.delivery_mode = INT_DELIVERY_MODE; entry.polarity = 0; entry.trigger = 0; @@ -1425,19 +1425,15 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG ".... IRQ redirection table:\n"); - printk(KERN_DEBUG " NR Log Phy Mask Trig IRR Pol" - " Stat Dest Deli Vect: \n"); + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); for (i = 0; i <= reg_01.bits.entries; i++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %03X %02X ", - i, - entry.dest.logical.logical_dest, - entry.dest.physical.physical_dest - ); + printk(KERN_DEBUG " %02x %02X ", i, entry.dest); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", entry.mask, @@ -1717,7 +1713,7 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest.physical.physical_dest = read_apic_id(); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: @@ -2185,7 +2181,7 @@ static inline void __init unlock_ExtINT_logic(void) entry1.dest_mode = 0; /* physical delivery */ entry1.mask = 0; /* unmask IRQ now */ - entry1.dest.physical.physical_dest = hard_smp_processor_id(); + entry1.dest = hard_smp_processor_id(); entry1.delivery_mode = dest_ExtINT; entry1.polarity = entry0.polarity; entry1.trigger = 0; -- cgit v1.2.2 From 1d02519242c23450b043e5e8a9e3cb84a8666fe3 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:34 -0700 Subject: x86: ordering functions in io_apic_32.c prepare for unification: try to make functions be of the same order to io_apic_64.c. v2: add calling setup_msi_irq back to arch_setup_msi_irq Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 178 +++++++++++++++++++++++-------------------- 1 file changed, 94 insertions(+), 84 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 353e586822af..9531ef33362b 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -1029,23 +1029,6 @@ static int pin_2_irq(int idx, int apic, int pin) return irq; } -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} - void lock_vector_lock(void) { /* Used to the online set of cpus does not change @@ -1190,6 +1173,23 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} + static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1926,55 +1926,6 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } -static void irq_complete_move(unsigned int irq); -static void ack_ioapic_irq(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -static void ack_ioapic_quirk_irq(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} - static int ioapic_retrigger_irq(unsigned int irq) { send_IPI_self(irq_cfg(irq)->vector); @@ -2040,13 +1991,61 @@ static void irq_complete_move(unsigned int irq) static inline void irq_complete_move(unsigned int irq) {} #endif +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); +/* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} + static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", .startup = startup_ioapic_irq, .mask = mask_IO_APIC_irq, .unmask = unmask_IO_APIC_irq, - .ack = ack_ioapic_irq, - .eoi = ack_ioapic_quirk_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP .set_affinity = set_ioapic_affinity_irq, #endif @@ -2094,11 +2093,6 @@ static inline void init_IO_APIC_traps(void) * The local APIC irq-chip implementation: */ -static void ack_lapic_irq(unsigned int irq) -{ - ack_APIC_irq(); -} - static void mask_lapic_irq(unsigned int irq) { unsigned long v; @@ -2115,6 +2109,11 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } +static void ack_lapic_irq(unsigned int irq) +{ + ack_APIC_irq(); +} + static struct irq_chip lapic_chip __read_mostly = { .name = "local-APIC", .mask = mask_lapic_irq, @@ -2636,13 +2635,31 @@ static struct irq_chip msi_chip = { .name = "PCI-MSI", .unmask = unmask_msi_irq, .mask = mask_msi_irq, - .ack = ack_ioapic_irq, + .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_msi_irq_affinity, #endif .retrigger = ioapic_retrigger_irq, }; + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) { unsigned int irq; @@ -2657,7 +2674,6 @@ static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - struct msi_msg msg; int irq, ret; unsigned int irq_want; @@ -2669,17 +2685,11 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) if (irq == 0) return -1; - ret = msi_compose_msg(dev, irq, &msg); + ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; - } - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, - "edge"); + } return 0; } @@ -2738,7 +2748,7 @@ static struct irq_chip ht_irq_chip = { .name = "PCI-HT", .mask = mask_ht_irq, .unmask = unmask_ht_irq, - .ack = ack_ioapic_irq, + .ack = ack_apic_edge, #ifdef CONFIG_SMP .set_affinity = set_ht_irq_affinity, #endif -- cgit v1.2.2 From 8ea5371baa82db452a8d93e9977b418d30944e32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:35 -0700 Subject: x86: ordering functions in io_apic_64.c try to make functions have the same order between 32-bit and 64-bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 67 ++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 30d2e3811313..07e1e45ee026 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -409,40 +409,6 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -627,6 +593,39 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} #define DO_ACTION(name,R,ACTION, FINAL) \ \ -- cgit v1.2.2 From efa2559f65167989f1893cb065e3126d4f13ba60 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:36 -0700 Subject: x86: order variables in io_apic_xx.c move first_system_vector to apic_64.c. also add #ifdef CONFIG_INTR_REMAP to prepare 32 bit to use same file. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 5 ++ arch/x86/kernel/io_apic_32.c | 79 ++++++++++----------- arch/x86/kernel/io_apic_64.c | 160 +++++++++++++++++++++++-------------------- 3 files changed, 127 insertions(+), 117 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 94ddb69ae15e..4d7a188025b3 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -59,6 +60,10 @@ int x2apic_preenabled; int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + /* * Debug level, exported for io_apic.c */ diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 9531ef33362b..3010bdd3352d 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -54,24 +54,22 @@ #define __apicdebuginit(type) static type __init -int (*ioapic_renumber_irq)(int ioapic, int irq); -atomic_t irq_mis_count; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int timer_through_8259 __initdata; - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes */ int sis_apic_bug = -1; +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + /* * # of IRQ routing registers */ @@ -93,7 +91,15 @@ int mp_bus_id_to_type[MAX_MP_BUSSES]; DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); -static int disable_timer_pin_1 __initdata; +int skip_ioapic_setup; + +static int __init parse_noapic(char *arg) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); struct irq_cfg; struct irq_pin_list; @@ -268,13 +274,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } -static int assign_irq_vector(int irq, cpumask_t mask); -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - /* * This is performance-critical, we want to do it O(1) * @@ -465,6 +464,9 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) entry = entry->next; } } + +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { struct irq_cfg *cfg; @@ -677,7 +679,6 @@ void send_IPI_self(int vector) #define MAX_PIRQS 8 static int pirq_entries [MAX_PIRQS]; static int pirqs_enabled; -int skip_ioapic_setup; static int __init ioapic_pirq_setup(char *str) { @@ -981,6 +982,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -1621,6 +1623,9 @@ __apicdebuginit(int) print_all_ICs(void) fs_initcall(print_all_ICs); +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + static void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; @@ -1998,6 +2003,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { unsigned long v; @@ -2208,6 +2214,17 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; + +static int __init parse_disable_timer_pin_1(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", parse_disable_timer_pin_1); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2983,28 +3000,6 @@ void __init setup_ioapic_dest(void) } #endif -static int __init parse_disable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); - -static int __init parse_enable_timer_pin_1(char *arg) -{ - disable_timer_pin_1 = -1; - return 0; -} -early_param("enable_timer_pin_1", parse_enable_timer_pin_1); - -static int __init parse_noapic(char *arg) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 07e1e45ee026..847aa7987645 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -57,6 +57,47 @@ #define __apicdebuginit(type) static type __init +int ioapic_force; + +int sis_apic_bug; /* not actually supported, dummy for compile */ + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* I/O APIC entries */ +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + + struct irq_cfg; struct irq_pin_list; struct irq_cfg { @@ -228,53 +269,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) return cfg; } -static int assign_irq_vector(int irq, cpumask_t mask); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -int sis_apic_bug; /* not actually supported, dummy for compile */ - -static int no_timer_check; - -static int disable_timer_pin_1 __initdata; - -int timer_through_8259 __initdata; - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC RTE contents at the OS boot up */ -struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ - -int pin_map_size; - /* * This is performance-critical, we want to do it O(1) * @@ -481,12 +475,16 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) apic = entry->apic; pin = entry->pin; +#ifdef CONFIG_INTR_REMAP /* * With interrupt-remapping, destination information comes * from interrupt-remapping table entry. */ if (!irq_remapped(irq)) io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; @@ -497,6 +495,8 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) } } +static int assign_irq_vector(int irq, cpumask_t mask); + static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { struct irq_cfg *cfg = irq_cfg(irq); @@ -533,7 +533,6 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) * shared ISA-space IRQs, so we have to support them. We are super * fast in the common case, and fast for shared ISA-space IRQs. */ -int first_free_entry; static void add_pin_to_irq(unsigned int irq, int apic, int pin) { struct irq_cfg *cfg; @@ -679,6 +678,10 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + /* * Saves and masks all the unmasked IO-APIC RTE's */ @@ -741,25 +744,7 @@ void reinit_intr_remapped_IO_APIC(int intr_remapping) */ restore_IO_APIC_setup(); } - -int skip_ioapic_setup; -int ioapic_force; - -static int __init parse_noapic(char *str) -{ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - +#endif /* * Find the IRQ entry number of a certain pin. @@ -1327,8 +1312,10 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) return; +#endif memset(&entry, 0, sizeof(entry)); @@ -1601,6 +1588,9 @@ __apicdebuginit(int) print_all_ICs(void) fs_initcall(print_all_ICs); +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; @@ -1695,6 +1685,15 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +static int no_timer_check; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + /* * There is a nasty bug in some older SMP boards, their mptable lies * about the timer IRQ. We do the following to work around the situation: @@ -1708,6 +1707,9 @@ static int __init timer_irq_works(void) unsigned long t1 = jiffies; unsigned long flags; + if (no_timer_check) + return 1; + local_save_flags(flags); local_irq_enable(); /* Let ten ticks pass... */ @@ -2239,6 +2241,17 @@ static inline void __init unlock_ExtINT_logic(void) ioapic_write_entry(apic, pin, entry0); } +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + /* * This code may look a bit paranoid, but it's supposed to cooperate with * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ @@ -2286,8 +2299,10 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2305,7 +2320,7 @@ static inline void __init check_timer(void) setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); } unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { + if (timer_irq_works()) { if (nmi_watchdog == NMI_IO_APIC) { setup_nmi(); enable_8259A_irq(0); @@ -2314,8 +2329,10 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP if (intr_remapping_enabled) panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2391,13 +2408,6 @@ out: local_irq_restore(flags); } -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - /* * Traditionally ISA IRQ2 is the cascade IRQ, and is not available * to devices. However there may be an I/O APIC pin available for -- cgit v1.2.2 From d4057bdb6a3bb85dd44f9f39f41eac53696fd637 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:38 -0700 Subject: x86: make headers files the same in io_apic_xx.c also make no_timer_check to be global on 64 bit, because vmi_32 is using that. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 15 ++++++++++++--- arch/x86/kernel/io_apic_64.c | 12 +++++++++--- 2 files changed, 21 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3010bdd3352d..48184e126f2c 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -25,28 +25,37 @@ #include #include #include -#include +#include #include #include #include #include #include -#include #include #include #include #include -#include /* time_after() */ +#include /* time_after() */ +#ifdef CONFIG_ACPI +#include +#endif +#include +#include +#include #include #include #include +#include +#include +#include #include #include #include #include #include #include +#include #include #include diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 847aa7987645..3c66e5a8e899 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -27,12 +27,15 @@ #include #include #include +#include #include +#include #include #include #include -#include -#include +#include +#include +#include /* time_after() */ #ifdef CONFIG_ACPI #include #endif @@ -46,14 +49,17 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include +#include #define __apicdebuginit(type) static type __init @@ -1685,7 +1691,7 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } -static int no_timer_check; +int no_timer_check __initdata; static int __init notimercheck(char *s) { -- cgit v1.2.2 From f876d213a59c363d2492e399cc6c24edd6f3c368 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:39 -0700 Subject: x86: make 64 handle sis_apic_bug like the 32 bit do we have 64bit system with sis chipset? [ mingo@elte.hu: nope, the problem chipset was 32-bit only. The code symmetry is good nevertheless. ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_64.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 3c66e5a8e899..915af9b87aa4 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -65,7 +65,11 @@ int ioapic_force; -int sis_apic_bug; /* not actually supported, dummy for compile */ +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); @@ -373,9 +377,11 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. */ -static inline void io_apic_modify(unsigned int apic, unsigned int value) +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -494,7 +500,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; entry = entry->next; @@ -624,7 +630,7 @@ static inline void io_apic_sync(unsigned int apic) pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ reg ACTION; \ - io_apic_modify(entry->apic, reg); \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ FINAL; \ if (!entry->next) \ break; \ @@ -2450,6 +2456,20 @@ void __init setup_IO_APIC(void) check_timer(); } +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; -- cgit v1.2.2 From aa45f97b1bb40adae1288669e73350907ffae85e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:40 -0700 Subject: x86: remove ioapic_force no user left. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 1 - arch/x86/kernel/io_apic_64.c | 2 -- 2 files changed, 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4d7a188025b3..cd860856a0b7 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1813,7 +1813,6 @@ static int __init apic_set_verbosity(char *arg) if (!arg) { #ifdef CONFIG_X86_64 skip_ioapic_setup = 0; - ioapic_force = 1; return 0; #endif return -EINVAL; diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index 915af9b87aa4..b70fd8232185 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -63,8 +63,6 @@ #define __apicdebuginit(type) static type __init -int ioapic_force; - /* * Is the SiS APIC rmw bug present ? * -1 = don't know, 0 = no, 1 = yes -- cgit v1.2.2 From 047c8fdb8718890e3340073b178d0859d0c7f91f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:41 -0700 Subject: x86: make io_apic_64.c and io_apic_32.c the same all the same except INTR_REMAPPING related and ioapic io resource. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 213 +++++++++++++- arch/x86/kernel/io_apic_64.c | 663 ++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 829 insertions(+), 47 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 48184e126f2c..3ed36041c81e 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -123,7 +123,6 @@ struct irq_cfg { u8 move_in_progress : 1; }; - /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ static struct irq_cfg irq_cfg_legacy[] __initdata = { [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, @@ -391,6 +390,38 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } +#ifdef CONFIG_X86_64 +static bool io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + int pin; + + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + if (!entry->next) + break; + entry = entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif + union entry_union { struct { u32 w1, w2; }; struct IO_APIC_route_entry entry; @@ -483,17 +514,15 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned int dest; cpumask_t tmp; - cfg = irq_cfg(irq); - cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); /* * Only the high 8 bits are valid. @@ -572,6 +601,54 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + +/* mask = 0 */ +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) + +#else + static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) { struct irq_cfg *cfg; @@ -620,6 +697,8 @@ static void __unmask_and_level_IO_APIC_irq(unsigned int irq) IO_APIC_REDIR_MASKED); } +#endif + static void mask_IO_APIC_irq(unsigned int irq) { unsigned long flags; @@ -1055,6 +1134,17 @@ void unlock_vector_lock(void) static int __assign_irq_vector(int irq, cpumask_t mask) { + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; unsigned int old_vector; int cpu; @@ -1095,9 +1185,13 @@ next: } if (unlikely(current_vector == vector)) continue; - if (vector == SYSCALL_VECTOR) +#ifdef CONFIG_X86_64 + if (vector == IA32_SYSCALL_VECTOR) goto next; - +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; @@ -1184,6 +1278,7 @@ static struct irq_chip ioapic_chip; #define IOAPIC_EDGE 0 #define IOAPIC_LEVEL 1 +#ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { int apic, idx, pin; @@ -1200,6 +1295,12 @@ static inline int IO_APIC_irq_trigger(int irq) */ return 0; } +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif static void ioapic_register_intr(int irq, unsigned long trigger) { @@ -1212,15 +1313,18 @@ static void ioapic_register_intr(int irq, unsigned long trigger) desc = irq_to_desc_alloc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) { + trigger == IOAPIC_LEVEL) desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - } else { - desc->status &= ~IRQ_LEVEL; + else set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); - } } static int setup_ioapic_entry(int apic, int irq, @@ -1662,7 +1766,6 @@ static void __init enable_IO_APIC(void) struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, pin); - /* If the interrupt line is enabled and in ExtInt mode * I have found the pin where the i8259 is connected. */ @@ -2012,6 +2115,60 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +#ifdef CONFIG_X86_64 +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + + irq_complete_move(irq); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(desc->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq, desc); + unmask_IO_APIC_irq(irq); + } +} +#else atomic_t irq_mis_count; static void ack_apic_level(unsigned int irq) { @@ -2053,6 +2210,7 @@ static void ack_apic_level(unsigned int irq) spin_unlock(&ioapic_lock); } } +#endif static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", @@ -2224,7 +2382,7 @@ static inline void __init unlock_ExtINT_logic(void) } static int disable_timer_pin_1 __initdata; - +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ static int __init parse_disable_timer_pin_1(char *arg) { disable_timer_pin_1 = 1; @@ -2244,9 +2402,9 @@ static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; - int no_pin1 = 0; - unsigned int ver; unsigned long flags; + unsigned int ver; + int no_pin1 = 0; local_irq_save(flags); @@ -2550,6 +2708,7 @@ unsigned int create_irq_nr(unsigned int irq_want) cfg_new = irq_cfg(new); if (cfg_new && cfg_new->vector != 0) continue; + /* check if need to create one */ if (!cfg_new) cfg_new = irq_cfg_alloc(new); if (__assign_irq_vector(new, TARGET_CPUS) == 0) @@ -2720,6 +2879,32 @@ int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) return 0; } +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + + void arch_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c index b70fd8232185..940c4167b325 100644 --- a/arch/x86/kernel/io_apic_64.c +++ b/arch/x86/kernel/io_apic_64.c @@ -94,18 +94,22 @@ struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; /* # of MP IRQ source entries */ int mp_irq_entries; +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; static int __init parse_noapic(char *str) { + /* disable IO-APIC */ disable_ioapic_setup(); return 0; } early_param("noapic", parse_noapic); - struct irq_cfg; struct irq_pin_list; struct irq_cfg { @@ -374,6 +378,8 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i /* * Re-write a value: to be used for read-modify-write * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register */ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { @@ -383,6 +389,7 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } +#ifdef CONFIG_X86_64 static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; @@ -412,6 +419,7 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } +#endif union entry_union { struct { u32 w1, w2; }; @@ -509,7 +517,7 @@ static int assign_irq_vector(int irq, cpumask_t mask); static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg(irq); + struct irq_cfg *cfg; unsigned long flags; unsigned int dest; cpumask_t tmp; @@ -519,12 +527,12 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) if (cpus_empty(tmp)) return; + cfg = irq_cfg(irq); if (assign_irq_vector(irq, mask)) return; cpus_and(tmp, cfg->domain, mask); dest = cpu_mask_to_apicid(tmp); - /* * Only the high 8 bits are valid. */ @@ -536,7 +544,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } -#endif +#endif /* CONFIG_SMP */ /* * The common case is 1:1 IRQ<->pin mappings. Sometimes there are @@ -602,6 +610,7 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } +#ifdef CONFIG_X86_64 /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC @@ -647,6 +656,58 @@ DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) /* mask = 0 */ DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +#else + +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + unsigned int pin, reg; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED); +} + +#endif + static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -688,6 +749,64 @@ static void clear_IO_APIC (void) clear_IO_APIC_pin(apic, pin); } +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + #ifdef CONFIG_INTR_REMAP /* I/O APIC RTE contents at the OS boot up */ static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; @@ -861,18 +980,51 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) return best_guess; } +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + /* ISA interrupts are always polarity zero edge triggered, * when listed as conforming in the MP table. */ #define default_ISA_trigger(idx) (0) #define default_ISA_polarity(idx) (0) +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + /* PCI interrupts are always polarity one level triggered, * when listed as conforming in the MP table. */ #define default_PCI_trigger(idx) (1) #define default_PCI_polarity(idx) (1) +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + static int MPBIOS_polarity(int idx) { int bus = mp_irqs[idx].mp_srcbus; @@ -930,6 +1082,36 @@ static int MPBIOS_trigger(int idx) trigger = default_ISA_trigger(idx); else trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; case 1: /* edge */ { @@ -967,6 +1149,7 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } +int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { int irq, i; @@ -988,7 +1171,32 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + return irq; } @@ -1058,8 +1266,13 @@ next: } if (unlikely(current_vector == vector)) continue; +#ifdef CONFIG_X86_64 if (vector == IA32_SYSCALL_VECTOR) goto next; +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif for_each_cpu_mask_nr(new_cpu, new_mask) if (per_cpu(vector_irq, new_cpu)[vector] != -1) goto next; @@ -1140,6 +1353,34 @@ static struct irq_chip ioapic_chip; static struct irq_chip ir_ioapic_chip; #endif +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; @@ -1150,7 +1391,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) else desc = irq_to_desc_alloc(irq); - if (trigger) + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) desc->status |= IRQ_LEVEL; else desc->status &= ~IRQ_LEVEL; @@ -1168,7 +1410,8 @@ static void ioapic_register_intr(int irq, unsigned long trigger) return; } #endif - if (trigger) + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); @@ -1303,6 +1546,10 @@ static void __init setup_IO_APIC_irqs(void) } irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif add_pin_to_irq(irq, apic, pin); setup_IO_APIC_irq(apic, pin, irq, @@ -1360,6 +1607,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_00 reg_00; union IO_APIC_reg_01 reg_01; union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; @@ -1384,6 +1632,8 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); @@ -1399,11 +1649,27 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - if (reg_01.bits.version >= 0x10) { + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); } + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + printk(KERN_DEBUG ".... IRQ redirection table:\n"); printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" @@ -1475,7 +1741,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base) __apicdebuginit(void) print_local_APIC(void *dummy) { unsigned int v, ver, maxlvt; - unsigned long icr; + u64 icr; if (apic_verbosity == APIC_QUIET) return; @@ -1492,11 +1758,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); - v = apic_read(APIC_PROCPRI); - printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + v = apic_read(APIC_PROCPRI); + printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); + } v = apic_read(APIC_EOI); printk(KERN_DEBUG "... APIC EOI: %08x\n", v); @@ -1516,8 +1784,13 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } icr = apic_icr_read(); printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); @@ -1608,6 +1881,13 @@ void __init enable_IO_APIC(void) int apic; unsigned long flags; +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif + /* * The number of IO-APIC IRQ registers (== #pins): */ @@ -1636,6 +1916,10 @@ void __init enable_IO_APIC(void) } found_i8259: /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ i8259_pin = find_isa_irq_pin(0, mp_ExtINT); i8259_apic = find_isa_irq_apic(0, mp_ExtINT); /* Trust the MP table if nothing is setup in the hardware */ @@ -1695,6 +1979,122 @@ void disable_IO_APIC(void) disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + int no_timer_check __initdata; static int __init notimercheck(char *s) @@ -1780,8 +2180,10 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { + struct irq_cfg *cfg = irq_cfg(irq); unsigned long flags; @@ -1791,6 +2193,14 @@ static int ioapic_retrigger_irq(unsigned int irq) return 1; } +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif /* * Level and edge triggered IO-APIC interrupts need different handling, @@ -1952,7 +2362,9 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) { unsigned vector, me; ack_APIC_irq(); +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); me = smp_processor_id(); @@ -2024,6 +2436,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } +#ifdef CONFIG_X86_64 static void ack_apic_level(unsigned int irq) { int do_unmask_irq = 0; @@ -2076,6 +2489,49 @@ static void ack_apic_level(unsigned int irq) unmask_IO_APIC_irq(irq); } } +#else +atomic_t irq_mis_count; +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} +#endif static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", @@ -2141,20 +2597,24 @@ static inline void init_IO_APIC_traps(void) } } -static void unmask_lapic_irq(unsigned int irq) +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); } -static void mask_lapic_irq(unsigned int irq) +static void unmask_lapic_irq(unsigned int irq) { unsigned long v; v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } static void ack_lapic_irq (unsigned int irq) @@ -2182,19 +2642,19 @@ static void lapic_register_intr(int irq) static void __init setup_nmi(void) { /* - * Dirty trick to enable the NMI watchdog ... + * Dirty trick to enable the NMI watchdog ... * We put the 8259A master into AEOI mode and * unmask on all local APICs LVT0 as NMI. * * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') * is from Maciej W. Rozycki - so we do not have to EOI from * the NMI handler or the timer interrupt. - */ - printk(KERN_INFO "activating NMI Watchdog ..."); + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); enable_NMI_through_LVT0(); - printk(" done.\n"); + apic_printk(APIC_VERBOSE, " done.\n"); } /* @@ -2211,12 +2671,17 @@ static inline void __init unlock_ExtINT_logic(void) unsigned char save_control, save_freq_select; pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } apic = find_isa_irq_apic(8, mp_INT); - if (pin == -1) + if (apic == -1) { + WARN_ON_ONCE(1); return; + } entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); memset(&entry1, 0, sizeof(entry1)); @@ -2268,17 +2733,21 @@ int timer_through_8259 __initdata; * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. * - * FIXME: really need to revamp this for modern platforms only. + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { struct irq_cfg *cfg = irq_cfg(0); int apic1, pin1, apic2, pin2; unsigned long flags; + unsigned int ver; int no_pin1 = 0; local_irq_save(flags); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + /* * get/set the timer IRQ vector: */ @@ -2287,10 +2756,18 @@ static inline void __init check_timer(void) /* * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2382,6 +2859,9 @@ static inline void __init check_timer(void) "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2435,19 +2915,29 @@ out: * the I/O APIC in all cases now. No actual device should request * it anyway. --macro */ -#define PIC_IRQS (1<<2) +#define PIC_IRQS (1 << PIC_CASCADE_IR) void __init setup_IO_APIC(void) { +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else /* * calling enable_IO_APIC() is moved to setup_local_APIC for BP */ +#endif io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); @@ -3128,7 +3618,93 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) #ifdef CONFIG_ACPI -#define IO_APIC_MAX_ID 0xFE +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} +#endif int __init io_apic_get_redir_entries (int ioapic) { @@ -3226,6 +3802,7 @@ void __init setup_ioapic_dest(void) } #endif +#ifdef CONFIG_X86_64 #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -3261,36 +3838,56 @@ static struct resource * __init ioapic_setup_resources(void) return res; } +#endif void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - struct resource *ioapic_res; int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; ioapic_res = ioapic_setup_resources(); +#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif ioapic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %016lx (%016lx)\n", + "mapped IOAPIC to %08lx (%08lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; +#ifdef CONFIG_X86_64 if (ioapic_res != NULL) { ioapic_res->start = ioapic_phys; ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ioapic_res++; } +#endif } } +#ifdef CONFIG_X86_64 static int __init ioapic_insert_resources(void) { int i; @@ -3313,4 +3910,4 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); - +#endif -- cgit v1.2.2 From 54168ed7f2a4f3fc2780e645124ae952598da601 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 Aug 2008 09:07:45 +0200 Subject: x86: make io_apic_32.c the same as io_apic_64.c Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic_32.c | 1521 ++++++++++++++++++++++++++++++------------ 1 file changed, 1104 insertions(+), 417 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c index 3ed36041c81e..fba6d6ee3480 100644 --- a/arch/x86/kernel/io_apic_32.c +++ b/arch/x86/kernel/io_apic_32.c @@ -35,7 +35,7 @@ #include #include #include -#include /* time_after() */ +#include /* time_after() */ #ifdef CONFIG_ACPI #include #endif @@ -64,8 +64,8 @@ #define __apicdebuginit(type) static type __init /* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes */ int sis_apic_bug = -1; @@ -102,7 +102,7 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); int skip_ioapic_setup; -static int __init parse_noapic(char *arg) +static int __init parse_noapic(char *str) { /* disable IO-APIC */ disable_ioapic_setup(); @@ -188,7 +188,7 @@ static void __init init_work(void *data) irq_cfgx[legacy_count - 1].next = NULL; } -#define for_each_irq_cfg(cfg) \ +#define for_each_irq_cfg(cfg) \ for (cfg = irq_cfgx; cfg; cfg = cfg->next) DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -262,7 +262,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) irq_cfgx = cfg; cfg->irq = irq; printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); - #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { /* dump the results */ @@ -384,9 +383,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i */ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { - volatile struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); + struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -494,11 +493,20 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) apic = entry->apic; pin = entry->pin; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else io_apic_write(apic, 0x11 + pin*2, dest); +#endif reg = io_apic_read(apic, 0x10 + pin*2); reg &= ~IO_APIC_REDIR_VECTOR_MASK; reg |= vector; - io_apic_modify(apic, 0x10 + pin *2, reg); + io_apic_modify(apic, 0x10 + pin*2, reg); if (!entry->next) break; entry = entry->next; @@ -513,6 +521,7 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) unsigned long flags; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -529,12 +538,12 @@ static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) */ dest = SET_APIC_LOGICAL_ID(dest); + desc = irq_to_desc(irq); spin_lock_irqsave(&ioapic_lock, flags); __target_IO_APIC_irq(irq, dest, cfg->vector); - irq_to_desc(irq)->affinity = mask; + desc->affinity = mask; spin_unlock_irqrestore(&ioapic_lock, flags); } - #endif /* CONFIG_SMP */ /* @@ -699,7 +708,7 @@ static void __unmask_and_level_IO_APIC_irq(unsigned int irq) #endif -static void mask_IO_APIC_irq(unsigned int irq) +static void mask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -708,7 +717,7 @@ static void mask_IO_APIC_irq(unsigned int irq) spin_unlock_irqrestore(&ioapic_lock, flags); } -static void unmask_IO_APIC_irq(unsigned int irq) +static void unmask_IO_APIC_irq (unsigned int irq) { unsigned long flags; @@ -725,14 +734,13 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) entry = ioapic_read_entry(apic, pin); if (entry.delivery_mode == dest_SMI) return; - /* * Disable it in the IO-APIC irq-routing table: */ ioapic_mask_entry(apic, pin); } -static void clear_IO_APIC(void) +static void clear_IO_APIC (void) { int apic, pin; @@ -741,7 +749,7 @@ static void clear_IO_APIC(void) clear_IO_APIC_pin(apic, pin); } -#ifndef CONFIG_SMP +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) void send_IPI_self(int vector) { unsigned int cfg; @@ -756,9 +764,9 @@ void send_IPI_self(int vector) */ apic_write(APIC_ICR, cfg); } -#endif /* !CONFIG_SMP */ - +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ +#ifdef CONFIG_X86_32 /* * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to * specific CPU-side IRQs. @@ -797,6 +805,75 @@ static int __init ioapic_pirq_setup(char *str) } __setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif /* * Find the IRQ entry number of a certain pin. @@ -848,7 +925,7 @@ static int __init find_isa_irq_apic(int irq, int type) } if (i < mp_irq_entries) { int apic; - for (apic = 0; apic < nr_ioapics; apic++) { + for(apic = 0; apic < nr_ioapics; apic++) { if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) return apic; } @@ -867,10 +944,10 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) { int apic, i, best_guess = -1; - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, " - "slot:%d, pin:%d.\n", bus, slot, pin); + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); if (test_bit(bus, mp_bus_not_pci)) { - printk(KERN_WARNING "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); return -1; } for (i = 0; i < mp_irq_entries; i++) { @@ -885,7 +962,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) !mp_irqs[i].mp_irqtype && (bus == lbus) && (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i, apic, mp_irqs[i].mp_dstirq); + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); if (!(apic || IO_APIC_IRQ(irq))) continue; @@ -902,6 +979,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) } return best_guess; } + EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) @@ -918,6 +996,7 @@ static int EISA_ELCR(unsigned int irq) "Broken MPtable reports ISA irq %d\n", irq); return 0; } + #endif /* ISA interrupts are always polarity zero edge triggered, @@ -954,36 +1033,36 @@ static int MPBIOS_polarity(int idx) /* * Determine IRQ line polarity (high active or low active): */ - switch (mp_irqs[idx].mp_irqflag & 3) { - case 0: /* conforms, ie. bus-type dependent polarity */ - { - polarity = test_bit(bus, mp_bus_not_pci)? - default_ISA_polarity(idx): - default_PCI_polarity(idx); - break; - } - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ + switch (mp_irqs[idx].mp_irqflag & 3) { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } } return polarity; } @@ -996,67 +1075,67 @@ static int MPBIOS_trigger(int idx) /* * Determine IRQ trigger mode (edge or level sensitive): */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - case 0: /* conforms, ie. bus-type dependent */ + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) { - trigger = test_bit(bus, mp_bus_not_pci)? - default_ISA_trigger(idx): - default_PCI_trigger(idx); + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif break; - } - case MP_BUS_EISA: /* EISA pin */ + case 1: /* edge */ { - trigger = default_EISA_trigger(idx); + trigger = 0; break; } - case MP_BUS_PCI: /* PCI pin */ + case 2: /* reserved */ { - /* set before the switch */ + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; break; } - case MP_BUS_MCA: /* MCA pin */ + case 3: /* level */ { - trigger = default_MCA_trigger(idx); + trigger = 1; break; } - default: + default: /* invalid */ { printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; + trigger = 0; break; } } -#endif - break; - } - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } return trigger; } @@ -1082,9 +1161,9 @@ static int pin_2_irq(int idx, int apic, int pin) if (mp_irqs[idx].mp_dstirq != pin) printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - if (test_bit(bus, mp_bus_not_pci)) + if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].mp_srcbusirq; - else { + } else { /* * PCI IRQs are mapped in order */ @@ -1092,14 +1171,14 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } +#ifdef CONFIG_X86_32 /* * PCI IRQ command line redirection. Yes, limits are hardcoded. */ @@ -1116,6 +1195,8 @@ static int pin_2_irq(int idx, int apic, int pin) } } } +#endif + return irq; } @@ -1145,74 +1226,70 @@ static int __assign_irq_vector(int irq, cpumask_t mask) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; - cfg = irq_cfg(irq); + cfg = irq_cfg(irq); - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); - vector = current_vector; - offset = current_offset; + vector = current_vector; + offset = current_offset; next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; #ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; + if (vector == IA32_SYSCALL_VECTOR) + goto next; #else - if (vector == SYSCALL_VECTOR) - goto next; + if (vector == SYSCALL_VECTOR) + goto next; #endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - printk(KERN_DEBUG "assign_irq_vector: irq %d vector %#x cpu ", irq, vector); - for_each_cpu_mask_nr(new_cpu, new_mask) { - per_cpu(vector_irq, new_cpu)[vector] = irq; - printk(KERN_CONT " %d ", new_cpu); + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; } - printk(KERN_CONT "\n"); - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; } static int assign_irq_vector(int irq, cpumask_t mask) @@ -1223,7 +1300,6 @@ static int assign_irq_vector(int irq, cpumask_t mask) spin_lock_irqsave(&vector_lock, flags); err = __assign_irq_vector(irq, mask); spin_unlock_irqrestore(&vector_lock, flags); - return err; } @@ -1269,36 +1345,39 @@ void __setup_vector_irq(int cpu) cfg = irq_cfg(irq); if (!cpu_isset(cpu, cfg->domain)) per_cpu(vector_irq, cpu)[vector] = -1; - } + } } static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { - int apic, idx, pin; + int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; } #else static inline int IO_APIC_irq_trigger(int irq) { - return 1; + return 1; } #endif @@ -1318,13 +1397,27 @@ static void ioapic_register_intr(int irq, unsigned long trigger) else desc->status &= ~IRQ_LEVEL; +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, "fasteoi"); + handle_fasteoi_irq, + "fasteoi"); else set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); + handle_edge_irq, "edge"); } static int setup_ioapic_entry(int apic, int irq, @@ -1337,11 +1430,45 @@ static int setup_ioapic_entry(int apic, int irq, */ memset(entry,0,sizeof(*entry)); - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; + } - entry->mask = 0; /* enable IRQ */ + entry->mask = 0; /* enable IRQ */ entry->trigger = trigger; entry->polarity = polarity; entry->vector = vector; @@ -1351,12 +1478,11 @@ static int setup_ioapic_entry(int apic, int irq, */ if (trigger) entry->mask = 1; - return 0; } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) + int trigger, int polarity) { struct irq_cfg *cfg; struct IO_APIC_route_entry entry; @@ -1420,10 +1546,10 @@ static void __init setup_IO_APIC_irqs(void) } irq = pin_2_irq(idx, apic, pin); - +#ifdef CONFIG_X86_32 if (multi_timer_check(apic, irq)) continue; - +#endif add_pin_to_irq(irq, apic, pin); setup_IO_APIC_irq(apic, pin, irq, @@ -1443,6 +1569,11 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, { struct IO_APIC_route_entry entry; +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + memset(&entry, 0, sizeof(entry)); /* @@ -1461,7 +1592,7 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, * The timer IRQ doesn't have to know that behind the * scene we may have a 8259A-master in AEOI mode ... */ - ioapic_register_intr(0, IOAPIC_EDGE); + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); /* * Add it to the IO-APIC irq-routing table: @@ -1501,17 +1632,18 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); + printk("\n"); printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); @@ -1548,7 +1680,10 @@ __apicdebuginit(void) print_IO_APIC(void) entry = ioapic_read_entry(apic, i); - printk(KERN_DEBUG " %02x %02X ", i, entry.dest); + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", entry.mask, @@ -1567,7 +1702,7 @@ __apicdebuginit(void) print_IO_APIC(void) struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", i); + printk(KERN_DEBUG "IRQ%d ", cfg->irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -1614,8 +1749,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk("\n" KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", smp_processor_id(), hard_smp_processor_id()); v = apic_read(APIC_ID); - printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, - GET_APIC_ID(v)); + printk(KERN_INFO "... APIC ID: %08x (%01x)\n", v, read_apic_id()); v = apic_read(APIC_LVR); printk(KERN_INFO "... APIC VERSION: %08x\n", v); ver = GET_APIC_VERSION(v); @@ -1624,7 +1758,7 @@ __apicdebuginit(void) print_local_APIC(void *dummy) v = apic_read(APIC_TASKPRI); printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (APIC_INTEGRATED(ver)) { /* !82489DX */ v = apic_read(APIC_ARBPRI); printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, v & APIC_ARBPRI_MASK); @@ -1650,9 +1784,10 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC IRR field:\n"); print_APIC_bitfield(APIC_IRR); - if (APIC_INTEGRATED(ver)) { /* !82489DX */ - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + if (APIC_INTEGRATED(ver)) { /* !82489DX */ + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ apic_write(APIC_ESR, 0); + v = apic_read(APIC_ESR); printk(KERN_DEBUG "... APIC ESR: %08x\n", v); } @@ -1710,11 +1845,11 @@ __apicdebuginit(void) print_PIC(void) v = inb(0xa0) << 8 | inb(0x20); printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - outb(0x0b, 0xa0); - outb(0x0b, 0x20); + outb(0x0b,0xa0); + outb(0x0b,0x20); v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a, 0xa0); - outb(0x0a, 0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); spin_unlock_irqrestore(&i8259A_lock, flags); @@ -1739,16 +1874,19 @@ fs_initcall(print_all_ICs); /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; -static void __init enable_IO_APIC(void) +void __init enable_IO_APIC(void) { union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; - int i, apic; + int apic; unsigned long flags; +#ifdef CONFIG_X86_32 + int i; if (!pirqs_enabled) for (i = 0; i < MAX_PIRQS; i++) pirq_entries[i] = -1; +#endif /* * The number of IO-APIC IRQ registers (== #pins): @@ -1759,7 +1897,7 @@ static void __init enable_IO_APIC(void) spin_unlock_irqrestore(&ioapic_lock, flags); nr_ioapic_registers[apic] = reg_01.bits.entries+1; } - for (apic = 0; apic < nr_ioapics; apic++) { + for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { @@ -1830,16 +1968,18 @@ void disable_IO_APIC(void) entry.dest_mode = 0; /* Physical */ entry.delivery_mode = dest_ExtINT; /* ExtInt */ entry.vector = 0; - entry.dest = read_apic_id(); + entry.dest = read_apic_id(); /* * Add it to the IO-APIC irq-routing table: */ ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); } + disconnect_bsp_APIC(ioapic_i8259.pin != -1); } +#ifdef CONFIG_X86_32 /* * function to set the IO-APIC physical IDs based on the * values stored in the MPC table. @@ -1940,8 +2080,6 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0, reg_00.raw); - spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check @@ -1955,6 +2093,7 @@ static void __init setup_ioapic_ids_from_mpc(void) apic_printk(APIC_VERBOSE, " ok.\n"); } } +#endif int no_timer_check __initdata; @@ -1994,9 +2133,10 @@ static int __init timer_irq_works(void) * might have cached one ExtINT interrupt. Finally, at * least one tick may be lost due to delays. */ + + /* jiffies wrap? */ if (time_after(jiffies, t1 + 4)) return 1; - return 0; } @@ -2014,8 +2154,6 @@ static int __init timer_irq_works(void) */ /* - * Startup quirk: - * * Starting up a edge-triggered IO-APIC interrupt is * nasty - we need to make sure that we get the edge. * If it is already asserted for some reason, we need @@ -2023,9 +2161,8 @@ static int __init timer_irq_works(void) * * This is not complete - we should be able to fake * an edge even if it isn't on the 8259A... - * - * (We do this for level-triggered IRQs too - it cannot hurt.) */ + static unsigned int startup_ioapic_irq(unsigned int irq) { int was_pending = 0; @@ -2043,70 +2180,254 @@ static unsigned int startup_ioapic_irq(unsigned int irq) return was_pending; } +#ifdef CONFIG_X86_64 static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + + struct irq_cfg *cfg = irq_cfg(irq); + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); return 1; } - -#ifdef CONFIG_SMP -asmlinkage void smp_irq_move_cleanup_interrupt(void) +#else +static int ioapic_retrigger_irq(unsigned int irq) { - unsigned vector, me; - ack_APIC_irq(); - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; + send_IPI_self(irq_cfg(irq)->vector); - desc = irq_to_desc(irq); - if (!desc) - continue; + return 1; +} +#endif - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; +#ifdef CONFIG_SMP - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); - irq_exit(); -} +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); -static void irq_complete_move(unsigned int irq) +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) { - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; - if (likely(!cfg->move_in_progress)) + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) return; - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; + if (get_irte(irq, &irte)) + return; - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); cfg->move_in_progress = 0; } } #else static inline void irq_complete_move(unsigned int irq) {} #endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif static void ack_apic_edge(unsigned int irq) { @@ -2118,55 +2439,55 @@ static void ack_apic_edge(unsigned int irq) #ifdef CONFIG_X86_64 static void ack_apic_level(unsigned int irq) { - int do_unmask_irq = 0; + int do_unmask_irq = 0; - irq_complete_move(irq); + irq_complete_move(irq); #ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(desc->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } #endif - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq, desc); - unmask_IO_APIC_irq(irq); - } + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } } #else atomic_t irq_mis_count; @@ -2177,25 +2498,25 @@ static void ack_apic_level(unsigned int irq) irq_complete_move(irq); move_native_irq(irq); -/* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ i = irq_cfg(irq)->vector; v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); @@ -2225,6 +2546,20 @@ static struct irq_chip ioapic_chip __read_mostly = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif static inline void init_IO_APIC_traps(void) { @@ -2282,7 +2617,7 @@ static void unmask_lapic_irq(unsigned int irq) apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); } -static void ack_lapic_irq(unsigned int irq) +static void ack_lapic_irq (unsigned int irq) { ack_APIC_irq(); } @@ -2383,12 +2718,12 @@ static inline void __init unlock_ExtINT_logic(void) static int disable_timer_pin_1 __initdata; /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init parse_disable_timer_pin_1(char *arg) +static int __init disable_timer_pin_setup(char *arg) { disable_timer_pin_1 = 1; return 0; } -early_param("disable_timer_pin_1", parse_disable_timer_pin_1); +early_param("disable_timer_pin_1", disable_timer_pin_setup); int timer_through_8259 __initdata; @@ -2397,6 +2732,8 @@ int timer_through_8259 __initdata; * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. */ static inline void __init check_timer(void) { @@ -2408,8 +2745,8 @@ static inline void __init check_timer(void) local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); /* * get/set the timer IRQ vector: @@ -2428,7 +2765,9 @@ static inline void __init check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); +#ifdef CONFIG_X86_32 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); @@ -2447,6 +2786,10 @@ static inline void __init check_timer(void) * 8259A. */ if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif pin1 = pin2; apic1 = apic2; no_pin1 = 1; @@ -2473,6 +2816,10 @@ static inline void __init check_timer(void) clear_IO_APIC_pin(0, pin1); goto out; } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif clear_IO_APIC_pin(apic1, pin1); if (!no_pin1) apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " @@ -2512,7 +2859,9 @@ static inline void __init check_timer(void) "through the IO-APIC - disabling NMI Watchdog!\n"); nmi_watchdog = NMI_NONE; } +#ifdef CONFIG_X86_32 timer_ack = 0; +#endif apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer as Virtual Wire IRQ...\n"); @@ -2570,17 +2919,25 @@ out: void __init setup_IO_APIC(void) { + +#ifdef CONFIG_X86_32 enable_IO_APIC(); +#else + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ +#endif io_apic_irqs = ~PIC_IRQS; - printk("ENABLING IO-APIC IRQs\n"); - - /* - * Set up IO-APIC IRQ routing. - */ - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif sync_Arb_IDs(); setup_IO_APIC_irqs(); init_IO_APIC_traps(); @@ -2588,15 +2945,15 @@ void __init setup_IO_APIC(void) } /* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path */ static int __init io_apic_bug_finalize(void) { - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; } late_initcall(io_apic_bug_finalize); @@ -2605,7 +2962,7 @@ struct sysfs_ioapic_data { struct sys_device dev; struct IO_APIC_route_entry entry[0]; }; -static struct sysfs_ioapic_data *mp_ioapic_data[MAX_IO_APICS]; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; static int ioapic_suspend(struct sys_device *dev, pm_message_t state) { @@ -2615,8 +2972,8 @@ static int ioapic_suspend(struct sys_device *dev, pm_message_t state) data = container_of(dev, struct sysfs_ioapic_data, dev); entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - entry[i] = ioapic_read_entry(dev->id, i); + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); return 0; } @@ -2653,14 +3010,14 @@ static struct sysdev_class ioapic_sysdev_class = { static int __init ioapic_init_sysfs(void) { - struct sys_device *dev; - int i, size, error = 0; + struct sys_device * dev; + int i, size, error; error = sysdev_class_register(&ioapic_sysdev_class); if (error) return error; - for (i = 0; i < nr_ioapics; i++) { + for (i = 0; i < nr_ioapics; i++ ) { size = sizeof(struct sys_device) + nr_ioapic_registers[i] * sizeof(struct IO_APIC_route_entry); mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); @@ -2691,18 +3048,18 @@ device_initcall(ioapic_init_sysfs); unsigned int create_irq_nr(unsigned int irq_want) { /* Allocate an unused irq */ - unsigned int irq, new; + unsigned int irq; + unsigned int new; unsigned long flags; struct irq_cfg *cfg_new; #ifndef CONFIG_HAVE_SPARSE_IRQ - /* only can use bus/dev/fn.. when per_cpu vector is used */ irq_want = nr_irqs - 1; #endif irq = 0; spin_lock_irqsave(&vector_lock, flags); - for (new = (nr_irqs - 1); new > 0; new--) { + for (new = irq_want; new > 0; new--) { if (platform_legacy_irq(new)) continue; cfg_new = irq_cfg(new); @@ -2725,7 +3082,14 @@ unsigned int create_irq_nr(unsigned int irq_want) int create_irq(void) { - return create_irq_nr(nr_irqs - 1); + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; } void destroy_irq(unsigned int irq) @@ -2734,6 +3098,9 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif spin_lock_irqsave(&vector_lock, flags); __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); @@ -2759,25 +3126,54 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms cpus_and(tmp, cfg->domain, tmp); dest = cpu_mask_to_apicid(tmp); - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } return err; } @@ -2788,6 +3184,7 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) struct msi_msg msg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2808,8 +3205,61 @@ static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) msg.address_lo |= MSI_ADDR_DEST_ID(dest); write_msi_msg(irq, &msg); - irq_to_desc(irq)->affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif #endif /* CONFIG_SMP */ /* @@ -2827,6 +3277,45 @@ static struct irq_chip msi_chip = { .retrigger = ioapic_retrigger_irq, }; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) { @@ -2840,7 +3329,17 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) set_irq_msi(irq, desc); write_msi_msg(irq, &msg); - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); return 0; } @@ -2859,59 +3358,164 @@ static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) { - int irq, ret; - + unsigned int irq; + int ret; unsigned int irq_want; irq_want = build_irq_for_pci_dev(dev) + 0x100; irq = create_irq_nr(irq_want); - if (irq == 0) return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif ret = setup_msi_irq(dev, desc, irq); if (ret < 0) { destroy_irq(irq); return ret; - } - + } return 0; + +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif } int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) { - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; error: - destroy_irq(irq); - return ret; + destroy_irq(irq); + return ret; } - void arch_teardown_msi_irq(unsigned int irq) { destroy_irq(irq); } -#endif /* CONFIG_PCI_MSI */ +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support */ @@ -2938,6 +3542,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) struct irq_cfg *cfg; unsigned int dest; cpumask_t tmp; + struct irq_desc *desc; cpus_and(tmp, mask, cpu_online_map); if (cpus_empty(tmp)) @@ -2951,7 +3556,8 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); target_ht_irq(irq, dest, cfg->vector); - irq_to_desc(irq)->affinity = mask; + desc = irq_to_desc(irq); + desc->affinity = mask; } #endif @@ -2974,7 +3580,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) tmp = TARGET_CPUS; err = assign_irq_vector(irq, tmp); - if ( !err) { + if (!err) { struct ht_irq_msg msg; unsigned dest; @@ -3007,11 +3613,12 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) #endif /* CONFIG_HT_IRQ */ /* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration + ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ #ifdef CONFIG_ACPI +#ifdef CONFIG_X86_32 int __init io_apic_get_unique_id(int ioapic, int apic_id) { union IO_APIC_reg_00 reg_00; @@ -3086,7 +3693,6 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id) return apic_id; } - int __init io_apic_get_version(int ioapic) { union IO_APIC_reg_01 reg_01; @@ -3098,9 +3704,9 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } +#endif - -int __init io_apic_get_redir_entries(int ioapic) +int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; unsigned long flags; @@ -3113,10 +3719,10 @@ int __init io_apic_get_redir_entries(int ioapic) } -int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int polarity) +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { if (!IO_APIC_IRQ(irq)) { - printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", ioapic); return -EINVAL; } @@ -3132,6 +3738,7 @@ int io_apic_set_pci_routing(int ioapic, int pin, int irq, int triggering, int po return 0; } + int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) { int i; @@ -3163,7 +3770,6 @@ void __init setup_ioapic_dest(void) { int pin, ioapic, irq, irq_entry; struct irq_cfg *cfg; - struct irq_desc *desc; if (skip_ioapic_setup == 1) return; @@ -3184,43 +3790,124 @@ void __init setup_ioapic_dest(void) setup_IO_APIC_irq(ioapic, pin, irq, irq_trigger(irq_entry), irq_polarity(irq_entry)); - else { - desc = irq_to_desc(irq); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif + else set_ioapic_affinity_irq(irq, TARGET_CPUS); - } } } } #endif +#ifdef CONFIG_X86_64 +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} +#endif + void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); +#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif } else { +#ifdef CONFIG_X86_32 fake_ioapic_page: +#endif ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); + alloc_bootmem_pages(PAGE_SIZE); ioapic_phys = __pa(ioapic_phys); } set_fixmap_nocache(idx, ioapic_phys); - printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); idx++; + +#ifdef CONFIG_X86_64 + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } +#endif } } +#ifdef CONFIG_X86_64 +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif -- cgit v1.2.2 From 26d347c2c035b1f4c5b3c5094f3046db9ec920f5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:42 -0700 Subject: rename io_apic_64.c and io_apic_32.c to io_apic.c The two files are now line by line equal. (sans a printk) Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/io_apic.c | 3913 ++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/io_apic_32.c | 3913 ------------------------------------------ arch/x86/kernel/io_apic_64.c | 3913 ------------------------------------------ 4 files changed, 3914 insertions(+), 7827 deletions(-) create mode 100644 arch/x86/kernel/io_apic.c delete mode 100644 arch/x86/kernel/io_apic_32.c delete mode 100644 arch/x86/kernel/io_apic_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0d41f0343dc0..7264f9c3af8f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -61,7 +61,7 @@ obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o -obj-$(CONFIG_X86_IO_APIC) += io_apic_$(BITS).o +obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_KEXEC) += machine_kexec_$(BITS).o diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c new file mode 100644 index 000000000000..fba6d6ee3480 --- /dev/null +++ b/arch/x86/kernel/io_apic.c @@ -0,0 +1,3913 @@ +/* + * Intel IO-APIC support for multi-Pentium hosts. + * + * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo + * + * Many thanks to Stig Venaas for trying out countless experimental + * patches and reporting/debugging problems patiently! + * + * (c) 1999, Multiple IO-APIC support, developed by + * Ken-ichi Yaku and + * Hidemi Kishimoto , + * further tested and cleaned up by Zach Brown + * and Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively + * Paul Diefenbaugh : Added full ACPI support + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* time_after() */ +#ifdef CONFIG_ACPI +#include +#endif +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#define __apicdebuginit(type) static type __init + +/* + * Is the SiS APIC rmw bug present ? + * -1 = don't know, 0 = no, 1 = yes + */ +int sis_apic_bug = -1; + +static DEFINE_SPINLOCK(ioapic_lock); +static DEFINE_SPINLOCK(vector_lock); + +int first_free_entry; +/* + * Rough estimation of how many shared IRQs there are, can + * be changed anytime. + */ +int pin_map_size; + +/* + * # of IRQ routing registers + */ +int nr_ioapic_registers[MAX_IO_APICS]; + +/* I/O APIC entries */ +struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; +int nr_ioapics; + +/* MP IRQ source entries */ +struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; + +/* # of MP IRQ source entries */ +int mp_irq_entries; + +#if defined (CONFIG_MCA) || defined (CONFIG_EISA) +int mp_bus_id_to_type[MAX_MP_BUSSES]; +#endif + +DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); + +int skip_ioapic_setup; + +static int __init parse_noapic(char *str) +{ + /* disable IO-APIC */ + disable_ioapic_setup(); + return 0; +} +early_param("noapic", parse_noapic); + +struct irq_cfg; +struct irq_pin_list; +struct irq_cfg { + unsigned int irq; + struct irq_cfg *next; + struct irq_pin_list *irq_2_pin; + cpumask_t domain; + cpumask_t old_domain; + unsigned move_cleanup_count; + u8 vector; + u8 move_in_progress : 1; +}; + +/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ +static struct irq_cfg irq_cfg_legacy[] __initdata = { + [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, + [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, + [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, + [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, + [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, + [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, + [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, + [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, + [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, + [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, + [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, + [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, + [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, + [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, + [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, + [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, +}; + +static struct irq_cfg irq_cfg_init = { .irq = -1U, }; +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; +} + +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +static void init_one_irq_cfg(struct irq_cfg *cfg) +{ + memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); +} + +static struct irq_cfg *irq_cfgx; +static struct irq_cfg *irq_cfgx_free; +static void __init init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_cfg *cfg; + int legacy_count; + int i; + + cfg = *da->name; + + memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); + + legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + for (i = legacy_count; i < *da->nr; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < *da->nr; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = &irq_cfgx[legacy_count]; + irq_cfgx[legacy_count - 1].next = NULL; +} + +#define for_each_irq_cfg(cfg) \ + for (cfg = irq_cfgx; cfg; cfg = cfg->next) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); + +static struct irq_cfg *irq_cfg(unsigned int irq) +{ + struct irq_cfg *cfg; + + cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg = cfg->next; + } + + return NULL; +} + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + struct irq_cfg *cfg, *cfg_pri; + int i; + int count = 0; + + cfg_pri = cfg = irq_cfgx; + while (cfg) { + if (cfg->irq == irq) + return cfg; + + cfg_pri = cfg; + cfg = cfg->next; + count++; + } + + if (!irq_cfgx_free) { + unsigned long phys; + unsigned long total_bytes; + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); + + total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; + if (after_bootmem) + cfg = kzalloc(total_bytes, GFP_ATOMIC); + else + cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); + + if (!cfg) + panic("please boot with nr_irq_cfg= %d\n", count * 2); + + phys = __pa(cfg); + printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + + for (i = 0; i < nr_irq_cfg; i++) + init_one_irq_cfg(&cfg[i]); + + for (i = 1; i < nr_irq_cfg; i++) + cfg[i-1].next = &cfg[i]; + + irq_cfgx_free = cfg; + } + + cfg = irq_cfgx_free; + irq_cfgx_free = irq_cfgx_free->next; + cfg->next = NULL; + if (cfg_pri) + cfg_pri->next = cfg; + else + irq_cfgx = cfg; + cfg->irq = irq; + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); +#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG + { + /* dump the results */ + struct irq_cfg *cfg; + unsigned long phys; + unsigned long bytes = sizeof(struct irq_cfg); + + printk(KERN_DEBUG "=========================== %d\n", irq); + printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); + for_each_irq_cfg(cfg) { + phys = __pa(cfg); + printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); + } + printk(KERN_DEBUG "===========================\n"); + } +#endif + return cfg; +} + +/* + * This is performance-critical, we want to do it O(1) + * + * the indexing order of this array favors 1:1 mappings + * between pins and IRQs. + */ + +struct irq_pin_list { + int apic, pin; + struct irq_pin_list *next; +}; + +static struct irq_pin_list *irq_2_pin_head; +/* fill one page ? */ +static int nr_irq_2_pin = 0x100; +static struct irq_pin_list *irq_2_pin_ptr; +static void __init irq_2_pin_init_work(void *data) +{ + struct dyn_array *da = data; + struct irq_pin_list *pin; + int i; + + pin = *da->name; + + for (i = 1; i < *da->nr; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = &pin[0]; +} +DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); + +static struct irq_pin_list *get_one_free_irq_2_pin(void) +{ + struct irq_pin_list *pin; + int i; + + pin = irq_2_pin_ptr; + + if (pin) { + irq_2_pin_ptr = pin->next; + pin->next = NULL; + return pin; + } + + /* + * we run out of pre-allocate ones, allocate more + */ + printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); + + if (after_bootmem) + pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, + GFP_ATOMIC); + else + pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * + nr_irq_2_pin, PAGE_SIZE, 0); + + if (!pin) + panic("can not get more irq_2_pin\n"); + + for (i = 1; i < nr_irq_2_pin; i++) + pin[i-1].next = &pin[i]; + + irq_2_pin_ptr = pin->next; + pin->next = NULL; + + return pin; +} + +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + * + * Older SiS APIC requires we rewrite the index register + */ +static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + if (sis_apic_bug) + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +#ifdef CONFIG_X86_64 +static bool io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + struct irq_cfg *cfg = irq_cfg(irq); + + spin_lock_irqsave(&ioapic_lock, flags); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + int pin; + + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? */ + if (reg & IO_APIC_REDIR_REMOTE_IRR) { + spin_unlock_irqrestore(&ioapic_lock, flags); + return true; + } + if (!entry->next) + break; + entry = entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + + return false; +} +#endif + +union entry_union { + struct { u32 w1, w2; }; + struct IO_APIC_route_entry entry; +}; + +static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) +{ + union entry_union eu; + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); + eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); + spin_unlock_irqrestore(&ioapic_lock, flags); + return eu.entry; +} + +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + union entry_union eu; + eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +#ifdef CONFIG_SMP +static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + int apic, pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + unsigned int reg; + + if (!entry) + break; + + apic = entry->apic; + pin = entry->pin; +#ifdef CONFIG_INTR_REMAP + /* + * With interrupt-remapping, destination information comes + * from interrupt-remapping table entry. + */ + if (!irq_remapped(irq)) + io_apic_write(apic, 0x11 + pin*2, dest); +#else + io_apic_write(apic, 0x11 + pin*2, dest); +#endif + reg = io_apic_read(apic, 0x10 + pin*2); + reg &= ~IO_APIC_REDIR_VECTOR_MASK; + reg |= vector; + io_apic_modify(apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +static int assign_irq_vector(int irq, cpumask_t mask); + +static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned long flags; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + cfg = irq_cfg(irq); + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + /* + * Only the high 8 bits are valid. + */ + dest = SET_APIC_LOGICAL_ID(dest); + + desc = irq_to_desc(irq); + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + desc->affinity = mask; + spin_unlock_irqrestore(&ioapic_lock, flags); +} +#endif /* CONFIG_SMP */ + +/* + * The common case is 1:1 IRQ<->pin mappings. Sometimes there are + * shared ISA-space IRQs, so we have to support them. We are super + * fast in the common case, and fast for shared ISA-space IRQs. + */ +static void add_pin_to_irq(unsigned int irq, int apic, int pin) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + /* first time to refer irq_cfg, so with new */ + cfg = irq_cfg_alloc(irq); + entry = cfg->irq_2_pin; + if (!entry) { + entry = get_one_free_irq_2_pin(); + cfg->irq_2_pin = entry; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); + return; + } + + while (entry->next) { + /* not again, please */ + if (entry->apic == apic && entry->pin == pin) + return; + + entry = entry->next; + } + + entry->next = get_one_free_irq_2_pin(); + entry = entry->next; + entry->apic = apic; + entry->pin = pin; + printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); +} + +/* + * Reroute an IRQ to a different pin. + */ +static void __init replace_pin_at_irq(unsigned int irq, + int oldapic, int oldpin, + int newapic, int newpin) +{ + struct irq_cfg *cfg = irq_cfg(irq); + struct irq_pin_list *entry = cfg->irq_2_pin; + int replaced = 0; + + while (entry) { + if (entry->apic == oldapic && entry->pin == oldpin) { + entry->apic = newapic; + entry->pin = newpin; + replaced = 1; + /* every one is different, right? */ + break; + } + entry = entry->next; + } + + /* why? call replace before add? */ + if (!replaced) + add_pin_to_irq(irq, newapic, newpin); +} + +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + +#define __DO_ACTION(R, ACTION, FINAL) \ + \ +{ \ + int pin; \ + struct irq_cfg *cfg; \ + struct irq_pin_list *entry; \ + \ + cfg = irq_cfg(irq); \ + entry = cfg->irq_2_pin; \ + for (;;) { \ + unsigned int reg; \ + if (!entry) \ + break; \ + pin = entry->pin; \ + reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ + reg ACTION; \ + io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ + FINAL; \ + if (!entry->next) \ + break; \ + entry = entry->next; \ + } \ +} + +#define DO_ACTION(name,R,ACTION, FINAL) \ + \ + static void name##_IO_APIC_irq (unsigned int irq) \ + __DO_ACTION(R, ACTION, FINAL) + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + +/* mask = 0 */ +DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) + +#else + +static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +{ + struct irq_cfg *cfg; + struct irq_pin_list *entry; + unsigned int pin, reg; + + cfg = irq_cfg(irq); + entry = cfg->irq_2_pin; + for (;;) { + if (!entry) + break; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + reg &= ~disable; + reg |= enable; + io_apic_modify(entry->apic, 0x10 + pin*2, reg); + if (!entry->next) + break; + entry = entry->next; + } +} + +/* mask = 1 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); +} + +/* mask = 0 */ +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); +} + +/* mask = 1, trigger = 0 */ +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER); +} + +/* mask = 0, trigger = 1 */ +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED); +} + +#endif + +static void mask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __mask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void unmask_IO_APIC_irq (unsigned int irq) +{ + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) +{ + struct IO_APIC_route_entry entry; + + /* Check delivery_mode to be sure we're not clearing an SMI pin */ + entry = ioapic_read_entry(apic, pin); + if (entry.delivery_mode == dest_SMI) + return; + /* + * Disable it in the IO-APIC irq-routing table: + */ + ioapic_mask_entry(apic, pin); +} + +static void clear_IO_APIC (void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + clear_IO_APIC_pin(apic, pin); +} + +#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) +void send_IPI_self(int vector) +{ + unsigned int cfg; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; + /* + * Send the IPI. The write to APIC_ICR fires this off. + */ + apic_write(APIC_ICR, cfg); +} +#endif /* !CONFIG_SMP && CONFIG_X86_32*/ + +#ifdef CONFIG_X86_32 +/* + * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to + * specific CPU-side IRQs. + */ + +#define MAX_PIRQS 8 +static int pirq_entries [MAX_PIRQS]; +static int pirqs_enabled; + +static int __init ioapic_pirq_setup(char *str) +{ + int i, max; + int ints[MAX_PIRQS+1]; + + get_options(str, ARRAY_SIZE(ints), ints); + + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; + + pirqs_enabled = 1; + apic_printk(APIC_VERBOSE, KERN_INFO + "PIRQ redirection, working around broken MP-BIOS.\n"); + max = MAX_PIRQS; + if (ints[0] < MAX_PIRQS) + max = ints[0]; + + for (i = 0; i < max; i++) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); + /* + * PIRQs are mapped upside down, usually. + */ + pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; + } + return 1; +} + +__setup("pirq=", ioapic_pirq_setup); +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_INTR_REMAP +/* I/O APIC RTE contents at the OS boot up */ +static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; + +/* + * Saves and masks all the unmasked IO-APIC RTE's + */ +int save_mask_IO_APIC_setup(void) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + int apic, pin; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + + for (apic = 0; apic < nr_ioapics; apic++) { + early_ioapic_entries[apic] = + kzalloc(sizeof(struct IO_APIC_route_entry) * + nr_ioapic_registers[apic], GFP_KERNEL); + if (!early_ioapic_entries[apic]) + return -ENOMEM; + } + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + + entry = early_ioapic_entries[apic][pin] = + ioapic_read_entry(apic, pin); + if (!entry.mask) { + entry.mask = 1; + ioapic_write_entry(apic, pin, entry); + } + } + return 0; +} + +void restore_IO_APIC_setup(void) +{ + int apic, pin; + + for (apic = 0; apic < nr_ioapics; apic++) + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + ioapic_write_entry(apic, pin, + early_ioapic_entries[apic][pin]); +} + +void reinit_intr_remapped_IO_APIC(int intr_remapping) +{ + /* + * for now plain restore of previous settings. + * TBD: In the case of OS enabling interrupt-remapping, + * IO-APIC RTE's need to be setup to point to interrupt-remapping + * table entries. for now, do a plain restore, and wait for + * the setup_IO_APIC_irqs() to do proper initialization. + */ + restore_IO_APIC_setup(); +} +#endif + +/* + * Find the IRQ entry number of a certain pin. + */ +static int find_irq_entry(int apic, int pin, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == type && + (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) && + mp_irqs[i].mp_dstirq == pin) + return i; + + return -1; +} + +/* + * Find the pin to which IRQ[irq] (ISA) is connected + */ +static int __init find_isa_irq_pin(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + + return mp_irqs[i].mp_dstirq; + } + return -1; +} + +static int __init find_isa_irq_apic(int irq, int type) +{ + int i; + + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + if (test_bit(lbus, mp_bus_not_pci) && + (mp_irqs[i].mp_irqtype == type) && + (mp_irqs[i].mp_srcbusirq == irq)) + break; + } + if (i < mp_irq_entries) { + int apic; + for(apic = 0; apic < nr_ioapics; apic++) { + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) + return apic; + } + } + + return -1; +} + +/* + * Find a specific PCI IRQ entry. + * Not an __init, possibly needed by modules + */ +static int pin_2_irq(int idx, int apic, int pin); + +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) +{ + int apic, i, best_guess = -1; + + apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", + bus, slot, pin); + if (test_bit(bus, mp_bus_not_pci)) { + apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); + return -1; + } + for (i = 0; i < mp_irq_entries; i++) { + int lbus = mp_irqs[i].mp_srcbus; + + for (apic = 0; apic < nr_ioapics; apic++) + if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || + mp_irqs[i].mp_dstapic == MP_APIC_ALL) + break; + + if (!test_bit(lbus, mp_bus_not_pci) && + !mp_irqs[i].mp_irqtype && + (bus == lbus) && + (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { + int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); + + if (!(apic || IO_APIC_IRQ(irq))) + continue; + + if (pin == (mp_irqs[i].mp_srcbusirq & 3)) + return irq; + /* + * Use the first all-but-pin matching entry as a + * best-guess fuzzy result for broken mptables. + */ + if (best_guess < 0) + best_guess = irq; + } + } + return best_guess; +} + +EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); + +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) +/* + * EISA Edge/Level control register, ELCR + */ +static int EISA_ELCR(unsigned int irq) +{ + if (irq < 16) { + unsigned int port = 0x4d0 + (irq >> 3); + return (inb(port) >> (irq & 7)) & 1; + } + apic_printk(APIC_VERBOSE, KERN_INFO + "Broken MPtable reports ISA irq %d\n", irq); + return 0; +} + +#endif + +/* ISA interrupts are always polarity zero edge triggered, + * when listed as conforming in the MP table. */ + +#define default_ISA_trigger(idx) (0) +#define default_ISA_polarity(idx) (0) + +/* EISA interrupts are always polarity zero and can be edge or level + * trigger depending on the ELCR value. If an interrupt is listed as + * EISA conforming in the MP table, that means its trigger type must + * be read in from the ELCR */ + +#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) +#define default_EISA_polarity(idx) default_ISA_polarity(idx) + +/* PCI interrupts are always polarity one level triggered, + * when listed as conforming in the MP table. */ + +#define default_PCI_trigger(idx) (1) +#define default_PCI_polarity(idx) (1) + +/* MCA interrupts are always polarity zero level triggered, + * when listed as conforming in the MP table. */ + +#define default_MCA_trigger(idx) (1) +#define default_MCA_polarity(idx) default_ISA_polarity(idx) + +static int MPBIOS_polarity(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int polarity; + + /* + * Determine IRQ line polarity (high active or low active): + */ + switch (mp_irqs[idx].mp_irqflag & 3) + { + case 0: /* conforms, ie. bus-type dependent polarity */ + if (test_bit(bus, mp_bus_not_pci)) + polarity = default_ISA_polarity(idx); + else + polarity = default_PCI_polarity(idx); + break; + case 1: /* high active */ + { + polarity = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + case 3: /* low active */ + { + polarity = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + polarity = 1; + break; + } + } + return polarity; +} + +static int MPBIOS_trigger(int idx) +{ + int bus = mp_irqs[idx].mp_srcbus; + int trigger; + + /* + * Determine IRQ trigger mode (edge or level sensitive): + */ + switch ((mp_irqs[idx].mp_irqflag>>2) & 3) + { + case 0: /* conforms, ie. bus-type dependent */ + if (test_bit(bus, mp_bus_not_pci)) + trigger = default_ISA_trigger(idx); + else + trigger = default_PCI_trigger(idx); +#if defined(CONFIG_EISA) || defined(CONFIG_MCA) + switch (mp_bus_id_to_type[bus]) { + case MP_BUS_ISA: /* ISA pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_EISA: /* EISA pin */ + { + trigger = default_EISA_trigger(idx); + break; + } + case MP_BUS_PCI: /* PCI pin */ + { + /* set before the switch */ + break; + } + case MP_BUS_MCA: /* MCA pin */ + { + trigger = default_MCA_trigger(idx); + break; + } + default: + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + } +#endif + break; + case 1: /* edge */ + { + trigger = 0; + break; + } + case 2: /* reserved */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 1; + break; + } + case 3: /* level */ + { + trigger = 1; + break; + } + default: /* invalid */ + { + printk(KERN_WARNING "broken BIOS!!\n"); + trigger = 0; + break; + } + } + return trigger; +} + +static inline int irq_polarity(int idx) +{ + return MPBIOS_polarity(idx); +} + +static inline int irq_trigger(int idx) +{ + return MPBIOS_trigger(idx); +} + +int (*ioapic_renumber_irq)(int ioapic, int irq); +static int pin_2_irq(int idx, int apic, int pin) +{ + int irq, i; + int bus = mp_irqs[idx].mp_srcbus; + + /* + * Debugging check, we are in big trouble if this message pops up! + */ + if (mp_irqs[idx].mp_dstirq != pin) + printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); + + if (test_bit(bus, mp_bus_not_pci)) { + irq = mp_irqs[idx].mp_srcbusirq; + } else { + /* + * PCI IRQs are mapped in order + */ + i = irq = 0; + while (i < apic) + irq += nr_ioapic_registers[i++]; + irq += pin; + /* + * For MPS mode, so far only needed by ES7000 platform + */ + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); + } + +#ifdef CONFIG_X86_32 + /* + * PCI IRQ command line redirection. Yes, limits are hardcoded. + */ + if ((pin >= 16) && (pin <= 23)) { + if (pirq_entries[pin-16] != -1) { + if (!pirq_entries[pin-16]) { + apic_printk(APIC_VERBOSE, KERN_DEBUG + "disabling PIRQ%d\n", pin-16); + } else { + irq = pirq_entries[pin-16]; + apic_printk(APIC_VERBOSE, KERN_DEBUG + "using PIRQ%d -> IRQ %d\n", + pin-16, irq); + } + } + } +#endif + + return irq; +} + +void lock_vector_lock(void) +{ + /* Used to the online set of cpus does not change + * during assign_irq_vector. + */ + spin_lock(&vector_lock); +} + +void unlock_vector_lock(void) +{ + spin_unlock(&vector_lock); +} + +static int __assign_irq_vector(int irq, cpumask_t mask) +{ + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; + unsigned int old_vector; + int cpu; + struct irq_cfg *cfg; + + cfg = irq_cfg(irq); + + /* Only try and allocate irqs on cpus that are present */ + cpus_and(mask, mask, cpu_online_map); + + if ((cfg->move_in_progress) || cfg->move_cleanup_count) + return -EBUSY; + + old_vector = cfg->vector; + if (old_vector) { + cpumask_t tmp; + cpus_and(tmp, cfg->domain, mask); + if (!cpus_empty(tmp)) + return 0; + } + + for_each_cpu_mask_nr(cpu, mask) { + cpumask_t domain, new_mask; + int new_cpu; + int vector, offset; + + domain = vector_allocation_domain(cpu); + cpus_and(new_mask, domain, cpu_online_map); + + vector = current_vector; + offset = current_offset; +next: + vector += 8; + if (vector >= first_system_vector) { + /* If we run out of vectors on large boxen, must share them. */ + offset = (offset + 1) % 8; + vector = FIRST_DEVICE_VECTOR + offset; + } + if (unlikely(current_vector == vector)) + continue; +#ifdef CONFIG_X86_64 + if (vector == IA32_SYSCALL_VECTOR) + goto next; +#else + if (vector == SYSCALL_VECTOR) + goto next; +#endif + for_each_cpu_mask_nr(new_cpu, new_mask) + if (per_cpu(vector_irq, new_cpu)[vector] != -1) + goto next; + /* Found one! */ + current_vector = vector; + current_offset = offset; + if (old_vector) { + cfg->move_in_progress = 1; + cfg->old_domain = cfg->domain; + } + for_each_cpu_mask_nr(new_cpu, new_mask) + per_cpu(vector_irq, new_cpu)[vector] = irq; + cfg->vector = vector; + cfg->domain = domain; + return 0; + } + return -ENOSPC; +} + +static int assign_irq_vector(int irq, cpumask_t mask) +{ + int err; + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + err = __assign_irq_vector(irq, mask); + spin_unlock_irqrestore(&vector_lock, flags); + return err; +} + +static void __clear_irq_vector(int irq) +{ + struct irq_cfg *cfg; + cpumask_t mask; + int cpu, vector; + + cfg = irq_cfg(irq); + BUG_ON(!cfg->vector); + + vector = cfg->vector; + cpus_and(mask, cfg->domain, cpu_online_map); + for_each_cpu_mask_nr(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + cfg->vector = 0; + cpus_clear(cfg->domain); +} + +void __setup_vector_irq(int cpu) +{ + /* Initialize vector_irq on a new cpu */ + /* This function must be called with vector_lock held */ + int irq, vector; + struct irq_cfg *cfg; + + /* Mark the inuse vectors */ + for_each_irq_cfg(cfg) { + if (!cpu_isset(cpu, cfg->domain)) + continue; + vector = cfg->vector; + irq = cfg->irq; + per_cpu(vector_irq, cpu)[vector] = irq; + } + /* Mark the free vectors */ + for (vector = 0; vector < NR_VECTORS; ++vector) { + irq = per_cpu(vector_irq, cpu)[vector]; + if (irq < 0) + continue; + + cfg = irq_cfg(irq); + if (!cpu_isset(cpu, cfg->domain)) + per_cpu(vector_irq, cpu)[vector] = -1; + } +} + +static struct irq_chip ioapic_chip; +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip; +#endif + +#define IOAPIC_AUTO -1 +#define IOAPIC_EDGE 0 +#define IOAPIC_LEVEL 1 + +#ifdef CONFIG_X86_32 +static inline int IO_APIC_irq_trigger(int irq) +{ + int apic, idx, pin; + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* + * nonexistent IRQs are edge default + */ + return 0; +} +#else +static inline int IO_APIC_irq_trigger(int irq) +{ + return 1; +} +#endif + +static void ioapic_register_intr(int irq, unsigned long trigger) +{ + struct irq_desc *desc; + + /* first time to use this irq_desc */ + if (irq < 16) + desc = irq_to_desc(irq); + else + desc = irq_to_desc_alloc(irq); + + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + desc->status |= IRQ_LEVEL; + else + desc->status &= ~IRQ_LEVEL; + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + desc->status |= IRQ_MOVE_PCNTXT; + if (trigger) + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, + handle_edge_irq, "edge"); + return; + } +#endif + if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || + trigger == IOAPIC_LEVEL) + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_fasteoi_irq, + "fasteoi"); + else + set_irq_chip_and_handler_name(irq, &ioapic_chip, + handle_edge_irq, "edge"); +} + +static int setup_ioapic_entry(int apic, int irq, + struct IO_APIC_route_entry *entry, + unsigned int destination, int trigger, + int polarity, int vector) +{ + /* + * add it to the IO-APIC irq-routing table: + */ + memset(entry,0,sizeof(*entry)); + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) { + struct intel_iommu *iommu = map_ioapic_to_ir(apic); + struct irte irte; + struct IR_IO_APIC_route_entry *ir_entry = + (struct IR_IO_APIC_route_entry *) entry; + int index; + + if (!iommu) + panic("No mapping iommu for ioapic %d\n", apic); + + index = alloc_irte(iommu, irq, 1); + if (index < 0) + panic("Failed to allocate IRTE for ioapic %d\n", apic); + + memset(&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = trigger; + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = vector; + irte.dest_id = IRTE_DEST(destination); + + modify_irte(irq, &irte); + + ir_entry->index2 = (index >> 15) & 0x1; + ir_entry->zero = 0; + ir_entry->format = 1; + ir_entry->index = (index & 0x7fff); + } else +#endif + { + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->dest = destination; + } + + entry->mask = 0; /* enable IRQ */ + entry->trigger = trigger; + entry->polarity = polarity; + entry->vector = vector; + + /* Mask level triggered irqs. + * Use IRQ_DELAYED_DISABLE for edge triggered irqs. + */ + if (trigger) + entry->mask = 1; + return 0; +} + +static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, + int trigger, int polarity) +{ + struct irq_cfg *cfg; + struct IO_APIC_route_entry entry; + cpumask_t mask; + + if (!IO_APIC_IRQ(irq)) + return; + + cfg = irq_cfg(irq); + + mask = TARGET_CPUS; + if (assign_irq_vector(irq, mask)) + return; + + cpus_and(mask, cfg->domain, mask); + + apic_printk(APIC_VERBOSE,KERN_DEBUG + "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " + "IRQ %d Mode:%i Active:%i)\n", + apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, + irq, trigger, polarity); + + + if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, + cpu_mask_to_apicid(mask), trigger, polarity, + cfg->vector)) { + printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", + mp_ioapics[apic].mp_apicid, pin); + __clear_irq_vector(irq); + return; + } + + ioapic_register_intr(irq, trigger); + if (irq < 16) + disable_8259A_irq(irq); + + ioapic_write_entry(apic, pin, entry); +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic,pin,mp_INT); + if (idx == -1) { + if (first_notcon) { + apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); + first_notcon = 0; + } else + apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); + continue; + } + if (!first_notcon) { + apic_printk(APIC_VERBOSE, " not connected.\n"); + first_notcon = 1; + } + + irq = pin_2_irq(idx, apic, pin); +#ifdef CONFIG_X86_32 + if (multi_timer_check(apic, irq)) + continue; +#endif + add_pin_to_irq(irq, apic, pin); + + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + } + + if (!first_notcon) + apic_printk(APIC_VERBOSE, " not connected.\n"); +} + +/* + * Set up the timer pin, possibly with the 8259A-master behind. + */ +static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, + int vector) +{ + struct IO_APIC_route_entry entry; + +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + return; +#endif + + memset(&entry, 0, sizeof(entry)); + + /* + * We use logical delivery to get the timer IRQ + * to the first CPU. + */ + entry.dest_mode = INT_DEST_MODE; + entry.mask = 1; /* mask IRQ now */ + entry.dest = cpu_mask_to_apicid(TARGET_CPUS); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.polarity = 0; + entry.trigger = 0; + entry.vector = vector; + + /* + * The timer IRQ doesn't have to know that behind the + * scene we may have a 8259A-master in AEOI mode ... + */ + set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(apic, pin, entry); +} + + +__apicdebuginit(void) print_IO_APIC(void) +{ + int apic, i; + union IO_APIC_reg_00 reg_00; + union IO_APIC_reg_01 reg_01; + union IO_APIC_reg_02 reg_02; + union IO_APIC_reg_03 reg_03; + unsigned long flags; + struct irq_cfg *cfg; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); + for (i = 0; i < nr_ioapics; i++) + printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", + mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); + + /* + * We are a bit conservative about what we expect. We have to + * know about every hardware change ASAP. + */ + printk(KERN_INFO "testing the IO APIC.......................\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + reg_01.raw = io_apic_read(apic, 1); + if (reg_01.bits.version >= 0x10) + reg_02.raw = io_apic_read(apic, 2); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); + spin_unlock_irqrestore(&ioapic_lock, flags); + + printk("\n"); + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, + * but the value of reg_02 is read as the previous read register + * value, so ignore it if reg_02 == reg_01. + */ + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); + } + + /* + * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 + * or reg_03, but the value of reg_0[23] is read as the previous read + * register value, so ignore it if reg_03 == reg_0[12]. + */ + if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); + + printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" + " Stat Dmod Deli Vect: \n"); + + for (i = 0; i <= reg_01.bits.entries; i++) { + struct IO_APIC_route_entry entry; + + entry = ioapic_read_entry(apic, i); + + printk(KERN_DEBUG " %02x %03X ", + i, + entry.dest + ); + + printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", + entry.mask, + entry.trigger, + entry.irr, + entry.polarity, + entry.delivery_status, + entry.dest_mode, + entry.delivery_mode, + entry.vector + ); + } + } + printk(KERN_DEBUG "IRQ to pin mappings:\n"); + for_each_irq_cfg(cfg) { + struct irq_pin_list *entry = cfg->irq_2_pin; + if (!entry) + continue; + printk(KERN_DEBUG "IRQ%d ", cfg->irq); + for (;;) { + printk("-> %d:%d", entry->apic, entry->pin); + if (!entry->next) + break; + entry = entry->next; + } + printk("\n"); + } + + printk(KERN_INFO ".................................... done.\n"); + + return; +} + +__apicdebuginit(void) print_APIC_bitfield(int base) +{ + unsigned int v; + int i, j; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); + for (i = 0; i < 8; i++) { + v = apic_read(base + i*0x10); + for (j = 0; j < 32; j++) { + if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + + v = apic_read(APIC_ESR); + printk(KERN_DEBUG "... APIC ESR: %08x\n", v); + } + + icr = apic_icr_read(); + printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); + + v = apic_read(APIC_LVTT); + printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); + + if (maxlvt > 3) { /* PC is LVT#4. */ + v = apic_read(APIC_LVTPC); + printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); + } + v = apic_read(APIC_LVT0); + printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); + v = apic_read(APIC_LVT1); + printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); + + if (maxlvt > 2) { /* ERR is LVT#3. */ + v = apic_read(APIC_LVTERR); + printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); + } + + v = apic_read(APIC_TMICT); + printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); + v = apic_read(APIC_TMCCT); + printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); + v = apic_read(APIC_TDCR); + printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); + printk("\n"); +} + +__apicdebuginit(void) print_all_local_APICs(void) +{ + on_each_cpu(print_local_APIC, NULL, 1); +} + +__apicdebuginit(void) print_PIC(void) +{ + unsigned int v; + unsigned long flags; + + if (apic_verbosity == APIC_QUIET) + return; + + printk(KERN_DEBUG "\nprinting PIC contents\n"); + + spin_lock_irqsave(&i8259A_lock, flags); + + v = inb(0xa1) << 8 | inb(0x21); + printk(KERN_DEBUG "... PIC IMR: %04x\n", v); + + v = inb(0xa0) << 8 | inb(0x20); + printk(KERN_DEBUG "... PIC IRR: %04x\n", v); + + outb(0x0b,0xa0); + outb(0x0b,0x20); + v = inb(0xa0) << 8 | inb(0x20); + outb(0x0a,0xa0); + outb(0x0a,0x20); + + spin_unlock_irqrestore(&i8259A_lock, flags); + + printk(KERN_DEBUG "... PIC ISR: %04x\n", v); + + v = inb(0x4d1) << 8 | inb(0x4d0); + printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); +} + +__apicdebuginit(int) print_all_ICs(void) +{ + print_PIC(); + print_all_local_APICs(); + print_IO_APIC(); + + return 0; +} + +fs_initcall(print_all_ICs); + + +/* Where if anywhere is the i8259 connect in external int mode */ +static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; + +void __init enable_IO_APIC(void) +{ + union IO_APIC_reg_01 reg_01; + int i8259_apic, i8259_pin; + int apic; + unsigned long flags; + +#ifdef CONFIG_X86_32 + int i; + if (!pirqs_enabled) + for (i = 0; i < MAX_PIRQS; i++) + pirq_entries[i] = -1; +#endif + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + for (apic = 0; apic < nr_ioapics; apic++) { + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(apic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + nr_ioapic_registers[apic] = reg_01.bits.entries+1; + } + for(apic = 0; apic < nr_ioapics; apic++) { + int pin; + /* See if any of the pins is in ExtINT mode */ + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + struct IO_APIC_route_entry entry; + entry = ioapic_read_entry(apic, pin); + + /* If the interrupt line is enabled and in ExtInt mode + * I have found the pin where the i8259 is connected. + */ + if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { + ioapic_i8259.apic = apic; + ioapic_i8259.pin = pin; + goto found_i8259; + } + } + } + found_i8259: + /* Look to see what if the MP table has reported the ExtINT */ + /* If we could not find the appropriate pin by looking at the ioapic + * the i8259 probably is not connected the ioapic but give the + * mptable a chance anyway. + */ + i8259_pin = find_isa_irq_pin(0, mp_ExtINT); + i8259_apic = find_isa_irq_apic(0, mp_ExtINT); + /* Trust the MP table if nothing is setup in the hardware */ + if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { + printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); + ioapic_i8259.pin = i8259_pin; + ioapic_i8259.apic = i8259_apic; + } + /* Complain if the MP table and the hardware disagree */ + if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && + (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) + { + printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); + } + + /* + * Do not trust the IO-APIC being empty at bootup + */ + clear_IO_APIC(); +} + +/* + * Not an __init, needed by the reboot code + */ +void disable_IO_APIC(void) +{ + /* + * Clear the IO-APIC before rebooting: + */ + clear_IO_APIC(); + + /* + * If the i8259 is routed through an IOAPIC + * Put that IOAPIC in virtual wire mode + * so legacy interrupts can be delivered. + */ + if (ioapic_i8259.pin != -1) { + struct IO_APIC_route_entry entry; + + memset(&entry, 0, sizeof(entry)); + entry.mask = 0; /* Enabled */ + entry.trigger = 0; /* Edge */ + entry.irr = 0; + entry.polarity = 0; /* High */ + entry.delivery_status = 0; + entry.dest_mode = 0; /* Physical */ + entry.delivery_mode = dest_ExtINT; /* ExtInt */ + entry.vector = 0; + entry.dest = read_apic_id(); + + /* + * Add it to the IO-APIC irq-routing table: + */ + ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); + } + + disconnect_bsp_APIC(ioapic_i8259.pin != -1); +} + +#ifdef CONFIG_X86_32 +/* + * function to set the IO-APIC physical IDs based on the + * values stored in the MPC table. + * + * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 + */ + +static void __init setup_ioapic_ids_from_mpc(void) +{ + union IO_APIC_reg_00 reg_00; + physid_mask_t phys_id_present_map; + int apic; + int i; + unsigned char old_id; + unsigned long flags; + + if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) + return; + + /* + * Don't check I/O APIC IDs for xAPIC systems. They have + * no meaning without the serial APIC bus. + */ + if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) + return; + /* + * This is broken; anything with a real cpu count has to + * circumvent this idiocy regardless. + */ + phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); + + /* + * Set the IOAPIC ID to the value stored in the MPC table. + */ + for (apic = 0; apic < nr_ioapics; apic++) { + + /* Read the register 0 value */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + old_id = mp_ioapics[apic].mp_apicid; + + if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", + apic, mp_ioapics[apic].mp_apicid); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + reg_00.bits.ID); + mp_ioapics[apic].mp_apicid = reg_00.bits.ID; + } + + /* + * Sanity check, is the ID really free? Every APIC in a + * system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(phys_id_present_map, + mp_ioapics[apic].mp_apicid)) { + printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", + apic, mp_ioapics[apic].mp_apicid); + for (i = 0; i < get_physical_broadcast(); i++) + if (!physid_isset(i, phys_id_present_map)) + break; + if (i >= get_physical_broadcast()) + panic("Max APIC ID exceeded!\n"); + printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", + i); + physid_set(i, phys_id_present_map); + mp_ioapics[apic].mp_apicid = i; + } else { + physid_mask_t tmp; + tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); + apic_printk(APIC_VERBOSE, "Setting %d in the " + "phys_id_present_map\n", + mp_ioapics[apic].mp_apicid); + physids_or(phys_id_present_map, phys_id_present_map, tmp); + } + + + /* + * We need to adjust the IRQ routing table + * if the ID changed. + */ + if (old_id != mp_ioapics[apic].mp_apicid) + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_dstapic == old_id) + mp_irqs[i].mp_dstapic + = mp_ioapics[apic].mp_apicid; + + /* + * Read the right value from the MPC table and + * write it into the ID register. + */ + apic_printk(APIC_VERBOSE, KERN_INFO + "...changing IO-APIC physical APIC ID to %d ...", + mp_ioapics[apic].mp_apicid); + + reg_00.bits.ID = mp_ioapics[apic].mp_apicid; + spin_lock_irqsave(&ioapic_lock, flags); + + /* + * Sanity check + */ + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(apic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) + printk("could not set ID!\n"); + else + apic_printk(APIC_VERBOSE, " ok.\n"); + } +} +#endif + +int no_timer_check __initdata; + +static int __init notimercheck(char *s) +{ + no_timer_check = 1; + return 1; +} +__setup("no_timer_check", notimercheck); + +/* + * There is a nasty bug in some older SMP boards, their mptable lies + * about the timer IRQ. We do the following to work around the situation: + * + * - timer IRQ defaults to IO-APIC IRQ + * - if this function detects that timer IRQs are defunct, then we fall + * back to ISA timer IRQs + */ +static int __init timer_irq_works(void) +{ + unsigned long t1 = jiffies; + unsigned long flags; + + if (no_timer_check) + return 1; + + local_save_flags(flags); + local_irq_enable(); + /* Let ten ticks pass... */ + mdelay((10 * 1000) / HZ); + local_irq_restore(flags); + + /* + * Expect a few ticks at least, to be sure some possible + * glue logic does not lock up after one or two first + * ticks in a non-ExtINT mode. Also the local APIC + * might have cached one ExtINT interrupt. Finally, at + * least one tick may be lost due to delays. + */ + + /* jiffies wrap? */ + if (time_after(jiffies, t1 + 4)) + return 1; + return 0; +} + +/* + * In the SMP+IOAPIC case it might happen that there are an unspecified + * number of pending IRQ events unhandled. These cases are very rare, + * so we 'resend' these IRQs via IPIs, to the same CPU. It's much + * better to do it this way as thus we do not have to be aware of + * 'pending' interrupts in the IRQ path, except at this point. + */ +/* + * Edge triggered needs to resend any interrupt + * that was delayed but this is now handled in the device + * independent code. + */ + +/* + * Starting up a edge-triggered IO-APIC interrupt is + * nasty - we need to make sure that we get the edge. + * If it is already asserted for some reason, we need + * return 1 to indicate that is was pending. + * + * This is not complete - we should be able to fake + * an edge even if it isn't on the 8259A... + */ + +static unsigned int startup_ioapic_irq(unsigned int irq) +{ + int was_pending = 0; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + if (irq < 16) { + disable_8259A_irq(irq); + if (i8259A_irq_pending(irq)) + was_pending = 1; + } + __unmask_IO_APIC_irq(irq); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return was_pending; +} + +#ifdef CONFIG_X86_64 +static int ioapic_retrigger_irq(unsigned int irq) +{ + + struct irq_cfg *cfg = irq_cfg(irq); + unsigned long flags; + + spin_lock_irqsave(&vector_lock, flags); + send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); + spin_unlock_irqrestore(&vector_lock, flags); + + return 1; +} +#else +static int ioapic_retrigger_irq(unsigned int irq) +{ + send_IPI_self(irq_cfg(irq)->vector); + + return 1; +} +#endif + +/* + * Level and edge triggered IO-APIC interrupts need different handling, + * so we use two separate IRQ descriptors. Edge triggered IRQs can be + * handled with the level-triggered descriptor, but that one has slightly + * more overhead. Level-triggered interrupts cannot be handled with the + * edge-triggered handler, without risking IRQ storms and other ugly + * races. + */ + +#ifdef CONFIG_SMP + +#ifdef CONFIG_INTR_REMAP +static void ir_irq_migration(struct work_struct *work); + +static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); + +/* + * Migrate the IO-APIC irq in the presence of intr-remapping. + * + * For edge triggered, irq migration is a simple atomic update(of vector + * and cpu destination) of IRTE and flush the hardware cache. + * + * For level triggered, we need to modify the io-apic RTE aswell with the update + * vector information, along with modifying IRTE with vector and destination. + * So irq migration for level triggered is little bit more complex compared to + * edge triggered migration. But the good news is, we use the same algorithm + * for level triggered migration as we have today, only difference being, + * we now initiate the irq migration from process context instead of the + * interrupt context. + * + * In future, when we do a directed EOI (combined with cpu EOI broadcast + * suppression) to the IO-APIC, level triggered irq migration will also be + * as simple as edge triggered migration and we can do the irq migration + * with a simple atomic update to IO-APIC RTE. + */ +static void migrate_ioapic_irq(int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + cpumask_t tmp, cleanup_mask; + struct irte irte; + int modify_ioapic_rte; + unsigned int dest; + unsigned long flags; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + desc = irq_to_desc(irq); + modify_ioapic_rte = desc->status & IRQ_LEVEL; + if (modify_ioapic_rte) { + spin_lock_irqsave(&ioapic_lock, flags); + __target_IO_APIC_irq(irq, dest, cfg->vector); + spin_unlock_irqrestore(&ioapic_lock, flags); + } + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * Modified the IRTE and flushes the Interrupt entry cache. + */ + modify_irte(irq, &irte); + + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc->affinity = mask; +} + +static int migrate_irq_remapped_level(int irq) +{ + int ret = -1; + struct irq_desc *desc = irq_to_desc(irq); + + mask_IO_APIC_irq(irq); + + if (io_apic_level_ack_pending(irq)) { + /* + * Interrupt in progress. Migrating irq now will change the + * vector information in the IO-APIC RTE and that will confuse + * the EOI broadcast performed by cpu. + * So, delay the irq migration to the next instance. + */ + schedule_delayed_work(&ir_migration_work, 1); + goto unmask; + } + + /* everthing is clear. we have right of way */ + migrate_ioapic_irq(irq, desc->pending_mask); + + ret = 0; + desc->status &= ~IRQ_MOVE_PENDING; + cpus_clear(desc->pending_mask); + +unmask: + unmask_IO_APIC_irq(irq); + return ret; +} + +static void ir_irq_migration(struct work_struct *work) +{ + unsigned int irq; + struct irq_desc *desc; + + for_each_irq_desc(irq, desc) { + if (desc->status & IRQ_MOVE_PENDING) { + unsigned long flags; + + spin_lock_irqsave(&desc->lock, flags); + if (!desc->chip->set_affinity || + !(desc->status & IRQ_MOVE_PENDING)) { + desc->status &= ~IRQ_MOVE_PENDING; + spin_unlock_irqrestore(&desc->lock, flags); + continue; + } + + desc->chip->set_affinity(irq, desc->pending_mask); + spin_unlock_irqrestore(&desc->lock, flags); + } + } +} + +/* + * Migrates the IRQ destination in the process context. + */ +static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) +{ + struct irq_desc *desc = irq_to_desc(irq); + + if (desc->status & IRQ_LEVEL) { + desc->status |= IRQ_MOVE_PENDING; + desc->pending_mask = mask; + migrate_irq_remapped_level(irq); + return; + } + + migrate_ioapic_irq(irq, mask); +} +#endif + +asmlinkage void smp_irq_move_cleanup_interrupt(void) +{ + unsigned vector, me; + ack_APIC_irq(); +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + + me = smp_processor_id(); + for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { + unsigned int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + irq = __get_cpu_var(vector_irq)[vector]; + + desc = irq_to_desc(irq); + if (!desc) + continue; + + cfg = irq_cfg(irq); + spin_lock(&desc->lock); + if (!cfg->move_cleanup_count) + goto unlock; + + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) + goto unlock; + + __get_cpu_var(vector_irq)[vector] = -1; + cfg->move_cleanup_count--; +unlock: + spin_unlock(&desc->lock); + } + + irq_exit(); +} + +static void irq_complete_move(unsigned int irq) +{ + struct irq_cfg *cfg = irq_cfg(irq); + unsigned vector, me; + + if (likely(!cfg->move_in_progress)) + return; + + vector = ~get_irq_regs()->orig_ax; + me = smp_processor_id(); + if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } +} +#else +static inline void irq_complete_move(unsigned int irq) {} +#endif +#ifdef CONFIG_INTR_REMAP +static void ack_x2apic_level(unsigned int irq) +{ + ack_x2APIC_irq(); +} + +static void ack_x2apic_edge(unsigned int irq) +{ + ack_x2APIC_irq(); +} +#endif + +static void ack_apic_edge(unsigned int irq) +{ + irq_complete_move(irq); + move_native_irq(irq); + ack_APIC_irq(); +} + +#ifdef CONFIG_X86_64 +static void ack_apic_level(unsigned int irq) +{ + int do_unmask_irq = 0; + + irq_complete_move(irq); +#ifdef CONFIG_GENERIC_PENDING_IRQ + /* If we are moving the irq we need to mask it */ + if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { + do_unmask_irq = 1; + mask_IO_APIC_irq(irq); + } +#endif + + /* + * We must acknowledge the irq before we move it or the acknowledge will + * not propagate properly. + */ + ack_APIC_irq(); + + /* Now we can move and renable the irq */ + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. + * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); + unmask_IO_APIC_irq(irq); + } +} +#else +atomic_t irq_mis_count; +static void ack_apic_level(unsigned int irq) +{ + unsigned long v; + int i; + + irq_complete_move(irq); + move_native_irq(irq); + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); + + ack_APIC_irq(); + + if (!(v & (1 << (i & 0x1f)))) { + atomic_inc(&irq_mis_count); + spin_lock(&ioapic_lock); + __mask_and_edge_IO_APIC_irq(irq); + __unmask_and_level_IO_APIC_irq(irq); + spin_unlock(&ioapic_lock); + } +} +#endif + +static struct irq_chip ioapic_chip __read_mostly = { + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip ir_ioapic_chip __read_mostly = { + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, +#ifdef CONFIG_SMP + .set_affinity = set_ir_ioapic_affinity_irq, +#endif + .retrigger = ioapic_retrigger_irq, +}; +#endif + +static inline void init_IO_APIC_traps(void) +{ + int irq; + struct irq_desc *desc; + struct irq_cfg *cfg; + + /* + * NOTE! The local APIC isn't very good at handling + * multiple interrupts at the same interrupt level. + * As the interrupt level is determined by taking the + * vector number and shifting that right by 4, we + * want to spread these out a bit so that they don't + * all fall in the same interrupt level. + * + * Also, we've got to be careful not to trash gate + * 0x80, because int 0x80 is hm, kind of importantish. ;) + */ + for_each_irq_cfg(cfg) { + irq = cfg->irq; + if (IO_APIC_IRQ(irq) && !cfg->vector) { + /* + * Hmm.. We don't have an entry for this, + * so default to an old-fashioned 8259 + * interrupt if we can.. + */ + if (irq < 16) + make_8259A_irq(irq); + else { + desc = irq_to_desc(irq); + /* Strange. Oh, well.. */ + desc->chip = &no_irq_chip; + } + } + } +} + +/* + * The local APIC irq-chip implementation: + */ + +static void mask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); +} + +static void unmask_lapic_irq(unsigned int irq) +{ + unsigned long v; + + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); +} + +static void ack_lapic_irq (unsigned int irq) +{ + ack_APIC_irq(); +} + +static struct irq_chip lapic_chip __read_mostly = { + .name = "local-APIC", + .mask = mask_lapic_irq, + .unmask = unmask_lapic_irq, + .ack = ack_lapic_irq, +}; + +static void lapic_register_intr(int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + desc->status &= ~IRQ_LEVEL; + set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, + "edge"); +} + +static void __init setup_nmi(void) +{ + /* + * Dirty trick to enable the NMI watchdog ... + * We put the 8259A master into AEOI mode and + * unmask on all local APICs LVT0 as NMI. + * + * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') + * is from Maciej W. Rozycki - so we do not have to EOI from + * the NMI handler or the timer interrupt. + */ + apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); + + enable_NMI_through_LVT0(); + + apic_printk(APIC_VERBOSE, " done.\n"); +} + +/* + * This looks a bit hackish but it's about the only one way of sending + * a few INTA cycles to 8259As and any associated glue logic. ICR does + * not support the ExtINT mode, unfortunately. We need to send these + * cycles as some i82489DX-based boards have glue logic that keeps the + * 8259A interrupt line asserted until INTA. --macro + */ +static inline void __init unlock_ExtINT_logic(void) +{ + int apic, pin, i; + struct IO_APIC_route_entry entry0, entry1; + unsigned char save_control, save_freq_select; + + pin = find_isa_irq_pin(8, mp_INT); + if (pin == -1) { + WARN_ON_ONCE(1); + return; + } + apic = find_isa_irq_apic(8, mp_INT); + if (apic == -1) { + WARN_ON_ONCE(1); + return; + } + + entry0 = ioapic_read_entry(apic, pin); + clear_IO_APIC_pin(apic, pin); + + memset(&entry1, 0, sizeof(entry1)); + + entry1.dest_mode = 0; /* physical delivery */ + entry1.mask = 0; /* unmask IRQ now */ + entry1.dest = hard_smp_processor_id(); + entry1.delivery_mode = dest_ExtINT; + entry1.polarity = entry0.polarity; + entry1.trigger = 0; + entry1.vector = 0; + + ioapic_write_entry(apic, pin, entry1); + + save_control = CMOS_READ(RTC_CONTROL); + save_freq_select = CMOS_READ(RTC_FREQ_SELECT); + CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, + RTC_FREQ_SELECT); + CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); + + i = 100; + while (i-- > 0) { + mdelay(10); + if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) + i -= 10; + } + + CMOS_WRITE(save_control, RTC_CONTROL); + CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); + clear_IO_APIC_pin(apic, pin); + + ioapic_write_entry(apic, pin, entry0); +} + +static int disable_timer_pin_1 __initdata; +/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ +static int __init disable_timer_pin_setup(char *arg) +{ + disable_timer_pin_1 = 1; + return 0; +} +early_param("disable_timer_pin_1", disable_timer_pin_setup); + +int timer_through_8259 __initdata; + +/* + * This code may look a bit paranoid, but it's supposed to cooperate with + * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ + * is so screwy. Thanks to Brian Perkins for testing/hacking this beast + * fanatically on his truly buggy board. + * + * FIXME: really need to revamp this for all platforms. + */ +static inline void __init check_timer(void) +{ + struct irq_cfg *cfg = irq_cfg(0); + int apic1, pin1, apic2, pin2; + unsigned long flags; + unsigned int ver; + int no_pin1 = 0; + + local_irq_save(flags); + + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); + + /* + * get/set the timer IRQ vector: + */ + disable_8259A_irq(0); + assign_irq_vector(0, TARGET_CPUS); + + /* + * As IRQ0 is to be enabled in the 8259A, the virtual + * wire has to be disabled in the local APIC. Also + * timer interrupts need to be acknowledged manually in + * the 8259A for the i82489DX when using the NMI + * watchdog as that APIC treats NMIs as level-triggered. + * The AEOI mode will finish them in the 8259A + * automatically. + */ + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); + init_8259A(1); +#ifdef CONFIG_X86_32 + timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); +#endif + + pin1 = find_isa_irq_pin(0, mp_INT); + apic1 = find_isa_irq_apic(0, mp_INT); + pin2 = ioapic_i8259.pin; + apic2 = ioapic_i8259.apic; + + apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " + "apic1=%d pin1=%d apic2=%d pin2=%d\n", + cfg->vector, apic1, pin1, apic2, pin2); + + /* + * Some BIOS writers are clueless and report the ExtINTA + * I/O APIC input from the cascaded 8259A as the timer + * interrupt input. So just in case, if only one pin + * was found above, try it both directly and through the + * 8259A. + */ + if (pin1 == -1) { +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("BIOS bug: timer not connected to IO-APIC"); +#endif + pin1 = pin2; + apic1 = apic2; + no_pin1 = 1; + } else if (pin2 == -1) { + pin2 = pin1; + apic2 = apic1; + } + + if (pin1 != -1) { + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (no_pin1) { + add_pin_to_irq(0, apic1, pin1); + setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); + } + unmask_IO_APIC_irq(0); + if (timer_irq_works()) { + if (nmi_watchdog == NMI_IO_APIC) { + setup_nmi(); + enable_8259A_irq(0); + } + if (disable_timer_pin_1 > 0) + clear_IO_APIC_pin(0, pin1); + goto out; + } +#ifdef CONFIG_INTR_REMAP + if (intr_remapping_enabled) + panic("timer doesn't work through Interrupt-remapped IO-APIC"); +#endif + clear_IO_APIC_pin(apic1, pin1); + if (!no_pin1) + apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " + "8254 timer not connected to IO-APIC\n"); + + apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " + "(IRQ0) through the 8259A ...\n"); + apic_printk(APIC_QUIET, KERN_INFO + "..... (found apic %d pin %d) ...\n", apic2, pin2); + /* + * legacy devices should be connected to IO APIC #0 + */ + replace_pin_at_irq(0, apic1, pin1, apic2, pin2); + setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); + unmask_IO_APIC_irq(0); + enable_8259A_irq(0); + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); + timer_through_8259 = 1; + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + goto out; + } + /* + * Cleanup, just in case ... + */ + disable_8259A_irq(0); + clear_IO_APIC_pin(apic2, pin2); + apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); + } + + if (nmi_watchdog == NMI_IO_APIC) { + apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " + "through the IO-APIC - disabling NMI Watchdog!\n"); + nmi_watchdog = NMI_NONE; + } +#ifdef CONFIG_X86_32 + timer_ack = 0; +#endif + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as Virtual Wire IRQ...\n"); + + lapic_register_intr(0); + apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ + enable_8259A_irq(0); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + disable_8259A_irq(0); + apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); + apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); + + apic_printk(APIC_QUIET, KERN_INFO + "...trying to set up timer as ExtINT IRQ...\n"); + + init_8259A(0); + make_8259A_irq(0); + apic_write(APIC_LVT0, APIC_DM_EXTINT); + + unlock_ExtINT_logic(); + + if (timer_irq_works()) { + apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); + goto out; + } + apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " + "report. Then try booting with the 'noapic' option.\n"); +out: + local_irq_restore(flags); +} + +/* + * Traditionally ISA IRQ2 is the cascade IRQ, and is not available + * to devices. However there may be an I/O APIC pin available for + * this interrupt regardless. The pin may be left unconnected, but + * typically it will be reused as an ExtINT cascade interrupt for + * the master 8259A. In the MPS case such a pin will normally be + * reported as an ExtINT interrupt in the MP table. With ACPI + * there is no provision for ExtINT interrupts, and in the absence + * of an override it would be treated as an ordinary ISA I/O APIC + * interrupt, that is edge-triggered and unmasked by default. We + * used to do this, but it caused problems on some systems because + * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using + * the same ExtINT cascade interrupt to drive the local APIC of the + * bootstrap processor. Therefore we refrain from routing IRQ2 to + * the I/O APIC in all cases now. No actual device should request + * it anyway. --macro + */ +#define PIC_IRQS (1 << PIC_CASCADE_IR) + +void __init setup_IO_APIC(void) +{ + +#ifdef CONFIG_X86_32 + enable_IO_APIC(); +#else + /* + * calling enable_IO_APIC() is moved to setup_local_APIC for BP + */ +#endif + + io_apic_irqs = ~PIC_IRQS; + + apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); + /* + * Set up IO-APIC IRQ routing. + */ +#ifdef CONFIG_X86_32 + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); +#endif + sync_Arb_IDs(); + setup_IO_APIC_irqs(); + init_IO_APIC_traps(); + check_timer(); +} + +/* + * Called after all the initialization is done. If we didnt find any + * APIC bugs then we can allow the modify fast path + */ + +static int __init io_apic_bug_finalize(void) +{ + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; +} + +late_initcall(io_apic_bug_finalize); + +struct sysfs_ioapic_data { + struct sys_device dev; + struct IO_APIC_route_entry entry[0]; +}; +static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; + +static int ioapic_suspend(struct sys_device *dev, pm_message_t state) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) + *entry = ioapic_read_entry(dev->id, i); + + return 0; +} + +static int ioapic_resume(struct sys_device *dev) +{ + struct IO_APIC_route_entry *entry; + struct sysfs_ioapic_data *data; + unsigned long flags; + union IO_APIC_reg_00 reg_00; + int i; + + data = container_of(dev, struct sysfs_ioapic_data, dev); + entry = data->entry; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(dev->id, 0); + if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { + reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; + io_apic_write(dev->id, 0, reg_00.raw); + } + spin_unlock_irqrestore(&ioapic_lock, flags); + for (i = 0; i < nr_ioapic_registers[dev->id]; i++) + ioapic_write_entry(dev->id, i, entry[i]); + + return 0; +} + +static struct sysdev_class ioapic_sysdev_class = { + .name = "ioapic", + .suspend = ioapic_suspend, + .resume = ioapic_resume, +}; + +static int __init ioapic_init_sysfs(void) +{ + struct sys_device * dev; + int i, size, error; + + error = sysdev_class_register(&ioapic_sysdev_class); + if (error) + return error; + + for (i = 0; i < nr_ioapics; i++ ) { + size = sizeof(struct sys_device) + nr_ioapic_registers[i] + * sizeof(struct IO_APIC_route_entry); + mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); + if (!mp_ioapic_data[i]) { + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + dev = &mp_ioapic_data[i]->dev; + dev->id = i; + dev->cls = &ioapic_sysdev_class; + error = sysdev_register(dev); + if (error) { + kfree(mp_ioapic_data[i]); + mp_ioapic_data[i] = NULL; + printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); + continue; + } + } + + return 0; +} + +device_initcall(ioapic_init_sysfs); + +/* + * Dynamic irq allocate and deallocation + */ +unsigned int create_irq_nr(unsigned int irq_want) +{ + /* Allocate an unused irq */ + unsigned int irq; + unsigned int new; + unsigned long flags; + struct irq_cfg *cfg_new; + +#ifndef CONFIG_HAVE_SPARSE_IRQ + irq_want = nr_irqs - 1; +#endif + + irq = 0; + spin_lock_irqsave(&vector_lock, flags); + for (new = irq_want; new > 0; new--) { + if (platform_legacy_irq(new)) + continue; + cfg_new = irq_cfg(new); + if (cfg_new && cfg_new->vector != 0) + continue; + /* check if need to create one */ + if (!cfg_new) + cfg_new = irq_cfg_alloc(new); + if (__assign_irq_vector(new, TARGET_CPUS) == 0) + irq = new; + break; + } + spin_unlock_irqrestore(&vector_lock, flags); + + if (irq > 0) { + dynamic_irq_init(irq); + } + return irq; +} + +int create_irq(void) +{ + int irq; + + irq = create_irq_nr(nr_irqs - 1); + + if (irq == 0) + irq = -1; + + return irq; +} + +void destroy_irq(unsigned int irq) +{ + unsigned long flags; + + dynamic_irq_cleanup(irq); + +#ifdef CONFIG_INTR_REMAP + free_irte(irq); +#endif + spin_lock_irqsave(&vector_lock, flags); + __clear_irq_vector(irq); + spin_unlock_irqrestore(&vector_lock, flags); +} + +/* + * MSI message composition + */ +#ifdef CONFIG_PCI_MSI +static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) +{ + struct irq_cfg *cfg; + int err; + unsigned dest; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (err) + return err; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irte irte; + int ir_index; + u16 sub_handle; + + ir_index = map_irq_to_irte_handle(irq, &sub_handle); + BUG_ON(ir_index == -1); + + memset (&irte, 0, sizeof(irte)); + + irte.present = 1; + irte.dst_mode = INT_DEST_MODE; + irte.trigger_mode = 0; /* edge */ + irte.dlvry_mode = INT_DELIVERY_MODE; + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + modify_irte(irq, &irte); + + msg->address_hi = MSI_ADDR_BASE_HI; + msg->data = sub_handle; + msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | + MSI_ADDR_IR_SHV | + MSI_ADDR_IR_INDEX1(ir_index) | + MSI_ADDR_IR_INDEX2(ir_index); + } else +#endif + { + msg->address_hi = MSI_ADDR_BASE_HI; + msg->address_lo = + MSI_ADDR_BASE_LO | + ((INT_DEST_MODE == 0) ? + MSI_ADDR_DEST_MODE_PHYSICAL: + MSI_ADDR_DEST_MODE_LOGICAL) | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_ADDR_REDIRECTION_CPU: + MSI_ADDR_REDIRECTION_LOWPRI) | + MSI_ADDR_DEST_ID(dest); + + msg->data = + MSI_DATA_TRIGGER_EDGE | + MSI_DATA_LEVEL_ASSERT | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + MSI_DATA_DELIVERY_FIXED: + MSI_DATA_DELIVERY_LOWPRI) | + MSI_DATA_VECTOR(cfg->vector); + } + return err; +} + +#ifdef CONFIG_SMP +static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + read_msi_msg(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + write_msi_msg(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} + +#ifdef CONFIG_INTR_REMAP +/* + * Migrate the MSI irq to another cpumask. This migration is + * done in the process context using interrupt-remapping hardware. + */ +static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp, cleanup_mask; + struct irte irte; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (get_irte(irq, &irte)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + irte.vector = cfg->vector; + irte.dest_id = IRTE_DEST(dest); + + /* + * atomically update the IRTE with the new destination and vector. + */ + modify_irte(irq, &irte); + + /* + * After this point, all the interrupts will start arriving + * at the new destination. So, time to cleanup the previous + * vector allocation. + */ + if (cfg->move_in_progress) { + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); + cfg->move_cleanup_count = cpus_weight(cleanup_mask); + send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); + cfg->move_in_progress = 0; + } + + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif +#endif /* CONFIG_SMP */ + +/* + * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, + * which implement the MSI or MSI-X Capability Structure. + */ +static struct irq_chip msi_chip = { + .name = "PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +#ifdef CONFIG_INTR_REMAP +static struct irq_chip msi_ir_chip = { + .name = "IR-PCI-MSI", + .unmask = unmask_msi_irq, + .mask = mask_msi_irq, + .ack = ack_x2apic_edge, +#ifdef CONFIG_SMP + .set_affinity = ir_set_msi_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +/* + * Map the PCI dev to the corresponding remapping hardware unit + * and allocate 'nvec' consecutive interrupt-remapping table entries + * in it. + */ +static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) +{ + struct intel_iommu *iommu; + int index; + + iommu = map_dev_to_ir(dev); + if (!iommu) { + printk(KERN_ERR + "Unable to map PCI %s to iommu\n", pci_name(dev)); + return -ENOENT; + } + + index = alloc_irte(iommu, irq, nvec); + if (index < 0) { + printk(KERN_ERR + "Unable to allocate %d IRTE for PCI %s\n", nvec, + pci_name(dev)); + return -ENOSPC; + } + return index; +} +#endif + +static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) + return ret; + + set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + +#ifdef CONFIG_INTR_REMAP + if (irq_remapped(irq)) { + struct irq_desc *desc = irq_to_desc(irq); + /* + * irq migration in process context + */ + desc->status |= IRQ_MOVE_PCNTXT; + set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); + } else +#endif + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + + return 0; +} + +static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) +{ + unsigned int irq; + + irq = dev->bus->number; + irq <<= 8; + irq |= dev->devfn; + irq <<= 12; + + return irq; +} + +int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) +{ + unsigned int irq; + int ret; + unsigned int irq_want; + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + + irq = create_irq_nr(irq_want); + if (irq == 0) + return -1; + +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + ret = msi_alloc_irte(dev, irq, 1); + if (ret < 0) + goto error; +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + return 0; + +#ifdef CONFIG_INTR_REMAP +error: + destroy_irq(irq); + return ret; +#endif +} + +int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) +{ + unsigned int irq; + int ret, sub_handle; + struct msi_desc *desc; + unsigned int irq_want; + +#ifdef CONFIG_INTR_REMAP + struct intel_iommu *iommu = 0; + int index = 0; +#endif + + irq_want = build_irq_for_pci_dev(dev) + 0x100; + sub_handle = 0; + list_for_each_entry(desc, &dev->msi_list, list) { + irq = create_irq_nr(irq_want--); + if (irq == 0) + return -1; +#ifdef CONFIG_INTR_REMAP + if (!intr_remapping_enabled) + goto no_ir; + + if (!sub_handle) { + /* + * allocate the consecutive block of IRTE's + * for 'nvec' + */ + index = msi_alloc_irte(dev, irq, nvec); + if (index < 0) { + ret = index; + goto error; + } + } else { + iommu = map_dev_to_ir(dev); + if (!iommu) { + ret = -ENOENT; + goto error; + } + /* + * setup the mapping between the irq and the IRTE + * base index, the sub_handle pointing to the + * appropriate interrupt remap table entry. + */ + set_irte_irq(irq, iommu, index, sub_handle); + } +no_ir: +#endif + ret = setup_msi_irq(dev, desc, irq); + if (ret < 0) + goto error; + sub_handle++; + } + return 0; + +error: + destroy_irq(irq); + return ret; +} + +void arch_teardown_msi_irq(unsigned int irq) +{ + destroy_irq(irq); +} + +#ifdef CONFIG_DMAR +#ifdef CONFIG_SMP +static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + dmar_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + dmar_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip dmar_msi_type = { + .name = "DMAR_MSI", + .unmask = dmar_msi_unmask, + .mask = dmar_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = dmar_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_dmar_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + dmar_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + +#endif /* CONFIG_PCI_MSI */ +/* + * Hypertransport interrupt support + */ +#ifdef CONFIG_HT_IRQ + +#ifdef CONFIG_SMP + +static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) +{ + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); + + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); + + write_ht_irq_msg(irq, &msg); +} + +static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + unsigned int dest; + cpumask_t tmp; + struct irq_desc *desc; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + target_ht_irq(irq, dest, cfg->vector); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif + +static struct irq_chip ht_irq_chip = { + .name = "PCI-HT", + .mask = mask_ht_irq, + .unmask = unmask_ht_irq, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = set_ht_irq_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) +{ + struct irq_cfg *cfg; + int err; + cpumask_t tmp; + + tmp = TARGET_CPUS; + err = assign_irq_vector(irq, tmp); + if (!err) { + struct ht_irq_msg msg; + unsigned dest; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, tmp); + dest = cpu_mask_to_apicid(tmp); + + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); + + msg.address_lo = + HT_IRQ_LOW_BASE | + HT_IRQ_LOW_DEST_ID(dest) | + HT_IRQ_LOW_VECTOR(cfg->vector) | + ((INT_DEST_MODE == 0) ? + HT_IRQ_LOW_DM_PHYSICAL : + HT_IRQ_LOW_DM_LOGICAL) | + HT_IRQ_LOW_RQEOI_EDGE | + ((INT_DELIVERY_MODE != dest_LowestPrio) ? + HT_IRQ_LOW_MT_FIXED : + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; + + write_ht_irq_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &ht_irq_chip, + handle_edge_irq, "edge"); + } + return err; +} +#endif /* CONFIG_HT_IRQ */ + +/* -------------------------------------------------------------------------- + ACPI-based IOAPIC Configuration + -------------------------------------------------------------------------- */ + +#ifdef CONFIG_ACPI + +#ifdef CONFIG_X86_32 +int __init io_apic_get_unique_id(int ioapic, int apic_id) +{ + union IO_APIC_reg_00 reg_00; + static physid_mask_t apic_id_map = PHYSID_MASK_NONE; + physid_mask_t tmp; + unsigned long flags; + int i = 0; + + /* + * The P4 platform supports up to 256 APIC IDs on two separate APIC + * buses (one for LAPICs, one for IOAPICs), where predecessors only + * supports up to 16 on one shared APIC bus. + * + * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full + * advantage of new APIC bus architecture. + */ + + if (physids_empty(apic_id_map)) + apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); + + spin_lock_irqsave(&ioapic_lock, flags); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + if (apic_id >= get_physical_broadcast()) { + printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " + "%d\n", ioapic, apic_id, reg_00.bits.ID); + apic_id = reg_00.bits.ID; + } + + /* + * Every APIC in a system must have a unique ID or we get lots of nice + * 'stuck on smp_invalidate_needed IPI wait' messages. + */ + if (check_apicid_used(apic_id_map, apic_id)) { + + for (i = 0; i < get_physical_broadcast(); i++) { + if (!check_apicid_used(apic_id_map, i)) + break; + } + + if (i == get_physical_broadcast()) + panic("Max apic_id exceeded!\n"); + + printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " + "trying %d\n", ioapic, apic_id, i); + + apic_id = i; + } + + tmp = apicid_to_cpu_present(apic_id); + physids_or(apic_id_map, apic_id_map, tmp); + + if (reg_00.bits.ID != apic_id) { + reg_00.bits.ID = apic_id; + + spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(ioapic, 0, reg_00.raw); + reg_00.raw = io_apic_read(ioapic, 0); + spin_unlock_irqrestore(&ioapic_lock, flags); + + /* Sanity check */ + if (reg_00.bits.ID != apic_id) { + printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); + return -1; + } + } + + apic_printk(APIC_VERBOSE, KERN_INFO + "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); + + return apic_id; +} + +int __init io_apic_get_version(int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.version; +} +#endif + +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + + +int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) +{ + if (!IO_APIC_IRQ(irq)) { + apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", + ioapic); + return -EINVAL; + } + + /* + * IRQs < 16 are already in the irq_2_pin[] map + */ + if (irq >= 16) + add_pin_to_irq(irq, ioapic, pin); + + setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); + + return 0; +} + + +int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +{ + int i; + + if (skip_ioapic_setup) + return -1; + + for (i = 0; i < mp_irq_entries; i++) + if (mp_irqs[i].mp_irqtype == mp_INT && + mp_irqs[i].mp_srcbusirq == bus_irq) + break; + if (i >= mp_irq_entries) + return -1; + + *trigger = irq_trigger(i); + *polarity = irq_polarity(i); + return 0; +} + +#endif /* CONFIG_ACPI */ + +/* + * This function currently is only a helper for the i386 smp boot process where + * we need to reprogram the ioredtbls to cater for the cpus which have come online + * so mask in all cases should simply be TARGET_CPUS + */ +#ifdef CONFIG_SMP +void __init setup_ioapic_dest(void) +{ + int pin, ioapic, irq, irq_entry; + struct irq_cfg *cfg; + + if (skip_ioapic_setup == 1) + return; + + for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { + for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + irq_entry = find_irq_entry(ioapic, pin, mp_INT); + if (irq_entry == -1) + continue; + irq = pin_2_irq(irq_entry, ioapic, pin); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + cfg = irq_cfg(irq); + if (!cfg->vector) + setup_IO_APIC_irq(ioapic, pin, irq, + irq_trigger(irq_entry), + irq_polarity(irq_entry)); +#ifdef CONFIG_INTR_REMAP + else if (intr_remapping_enabled) + set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); +#endif + else + set_ioapic_affinity_irq(irq, TARGET_CPUS); + } + + } +} +#endif + +#ifdef CONFIG_X86_64 +#define IOAPIC_RESOURCE_NAME_SIZE 11 + +static struct resource *ioapic_resources; + +static struct resource * __init ioapic_setup_resources(void) +{ + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} +#endif + +void __init ioapic_init_mappings(void) +{ + unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; + int i; +#ifdef CONFIG_X86_64 + struct resource *ioapic_res; + + ioapic_res = ioapic_setup_resources(); +#endif + for (i = 0; i < nr_ioapics; i++) { + if (smp_found_config) { + ioapic_phys = mp_ioapics[i].mp_apicaddr; +#ifdef CONFIG_X86_32 + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } +#endif + } else { +#ifdef CONFIG_X86_32 +fake_ioapic_page: +#endif + ioapic_phys = (unsigned long) + alloc_bootmem_pages(PAGE_SIZE); + ioapic_phys = __pa(ioapic_phys); + } + set_fixmap_nocache(idx, ioapic_phys); + apic_printk(APIC_VERBOSE, + "mapped IOAPIC to %08lx (%08lx)\n", + __fix_to_virt(idx), ioapic_phys); + idx++; + +#ifdef CONFIG_X86_64 + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } +#endif + } +} + +#ifdef CONFIG_X86_64 +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk(KERN_ERR + "IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif diff --git a/arch/x86/kernel/io_apic_32.c b/arch/x86/kernel/io_apic_32.c deleted file mode 100644 index fba6d6ee3480..000000000000 --- a/arch/x86/kernel/io_apic_32.c +++ /dev/null @@ -1,3913 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* time_after() */ -#ifdef CONFIG_ACPI -#include -#endif -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define __apicdebuginit(type) static type __init - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -int skip_ioapic_setup; - -static int __init parse_noapic(char *str) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -struct irq_cfg; -struct irq_pin_list; -struct irq_cfg { - unsigned int irq; - struct irq_cfg *next; - struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; -static struct irq_cfg *irq_cfgx_free; -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -} - -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - int i; - int count = 0; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif - return cfg; -} - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; - -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; -static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_pin_list *pin; - int i; - - pin = *da->name; - - for (i = 1; i < *da->nr; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = &pin[0]; -} -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); - -static struct irq_pin_list *get_one_free_irq_2_pin(void) -{ - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); - - if (!pin) - panic("can not get more irq_2_pin\n"); - - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = pin->next; - pin->next = NULL; - - return pin; -} - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -#ifdef CONFIG_X86_64 -static bool io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - int pin; - - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - if (!entry->next) - break; - entry = entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} -#endif - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; -#ifdef CONFIG_INTR_REMAP - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int assign_irq_vector(int irq, cpumask_t mask); - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif /* CONFIG_SMP */ - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(); - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); - return; - } - - while (entry->next) { - /* not again, please */ - if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; - } - - entry->next = get_one_free_irq_2_pin(); - entry = entry->next; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_cfg *cfg = irq_cfg(irq); - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; - - while (entry) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - replaced = 1; - /* every one is different, right? */ - break; - } - entry = entry->next; - } - - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq(irq, newapic, newpin); -} - -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) - -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -#endif - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ - -#ifdef CONFIG_X86_32 -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* - * Saves and masks all the unmasked IO-APIC RTE's - */ -int save_mask_IO_APIC_setup(void) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - int apic, pin; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - - for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - return -ENOMEM; - } - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = - ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } - return 0; -} - -void restore_IO_APIC_setup(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); -} - -void reinit_intr_remapped_IO_APIC(int intr_remapping) -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(); -} -#endif - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -int (*ioapic_renumber_irq)(int ioapic, int irq); -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - -#ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } -#endif - - return irq; -} - -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - spin_unlock(&vector_lock); -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } - - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; - - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) - goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - cfg = irq_cfg(irq); - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cpus_clear(cfg->domain); -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - struct irq_cfg *cfg; - - /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { - if (!cpu_isset(cpu, cfg->domain)) - continue; - vector = cfg->vector; - irq = cfg->irq; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - - cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - -static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip; -#endif - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - struct irq_desc *desc; - - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; - else - desc->status &= ~IRQ_LEVEL; - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; - } -#endif - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); -} - -static int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector) -{ - /* - * add it to the IO-APIC irq-routing table: - */ - memset(entry,0,sizeof(*entry)); - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic); - struct irte irte; - struct IR_IO_APIC_route_entry *ir_entry = - (struct IR_IO_APIC_route_entry *) entry; - int index; - - if (!iommu) - panic("No mapping iommu for ioapic %d\n", apic); - - index = alloc_irte(iommu, irq, 1); - if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic); - - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = trigger; - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - - modify_irte(irq, &irte); - - ir_entry->index2 = (index >> 15) & 0x1; - ir_entry->zero = 0; - ir_entry->format = 1; - ir_entry->index = (index & 0x7fff); - } else -#endif - { - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; - } - - entry->mask = 0; /* enable IRQ */ - entry->trigger = trigger; - entry->polarity = polarity; - entry->vector = vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry->mask = 1; - return 0; -} - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - cfg = irq_cfg(irq); - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(mask, cfg->domain, mask); - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, - irq, trigger, polarity); - - - if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { - printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); - return; - } - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - irq = pin_2_irq(idx, apic, pin); -#ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; -#endif - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - return; -#endif - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - - -__apicdebuginit(void) print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - struct irq_cfg *cfg; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; - if (!entry) - continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -__apicdebuginit(void) print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -__apicdebuginit(void) print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -__apicdebuginit(int) print_all_ICs(void) -{ - print_PIC(); - print_all_local_APICs(); - print_IO_APIC(); - - return 0; -} - -fs_initcall(print_all_ICs); - - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int apic; - unsigned long flags; - -#ifdef CONFIG_X86_32 - int i; - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; -#endif - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -#ifdef CONFIG_X86_32 -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) - return; - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} -#endif - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -#ifdef CONFIG_X86_64 -static int ioapic_retrigger_irq(unsigned int irq) -{ - - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP - -#ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. - * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. - */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; - struct irte irte; - int modify_ioapic_rte; - unsigned int dest; - unsigned long flags; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - desc = irq_to_desc(irq); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Modified the IRTE and flushes the Interrupt entry cache. - */ - modify_irte(irq, &irte); - - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; -} - -static int migrate_irq_remapped_level(int irq) -{ - int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); - - mask_IO_APIC_irq(irq); - - if (io_apic_level_ack_pending(irq)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq(irq); - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - -/* - * Migrates the IRQ destination in the process context. - */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); - return; - } - - migrate_ioapic_irq(irq, mask); -} -#endif - -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - - desc = irq_to_desc(irq); - if (!desc) - continue; - - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif -#ifdef CONFIG_INTR_REMAP -static void ack_x2apic_level(unsigned int irq) -{ - ack_x2APIC_irq(); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_X86_64 -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} -#endif - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif - -static inline void init_IO_APIC_traps(void) -{ - int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; - if (IO_APIC_IRQ(irq) && !cfg->vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); - /* Strange. Oh, well.. */ - desc->chip = &no_irq_chip; - } - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -static int disable_timer_pin_1 __initdata; -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", disable_timer_pin_setup); - -int timer_through_8259 __initdata; - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. - */ -static inline void __init check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg(0); - int apic1, pin1, apic2, pin2; - unsigned long flags; - unsigned int ver; - int no_pin1 = 0; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); -#ifdef CONFIG_X86_32 - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); -#endif - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("BIOS bug: timer not connected to IO-APIC"); -#endif - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0); - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - -#ifdef CONFIG_X86_32 - enable_IO_APIC(); -#else - /* - * calling enable_IO_APIC() is moved to setup_local_APIC for BP - */ -#endif - - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up IO-APIC IRQ routing. - */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -unsigned int create_irq_nr(unsigned int irq_want) -{ - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; - unsigned long flags; - struct irq_cfg *cfg_new; - -#ifndef CONFIG_HAVE_SPARSE_IRQ - irq_want = nr_irqs - 1; -#endif - - irq = 0; - spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { - if (platform_legacy_irq(new)) - continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) - continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq > 0) { - dynamic_irq_init(irq); - } - return irq; -} - -int create_irq(void) -{ - int irq; - - irq = create_irq_nr(nr_irqs - 1); - - if (irq == 0) - irq = -1; - - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - -#ifdef CONFIG_INTR_REMAP - free_irte(irq); -#endif - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (err) - return err; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} - -#ifdef CONFIG_INTR_REMAP -/* - * Migrate the MSI irq to another cpumask. This migration is - * done in the process context using interrupt-remapping hardware. - */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp, cleanup_mask; - struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * atomically update the IRTE with the new destination and vector. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_x2apic_edge, -#ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} -#endif - -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) - return ret; - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else -#endif - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - unsigned int irq; - int ret; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - - irq = create_irq_nr(irq_want); - if (irq == 0) - return -1; - -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - ret = msi_alloc_irte(dev, irq, 1); - if (ret < 0) - goto error; -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - return 0; - -#ifdef CONFIG_INTR_REMAP -error: - destroy_irq(irq); - return ret; -#endif -} - -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP - struct intel_iommu *iommu = 0; - int index = 0; -#endif - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - if (!sub_handle) { - /* - * allocate the consecutive block of IRTE's - * for 'nvec' - */ - index = msi_alloc_irte(dev, irq, nvec); - if (index < 0) { - ret = index; - goto error; - } - } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; - goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); - } -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; - -error: - destroy_irq(irq); - return ret; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#ifdef CONFIG_DMAR -#ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif /* CONFIG_SMP */ - -struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} -#endif - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - struct irq_cfg *cfg; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - cfg = irq_cfg(irq); - if (!cfg->vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); -#ifdef CONFIG_INTR_REMAP - else if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); -#endif - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#ifdef CONFIG_X86_64 -#define IOAPIC_RESOURCE_NAME_SIZE 11 - -static struct resource *ioapic_resources; - -static struct resource * __init ioapic_setup_resources(void) -{ - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} -#endif - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; -#ifdef CONFIG_X86_64 - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); -#endif - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; -#ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } -#endif - } else { -#ifdef CONFIG_X86_32 -fake_ioapic_page: -#endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - -#ifdef CONFIG_X86_64 - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } -#endif - } -} - -#ifdef CONFIG_X86_64 -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif diff --git a/arch/x86/kernel/io_apic_64.c b/arch/x86/kernel/io_apic_64.c deleted file mode 100644 index 940c4167b325..000000000000 --- a/arch/x86/kernel/io_apic_64.c +++ /dev/null @@ -1,3913 +0,0 @@ -/* - * Intel IO-APIC support for multi-Pentium hosts. - * - * Copyright (C) 1997, 1998, 1999, 2000 Ingo Molnar, Hajnalka Szabo - * - * Many thanks to Stig Venaas for trying out countless experimental - * patches and reporting/debugging problems patiently! - * - * (c) 1999, Multiple IO-APIC support, developed by - * Ken-ichi Yaku and - * Hidemi Kishimoto , - * further tested and cleaned up by Zach Brown - * and Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively - * Paul Diefenbaugh : Added full ACPI support - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* time_after() */ -#ifdef CONFIG_ACPI -#include -#endif -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#define __apicdebuginit(type) static type __init - -/* - * Is the SiS APIC rmw bug present ? - * -1 = don't know, 0 = no, 1 = yes - */ -int sis_apic_bug = -1; - -static DEFINE_SPINLOCK(ioapic_lock); -static DEFINE_SPINLOCK(vector_lock); - -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; - -/* I/O APIC entries */ -struct mp_config_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; - -/* MP IRQ source entries */ -struct mp_config_intsrc mp_irqs[MAX_IRQ_SOURCES]; - -/* # of MP IRQ source entries */ -int mp_irq_entries; - -#if defined (CONFIG_MCA) || defined (CONFIG_EISA) -int mp_bus_id_to_type[MAX_MP_BUSSES]; -#endif - -DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES); - -int skip_ioapic_setup; - -static int __init parse_noapic(char *str) -{ - /* disable IO-APIC */ - disable_ioapic_setup(); - return 0; -} -early_param("noapic", parse_noapic); - -struct irq_cfg; -struct irq_pin_list; -struct irq_cfg { - unsigned int irq; - struct irq_cfg *next; - struct irq_pin_list *irq_2_pin; - cpumask_t domain; - cpumask_t old_domain; - unsigned move_cleanup_count; - u8 vector; - u8 move_in_progress : 1; -}; - -/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { - [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, - [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, - [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, - [3] = { .irq = 3, .domain = CPU_MASK_ALL, .vector = IRQ3_VECTOR, }, - [4] = { .irq = 4, .domain = CPU_MASK_ALL, .vector = IRQ4_VECTOR, }, - [5] = { .irq = 5, .domain = CPU_MASK_ALL, .vector = IRQ5_VECTOR, }, - [6] = { .irq = 6, .domain = CPU_MASK_ALL, .vector = IRQ6_VECTOR, }, - [7] = { .irq = 7, .domain = CPU_MASK_ALL, .vector = IRQ7_VECTOR, }, - [8] = { .irq = 8, .domain = CPU_MASK_ALL, .vector = IRQ8_VECTOR, }, - [9] = { .irq = 9, .domain = CPU_MASK_ALL, .vector = IRQ9_VECTOR, }, - [10] = { .irq = 10, .domain = CPU_MASK_ALL, .vector = IRQ10_VECTOR, }, - [11] = { .irq = 11, .domain = CPU_MASK_ALL, .vector = IRQ11_VECTOR, }, - [12] = { .irq = 12, .domain = CPU_MASK_ALL, .vector = IRQ12_VECTOR, }, - [13] = { .irq = 13, .domain = CPU_MASK_ALL, .vector = IRQ13_VECTOR, }, - [14] = { .irq = 14, .domain = CPU_MASK_ALL, .vector = IRQ14_VECTOR, }, - [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, -}; - -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; -static struct irq_cfg *irq_cfgx_free; -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -} - -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - int i; - int count = 0; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif - return cfg; -} - -/* - * This is performance-critical, we want to do it O(1) - * - * the indexing order of this array favors 1:1 mappings - * between pins and IRQs. - */ - -struct irq_pin_list { - int apic, pin; - struct irq_pin_list *next; -}; - -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; -static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_pin_list *pin; - int i; - - pin = *da->name; - - for (i = 1; i < *da->nr; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = &pin[0]; -} -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); - -static struct irq_pin_list *get_one_free_irq_2_pin(void) -{ - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); - - if (!pin) - panic("can not get more irq_2_pin\n"); - - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - - irq_2_pin_ptr = pin->next; - pin->next = NULL; - - return pin; -} - -struct io_apic { - unsigned int index; - unsigned int unused[3]; - unsigned int data; -}; - -static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) -{ - return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].mp_apicaddr & ~PAGE_MASK); -} - -static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - return readl(&io_apic->data); -} - -static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -/* - * Re-write a value: to be used for read-modify-write - * cycles where the read already set up the index register. - * - * Older SiS APIC requires we rewrite the index register - */ -static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); - writel(value, &io_apic->data); -} - -#ifdef CONFIG_X86_64 -static bool io_apic_level_ack_pending(unsigned int irq) -{ - struct irq_pin_list *entry; - unsigned long flags; - struct irq_cfg *cfg = irq_cfg(irq); - - spin_lock_irqsave(&ioapic_lock, flags); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - int pin; - - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - /* Is the remote IRR bit set? */ - if (reg & IO_APIC_REDIR_REMOTE_IRR) { - spin_unlock_irqrestore(&ioapic_lock, flags); - return true; - } - if (!entry->next) - break; - entry = entry->next; - } - spin_unlock_irqrestore(&ioapic_lock, flags); - - return false; -} -#endif - -union entry_union { - struct { u32 w1, w2; }; - struct IO_APIC_route_entry entry; -}; - -static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) -{ - union entry_union eu; - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); - eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); - spin_unlock_irqrestore(&ioapic_lock, flags); - return eu.entry; -} - -/* - * When we write a new IO APIC routing entry, we need to write the high - * word first! If the mask bit in the low word is clear, we will enable - * the interrupt, and we need to make sure the entry is fully populated - * before that happens. - */ -static void -__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - union entry_union eu; - eu.entry = e; - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); -} - -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) -{ - unsigned long flags; - spin_lock_irqsave(&ioapic_lock, flags); - __ioapic_write_entry(apic, pin, e); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -/* - * When we mask an IO APIC routing entry, we need to write the low - * word first, in order to set the mask bit before we change the - * high bits! - */ -static void ioapic_mask_entry(int apic, int pin) -{ - unsigned long flags; - union entry_union eu = { .entry.mask = 1 }; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(apic, 0x10 + 2*pin, eu.w1); - io_apic_write(apic, 0x11 + 2*pin, eu.w2); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -#ifdef CONFIG_SMP -static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - int apic, pin; - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - unsigned int reg; - - if (!entry) - break; - - apic = entry->apic; - pin = entry->pin; -#ifdef CONFIG_INTR_REMAP - /* - * With interrupt-remapping, destination information comes - * from interrupt-remapping table entry. - */ - if (!irq_remapped(irq)) - io_apic_write(apic, 0x11 + pin*2, dest); -#else - io_apic_write(apic, 0x11 + pin*2, dest); -#endif - reg = io_apic_read(apic, 0x10 + pin*2); - reg &= ~IO_APIC_REDIR_VECTOR_MASK; - reg |= vector; - io_apic_modify(apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -static int assign_irq_vector(int irq, cpumask_t mask); - -static void set_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned long flags; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - cfg = irq_cfg(irq); - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - /* - * Only the high 8 bits are valid. - */ - dest = SET_APIC_LOGICAL_ID(dest); - - desc = irq_to_desc(irq); - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - desc->affinity = mask; - spin_unlock_irqrestore(&ioapic_lock, flags); -} -#endif /* CONFIG_SMP */ - -/* - * The common case is 1:1 IRQ<->pin mappings. Sometimes there are - * shared ISA-space IRQs, so we have to support them. We are super - * fast in the common case, and fast for shared ISA-space IRQs. - */ -static void add_pin_to_irq(unsigned int irq, int apic, int pin) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - - /* first time to refer irq_cfg, so with new */ - cfg = irq_cfg_alloc(irq); - entry = cfg->irq_2_pin; - if (!entry) { - entry = get_one_free_irq_2_pin(); - cfg->irq_2_pin = entry; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); - return; - } - - while (entry->next) { - /* not again, please */ - if (entry->apic == apic && entry->pin == pin) - return; - - entry = entry->next; - } - - entry->next = get_one_free_irq_2_pin(); - entry = entry->next; - entry->apic = apic; - entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); -} - -/* - * Reroute an IRQ to a different pin. - */ -static void __init replace_pin_at_irq(unsigned int irq, - int oldapic, int oldpin, - int newapic, int newpin) -{ - struct irq_cfg *cfg = irq_cfg(irq); - struct irq_pin_list *entry = cfg->irq_2_pin; - int replaced = 0; - - while (entry) { - if (entry->apic == oldapic && entry->pin == oldpin) { - entry->apic = newapic; - entry->pin = newpin; - replaced = 1; - /* every one is different, right? */ - break; - } - entry = entry->next; - } - - /* why? call replace before add? */ - if (!replaced) - add_pin_to_irq(irq, newapic, newpin); -} - -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R,ACTION, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) - -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) -{ - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } -} - -/* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} - -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} - -/* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} - -/* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} - -#endif - -static void mask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __mask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void unmask_IO_APIC_irq (unsigned int irq) -{ - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); -} - -static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) -{ - struct IO_APIC_route_entry entry; - - /* Check delivery_mode to be sure we're not clearing an SMI pin */ - entry = ioapic_read_entry(apic, pin); - if (entry.delivery_mode == dest_SMI) - return; - /* - * Disable it in the IO-APIC irq-routing table: - */ - ioapic_mask_entry(apic, pin); -} - -static void clear_IO_APIC (void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - clear_IO_APIC_pin(apic, pin); -} - -#if !defined(CONFIG_SMP) && defined(CONFIG_X86_32) -void send_IPI_self(int vector) -{ - unsigned int cfg; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - cfg = APIC_DM_FIXED | APIC_DEST_SELF | vector | APIC_DEST_LOGICAL; - /* - * Send the IPI. The write to APIC_ICR fires this off. - */ - apic_write(APIC_ICR, cfg); -} -#endif /* !CONFIG_SMP && CONFIG_X86_32*/ - -#ifdef CONFIG_X86_32 -/* - * support for broken MP BIOSs, enables hand-redirection of PIRQ0-7 to - * specific CPU-side IRQs. - */ - -#define MAX_PIRQS 8 -static int pirq_entries [MAX_PIRQS]; -static int pirqs_enabled; - -static int __init ioapic_pirq_setup(char *str) -{ - int i, max; - int ints[MAX_PIRQS+1]; - - get_options(str, ARRAY_SIZE(ints), ints); - - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; - - pirqs_enabled = 1; - apic_printk(APIC_VERBOSE, KERN_INFO - "PIRQ redirection, working around broken MP-BIOS.\n"); - max = MAX_PIRQS; - if (ints[0] < MAX_PIRQS) - max = ints[0]; - - for (i = 0; i < max; i++) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "... PIRQ%d -> IRQ %d\n", i, ints[i+1]); - /* - * PIRQs are mapped upside down, usually. - */ - pirq_entries[MAX_PIRQS-i-1] = ints[i+1]; - } - return 1; -} - -__setup("pirq=", ioapic_pirq_setup); -#endif /* CONFIG_X86_32 */ - -#ifdef CONFIG_INTR_REMAP -/* I/O APIC RTE contents at the OS boot up */ -static struct IO_APIC_route_entry *early_ioapic_entries[MAX_IO_APICS]; - -/* - * Saves and masks all the unmasked IO-APIC RTE's - */ -int save_mask_IO_APIC_setup(void) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - int apic, pin; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - - for (apic = 0; apic < nr_ioapics; apic++) { - early_ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!early_ioapic_entries[apic]) - return -ENOMEM; - } - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - - entry = early_ioapic_entries[apic][pin] = - ioapic_read_entry(apic, pin); - if (!entry.mask) { - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); - } - } - return 0; -} - -void restore_IO_APIC_setup(void) -{ - int apic, pin; - - for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_write_entry(apic, pin, - early_ioapic_entries[apic][pin]); -} - -void reinit_intr_remapped_IO_APIC(int intr_remapping) -{ - /* - * for now plain restore of previous settings. - * TBD: In the case of OS enabling interrupt-remapping, - * IO-APIC RTE's need to be setup to point to interrupt-remapping - * table entries. for now, do a plain restore, and wait for - * the setup_IO_APIC_irqs() to do proper initialization. - */ - restore_IO_APIC_setup(); -} -#endif - -/* - * Find the IRQ entry number of a certain pin. - */ -static int find_irq_entry(int apic, int pin, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == type && - (mp_irqs[i].mp_dstapic == mp_ioapics[apic].mp_apicid || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) && - mp_irqs[i].mp_dstirq == pin) - return i; - - return -1; -} - -/* - * Find the pin to which IRQ[irq] (ISA) is connected - */ -static int __init find_isa_irq_pin(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - - return mp_irqs[i].mp_dstirq; - } - return -1; -} - -static int __init find_isa_irq_apic(int irq, int type) -{ - int i; - - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - if (test_bit(lbus, mp_bus_not_pci) && - (mp_irqs[i].mp_irqtype == type) && - (mp_irqs[i].mp_srcbusirq == irq)) - break; - } - if (i < mp_irq_entries) { - int apic; - for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic) - return apic; - } - } - - return -1; -} - -/* - * Find a specific PCI IRQ entry. - * Not an __init, possibly needed by modules - */ -static int pin_2_irq(int idx, int apic, int pin); - -int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin) -{ - int apic, i, best_guess = -1; - - apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n", - bus, slot, pin); - if (test_bit(bus, mp_bus_not_pci)) { - apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus); - return -1; - } - for (i = 0; i < mp_irq_entries; i++) { - int lbus = mp_irqs[i].mp_srcbus; - - for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].mp_apicid == mp_irqs[i].mp_dstapic || - mp_irqs[i].mp_dstapic == MP_APIC_ALL) - break; - - if (!test_bit(lbus, mp_bus_not_pci) && - !mp_irqs[i].mp_irqtype && - (bus == lbus) && - (slot == ((mp_irqs[i].mp_srcbusirq >> 2) & 0x1f))) { - int irq = pin_2_irq(i,apic,mp_irqs[i].mp_dstirq); - - if (!(apic || IO_APIC_IRQ(irq))) - continue; - - if (pin == (mp_irqs[i].mp_srcbusirq & 3)) - return irq; - /* - * Use the first all-but-pin matching entry as a - * best-guess fuzzy result for broken mptables. - */ - if (best_guess < 0) - best_guess = irq; - } - } - return best_guess; -} - -EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector); - -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) -/* - * EISA Edge/Level control register, ELCR - */ -static int EISA_ELCR(unsigned int irq) -{ - if (irq < 16) { - unsigned int port = 0x4d0 + (irq >> 3); - return (inb(port) >> (irq & 7)) & 1; - } - apic_printk(APIC_VERBOSE, KERN_INFO - "Broken MPtable reports ISA irq %d\n", irq); - return 0; -} - -#endif - -/* ISA interrupts are always polarity zero edge triggered, - * when listed as conforming in the MP table. */ - -#define default_ISA_trigger(idx) (0) -#define default_ISA_polarity(idx) (0) - -/* EISA interrupts are always polarity zero and can be edge or level - * trigger depending on the ELCR value. If an interrupt is listed as - * EISA conforming in the MP table, that means its trigger type must - * be read in from the ELCR */ - -#define default_EISA_trigger(idx) (EISA_ELCR(mp_irqs[idx].mp_srcbusirq)) -#define default_EISA_polarity(idx) default_ISA_polarity(idx) - -/* PCI interrupts are always polarity one level triggered, - * when listed as conforming in the MP table. */ - -#define default_PCI_trigger(idx) (1) -#define default_PCI_polarity(idx) (1) - -/* MCA interrupts are always polarity zero level triggered, - * when listed as conforming in the MP table. */ - -#define default_MCA_trigger(idx) (1) -#define default_MCA_polarity(idx) default_ISA_polarity(idx) - -static int MPBIOS_polarity(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int polarity; - - /* - * Determine IRQ line polarity (high active or low active): - */ - switch (mp_irqs[idx].mp_irqflag & 3) - { - case 0: /* conforms, ie. bus-type dependent polarity */ - if (test_bit(bus, mp_bus_not_pci)) - polarity = default_ISA_polarity(idx); - else - polarity = default_PCI_polarity(idx); - break; - case 1: /* high active */ - { - polarity = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - case 3: /* low active */ - { - polarity = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - polarity = 1; - break; - } - } - return polarity; -} - -static int MPBIOS_trigger(int idx) -{ - int bus = mp_irqs[idx].mp_srcbus; - int trigger; - - /* - * Determine IRQ trigger mode (edge or level sensitive): - */ - switch ((mp_irqs[idx].mp_irqflag>>2) & 3) - { - case 0: /* conforms, ie. bus-type dependent */ - if (test_bit(bus, mp_bus_not_pci)) - trigger = default_ISA_trigger(idx); - else - trigger = default_PCI_trigger(idx); -#if defined(CONFIG_EISA) || defined(CONFIG_MCA) - switch (mp_bus_id_to_type[bus]) { - case MP_BUS_ISA: /* ISA pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_EISA: /* EISA pin */ - { - trigger = default_EISA_trigger(idx); - break; - } - case MP_BUS_PCI: /* PCI pin */ - { - /* set before the switch */ - break; - } - case MP_BUS_MCA: /* MCA pin */ - { - trigger = default_MCA_trigger(idx); - break; - } - default: - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - } -#endif - break; - case 1: /* edge */ - { - trigger = 0; - break; - } - case 2: /* reserved */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 1; - break; - } - case 3: /* level */ - { - trigger = 1; - break; - } - default: /* invalid */ - { - printk(KERN_WARNING "broken BIOS!!\n"); - trigger = 0; - break; - } - } - return trigger; -} - -static inline int irq_polarity(int idx) -{ - return MPBIOS_polarity(idx); -} - -static inline int irq_trigger(int idx) -{ - return MPBIOS_trigger(idx); -} - -int (*ioapic_renumber_irq)(int ioapic, int irq); -static int pin_2_irq(int idx, int apic, int pin) -{ - int irq, i; - int bus = mp_irqs[idx].mp_srcbus; - - /* - * Debugging check, we are in big trouble if this message pops up! - */ - if (mp_irqs[idx].mp_dstirq != pin) - printk(KERN_ERR "broken BIOS or MPTABLE parser, ayiee!!\n"); - - if (test_bit(bus, mp_bus_not_pci)) { - irq = mp_irqs[idx].mp_srcbusirq; - } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); - } - -#ifdef CONFIG_X86_32 - /* - * PCI IRQ command line redirection. Yes, limits are hardcoded. - */ - if ((pin >= 16) && (pin <= 23)) { - if (pirq_entries[pin-16] != -1) { - if (!pirq_entries[pin-16]) { - apic_printk(APIC_VERBOSE, KERN_DEBUG - "disabling PIRQ%d\n", pin-16); - } else { - irq = pirq_entries[pin-16]; - apic_printk(APIC_VERBOSE, KERN_DEBUG - "using PIRQ%d -> IRQ %d\n", - pin-16, irq); - } - } - } -#endif - - return irq; -} - -void lock_vector_lock(void) -{ - /* Used to the online set of cpus does not change - * during assign_irq_vector. - */ - spin_lock(&vector_lock); -} - -void unlock_vector_lock(void) -{ - spin_unlock(&vector_lock); -} - -static int __assign_irq_vector(int irq, cpumask_t mask) -{ - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; - unsigned int old_vector; - int cpu; - struct irq_cfg *cfg; - - cfg = irq_cfg(irq); - - /* Only try and allocate irqs on cpus that are present */ - cpus_and(mask, mask, cpu_online_map); - - if ((cfg->move_in_progress) || cfg->move_cleanup_count) - return -EBUSY; - - old_vector = cfg->vector; - if (old_vector) { - cpumask_t tmp; - cpus_and(tmp, cfg->domain, mask); - if (!cpus_empty(tmp)) - return 0; - } - - for_each_cpu_mask_nr(cpu, mask) { - cpumask_t domain, new_mask; - int new_cpu; - int vector, offset; - - domain = vector_allocation_domain(cpu); - cpus_and(new_mask, domain, cpu_online_map); - - vector = current_vector; - offset = current_offset; -next: - vector += 8; - if (vector >= first_system_vector) { - /* If we run out of vectors on large boxen, must share them. */ - offset = (offset + 1) % 8; - vector = FIRST_DEVICE_VECTOR + offset; - } - if (unlikely(current_vector == vector)) - continue; -#ifdef CONFIG_X86_64 - if (vector == IA32_SYSCALL_VECTOR) - goto next; -#else - if (vector == SYSCALL_VECTOR) - goto next; -#endif - for_each_cpu_mask_nr(new_cpu, new_mask) - if (per_cpu(vector_irq, new_cpu)[vector] != -1) - goto next; - /* Found one! */ - current_vector = vector; - current_offset = offset; - if (old_vector) { - cfg->move_in_progress = 1; - cfg->old_domain = cfg->domain; - } - for_each_cpu_mask_nr(new_cpu, new_mask) - per_cpu(vector_irq, new_cpu)[vector] = irq; - cfg->vector = vector; - cfg->domain = domain; - return 0; - } - return -ENOSPC; -} - -static int assign_irq_vector(int irq, cpumask_t mask) -{ - int err; - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - err = __assign_irq_vector(irq, mask); - spin_unlock_irqrestore(&vector_lock, flags); - return err; -} - -static void __clear_irq_vector(int irq) -{ - struct irq_cfg *cfg; - cpumask_t mask; - int cpu, vector; - - cfg = irq_cfg(irq); - BUG_ON(!cfg->vector); - - vector = cfg->vector; - cpus_and(mask, cfg->domain, cpu_online_map); - for_each_cpu_mask_nr(cpu, mask) - per_cpu(vector_irq, cpu)[vector] = -1; - - cfg->vector = 0; - cpus_clear(cfg->domain); -} - -void __setup_vector_irq(int cpu) -{ - /* Initialize vector_irq on a new cpu */ - /* This function must be called with vector_lock held */ - int irq, vector; - struct irq_cfg *cfg; - - /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { - if (!cpu_isset(cpu, cfg->domain)) - continue; - vector = cfg->vector; - irq = cfg->irq; - per_cpu(vector_irq, cpu)[vector] = irq; - } - /* Mark the free vectors */ - for (vector = 0; vector < NR_VECTORS; ++vector) { - irq = per_cpu(vector_irq, cpu)[vector]; - if (irq < 0) - continue; - - cfg = irq_cfg(irq); - if (!cpu_isset(cpu, cfg->domain)) - per_cpu(vector_irq, cpu)[vector] = -1; - } -} - -static struct irq_chip ioapic_chip; -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip; -#endif - -#define IOAPIC_AUTO -1 -#define IOAPIC_EDGE 0 -#define IOAPIC_LEVEL 1 - -#ifdef CONFIG_X86_32 -static inline int IO_APIC_irq_trigger(int irq) -{ - int apic, idx, pin; - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* - * nonexistent IRQs are edge default - */ - return 0; -} -#else -static inline int IO_APIC_irq_trigger(int irq) -{ - return 1; -} -#endif - -static void ioapic_register_intr(int irq, unsigned long trigger) -{ - struct irq_desc *desc; - - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); - - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - desc->status |= IRQ_LEVEL; - else - desc->status &= ~IRQ_LEVEL; - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - desc->status |= IRQ_MOVE_PCNTXT; - if (trigger) - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ir_ioapic_chip, - handle_edge_irq, "edge"); - return; - } -#endif - if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || - trigger == IOAPIC_LEVEL) - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_fasteoi_irq, - "fasteoi"); - else - set_irq_chip_and_handler_name(irq, &ioapic_chip, - handle_edge_irq, "edge"); -} - -static int setup_ioapic_entry(int apic, int irq, - struct IO_APIC_route_entry *entry, - unsigned int destination, int trigger, - int polarity, int vector) -{ - /* - * add it to the IO-APIC irq-routing table: - */ - memset(entry,0,sizeof(*entry)); - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) { - struct intel_iommu *iommu = map_ioapic_to_ir(apic); - struct irte irte; - struct IR_IO_APIC_route_entry *ir_entry = - (struct IR_IO_APIC_route_entry *) entry; - int index; - - if (!iommu) - panic("No mapping iommu for ioapic %d\n", apic); - - index = alloc_irte(iommu, irq, 1); - if (index < 0) - panic("Failed to allocate IRTE for ioapic %d\n", apic); - - memset(&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = trigger; - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = vector; - irte.dest_id = IRTE_DEST(destination); - - modify_irte(irq, &irte); - - ir_entry->index2 = (index >> 15) & 0x1; - ir_entry->zero = 0; - ir_entry->format = 1; - ir_entry->index = (index & 0x7fff); - } else -#endif - { - entry->delivery_mode = INT_DELIVERY_MODE; - entry->dest_mode = INT_DEST_MODE; - entry->dest = destination; - } - - entry->mask = 0; /* enable IRQ */ - entry->trigger = trigger; - entry->polarity = polarity; - entry->vector = vector; - - /* Mask level triggered irqs. - * Use IRQ_DELAYED_DISABLE for edge triggered irqs. - */ - if (trigger) - entry->mask = 1; - return 0; -} - -static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, - int trigger, int polarity) -{ - struct irq_cfg *cfg; - struct IO_APIC_route_entry entry; - cpumask_t mask; - - if (!IO_APIC_IRQ(irq)) - return; - - cfg = irq_cfg(irq); - - mask = TARGET_CPUS; - if (assign_irq_vector(irq, mask)) - return; - - cpus_and(mask, cfg->domain, mask); - - apic_printk(APIC_VERBOSE,KERN_DEBUG - "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " - "IRQ %d Mode:%i Active:%i)\n", - apic, mp_ioapics[apic].mp_apicid, pin, cfg->vector, - irq, trigger, polarity); - - - if (setup_ioapic_entry(mp_ioapics[apic].mp_apicid, irq, &entry, - cpu_mask_to_apicid(mask), trigger, polarity, - cfg->vector)) { - printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic].mp_apicid, pin); - __clear_irq_vector(irq); - return; - } - - ioapic_register_intr(irq, trigger); - if (irq < 16) - disable_8259A_irq(irq); - - ioapic_write_entry(apic, pin, entry); -} - -static void __init setup_IO_APIC_irqs(void) -{ - int apic, pin, idx, irq, first_notcon = 1; - - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } - - irq = pin_2_irq(idx, apic, pin); -#ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; -#endif - add_pin_to_irq(irq, apic, pin); - - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } - } - - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); -} - -/* - * Set up the timer pin, possibly with the 8259A-master behind. - */ -static void __init setup_timer_IRQ0_pin(unsigned int apic, unsigned int pin, - int vector) -{ - struct IO_APIC_route_entry entry; - -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - return; -#endif - - memset(&entry, 0, sizeof(entry)); - - /* - * We use logical delivery to get the timer IRQ - * to the first CPU. - */ - entry.dest_mode = INT_DEST_MODE; - entry.mask = 1; /* mask IRQ now */ - entry.dest = cpu_mask_to_apicid(TARGET_CPUS); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.polarity = 0; - entry.trigger = 0; - entry.vector = vector; - - /* - * The timer IRQ doesn't have to know that behind the - * scene we may have a 8259A-master in AEOI mode ... - */ - set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(apic, pin, entry); -} - - -__apicdebuginit(void) print_IO_APIC(void) -{ - int apic, i; - union IO_APIC_reg_00 reg_00; - union IO_APIC_reg_01 reg_01; - union IO_APIC_reg_02 reg_02; - union IO_APIC_reg_03 reg_03; - unsigned long flags; - struct irq_cfg *cfg; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); - for (i = 0; i < nr_ioapics; i++) - printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].mp_apicid, nr_ioapic_registers[i]); - - /* - * We are a bit conservative about what we expect. We have to - * know about every hardware change ASAP. - */ - printk(KERN_INFO "testing the IO APIC.......................\n"); - - for (apic = 0; apic < nr_ioapics; apic++) { - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - reg_01.raw = io_apic_read(apic, 1); - if (reg_01.bits.version >= 0x10) - reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); - spin_unlock_irqrestore(&ioapic_lock, flags); - - printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mp_apicid); - printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); - printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); - printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); - printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); - - printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); - printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); - - printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); - printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); - - /* - * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, - * but the value of reg_02 is read as the previous read register - * value, so ignore it if reg_02 == reg_01. - */ - if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); - printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); - } - - /* - * Some Intel chipsets with IO APIC VERSION of 0x2? don't have reg_02 - * or reg_03, but the value of reg_0[23] is read as the previous read - * register value, so ignore it if reg_03 == reg_0[12]. - */ - if (reg_01.bits.version >= 0x20 && reg_03.raw != reg_02.raw && - reg_03.raw != reg_01.raw) { - printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); - printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); - } - - printk(KERN_DEBUG ".... IRQ redirection table:\n"); - - printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" - " Stat Dmod Deli Vect: \n"); - - for (i = 0; i <= reg_01.bits.entries; i++) { - struct IO_APIC_route_entry entry; - - entry = ioapic_read_entry(apic, i); - - printk(KERN_DEBUG " %02x %03X ", - i, - entry.dest - ); - - printk("%1d %1d %1d %1d %1d %1d %1d %02X\n", - entry.mask, - entry.trigger, - entry.irr, - entry.polarity, - entry.delivery_status, - entry.dest_mode, - entry.delivery_mode, - entry.vector - ); - } - } - printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { - struct irq_pin_list *entry = cfg->irq_2_pin; - if (!entry) - continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); - for (;;) { - printk("-> %d:%d", entry->apic, entry->pin); - if (!entry->next) - break; - entry = entry->next; - } - printk("\n"); - } - - printk(KERN_INFO ".................................... done.\n"); - - return; -} - -__apicdebuginit(void) print_APIC_bitfield(int base) -{ - unsigned int v; - int i, j; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "0123456789abcdef0123456789abcdef\n" KERN_DEBUG); - for (i = 0; i < 8; i++) { - v = apic_read(base + i*0x10); - for (j = 0; j < 32; j++) { - if (v & (1< 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - - v = apic_read(APIC_ESR); - printk(KERN_DEBUG "... APIC ESR: %08x\n", v); - } - - icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); - - v = apic_read(APIC_LVTT); - printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); - - if (maxlvt > 3) { /* PC is LVT#4. */ - v = apic_read(APIC_LVTPC); - printk(KERN_DEBUG "... APIC LVTPC: %08x\n", v); - } - v = apic_read(APIC_LVT0); - printk(KERN_DEBUG "... APIC LVT0: %08x\n", v); - v = apic_read(APIC_LVT1); - printk(KERN_DEBUG "... APIC LVT1: %08x\n", v); - - if (maxlvt > 2) { /* ERR is LVT#3. */ - v = apic_read(APIC_LVTERR); - printk(KERN_DEBUG "... APIC LVTERR: %08x\n", v); - } - - v = apic_read(APIC_TMICT); - printk(KERN_DEBUG "... APIC TMICT: %08x\n", v); - v = apic_read(APIC_TMCCT); - printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v); - v = apic_read(APIC_TDCR); - printk(KERN_DEBUG "... APIC TDCR: %08x\n", v); - printk("\n"); -} - -__apicdebuginit(void) print_all_local_APICs(void) -{ - on_each_cpu(print_local_APIC, NULL, 1); -} - -__apicdebuginit(void) print_PIC(void) -{ - unsigned int v; - unsigned long flags; - - if (apic_verbosity == APIC_QUIET) - return; - - printk(KERN_DEBUG "\nprinting PIC contents\n"); - - spin_lock_irqsave(&i8259A_lock, flags); - - v = inb(0xa1) << 8 | inb(0x21); - printk(KERN_DEBUG "... PIC IMR: %04x\n", v); - - v = inb(0xa0) << 8 | inb(0x20); - printk(KERN_DEBUG "... PIC IRR: %04x\n", v); - - outb(0x0b,0xa0); - outb(0x0b,0x20); - v = inb(0xa0) << 8 | inb(0x20); - outb(0x0a,0xa0); - outb(0x0a,0x20); - - spin_unlock_irqrestore(&i8259A_lock, flags); - - printk(KERN_DEBUG "... PIC ISR: %04x\n", v); - - v = inb(0x4d1) << 8 | inb(0x4d0); - printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); -} - -__apicdebuginit(int) print_all_ICs(void) -{ - print_PIC(); - print_all_local_APICs(); - print_IO_APIC(); - - return 0; -} - -fs_initcall(print_all_ICs); - - -/* Where if anywhere is the i8259 connect in external int mode */ -static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; - -void __init enable_IO_APIC(void) -{ - union IO_APIC_reg_01 reg_01; - int i8259_apic, i8259_pin; - int apic; - unsigned long flags; - -#ifdef CONFIG_X86_32 - int i; - if (!pirqs_enabled) - for (i = 0; i < MAX_PIRQS; i++) - pirq_entries[i] = -1; -#endif - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } - for(apic = 0; apic < nr_ioapics; apic++) { - int pin; - /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - struct IO_APIC_route_entry entry; - entry = ioapic_read_entry(apic, pin); - - /* If the interrupt line is enabled and in ExtInt mode - * I have found the pin where the i8259 is connected. - */ - if ((entry.mask == 0) && (entry.delivery_mode == dest_ExtINT)) { - ioapic_i8259.apic = apic; - ioapic_i8259.pin = pin; - goto found_i8259; - } - } - } - found_i8259: - /* Look to see what if the MP table has reported the ExtINT */ - /* If we could not find the appropriate pin by looking at the ioapic - * the i8259 probably is not connected the ioapic but give the - * mptable a chance anyway. - */ - i8259_pin = find_isa_irq_pin(0, mp_ExtINT); - i8259_apic = find_isa_irq_apic(0, mp_ExtINT); - /* Trust the MP table if nothing is setup in the hardware */ - if ((ioapic_i8259.pin == -1) && (i8259_pin >= 0)) { - printk(KERN_WARNING "ExtINT not setup in hardware but reported by MP table\n"); - ioapic_i8259.pin = i8259_pin; - ioapic_i8259.apic = i8259_apic; - } - /* Complain if the MP table and the hardware disagree */ - if (((ioapic_i8259.apic != i8259_apic) || (ioapic_i8259.pin != i8259_pin)) && - (i8259_pin >= 0) && (ioapic_i8259.pin >= 0)) - { - printk(KERN_WARNING "ExtINT in hardware and MP table differ\n"); - } - - /* - * Do not trust the IO-APIC being empty at bootup - */ - clear_IO_APIC(); -} - -/* - * Not an __init, needed by the reboot code - */ -void disable_IO_APIC(void) -{ - /* - * Clear the IO-APIC before rebooting: - */ - clear_IO_APIC(); - - /* - * If the i8259 is routed through an IOAPIC - * Put that IOAPIC in virtual wire mode - * so legacy interrupts can be delivered. - */ - if (ioapic_i8259.pin != -1) { - struct IO_APIC_route_entry entry; - - memset(&entry, 0, sizeof(entry)); - entry.mask = 0; /* Enabled */ - entry.trigger = 0; /* Edge */ - entry.irr = 0; - entry.polarity = 0; /* High */ - entry.delivery_status = 0; - entry.dest_mode = 0; /* Physical */ - entry.delivery_mode = dest_ExtINT; /* ExtInt */ - entry.vector = 0; - entry.dest = read_apic_id(); - - /* - * Add it to the IO-APIC irq-routing table: - */ - ioapic_write_entry(ioapic_i8259.apic, ioapic_i8259.pin, entry); - } - - disconnect_bsp_APIC(ioapic_i8259.pin != -1); -} - -#ifdef CONFIG_X86_32 -/* - * function to set the IO-APIC physical IDs based on the - * values stored in the MPC table. - * - * by Matt Domsch Tue Dec 21 12:25:05 CST 1999 - */ - -static void __init setup_ioapic_ids_from_mpc(void) -{ - union IO_APIC_reg_00 reg_00; - physid_mask_t phys_id_present_map; - int apic; - int i; - unsigned char old_id; - unsigned long flags; - - if (x86_quirks->setup_ioapic_ids && x86_quirks->setup_ioapic_ids()) - return; - - /* - * Don't check I/O APIC IDs for xAPIC systems. They have - * no meaning without the serial APIC bus. - */ - if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - || APIC_XAPIC(apic_version[boot_cpu_physical_apicid])) - return; - /* - * This is broken; anything with a real cpu count has to - * circumvent this idiocy regardless. - */ - phys_id_present_map = ioapic_phys_id_map(phys_cpu_present_map); - - /* - * Set the IOAPIC ID to the value stored in the MPC table. - */ - for (apic = 0; apic < nr_ioapics; apic++) { - - /* Read the register 0 value */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - old_id = mp_ioapics[apic].mp_apicid; - - if (mp_ioapics[apic].mp_apicid >= get_physical_broadcast()) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic, mp_ioapics[apic].mp_apicid); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - reg_00.bits.ID); - mp_ioapics[apic].mp_apicid = reg_00.bits.ID; - } - - /* - * Sanity check, is the ID really free? Every APIC in a - * system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(phys_id_present_map, - mp_ioapics[apic].mp_apicid)) { - printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic, mp_ioapics[apic].mp_apicid); - for (i = 0; i < get_physical_broadcast(); i++) - if (!physid_isset(i, phys_id_present_map)) - break; - if (i >= get_physical_broadcast()) - panic("Max APIC ID exceeded!\n"); - printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", - i); - physid_set(i, phys_id_present_map); - mp_ioapics[apic].mp_apicid = i; - } else { - physid_mask_t tmp; - tmp = apicid_to_cpu_present(mp_ioapics[apic].mp_apicid); - apic_printk(APIC_VERBOSE, "Setting %d in the " - "phys_id_present_map\n", - mp_ioapics[apic].mp_apicid); - physids_or(phys_id_present_map, phys_id_present_map, tmp); - } - - - /* - * We need to adjust the IRQ routing table - * if the ID changed. - */ - if (old_id != mp_ioapics[apic].mp_apicid) - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_dstapic == old_id) - mp_irqs[i].mp_dstapic - = mp_ioapics[apic].mp_apicid; - - /* - * Read the right value from the MPC table and - * write it into the ID register. - */ - apic_printk(APIC_VERBOSE, KERN_INFO - "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic].mp_apicid); - - reg_00.bits.ID = mp_ioapics[apic].mp_apicid; - spin_lock_irqsave(&ioapic_lock, flags); - - /* - * Sanity check - */ - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(apic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic].mp_apicid) - printk("could not set ID!\n"); - else - apic_printk(APIC_VERBOSE, " ok.\n"); - } -} -#endif - -int no_timer_check __initdata; - -static int __init notimercheck(char *s) -{ - no_timer_check = 1; - return 1; -} -__setup("no_timer_check", notimercheck); - -/* - * There is a nasty bug in some older SMP boards, their mptable lies - * about the timer IRQ. We do the following to work around the situation: - * - * - timer IRQ defaults to IO-APIC IRQ - * - if this function detects that timer IRQs are defunct, then we fall - * back to ISA timer IRQs - */ -static int __init timer_irq_works(void) -{ - unsigned long t1 = jiffies; - unsigned long flags; - - if (no_timer_check) - return 1; - - local_save_flags(flags); - local_irq_enable(); - /* Let ten ticks pass... */ - mdelay((10 * 1000) / HZ); - local_irq_restore(flags); - - /* - * Expect a few ticks at least, to be sure some possible - * glue logic does not lock up after one or two first - * ticks in a non-ExtINT mode. Also the local APIC - * might have cached one ExtINT interrupt. Finally, at - * least one tick may be lost due to delays. - */ - - /* jiffies wrap? */ - if (time_after(jiffies, t1 + 4)) - return 1; - return 0; -} - -/* - * In the SMP+IOAPIC case it might happen that there are an unspecified - * number of pending IRQ events unhandled. These cases are very rare, - * so we 'resend' these IRQs via IPIs, to the same CPU. It's much - * better to do it this way as thus we do not have to be aware of - * 'pending' interrupts in the IRQ path, except at this point. - */ -/* - * Edge triggered needs to resend any interrupt - * that was delayed but this is now handled in the device - * independent code. - */ - -/* - * Starting up a edge-triggered IO-APIC interrupt is - * nasty - we need to make sure that we get the edge. - * If it is already asserted for some reason, we need - * return 1 to indicate that is was pending. - * - * This is not complete - we should be able to fake - * an edge even if it isn't on the 8259A... - */ - -static unsigned int startup_ioapic_irq(unsigned int irq) -{ - int was_pending = 0; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - if (irq < 16) { - disable_8259A_irq(irq); - if (i8259A_irq_pending(irq)) - was_pending = 1; - } - __unmask_IO_APIC_irq(irq); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return was_pending; -} - -#ifdef CONFIG_X86_64 -static int ioapic_retrigger_irq(unsigned int irq) -{ - - struct irq_cfg *cfg = irq_cfg(irq); - unsigned long flags; - - spin_lock_irqsave(&vector_lock, flags); - send_IPI_mask(cpumask_of_cpu(first_cpu(cfg->domain)), cfg->vector); - spin_unlock_irqrestore(&vector_lock, flags); - - return 1; -} -#else -static int ioapic_retrigger_irq(unsigned int irq) -{ - send_IPI_self(irq_cfg(irq)->vector); - - return 1; -} -#endif - -/* - * Level and edge triggered IO-APIC interrupts need different handling, - * so we use two separate IRQ descriptors. Edge triggered IRQs can be - * handled with the level-triggered descriptor, but that one has slightly - * more overhead. Level-triggered interrupts cannot be handled with the - * edge-triggered handler, without risking IRQ storms and other ugly - * races. - */ - -#ifdef CONFIG_SMP - -#ifdef CONFIG_INTR_REMAP -static void ir_irq_migration(struct work_struct *work); - -static DECLARE_DELAYED_WORK(ir_migration_work, ir_irq_migration); - -/* - * Migrate the IO-APIC irq in the presence of intr-remapping. - * - * For edge triggered, irq migration is a simple atomic update(of vector - * and cpu destination) of IRTE and flush the hardware cache. - * - * For level triggered, we need to modify the io-apic RTE aswell with the update - * vector information, along with modifying IRTE with vector and destination. - * So irq migration for level triggered is little bit more complex compared to - * edge triggered migration. But the good news is, we use the same algorithm - * for level triggered migration as we have today, only difference being, - * we now initiate the irq migration from process context instead of the - * interrupt context. - * - * In future, when we do a directed EOI (combined with cpu EOI broadcast - * suppression) to the IO-APIC, level triggered irq migration will also be - * as simple as edge triggered migration and we can do the irq migration - * with a simple atomic update to IO-APIC RTE. - */ -static void migrate_ioapic_irq(int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct irq_desc *desc; - cpumask_t tmp, cleanup_mask; - struct irte irte; - int modify_ioapic_rte; - unsigned int dest; - unsigned long flags; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - desc = irq_to_desc(irq); - modify_ioapic_rte = desc->status & IRQ_LEVEL; - if (modify_ioapic_rte) { - spin_lock_irqsave(&ioapic_lock, flags); - __target_IO_APIC_irq(irq, dest, cfg->vector); - spin_unlock_irqrestore(&ioapic_lock, flags); - } - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * Modified the IRTE and flushes the Interrupt entry cache. - */ - modify_irte(irq, &irte); - - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc->affinity = mask; -} - -static int migrate_irq_remapped_level(int irq) -{ - int ret = -1; - struct irq_desc *desc = irq_to_desc(irq); - - mask_IO_APIC_irq(irq); - - if (io_apic_level_ack_pending(irq)) { - /* - * Interrupt in progress. Migrating irq now will change the - * vector information in the IO-APIC RTE and that will confuse - * the EOI broadcast performed by cpu. - * So, delay the irq migration to the next instance. - */ - schedule_delayed_work(&ir_migration_work, 1); - goto unmask; - } - - /* everthing is clear. we have right of way */ - migrate_ioapic_irq(irq, desc->pending_mask); - - ret = 0; - desc->status &= ~IRQ_MOVE_PENDING; - cpus_clear(desc->pending_mask); - -unmask: - unmask_IO_APIC_irq(irq); - return ret; -} - -static void ir_irq_migration(struct work_struct *work) -{ - unsigned int irq; - struct irq_desc *desc; - - for_each_irq_desc(irq, desc) { - if (desc->status & IRQ_MOVE_PENDING) { - unsigned long flags; - - spin_lock_irqsave(&desc->lock, flags); - if (!desc->chip->set_affinity || - !(desc->status & IRQ_MOVE_PENDING)) { - desc->status &= ~IRQ_MOVE_PENDING; - spin_unlock_irqrestore(&desc->lock, flags); - continue; - } - - desc->chip->set_affinity(irq, desc->pending_mask); - spin_unlock_irqrestore(&desc->lock, flags); - } - } -} - -/* - * Migrates the IRQ destination in the process context. - */ -static void set_ir_ioapic_affinity_irq(unsigned int irq, cpumask_t mask) -{ - struct irq_desc *desc = irq_to_desc(irq); - - if (desc->status & IRQ_LEVEL) { - desc->status |= IRQ_MOVE_PENDING; - desc->pending_mask = mask; - migrate_irq_remapped_level(irq); - return; - } - - migrate_ioapic_irq(irq, mask); -} -#endif - -asmlinkage void smp_irq_move_cleanup_interrupt(void) -{ - unsigned vector, me; - ack_APIC_irq(); -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - - me = smp_processor_id(); - for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { - unsigned int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - irq = __get_cpu_var(vector_irq)[vector]; - - desc = irq_to_desc(irq); - if (!desc) - continue; - - cfg = irq_cfg(irq); - spin_lock(&desc->lock); - if (!cfg->move_cleanup_count) - goto unlock; - - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) - goto unlock; - - __get_cpu_var(vector_irq)[vector] = -1; - cfg->move_cleanup_count--; -unlock: - spin_unlock(&desc->lock); - } - - irq_exit(); -} - -static void irq_complete_move(unsigned int irq) -{ - struct irq_cfg *cfg = irq_cfg(irq); - unsigned vector, me; - - if (likely(!cfg->move_in_progress)) - return; - - vector = ~get_irq_regs()->orig_ax; - me = smp_processor_id(); - if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { - cpumask_t cleanup_mask; - - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } -} -#else -static inline void irq_complete_move(unsigned int irq) {} -#endif -#ifdef CONFIG_INTR_REMAP -static void ack_x2apic_level(unsigned int irq) -{ - ack_x2APIC_irq(); -} - -static void ack_x2apic_edge(unsigned int irq) -{ - ack_x2APIC_irq(); -} -#endif - -static void ack_apic_edge(unsigned int irq) -{ - irq_complete_move(irq); - move_native_irq(irq); - ack_APIC_irq(); -} - -#ifdef CONFIG_X86_64 -static void ack_apic_level(unsigned int irq) -{ - int do_unmask_irq = 0; - - irq_complete_move(irq); -#ifdef CONFIG_GENERIC_PENDING_IRQ - /* If we are moving the irq we need to mask it */ - if (unlikely(irq_to_desc(irq)->status & IRQ_MOVE_PENDING)) { - do_unmask_irq = 1; - mask_IO_APIC_irq(irq); - } -#endif - - /* - * We must acknowledge the irq before we move it or the acknowledge will - * not propagate properly. - */ - ack_APIC_irq(); - - /* Now we can move and renable the irq */ - if (unlikely(do_unmask_irq)) { - /* Only migrate the irq if the ack has been received. - * - * On rare occasions the broadcast level triggered ack gets - * delayed going to ioapics, and if we reprogram the - * vector while Remote IRR is still set the irq will never - * fire again. - * - * To prevent this scenario we read the Remote IRR bit - * of the ioapic. This has two effects. - * - On any sane system the read of the ioapic will - * flush writes (and acks) going to the ioapic from - * this cpu. - * - We get to see if the ACK has actually been delivered. - * - * Based on failed experiments of reprogramming the - * ioapic entry from outside of irq context starting - * with masking the ioapic entry and then polling until - * Remote IRR was clear before reprogramming the - * ioapic I don't trust the Remote IRR bit to be - * completey accurate. - * - * However there appears to be no other way to plug - * this race, so if the Remote IRR bit is not - * accurate and is causing problems then it is a hardware bug - * and you can go talk to the chipset vendor about it. - */ - if (!io_apic_level_ack_pending(irq)) - move_masked_irq(irq); - unmask_IO_APIC_irq(irq); - } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); - - if (!(v & (1 << (i & 0x1f)))) { - atomic_inc(&irq_mis_count); - spin_lock(&ioapic_lock); - __mask_and_edge_IO_APIC_irq(irq); - __unmask_and_level_IO_APIC_irq(irq); - spin_unlock(&ioapic_lock); - } -} -#endif - -static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, -#ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, -#endif - .retrigger = ioapic_retrigger_irq, -}; -#endif - -static inline void init_IO_APIC_traps(void) -{ - int irq; - struct irq_desc *desc; - struct irq_cfg *cfg; - - /* - * NOTE! The local APIC isn't very good at handling - * multiple interrupts at the same interrupt level. - * As the interrupt level is determined by taking the - * vector number and shifting that right by 4, we - * want to spread these out a bit so that they don't - * all fall in the same interrupt level. - * - * Also, we've got to be careful not to trash gate - * 0x80, because int 0x80 is hm, kind of importantish. ;) - */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; - if (IO_APIC_IRQ(irq) && !cfg->vector) { - /* - * Hmm.. We don't have an entry for this, - * so default to an old-fashioned 8259 - * interrupt if we can.. - */ - if (irq < 16) - make_8259A_irq(irq); - else { - desc = irq_to_desc(irq); - /* Strange. Oh, well.. */ - desc->chip = &no_irq_chip; - } - } - } -} - -/* - * The local APIC irq-chip implementation: - */ - -static void mask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); -} - -static void unmask_lapic_irq(unsigned int irq) -{ - unsigned long v; - - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); -} - -static void ack_lapic_irq (unsigned int irq) -{ - ack_APIC_irq(); -} - -static struct irq_chip lapic_chip __read_mostly = { - .name = "local-APIC", - .mask = mask_lapic_irq, - .unmask = unmask_lapic_irq, - .ack = ack_lapic_irq, -}; - -static void lapic_register_intr(int irq) -{ - struct irq_desc *desc; - - desc = irq_to_desc(irq); - desc->status &= ~IRQ_LEVEL; - set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, - "edge"); -} - -static void __init setup_nmi(void) -{ - /* - * Dirty trick to enable the NMI watchdog ... - * We put the 8259A master into AEOI mode and - * unmask on all local APICs LVT0 as NMI. - * - * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire') - * is from Maciej W. Rozycki - so we do not have to EOI from - * the NMI handler or the timer interrupt. - */ - apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ..."); - - enable_NMI_through_LVT0(); - - apic_printk(APIC_VERBOSE, " done.\n"); -} - -/* - * This looks a bit hackish but it's about the only one way of sending - * a few INTA cycles to 8259As and any associated glue logic. ICR does - * not support the ExtINT mode, unfortunately. We need to send these - * cycles as some i82489DX-based boards have glue logic that keeps the - * 8259A interrupt line asserted until INTA. --macro - */ -static inline void __init unlock_ExtINT_logic(void) -{ - int apic, pin, i; - struct IO_APIC_route_entry entry0, entry1; - unsigned char save_control, save_freq_select; - - pin = find_isa_irq_pin(8, mp_INT); - if (pin == -1) { - WARN_ON_ONCE(1); - return; - } - apic = find_isa_irq_apic(8, mp_INT); - if (apic == -1) { - WARN_ON_ONCE(1); - return; - } - - entry0 = ioapic_read_entry(apic, pin); - clear_IO_APIC_pin(apic, pin); - - memset(&entry1, 0, sizeof(entry1)); - - entry1.dest_mode = 0; /* physical delivery */ - entry1.mask = 0; /* unmask IRQ now */ - entry1.dest = hard_smp_processor_id(); - entry1.delivery_mode = dest_ExtINT; - entry1.polarity = entry0.polarity; - entry1.trigger = 0; - entry1.vector = 0; - - ioapic_write_entry(apic, pin, entry1); - - save_control = CMOS_READ(RTC_CONTROL); - save_freq_select = CMOS_READ(RTC_FREQ_SELECT); - CMOS_WRITE((save_freq_select & ~RTC_RATE_SELECT) | 0x6, - RTC_FREQ_SELECT); - CMOS_WRITE(save_control | RTC_PIE, RTC_CONTROL); - - i = 100; - while (i-- > 0) { - mdelay(10); - if ((CMOS_READ(RTC_INTR_FLAGS) & RTC_PF) == RTC_PF) - i -= 10; - } - - CMOS_WRITE(save_control, RTC_CONTROL); - CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT); - clear_IO_APIC_pin(apic, pin); - - ioapic_write_entry(apic, pin, entry0); -} - -static int disable_timer_pin_1 __initdata; -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 0; -} -early_param("disable_timer_pin_1", disable_timer_pin_setup); - -int timer_through_8259 __initdata; - -/* - * This code may look a bit paranoid, but it's supposed to cooperate with - * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ - * is so screwy. Thanks to Brian Perkins for testing/hacking this beast - * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for all platforms. - */ -static inline void __init check_timer(void) -{ - struct irq_cfg *cfg = irq_cfg(0); - int apic1, pin1, apic2, pin2; - unsigned long flags; - unsigned int ver; - int no_pin1 = 0; - - local_irq_save(flags); - - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); - - /* - * get/set the timer IRQ vector: - */ - disable_8259A_irq(0); - assign_irq_vector(0, TARGET_CPUS); - - /* - * As IRQ0 is to be enabled in the 8259A, the virtual - * wire has to be disabled in the local APIC. Also - * timer interrupts need to be acknowledged manually in - * the 8259A for the i82489DX when using the NMI - * watchdog as that APIC treats NMIs as level-triggered. - * The AEOI mode will finish them in the 8259A - * automatically. - */ - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); - init_8259A(1); -#ifdef CONFIG_X86_32 - timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver)); -#endif - - pin1 = find_isa_irq_pin(0, mp_INT); - apic1 = find_isa_irq_apic(0, mp_INT); - pin2 = ioapic_i8259.pin; - apic2 = ioapic_i8259.apic; - - apic_printk(APIC_QUIET, KERN_INFO "..TIMER: vector=0x%02X " - "apic1=%d pin1=%d apic2=%d pin2=%d\n", - cfg->vector, apic1, pin1, apic2, pin2); - - /* - * Some BIOS writers are clueless and report the ExtINTA - * I/O APIC input from the cascaded 8259A as the timer - * interrupt input. So just in case, if only one pin - * was found above, try it both directly and through the - * 8259A. - */ - if (pin1 == -1) { -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("BIOS bug: timer not connected to IO-APIC"); -#endif - pin1 = pin2; - apic1 = apic2; - no_pin1 = 1; - } else if (pin2 == -1) { - pin2 = pin1; - apic2 = apic1; - } - - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - if (no_pin1) { - add_pin_to_irq(0, apic1, pin1); - setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); - } - unmask_IO_APIC_irq(0); - if (timer_irq_works()) { - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - goto out; - } -#ifdef CONFIG_INTR_REMAP - if (intr_remapping_enabled) - panic("timer doesn't work through Interrupt-remapped IO-APIC"); -#endif - clear_IO_APIC_pin(apic1, pin1); - if (!no_pin1) - apic_printk(APIC_QUIET, KERN_ERR "..MP-BIOS bug: " - "8254 timer not connected to IO-APIC\n"); - - apic_printk(APIC_QUIET, KERN_INFO "...trying to set up timer " - "(IRQ0) through the 8259A ...\n"); - apic_printk(APIC_QUIET, KERN_INFO - "..... (found apic %d pin %d) ...\n", apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ - replace_pin_at_irq(0, apic1, pin1, apic2, pin2); - setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); - unmask_IO_APIC_irq(0); - enable_8259A_irq(0); - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); - timer_through_8259 = 1; - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - goto out; - } - /* - * Cleanup, just in case ... - */ - disable_8259A_irq(0); - clear_IO_APIC_pin(apic2, pin2); - apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); - } - - if (nmi_watchdog == NMI_IO_APIC) { - apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work " - "through the IO-APIC - disabling NMI Watchdog!\n"); - nmi_watchdog = NMI_NONE; - } -#ifdef CONFIG_X86_32 - timer_ack = 0; -#endif - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as Virtual Wire IRQ...\n"); - - lapic_register_intr(0); - apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ - enable_8259A_irq(0); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - disable_8259A_irq(0); - apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); - apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); - - apic_printk(APIC_QUIET, KERN_INFO - "...trying to set up timer as ExtINT IRQ...\n"); - - init_8259A(0); - make_8259A_irq(0); - apic_write(APIC_LVT0, APIC_DM_EXTINT); - - unlock_ExtINT_logic(); - - if (timer_irq_works()) { - apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); - goto out; - } - apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); - panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " - "report. Then try booting with the 'noapic' option.\n"); -out: - local_irq_restore(flags); -} - -/* - * Traditionally ISA IRQ2 is the cascade IRQ, and is not available - * to devices. However there may be an I/O APIC pin available for - * this interrupt regardless. The pin may be left unconnected, but - * typically it will be reused as an ExtINT cascade interrupt for - * the master 8259A. In the MPS case such a pin will normally be - * reported as an ExtINT interrupt in the MP table. With ACPI - * there is no provision for ExtINT interrupts, and in the absence - * of an override it would be treated as an ordinary ISA I/O APIC - * interrupt, that is edge-triggered and unmasked by default. We - * used to do this, but it caused problems on some systems because - * of the NMI watchdog and sometimes IRQ0 of the 8254 timer using - * the same ExtINT cascade interrupt to drive the local APIC of the - * bootstrap processor. Therefore we refrain from routing IRQ2 to - * the I/O APIC in all cases now. No actual device should request - * it anyway. --macro - */ -#define PIC_IRQS (1 << PIC_CASCADE_IR) - -void __init setup_IO_APIC(void) -{ - -#ifdef CONFIG_X86_32 - enable_IO_APIC(); -#else - /* - * calling enable_IO_APIC() is moved to setup_local_APIC for BP - */ -#endif - - io_apic_irqs = ~PIC_IRQS; - - apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* - * Set up IO-APIC IRQ routing. - */ -#ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); -#endif - sync_Arb_IDs(); - setup_IO_APIC_irqs(); - init_IO_APIC_traps(); - check_timer(); -} - -/* - * Called after all the initialization is done. If we didnt find any - * APIC bugs then we can allow the modify fast path - */ - -static int __init io_apic_bug_finalize(void) -{ - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; -} - -late_initcall(io_apic_bug_finalize); - -struct sysfs_ioapic_data { - struct sys_device dev; - struct IO_APIC_route_entry entry[0]; -}; -static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS]; - -static int ioapic_suspend(struct sys_device *dev, pm_message_t state) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) - *entry = ioapic_read_entry(dev->id, i); - - return 0; -} - -static int ioapic_resume(struct sys_device *dev) -{ - struct IO_APIC_route_entry *entry; - struct sysfs_ioapic_data *data; - unsigned long flags; - union IO_APIC_reg_00 reg_00; - int i; - - data = container_of(dev, struct sysfs_ioapic_data, dev); - entry = data->entry; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(dev->id, 0); - if (reg_00.bits.ID != mp_ioapics[dev->id].mp_apicid) { - reg_00.bits.ID = mp_ioapics[dev->id].mp_apicid; - io_apic_write(dev->id, 0, reg_00.raw); - } - spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[dev->id]; i++) - ioapic_write_entry(dev->id, i, entry[i]); - - return 0; -} - -static struct sysdev_class ioapic_sysdev_class = { - .name = "ioapic", - .suspend = ioapic_suspend, - .resume = ioapic_resume, -}; - -static int __init ioapic_init_sysfs(void) -{ - struct sys_device * dev; - int i, size, error; - - error = sysdev_class_register(&ioapic_sysdev_class); - if (error) - return error; - - for (i = 0; i < nr_ioapics; i++ ) { - size = sizeof(struct sys_device) + nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL); - if (!mp_ioapic_data[i]) { - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - dev = &mp_ioapic_data[i]->dev; - dev->id = i; - dev->cls = &ioapic_sysdev_class; - error = sysdev_register(dev); - if (error) { - kfree(mp_ioapic_data[i]); - mp_ioapic_data[i] = NULL; - printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i); - continue; - } - } - - return 0; -} - -device_initcall(ioapic_init_sysfs); - -/* - * Dynamic irq allocate and deallocation - */ -unsigned int create_irq_nr(unsigned int irq_want) -{ - /* Allocate an unused irq */ - unsigned int irq; - unsigned int new; - unsigned long flags; - struct irq_cfg *cfg_new; - -#ifndef CONFIG_HAVE_SPARSE_IRQ - irq_want = nr_irqs - 1; -#endif - - irq = 0; - spin_lock_irqsave(&vector_lock, flags); - for (new = irq_want; new > 0; new--) { - if (platform_legacy_irq(new)) - continue; - cfg_new = irq_cfg(new); - if (cfg_new && cfg_new->vector != 0) - continue; - /* check if need to create one */ - if (!cfg_new) - cfg_new = irq_cfg_alloc(new); - if (__assign_irq_vector(new, TARGET_CPUS) == 0) - irq = new; - break; - } - spin_unlock_irqrestore(&vector_lock, flags); - - if (irq > 0) { - dynamic_irq_init(irq); - } - return irq; -} - -int create_irq(void) -{ - int irq; - - irq = create_irq_nr(nr_irqs - 1); - - if (irq == 0) - irq = -1; - - return irq; -} - -void destroy_irq(unsigned int irq) -{ - unsigned long flags; - - dynamic_irq_cleanup(irq); - -#ifdef CONFIG_INTR_REMAP - free_irte(irq); -#endif - spin_lock_irqsave(&vector_lock, flags); - __clear_irq_vector(irq); - spin_unlock_irqrestore(&vector_lock, flags); -} - -/* - * MSI message composition - */ -#ifdef CONFIG_PCI_MSI -static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) -{ - struct irq_cfg *cfg; - int err; - unsigned dest; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (err) - return err; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irte irte; - int ir_index; - u16 sub_handle; - - ir_index = map_irq_to_irte_handle(irq, &sub_handle); - BUG_ON(ir_index == -1); - - memset (&irte, 0, sizeof(irte)); - - irte.present = 1; - irte.dst_mode = INT_DEST_MODE; - irte.trigger_mode = 0; /* edge */ - irte.dlvry_mode = INT_DELIVERY_MODE; - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - modify_irte(irq, &irte); - - msg->address_hi = MSI_ADDR_BASE_HI; - msg->data = sub_handle; - msg->address_lo = MSI_ADDR_BASE_LO | MSI_ADDR_IR_EXT_INT | - MSI_ADDR_IR_SHV | - MSI_ADDR_IR_INDEX1(ir_index) | - MSI_ADDR_IR_INDEX2(ir_index); - } else -#endif - { - msg->address_hi = MSI_ADDR_BASE_HI; - msg->address_lo = - MSI_ADDR_BASE_LO | - ((INT_DEST_MODE == 0) ? - MSI_ADDR_DEST_MODE_PHYSICAL: - MSI_ADDR_DEST_MODE_LOGICAL) | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_ADDR_REDIRECTION_CPU: - MSI_ADDR_REDIRECTION_LOWPRI) | - MSI_ADDR_DEST_ID(dest); - - msg->data = - MSI_DATA_TRIGGER_EDGE | - MSI_DATA_LEVEL_ASSERT | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - MSI_DATA_DELIVERY_FIXED: - MSI_DATA_DELIVERY_LOWPRI) | - MSI_DATA_VECTOR(cfg->vector); - } - return err; -} - -#ifdef CONFIG_SMP -static void set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - read_msi_msg(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - write_msi_msg(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} - -#ifdef CONFIG_INTR_REMAP -/* - * Migrate the MSI irq to another cpumask. This migration is - * done in the process context using interrupt-remapping hardware. - */ -static void ir_set_msi_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp, cleanup_mask; - struct irte irte; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (get_irte(irq, &irte)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - irte.vector = cfg->vector; - irte.dest_id = IRTE_DEST(dest); - - /* - * atomically update the IRTE with the new destination and vector. - */ - modify_irte(irq, &irte); - - /* - * After this point, all the interrupts will start arriving - * at the new destination. So, time to cleanup the previous - * vector allocation. - */ - if (cfg->move_in_progress) { - cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); - cfg->move_cleanup_count = cpus_weight(cleanup_mask); - send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); - cfg->move_in_progress = 0; - } - - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif -#endif /* CONFIG_SMP */ - -/* - * IRQ Chip for MSI PCI/PCI-X/PCI-Express Devices, - * which implement the MSI or MSI-X Capability Structure. - */ -static struct irq_chip msi_chip = { - .name = "PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -#ifdef CONFIG_INTR_REMAP -static struct irq_chip msi_ir_chip = { - .name = "IR-PCI-MSI", - .unmask = unmask_msi_irq, - .mask = mask_msi_irq, - .ack = ack_x2apic_edge, -#ifdef CONFIG_SMP - .set_affinity = ir_set_msi_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -/* - * Map the PCI dev to the corresponding remapping hardware unit - * and allocate 'nvec' consecutive interrupt-remapping table entries - * in it. - */ -static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) -{ - struct intel_iommu *iommu; - int index; - - iommu = map_dev_to_ir(dev); - if (!iommu) { - printk(KERN_ERR - "Unable to map PCI %s to iommu\n", pci_name(dev)); - return -ENOENT; - } - - index = alloc_irte(iommu, irq, nvec); - if (index < 0) { - printk(KERN_ERR - "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); - return -ENOSPC; - } - return index; -} -#endif - -static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(dev, irq, &msg); - if (ret < 0) - return ret; - - set_irq_msi(irq, desc); - write_msi_msg(irq, &msg); - -#ifdef CONFIG_INTR_REMAP - if (irq_remapped(irq)) { - struct irq_desc *desc = irq_to_desc(irq); - /* - * irq migration in process context - */ - desc->status |= IRQ_MOVE_PCNTXT; - set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge"); - } else -#endif - set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); - - return 0; -} - -static unsigned int build_irq_for_pci_dev(struct pci_dev *dev) -{ - unsigned int irq; - - irq = dev->bus->number; - irq <<= 8; - irq |= dev->devfn; - irq <<= 12; - - return irq; -} - -int arch_setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc) -{ - unsigned int irq; - int ret; - unsigned int irq_want; - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - - irq = create_irq_nr(irq_want); - if (irq == 0) - return -1; - -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - ret = msi_alloc_irte(dev, irq, 1); - if (ret < 0) - goto error; -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) { - destroy_irq(irq); - return ret; - } - return 0; - -#ifdef CONFIG_INTR_REMAP -error: - destroy_irq(irq); - return ret; -#endif -} - -int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) -{ - unsigned int irq; - int ret, sub_handle; - struct msi_desc *desc; - unsigned int irq_want; - -#ifdef CONFIG_INTR_REMAP - struct intel_iommu *iommu = 0; - int index = 0; -#endif - - irq_want = build_irq_for_pci_dev(dev) + 0x100; - sub_handle = 0; - list_for_each_entry(desc, &dev->msi_list, list) { - irq = create_irq_nr(irq_want--); - if (irq == 0) - return -1; -#ifdef CONFIG_INTR_REMAP - if (!intr_remapping_enabled) - goto no_ir; - - if (!sub_handle) { - /* - * allocate the consecutive block of IRTE's - * for 'nvec' - */ - index = msi_alloc_irte(dev, irq, nvec); - if (index < 0) { - ret = index; - goto error; - } - } else { - iommu = map_dev_to_ir(dev); - if (!iommu) { - ret = -ENOENT; - goto error; - } - /* - * setup the mapping between the irq and the IRTE - * base index, the sub_handle pointing to the - * appropriate interrupt remap table entry. - */ - set_irte_irq(irq, iommu, index, sub_handle); - } -no_ir: -#endif - ret = setup_msi_irq(dev, desc, irq); - if (ret < 0) - goto error; - sub_handle++; - } - return 0; - -error: - destroy_irq(irq); - return ret; -} - -void arch_teardown_msi_irq(unsigned int irq) -{ - destroy_irq(irq); -} - -#ifdef CONFIG_DMAR -#ifdef CONFIG_SMP -static void dmar_msi_set_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - struct msi_msg msg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - dmar_msi_read(irq, &msg); - - msg.data &= ~MSI_DATA_VECTOR_MASK; - msg.data |= MSI_DATA_VECTOR(cfg->vector); - msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; - msg.address_lo |= MSI_ADDR_DEST_ID(dest); - - dmar_msi_write(irq, &msg); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif /* CONFIG_SMP */ - -struct irq_chip dmar_msi_type = { - .name = "DMAR_MSI", - .unmask = dmar_msi_unmask, - .mask = dmar_msi_mask, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = dmar_msi_set_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_dmar_msi(unsigned int irq) -{ - int ret; - struct msi_msg msg; - - ret = msi_compose_msg(NULL, irq, &msg); - if (ret < 0) - return ret; - dmar_msi_write(irq, &msg); - set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, - "edge"); - return 0; -} -#endif - -#endif /* CONFIG_PCI_MSI */ -/* - * Hypertransport interrupt support - */ -#ifdef CONFIG_HT_IRQ - -#ifdef CONFIG_SMP - -static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) -{ - struct ht_irq_msg msg; - fetch_ht_irq_msg(irq, &msg); - - msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - - msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - - write_ht_irq_msg(irq, &msg); -} - -static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) -{ - struct irq_cfg *cfg; - unsigned int dest; - cpumask_t tmp; - struct irq_desc *desc; - - cpus_and(tmp, mask, cpu_online_map); - if (cpus_empty(tmp)) - return; - - if (assign_irq_vector(irq, mask)) - return; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, mask); - dest = cpu_mask_to_apicid(tmp); - - target_ht_irq(irq, dest, cfg->vector); - desc = irq_to_desc(irq); - desc->affinity = mask; -} -#endif - -static struct irq_chip ht_irq_chip = { - .name = "PCI-HT", - .mask = mask_ht_irq, - .unmask = unmask_ht_irq, - .ack = ack_apic_edge, -#ifdef CONFIG_SMP - .set_affinity = set_ht_irq_affinity, -#endif - .retrigger = ioapic_retrigger_irq, -}; - -int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) -{ - struct irq_cfg *cfg; - int err; - cpumask_t tmp; - - tmp = TARGET_CPUS; - err = assign_irq_vector(irq, tmp); - if (!err) { - struct ht_irq_msg msg; - unsigned dest; - - cfg = irq_cfg(irq); - cpus_and(tmp, cfg->domain, tmp); - dest = cpu_mask_to_apicid(tmp); - - msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - - msg.address_lo = - HT_IRQ_LOW_BASE | - HT_IRQ_LOW_DEST_ID(dest) | - HT_IRQ_LOW_VECTOR(cfg->vector) | - ((INT_DEST_MODE == 0) ? - HT_IRQ_LOW_DM_PHYSICAL : - HT_IRQ_LOW_DM_LOGICAL) | - HT_IRQ_LOW_RQEOI_EDGE | - ((INT_DELIVERY_MODE != dest_LowestPrio) ? - HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED) | - HT_IRQ_LOW_IRQ_MASKED; - - write_ht_irq_msg(irq, &msg); - - set_irq_chip_and_handler_name(irq, &ht_irq_chip, - handle_edge_irq, "edge"); - } - return err; -} -#endif /* CONFIG_HT_IRQ */ - -/* -------------------------------------------------------------------------- - ACPI-based IOAPIC Configuration - -------------------------------------------------------------------------- */ - -#ifdef CONFIG_ACPI - -#ifdef CONFIG_X86_32 -int __init io_apic_get_unique_id(int ioapic, int apic_id) -{ - union IO_APIC_reg_00 reg_00; - static physid_mask_t apic_id_map = PHYSID_MASK_NONE; - physid_mask_t tmp; - unsigned long flags; - int i = 0; - - /* - * The P4 platform supports up to 256 APIC IDs on two separate APIC - * buses (one for LAPICs, one for IOAPICs), where predecessors only - * supports up to 16 on one shared APIC bus. - * - * TBD: Expand LAPIC/IOAPIC support on P4-class systems to take full - * advantage of new APIC bus architecture. - */ - - if (physids_empty(apic_id_map)) - apic_id_map = ioapic_phys_id_map(phys_cpu_present_map); - - spin_lock_irqsave(&ioapic_lock, flags); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - if (apic_id >= get_physical_broadcast()) { - printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " - "%d\n", ioapic, apic_id, reg_00.bits.ID); - apic_id = reg_00.bits.ID; - } - - /* - * Every APIC in a system must have a unique ID or we get lots of nice - * 'stuck on smp_invalidate_needed IPI wait' messages. - */ - if (check_apicid_used(apic_id_map, apic_id)) { - - for (i = 0; i < get_physical_broadcast(); i++) { - if (!check_apicid_used(apic_id_map, i)) - break; - } - - if (i == get_physical_broadcast()) - panic("Max apic_id exceeded!\n"); - - printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, " - "trying %d\n", ioapic, apic_id, i); - - apic_id = i; - } - - tmp = apicid_to_cpu_present(apic_id); - physids_or(apic_id_map, apic_id_map, tmp); - - if (reg_00.bits.ID != apic_id) { - reg_00.bits.ID = apic_id; - - spin_lock_irqsave(&ioapic_lock, flags); - io_apic_write(ioapic, 0, reg_00.raw); - reg_00.raw = io_apic_read(ioapic, 0); - spin_unlock_irqrestore(&ioapic_lock, flags); - - /* Sanity check */ - if (reg_00.bits.ID != apic_id) { - printk("IOAPIC[%d]: Unable to change apic_id!\n", ioapic); - return -1; - } - } - - apic_printk(APIC_VERBOSE, KERN_INFO - "IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id); - - return apic_id; -} - -int __init io_apic_get_version(int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.version; -} -#endif - -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - -int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) -{ - if (!IO_APIC_IRQ(irq)) { - apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", - ioapic); - return -EINVAL; - } - - /* - * IRQs < 16 are already in the irq_2_pin[] map - */ - if (irq >= 16) - add_pin_to_irq(irq, ioapic, pin); - - setup_IO_APIC_irq(ioapic, pin, irq, triggering, polarity); - - return 0; -} - - -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) -{ - int i; - - if (skip_ioapic_setup) - return -1; - - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].mp_irqtype == mp_INT && - mp_irqs[i].mp_srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) - return -1; - - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); - return 0; -} - -#endif /* CONFIG_ACPI */ - -/* - * This function currently is only a helper for the i386 smp boot process where - * we need to reprogram the ioredtbls to cater for the cpus which have come online - * so mask in all cases should simply be TARGET_CPUS - */ -#ifdef CONFIG_SMP -void __init setup_ioapic_dest(void) -{ - int pin, ioapic, irq, irq_entry; - struct irq_cfg *cfg; - - if (skip_ioapic_setup == 1) - return; - - for (ioapic = 0; ioapic < nr_ioapics; ioapic++) { - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { - irq_entry = find_irq_entry(ioapic, pin, mp_INT); - if (irq_entry == -1) - continue; - irq = pin_2_irq(irq_entry, ioapic, pin); - - /* setup_IO_APIC_irqs could fail to get vector for some device - * when you have too many devices, because at that time only boot - * cpu is online. - */ - cfg = irq_cfg(irq); - if (!cfg->vector) - setup_IO_APIC_irq(ioapic, pin, irq, - irq_trigger(irq_entry), - irq_polarity(irq_entry)); -#ifdef CONFIG_INTR_REMAP - else if (intr_remapping_enabled) - set_ir_ioapic_affinity_irq(irq, TARGET_CPUS); -#endif - else - set_ioapic_affinity_irq(irq, TARGET_CPUS); - } - - } -} -#endif - -#ifdef CONFIG_X86_64 -#define IOAPIC_RESOURCE_NAME_SIZE 11 - -static struct resource *ioapic_resources; - -static struct resource * __init ioapic_setup_resources(void) -{ - unsigned long n; - struct resource *res; - char *mem; - int i; - - if (nr_ioapics <= 0) - return NULL; - - n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); - n *= nr_ioapics; - - mem = alloc_bootmem(n); - res = (void *)mem; - - if (mem != NULL) { - mem += sizeof(struct resource) * nr_ioapics; - - for (i = 0; i < nr_ioapics; i++) { - res[i].name = mem; - res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; - sprintf(mem, "IOAPIC %u", i); - mem += IOAPIC_RESOURCE_NAME_SIZE; - } - } - - ioapic_resources = res; - - return res; -} -#endif - -void __init ioapic_init_mappings(void) -{ - unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; -#ifdef CONFIG_X86_64 - struct resource *ioapic_res; - - ioapic_res = ioapic_setup_resources(); -#endif - for (i = 0; i < nr_ioapics; i++) { - if (smp_found_config) { - ioapic_phys = mp_ioapics[i].mp_apicaddr; -#ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } -#endif - } else { -#ifdef CONFIG_X86_32 -fake_ioapic_page: -#endif - ioapic_phys = (unsigned long) - alloc_bootmem_pages(PAGE_SIZE); - ioapic_phys = __pa(ioapic_phys); - } - set_fixmap_nocache(idx, ioapic_phys); - apic_printk(APIC_VERBOSE, - "mapped IOAPIC to %08lx (%08lx)\n", - __fix_to_virt(idx), ioapic_phys); - idx++; - -#ifdef CONFIG_X86_64 - if (ioapic_res != NULL) { - ioapic_res->start = ioapic_phys; - ioapic_res->end = ioapic_phys + (4 * 1024) - 1; - ioapic_res++; - } -#endif - } -} - -#ifdef CONFIG_X86_64 -static int __init ioapic_insert_resources(void) -{ - int i; - struct resource *r = ioapic_resources; - - if (!r) { - printk(KERN_ERR - "IO APIC resources could be not be allocated.\n"); - return -1; - } - - for (i = 0; i < nr_ioapics; i++) { - insert_resource(&iomem_resource, r); - r++; - } - - return 0; -} - -/* Insert the IO APIC resources after PCI initialization has occured to handle - * IO APICS that are mapped in on a BAR in PCI space. */ -late_initcall(ioapic_insert_resources); -#endif -- cgit v1.2.2 From c691cc84529ec88ccb32b174535bb61875888c90 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:43 -0700 Subject: io_apic: make 32 bit have io_apic resource in /proc/iomem Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index fba6d6ee3480..7e303e0967a4 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3802,7 +3802,6 @@ void __init setup_ioapic_dest(void) } #endif -#ifdef CONFIG_X86_64 #define IOAPIC_RESOURCE_NAME_SIZE 11 static struct resource *ioapic_resources; @@ -3838,17 +3837,14 @@ static struct resource * __init ioapic_setup_resources(void) return res; } -#endif void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; -#ifdef CONFIG_X86_64 struct resource *ioapic_res; ioapic_res = ioapic_setup_resources(); -#endif for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; @@ -3877,17 +3873,14 @@ fake_ioapic_page: __fix_to_virt(idx), ioapic_phys); idx++; -#ifdef CONFIG_X86_64 if (ioapic_res != NULL) { ioapic_res->start = ioapic_phys; ioapic_res->end = ioapic_phys + (4 * 1024) - 1; ioapic_res++; } -#endif } } -#ifdef CONFIG_X86_64 static int __init ioapic_insert_resources(void) { int i; @@ -3910,4 +3903,3 @@ static int __init ioapic_insert_resources(void) /* Insert the IO APIC resources after PCI initialization has occured to handle * IO APICS that are mapped in on a BAR in PCI space. */ late_initcall(ioapic_insert_resources); -#endif -- cgit v1.2.2 From 4e738e2f307113feaedebae147c3e0d072e39648 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:47 -0700 Subject: x86: unify mask_IO_APIC_irq use MACRO for 32 bit too Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 80 +++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 59 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 7e303e0967a4..099696109ca8 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -610,18 +610,7 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } -#ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) -{ - struct io_apic __iomem *io_apic = io_apic_base(apic); - readl(&io_apic->data); -} - -#define __DO_ACTION(R, ACTION, FINAL) \ +#define __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ \ { \ int pin; \ @@ -636,7 +625,8 @@ static inline void io_apic_sync(unsigned int apic) break; \ pin = entry->pin; \ reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION; \ + reg ACTION_DISABLE; \ + reg ACTION_ENABLE; \ io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ FINAL; \ if (!entry->next) \ @@ -645,66 +635,38 @@ static inline void io_apic_sync(unsigned int apic) } \ } -#define DO_ACTION(name,R,ACTION, FINAL) \ +#define DO_ACTION(name,R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ \ static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION, FINAL) - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, io_apic_sync(entry->apic)) + __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) /* mask = 0 */ -DO_ACTION(__unmask, 0, &= ~IO_APIC_REDIR_MASKED, ) +DO_ACTION(__unmask, 0, |= 0, &= ~IO_APIC_REDIR_MASKED, ) -#else - -static void __modify_IO_APIC_irq(unsigned int irq, unsigned long enable, unsigned long disable) +#ifdef CONFIG_X86_64 +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) { - struct irq_cfg *cfg; - struct irq_pin_list *entry; - unsigned int pin, reg; - - cfg = irq_cfg(irq); - entry = cfg->irq_2_pin; - for (;;) { - if (!entry) - break; - pin = entry->pin; - reg = io_apic_read(entry->apic, 0x10 + pin*2); - reg &= ~disable; - reg |= enable; - io_apic_modify(entry->apic, 0x10 + pin*2, reg); - if (!entry->next) - break; - entry = entry->next; - } + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); } /* mask = 1 */ -static void __mask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, 0); -} +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, io_apic_sync(entry->apic)) -/* mask = 0 */ -static void __unmask_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, 0, IO_APIC_REDIR_MASKED); -} +#else + +/* mask = 1 */ +DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, ) /* mask = 1, trigger = 0 */ -static void __mask_and_edge_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_MASKED, - IO_APIC_REDIR_LEVEL_TRIGGER); -} +DO_ACTION(__mask_and_edge, 0, |= IO_APIC_REDIR_MASKED, &= ~IO_APIC_REDIR_LEVEL_TRIGGER, ) /* mask = 0, trigger = 1 */ -static void __unmask_and_level_IO_APIC_irq(unsigned int irq) -{ - __modify_IO_APIC_irq(irq, IO_APIC_REDIR_LEVEL_TRIGGER, - IO_APIC_REDIR_MASKED); -} +DO_ACTION(__unmask_and_level, 0, |= IO_APIC_REDIR_LEVEL_TRIGGER, &= ~IO_APIC_REDIR_MASKED, ) #endif -- cgit v1.2.2 From 3eb2cce84beae8fd41de950569cafd5bca7edd5d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:48 -0700 Subject: x86: unify ack_apic_edge use code in 64 to replace move_native_irq(irq, desc); in 32 bit Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 73 +++++++++++++++++++++++------------------------ 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 099696109ca8..a466b04ad5a4 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -389,7 +389,6 @@ static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned writel(value, &io_apic->data); } -#ifdef CONFIG_X86_64 static bool io_apic_level_ack_pending(unsigned int irq) { struct irq_pin_list *entry; @@ -419,7 +418,6 @@ static bool io_apic_level_ack_pending(unsigned int irq) return false; } -#endif union entry_union { struct { u32 w1, w2; }; @@ -2398,9 +2396,16 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_32 +atomic_t irq_mis_count; +#endif + static void ack_apic_level(unsigned int irq) { +#ifdef CONFIG_X86_32 + unsigned long v; + int i; +#endif int do_unmask_irq = 0; irq_complete_move(irq); @@ -2412,6 +2417,31 @@ static void ack_apic_level(unsigned int irq) } #endif +#ifdef CONFIG_X86_32 + /* + * It appears there is an erratum which affects at least version 0x11 + * of I/O APIC (that's the 82093AA and cores integrated into various + * chipsets). Under certain conditions a level-triggered interrupt is + * erroneously delivered as edge-triggered one but the respective IRR + * bit gets set nevertheless. As a result the I/O unit expects an EOI + * message but it will never arrive and further interrupts are blocked + * from the source. The exact reason is so far unknown, but the + * phenomenon was observed when two consecutive interrupt requests + * from a given source get delivered to the same CPU and the source is + * temporarily disabled in between. + * + * A workaround is to simulate an EOI message manually. We achieve it + * by setting the trigger mode to edge and then to level when the edge + * trigger mode gets detected in the TMR of a local APIC for a + * level-triggered interrupt. We mask the source for the time of the + * operation to prevent an edge-triggered interrupt escaping meanwhile. + * The idea is from Manfred Spraul. --macro + */ + i = irq_cfg(irq)->vector; + + v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); +#endif + /* * We must acknowledge the irq before we move it or the acknowledge will * not propagate properly. @@ -2450,41 +2480,8 @@ static void ack_apic_level(unsigned int irq) move_masked_irq(irq); unmask_IO_APIC_irq(irq); } -} -#else -atomic_t irq_mis_count; -static void ack_apic_level(unsigned int irq) -{ - unsigned long v; - int i; - - irq_complete_move(irq); - move_native_irq(irq); - /* - * It appears there is an erratum which affects at least version 0x11 - * of I/O APIC (that's the 82093AA and cores integrated into various - * chipsets). Under certain conditions a level-triggered interrupt is - * erroneously delivered as edge-triggered one but the respective IRR - * bit gets set nevertheless. As a result the I/O unit expects an EOI - * message but it will never arrive and further interrupts are blocked - * from the source. The exact reason is so far unknown, but the - * phenomenon was observed when two consecutive interrupt requests - * from a given source get delivered to the same CPU and the source is - * temporarily disabled in between. - * - * A workaround is to simulate an EOI message manually. We achieve it - * by setting the trigger mode to edge and then to level when the edge - * trigger mode gets detected in the TMR of a local APIC for a - * level-triggered interrupt. We mask the source for the time of the - * operation to prevent an edge-triggered interrupt escaping meanwhile. - * The idea is from Manfred Spraul. --macro - */ - i = irq_cfg(irq)->vector; - - v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); - - ack_APIC_irq(); +#ifdef CONFIG_X86_32 if (!(v & (1 << (i & 0x1f)))) { atomic_inc(&irq_mis_count); spin_lock(&ioapic_lock); @@ -2492,8 +2489,8 @@ static void ack_apic_level(unsigned int irq) __unmask_and_level_IO_APIC_irq(irq); spin_unlock(&ioapic_lock); } -} #endif +} static struct irq_chip ioapic_chip __read_mostly = { .name = "IO-APIC", -- cgit v1.2.2 From 29ccbbf232c035b8c7ff0c5060fbe30a66ed9b99 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:49 -0700 Subject: x86: remove first_free_entry/pin_map_size no user now Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 7 ------- arch/x86/kernel/setup.c | 4 ---- 2 files changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index a466b04ad5a4..5de2d38812aa 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -72,13 +72,6 @@ int sis_apic_bug = -1; static DEFINE_SPINLOCK(ioapic_lock); static DEFINE_SPINLOCK(vector_lock); -int first_free_entry; -/* - * Rough estimation of how many shared IRQs there are, can - * be changed anytime. - */ -int pin_map_size; - /* * # of IRQ routing registers */ diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 02e3a6697977..d90c659b70e9 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1074,10 +1074,6 @@ void __init setup_arch(char **cmdline_p) nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif -#ifdef CONFIG_X86_IO_APIC - pin_map_size = nr_irqs * 2; - first_free_entry = nr_irqs; -#endif init_apic_mappings(); ioapic_init_mappings(); -- cgit v1.2.2 From ffd5aae7817fba22c5c3e304a31c44fa0a4e9a97 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:50 -0700 Subject: x86: print local APIC of APs one by one instead of print that of all APs at the time Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 5de2d38812aa..bf0e66d73030 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1777,7 +1777,12 @@ __apicdebuginit(void) print_local_APIC(void *dummy) __apicdebuginit(void) print_all_local_APICs(void) { - on_each_cpu(print_local_APIC, NULL, 1); + int cpu; + + preempt_disable(); + for_each_online_cpu(cpu) + smp_call_function_single(cpu, print_local_APIC, NULL, 1); + preempt_enable(); } __apicdebuginit(void) print_PIC(void) -- cgit v1.2.2 From 8f09cd20a24c5f13c571bc73ddcd47be0af3b70f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:51 -0700 Subject: x86: make HAVE_SPARSE_IRQ support selectable Ingo said sparse_irq is some intrusive. need to make it selectable to make it simple, remove irq_desc as parameter in some functions. (ack, eoi, set_affinity). may need to make member if irq_chip to take irq_desc, or struct irq later. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 12 +++++++- arch/x86/kernel/io_apic.c | 71 ++++++++++++++++++++++++++++++++--------------- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 4 files changed, 62 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index b157e94637bf..600584b7a497 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -34,7 +34,6 @@ config X86 select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_DYN_ARRAY - select HAVE_SPARSE_IRQ config ARCH_DEFCONFIG string @@ -238,6 +237,17 @@ config SMP If you don't know what to do here, say N. +config HAVE_SPARSE_IRQ + bool "Support sparse irq numbering" + depends on PCI_MSI || HT_IRQ + default y + help + This enables support for sparse irq, esp for msi/msi-x. the irq + number will be bus/dev/fn + 12bit. You may need if you have lots of + cards supports msi-x installed. + + If you don't know what to do here, say Y. + config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index bf0e66d73030..f853b667fa5c 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,9 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; +#ifdef CONFIG_HAVE_SPARSE_IRQ struct irq_cfg *next; +#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -137,20 +139,6 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { }; static struct irq_cfg irq_cfg_init = { .irq = -1U, }; -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); static void init_one_irq_cfg(struct irq_cfg *cfg) { @@ -158,7 +146,9 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) } static struct irq_cfg *irq_cfgx; +#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; +#endif static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,15 +164,34 @@ static void __init init_work(void *data) for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); +#ifdef CONFIG_HAVE_SPARSE_IRQ for (i = 1; i < *da->nr; i++) cfg[i-1].next = &cfg[i]; irq_cfgx_free = &irq_cfgx[legacy_count]; irq_cfgx[legacy_count - 1].next = NULL; +#endif +} + +#ifdef CONFIG_HAVE_SPARSE_IRQ +/* need to be biger than size of irq_cfg_legacy */ +static int nr_irq_cfg = 32; + +static int __init parse_nr_irq_cfg(char *arg) +{ + if (arg) { + nr_irq_cfg = simple_strtoul(arg, NULL, 0); + if (nr_irq_cfg < 32) + nr_irq_cfg = 32; + } + return 0; } -#define for_each_irq_cfg(cfg) \ - for (cfg = irq_cfgx; cfg; cfg = cfg->next) +early_param("nr_irq_cfg", parse_nr_irq_cfg); + +#define for_each_irq_cfg(irqX, cfg) \ + for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U) + DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); @@ -273,7 +282,26 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) #endif return cfg; } +#else + +#define for_each_irq_cfg(irq, cfg) \ + for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + +DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); +struct irq_cfg *irq_cfg(unsigned int irq) +{ + if (irq < nr_irqs) + return &irq_cfgx[irq]; + + return NULL; +} +struct irq_cfg *irq_cfg_alloc(unsigned int irq) +{ + return irq_cfg(irq); +} + +#endif /* * This is performance-critical, we want to do it O(1) * @@ -1282,11 +1310,10 @@ void __setup_vector_irq(int cpu) struct irq_cfg *cfg; /* Mark the inuse vectors */ - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { if (!cpu_isset(cpu, cfg->domain)) continue; vector = cfg->vector; - irq = cfg->irq; per_cpu(vector_irq, cpu)[vector] = irq; } /* Mark the free vectors */ @@ -1563,6 +1590,7 @@ __apicdebuginit(void) print_IO_APIC(void) union IO_APIC_reg_03 reg_03; unsigned long flags; struct irq_cfg *cfg; + unsigned int irq; if (apic_verbosity == APIC_QUIET) return; @@ -1651,11 +1679,11 @@ __apicdebuginit(void) print_IO_APIC(void) } } printk(KERN_DEBUG "IRQ to pin mappings:\n"); - for_each_irq_cfg(cfg) { + for_each_irq_cfg(irq, cfg) { struct irq_pin_list *entry = cfg->irq_2_pin; if (!entry) continue; - printk(KERN_DEBUG "IRQ%d ", cfg->irq); + printk(KERN_DEBUG "IRQ%d ", irq); for (;;) { printk("-> %d:%d", entry->apic, entry->pin); if (!entry->next) @@ -2535,8 +2563,7 @@ static inline void init_IO_APIC_traps(void) * Also, we've got to be careful not to trash gate * 0x80, because int 0x80 is hm, kind of importantish. ;) */ - for_each_irq_cfg(cfg) { - irq = cfg->irq; + for_each_irq_cfg(irq, cfg) { if (IO_APIC_IRQ(irq) && !cfg->vector) { /* * Hmm.. We don't have an entry for this, diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index cc929f2f84f1..b2e1082cf5ad 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -269,7 +269,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 348a11168c2b..c2fca89a3f7e 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -74,7 +74,7 @@ int show_interrupts(struct seq_file *p, void *v) struct irqaction * action; unsigned long flags; unsigned int entries; - struct irq_desc *desc; + struct irq_desc *desc = NULL; int tail = 0; #ifdef CONFIG_HAVE_SPARSE_IRQ -- cgit v1.2.2 From 9d6a4d0823b3b8e29156f5e698b5a68687afad32 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 19 Aug 2008 20:50:52 -0700 Subject: x86: probe nr_irqs even only mptable is used for !CONFIG_HAVE_SPARSE_IRQ fix: In file included from arch/x86/kernel/early-quirks.c:18: include/asm/io_apic.h: In function 'probe_nr_irqs': include/asm/io_apic.h:209: error: 'NR_IRQS' undeclared (first use in this function) include/asm/io_apic.h:209: error: (Each undeclared identifier is reported only once include/asm/io_apic.h:209: error: for each function it appears in.) v2: fix by Ingo Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 25 ------------------------- arch/x86/kernel/io_apic.c | 43 ++++++++++++++++++++++++++++++------------- arch/x86/kernel/setup.c | 6 +++--- 3 files changed, 33 insertions(+), 41 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 3e9d163fd92f..5fef4fece4a5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -957,29 +957,6 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) nr_ioapics++; } -int get_nr_irqs_via_madt(void) -{ - int idx; - int nr = 0; - - for (idx = 0; idx < nr_ioapics; idx++) { - if (mp_ioapic_routing[idx].gsi_end > nr) - nr = mp_ioapic_routing[idx].gsi_end; - } - - nr++; - - /* double it for hotplug and msi and nmi */ - nr <<= 1; - - /* something wrong ? */ - if (nr < 32) - nr = 32; - - return nr; - -} - static void assign_to_mp_irq(struct mp_config_intsrc *m, struct mp_config_intsrc *mp_irq) { @@ -1278,8 +1255,6 @@ static int __init acpi_parse_madt_ioapic_entries(void) } - nr_irqs = get_nr_irqs_via_madt(); - count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f853b667fa5c..f7e80262cbbb 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3596,6 +3596,36 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ +int __init io_apic_get_redir_entries (int ioapic) +{ + union IO_APIC_reg_01 reg_01; + unsigned long flags; + + spin_lock_irqsave(&ioapic_lock, flags); + reg_01.raw = io_apic_read(ioapic, 1); + spin_unlock_irqrestore(&ioapic_lock, flags); + + return reg_01.bits.entries; +} + +int __init probe_nr_irqs(void) +{ + int idx; + int nr = 0; + + for (idx = 0; idx < nr_ioapics; idx++) + nr += io_apic_get_redir_entries(idx); + + /* double it for hotplug and msi and nmi */ + nr <<= 1; + + /* something wrong ? */ + if (nr < 32) + nr = 32; + + return nr; +} + /* -------------------------------------------------------------------------- ACPI-based IOAPIC Configuration -------------------------------------------------------------------------- */ @@ -3690,19 +3720,6 @@ int __init io_apic_get_version(int ioapic) } #endif -int __init io_apic_get_redir_entries (int ioapic) -{ - union IO_APIC_reg_01 reg_01; - unsigned long flags; - - spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(ioapic, 1); - spin_unlock_irqrestore(&ioapic_lock, flags); - - return reg_01.bits.entries; -} - - int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity) { if (!IO_APIC_IRQ(irq)) { diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index d90c659b70e9..61335306696a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1069,15 +1069,15 @@ void __init setup_arch(char **cmdline_p) prefill_possible_map(); #ifdef CONFIG_X86_64 - /* need to wait for nr_cpu_ids settle down */ - if (nr_irqs == NR_IRQS) - nr_irqs = 32 * nr_cpu_ids + 224; init_cpu_to_node(); #endif init_apic_mappings(); ioapic_init_mappings(); + /* need to wait for io_apic is mapped */ + nr_irqs = probe_nr_irqs(); + kvm_guest_init(); e820_reserve_resources(); -- cgit v1.2.2 From bf9d3cf73e8caf0b3ae0b7f508a9f251536f5ff4 Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 18 Aug 2008 22:17:08 -0700 Subject: xen: Fix bug `do_IRQ: cannot handle IRQ -1 vector 0x6 cpu 1' Following commit 9c3f2468d8339866d9ef6a25aae31a8909c6be0d, do_IRQ() looks up the IRQ number in the per-cpu variable vector_irq. This commit makes Xen initialise an identity vector_irq map for both X86_32 and X86_64. Signed-off-by: Alex Nixon Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/irq.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 28b85ab8422e..bb042608c602 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -21,7 +21,6 @@ void xen_force_evtchn_callback(void) static void __init __xen_init_IRQ(void) { -#ifdef CONFIG_X86_64 int i; /* Create identity vector->irq map */ @@ -31,7 +30,6 @@ static void __init __xen_init_IRQ(void) for_each_possible_cpu(cpu) per_cpu(vector_irq, cpu)[i] = i; } -#endif /* CONFIG_X86_64 */ xen_init_IRQ(); } -- cgit v1.2.2 From 0c425cec64eb0c0d0dd7037c21a25585cbe3636c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 18 Aug 2008 13:04:26 +0200 Subject: warning: fix arch x86 kernel io_apic c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix warning: arch/x86/kernel/io_apic.c: In function ‘print_local_APIC’: arch/x86/kernel/io_apic.c:1786: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘u64’ arch/x86/kernel/io_apic.c:1787: warning: format ‘%08x’ expects type ‘unsigned int’, but argument 2 has type ‘u64’ By creating uniform behavior on 32-bit and 64-bit and printing out the ICR value in two 32-bit words. Code has changed: text data bss dec hex filename 22901 19650 17040 59591 e8c7 io_apic.o.before 22899 19650 17040 59589 e8c5 io_apic.o.after Due to the 32-bit cast narrowing the printed out value on 64-bit. Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f7e80262cbbb..34c74cf5c244 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1774,8 +1774,8 @@ __apicdebuginit(void) print_local_APIC(void *dummy) } icr = apic_icr_read(); - printk(KERN_DEBUG "... APIC ICR: %08x\n", icr); - printk(KERN_DEBUG "... APIC ICR2: %08x\n", icr >> 32); + printk(KERN_DEBUG "... APIC ICR: %08x\n", (u32)icr); + printk(KERN_DEBUG "... APIC ICR2: %08x\n", (u32)(icr >> 32)); v = apic_read(APIC_LVTT); printk(KERN_DEBUG "... APIC LVTT: %08x\n", v); -- cgit v1.2.2 From e89eb43863c2d9f11a3bbe766766fe646e6c50d9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 20 Aug 2008 20:46:25 -0700 Subject: x86: sparse_irq needs spin_lock in allocations Suresh Siddha noticed that we should have a spinlock around it. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 34c74cf5c244..7ca556690474 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -146,6 +146,12 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) } static struct irq_cfg *irq_cfgx; + +/* + * Protect the irq_cfgx_free freelist: + */ +static DEFINE_SPINLOCK(irq_cfg_lock); + #ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; #endif @@ -213,8 +219,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq) static struct irq_cfg *irq_cfg_alloc(unsigned int irq) { struct irq_cfg *cfg, *cfg_pri; - int i; + unsigned long flags; int count = 0; + int i; cfg_pri = cfg = irq_cfgx; while (cfg) { @@ -226,6 +233,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) count++; } + spin_lock_irqsave(&irq_cfg_lock, flags); if (!irq_cfgx_free) { unsigned long phys; unsigned long total_bytes; @@ -263,6 +271,9 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) else irq_cfgx = cfg; cfg->irq = irq; + + spin_unlock_irqrestore(&irq_cfg_lock, flags); + printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); #ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG { -- cgit v1.2.2 From a2d332fa3445160519de03c350a59602ac1c3df9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 12:56:32 -0700 Subject: x86: fix 32-bit ioapic lockup with sparseirqs Missed two lines when copying. Fix panic on one of Ingo's machines that need to adjust ioapic id when acpi off/ 32bit. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 7ca556690474..4e44fd1f466e 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2077,6 +2077,8 @@ static void __init setup_ioapic_ids_from_mpc(void) reg_00.bits.ID = mp_ioapics[apic].mp_apicid; spin_lock_irqsave(&ioapic_lock, flags); + io_apic_write(apic, 0, reg_00.raw); + spin_unlock_irqrestore(&ioapic_lock, flags); /* * Sanity check -- cgit v1.2.2 From 052c0bff9b83a578654dfa513d6e3d0b3795f1e8 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 21 Aug 2008 13:10:09 -0700 Subject: x86: fix probe_nr_irqs for xen otherwise Xen is _completely_ unusable with 5 or more VCPUs. (when !CONFIG_HAVE_SPARSE_IRQ). based on Alex Nixon's patch. also add +1 offset after redir_entries Signed-off-by: Yinghai Lu Acked-by: Jeremy Fitzhardinge Acked-by: Alex Nixon Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4e44fd1f466e..d28128e0392c 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3625,16 +3625,21 @@ int __init probe_nr_irqs(void) { int idx; int nr = 0; +#ifndef CONFIG_XEN + int nr_min = 32; +#else + int nr_min = NR_IRQS; +#endif for (idx = 0; idx < nr_ioapics; idx++) - nr += io_apic_get_redir_entries(idx); + nr += io_apic_get_redir_entries(idx) + 1; /* double it for hotplug and msi and nmi */ nr <<= 1; /* something wrong ? */ - if (nr < 32) - nr = 32; + if (nr < nr_min) + nr = nr_min; return nr; } -- cgit v1.2.2 From 2699574b3c04351f5a23c8a842e17c303d7ebb6f Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Thu, 21 Aug 2008 11:26:43 -0700 Subject: x86: VMI, initialize IRQ vector Initialize vector_irq for the vmi used vector, to point to correct irq. Signed-off-by: Alok N Kataria Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmiclock_32.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c index 6953859fe289..254ee07f8635 100644 --- a/arch/x86/kernel/vmiclock_32.c +++ b/arch/x86/kernel/vmiclock_32.c @@ -235,11 +235,14 @@ static void __devinit vmi_time_init_clockevent(void) void __init vmi_time_init(void) { + unsigned int cpu; /* Disable PIT: BIOSes start PIT CH0 with 18.2hz peridic. */ outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */ vmi_time_init_clockevent(); setup_irq(0, &vmi_clock_action); + for_each_possible_cpu(cpu) + per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0; } #ifdef CONFIG_X86_LOCAL_APIC -- cgit v1.2.2 From db4b5525caafd846ec20f95afbc6403c792e22cf Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:39 -0700 Subject: x86: apic_64.c - setup_APIC_timer has to be __cpuinit function There is no need to hold this code if CPU_HOTPLUG is not defined. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index cd860856a0b7..8f551276681e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -400,7 +400,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); -- cgit v1.2.2 From 7c37e48b5125fdb0acac846a0a3af42806175d44 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:40 -0700 Subject: x86: apic - introduce get_physical_broadcast for 64bit We don't really use it now on 64bit mode but could reserve it for future. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 8f551276681e..bb46711a0b1e 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -244,6 +244,16 @@ void __cpuinit enable_NMI_through_LVT0(void) apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + /** * lapic_get_maxlvt - get the maximum number of local vector table entries */ -- cgit v1.2.2 From 920fa7a507c3b1004a9ebe07a2c9d38605b3406a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:41 -0700 Subject: x86: apic - unify setup_apicpmtimer Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 10 ++++++++++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 12 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 21c831d96af3..df6ed603e547 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1774,6 +1774,16 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); +#ifdef CONFIG_X86_64 +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + static int __init apic_set_verbosity(char *arg) { if (!arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index bb46711a0b1e..ddc5b245faf2 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1810,6 +1810,7 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); +#ifdef CONFIG_X86_64 static __init int setup_apicpmtimer(char *s) { apic_calibrate_pmtmr = 1; @@ -1817,6 +1818,7 @@ static __init int setup_apicpmtimer(char *s) return 0; } __setup("apicpmtimer", setup_apicpmtimer); +#endif static int __init apic_set_verbosity(char *arg) { -- cgit v1.2.2 From 80e5609cabd7e4321769701a70297f819a15b08d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:42 -0700 Subject: x86: apic_64.c - add sanity check for spurious vector definition Do not check for SPUTIOUS_APIC_VECTOR definition twice. Check it once - is what we need. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ddc5b245faf2..4587e16f73ec 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -46,6 +46,13 @@ #include #include +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; static int apic_calibrate_pmtmr __initdata; @@ -939,8 +946,6 @@ void __cpuinit setup_local_APIC(void) preempt_disable(); value = apic_read(APIC_LVR); - BUILD_BUG_ON((SPURIOUS_APIC_VECTOR & 0x0f) != 0x0f); - /* * Double-check whether this APIC is really registered. * This is meaningless in clustered apic mode, so we skip it. -- cgit v1.2.2 From 89c38c2867ebe37c4c5aee23e7fa1bffb025b171 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:43 -0700 Subject: x86: apic - unify setup_local_APIC - remove useless read of APIC_LVR - wrap with preempt_disable/enable - check for integrated APIC just in place v2: fix by Yinghai Lu. fix lapic_is_integrated using let 64-bit too have pic_mode Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 21 ++++++++++++++----- arch/x86/kernel/apic_64.c | 51 ++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 62 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index df6ed603e547..f8c1251984e2 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1033,9 +1033,10 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned long value, integrated; + unsigned int value; int i, j; +#ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ if (esr_disable) { apic_write(APIC_ESR, 0); @@ -1043,14 +1044,16 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); } +#endif - integrated = lapic_is_integrated(); + preempt_disable(); /* * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. */ if (!apic_id_registered()) - WARN_ON_ONCE(1); + BUG(); /* * Intel recommends to set DFR, LDR and TPR before enabling @@ -1096,6 +1099,7 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; +#ifdef CONFIG_X86_32 /* * Some unknown Intel IO/APIC (or APIC) errata is biting us with * certain networking cards. If high frequency interrupts are @@ -1116,8 +1120,13 @@ void __cpuinit setup_local_APIC(void) * See also the comment in end_level_ioapic_irq(). --macro */ - /* Enable focus processor (bit==0) */ + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector @@ -1154,9 +1163,11 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; - if (!integrated) /* 82489DX */ + if (!lapic_is_integrated()) /* 82489DX */ value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); + + preempt_enable(); } void __cpuinit end_local_APIC_setup(void) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 4587e16f73ec..283968d4e024 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -76,6 +76,8 @@ char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; */ unsigned int apic_verbosity; +int pic_mode; + /* Have we found an MP table */ int smp_found_config; @@ -943,8 +945,17 @@ void __cpuinit setup_local_APIC(void) unsigned int value; int i, j; +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + preempt_disable(); - value = apic_read(APIC_LVR); /* * Double-check whether this APIC is really registered. @@ -997,7 +1008,34 @@ void __cpuinit setup_local_APIC(void) */ value |= APIC_SPIV_APIC_ENABLED; - /* We always use processor focus */ +#ifdef CONFIG_X86_32 + /* + * Some unknown Intel IO/APIC (or APIC) errata is biting us with + * certain networking cards. If high frequency interrupts are + * happening on a particular IOAPIC pin, plus the IOAPIC routing + * entry is masked/unmasked at a high rate as well then sooner or + * later IOAPIC line gets 'stuck', no more interrupts are received + * from the device. If focus CPU is disabled then the hang goes + * away, oh well :-( + * + * [ This bug can be reproduced easily with a level-triggered + * PCI Ne2000 networking cards and PII/PIII processors, dual + * BX chipset. ] + */ + /* + * Actually disabling the focus CPU check just makes the hang less + * frequent as it makes the interrupt distributon model be more + * like LRU than MRU (the short-term load is more even across CPUs). + * See also the comment in end_level_ioapic_irq(). --macro + */ + + /* + * - enable focus processor (bit==0) + * - 64bit mode always use processor focus + * so no need to set it + */ + value &= ~APIC_SPIV_FOCUS_DISABLED; +#endif /* * Set spurious IRQ vector @@ -1016,14 +1054,14 @@ void __cpuinit setup_local_APIC(void) * TODO: set up through-local-APIC from through-I/O-APIC? --macro */ value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; - if (!smp_processor_id() && !value) { + if (!smp_processor_id() && (pic_mode || !value)) { value = APIC_DM_EXTINT; apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", - smp_processor_id()); + smp_processor_id()); } else { value = APIC_DM_EXTINT | APIC_LVT_MASKED; apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", - smp_processor_id()); + smp_processor_id()); } apic_write(APIC_LVT0, value); @@ -1034,7 +1072,10 @@ void __cpuinit setup_local_APIC(void) value = APIC_DM_NMI; else value = APIC_DM_NMI | APIC_LVT_MASKED; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; apic_write(APIC_LVT1, value); + preempt_enable(); } -- cgit v1.2.2 From 457cc52d4670bcf1470606a108bbf35aac28eb7f Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:44 -0700 Subject: x86: apic_32.c should use __cpuinit section All callers are __init or __cpuinit so there is no need to hold this code without CPU_HOTPLUG being set. Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index f8c1251984e2..4ca3717b3026 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -383,7 +383,7 @@ static void lapic_timer_broadcast(cpumask_t mask) * Setup the local APIC timer for this CPU. Copy the initilized values * of the boot CPU and register the clock event in the framework. */ -static void __devinit setup_APIC_timer(void) +static void __cpuinit setup_APIC_timer(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); @@ -652,7 +652,7 @@ void __init setup_boot_APIC_clock(void) setup_APIC_timer(); } -void __devinit setup_secondary_APIC_clock(void) +void __cpuinit setup_secondary_APIC_clock(void) { setup_APIC_timer(); } @@ -1713,7 +1713,7 @@ static struct sys_device device_lapic = { .cls = &lapic_sysclass, }; -static void __devinit apic_pm_activate(void) +static void __cpuinit apic_pm_activate(void) { apic_pm_state.active = 1; } -- cgit v1.2.2 From 6460bc73aac970135104a0bc407c2c8b85394d59 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 24 Aug 2008 02:01:45 -0700 Subject: x86: apic - unify smp_apic_timer_interrupt Signed-off-by: Cyrill Gorcunov Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 3 +++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 5 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 4ca3717b3026..913d9924b101 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -718,6 +718,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 283968d4e024..2e109119f647 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -625,7 +625,9 @@ void smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); local_apic_timer_interrupt(); irq_exit(); -- cgit v1.2.2 From b3c5117050e8028d48b2fa0ea09c7a50dd7f3414 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:46 -0700 Subject: x86: apic_xx.c order variables Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 46 +++++++++++++++++++++++---------------------- arch/x86/kernel/apic_64.c | 48 +++++++++++++++++++++++++++++++---------------- 2 files changed, 56 insertions(+), 38 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 913d9924b101..a54512bea629 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -50,16 +50,37 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -unsigned long mp_lapic_addr; - +#ifdef CONFIG_X86_32 /* * Knob to control our willingness to enable the local APIC. * * +1=force-enable */ static int force_enable_local_apic; -int disable_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +#endif +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ @@ -1742,15 +1763,6 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); static int __init setup_disableapic(char *arg) { @@ -1788,16 +1800,6 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); -#ifdef CONFIG_X86_64 -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - static int __init apic_set_verbosity(char *arg) { if (!arg) { diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 2e109119f647..c40a900e155b 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -53,16 +53,44 @@ # error SPURIOUS_APIC_VECTOR definition error #endif -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; +#ifdef CONFIG_X86_32 +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +#endif + +#ifdef CONFIG_X86_64 static int apic_calibrate_pmtmr __initdata; -int disable_apic; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + int disable_x2apic; int x2apic; - /* x2apic enabled before OS handover */ int x2apic_preenabled; +unsigned long mp_lapic_addr; +int disable_apic; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __cpuinitdata; /* Local APIC timer works in C2 */ int local_apic_timer_c2_ok; EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); @@ -113,8 +141,6 @@ static DEFINE_PER_CPU(struct clock_event_device, lapic_events); static unsigned long apic_phys; -unsigned long mp_lapic_addr; - /* * Get the LAPIC version */ @@ -1858,16 +1884,6 @@ static int __init parse_nolapic_timer(char *arg) } early_param("nolapic_timer", parse_nolapic_timer); -#ifdef CONFIG_X86_64 -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - static int __init apic_set_verbosity(char *arg) { if (!arg) { -- cgit v1.2.2 From 49899eacce79ce39faf531dad3e00f771eba2eb1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:47 -0700 Subject: x86: use HAVE_X2APIC in apic_64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_64.c | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index c40a900e155b..d3ec746aede4 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -82,10 +82,23 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif -int disable_x2apic; +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC int x2apic; /* x2apic enabled before OS handover */ int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif unsigned long mp_lapic_addr; int disable_apic; @@ -228,6 +241,7 @@ static struct apic_ops xapic_ops = { struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +#ifdef HAVE_X2APIC static void x2apic_wait_icr_idle(void) { /* no need to wait for icr idle in x2apic */ @@ -261,6 +275,7 @@ static struct apic_ops x2apic_ops = { .wait_icr_idle = x2apic_wait_icr_idle, .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, }; +#endif /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 @@ -1125,6 +1140,7 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef HAVE_X2APIC void check_x2apic(void) { int msr, msr2; @@ -1243,6 +1259,7 @@ end: return; } +#endif /* HAVE_X2APIC */ /* * Detect and enable local APICs on non-SMP boards. @@ -1291,10 +1308,12 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC if (x2apic) { boot_cpu_physical_apicid = read_apic_id(); return; } +#endif /* * If no local APIC can be found then set up a fake all @@ -1335,8 +1354,9 @@ int __init APIC_init_uniprocessor(void) printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; } - +#ifdef HAVE_X2APIC enable_IR_x2apic(); +#endif setup_apic_routing(); verify_local_APIC(); @@ -1672,7 +1692,7 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_64 +#ifdef HAVE_X2APIC if (x2apic) enable_x2apic(); else @@ -1836,15 +1856,6 @@ __cpuinit int apic_is_clustered_box(void) return (clusters > 2); } -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); - - /* * APIC command line parameters */ -- cgit v1.2.2 From 3491998dd54f6d4ef7344518fe5463b299fdf537 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:48 -0700 Subject: x86: add hard_smp_prossor_id with MACRO in io_apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 7 +++++++ arch/x86/kernel/apic_64.c | 2 ++ 2 files changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index a54512bea629..93718ffb9b9c 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1600,6 +1600,13 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + /* * Power management */ diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index d3ec746aede4..eeb69838c2f8 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1612,10 +1612,12 @@ void __cpuinit generic_processor_info(int apicid, int version) cpu_set(cpu, cpu_present_map); } +#ifdef CONFIG_X86_64 int hard_smp_processor_id(void) { return read_apic_id(); } +#endif /* * Power management -- cgit v1.2.2 From f28c0ae21d80ffd6eb0987901c5273843387e341 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:49 -0700 Subject: x86: make apic_32/64.c more like except x2apic, detec_init_APIC, and calibrating_APIC_clock Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 125 +++++++++++++++++++++++++++++++++++++++++----- arch/x86/kernel/apic_64.c | 10 +++- 2 files changed, 122 insertions(+), 13 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 93718ffb9b9c..bfac26a16099 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -66,6 +66,9 @@ static int __init parse_lapic(char *arg) return 0; } early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + #endif #ifdef CONFIG_X86_64 @@ -131,9 +134,6 @@ static struct clock_event_device lapic_clockevent = { }; static DEFINE_PER_CPU(struct clock_event_device, lapic_events); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - static unsigned long apic_phys; /* @@ -240,6 +240,7 @@ void __cpuinit enable_NMI_through_LVT0(void) apic_write(APIC_LVT0, v); } +#ifdef CONFIG_X86_32 /** * get_physical_broadcast - Get number of physical broadcast IDs */ @@ -247,6 +248,7 @@ int get_physical_broadcast(void) { return modern_apic() ? 0xff : 0xf; } +#endif /** * lapic_get_maxlvt - get the maximum number of local vector table entries @@ -1291,6 +1293,32 @@ no_apic: return -1; } +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif + /** * init_apic_mappings - initialize APIC mappings */ @@ -1308,8 +1336,8 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - printk(KERN_DEBUG "mapped APIC to %08lx (%08lx)\n", APIC_BASE, - apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); /* * Fetch the APIC ID of the BSP in case we have a @@ -1317,14 +1345,12 @@ void __init init_apic_mappings(void) */ if (boot_cpu_physical_apicid == -1U) boot_cpu_physical_apicid = read_apic_id(); - } /* * This initializes the IO-APIC and APIC hardware if this is * a UP kernel. */ - int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) @@ -1682,11 +1708,6 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); -#ifdef CONFIG_X86_64 - if (x2apic) - enable_x2apic(); - else -#endif { /* * Make sure the APICBASE points to the right address @@ -1770,7 +1791,87 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__cpuinit int apic_is_clustered_box(void) +{ + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} +#endif + +/* + * APIC command line parameters + */ static int __init setup_disableapic(char *arg) { disable_apic = 1; diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index eeb69838c2f8..dca6c246e731 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -69,6 +69,9 @@ static int __init parse_lapic(char *arg) return 0; } early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + #endif #ifdef CONFIG_X86_64 @@ -1279,6 +1282,7 @@ static int __init detect_init_APIC(void) return 0; } +#ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) { unsigned long phys_addr; @@ -1302,6 +1306,7 @@ void __init early_init_lapic_mapping(void) */ boot_cpu_physical_apicid = read_apic_id(); } +#endif /** * init_apic_mappings - initialize APIC mappings @@ -1334,7 +1339,8 @@ void __init init_apic_mappings(void) * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). */ - boot_cpu_physical_apicid = read_apic_id(); + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); } /* @@ -1782,6 +1788,7 @@ static void apic_pm_activate(void) { } #endif /* CONFIG_PM */ +#ifdef CONFIG_X86_64 /* * apic_is_clustered_box() -- Check if we can expect good TSC * @@ -1857,6 +1864,7 @@ __cpuinit int apic_is_clustered_box(void) */ return (clusters > 2); } +#endif /* * APIC command line parameters -- cgit v1.2.2 From fa2bd35a8d5c88c03b638c72daf7f38a132d0e8c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:50 -0700 Subject: x86: merge APIC_init_uniprocessor Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 51 +++++++++++++++++++++++++++++++++++++++++------ arch/x86/kernel/apic_64.c | 48 +++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 90 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index bfac26a16099..8f8b0e1f3eb3 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1355,6 +1355,17 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else if (!smp_found_config && !cpu_has_apic) return -1; @@ -1368,34 +1379,62 @@ int __init APIC_init_uniprocessor(void) clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; } +#endif +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif verify_local_APIC(); - connect_bsp_APIC(); +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else /* * Hack: In case of kdump, after a crash, kernel might be booting * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid * might be zero if read from MP tables. Get it from LAPIC. */ -#ifdef CONFIG_CRASH_DUMP +# ifdef CONFIG_CRASH_DUMP boot_cpu_physical_apicid = read_apic_id(); +# endif #endif physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + #ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) #endif localise_nmi_watchdog(); end_local_APIC_setup(); + #ifdef CONFIG_X86_IO_APIC - if (smp_found_config) - if (!skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif #endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else setup_boot_clock(); +#endif return 0; } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index dca6c246e731..16a30c22b07c 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1351,6 +1351,7 @@ int apic_version[MAX_APICS]; int __init APIC_init_uniprocessor(void) { +#ifdef CONFIG_X86_64 if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); return -1; @@ -1360,37 +1361,78 @@ int __init APIC_init_uniprocessor(void) printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return -1; + } +#endif + #ifdef HAVE_X2APIC enable_IR_x2apic(); #endif +#ifdef CONFIG_X86_64 setup_apic_routing(); +#endif verify_local_APIC(); - connect_bsp_APIC(); - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); +#ifdef CONFIG_X86_64 apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); - +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); setup_local_APIC(); +#ifdef CONFIG_X86_64 /* * Now enable IO-APICs, actually call clear_IO_APIC * We need clear_IO_APIC before enabling vector on BP */ if (!skip_ioapic_setup && nr_ioapics) enable_IO_APIC(); +#endif +#ifdef CONFIG_X86_IO_APIC if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +#endif localise_nmi_watchdog(); end_local_APIC_setup(); +#ifdef CONFIG_X86_IO_APIC if (smp_found_config && !skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); +# ifdef CONFIG_X86_64 else nr_ioapics = 0; +# endif +#endif + +#ifdef CONFIG_X86_64 setup_boot_APIC_clock(); check_nmi_watchdog(); +#else + setup_boot_clock(); +#endif + return 0; } -- cgit v1.2.2 From be7a656fe131cba088912bcafb079b029320504d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:51 -0700 Subject: x86: copy detect_init_APIC to the other Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 20 ++++++++++++ arch/x86/kernel/apic_64.c | 82 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 101 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 8f8b0e1f3eb3..1103e05aa1ba 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1214,6 +1214,25 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef CONFIG_X86_64 +/* + * Detect and enable local APICs on non-SMP boards. + * Original code written by Keir Fraser. + * On AMD64 we trust the BIOS - if it says no APIC it is likely + * not correctly set up (usually the APIC timer won't work etc.) + */ +static int __init detect_init_APIC(void) +{ + if (!cpu_has_apic) { + printk(KERN_INFO "No local APIC present\n"); + return -1; + } + + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + boot_cpu_physical_apicid = 0; + return 0; +} +#else /* * Detect and initialize APIC */ @@ -1292,6 +1311,7 @@ no_apic: printk(KERN_INFO "No local APIC present or hardware disabled\n"); return -1; } +#endif #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index 16a30c22b07c..b7268f5c8b18 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -684,7 +684,6 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } - /* * Local APIC start and shutdown */ @@ -1264,6 +1263,7 @@ end: } #endif /* HAVE_X2APIC */ +#ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. @@ -1281,6 +1281,86 @@ static int __init detect_init_APIC(void) boot_cpu_physical_apicid = 0; return 0; } +#else +/* + * Detect and initialize APIC + */ +static int __init detect_init_APIC(void) +{ + u32 h, l, features; + + /* Disabled by kernel option? */ + if (disable_apic) + return -1; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model > 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + printk(KERN_INFO "Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk(KERN_WARNING "Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + printk(KERN_INFO "Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk(KERN_INFO "No local APIC present or hardware disabled\n"); + return -1; +} +#endif #ifdef CONFIG_X86_64 void __init early_init_lapic_mapping(void) -- cgit v1.2.2 From 773763df7de881e65ff2600c024c9ce2dde64750 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:52 -0700 Subject: x86: merge header files in apic_xx.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 8 ++++++++ arch/x86/kernel/apic_64.c | 7 ++++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 1103e05aa1ba..0ec8321e687c 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -23,11 +23,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -36,8 +38,14 @@ #include #include #include +#include #include #include +#include +#include +#include +#include +#include #include #include diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index b7268f5c8b18..ebe417b4d7fc 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -24,9 +24,11 @@ #include #include #include +#include #include #include #include +#include #include #include @@ -34,8 +36,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -43,8 +47,9 @@ #include #include -#include #include +#include +#include /* * Sanity check -- cgit v1.2.2 From dc1528dd864a0b79fa67b60b3ca5674fe94fdce5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:53 -0700 Subject: x86: apic unify smp_spurious/error_interrupt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 24 +++++++++++++++++++++--- arch/x86/kernel/apic_64.c | 24 ++++++++++++++++++++++-- 2 files changed, 43 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 0ec8321e687c..60a901b2d4fe 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -1474,10 +1474,17 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_spurious_interrupt(void) +#else void smp_spurious_interrupt(struct pt_regs *regs) +#endif { - unsigned long v; + u32 v; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1488,20 +1495,31 @@ void smp_spurious_interrupt(struct pt_regs *regs) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else /* see sw-dev-man vol 3, chapter 7.4.13.5 */ printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " "should never happen.\n", smp_processor_id()); __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_error_interrupt(void) +#else void smp_error_interrupt(struct pt_regs *regs) +#endif { - unsigned long v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 + exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); @@ -1520,7 +1538,7 @@ void smp_error_interrupt(struct pt_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk(KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", smp_processor_id(), v , v1); irq_exit(); } diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index ebe417b4d7fc..c728885e4f4a 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -1528,10 +1528,17 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 asmlinkage void smp_spurious_interrupt(void) +#else +void smp_spurious_interrupt(struct pt_regs *regs) +#endif { - unsigned int v; + u32 v; + +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); /* * Check if this really is a spurious interrupt and ACK it @@ -1542,18 +1549,31 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); +#ifdef CONFIG_X86_64 add_pda(irq_spurious_count, 1); +#else + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); + __get_cpu_var(irq_stat).irq_spurious_count++; +#endif irq_exit(); } /* * This interrupt should never happen with our APIC/SMP architecture */ +#ifdef CONFIG_X86_64 asmlinkage void smp_error_interrupt(void) +#else +void smp_error_interrupt(struct pt_regs *regs) +#endif { - unsigned int v, v1; + u32 v, v1; +#ifdef CONFIG_X86_64 exit_idle(); +#endif irq_enter(); /* First tickle the hardware, only then report what went on. -- REW */ v = apic_read(APIC_ESR); -- cgit v1.2.2 From 2f04fa888d270951b9e0fe9e641ddd560d77ad1b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:54 -0700 Subject: x86: apic copy calibrate_APIC_clock to each other in apic_32/64.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 86 +++++++++++++++++++ arch/x86/kernel/apic_64.c | 215 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 301 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 60a901b2d4fe..05498c37f8d9 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -424,6 +424,90 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } +#ifdef CONFIG_X86_64 +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (result * APIC_DIVISOR) / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; +} + +#else /* * In this functions we calibrate APIC bus clocks to the external timer. * @@ -635,6 +719,8 @@ static int __init calibrate_APIC_clock(void) return 0; } +#endif + /* * Setup the boot APIC * diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c index c728885e4f4a..6e99af5ce678 100644 --- a/arch/x86/kernel/apic_64.c +++ b/arch/x86/kernel/apic_64.c @@ -478,6 +478,7 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } +#ifdef CONFIG_X86_64 /* * In this function we calibrate APIC bus clocks to the external * timer. Unfortunately we cannot use jiffies and the timer irq @@ -560,6 +561,220 @@ static int __init calibrate_APIC_clock(void) return 0; } +#else +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } else + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +#endif + /* * Setup the boot APIC * -- cgit v1.2.2 From 6c15822752a13748241e1a34068f2b15b660d28e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:55 -0700 Subject: x86: apic copy apic_64.c to apic_32.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic_32.c | 188 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 188 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c index 05498c37f8d9..0b11d6e3f091 100644 --- a/arch/x86/kernel/apic_32.c +++ b/arch/x86/kernel/apic_32.c @@ -90,6 +90,24 @@ static __init int setup_apicpmtimer(char *s) __setup("apicpmtimer", setup_apicpmtimer); #endif +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + unsigned long mp_lapic_addr; int disable_apic; /* Disable local APIC timer from the kernel commandline or via dmi quirk */ @@ -231,6 +249,42 @@ static struct apic_ops xapic_ops = { struct apic_ops __read_mostly *apic_ops = &xapic_ops; EXPORT_SYMBOL_GPL(apic_ops); +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + /** * enable_NMI_through_LVT0 - enable NMI through local vector table 0 */ @@ -1308,6 +1362,127 @@ void __cpuinit end_local_APIC_setup(void) apic_pm_activate(); } +#ifdef HAVE_X2APIC +void check_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + + if (msr & X2APIC_ENABLE) { + printk("x2apic enabled by BIOS, switching to x2apic ops\n"); + x2apic_preenabled = x2apic = 1; + apic_ops = &x2apic_ops; + } +} + +void enable_x2apic(void) +{ + int msr, msr2; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (!(msr & X2APIC_ENABLE)) { + printk("Enabling x2apic\n"); + wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); + } +} + +void enable_IR_x2apic(void) +{ +#ifdef CONFIG_INTR_REMAP + int ret; + unsigned long flags; + + if (!cpu_has_x2apic) + return; + + if (!x2apic_preenabled && disable_x2apic) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of nox2apic\n"); + return; + } + + if (x2apic_preenabled && disable_x2apic) + panic("Bios already enabled x2apic, can't enforce nox2apic"); + + if (!x2apic_preenabled && skip_ioapic_setup) { + printk(KERN_INFO + "Skipped enabling x2apic and Interrupt-remapping " + "because of skipping io-apic setup\n"); + return; + } + + ret = dmar_table_init(); + if (ret) { + printk(KERN_INFO + "dmar_table_init() failed with %d:\n", ret); + + if (x2apic_preenabled) + panic("x2apic enabled by bios. But IR enabling failed"); + else + printk(KERN_INFO + "Not enabling x2apic,Intr-remapping\n"); + return; + } + + local_irq_save(flags); + mask_8259A(); + save_mask_IO_APIC_setup(); + + ret = enable_intr_remapping(1); + + if (ret && x2apic_preenabled) { + local_irq_restore(flags); + panic("x2apic enabled by bios. But IR enabling failed"); + } + + if (ret) + goto end; + + if (!x2apic) { + x2apic = 1; + apic_ops = &x2apic_ops; + enable_x2apic(); + } +end: + if (ret) + /* + * IR enabling failed + */ + restore_IO_APIC_setup(); + else + reinit_intr_remapped_IO_APIC(x2apic_preenabled); + + unmask_8259A(); + local_irq_restore(flags); + + if (!ret) { + if (!x2apic_preenabled) + printk(KERN_INFO + "Enabled x2apic and interrupt-remapping\n"); + else + printk(KERN_INFO + "Enabled Interrupt-remapping\n"); + } else + printk(KERN_ERR + "Failed to enable Interrupt-remapping and x2apic\n"); +#else + if (!cpu_has_x2apic) + return; + + if (x2apic_preenabled) + panic("x2apic enabled prior OS handover," + " enable CONFIG_INTR_REMAP"); + + printk(KERN_INFO "Enable CONFIG_INTR_REMAP for enabling intr-remapping " + " and x2apic\n"); +#endif + + return; +} +#endif /* HAVE_X2APIC */ + #ifdef CONFIG_X86_64 /* * Detect and enable local APICs on non-SMP boards. @@ -1438,6 +1613,13 @@ void __init early_init_lapic_mapping(void) */ void __init init_apic_mappings(void) { +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + /* * If no local APIC can be found then set up a fake all * zeroes page to simulate the local APIC and another @@ -1501,6 +1683,7 @@ int __init APIC_init_uniprocessor(void) #ifdef CONFIG_X86_64 setup_apic_routing(); #endif + verify_local_APIC(); connect_bsp_APIC(); @@ -1879,6 +2062,11 @@ static int lapic_resume(struct sys_device *dev) local_irq_save(flags); +#ifdef HAVE_X2APIC + if (x2apic) + enable_x2apic(); + else +#endif { /* * Make sure the APICBASE points to the right address -- cgit v1.2.2 From 0f611ffaea81b8e1c69682188ba1ccaf7683a2ba Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 02:01:56 -0700 Subject: x86: rename apic_32.c and apic_64.c to apic.c Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/apic.c | 2311 ++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/apic_32.c | 2312 --------------------------------------------- arch/x86/kernel/apic_64.c | 2311 -------------------------------------------- 4 files changed, 2312 insertions(+), 4624 deletions(-) create mode 100644 arch/x86/kernel/apic.c delete mode 100644 arch/x86/kernel/apic_32.c delete mode 100644 arch/x86/kernel/apic_64.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 7264f9c3af8f..45cecc59d894 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -60,7 +60,7 @@ obj-$(CONFIG_X86_32_SMP) += smpcommon.o obj-$(CONFIG_X86_64_SMP) += tsc_sync.o smpcommon.o obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o obj-$(CONFIG_X86_MPPARSE) += mpparse.o -obj-$(CONFIG_X86_LOCAL_APIC) += apic_$(BITS).o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c new file mode 100644 index 000000000000..6e99af5ce678 --- /dev/null +++ b/arch/x86/kernel/apic.c @@ -0,0 +1,2311 @@ +/* + * Local APIC handling, local APIC timers + * + * (c) 1999, 2000 Ingo Molnar + * + * Fixes + * Maciej W. Rozycki : Bits for genuine 82489DX APICs; + * thanks to Eric Gilmore + * and Rolf G. Tews + * for testing these extensively. + * Maciej W. Rozycki : Various updates and fixes. + * Mikael Pettersson : Power Management for UP-APIC. + * Pavel Machek and + * Mikael Pettersson : PM converted to driver model. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * Sanity check + */ +#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) +# error SPURIOUS_APIC_VECTOR definition error +#endif + +#ifdef CONFIG_X86_32 +/* + * Knob to control our willingness to enable the local APIC. + * + * +1=force-enable + */ +static int force_enable_local_apic; +/* + * APIC command line parameters + */ +static int __init parse_lapic(char *arg) +{ + force_enable_local_apic = 1; + return 0; +} +early_param("lapic", parse_lapic); +/* Local APIC was disabled by the BIOS and enabled by the kernel */ +static int enabled_via_apicbase; + +#endif + +#ifdef CONFIG_X86_64 +static int apic_calibrate_pmtmr __initdata; +static __init int setup_apicpmtimer(char *s) +{ + apic_calibrate_pmtmr = 1; + notsc_setup(NULL); + return 0; +} +__setup("apicpmtimer", setup_apicpmtimer); +#endif + +#ifdef CONFIG_X86_64 +#define HAVE_X2APIC +#endif + +#ifdef HAVE_X2APIC +int x2apic; +/* x2apic enabled before OS handover */ +int x2apic_preenabled; +int disable_x2apic; +static __init int setup_nox2apic(char *str) +{ + disable_x2apic = 1; + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + return 0; +} +early_param("nox2apic", setup_nox2apic); +#endif + +unsigned long mp_lapic_addr; +int disable_apic; +/* Disable local APIC timer from the kernel commandline or via dmi quirk */ +static int disable_apic_timer __cpuinitdata; +/* Local APIC timer works in C2 */ +int local_apic_timer_c2_ok; +EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); + +int first_system_vector = 0xfe; + +char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; + +/* + * Debug level, exported for io_apic.c + */ +unsigned int apic_verbosity; + +int pic_mode; + +/* Have we found an MP table */ +int smp_found_config; + +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + +static unsigned int calibration_result; + +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt); +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt); +static void lapic_timer_broadcast(cpumask_t mask); +static void apic_pm_activate(void); + +/* + * The local apic timer can be used for any function which is CPU local. + */ +static struct clock_event_device lapic_clockevent = { + .name = "lapic", + .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT + | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, + .shift = 32, + .set_mode = lapic_timer_setup, + .set_next_event = lapic_next_event, + .broadcast = lapic_timer_broadcast, + .rating = 100, + .irq = -1, +}; +static DEFINE_PER_CPU(struct clock_event_device, lapic_events); + +static unsigned long apic_phys; + +/* + * Get the LAPIC version + */ +static inline int lapic_get_version(void) +{ + return GET_APIC_VERSION(apic_read(APIC_LVR)); +} + +/* + * Check, if the APIC is integrated or a separate chip + */ +static inline int lapic_is_integrated(void) +{ +#ifdef CONFIG_X86_64 + return 1; +#else + return APIC_INTEGRATED(lapic_get_version()); +#endif +} + +/* + * Check, whether this is a modern or a first generation APIC + */ +static int modern_apic(void) +{ + /* AMD systems use old APIC versions, so check the CPU */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86 >= 0xf) + return 1; + return lapic_get_version() >= 0x14; +} + +/* + * Paravirt kernels also might be using these below ops. So we still + * use generic apic_read()/apic_write(), which might be pointing to different + * ops in PARAVIRT case. + */ +void xapic_wait_icr_idle(void) +{ + while (apic_read(APIC_ICR) & APIC_ICR_BUSY) + cpu_relax(); +} + +u32 safe_xapic_wait_icr_idle(void) +{ + u32 send_status; + int timeout; + + timeout = 0; + do { + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + if (!send_status) + break; + udelay(100); + } while (timeout++ < 1000); + + return send_status; +} + +void xapic_icr_write(u32 low, u32 id) +{ + apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); + apic_write(APIC_ICR, low); +} + +u64 xapic_icr_read(void) +{ + u32 icr1, icr2; + + icr2 = apic_read(APIC_ICR2); + icr1 = apic_read(APIC_ICR); + + return icr1 | ((u64)icr2 << 32); +} + +static struct apic_ops xapic_ops = { + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = xapic_icr_read, + .icr_write = xapic_icr_write, + .wait_icr_idle = xapic_wait_icr_idle, + .safe_wait_icr_idle = safe_xapic_wait_icr_idle, +}; + +struct apic_ops __read_mostly *apic_ops = &xapic_ops; +EXPORT_SYMBOL_GPL(apic_ops); + +#ifdef HAVE_X2APIC +static void x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return; +} + +static u32 safe_x2apic_wait_icr_idle(void) +{ + /* no need to wait for icr idle in x2apic */ + return 0; +} + +void x2apic_icr_write(u32 low, u32 id) +{ + wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); +} + +u64 x2apic_icr_read(void) +{ + unsigned long val; + + rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); + return val; +} + +static struct apic_ops x2apic_ops = { + .read = native_apic_msr_read, + .write = native_apic_msr_write, + .icr_read = x2apic_icr_read, + .icr_write = x2apic_icr_write, + .wait_icr_idle = x2apic_wait_icr_idle, + .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, +}; +#endif + +/** + * enable_NMI_through_LVT0 - enable NMI through local vector table 0 + */ +void __cpuinit enable_NMI_through_LVT0(void) +{ + unsigned int v; + + /* unmask and set to NMI */ + v = APIC_DM_NMI; + + /* Level triggered for 82489DX (32bit mode) */ + if (!lapic_is_integrated()) + v |= APIC_LVT_LEVEL_TRIGGER; + + apic_write(APIC_LVT0, v); +} + +#ifdef CONFIG_X86_32 +/** + * get_physical_broadcast - Get number of physical broadcast IDs + */ +int get_physical_broadcast(void) +{ + return modern_apic() ? 0xff : 0xf; +} +#endif + +/** + * lapic_get_maxlvt - get the maximum number of local vector table entries + */ +int lapic_get_maxlvt(void) +{ + unsigned int v; + + v = apic_read(APIC_LVR); + /* + * - we always have APIC integrated on 64bit mode + * - 82489DXs do not report # of LVT entries + */ + return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; +} + +/* + * Local APIC timer + */ + +/* Clock divisor */ +#ifdef CONFG_X86_64 +#define APIC_DIVISOR 1 +#else +#define APIC_DIVISOR 16 +#endif + +/* + * This function sets up the local APIC timer, with a timeout of + * 'clocks' APIC bus clock. During calibration we actually call + * this function twice on the boot CPU, once with a bogus timeout + * value, second time for real. The other (noncalibrating) CPUs + * call this function only once, with the real, calibrated value. + * + * We do reads before writes even if unnecessary, to get around the + * P5 APIC double write bug. + */ +static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) +{ + unsigned int lvtt_value, tmp_value; + + lvtt_value = LOCAL_TIMER_VECTOR; + if (!oneshot) + lvtt_value |= APIC_LVT_TIMER_PERIODIC; + if (!lapic_is_integrated()) + lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); + + if (!irqen) + lvtt_value |= APIC_LVT_MASKED; + + apic_write(APIC_LVTT, lvtt_value); + + /* + * Divide PICLK by 16 + */ + tmp_value = apic_read(APIC_TDCR); + apic_write(APIC_TDCR, + (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | + APIC_TDR_DIV_16); + + if (!oneshot) + apic_write(APIC_TMICT, clocks / APIC_DIVISOR); +} + +/* + * Setup extended LVT, AMD specific (K8, family 10h) + * + * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and + * MCE interrupts are supported. Thus MCE offset must be set to 0. + * + * If mask=1, the LVT entry does not generate interrupts while mask=0 + * enables the vector. See also the BKDGs. + */ + +#define APIC_EILVT_LVTOFF_MCE 0 +#define APIC_EILVT_LVTOFF_IBS 1 + +static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) +{ + unsigned long reg = (lvt_off << 4) + APIC_EILVT0; + unsigned int v = (mask << 16) | (msg_type << 8) | vector; + + apic_write(reg, v); +} + +u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_MCE; +} + +u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) +{ + setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); + return APIC_EILVT_LVTOFF_IBS; +} +EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); + +/* + * Program the next event, relative to now + */ +static int lapic_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + apic_write(APIC_TMICT, delta); + return 0; +} + +/* + * Setup the lapic timer in periodic or oneshot mode + */ +static void lapic_timer_setup(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + unsigned long flags; + unsigned int v; + + /* Lapic used as dummy for broadcast ? */ + if (evt->features & CLOCK_EVT_FEAT_DUMMY) + return; + + local_irq_save(flags); + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + __setup_APIC_LVTT(calibration_result, + mode != CLOCK_EVT_MODE_PERIODIC, 1); + break; + case CLOCK_EVT_MODE_UNUSED: + case CLOCK_EVT_MODE_SHUTDOWN: + v = apic_read(APIC_LVTT); + v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); + apic_write(APIC_LVTT, v); + break; + case CLOCK_EVT_MODE_RESUME: + /* Nothing to do here */ + break; + } + + local_irq_restore(flags); +} + +/* + * Local APIC timer broadcast function + */ +static void lapic_timer_broadcast(cpumask_t mask) +{ +#ifdef CONFIG_SMP + send_IPI_mask(mask, LOCAL_TIMER_VECTOR); +#endif +} + +/* + * Setup the local APIC timer for this CPU. Copy the initilized values + * of the boot CPU and register the clock event in the framework. + */ +static void __cpuinit setup_APIC_timer(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + + memcpy(levt, &lapic_clockevent, sizeof(*levt)); + levt->cpumask = cpumask_of_cpu(smp_processor_id()); + + clockevents_register_device(levt); +} + +#ifdef CONFIG_X86_64 +/* + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. + * + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. + */ + +#define TICK_COUNT 100000000 + +static int __init calibrate_APIC_clock(void) +{ + unsigned apic, apic_start; + unsigned long tsc, tsc_start; + int result; + + local_irq_disable(); + + /* + * Put whatever arbitrary (but long enough) timeout + * value into the APIC clock, we just want to get the + * counter running for calibration. + * + * No interrupt enable ! + */ + __setup_APIC_LVTT(250000000, 0, 0); + + apic_start = apic_read(APIC_TMCCT); +#ifdef CONFIG_X86_PM_TIMER + if (apic_calibrate_pmtmr && pmtmr_ioport) { + pmtimer_wait(5000); /* 5ms wait */ + apic = apic_read(APIC_TMCCT); + result = (apic_start - apic) * 1000L / 5; + } else +#endif + { + rdtscll(tsc_start); + + do { + apic = apic_read(APIC_TMCCT); + rdtscll(tsc); + } while ((tsc - tsc_start) < TICK_COUNT && + (apic_start - apic) < TICK_COUNT); + + result = (apic_start - apic) * 1000L * tsc_khz / + (tsc - tsc_start); + } + + local_irq_enable(); + + printk(KERN_DEBUG "APIC timer calibration result %d\n", result); + + printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", + result / 1000 / 1000, result / 1000 % 1000); + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (result * APIC_DIVISOR) / HZ; + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + return 0; +} + +#else +/* + * In this functions we calibrate APIC bus clocks to the external timer. + * + * We want to do the calibration only once since we want to have local timer + * irqs syncron. CPUs connected by the same APIC bus have the very same bus + * frequency. + * + * This was previously done by reading the PIT/HPET and waiting for a wrap + * around to find out, that a tick has elapsed. I have a box, where the PIT + * readout is broken, so it never gets out of the wait loop again. This was + * also reported by others. + * + * Monitoring the jiffies value is inaccurate and the clockevents + * infrastructure allows us to do a simple substitution of the interrupt + * handler. + * + * The calibration routine also uses the pm_timer when possible, as the PIT + * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes + * back to normal later in the boot process). + */ + +#define LAPIC_CAL_LOOPS (HZ/10) + +static __initdata int lapic_cal_loops = -1; +static __initdata long lapic_cal_t1, lapic_cal_t2; +static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; +static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; +static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; + +/* + * Temporary interrupt handler. + */ +static void __init lapic_cal_handler(struct clock_event_device *dev) +{ + unsigned long long tsc = 0; + long tapic = apic_read(APIC_TMCCT); + unsigned long pm = acpi_pm_read_early(); + + if (cpu_has_tsc) + rdtscll(tsc); + + switch (lapic_cal_loops++) { + case 0: + lapic_cal_t1 = tapic; + lapic_cal_tsc1 = tsc; + lapic_cal_pm1 = pm; + lapic_cal_j1 = jiffies; + break; + + case LAPIC_CAL_LOOPS: + lapic_cal_t2 = tapic; + lapic_cal_tsc2 = tsc; + if (pm < lapic_cal_pm1) + pm += ACPI_PM_OVRRUN; + lapic_cal_pm2 = pm; + lapic_cal_j2 = jiffies; + break; + } +} + +static int __init calibrate_APIC_clock(void) +{ + struct clock_event_device *levt = &__get_cpu_var(lapic_events); + const long pm_100ms = PMTMR_TICKS_PER_SEC/10; + const long pm_thresh = pm_100ms/100; + void (*real_handler)(struct clock_event_device *dev); + unsigned long deltaj; + long delta, deltapm; + int pm_referenced = 0; + + local_irq_disable(); + + /* Replace the global interrupt handler */ + real_handler = global_clock_event->event_handler; + global_clock_event->event_handler = lapic_cal_handler; + + /* + * Setup the APIC counter to 1e9. There is no way the lapic + * can underflow in the 100ms detection time frame + */ + __setup_APIC_LVTT(1000000000, 0, 0); + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Restore the real event handler */ + global_clock_event->event_handler = real_handler; + + /* Build delta t1-t2 as apic timer counts down */ + delta = lapic_cal_t1 - lapic_cal_t2; + apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); + + /* Check, if the PM timer is available */ + deltapm = lapic_cal_pm2 - lapic_cal_pm1; + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + if (deltapm) { + unsigned long mult; + u64 res; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64) deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64) delta) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long) res, delta); + delta = (long) res; + } + pm_referenced = 1; + } + + /* Calculate the scaled math multiplication factor */ + lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, + lapic_clockevent.shift); + lapic_clockevent.max_delta_ns = + clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); + lapic_clockevent.min_delta_ns = + clockevent_delta2ns(0xF, &lapic_clockevent); + + calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; + + apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); + apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); + apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", + calibration_result); + + if (cpu_has_tsc) { + delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); + apic_printk(APIC_VERBOSE, "..... CPU clock speed is " + "%ld.%04ld MHz.\n", + (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), + (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); + } + + apic_printk(APIC_VERBOSE, "..... host bus clock speed is " + "%u.%04u MHz.\n", + calibration_result / (1000000 / HZ), + calibration_result % (1000000 / HZ)); + + /* + * Do a sanity check on the APIC calibration result + */ + if (calibration_result < (1000000 / HZ)) { + local_irq_enable(); + printk(KERN_WARNING + "APIC frequency too slow, disabling apic timer\n"); + return -1; + } + + levt->features &= ~CLOCK_EVT_FEAT_DUMMY; + + /* We trust the pm timer based calibration */ + if (!pm_referenced) { + apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); + + /* + * Setup the apic timer manually + */ + levt->event_handler = lapic_cal_handler; + lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); + lapic_cal_loops = -1; + + /* Let the interrupts run */ + local_irq_enable(); + + while (lapic_cal_loops <= LAPIC_CAL_LOOPS) + cpu_relax(); + + local_irq_disable(); + + /* Stop the lapic timer */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); + + local_irq_enable(); + + /* Jiffies delta */ + deltaj = lapic_cal_j2 - lapic_cal_j1; + apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); + + /* Check, if the jiffies result is consistent */ + if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) + apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); + else + levt->features |= CLOCK_EVT_FEAT_DUMMY; + } else + local_irq_enable(); + + if (levt->features & CLOCK_EVT_FEAT_DUMMY) { + printk(KERN_WARNING + "APIC timer disabled due to verification failure.\n"); + return -1; + } + + return 0; +} + +#endif + +/* + * Setup the boot APIC + * + * Calibrate and verify the result. + */ +void __init setup_boot_APIC_clock(void) +{ + /* + * The local apic timer can be disabled via the kernel + * commandline or from the CPU detection code. Register the lapic + * timer as a dummy clock event source on SMP systems, so the + * broadcast mechanism is used. On UP systems simply ignore it. + */ + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) { + lapic_clockevent.mult = 1; + setup_APIC_timer(); + } + return; + } + + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" + "calibrating APIC timer ...\n"); + + if (calibrate_APIC_clock()) { + /* No broadcast on UP ! */ + if (num_possible_cpus() > 1) + setup_APIC_timer(); + return; + } + + /* + * If nmi_watchdog is set to IO_APIC, we need the + * PIT/HPET going. Otherwise register lapic as a dummy + * device. + */ + if (nmi_watchdog != NMI_IO_APIC) + lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; + else + printk(KERN_WARNING "APIC timer registered as dummy," + " due to nmi_watchdog=%d!\n", nmi_watchdog); + + /* Setup the lapic or request the broadcast */ + setup_APIC_timer(); +} + +void __cpuinit setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(); +} + +/* + * The guts of the apic timer interrupt + */ +static void local_apic_timer_interrupt(void) +{ + int cpu = smp_processor_id(); + struct clock_event_device *evt = &per_cpu(lapic_events, cpu); + + /* + * Normally we should not be here till LAPIC has been initialized but + * in some cases like kdump, its possible that there is a pending LAPIC + * timer interrupt from previous kernel's context and is delivered in + * new kernel the moment interrupts are enabled. + * + * Interrupts are enabled early and LAPIC is setup much later, hence + * its possible that when we get here evt->event_handler is NULL. + * Check for event_handler being NULL and discard the interrupt as + * spurious. + */ + if (!evt->event_handler) { + printk(KERN_WARNING + "Spurious LAPIC timer interrupt on cpu %d\n", cpu); + /* Switch it off */ + lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); + return; + } + + /* + * the NMI deadlock-detector uses this. + */ +#ifdef CONFIG_X86_64 + add_pda(apic_timer_irqs, 1); +#else + per_cpu(irq_stat, cpu).apic_timer_irqs++; +#endif + + evt->event_handler(evt); +} + +/* + * Local APIC timer interrupt. This is the most natural way for doing + * local interrupts, but local timer interrupts can be emulated by + * broadcast interrupts too. [in case the hw doesn't support APIC timers] + * + * [ if a single-CPU system runs an SMP kernel then we call the local + * interrupt as well. Thus we cannot inline the local irq ... ] + */ +void smp_apic_timer_interrupt(struct pt_regs *regs) +{ + struct pt_regs *old_regs = set_irq_regs(regs); + + /* + * NOTE! We'd better ACK the irq immediately, + * because timer handling can be slow. + */ + ack_APIC_irq(); + /* + * update_process_times() expects us to have done irq_enter(). + * Besides, if we don't timer interrupts ignore the global + * interrupt lock, which is the WrongThing (tm) to do. + */ +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + local_apic_timer_interrupt(); + irq_exit(); + + set_irq_regs(old_regs); +} + +int setup_profiling_timer(unsigned int multiplier) +{ + return -EINVAL; +} + +/* + * Local APIC start and shutdown + */ + +/** + * clear_local_APIC - shutdown the local APIC + * + * This is called, when a CPU is disabled and before rebooting, so the state of + * the local APIC has no dangling leftovers. Also used to cleanout any BIOS + * leftovers during boot. + */ +void clear_local_APIC(void) +{ + int maxlvt; + u32 v; + + /* APIC hasn't been mapped yet */ + if (!apic_phys) + return; + + maxlvt = lapic_get_maxlvt(); + /* + * Masking an LVT entry can trigger a local APIC error + * if the vector is zero. Mask LVTERR first to prevent this. + */ + if (maxlvt >= 3) { + v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ + apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); + } + /* + * Careful: we have to set masks only first to deassert + * any level-triggered sources. + */ + v = apic_read(APIC_LVTT); + apic_write(APIC_LVTT, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT0); + apic_write(APIC_LVT0, v | APIC_LVT_MASKED); + v = apic_read(APIC_LVT1); + apic_write(APIC_LVT1, v | APIC_LVT_MASKED); + if (maxlvt >= 4) { + v = apic_read(APIC_LVTPC); + apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); + } + + /* lets not touch this if we didn't frob it */ +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) + if (maxlvt >= 5) { + v = apic_read(APIC_LVTTHMR); + apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); + } +#endif + /* + * Clean APIC state for other OSs: + */ + apic_write(APIC_LVTT, APIC_LVT_MASKED); + apic_write(APIC_LVT0, APIC_LVT_MASKED); + apic_write(APIC_LVT1, APIC_LVT_MASKED); + if (maxlvt >= 3) + apic_write(APIC_LVTERR, APIC_LVT_MASKED); + if (maxlvt >= 4) + apic_write(APIC_LVTPC, APIC_LVT_MASKED); + + /* Integrated APIC (!82489DX) ? */ + if (lapic_is_integrated()) { + if (maxlvt > 3) + /* Clear ESR due to Pentium errata 3AP and 11AP */ + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } +} + +/** + * disable_local_APIC - clear and disable the local APIC + */ +void disable_local_APIC(void) +{ + unsigned int value; + + clear_local_APIC(); + + /* + * Disable APIC (implies clearing of registers + * for 82489DX!). + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_SPIV_APIC_ENABLED; + apic_write(APIC_SPIV, value); + +#ifdef CONFIG_X86_32 + /* + * When LAPIC was disabled by the BIOS and enabled by the kernel, + * restore the disabled state. + */ + if (enabled_via_apicbase) { + unsigned int l, h; + + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_ENABLE; + wrmsr(MSR_IA32_APICBASE, l, h); + } +#endif +} + +/* + * If Linux enabled the LAPIC against the BIOS default disable it down before + * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and + * not power-off. Additionally clear all LVT entries before disable_local_APIC + * for the case where Linux didn't enable the LAPIC. + */ +void lapic_shutdown(void) +{ + unsigned long flags; + + if (!cpu_has_apic) + return; + + local_irq_save(flags); + +#ifdef CONFIG_X86_32 + if (!enabled_via_apicbase) + clear_local_APIC(); + else +#endif + disable_local_APIC(); + + + local_irq_restore(flags); +} + +/* + * This is to verify that we're looking at a real local APIC. + * Check these against your board if the CPUs aren't getting + * started for no apparent reason. + */ +int __init verify_local_APIC(void) +{ + unsigned int reg0, reg1; + + /* + * The version register is read-only in a real APIC. + */ + reg0 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); + apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); + reg1 = apic_read(APIC_LVR); + apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); + + /* + * The two version reads above should print the same + * numbers. If the second one is different, then we + * poke at a non-APIC. + */ + if (reg1 != reg0) + return 0; + + /* + * Check if the version looks reasonably. + */ + reg1 = GET_APIC_VERSION(reg0); + if (reg1 == 0x00 || reg1 == 0xff) + return 0; + reg1 = lapic_get_maxlvt(); + if (reg1 < 0x02 || reg1 == 0xff) + return 0; + + /* + * The ID register is read/write in a real APIC. + */ + reg0 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); + apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); + reg1 = apic_read(APIC_ID); + apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); + apic_write(APIC_ID, reg0); + if (reg1 != (reg0 ^ APIC_ID_MASK)) + return 0; + + /* + * The next two are just to see if we have sane values. + * They're only really relevant if we're in Virtual Wire + * compatibility mode, but most boxes are anymore. + */ + reg0 = apic_read(APIC_LVT0); + apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); + reg1 = apic_read(APIC_LVT1); + apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); + + return 1; +} + +/** + * sync_Arb_IDs - synchronize APIC bus arbitration IDs + */ +void __init sync_Arb_IDs(void) +{ + /* + * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not + * needed on AMD. + */ + if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); + apic_write(APIC_ICR, APIC_DEST_ALLINC | + APIC_INT_LEVELTRIG | APIC_DM_INIT); +} + +/* + * An initial setup of the virtual wire mode. + */ +void __init init_bsp_APIC(void) +{ + unsigned int value; + + /* + * Don't do the setup now if we have a SMP BIOS as the + * through-I/O-APIC virtual wire mode might be active. + */ + if (smp_found_config || !cpu_has_apic) + return; + + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + + /* + * Enable APIC. + */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + +#ifdef CONFIG_X86_32 + /* This bit is reserved on P4/Xeon and should be cleared */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && + (boot_cpu_data.x86 == 15)) + value &= ~APIC_SPIV_FOCUS_DISABLED; + else +#endif + value |= APIC_SPIV_FOCUS_DISABLED; + value |= SPURIOUS_APIC_VECTOR; + apic_write(APIC_SPIV, value); + + /* + * Set up the virtual wire mode. + */ + apic_write(APIC_LVT0, APIC_DM_EXTINT); + value = APIC_DM_NMI; + if (!lapic_is_integrated()) /* 82489DX */ + value |= APIC_LVT_LEVEL_TRIGGER; + apic_write(APIC_LVT1, value); +} + +static void __cpuinit lapic_setup_esr(void) +{ + unsigned long oldvalue, value, maxlvt; + if (lapic_is_integrated() && !esr_disable) { + if (esr_disable) { + /* + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh + */ + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; + } + /* !82489DX */ + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08lx after: 0x%08lx\n", + oldvalue, value); + } else { + printk(KERN_INFO "No ESR for 82489DX.\n"); + } +} + + +/** + * setup_local_APIC - setup the local APIC + */ +void __cpuinit setup_local_APIC(void) +{ + unsigned int value; + int i, j; + +#ifdef CONFIG_X86_32 + /* Pound the ESR really hard over the head with a big hammer - mbligh */ + if (esr_disable) { + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + apic_write(APIC_ESR, 0); + } +#endif + + preempt_disable(); + + /* + * Double-check whether this APIC is really registered. + * This is meaningless in clustered apic mode, so we skip it. + */ + if (!apic_id_registered()) + BUG(); + + /* + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ + init_apic_ldr(); + + /* + * Set Task Priority to 'accept all'. We never change this + * later on. + */ + value = apic_read(APIC_TASKPRI); + value &= ~APIC_TPRI_MASK; + apic_write(APIC_TASKPRI, value); + + /* + * After a crash, we no longer service the interrupts and a pending + * interrupt from previous kernel might still have ISR bit set. + * + * Most probably by now CPU has serviced that pending interrupt and + * it might not have done the ack_APIC_irq() because it thought, + * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it + * does not clear the ISR bit and cpu thinks it has already serivced + * the interrupt. Hence a vector might get locked. It was noticed + * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. + */ + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1< 1) || + (boot_cpu_data.x86 == 15)) + break; + goto no_apic; + case X86_VENDOR_INTEL: + if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || + (boot_cpu_data.x86 == 5 && cpu_has_apic)) + break; + goto no_apic; + default: + goto no_apic; + } + + if (!cpu_has_apic) { + /* + * Over-ride BIOS and try to enable the local APIC only if + * "lapic" specified. + */ + if (!force_enable_local_apic) { + printk(KERN_INFO "Local APIC disabled by BIOS -- " + "you can enable it with \"lapic\"\n"); + return -1; + } + /* + * Some BIOSes disable the local APIC in the APIC_BASE + * MSR. This can only be done in software for Intel P6 or later + * and AMD K7 (Model > 1) or later. + */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (!(l & MSR_IA32_APICBASE_ENABLE)) { + printk(KERN_INFO + "Local APIC disabled by BIOS -- reenabling.\n"); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; + wrmsr(MSR_IA32_APICBASE, l, h); + enabled_via_apicbase = 1; + } + } + /* + * The APIC feature bit should now be enabled + * in `cpuid' + */ + features = cpuid_edx(1); + if (!(features & (1 << X86_FEATURE_APIC))) { + printk(KERN_WARNING "Could not enable APIC!\n"); + return -1; + } + set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; + + /* The BIOS may have set up the APIC at some other address */ + rdmsr(MSR_IA32_APICBASE, l, h); + if (l & MSR_IA32_APICBASE_ENABLE) + mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; + + printk(KERN_INFO "Found and enabled local APIC!\n"); + + apic_pm_activate(); + + return 0; + +no_apic: + printk(KERN_INFO "No local APIC present or hardware disabled\n"); + return -1; +} +#endif + +#ifdef CONFIG_X86_64 +void __init early_init_lapic_mapping(void) +{ + unsigned long phys_addr; + + /* + * If no local APIC can be found then go out + * : it means there is no mpatable and MADT + */ + if (!smp_found_config) + return; + + phys_addr = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, phys_addr); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, phys_addr); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + boot_cpu_physical_apicid = read_apic_id(); +} +#endif + +/** + * init_apic_mappings - initialize APIC mappings + */ +void __init init_apic_mappings(void) +{ +#ifdef HAVE_X2APIC + if (x2apic) { + boot_cpu_physical_apicid = read_apic_id(); + return; + } +#endif + + /* + * If no local APIC can be found then set up a fake all + * zeroes page to simulate the local APIC and another + * one for the IO-APIC. + */ + if (!smp_found_config && detect_init_APIC()) { + apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); + apic_phys = __pa(apic_phys); + } else + apic_phys = mp_lapic_addr; + + set_fixmap_nocache(FIX_APIC_BASE, apic_phys); + apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + APIC_BASE, apic_phys); + + /* + * Fetch the APIC ID of the BSP in case we have a + * default configuration (or the MP table is broken). + */ + if (boot_cpu_physical_apicid == -1U) + boot_cpu_physical_apicid = read_apic_id(); +} + +/* + * This initializes the IO-APIC and APIC hardware if this is + * a UP kernel. + */ +int apic_version[MAX_APICS]; + +int __init APIC_init_uniprocessor(void) +{ +#ifdef CONFIG_X86_64 + if (disable_apic) { + printk(KERN_INFO "Apic disabled\n"); + return -1; + } + if (!cpu_has_apic) { + disable_apic = 1; + printk(KERN_INFO "Apic disabled by BIOS\n"); + return -1; + } +#else + if (!smp_found_config && !cpu_has_apic) + return -1; + + /* + * Complain if the BIOS pretends there is one. + */ + if (!cpu_has_apic && + APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); + return -1; + } +#endif + +#ifdef HAVE_X2APIC + enable_IR_x2apic(); +#endif +#ifdef CONFIG_X86_64 + setup_apic_routing(); +#endif + + verify_local_APIC(); + connect_bsp_APIC(); + +#ifdef CONFIG_X86_64 + apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); +#else + /* + * Hack: In case of kdump, after a crash, kernel might be booting + * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid + * might be zero if read from MP tables. Get it from LAPIC. + */ +# ifdef CONFIG_CRASH_DUMP + boot_cpu_physical_apicid = read_apic_id(); +# endif +#endif + physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); + setup_local_APIC(); + +#ifdef CONFIG_X86_64 + /* + * Now enable IO-APICs, actually call clear_IO_APIC + * We need clear_IO_APIC before enabling vector on BP + */ + if (!skip_ioapic_setup && nr_ioapics) + enable_IO_APIC(); +#endif + +#ifdef CONFIG_X86_IO_APIC + if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) +#endif + localise_nmi_watchdog(); + end_local_APIC_setup(); + +#ifdef CONFIG_X86_IO_APIC + if (smp_found_config && !skip_ioapic_setup && nr_ioapics) + setup_IO_APIC(); +# ifdef CONFIG_X86_64 + else + nr_ioapics = 0; +# endif +#endif + +#ifdef CONFIG_X86_64 + setup_boot_APIC_clock(); + check_nmi_watchdog(); +#else + setup_boot_clock(); +#endif + + return 0; +} + +/* + * Local APIC interrupts + */ + +/* + * This interrupt should _never_ happen with our APIC/SMP architecture + */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_spurious_interrupt(void) +#else +void smp_spurious_interrupt(struct pt_regs *regs) +#endif +{ + u32 v; + +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + /* + * Check if this really is a spurious interrupt and ACK it + * if it is a vectored one. Just in case... + * Spurious interrupts should not be ACKed. + */ + v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); + if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) + ack_APIC_irq(); + +#ifdef CONFIG_X86_64 + add_pda(irq_spurious_count, 1); +#else + /* see sw-dev-man vol 3, chapter 7.4.13.5 */ + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " + "should never happen.\n", smp_processor_id()); + __get_cpu_var(irq_stat).irq_spurious_count++; +#endif + irq_exit(); +} + +/* + * This interrupt should never happen with our APIC/SMP architecture + */ +#ifdef CONFIG_X86_64 +asmlinkage void smp_error_interrupt(void) +#else +void smp_error_interrupt(struct pt_regs *regs) +#endif +{ + u32 v, v1; + +#ifdef CONFIG_X86_64 + exit_idle(); +#endif + irq_enter(); + /* First tickle the hardware, only then report what went on. -- REW */ + v = apic_read(APIC_ESR); + apic_write(APIC_ESR, 0); + v1 = apic_read(APIC_ESR); + ack_APIC_irq(); + atomic_inc(&irq_err_count); + + /* Here is what the APIC error bits mean: + 0: Send CS error + 1: Receive CS error + 2: Send accept error + 3: Receive accept error + 4: Reserved + 5: Send illegal vector + 6: Received illegal vector + 7: Illegal register address + */ + printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", + smp_processor_id(), v , v1); + irq_exit(); +} + +/** + * connect_bsp_APIC - attach the APIC to the interrupt system + */ +void __init connect_bsp_APIC(void) +{ +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Do not trust the local APIC being empty at bootup. + */ + clear_local_APIC(); + /* + * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's + * local APIC to INT and NMI lines. + */ + apic_printk(APIC_VERBOSE, "leaving PIC mode, " + "enabling APIC mode.\n"); + outb(0x70, 0x22); + outb(0x01, 0x23); + } +#endif + enable_apic_mode(); +} + +/** + * disconnect_bsp_APIC - detach the APIC from the interrupt system + * @virt_wire_setup: indicates, whether virtual wire mode is selected + * + * Virtual wire mode is necessary to deliver legacy interrupts even when the + * APIC is disabled. + */ +void disconnect_bsp_APIC(int virt_wire_setup) +{ + unsigned int value; + +#ifdef CONFIG_X86_32 + if (pic_mode) { + /* + * Put the board back into PIC mode (has an effect only on + * certain older boards). Note that APIC interrupts, including + * IPIs, won't work beyond this point! The only exception are + * INIT IPIs. + */ + apic_printk(APIC_VERBOSE, "disabling APIC mode, " + "entering PIC mode.\n"); + outb(0x70, 0x22); + outb(0x00, 0x23); + return; + } +#endif + + /* Go back to Virtual Wire compatibility mode */ + + /* For the spurious interrupt use vector F, and enable it */ + value = apic_read(APIC_SPIV); + value &= ~APIC_VECTOR_MASK; + value |= APIC_SPIV_APIC_ENABLED; + value |= 0xf; + apic_write(APIC_SPIV, value); + + if (!virt_wire_setup) { + /* + * For LVT0 make it edge triggered, active high, + * external and enabled + */ + value = apic_read(APIC_LVT0); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); + apic_write(APIC_LVT0, value); + } else { + /* Disable LVT0 */ + apic_write(APIC_LVT0, APIC_LVT_MASKED); + } + + /* + * For LVT1 make it edge triggered, active high, + * nmi and enabled + */ + value = apic_read(APIC_LVT1); + value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | + APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | + APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); + value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; + value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); + apic_write(APIC_LVT1, value); +} + +void __cpuinit generic_processor_info(int apicid, int version) +{ + int cpu; + cpumask_t tmp_map; + + /* + * Validate version + */ + if (version == 0x0) { + printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " + "fixing up to 0x10. (tell your hw vendor)\n", + version); + version = 0x10; + } + apic_version[apicid] = version; + + if (num_processors >= NR_CPUS) { + printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." + " Processor ignored.\n", NR_CPUS); + return; + } + + num_processors++; + cpus_complement(tmp_map, cpu_present_map); + cpu = first_cpu(tmp_map); + + physid_set(apicid, phys_cpu_present_map); + if (apicid == boot_cpu_physical_apicid) { + /* + * x86_bios_cpu_apicid is required to have processors listed + * in same order as logical cpu numbers. Hence the first + * entry is BSP, and so on. + */ + cpu = 0; + } + if (apicid > max_physical_apicid) + max_physical_apicid = apicid; + +#ifdef CONFIG_X86_32 + /* + * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y + * but we need to work other dependencies like SMP_SUSPEND etc + * before this can be done without some confusion. + * if (CPU_HOTPLUG_ENABLED || num_processors > 8) + * - Ashok Raj + */ + if (max_physical_apicid >= 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } +#endif + +#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) + /* are we being called early in kernel startup? */ + if (early_per_cpu_ptr(x86_cpu_to_apicid)) { + u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); + u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + + cpu_to_apicid[cpu] = apicid; + bios_cpu_apicid[cpu] = apicid; + } else { + per_cpu(x86_cpu_to_apicid, cpu) = apicid; + per_cpu(x86_bios_cpu_apicid, cpu) = apicid; + } +#endif + + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); +} + +#ifdef CONFIG_X86_64 +int hard_smp_processor_id(void) +{ + return read_apic_id(); +} +#endif + +/* + * Power management + */ +#ifdef CONFIG_PM + +static struct { + /* + * 'active' is true if the local APIC was enabled by us and + * not the BIOS; this signifies that we are also responsible + * for disabling it before entering apm/acpi suspend + */ + int active; + /* r/w apic fields */ + unsigned int apic_id; + unsigned int apic_taskpri; + unsigned int apic_ldr; + unsigned int apic_dfr; + unsigned int apic_spiv; + unsigned int apic_lvtt; + unsigned int apic_lvtpc; + unsigned int apic_lvt0; + unsigned int apic_lvt1; + unsigned int apic_lvterr; + unsigned int apic_tmict; + unsigned int apic_tdcr; + unsigned int apic_thmr; +} apic_pm_state; + +static int lapic_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + apic_pm_state.apic_id = apic_read(APIC_ID); + apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); + apic_pm_state.apic_ldr = apic_read(APIC_LDR); + apic_pm_state.apic_dfr = apic_read(APIC_DFR); + apic_pm_state.apic_spiv = apic_read(APIC_SPIV); + apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); + apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); + apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); + apic_pm_state.apic_tmict = apic_read(APIC_TMICT); + apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif + + local_irq_save(flags); + disable_local_APIC(); + local_irq_restore(flags); + return 0; +} + +static int lapic_resume(struct sys_device *dev) +{ + unsigned int l, h; + unsigned long flags; + int maxlvt; + + if (!apic_pm_state.active) + return 0; + + maxlvt = lapic_get_maxlvt(); + + local_irq_save(flags); + +#ifdef HAVE_X2APIC + if (x2apic) + enable_x2apic(); + else +#endif + { + /* + * Make sure the APICBASE points to the right address + * + * FIXME! This will be wrong if we ever support suspend on + * SMP! We'll need to do this as part of the CPU restore! + */ + rdmsr(MSR_IA32_APICBASE, l, h); + l &= ~MSR_IA32_APICBASE_BASE; + l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; + wrmsr(MSR_IA32_APICBASE, l, h); + } + + apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); + apic_write(APIC_ID, apic_pm_state.apic_id); + apic_write(APIC_DFR, apic_pm_state.apic_dfr); + apic_write(APIC_LDR, apic_pm_state.apic_ldr); + apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); + apic_write(APIC_SPIV, apic_pm_state.apic_spiv); + apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); + apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); +#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); + apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); + apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); + apic_write(APIC_TMICT, apic_pm_state.apic_tmict); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + + local_irq_restore(flags); + + return 0; +} + +/* + * This device has no shutdown method - fully functioning local APICs + * are needed on every CPU up until machine_halt/restart/poweroff. + */ + +static struct sysdev_class lapic_sysclass = { + .name = "lapic", + .resume = lapic_resume, + .suspend = lapic_suspend, +}; + +static struct sys_device device_lapic = { + .id = 0, + .cls = &lapic_sysclass, +}; + +static void __cpuinit apic_pm_activate(void) +{ + apic_pm_state.active = 1; +} + +static int __init init_lapic_sysfs(void) +{ + int error; + + if (!cpu_has_apic) + return 0; + /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ + + error = sysdev_class_register(&lapic_sysclass); + if (!error) + error = sysdev_register(&device_lapic); + return error; +} +device_initcall(init_lapic_sysfs); + +#else /* CONFIG_PM */ + +static void apic_pm_activate(void) { } + +#endif /* CONFIG_PM */ + +#ifdef CONFIG_X86_64 +/* + * apic_is_clustered_box() -- Check if we can expect good TSC + * + * Thus far, the major user of this is IBM's Summit2 series: + * + * Clustered boxes may have unsynced TSC problems if they are + * multi-chassis. Use available data to take a good guess. + * If in doubt, go HPET. + */ +__cpuinit int apic_is_clustered_box(void) +{ + int i, clusters, zeros; + unsigned id; + u16 *bios_cpu_apicid; + DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); + + /* + * there is not this kind of box with AMD CPU yet. + * Some AMD box with quadcore cpu and 8 sockets apicid + * will be [4, 0x23] or [8, 0x27] could be thought to + * vsmp box still need checking... + */ + if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) + return 0; + + bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); + bitmap_zero(clustermap, NUM_APIC_CLUSTERS); + + for (i = 0; i < NR_CPUS; i++) { + /* are we being called early in kernel startup? */ + if (bios_cpu_apicid) { + id = bios_cpu_apicid[i]; + } + else if (i < nr_cpu_ids) { + if (cpu_present(i)) + id = per_cpu(x86_bios_cpu_apicid, i); + else + continue; + } + else + break; + + if (id != BAD_APICID) + __set_bit(APIC_CLUSTERID(id), clustermap); + } + + /* Problem: Partially populated chassis may not have CPUs in some of + * the APIC clusters they have been allocated. Only present CPUs have + * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. + * Since clusters are allocated sequentially, count zeros only if + * they are bounded by ones. + */ + clusters = 0; + zeros = 0; + for (i = 0; i < NUM_APIC_CLUSTERS; i++) { + if (test_bit(i, clustermap)) { + clusters += 1 + zeros; + zeros = 0; + } else + ++zeros; + } + + /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are + * not guaranteed to be synced between boards + */ + if (is_vsmp_box() && clusters > 1) + return 1; + + /* + * If clusters > 2, then should be multi-chassis. + * May have to revisit this when multi-core + hyperthreaded CPUs come + * out, but AFAIK this will work even for them. + */ + return (clusters > 2); +} +#endif + +/* + * APIC command line parameters + */ +static int __init setup_disableapic(char *arg) +{ + disable_apic = 1; + setup_clear_cpu_cap(X86_FEATURE_APIC); + return 0; +} +early_param("disableapic", setup_disableapic); + +/* same as disableapic, for compatibility */ +static int __init setup_nolapic(char *arg) +{ + return setup_disableapic(arg); +} +early_param("nolapic", setup_nolapic); + +static int __init parse_lapic_timer_c2_ok(char *arg) +{ + local_apic_timer_c2_ok = 1; + return 0; +} +early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); + +static int __init parse_disable_apic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("noapictimer", parse_disable_apic_timer); + +static int __init parse_nolapic_timer(char *arg) +{ + disable_apic_timer = 1; + return 0; +} +early_param("nolapic_timer", parse_nolapic_timer); + +static int __init apic_set_verbosity(char *arg) +{ + if (!arg) { +#ifdef CONFIG_X86_64 + skip_ioapic_setup = 0; + return 0; +#endif + return -EINVAL; + } + + if (strcmp("debug", arg) == 0) + apic_verbosity = APIC_DEBUG; + else if (strcmp("verbose", arg) == 0) + apic_verbosity = APIC_VERBOSE; + else { + printk(KERN_WARNING "APIC Verbosity level %s not recognised" + " use apic=verbose or apic=debug\n", arg); + return -EINVAL; + } + + return 0; +} +early_param("apic", apic_set_verbosity); + +static int __init lapic_insert_resource(void) +{ + if (!apic_phys) + return -1; + + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + + return 0; +} + +/* + * need call insert after e820_reserve_resources() + * that is using request_resource + */ +late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic_32.c b/arch/x86/kernel/apic_32.c deleted file mode 100644 index 0b11d6e3f091..000000000000 --- a/arch/x86/kernel/apic_32.c +++ /dev/null @@ -1,2312 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#ifdef CONFIG_X86_32 -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - -#endif - -#ifdef CONFIG_X86_64 -static int apic_calibrate_pmtmr __initdata; -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - -#ifdef CONFIG_X86_64 -#define HAVE_X2APIC -#endif - -#ifdef HAVE_X2APIC -int x2apic; -/* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); -#endif - -unsigned long mp_lapic_addr; -int disable_apic; -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -/* - * Debug level, exported for io_apic.c - */ -unsigned int apic_verbosity; - -int pic_mode; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a separate chip - */ -static inline int lapic_is_integrated(void) -{ -#ifdef CONFIG_X86_64 - return 1; -#else - return APIC_INTEGRATED(lapic_get_version()); -#endif -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. - */ -void xapic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_xapic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void xapic_icr_write(u32 low, u32 id) -{ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write(APIC_ICR, low); -} - -u64 xapic_icr_read(void) -{ - u32 icr1, icr2; - - icr2 = apic_read(APIC_ICR2); - icr1 = apic_read(APIC_ICR); - - return icr1 | ((u64)icr2 << 32); -} - -static struct apic_ops xapic_ops = { - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = xapic_icr_read, - .icr_write = xapic_icr_write, - .wait_icr_idle = xapic_wait_icr_idle, - .safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -#ifdef HAVE_X2APIC -static void x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -u64 x2apic_icr_read(void) -{ - unsigned long val; - - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); - return val; -} - -static struct apic_ops x2apic_ops = { - .read = native_apic_msr_read, - .write = native_apic_msr_write, - .icr_read = x2apic_icr_read, - .icr_write = x2apic_icr_write, - .wait_icr_idle = x2apic_wait_icr_idle, - .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; -#endif - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v; - - v = apic_read(APIC_LVR); - /* - * - we always have APIC integrated on 64bit mode - * - 82489DXs do not report # of LVT entries - */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -} - -/* - * Local APIC timer - */ - -/* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else -#define APIC_DIVISOR 16 -#endif - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void __cpuinit setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else -/* - * In this functions we calibrate APIC bus clocks to the external timer. - * - * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus - * frequency. - * - * This was previously done by reading the PIT/HPET and waiting for a wrap - * around to find out, that a tick has elapsed. I have a box, where the PIT - * readout is broken, so it never gets out of the wait loop again. This was - * also reported by others. - * - * Monitoring the jiffies value is inaccurate and the clockevents - * infrastructure allows us to do a simple substitution of the interrupt - * handler. - * - * The calibration routine also uses the pm_timer when possible, as the PIT - * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes - * back to normal later in the boot process). - */ - -#define LAPIC_CAL_LOOPS (HZ/10) - -static __initdata int lapic_cal_loops = -1; -static __initdata long lapic_cal_t1, lapic_cal_t2; -static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; -static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; - -/* - * Temporary interrupt handler. - */ -static void __init lapic_cal_handler(struct clock_event_device *dev) -{ - unsigned long long tsc = 0; - long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); - - if (cpu_has_tsc) - rdtscll(tsc); - - switch (lapic_cal_loops++) { - case 0: - lapic_cal_t1 = tapic; - lapic_cal_tsc1 = tsc; - lapic_cal_pm1 = pm; - lapic_cal_j1 = jiffies; - break; - - case LAPIC_CAL_LOOPS: - lapic_cal_t2 = tapic; - lapic_cal_tsc2 = tsc; - if (pm < lapic_cal_pm1) - pm += ACPI_PM_OVRRUN; - lapic_cal_pm2 = pm; - lapic_cal_j2 = jiffies; - break; - } -} - -static int __init calibrate_APIC_clock(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; - void (*real_handler)(struct clock_event_device *dev); - unsigned long deltaj; - long delta, deltapm; - int pm_referenced = 0; - - local_irq_disable(); - - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - - /* - * Setup the APIC counter to 1e9. There is no way the lapic - * can underflow in the 100ms detection time frame - */ - __setup_APIC_LVTT(1000000000, 0, 0); - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; - - /* Build delta t1-t2 as apic timer counts down */ - delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; - - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); - - if (cpu_has_tsc) { - delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); - } - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - - /* We trust the pm timer based calibration */ - if (!pm_referenced) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - - /* - * Setup the apic timer manually - */ - levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); - lapic_cal_loops = -1; - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - - local_irq_enable(); - - /* Jiffies delta */ - deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); - - /* Check, if the jiffies result is consistent */ - if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); - else - levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); - - if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); - return -1; - } - - return 0; -} - -#endif - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - - if (calibrate_APIC_clock()) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - /* Setup the lapic or request the broadcast */ - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - - /* Integrated APIC (!82489DX) ? */ - if (lapic_is_integrated()) { - if (maxlvt > 3) - /* Clear ESR due to Pentium errata 3AP and 11AP */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); - -#ifdef CONFIG_X86_32 - /* - * When LAPIC was disabled by the BIOS and enabled by the kernel, - * restore the disabled state. - */ - if (enabled_via_apicbase) { - unsigned int l, h; - - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -#endif -} - -/* - * If Linux enabled the LAPIC against the BIOS default disable it down before - * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and - * not power-off. Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - -#ifdef CONFIG_X86_32 - if (!enabled_via_apicbase) - clear_local_APIC(); - else -#endif - disable_local_APIC(); - - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* - * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not - * needed on AMD. - */ - if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write(APIC_LVT1, value); -} - -static void __cpuinit lapic_setup_esr(void) -{ - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); - } -} - - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - -#ifdef CONFIG_X86_32 - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } -#endif - - preempt_disable(); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1< 1) || - (boot_cpu_data.x86 == 15)) - break; - goto no_apic; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) - break; - goto no_apic; - default: - goto no_apic; - } - - if (!cpu_has_apic) { - /* - * Over-ride BIOS and try to enable the local APIC only if - * "lapic" specified. - */ - if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); - return -1; - } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); - return -1; - } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - printk(KERN_INFO "Found and enabled local APIC!\n"); - - apic_pm_activate(); - - return 0; - -no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); - return -1; -} -#endif - -#ifdef CONFIG_X86_64 -void __init early_init_lapic_mapping(void) -{ - unsigned long phys_addr; - - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} -#endif - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ -#ifdef HAVE_X2APIC - if (x2apic) { - boot_cpu_physical_apicid = read_apic_id(); - return; - } -#endif - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int apic_version[MAX_APICS]; - -int __init APIC_init_uniprocessor(void) -{ -#ifdef CONFIG_X86_64 - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - return -1; - } -#endif - -#ifdef HAVE_X2APIC - enable_IR_x2apic(); -#endif -#ifdef CONFIG_X86_64 - setup_apic_routing(); -#endif - - verify_local_APIC(); - connect_bsp_APIC(); - -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); - -#ifdef CONFIG_X86_64 - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); -#endif - -#ifdef CONFIG_X86_IO_APIC - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) -#endif - localise_nmi_watchdog(); - end_local_APIC_setup(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -# ifdef CONFIG_X86_64 - else - nr_ioapics = 0; -# endif -#endif - -#ifdef CONFIG_X86_64 - setup_boot_APIC_clock(); - check_nmi_watchdog(); -#else - setup_boot_clock(); -#endif - - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else -void smp_spurious_interrupt(struct pt_regs *regs) -#endif -{ - u32 v; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else -void smp_error_interrupt(struct pt_regs *regs) -#endif -{ - u32 v, v1; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * connect_bsp_APIC - attach the APIC to the interrupt system - */ -void __init connect_bsp_APIC(void) -{ -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's - * local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -#endif - enable_apic_mode(); -} - -/** - * disconnect_bsp_APIC - detach the APIC from the interrupt system - * @virt_wire_setup: indicates, whether virtual wire mode is selected - * - * Virtual wire mode is necessary to deliver legacy interrupts even when the - * APIC is disabled. - */ -void disconnect_bsp_APIC(int virt_wire_setup) -{ - unsigned int value; - -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect only on - * certain older boards). Note that APIC interrupts, including - * IPIs, won't work beyond this point! The only exception are - * INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - return; - } -#endif - - /* Go back to Virtual Wire compatibility mode */ - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* - * For LVT1 make it edge triggered, active high, - * nmi and enabled - */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - /* - * Validate version - */ - if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) - /* are we being called early in kernel startup? */ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } -#endif - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -#ifdef CONFIG_X86_64 -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} -#endif - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* - * 'active' is true if the local APIC was enabled by us and - * not the BIOS; this signifies that we are also responsible - * for disabling it before entering apm/acpi suspend - */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - -#ifdef HAVE_X2APIC - if (x2apic) - enable_x2apic(); - else -#endif - { - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - } - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - local_irq_restore(flags); - - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -#ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} -#endif - -/* - * APIC command line parameters - */ -static int __init setup_disableapic(char *arg) -{ - disable_apic = 1; - setup_clear_cpu_cap(X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init parse_disable_apic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("noapictimer", parse_disable_apic_timer); - -static int __init parse_nolapic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("nolapic_timer", parse_nolapic_timer); - -static int __init apic_set_verbosity(char *arg) -{ - if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; - ioapic_force = 1; - return 0; -#endif - return -EINVAL; - } - - if (strcmp("debug", arg) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", arg) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", arg); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); diff --git a/arch/x86/kernel/apic_64.c b/arch/x86/kernel/apic_64.c deleted file mode 100644 index 6e99af5ce678..000000000000 --- a/arch/x86/kernel/apic_64.c +++ /dev/null @@ -1,2311 +0,0 @@ -/* - * Local APIC handling, local APIC timers - * - * (c) 1999, 2000 Ingo Molnar - * - * Fixes - * Maciej W. Rozycki : Bits for genuine 82489DX APICs; - * thanks to Eric Gilmore - * and Rolf G. Tews - * for testing these extensively. - * Maciej W. Rozycki : Various updates and fixes. - * Mikael Pettersson : Power Management for UP-APIC. - * Pavel Machek and - * Mikael Pettersson : PM converted to driver model. - */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -/* - * Sanity check - */ -#if ((SPURIOUS_APIC_VECTOR & 0x0F) != 0x0F) -# error SPURIOUS_APIC_VECTOR definition error -#endif - -#ifdef CONFIG_X86_32 -/* - * Knob to control our willingness to enable the local APIC. - * - * +1=force-enable - */ -static int force_enable_local_apic; -/* - * APIC command line parameters - */ -static int __init parse_lapic(char *arg) -{ - force_enable_local_apic = 1; - return 0; -} -early_param("lapic", parse_lapic); -/* Local APIC was disabled by the BIOS and enabled by the kernel */ -static int enabled_via_apicbase; - -#endif - -#ifdef CONFIG_X86_64 -static int apic_calibrate_pmtmr __initdata; -static __init int setup_apicpmtimer(char *s) -{ - apic_calibrate_pmtmr = 1; - notsc_setup(NULL); - return 0; -} -__setup("apicpmtimer", setup_apicpmtimer); -#endif - -#ifdef CONFIG_X86_64 -#define HAVE_X2APIC -#endif - -#ifdef HAVE_X2APIC -int x2apic; -/* x2apic enabled before OS handover */ -int x2apic_preenabled; -int disable_x2apic; -static __init int setup_nox2apic(char *str) -{ - disable_x2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); - return 0; -} -early_param("nox2apic", setup_nox2apic); -#endif - -unsigned long mp_lapic_addr; -int disable_apic; -/* Disable local APIC timer from the kernel commandline or via dmi quirk */ -static int disable_apic_timer __cpuinitdata; -/* Local APIC timer works in C2 */ -int local_apic_timer_c2_ok; -EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); - -int first_system_vector = 0xfe; - -char system_vectors[NR_VECTORS] = { [0 ... NR_VECTORS-1] = SYS_VECTOR_FREE}; - -/* - * Debug level, exported for io_apic.c - */ -unsigned int apic_verbosity; - -int pic_mode; - -/* Have we found an MP table */ -int smp_found_config; - -static struct resource lapic_resource = { - .name = "Local APIC", - .flags = IORESOURCE_MEM | IORESOURCE_BUSY, -}; - -static unsigned int calibration_result; - -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt); -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt); -static void lapic_timer_broadcast(cpumask_t mask); -static void apic_pm_activate(void); - -/* - * The local apic timer can be used for any function which is CPU local. - */ -static struct clock_event_device lapic_clockevent = { - .name = "lapic", - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT - | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY, - .shift = 32, - .set_mode = lapic_timer_setup, - .set_next_event = lapic_next_event, - .broadcast = lapic_timer_broadcast, - .rating = 100, - .irq = -1, -}; -static DEFINE_PER_CPU(struct clock_event_device, lapic_events); - -static unsigned long apic_phys; - -/* - * Get the LAPIC version - */ -static inline int lapic_get_version(void) -{ - return GET_APIC_VERSION(apic_read(APIC_LVR)); -} - -/* - * Check, if the APIC is integrated or a separate chip - */ -static inline int lapic_is_integrated(void) -{ -#ifdef CONFIG_X86_64 - return 1; -#else - return APIC_INTEGRATED(lapic_get_version()); -#endif -} - -/* - * Check, whether this is a modern or a first generation APIC - */ -static int modern_apic(void) -{ - /* AMD systems use old APIC versions, so check the CPU */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD && - boot_cpu_data.x86 >= 0xf) - return 1; - return lapic_get_version() >= 0x14; -} - -/* - * Paravirt kernels also might be using these below ops. So we still - * use generic apic_read()/apic_write(), which might be pointing to different - * ops in PARAVIRT case. - */ -void xapic_wait_icr_idle(void) -{ - while (apic_read(APIC_ICR) & APIC_ICR_BUSY) - cpu_relax(); -} - -u32 safe_xapic_wait_icr_idle(void) -{ - u32 send_status; - int timeout; - - timeout = 0; - do { - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - if (!send_status) - break; - udelay(100); - } while (timeout++ < 1000); - - return send_status; -} - -void xapic_icr_write(u32 low, u32 id) -{ - apic_write(APIC_ICR2, SET_APIC_DEST_FIELD(id)); - apic_write(APIC_ICR, low); -} - -u64 xapic_icr_read(void) -{ - u32 icr1, icr2; - - icr2 = apic_read(APIC_ICR2); - icr1 = apic_read(APIC_ICR); - - return icr1 | ((u64)icr2 << 32); -} - -static struct apic_ops xapic_ops = { - .read = native_apic_mem_read, - .write = native_apic_mem_write, - .icr_read = xapic_icr_read, - .icr_write = xapic_icr_write, - .wait_icr_idle = xapic_wait_icr_idle, - .safe_wait_icr_idle = safe_xapic_wait_icr_idle, -}; - -struct apic_ops __read_mostly *apic_ops = &xapic_ops; -EXPORT_SYMBOL_GPL(apic_ops); - -#ifdef HAVE_X2APIC -static void x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return; -} - -static u32 safe_x2apic_wait_icr_idle(void) -{ - /* no need to wait for icr idle in x2apic */ - return 0; -} - -void x2apic_icr_write(u32 low, u32 id) -{ - wrmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), ((__u64) id) << 32 | low); -} - -u64 x2apic_icr_read(void) -{ - unsigned long val; - - rdmsrl(APIC_BASE_MSR + (APIC_ICR >> 4), val); - return val; -} - -static struct apic_ops x2apic_ops = { - .read = native_apic_msr_read, - .write = native_apic_msr_write, - .icr_read = x2apic_icr_read, - .icr_write = x2apic_icr_write, - .wait_icr_idle = x2apic_wait_icr_idle, - .safe_wait_icr_idle = safe_x2apic_wait_icr_idle, -}; -#endif - -/** - * enable_NMI_through_LVT0 - enable NMI through local vector table 0 - */ -void __cpuinit enable_NMI_through_LVT0(void) -{ - unsigned int v; - - /* unmask and set to NMI */ - v = APIC_DM_NMI; - - /* Level triggered for 82489DX (32bit mode) */ - if (!lapic_is_integrated()) - v |= APIC_LVT_LEVEL_TRIGGER; - - apic_write(APIC_LVT0, v); -} - -#ifdef CONFIG_X86_32 -/** - * get_physical_broadcast - Get number of physical broadcast IDs - */ -int get_physical_broadcast(void) -{ - return modern_apic() ? 0xff : 0xf; -} -#endif - -/** - * lapic_get_maxlvt - get the maximum number of local vector table entries - */ -int lapic_get_maxlvt(void) -{ - unsigned int v; - - v = apic_read(APIC_LVR); - /* - * - we always have APIC integrated on 64bit mode - * - 82489DXs do not report # of LVT entries - */ - return APIC_INTEGRATED(GET_APIC_VERSION(v)) ? GET_APIC_MAXLVT(v) : 2; -} - -/* - * Local APIC timer - */ - -/* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else -#define APIC_DIVISOR 16 -#endif - -/* - * This function sets up the local APIC timer, with a timeout of - * 'clocks' APIC bus clock. During calibration we actually call - * this function twice on the boot CPU, once with a bogus timeout - * value, second time for real. The other (noncalibrating) CPUs - * call this function only once, with the real, calibrated value. - * - * We do reads before writes even if unnecessary, to get around the - * P5 APIC double write bug. - */ -static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) -{ - unsigned int lvtt_value, tmp_value; - - lvtt_value = LOCAL_TIMER_VECTOR; - if (!oneshot) - lvtt_value |= APIC_LVT_TIMER_PERIODIC; - if (!lapic_is_integrated()) - lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV); - - if (!irqen) - lvtt_value |= APIC_LVT_MASKED; - - apic_write(APIC_LVTT, lvtt_value); - - /* - * Divide PICLK by 16 - */ - tmp_value = apic_read(APIC_TDCR); - apic_write(APIC_TDCR, - (tmp_value & ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE)) | - APIC_TDR_DIV_16); - - if (!oneshot) - apic_write(APIC_TMICT, clocks / APIC_DIVISOR); -} - -/* - * Setup extended LVT, AMD specific (K8, family 10h) - * - * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and - * MCE interrupts are supported. Thus MCE offset must be set to 0. - * - * If mask=1, the LVT entry does not generate interrupts while mask=0 - * enables the vector. See also the BKDGs. - */ - -#define APIC_EILVT_LVTOFF_MCE 0 -#define APIC_EILVT_LVTOFF_IBS 1 - -static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) -{ - unsigned long reg = (lvt_off << 4) + APIC_EILVT0; - unsigned int v = (mask << 16) | (msg_type << 8) | vector; - - apic_write(reg, v); -} - -u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_MCE; -} - -u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) -{ - setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); - return APIC_EILVT_LVTOFF_IBS; -} -EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); - -/* - * Program the next event, relative to now - */ -static int lapic_next_event(unsigned long delta, - struct clock_event_device *evt) -{ - apic_write(APIC_TMICT, delta); - return 0; -} - -/* - * Setup the lapic timer in periodic or oneshot mode - */ -static void lapic_timer_setup(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - unsigned long flags; - unsigned int v; - - /* Lapic used as dummy for broadcast ? */ - if (evt->features & CLOCK_EVT_FEAT_DUMMY) - return; - - local_irq_save(flags); - - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - case CLOCK_EVT_MODE_ONESHOT: - __setup_APIC_LVTT(calibration_result, - mode != CLOCK_EVT_MODE_PERIODIC, 1); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - v = apic_read(APIC_LVTT); - v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); - apic_write(APIC_LVTT, v); - break; - case CLOCK_EVT_MODE_RESUME: - /* Nothing to do here */ - break; - } - - local_irq_restore(flags); -} - -/* - * Local APIC timer broadcast function - */ -static void lapic_timer_broadcast(cpumask_t mask) -{ -#ifdef CONFIG_SMP - send_IPI_mask(mask, LOCAL_TIMER_VECTOR); -#endif -} - -/* - * Setup the local APIC timer for this CPU. Copy the initilized values - * of the boot CPU and register the clock event in the framework. - */ -static void __cpuinit setup_APIC_timer(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - - memcpy(levt, &lapic_clockevent, sizeof(*levt)); - levt->cpumask = cpumask_of_cpu(smp_processor_id()); - - clockevents_register_device(levt); -} - -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else -/* - * In this functions we calibrate APIC bus clocks to the external timer. - * - * We want to do the calibration only once since we want to have local timer - * irqs syncron. CPUs connected by the same APIC bus have the very same bus - * frequency. - * - * This was previously done by reading the PIT/HPET and waiting for a wrap - * around to find out, that a tick has elapsed. I have a box, where the PIT - * readout is broken, so it never gets out of the wait loop again. This was - * also reported by others. - * - * Monitoring the jiffies value is inaccurate and the clockevents - * infrastructure allows us to do a simple substitution of the interrupt - * handler. - * - * The calibration routine also uses the pm_timer when possible, as the PIT - * happens to run way too slow (factor 2.3 on my VAIO CoreDuo, which goes - * back to normal later in the boot process). - */ - -#define LAPIC_CAL_LOOPS (HZ/10) - -static __initdata int lapic_cal_loops = -1; -static __initdata long lapic_cal_t1, lapic_cal_t2; -static __initdata unsigned long long lapic_cal_tsc1, lapic_cal_tsc2; -static __initdata unsigned long lapic_cal_pm1, lapic_cal_pm2; -static __initdata unsigned long lapic_cal_j1, lapic_cal_j2; - -/* - * Temporary interrupt handler. - */ -static void __init lapic_cal_handler(struct clock_event_device *dev) -{ - unsigned long long tsc = 0; - long tapic = apic_read(APIC_TMCCT); - unsigned long pm = acpi_pm_read_early(); - - if (cpu_has_tsc) - rdtscll(tsc); - - switch (lapic_cal_loops++) { - case 0: - lapic_cal_t1 = tapic; - lapic_cal_tsc1 = tsc; - lapic_cal_pm1 = pm; - lapic_cal_j1 = jiffies; - break; - - case LAPIC_CAL_LOOPS: - lapic_cal_t2 = tapic; - lapic_cal_tsc2 = tsc; - if (pm < lapic_cal_pm1) - pm += ACPI_PM_OVRRUN; - lapic_cal_pm2 = pm; - lapic_cal_j2 = jiffies; - break; - } -} - -static int __init calibrate_APIC_clock(void) -{ - struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; - void (*real_handler)(struct clock_event_device *dev); - unsigned long deltaj; - long delta, deltapm; - int pm_referenced = 0; - - local_irq_disable(); - - /* Replace the global interrupt handler */ - real_handler = global_clock_event->event_handler; - global_clock_event->event_handler = lapic_cal_handler; - - /* - * Setup the APIC counter to 1e9. There is no way the lapic - * can underflow in the 100ms detection time frame - */ - __setup_APIC_LVTT(1000000000, 0, 0); - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Restore the real event handler */ - global_clock_event->event_handler = real_handler; - - /* Build delta t1-t2 as apic timer counts down */ - delta = lapic_cal_t1 - lapic_cal_t2; - apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); - - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; - - apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); - apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); - apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", - calibration_result); - - if (cpu_has_tsc) { - delta = (long)(lapic_cal_tsc2 - lapic_cal_tsc1); - apic_printk(APIC_VERBOSE, "..... CPU clock speed is " - "%ld.%04ld MHz.\n", - (delta / LAPIC_CAL_LOOPS) / (1000000 / HZ), - (delta / LAPIC_CAL_LOOPS) % (1000000 / HZ)); - } - - apic_printk(APIC_VERBOSE, "..... host bus clock speed is " - "%u.%04u MHz.\n", - calibration_result / (1000000 / HZ), - calibration_result % (1000000 / HZ)); - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - local_irq_enable(); - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - - /* We trust the pm timer based calibration */ - if (!pm_referenced) { - apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); - - /* - * Setup the apic timer manually - */ - levt->event_handler = lapic_cal_handler; - lapic_timer_setup(CLOCK_EVT_MODE_PERIODIC, levt); - lapic_cal_loops = -1; - - /* Let the interrupts run */ - local_irq_enable(); - - while (lapic_cal_loops <= LAPIC_CAL_LOOPS) - cpu_relax(); - - local_irq_disable(); - - /* Stop the lapic timer */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, levt); - - local_irq_enable(); - - /* Jiffies delta */ - deltaj = lapic_cal_j2 - lapic_cal_j1; - apic_printk(APIC_VERBOSE, "... jiffies delta = %lu\n", deltaj); - - /* Check, if the jiffies result is consistent */ - if (deltaj >= LAPIC_CAL_LOOPS-2 && deltaj <= LAPIC_CAL_LOOPS+2) - apic_printk(APIC_VERBOSE, "... jiffies result ok\n"); - else - levt->features |= CLOCK_EVT_FEAT_DUMMY; - } else - local_irq_enable(); - - if (levt->features & CLOCK_EVT_FEAT_DUMMY) { - printk(KERN_WARNING - "APIC timer disabled due to verification failure.\n"); - return -1; - } - - return 0; -} - -#endif - -/* - * Setup the boot APIC - * - * Calibrate and verify the result. - */ -void __init setup_boot_APIC_clock(void) -{ - /* - * The local apic timer can be disabled via the kernel - * commandline or from the CPU detection code. Register the lapic - * timer as a dummy clock event source on SMP systems, so the - * broadcast mechanism is used. On UP systems simply ignore it. - */ - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) { - lapic_clockevent.mult = 1; - setup_APIC_timer(); - } - return; - } - - apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n" - "calibrating APIC timer ...\n"); - - if (calibrate_APIC_clock()) { - /* No broadcast on UP ! */ - if (num_possible_cpus() > 1) - setup_APIC_timer(); - return; - } - - /* - * If nmi_watchdog is set to IO_APIC, we need the - * PIT/HPET going. Otherwise register lapic as a dummy - * device. - */ - if (nmi_watchdog != NMI_IO_APIC) - lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY; - else - printk(KERN_WARNING "APIC timer registered as dummy," - " due to nmi_watchdog=%d!\n", nmi_watchdog); - - /* Setup the lapic or request the broadcast */ - setup_APIC_timer(); -} - -void __cpuinit setup_secondary_APIC_clock(void) -{ - setup_APIC_timer(); -} - -/* - * The guts of the apic timer interrupt - */ -static void local_apic_timer_interrupt(void) -{ - int cpu = smp_processor_id(); - struct clock_event_device *evt = &per_cpu(lapic_events, cpu); - - /* - * Normally we should not be here till LAPIC has been initialized but - * in some cases like kdump, its possible that there is a pending LAPIC - * timer interrupt from previous kernel's context and is delivered in - * new kernel the moment interrupts are enabled. - * - * Interrupts are enabled early and LAPIC is setup much later, hence - * its possible that when we get here evt->event_handler is NULL. - * Check for event_handler being NULL and discard the interrupt as - * spurious. - */ - if (!evt->event_handler) { - printk(KERN_WARNING - "Spurious LAPIC timer interrupt on cpu %d\n", cpu); - /* Switch it off */ - lapic_timer_setup(CLOCK_EVT_MODE_SHUTDOWN, evt); - return; - } - - /* - * the NMI deadlock-detector uses this. - */ -#ifdef CONFIG_X86_64 - add_pda(apic_timer_irqs, 1); -#else - per_cpu(irq_stat, cpu).apic_timer_irqs++; -#endif - - evt->event_handler(evt); -} - -/* - * Local APIC timer interrupt. This is the most natural way for doing - * local interrupts, but local timer interrupts can be emulated by - * broadcast interrupts too. [in case the hw doesn't support APIC timers] - * - * [ if a single-CPU system runs an SMP kernel then we call the local - * interrupt as well. Thus we cannot inline the local irq ... ] - */ -void smp_apic_timer_interrupt(struct pt_regs *regs) -{ - struct pt_regs *old_regs = set_irq_regs(regs); - - /* - * NOTE! We'd better ACK the irq immediately, - * because timer handling can be slow. - */ - ack_APIC_irq(); - /* - * update_process_times() expects us to have done irq_enter(). - * Besides, if we don't timer interrupts ignore the global - * interrupt lock, which is the WrongThing (tm) to do. - */ -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - local_apic_timer_interrupt(); - irq_exit(); - - set_irq_regs(old_regs); -} - -int setup_profiling_timer(unsigned int multiplier) -{ - return -EINVAL; -} - -/* - * Local APIC start and shutdown - */ - -/** - * clear_local_APIC - shutdown the local APIC - * - * This is called, when a CPU is disabled and before rebooting, so the state of - * the local APIC has no dangling leftovers. Also used to cleanout any BIOS - * leftovers during boot. - */ -void clear_local_APIC(void) -{ - int maxlvt; - u32 v; - - /* APIC hasn't been mapped yet */ - if (!apic_phys) - return; - - maxlvt = lapic_get_maxlvt(); - /* - * Masking an LVT entry can trigger a local APIC error - * if the vector is zero. Mask LVTERR first to prevent this. - */ - if (maxlvt >= 3) { - v = ERROR_APIC_VECTOR; /* any non-zero vector will do */ - apic_write(APIC_LVTERR, v | APIC_LVT_MASKED); - } - /* - * Careful: we have to set masks only first to deassert - * any level-triggered sources. - */ - v = apic_read(APIC_LVTT); - apic_write(APIC_LVTT, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT0); - apic_write(APIC_LVT0, v | APIC_LVT_MASKED); - v = apic_read(APIC_LVT1); - apic_write(APIC_LVT1, v | APIC_LVT_MASKED); - if (maxlvt >= 4) { - v = apic_read(APIC_LVTPC); - apic_write(APIC_LVTPC, v | APIC_LVT_MASKED); - } - - /* lets not touch this if we didn't frob it */ -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(X86_MCE_INTEL) - if (maxlvt >= 5) { - v = apic_read(APIC_LVTTHMR); - apic_write(APIC_LVTTHMR, v | APIC_LVT_MASKED); - } -#endif - /* - * Clean APIC state for other OSs: - */ - apic_write(APIC_LVTT, APIC_LVT_MASKED); - apic_write(APIC_LVT0, APIC_LVT_MASKED); - apic_write(APIC_LVT1, APIC_LVT_MASKED); - if (maxlvt >= 3) - apic_write(APIC_LVTERR, APIC_LVT_MASKED); - if (maxlvt >= 4) - apic_write(APIC_LVTPC, APIC_LVT_MASKED); - - /* Integrated APIC (!82489DX) ? */ - if (lapic_is_integrated()) { - if (maxlvt > 3) - /* Clear ESR due to Pentium errata 3AP and 11AP */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } -} - -/** - * disable_local_APIC - clear and disable the local APIC - */ -void disable_local_APIC(void) -{ - unsigned int value; - - clear_local_APIC(); - - /* - * Disable APIC (implies clearing of registers - * for 82489DX!). - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_SPIV_APIC_ENABLED; - apic_write(APIC_SPIV, value); - -#ifdef CONFIG_X86_32 - /* - * When LAPIC was disabled by the BIOS and enabled by the kernel, - * restore the disabled state. - */ - if (enabled_via_apicbase) { - unsigned int l, h; - - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_ENABLE; - wrmsr(MSR_IA32_APICBASE, l, h); - } -#endif -} - -/* - * If Linux enabled the LAPIC against the BIOS default disable it down before - * re-entering the BIOS on shutdown. Otherwise the BIOS may get confused and - * not power-off. Additionally clear all LVT entries before disable_local_APIC - * for the case where Linux didn't enable the LAPIC. - */ -void lapic_shutdown(void) -{ - unsigned long flags; - - if (!cpu_has_apic) - return; - - local_irq_save(flags); - -#ifdef CONFIG_X86_32 - if (!enabled_via_apicbase) - clear_local_APIC(); - else -#endif - disable_local_APIC(); - - - local_irq_restore(flags); -} - -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ APIC_ID_MASK); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ APIC_ID_MASK)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - -/** - * sync_Arb_IDs - synchronize APIC bus arbitration IDs - */ -void __init sync_Arb_IDs(void) -{ - /* - * Unsupported on P4 - see Intel Dev. Manual Vol. 3, Ch. 8.6.1 And not - * needed on AMD. - */ - if (modern_apic() || boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - - /* - * Wait for idle. - */ - apic_wait_icr_idle(); - - apic_printk(APIC_DEBUG, "Synchronizing Arb IDs.\n"); - apic_write(APIC_ICR, APIC_DEST_ALLINC | - APIC_INT_LEVELTRIG | APIC_DM_INIT); -} - -/* - * An initial setup of the virtual wire mode. - */ -void __init init_bsp_APIC(void) -{ - unsigned int value; - - /* - * Don't do the setup now if we have a SMP BIOS as the - * through-I/O-APIC virtual wire mode might be active. - */ - if (smp_found_config || !cpu_has_apic) - return; - - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - - /* - * Enable APIC. - */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - -#ifdef CONFIG_X86_32 - /* This bit is reserved on P4/Xeon and should be cleared */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && - (boot_cpu_data.x86 == 15)) - value &= ~APIC_SPIV_FOCUS_DISABLED; - else -#endif - value |= APIC_SPIV_FOCUS_DISABLED; - value |= SPURIOUS_APIC_VECTOR; - apic_write(APIC_SPIV, value); - - /* - * Set up the virtual wire mode. - */ - apic_write(APIC_LVT0, APIC_DM_EXTINT); - value = APIC_DM_NMI; - if (!lapic_is_integrated()) /* 82489DX */ - value |= APIC_LVT_LEVEL_TRIGGER; - apic_write(APIC_LVT1, value); -} - -static void __cpuinit lapic_setup_esr(void) -{ - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); - - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); - /* - * spec says clear errors after enabling vector. - */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); - } -} - - -/** - * setup_local_APIC - setup the local APIC - */ -void __cpuinit setup_local_APIC(void) -{ - unsigned int value; - int i, j; - -#ifdef CONFIG_X86_32 - /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - apic_write(APIC_ESR, 0); - } -#endif - - preempt_disable(); - - /* - * Double-check whether this APIC is really registered. - * This is meaningless in clustered apic mode, so we skip it. - */ - if (!apic_id_registered()) - BUG(); - - /* - * Intel recommends to set DFR, LDR and TPR before enabling - * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel - * document number 292116). So here it goes... - */ - init_apic_ldr(); - - /* - * Set Task Priority to 'accept all'. We never change this - * later on. - */ - value = apic_read(APIC_TASKPRI); - value &= ~APIC_TPRI_MASK; - apic_write(APIC_TASKPRI, value); - - /* - * After a crash, we no longer service the interrupts and a pending - * interrupt from previous kernel might still have ISR bit set. - * - * Most probably by now CPU has serviced that pending interrupt and - * it might not have done the ack_APIC_irq() because it thought, - * interrupt came from i8259 as ExtInt. LAPIC did not get EOI so it - * does not clear the ISR bit and cpu thinks it has already serivced - * the interrupt. Hence a vector might get locked. It was noticed - * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. - */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1< 1) || - (boot_cpu_data.x86 == 15)) - break; - goto no_apic; - case X86_VENDOR_INTEL: - if (boot_cpu_data.x86 == 6 || boot_cpu_data.x86 == 15 || - (boot_cpu_data.x86 == 5 && cpu_has_apic)) - break; - goto no_apic; - default: - goto no_apic; - } - - if (!cpu_has_apic) { - /* - * Over-ride BIOS and try to enable the local APIC only if - * "lapic" specified. - */ - if (!force_enable_local_apic) { - printk(KERN_INFO "Local APIC disabled by BIOS -- " - "you can enable it with \"lapic\"\n"); - return -1; - } - /* - * Some BIOSes disable the local APIC in the APIC_BASE - * MSR. This can only be done in software for Intel P6 or later - * and AMD K7 (Model > 1) or later. - */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (!(l & MSR_IA32_APICBASE_ENABLE)) { - printk(KERN_INFO - "Local APIC disabled by BIOS -- reenabling.\n"); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE; - wrmsr(MSR_IA32_APICBASE, l, h); - enabled_via_apicbase = 1; - } - } - /* - * The APIC feature bit should now be enabled - * in `cpuid' - */ - features = cpuid_edx(1); - if (!(features & (1 << X86_FEATURE_APIC))) { - printk(KERN_WARNING "Could not enable APIC!\n"); - return -1; - } - set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - mp_lapic_addr = APIC_DEFAULT_PHYS_BASE; - - /* The BIOS may have set up the APIC at some other address */ - rdmsr(MSR_IA32_APICBASE, l, h); - if (l & MSR_IA32_APICBASE_ENABLE) - mp_lapic_addr = l & MSR_IA32_APICBASE_BASE; - - printk(KERN_INFO "Found and enabled local APIC!\n"); - - apic_pm_activate(); - - return 0; - -no_apic: - printk(KERN_INFO "No local APIC present or hardware disabled\n"); - return -1; -} -#endif - -#ifdef CONFIG_X86_64 -void __init early_init_lapic_mapping(void) -{ - unsigned long phys_addr; - - /* - * If no local APIC can be found then go out - * : it means there is no mpatable and MADT - */ - if (!smp_found_config) - return; - - phys_addr = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, phys_addr); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, phys_addr); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - boot_cpu_physical_apicid = read_apic_id(); -} -#endif - -/** - * init_apic_mappings - initialize APIC mappings - */ -void __init init_apic_mappings(void) -{ -#ifdef HAVE_X2APIC - if (x2apic) { - boot_cpu_physical_apicid = read_apic_id(); - return; - } -#endif - - /* - * If no local APIC can be found then set up a fake all - * zeroes page to simulate the local APIC and another - * one for the IO-APIC. - */ - if (!smp_found_config && detect_init_APIC()) { - apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - apic_phys = __pa(apic_phys); - } else - apic_phys = mp_lapic_addr; - - set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", - APIC_BASE, apic_phys); - - /* - * Fetch the APIC ID of the BSP in case we have a - * default configuration (or the MP table is broken). - */ - if (boot_cpu_physical_apicid == -1U) - boot_cpu_physical_apicid = read_apic_id(); -} - -/* - * This initializes the IO-APIC and APIC hardware if this is - * a UP kernel. - */ -int apic_version[MAX_APICS]; - -int __init APIC_init_uniprocessor(void) -{ -#ifdef CONFIG_X86_64 - if (disable_apic) { - printk(KERN_INFO "Apic disabled\n"); - return -1; - } - if (!cpu_has_apic) { - disable_apic = 1; - printk(KERN_INFO "Apic disabled by BIOS\n"); - return -1; - } -#else - if (!smp_found_config && !cpu_has_apic) - return -1; - - /* - * Complain if the BIOS pretends there is one. - */ - if (!cpu_has_apic && - APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); - return -1; - } -#endif - -#ifdef HAVE_X2APIC - enable_IR_x2apic(); -#endif -#ifdef CONFIG_X86_64 - setup_apic_routing(); -#endif - - verify_local_APIC(); - connect_bsp_APIC(); - -#ifdef CONFIG_X86_64 - apic_write(APIC_ID, SET_APIC_ID(boot_cpu_physical_apicid)); -#else - /* - * Hack: In case of kdump, after a crash, kernel might be booting - * on a cpu with non-zero lapic id. But boot_cpu_physical_apicid - * might be zero if read from MP tables. Get it from LAPIC. - */ -# ifdef CONFIG_CRASH_DUMP - boot_cpu_physical_apicid = read_apic_id(); -# endif -#endif - physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); - setup_local_APIC(); - -#ifdef CONFIG_X86_64 - /* - * Now enable IO-APICs, actually call clear_IO_APIC - * We need clear_IO_APIC before enabling vector on BP - */ - if (!skip_ioapic_setup && nr_ioapics) - enable_IO_APIC(); -#endif - -#ifdef CONFIG_X86_IO_APIC - if (!smp_found_config || skip_ioapic_setup || !nr_ioapics) -#endif - localise_nmi_watchdog(); - end_local_APIC_setup(); - -#ifdef CONFIG_X86_IO_APIC - if (smp_found_config && !skip_ioapic_setup && nr_ioapics) - setup_IO_APIC(); -# ifdef CONFIG_X86_64 - else - nr_ioapics = 0; -# endif -#endif - -#ifdef CONFIG_X86_64 - setup_boot_APIC_clock(); - check_nmi_watchdog(); -#else - setup_boot_clock(); -#endif - - return 0; -} - -/* - * Local APIC interrupts - */ - -/* - * This interrupt should _never_ happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else -void smp_spurious_interrupt(struct pt_regs *regs) -#endif -{ - u32 v; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* - * Check if this really is a spurious interrupt and ACK it - * if it is a vectored one. Just in case... - * Spurious interrupts should not be ACKed. - */ - v = apic_read(APIC_ISR + ((SPURIOUS_APIC_VECTOR & ~0x1f) >> 1)); - if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) - ack_APIC_irq(); - -#ifdef CONFIG_X86_64 - add_pda(irq_spurious_count, 1); -#else - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, " - "should never happen.\n", smp_processor_id()); - __get_cpu_var(irq_stat).irq_spurious_count++; -#endif - irq_exit(); -} - -/* - * This interrupt should never happen with our APIC/SMP architecture - */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else -void smp_error_interrupt(struct pt_regs *regs) -#endif -{ - u32 v, v1; - -#ifdef CONFIG_X86_64 - exit_idle(); -#endif - irq_enter(); - /* First tickle the hardware, only then report what went on. -- REW */ - v = apic_read(APIC_ESR); - apic_write(APIC_ESR, 0); - v1 = apic_read(APIC_ESR); - ack_APIC_irq(); - atomic_inc(&irq_err_count); - - /* Here is what the APIC error bits mean: - 0: Send CS error - 1: Receive CS error - 2: Send accept error - 3: Receive accept error - 4: Reserved - 5: Send illegal vector - 6: Received illegal vector - 7: Illegal register address - */ - printk(KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); - irq_exit(); -} - -/** - * connect_bsp_APIC - attach the APIC to the interrupt system - */ -void __init connect_bsp_APIC(void) -{ -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Do not trust the local APIC being empty at bootup. - */ - clear_local_APIC(); - /* - * PIC mode, enable APIC mode in the IMCR, i.e. connect BSP's - * local APIC to INT and NMI lines. - */ - apic_printk(APIC_VERBOSE, "leaving PIC mode, " - "enabling APIC mode.\n"); - outb(0x70, 0x22); - outb(0x01, 0x23); - } -#endif - enable_apic_mode(); -} - -/** - * disconnect_bsp_APIC - detach the APIC from the interrupt system - * @virt_wire_setup: indicates, whether virtual wire mode is selected - * - * Virtual wire mode is necessary to deliver legacy interrupts even when the - * APIC is disabled. - */ -void disconnect_bsp_APIC(int virt_wire_setup) -{ - unsigned int value; - -#ifdef CONFIG_X86_32 - if (pic_mode) { - /* - * Put the board back into PIC mode (has an effect only on - * certain older boards). Note that APIC interrupts, including - * IPIs, won't work beyond this point! The only exception are - * INIT IPIs. - */ - apic_printk(APIC_VERBOSE, "disabling APIC mode, " - "entering PIC mode.\n"); - outb(0x70, 0x22); - outb(0x00, 0x23); - return; - } -#endif - - /* Go back to Virtual Wire compatibility mode */ - - /* For the spurious interrupt use vector F, and enable it */ - value = apic_read(APIC_SPIV); - value &= ~APIC_VECTOR_MASK; - value |= APIC_SPIV_APIC_ENABLED; - value |= 0xf; - apic_write(APIC_SPIV, value); - - if (!virt_wire_setup) { - /* - * For LVT0 make it edge triggered, active high, - * external and enabled - */ - value = apic_read(APIC_LVT0); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_EXTINT); - apic_write(APIC_LVT0, value); - } else { - /* Disable LVT0 */ - apic_write(APIC_LVT0, APIC_LVT_MASKED); - } - - /* - * For LVT1 make it edge triggered, active high, - * nmi and enabled - */ - value = apic_read(APIC_LVT1); - value &= ~(APIC_MODE_MASK | APIC_SEND_PENDING | - APIC_INPUT_POLARITY | APIC_LVT_REMOTE_IRR | - APIC_LVT_LEVEL_TRIGGER | APIC_LVT_MASKED); - value |= APIC_LVT_REMOTE_IRR | APIC_SEND_PENDING; - value = SET_APIC_DELIVERY_MODE(value, APIC_MODE_NMI); - apic_write(APIC_LVT1, value); -} - -void __cpuinit generic_processor_info(int apicid, int version) -{ - int cpu; - cpumask_t tmp_map; - - /* - * Validate version - */ - if (version == 0x0) { - printk(KERN_WARNING "BIOS bug, APIC version is 0 for CPU#%d! " - "fixing up to 0x10. (tell your hw vendor)\n", - version); - version = 0x10; - } - apic_version[apicid] = version; - - if (num_processors >= NR_CPUS) { - printk(KERN_WARNING "WARNING: NR_CPUS limit of %i reached." - " Processor ignored.\n", NR_CPUS); - return; - } - - num_processors++; - cpus_complement(tmp_map, cpu_present_map); - cpu = first_cpu(tmp_map); - - physid_set(apicid, phys_cpu_present_map); - if (apicid == boot_cpu_physical_apicid) { - /* - * x86_bios_cpu_apicid is required to have processors listed - * in same order as logical cpu numbers. Hence the first - * entry is BSP, and so on. - */ - cpu = 0; - } - if (apicid > max_physical_apicid) - max_physical_apicid = apicid; - -#ifdef CONFIG_X86_32 - /* - * Would be preferable to switch to bigsmp when CONFIG_HOTPLUG_CPU=y - * but we need to work other dependencies like SMP_SUSPEND etc - * before this can be done without some confusion. - * if (CPU_HOTPLUG_ENABLED || num_processors > 8) - * - Ashok Raj - */ - if (max_physical_apicid >= 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } -#endif - -#if defined(CONFIG_X86_SMP) || defined(CONFIG_X86_64) - /* are we being called early in kernel startup? */ - if (early_per_cpu_ptr(x86_cpu_to_apicid)) { - u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid); - u16 *bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - - cpu_to_apicid[cpu] = apicid; - bios_cpu_apicid[cpu] = apicid; - } else { - per_cpu(x86_cpu_to_apicid, cpu) = apicid; - per_cpu(x86_bios_cpu_apicid, cpu) = apicid; - } -#endif - - cpu_set(cpu, cpu_possible_map); - cpu_set(cpu, cpu_present_map); -} - -#ifdef CONFIG_X86_64 -int hard_smp_processor_id(void) -{ - return read_apic_id(); -} -#endif - -/* - * Power management - */ -#ifdef CONFIG_PM - -static struct { - /* - * 'active' is true if the local APIC was enabled by us and - * not the BIOS; this signifies that we are also responsible - * for disabling it before entering apm/acpi suspend - */ - int active; - /* r/w apic fields */ - unsigned int apic_id; - unsigned int apic_taskpri; - unsigned int apic_ldr; - unsigned int apic_dfr; - unsigned int apic_spiv; - unsigned int apic_lvtt; - unsigned int apic_lvtpc; - unsigned int apic_lvt0; - unsigned int apic_lvt1; - unsigned int apic_lvterr; - unsigned int apic_tmict; - unsigned int apic_tdcr; - unsigned int apic_thmr; -} apic_pm_state; - -static int lapic_suspend(struct sys_device *dev, pm_message_t state) -{ - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - apic_pm_state.apic_id = apic_read(APIC_ID); - apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); - apic_pm_state.apic_ldr = apic_read(APIC_LDR); - apic_pm_state.apic_dfr = apic_read(APIC_DFR); - apic_pm_state.apic_spiv = apic_read(APIC_SPIV); - apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - if (maxlvt >= 4) - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); - apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); - apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); - apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); - apic_pm_state.apic_tmict = apic_read(APIC_TMICT); - apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); -#endif - - local_irq_save(flags); - disable_local_APIC(); - local_irq_restore(flags); - return 0; -} - -static int lapic_resume(struct sys_device *dev) -{ - unsigned int l, h; - unsigned long flags; - int maxlvt; - - if (!apic_pm_state.active) - return 0; - - maxlvt = lapic_get_maxlvt(); - - local_irq_save(flags); - -#ifdef HAVE_X2APIC - if (x2apic) - enable_x2apic(); - else -#endif - { - /* - * Make sure the APICBASE points to the right address - * - * FIXME! This will be wrong if we ever support suspend on - * SMP! We'll need to do this as part of the CPU restore! - */ - rdmsr(MSR_IA32_APICBASE, l, h); - l &= ~MSR_IA32_APICBASE_BASE; - l |= MSR_IA32_APICBASE_ENABLE | mp_lapic_addr; - wrmsr(MSR_IA32_APICBASE, l, h); - } - - apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED); - apic_write(APIC_ID, apic_pm_state.apic_id); - apic_write(APIC_DFR, apic_pm_state.apic_dfr); - apic_write(APIC_LDR, apic_pm_state.apic_ldr); - apic_write(APIC_TASKPRI, apic_pm_state.apic_taskpri); - apic_write(APIC_SPIV, apic_pm_state.apic_spiv); - apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); - apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); -#if defined(CONFIG_X86_MCE_P4THERMAL) || defined(CONFIG_X86_MCE_INTEL) - if (maxlvt >= 5) - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); -#endif - if (maxlvt >= 4) - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); - apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); - apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); - apic_write(APIC_TMICT, apic_pm_state.apic_tmict); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - - local_irq_restore(flags); - - return 0; -} - -/* - * This device has no shutdown method - fully functioning local APICs - * are needed on every CPU up until machine_halt/restart/poweroff. - */ - -static struct sysdev_class lapic_sysclass = { - .name = "lapic", - .resume = lapic_resume, - .suspend = lapic_suspend, -}; - -static struct sys_device device_lapic = { - .id = 0, - .cls = &lapic_sysclass, -}; - -static void __cpuinit apic_pm_activate(void) -{ - apic_pm_state.active = 1; -} - -static int __init init_lapic_sysfs(void) -{ - int error; - - if (!cpu_has_apic) - return 0; - /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ - - error = sysdev_class_register(&lapic_sysclass); - if (!error) - error = sysdev_register(&device_lapic); - return error; -} -device_initcall(init_lapic_sysfs); - -#else /* CONFIG_PM */ - -static void apic_pm_activate(void) { } - -#endif /* CONFIG_PM */ - -#ifdef CONFIG_X86_64 -/* - * apic_is_clustered_box() -- Check if we can expect good TSC - * - * Thus far, the major user of this is IBM's Summit2 series: - * - * Clustered boxes may have unsynced TSC problems if they are - * multi-chassis. Use available data to take a good guess. - * If in doubt, go HPET. - */ -__cpuinit int apic_is_clustered_box(void) -{ - int i, clusters, zeros; - unsigned id; - u16 *bios_cpu_apicid; - DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS); - - /* - * there is not this kind of box with AMD CPU yet. - * Some AMD box with quadcore cpu and 8 sockets apicid - * will be [4, 0x23] or [8, 0x27] could be thought to - * vsmp box still need checking... - */ - if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box()) - return 0; - - bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid); - bitmap_zero(clustermap, NUM_APIC_CLUSTERS); - - for (i = 0; i < NR_CPUS; i++) { - /* are we being called early in kernel startup? */ - if (bios_cpu_apicid) { - id = bios_cpu_apicid[i]; - } - else if (i < nr_cpu_ids) { - if (cpu_present(i)) - id = per_cpu(x86_bios_cpu_apicid, i); - else - continue; - } - else - break; - - if (id != BAD_APICID) - __set_bit(APIC_CLUSTERID(id), clustermap); - } - - /* Problem: Partially populated chassis may not have CPUs in some of - * the APIC clusters they have been allocated. Only present CPUs have - * x86_bios_cpu_apicid entries, thus causing zeroes in the bitmap. - * Since clusters are allocated sequentially, count zeros only if - * they are bounded by ones. - */ - clusters = 0; - zeros = 0; - for (i = 0; i < NUM_APIC_CLUSTERS; i++) { - if (test_bit(i, clustermap)) { - clusters += 1 + zeros; - zeros = 0; - } else - ++zeros; - } - - /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are - * not guaranteed to be synced between boards - */ - if (is_vsmp_box() && clusters > 1) - return 1; - - /* - * If clusters > 2, then should be multi-chassis. - * May have to revisit this when multi-core + hyperthreaded CPUs come - * out, but AFAIK this will work even for them. - */ - return (clusters > 2); -} -#endif - -/* - * APIC command line parameters - */ -static int __init setup_disableapic(char *arg) -{ - disable_apic = 1; - setup_clear_cpu_cap(X86_FEATURE_APIC); - return 0; -} -early_param("disableapic", setup_disableapic); - -/* same as disableapic, for compatibility */ -static int __init setup_nolapic(char *arg) -{ - return setup_disableapic(arg); -} -early_param("nolapic", setup_nolapic); - -static int __init parse_lapic_timer_c2_ok(char *arg) -{ - local_apic_timer_c2_ok = 1; - return 0; -} -early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); - -static int __init parse_disable_apic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("noapictimer", parse_disable_apic_timer); - -static int __init parse_nolapic_timer(char *arg) -{ - disable_apic_timer = 1; - return 0; -} -early_param("nolapic_timer", parse_nolapic_timer); - -static int __init apic_set_verbosity(char *arg) -{ - if (!arg) { -#ifdef CONFIG_X86_64 - skip_ioapic_setup = 0; - return 0; -#endif - return -EINVAL; - } - - if (strcmp("debug", arg) == 0) - apic_verbosity = APIC_DEBUG; - else if (strcmp("verbose", arg) == 0) - apic_verbosity = APIC_VERBOSE; - else { - printk(KERN_WARNING "APIC Verbosity level %s not recognised" - " use apic=verbose or apic=debug\n", arg); - return -EINVAL; - } - - return 0; -} -early_param("apic", apic_set_verbosity); - -static int __init lapic_insert_resource(void) -{ - if (!apic_phys) - return -1; - - /* Put local APIC into the resource map. */ - lapic_resource.start = apic_phys; - lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; - insert_resource(&iomem_resource, &lapic_resource); - - return 0; -} - -/* - * need call insert after e820_reserve_resources() - * that is using request_resource - */ -late_initcall(lapic_insert_resource); -- cgit v1.2.2 From e492c5ae85428d4a3815d06bf308c590120b928b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 24 Aug 2008 22:41:26 -0700 Subject: x86: let 64 bit to use 32 bit calibrate_apic_clock Use the 32-bit APIC calibration code - it's more mature. Signed-of-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 88 ++------------------------------------------------ 1 file changed, 2 insertions(+), 86 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 6e99af5ce678..ca5ef71f4208 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -478,90 +478,6 @@ static void __cpuinit setup_APIC_timer(void) clockevents_register_device(levt); } -#ifdef CONFIG_X86_64 -/* - * In this function we calibrate APIC bus clocks to the external - * timer. Unfortunately we cannot use jiffies and the timer irq - * to calibrate, since some later bootup code depends on getting - * the first irq? Ugh. - * - * We want to do the calibration only once since we - * want to have local timer irqs syncron. CPUs connected - * by the same APIC bus have the very same bus frequency. - * And we want to have irqs off anyways, no accidental - * APIC irq that way. - */ - -#define TICK_COUNT 100000000 - -static int __init calibrate_APIC_clock(void) -{ - unsigned apic, apic_start; - unsigned long tsc, tsc_start; - int result; - - local_irq_disable(); - - /* - * Put whatever arbitrary (but long enough) timeout - * value into the APIC clock, we just want to get the - * counter running for calibration. - * - * No interrupt enable ! - */ - __setup_APIC_LVTT(250000000, 0, 0); - - apic_start = apic_read(APIC_TMCCT); -#ifdef CONFIG_X86_PM_TIMER - if (apic_calibrate_pmtmr && pmtmr_ioport) { - pmtimer_wait(5000); /* 5ms wait */ - apic = apic_read(APIC_TMCCT); - result = (apic_start - apic) * 1000L / 5; - } else -#endif - { - rdtscll(tsc_start); - - do { - apic = apic_read(APIC_TMCCT); - rdtscll(tsc); - } while ((tsc - tsc_start) < TICK_COUNT && - (apic_start - apic) < TICK_COUNT); - - result = (apic_start - apic) * 1000L * tsc_khz / - (tsc - tsc_start); - } - - local_irq_enable(); - - printk(KERN_DEBUG "APIC timer calibration result %d\n", result); - - printk(KERN_INFO "Detected %d.%03d MHz APIC timer.\n", - result / 1000 / 1000, result / 1000 % 1000); - - /* Calculate the scaled math multiplication factor */ - lapic_clockevent.mult = div_sc(result, NSEC_PER_SEC, - lapic_clockevent.shift); - lapic_clockevent.max_delta_ns = - clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); - lapic_clockevent.min_delta_ns = - clockevent_delta2ns(0xF, &lapic_clockevent); - - calibration_result = (result * APIC_DIVISOR) / HZ; - - /* - * Do a sanity check on the APIC calibration result - */ - if (calibration_result < (1000000 / HZ)) { - printk(KERN_WARNING - "APIC frequency too slow, disabling apic timer\n"); - return -1; - } - - return 0; -} - -#else /* * In this functions we calibrate APIC bus clocks to the external timer. * @@ -659,6 +575,7 @@ static int __init calibrate_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); +#ifdef CONFIG_X86_PM_TIMER /* Check, if the PM timer is available */ deltapm = lapic_cal_pm2 - lapic_cal_pm1; apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); @@ -687,6 +604,7 @@ static int __init calibrate_APIC_clock(void) } pm_referenced = 1; } +#endif /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -773,8 +691,6 @@ static int __init calibrate_APIC_clock(void) return 0; } -#endif - /* * Setup the boot APIC * -- cgit v1.2.2 From 1dd6ba2e179773597e20f17f66049a64e6c4b2ec Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 25 Aug 2008 21:27:26 +0400 Subject: x86: apic - unify smp_spurious/error_interrupt declaration According to entry_64.S we do pass pt_regs pointer into interrupt handlers but don't use them. So we safely may merge the declarations. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index ca5ef71f4208..0556f375e40b 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1659,11 +1659,7 @@ int __init APIC_init_uniprocessor(void) /* * This interrupt should _never_ happen with our APIC/SMP architecture */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_spurious_interrupt(void) -#else void smp_spurious_interrupt(struct pt_regs *regs) -#endif { u32 v; @@ -1694,11 +1690,7 @@ void smp_spurious_interrupt(struct pt_regs *regs) /* * This interrupt should never happen with our APIC/SMP architecture */ -#ifdef CONFIG_X86_64 -asmlinkage void smp_error_interrupt(void) -#else void smp_error_interrupt(struct pt_regs *regs) -#endif { u32 v, v1; -- cgit v1.2.2 From a11b5abef50722e42a7d13f6b799c4f606fcb797 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 3 Sep 2008 16:58:31 -0700 Subject: x2apic: fix reserved APIC register accesses in print_local_APIC() APIC_ARBPRI is a reserved register for XAPIC and beyond. APIC_RRR is a reserved register except for 82489DX, APIC for Pentium processors. APIC_EOI is a write only register. APIC_DFR is reserved in x2apic mode. Access to these registers in x2apic will result in #GP fault. Fix these apic register accesses. Signed-off-by: Yinghai Lu Signed-off-by: Suresh Siddha Cc: Maciej W. Rozycki Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index d28128e0392c..93ceb19d1e90 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1751,21 +1751,30 @@ __apicdebuginit(void) print_local_APIC(void *dummy) printk(KERN_DEBUG "... APIC TASKPRI: %08x (%02x)\n", v, v & APIC_TPRI_MASK); if (APIC_INTEGRATED(ver)) { /* !82489DX */ - v = apic_read(APIC_ARBPRI); - printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, - v & APIC_ARBPRI_MASK); + if (!APIC_XAPIC(ver)) { + v = apic_read(APIC_ARBPRI); + printk(KERN_DEBUG "... APIC ARBPRI: %08x (%02x)\n", v, + v & APIC_ARBPRI_MASK); + } v = apic_read(APIC_PROCPRI); printk(KERN_DEBUG "... APIC PROCPRI: %08x\n", v); } - v = apic_read(APIC_EOI); - printk(KERN_DEBUG "... APIC EOI: %08x\n", v); - v = apic_read(APIC_RRR); - printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + /* + * Remote read supported only in the 82489DX and local APIC for + * Pentium processors. + */ + if (!APIC_INTEGRATED(ver) || maxlvt == 3) { + v = apic_read(APIC_RRR); + printk(KERN_DEBUG "... APIC RRR: %08x\n", v); + } + v = apic_read(APIC_LDR); printk(KERN_DEBUG "... APIC LDR: %08x\n", v); - v = apic_read(APIC_DFR); - printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + if (!x2apic_enabled()) { + v = apic_read(APIC_DFR); + printk(KERN_DEBUG "... APIC DFR: %08x\n", v); + } v = apic_read(APIC_SPIV); printk(KERN_DEBUG "... APIC SPIV: %08x\n", v); -- cgit v1.2.2 From 02c1df199c990cd21c09a4dffaa06d4e0b7bf2bf Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 20:57:11 +0200 Subject: x86: print out if acpi want physical flat of all Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 9eca5ba7a6b1..2ec2de8d8c46 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -179,8 +179,10 @@ static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) * is an example). */ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && - (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) { + printk(KERN_DEBUG "system APIC only can use physical flat"); return 1; + } #endif return 0; -- cgit v1.2.2 From 676f4a920be27160747439fe71026aa15ec78e5a Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 4 Sep 2008 22:37:49 +0400 Subject: x86: io-apic - use ARRAY_SIZE macro Make the code width a bit shorter with ARRAY_SIZE macro. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 93ceb19d1e90..9b01fdadcb9b 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -166,7 +166,7 @@ static void __init init_work(void *data) memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - legacy_count = sizeof(irq_cfg_legacy)/sizeof(irq_cfg_legacy[0]); + legacy_count = ARRAY_SIZE(irq_cfg_legacy); for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); -- cgit v1.2.2 From ac54a6c9371bacb86bee1db23f7d82e8685c7e17 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 4 Sep 2008 22:37:50 +0400 Subject: x86: io-apic - declare irq_cfg_lock for SPARSE_IRQ only We use irq_cfg_lock lock in SPARSE_IRQ only context so move it under #ifdef and compiler will be happy. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 9b01fdadcb9b..d22fecf828b8 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -147,14 +147,15 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) static struct irq_cfg *irq_cfgx; +#ifdef CONFIG_HAVE_SPARSE_IRQ /* * Protect the irq_cfgx_free freelist: */ static DEFINE_SPINLOCK(irq_cfg_lock); -#ifdef CONFIG_HAVE_SPARSE_IRQ static struct irq_cfg *irq_cfgx_free; #endif + static void __init init_work(void *data) { struct dyn_array *da = data; -- cgit v1.2.2 From b40d575bf0679c45aaf9e1161fc51a6b041b7210 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:16 -0700 Subject: x86: HPET_MSI Refactor code in preparation for HPET_MSI Preparatory patch before the actual HPET MSI changes. Sets up hpet_set_mode and hpet_next_event for the MSI related changes. Just the code refactoring and should be zero functional change. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index acf62fc233da..f7cb5e9e261e 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -227,44 +227,44 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } -static void hpet_legacy_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) +static void hpet_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt, int timer) { unsigned long cfg, cmp, now; uint64_t delta; switch(mode) { case CLOCK_EVT_MODE_PERIODIC: - delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * hpet_clockevent.mult; - delta >>= hpet_clockevent.shift; + delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; + delta >>= evt->shift; now = hpet_readl(HPET_COUNTER); cmp = now + (unsigned long) delta; - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg |= HPET_TN_ENABLE | HPET_TN_PERIODIC | HPET_TN_SETVAL | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); /* * The first write after writing TN_SETVAL to the * config register sets the counter value, the second * write sets the period. */ - hpet_writel(cmp, HPET_T0_CMP); + hpet_writel(cmp, HPET_Tn_CMP(timer)); udelay(1); - hpet_writel((unsigned long) delta, HPET_T0_CMP); + hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); break; case CLOCK_EVT_MODE_ONESHOT: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_PERIODIC; cfg |= HPET_TN_ENABLE | HPET_TN_32BIT; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_UNUSED: case CLOCK_EVT_MODE_SHUTDOWN: - cfg = hpet_readl(HPET_T0_CFG); + cfg = hpet_readl(HPET_Tn_CFG(timer)); cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T0_CFG); + hpet_writel(cfg, HPET_Tn_CFG(timer)); break; case CLOCK_EVT_MODE_RESUME: @@ -273,14 +273,14 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } } -static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) +static int hpet_next_event(unsigned long delta, + struct clock_event_device *evt, int timer) { u32 cnt; cnt = hpet_readl(HPET_COUNTER); cnt += (u32) delta; - hpet_writel(cnt, HPET_T0_CMP); + hpet_writel(cnt, HPET_Tn_CMP(timer)); /* * We need to read back the CMP register to make sure that @@ -292,6 +292,18 @@ static int hpet_legacy_next_event(unsigned long delta, return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } +static void hpet_legacy_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + hpet_set_mode(mode, evt, 0); +} + +static int hpet_legacy_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + return hpet_next_event(delta, evt, 0); +} + /* * Clock source related code */ -- cgit v1.2.2 From 58ac1e76ce77d515bd5cb65dbc465a040da341c6 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:17 -0700 Subject: x86: HPET_MSI Basic HPET_MSI setup code Basic HPET MSI setup code. Routines to perform basic MSI read write in HPET memory map and setting up irq_chip for HPET MSI. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 54 +++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/io_apic.c | 64 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 118 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index f7cb5e9e261e..3f10d16a8348 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -25,6 +27,15 @@ unsigned long hpet_address; static void __iomem *hpet_virt_address; +struct hpet_dev { + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; +}; + unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -304,6 +315,49 @@ static int hpet_legacy_next_event(unsigned long delta, return hpet_next_event(delta, evt, 0); } +/* + * HPET MSI Support + */ + +void hpet_msi_unmask(unsigned int irq) +{ + struct hpet_dev *hdev = get_irq_data(irq); + unsigned long cfg; + + /* unmask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg |= HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_mask(unsigned int irq) +{ + unsigned long cfg; + struct hpet_dev *hdev = get_irq_data(irq); + + /* mask it */ + cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); + cfg &= ~HPET_TN_FSB; + hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); +} + +void hpet_msi_write(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); + hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); +} + +void hpet_msi_read(unsigned int irq, struct msi_msg *msg) +{ + struct hpet_dev *hdev = get_irq_data(irq); + + msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); + msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); + msg->address_hi = 0; +} + /* * Clock source related code */ diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index d22fecf828b8..77fa155becf6 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -41,6 +41,7 @@ #endif #include #include +#include #include #include @@ -56,6 +57,7 @@ #include #include #include +#include #include #include @@ -3522,6 +3524,68 @@ int arch_setup_dmar_msi(unsigned int irq) } #endif +#ifdef CONFIG_HPET_TIMER + +#ifdef CONFIG_SMP +static void hpet_msi_set_affinity(unsigned int irq, cpumask_t mask) +{ + struct irq_cfg *cfg; + struct irq_desc *desc; + struct msi_msg msg; + unsigned int dest; + cpumask_t tmp; + + cpus_and(tmp, mask, cpu_online_map); + if (cpus_empty(tmp)) + return; + + if (assign_irq_vector(irq, mask)) + return; + + cfg = irq_cfg(irq); + cpus_and(tmp, cfg->domain, mask); + dest = cpu_mask_to_apicid(tmp); + + hpet_msi_read(irq, &msg); + + msg.data &= ~MSI_DATA_VECTOR_MASK; + msg.data |= MSI_DATA_VECTOR(cfg->vector); + msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; + msg.address_lo |= MSI_ADDR_DEST_ID(dest); + + hpet_msi_write(irq, &msg); + desc = irq_to_desc(irq); + desc->affinity = mask; +} +#endif /* CONFIG_SMP */ + +struct irq_chip hpet_msi_type = { + .name = "HPET_MSI", + .unmask = hpet_msi_unmask, + .mask = hpet_msi_mask, + .ack = ack_apic_edge, +#ifdef CONFIG_SMP + .set_affinity = hpet_msi_set_affinity, +#endif + .retrigger = ioapic_retrigger_irq, +}; + +int arch_setup_hpet_msi(unsigned int irq) +{ + int ret; + struct msi_msg msg; + + ret = msi_compose_msg(NULL, irq, &msg); + if (ret < 0) + return ret; + + hpet_msi_write(irq, &msg); + set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, + "edge"); + return 0; +} +#endif + #endif /* CONFIG_PCI_MSI */ /* * Hypertransport interrupt support -- cgit v1.2.2 From 4588c1f0354ac96a358b3f9e8e4331c51cf3336f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 6 Sep 2008 14:19:17 +0200 Subject: x86: HPET_MSI Basic HPET_MSI setup code, cleanups small style cleanups. Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 50 +++++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 3f10d16a8348..03d3655734b4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,39 +1,39 @@ #include #include +#include +#include #include #include #include #include -#include -#include -#include #include +#include +#include #include -#include #include -#include +#include -#define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 +#define HPET_MASK CLOCKSOURCE_MASK(32) +#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ -#define FSEC_PER_NSEC 1000000L +#define FSEC_PER_NSEC 1000000L /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ -unsigned long hpet_address; -static void __iomem *hpet_virt_address; +unsigned long hpet_address; +static void __iomem *hpet_virt_address; struct hpet_dev { - struct clock_event_device evt; - unsigned int num; - int cpu; - unsigned int irq; - unsigned int flags; - char name[10]; + struct clock_event_device evt; + unsigned int num; + int cpu; + unsigned int irq; + unsigned int flags; + char name[10]; }; unsigned long hpet_readl(unsigned long a) @@ -70,7 +70,7 @@ static inline void hpet_clear_mapping(void) static int boot_hpet_disable; int hpet_force_user; -static int __init hpet_setup(char* str) +static int __init hpet_setup(char *str) { if (str) { if (!strncmp("disable", str, 7)) @@ -91,7 +91,7 @@ __setup("nohpet", disable_hpet); static inline int is_hpet_capable(void) { - return (!boot_hpet_disable && hpet_address); + return !boot_hpet_disable && hpet_address; } /* @@ -122,10 +122,10 @@ static void hpet_reserve_platform_timers(unsigned long id) nrtimers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT) + 1; - memset(&hd, 0, sizeof (hd)); - hd.hd_phys_address = hpet_address; - hd.hd_address = hpet; - hd.hd_nirqs = nrtimers; + memset(&hd, 0, sizeof(hd)); + hd.hd_phys_address = hpet_address; + hd.hd_address = hpet; + hd.hd_nirqs = nrtimers; hpet_reserve_timer(&hd, 0); #ifdef CONFIG_HPET_EMULATE_RTC @@ -141,8 +141,8 @@ static void hpet_reserve_platform_timers(unsigned long id) hd.hd_irq[1] = HPET_LEGACY_RTC; for (i = 2; i < nrtimers; timer++, i++) { - hd.hd_irq[i] = (readl(&timer->hpet_config) & Tn_INT_ROUTE_CNF_MASK) >> - Tn_INT_ROUTE_CNF_SHIFT; + hd.hd_irq[i] = (readl(&timer->hpet_config) & + Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } hpet_alloc(&hd); @@ -244,7 +244,7 @@ static void hpet_set_mode(enum clock_event_mode mode, unsigned long cfg, cmp, now; uint64_t delta; - switch(mode) { + switch (mode) { case CLOCK_EVT_MODE_PERIODIC: delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; delta >>= evt->shift; -- cgit v1.2.2 From 26afe5f2fbf06ea0765aaa316640c4dd472310c0 Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 5 Sep 2008 18:02:18 -0700 Subject: x86: HPET_MSI Initialise per-cpu HPET timers Initialize a per CPU HPET MSI timer when possible. We retain the HPET timer 0 (IRQ 0) and timer 1 (IRQ 8) as is when legacy mode is being used. We setup the remaining HPET timers as per CPU MSI based timers. This per CPU timer will eliminate the need for timer broadcasting with IRQ 0 when there is non-functional LAPIC timer across CPU deep C-states. If there are more CPUs than number of available timers, CPUs that do not find any timer to use will continue using LAPIC and IRQ 0 broadcast. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 295 ++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 293 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 03d3655734b4..31e9191b7e19 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -21,10 +21,19 @@ NSEC = 10^-9 */ #define FSEC_PER_NSEC 1000000L +#define HPET_DEV_USED_BIT 2 +#define HPET_DEV_USED (1 << HPET_DEV_USED_BIT) +#define HPET_DEV_VALID 0x8 +#define HPET_DEV_FSB_CAP 0x1000 +#define HPET_DEV_PERI_CAP 0x2000 + +#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) + /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ unsigned long hpet_address; +unsigned long hpet_num_timers; static void __iomem *hpet_virt_address; struct hpet_dev { @@ -36,6 +45,10 @@ struct hpet_dev { char name[10]; }; +static struct hpet_dev *hpet_devs; + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); + unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -145,6 +158,16 @@ static void hpet_reserve_platform_timers(unsigned long id) Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } + for (i = 0; i < nrtimers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd.hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(&hd, hdev->num); + } + hpet_alloc(&hd); } @@ -238,6 +261,8 @@ static void hpet_legacy_clockevent_register(void) printk(KERN_DEBUG "hpet clockevent registered\n"); } +static int hpet_setup_msi_irq(unsigned int irq); + static void hpet_set_mode(enum clock_event_mode mode, struct clock_event_device *evt, int timer) { @@ -279,7 +304,15 @@ static void hpet_set_mode(enum clock_event_mode mode, break; case CLOCK_EVT_MODE_RESUME: - hpet_enable_legacy_int(); + if (timer == 0) { + hpet_enable_legacy_int(); + } else { + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_setup_msi_irq(hdev->irq); + disable_irq(hdev->irq); + irq_set_affinity(hdev->irq, cpumask_of_cpu(hdev->cpu)); + enable_irq(hdev->irq); + } break; } } @@ -318,7 +351,7 @@ static int hpet_legacy_next_event(unsigned long delta, /* * HPET MSI Support */ - +#ifdef CONFIG_PCI_MSI void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); @@ -358,6 +391,253 @@ void hpet_msi_read(unsigned int irq, struct msi_msg *msg) msg->address_hi = 0; } +static void hpet_msi_set_mode(enum clock_event_mode mode, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + hpet_set_mode(mode, evt, hdev->num); +} + +static int hpet_msi_next_event(unsigned long delta, + struct clock_event_device *evt) +{ + struct hpet_dev *hdev = EVT_TO_HPET_DEV(evt); + return hpet_next_event(delta, evt, hdev->num); +} + +static int hpet_setup_msi_irq(unsigned int irq) +{ + if (arch_setup_hpet_msi(irq)) { + destroy_irq(irq); + return -EINVAL; + } + return 0; +} + +static int hpet_assign_irq(struct hpet_dev *dev) +{ + unsigned int irq; + + irq = create_irq(); + if (!irq) + return -EINVAL; + + set_irq_data(irq, dev); + + if (hpet_setup_msi_irq(irq)) + return -EINVAL; + + dev->irq = irq; + return 0; +} + +static irqreturn_t hpet_interrupt_handler(int irq, void *data) +{ + struct hpet_dev *dev = (struct hpet_dev *)data; + struct clock_event_device *hevt = &dev->evt; + + if (!hevt->event_handler) { + printk(KERN_INFO "Spurious HPET timer interrupt on HPET timer %d\n", + dev->num); + return IRQ_HANDLED; + } + + hevt->event_handler(hevt); + return IRQ_HANDLED; +} + +static int hpet_setup_irq(struct hpet_dev *dev) +{ + + if (request_irq(dev->irq, hpet_interrupt_handler, + IRQF_SHARED|IRQF_NOBALANCING, dev->name, dev)) + return -1; + + disable_irq(dev->irq); + irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); + enable_irq(dev->irq); + + return 0; +} + +/* This should be called in specific @cpu */ +static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) +{ + struct clock_event_device *evt = &hdev->evt; + uint64_t hpet_freq; + + WARN_ON(cpu != smp_processor_id()); + if (!(hdev->flags & HPET_DEV_VALID)) + return; + + if (hpet_setup_msi_irq(hdev->irq)) + return; + + hdev->cpu = cpu; + per_cpu(cpu_hpet_dev, cpu) = hdev; + evt->name = hdev->name; + hpet_setup_irq(hdev); + evt->irq = hdev->irq; + + evt->rating = 110; + evt->features = CLOCK_EVT_FEAT_ONESHOT; + if (hdev->flags & HPET_DEV_PERI_CAP) + evt->features |= CLOCK_EVT_FEAT_PERIODIC; + + evt->set_mode = hpet_msi_set_mode; + evt->set_next_event = hpet_msi_next_event; + evt->shift = 32; + + /* + * The period is a femto seconds value. We need to calculate the + * scaled math multiplication factor for nanosecond to hpet tick + * conversion. + */ + hpet_freq = 1000000000000000ULL; + do_div(hpet_freq, hpet_period); + evt->mult = div_sc((unsigned long) hpet_freq, + NSEC_PER_SEC, evt->shift); + /* Calculate the max delta */ + evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt); + /* 5 usec minimum reprogramming delta. */ + evt->min_delta_ns = 5000; + + evt->cpumask = cpumask_of_cpu(hdev->cpu); + clockevents_register_device(evt); +} + +#ifdef CONFIG_HPET +/* Reserve at least one timer for userspace (/dev/hpet) */ +#define RESERVE_TIMERS 1 +#else +#define RESERVE_TIMERS 0 +#endif +void hpet_msi_capability_lookup(unsigned int start_timer) +{ + unsigned int id; + unsigned int num_timers; + unsigned int num_timers_used = 0; + int i; + + id = hpet_readl(HPET_ID); + + num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); + num_timers++; /* Value read out starts from 0 */ + + hpet_devs = kzalloc(sizeof(struct hpet_dev) * num_timers, GFP_KERNEL); + if (!hpet_devs) + return; + + hpet_num_timers = num_timers; + + for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { + struct hpet_dev *hdev = &hpet_devs[num_timers_used]; + unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); + + /* Only consider HPET timer with MSI support */ + if (!(cfg & HPET_TN_FSB_CAP)) + continue; + + hdev->flags = 0; + if (cfg & HPET_TN_PERIODIC_CAP) + hdev->flags |= HPET_DEV_PERI_CAP; + hdev->num = i; + + sprintf(hdev->name, "hpet%d", i); + if (hpet_assign_irq(hdev)) + continue; + + hdev->flags |= HPET_DEV_FSB_CAP; + hdev->flags |= HPET_DEV_VALID; + num_timers_used++; + if (num_timers_used == num_possible_cpus()) + break; + } + + printk(KERN_INFO "HPET: %d timers in total, %d timers will be used for per-cpu timer\n", + num_timers, num_timers_used); +} + +static struct hpet_dev *hpet_get_unused_timer(void) +{ + int i; + + if (!hpet_devs) + return NULL; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + if (test_and_set_bit(HPET_DEV_USED_BIT, + (unsigned long *)&hdev->flags)) + continue; + return hdev; + } + return NULL; +} + +struct hpet_work_struct { + struct delayed_work work; + struct completion complete; +}; + +static void hpet_work(struct work_struct *w) +{ + struct hpet_dev *hdev; + int cpu = smp_processor_id(); + struct hpet_work_struct *hpet_work; + + hpet_work = container_of(w, struct hpet_work_struct, work.work); + + hdev = hpet_get_unused_timer(); + if (hdev) + init_one_hpet_msi_clockevent(hdev, cpu); + + complete(&hpet_work->complete); +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + unsigned long cpu = (unsigned long)hcpu; + struct hpet_work_struct work; + struct hpet_dev *hdev = per_cpu(cpu_hpet_dev, cpu); + + switch (action & 0xf) { + case CPU_ONLINE: + INIT_DELAYED_WORK(&work.work, hpet_work); + init_completion(&work.complete); + /* FIXME: add schedule_work_on() */ + schedule_delayed_work_on(cpu, &work.work, 0); + wait_for_completion(&work.complete); + break; + case CPU_DEAD: + if (hdev) { + free_irq(hdev->irq, hdev); + hdev->flags &= ~HPET_DEV_USED; + per_cpu(cpu_hpet_dev, cpu) = NULL; + } + break; + } + return NOTIFY_OK; +} +#else + +void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +static int hpet_cpuhp_notify(struct notifier_block *n, + unsigned long action, void *hcpu) +{ + return NOTIFY_OK; +} + +#endif + /* * Clock source related code */ @@ -493,8 +773,10 @@ int __init hpet_enable(void) if (id & HPET_ID_LEGSUP) { hpet_legacy_clockevent_register(); + hpet_msi_capability_lookup(2); return 1; } + hpet_msi_capability_lookup(0); return 0; out_nohpet: @@ -511,6 +793,8 @@ out_nohpet: */ static __init int hpet_late_init(void) { + int cpu; + if (boot_hpet_disable) return -ENODEV; @@ -526,6 +810,13 @@ static __init int hpet_late_init(void) hpet_reserve_platform_timers(hpet_readl(HPET_ID)); + for_each_online_cpu(cpu) { + hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); + } + + /* This notifier should be called after workqueue is ready */ + hotcpu_notifier(hpet_cpuhp_notify, -20); + return 0; } fs_initcall(hpet_late_init); -- cgit v1.2.2 From 3c2cbd2490656fb4b6ede586c557a2b09811a432 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 6 Sep 2008 14:15:33 +0400 Subject: x86: io-apic - code style cleaning for setup_IO_APIC_irqs By changing printout form we are able to shrink (and clean up) code a bit. Former printout example: init IO_APIC IRQs IO-APIC (apicid-pin) 1-1, 1-2, 1-3 not connected. IO-APIC (apicid-pin) 2-1, 2-2, 2-3 not connected. New printout example: init IO_APIC IRQs 1-1 1-2 1-3 (apicid-pin) not connected 2-1 2-2 2-3 (apicid-pin) not connected Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 53 +++++++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 25 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 77fa155becf6..4040d575a21e 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1518,41 +1518,44 @@ static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, static void __init setup_IO_APIC_irqs(void) { - int apic, pin, idx, irq, first_notcon = 1; + int apic, pin, idx, irq; + int notcon = 0; apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - - idx = find_irq_entry(apic,pin,mp_INT); - if (idx == -1) { - if (first_notcon) { - apic_printk(APIC_VERBOSE, KERN_DEBUG " IO-APIC (apicid-pin) %d-%d", mp_ioapics[apic].mp_apicid, pin); - first_notcon = 0; - } else - apic_printk(APIC_VERBOSE, ", %d-%d", mp_ioapics[apic].mp_apicid, pin); - continue; - } - if (!first_notcon) { - apic_printk(APIC_VERBOSE, " not connected.\n"); - first_notcon = 1; - } + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + + idx = find_irq_entry(apic, pin, mp_INT); + if (idx == -1) { + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, pin); + if (!notcon) + notcon = 1; + continue; + } - irq = pin_2_irq(idx, apic, pin); + irq = pin_2_irq(idx, apic, pin); #ifdef CONFIG_X86_32 - if (multi_timer_check(apic, irq)) - continue; + if (multi_timer_check(apic, irq)) + continue; #endif - add_pin_to_irq(irq, apic, pin); + add_pin_to_irq(irq, apic, pin); - setup_IO_APIC_irq(apic, pin, irq, - irq_trigger(idx), irq_polarity(idx)); - } + setup_IO_APIC_irq(apic, pin, irq, + irq_trigger(idx), irq_polarity(idx)); + } + if (notcon) { + apic_printk(APIC_VERBOSE, + KERN_DEBUG " (apicid-pin) not connected\n"); + notcon = 0; + } } - if (!first_notcon) - apic_printk(APIC_VERBOSE, " not connected.\n"); + if (notcon) + apic_printk(APIC_VERBOSE, + KERN_DEBUG " (apicid-pin) not connected\n"); } /* -- cgit v1.2.2 From 79c09698cac8df16c5539447d5e1047fde9742e5 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 7 Sep 2008 17:58:57 -0700 Subject: x86: lapic address print out like io apic addr Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0556f375e40b..f3f50858048e 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1548,7 +1548,7 @@ void __init init_apic_mappings(void) apic_phys = mp_lapic_addr; set_fixmap_nocache(FIX_APIC_BASE, apic_phys); - apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n", + apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n", APIC_BASE, apic_phys); /* -- cgit v1.2.2 From 2a554fb132cf804477087057b9b0ff2162984507 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Mon, 8 Sep 2008 19:38:06 +0400 Subject: x86: io-apic - do not use KERN_DEBUG marker too much Do not use KERN_DEBUG several times on the same line being printed. Introduced by mine previous patch, sorry. Reported-by: Yinghai Lu Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4040d575a21e..12e59df026db 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1528,11 +1528,16 @@ static void __init setup_IO_APIC_irqs(void) idx = find_irq_entry(apic, pin, mp_INT); if (idx == -1) { - apic_printk(APIC_VERBOSE, - KERN_DEBUG " %d-%d", - mp_ioapics[apic].mp_apicid, pin); - if (!notcon) + if (!notcon) { notcon = 1; + apic_printk(APIC_VERBOSE, + KERN_DEBUG " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); + } else + apic_printk(APIC_VERBOSE, " %d-%d", + mp_ioapics[apic].mp_apicid, + pin); continue; } @@ -1548,14 +1553,14 @@ static void __init setup_IO_APIC_irqs(void) } if (notcon) { apic_printk(APIC_VERBOSE, - KERN_DEBUG " (apicid-pin) not connected\n"); + " (apicid-pin) not connected\n"); notcon = 0; } } if (notcon) apic_printk(APIC_VERBOSE, - KERN_DEBUG " (apicid-pin) not connected\n"); + " (apicid-pin) not connected\n"); } /* -- cgit v1.2.2 From f0ed4e695faf6766927c8cfbda2bc7530c7210c2 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Mon, 8 Sep 2008 10:18:40 -0700 Subject: x86: using HPET in MSI mode and setting up per CPU HPET timers, fix On Sat, Sep 06, 2008 at 06:03:53AM -0700, Ingo Molnar wrote: > > it crashes two testsystems, the fault on a NULL pointer in hpet init, > with: > > initcall print_all_ICs+0x0/0x520 returned 0 after 26 msecs > calling hpet_late_init+0x0/0x1c0 > BUG: unable to handle kernel NULL pointer dereference at 000000000000008c > IP: [] hpet_late_init+0xfe/0x1c0 > PGD 0 > Oops: 0000 [1] SMP > CPU 0 > Modules linked in: > Pid: 1, comm: swapper Not tainted 2.6.27-rc5 #29725 > RIP: 0010:[] [] hpet_late_init+0xfe/0x1c0 > RSP: 0018:ffff88003fa07dd0 EFLAGS: 00010246 > RAX: 0000000000000000 RBX: 0000000000000003 RCX: 0000000000000000 > RDX: ffffc20000000160 RSI: 0000000000000000 RDI: 0000000000000003 > RBP: ffff88003fa07e90 R08: 0000000000000000 R09: ffff88003fa07dd0 > R10: 0000000000000001 R11: 0000000000000000 R12: ffff88003fa07dd0 > R13: 0000000000000002 R14: ffffc20000000000 R15: 000000006f57e511 > FS: 0000000000000000(0000) GS:ffffffff80cf6a80(0000) knlGS:0000000000000000 > CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b > CR2: 000000000000008c CR3: 0000000000201000 CR4: 00000000000006e0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 > Process swapper (pid: 1, threadinfo ffff88003fa06000, task ffff88003fa08000) > Stack: 00000000fed00000 ffffc20000000000 0000000100000003 0000000800000002 > 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > 0000000000000000 0000000000000000 0000000000000000 0000000000000000 > Call Trace: > [] ? hpet_late_init+0x0/0x1c0 > [] do_one_initcall+0x45/0x190 > [] ? register_irq_proc+0x19/0xe0 > [] ? early_idt_handler+0x0/0x73 > [] kernel_init+0x14c/0x1b0 > [] ? trace_hardirqs_on_thunk+0x3a/0x3f > [] child_rip+0xa/0x11 > [] ? restore_args+0x0/0x30 > [] ? kernel_init+0x0/0x1b0 > [] ? child_rip+0x0/0x11 > Code: 20 48 83 c1 01 48 39 f1 75 e3 44 89 e8 4c 8b 05 29 29 22 00 31 f6 48 8d 78 01 66 66 90 89 f0 48 8d 04 80 48 c1 e0 05 4a 8d 0c 00 81 8c 00 00 00 08 74 26 8b 81 80 00 00 00 8b 91 88 00 00 00 > RIP [] hpet_late_init+0xfe/0x1c0 > RSP > CR2: 000000000000008c > Kernel panic - not syncing: Fatal exception There was one code path, with CONFIG_PCI_MSI disabled, where we were accessing hpet_devs without initialization. That resulted in the above crash. The change below adds a check for hpet_devs. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 31e9191b7e19..01005aeda7d9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -126,6 +126,24 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} + static void hpet_reserve_platform_timers(unsigned long id) { struct hpet __iomem *hpet = hpet_virt_address; @@ -158,15 +176,7 @@ static void hpet_reserve_platform_timers(unsigned long id) Tn_INT_ROUTE_CNF_MASK) >> Tn_INT_ROUTE_CNF_SHIFT; } - for (i = 0; i < nrtimers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; - - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - - hd.hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(&hd, hdev->num); - } + hpet_reserve_msi_timers(&hd); hpet_alloc(&hd); -- cgit v1.2.2 From ba374c9baef910fbc5373541d98c50f15e82c3f8 Mon Sep 17 00:00:00 2001 From: Steven Noonan Date: Mon, 8 Sep 2008 16:19:09 -0700 Subject: x86: fix HPET compiler error when not using CONFIG_PCI_MSI Added dummy function for hpet_setup_msi_irq(). Signed-off-by: Steven Noonan Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 01005aeda7d9..422c577ef691 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -635,6 +635,10 @@ static int hpet_cpuhp_notify(struct notifier_block *n, } #else +static int hpet_setup_msi_irq(unsigned int irq) +{ + return 0; +} void hpet_msi_capability_lookup(unsigned int start_timer) { return; -- cgit v1.2.2 From 87783be4c26d79f5cde245e6e8a9fd2b295e92d7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 10 Sep 2008 22:19:50 +0400 Subject: x86: io-apic - get rid of __DO_ACTION macro Replace __DO_ACTION macro with io_apic_modify_irq function. This allow us to 'grep' definitions being hided by __DO_ACTION macro: __unmask_IO_APIC_irq __mask_IO_APIC_irq __mask_and_edge_IO_APIC_irq __unmask_and_level_IO_APIC_irq Signed-off-by: Cyrill Gorcunov Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 103 +++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 51 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 12e59df026db..ac47384724e3 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -643,65 +643,66 @@ static void __init replace_pin_at_irq(unsigned int irq, add_pin_to_irq(irq, newapic, newpin); } -#define __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ - \ -{ \ - int pin; \ - struct irq_cfg *cfg; \ - struct irq_pin_list *entry; \ - \ - cfg = irq_cfg(irq); \ - entry = cfg->irq_2_pin; \ - for (;;) { \ - unsigned int reg; \ - if (!entry) \ - break; \ - pin = entry->pin; \ - reg = io_apic_read(entry->apic, 0x10 + R + pin*2); \ - reg ACTION_DISABLE; \ - reg ACTION_ENABLE; \ - io_apic_modify(entry->apic, 0x10 + R + pin*2, reg); \ - FINAL; \ - if (!entry->next) \ - break; \ - entry = entry->next; \ - } \ -} - -#define DO_ACTION(name,R, ACTION_ENABLE, ACTION_DISABLE, FINAL) \ - \ - static void name##_IO_APIC_irq (unsigned int irq) \ - __DO_ACTION(R, ACTION_ENABLE, ACTION_DISABLE, FINAL) - -/* mask = 0 */ -DO_ACTION(__unmask, 0, |= 0, &= ~IO_APIC_REDIR_MASKED, ) +static inline void io_apic_modify_irq(unsigned int irq, + int mask_and, int mask_or, + void (*final)(struct irq_pin_list *entry)) +{ + int pin; + struct irq_cfg *cfg; + struct irq_pin_list *entry; + + cfg = irq_cfg(irq); + for (entry = cfg->irq_2_pin; entry != NULL; entry = entry->next) { + unsigned int reg; + pin = entry->pin; + reg = io_apic_read(entry->apic, 0x10 + pin * 2); + reg &= mask_and; + reg |= mask_or; + io_apic_modify(entry->apic, 0x10 + pin * 2, reg); + if (final) + final(entry); + } +} + +static void __unmask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, 0, NULL); +} #ifdef CONFIG_X86_64 -/* - * Synchronize the IO-APIC and the CPU by doing - * a dummy read from the IO-APIC - */ -static inline void io_apic_sync(unsigned int apic) +void io_apic_sync(struct irq_pin_list *entry) { - struct io_apic __iomem *io_apic = io_apic_base(apic); + /* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ + struct io_apic __iomem *io_apic; + io_apic = io_apic_base(entry->apic); readl(&io_apic->data); } -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, io_apic_sync(entry->apic)) - -#else - -/* mask = 1 */ -DO_ACTION(__mask, 0, |= IO_APIC_REDIR_MASKED, &= ~0, ) - -/* mask = 1, trigger = 0 */ -DO_ACTION(__mask_and_edge, 0, |= IO_APIC_REDIR_MASKED, &= ~IO_APIC_REDIR_LEVEL_TRIGGER, ) +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); +} +#else /* CONFIG_X86_32 */ +static void __mask_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~0, IO_APIC_REDIR_MASKED, NULL); +} -/* mask = 0, trigger = 1 */ -DO_ACTION(__unmask_and_level, 0, |= IO_APIC_REDIR_LEVEL_TRIGGER, &= ~IO_APIC_REDIR_MASKED, ) +static void __mask_and_edge_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_LEVEL_TRIGGER, + IO_APIC_REDIR_MASKED, NULL); +} -#endif +static void __unmask_and_level_IO_APIC_irq(unsigned int irq) +{ + io_apic_modify_irq(irq, ~IO_APIC_REDIR_MASKED, + IO_APIC_REDIR_LEVEL_TRIGGER, NULL); +} +#endif /* CONFIG_X86_32 */ static void mask_IO_APIC_irq (unsigned int irq) { -- cgit v1.2.2 From 823b259b80158a5fb694f6784e18b5bae669c599 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Sep 2008 21:56:46 -0700 Subject: x86: print out apic id in hex format Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/amd.c | 2 +- arch/x86/kernel/cpu/intel.c | 2 +- arch/x86/kernel/smpboot.c | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index f3f50858048e..11cb18639581 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1586,7 +1586,7 @@ int __init APIC_init_uniprocessor(void) */ if (!cpu_has_apic && APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { - printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + printk(KERN_ERR "BIOS bug, local APIC 0x%x not detected!...\n", boot_cpu_physical_apicid); clear_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC); return -1; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 32e73520adf7..8f1e31db2ad5 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -249,7 +249,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) } numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 99468dbd08da..cce0b6118d55 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -174,7 +174,7 @@ static void __cpuinit srat_detect_node(void) node = first_node(node_online_map); numa_set_node(cpu, node); - printk(KERN_INFO "CPU %d/%x -> Node %d\n", cpu, apicid, node); + printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node); #endif } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb343..f84de2ff933c 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -543,10 +543,10 @@ static inline void __inquire_remote_apic(int apicid) int timeout; u32 status; - printk(KERN_INFO "Inquiring remote APIC #%d...\n", apicid); + printk(KERN_INFO "Inquiring remote APIC 0x%x...\n", apicid); for (i = 0; i < ARRAY_SIZE(regs); i++) { - printk(KERN_INFO "... APIC #%d %s: ", apicid, names[i]); + printk(KERN_INFO "... APIC 0x%x %s: ", apicid, names[i]); /* * Wait for idle. @@ -874,7 +874,7 @@ do_rest: start_ip = setup_trampoline(); /* So we see what's up */ - printk(KERN_INFO "Booting processor %d/%d ip %lx\n", + printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", cpu, apicid, start_ip); /* -- cgit v1.2.2 From 9df08f109572e88fbdab9a61d5cb0f0eae4ac9fa Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 11:55:37 +0400 Subject: x86: apic - lapic_setup_esr does not handle esr_disable - fix it lapic_setup_esr doesn't handle esr_disable inquire. The error brought in during unification process. Fix it. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 63 ++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 11cb18639581..59550cc017dc 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1081,40 +1081,43 @@ void __init init_bsp_APIC(void) static void __cpuinit lapic_setup_esr(void) { - unsigned long oldvalue, value, maxlvt; - if (lapic_is_integrated() && !esr_disable) { - if (esr_disable) { - /* - * Something untraceable is creating bad interrupts on - * secondary quads ... for the moment, just leave the - * ESR disabled - we can't do anything useful with the - * errors anyway - mbligh - */ - printk(KERN_INFO "Leaving ESR disabled.\n"); - return; - } - /* !82489DX */ - maxlvt = lapic_get_maxlvt(); - if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ - apic_write(APIC_ESR, 0); - oldvalue = apic_read(APIC_ESR); + unsigned int oldvalue, value, maxlvt; + + if (!lapic_is_integrated()) { + printk(KERN_INFO "No ESR for 82489DX.\n"); + return; + } - /* enables sending errors */ - value = ERROR_APIC_VECTOR; - apic_write(APIC_LVTERR, value); + if (esr_disable) { /* - * spec says clear errors after enabling vector. + * Something untraceable is creating bad interrupts on + * secondary quads ... for the moment, just leave the + * ESR disabled - we can't do anything useful with the + * errors anyway - mbligh */ - if (maxlvt > 3) - apic_write(APIC_ESR, 0); - value = apic_read(APIC_ESR); - if (value != oldvalue) - apic_printk(APIC_VERBOSE, "ESR value before enabling " - "vector: 0x%08lx after: 0x%08lx\n", - oldvalue, value); - } else { - printk(KERN_INFO "No ESR for 82489DX.\n"); + printk(KERN_INFO "Leaving ESR disabled.\n"); + return; } + + maxlvt = lapic_get_maxlvt(); + if (maxlvt > 3) /* Due to the Pentium erratum 3AP. */ + apic_write(APIC_ESR, 0); + oldvalue = apic_read(APIC_ESR); + + /* enables sending errors */ + value = ERROR_APIC_VECTOR; + apic_write(APIC_LVTERR, value); + + /* + * spec says clear errors after enabling vector. + */ + if (maxlvt > 3) + apic_write(APIC_ESR, 0); + value = apic_read(APIC_ESR); + if (value != oldvalue) + apic_printk(APIC_VERBOSE, "ESR value before enabling " + "vector: 0x%08x after: 0x%08x\n", + oldvalue, value); } -- cgit v1.2.2 From 08ad776e3c54e6fe8b50939f176538063b69f89e Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sun, 14 Sep 2008 11:55:38 +0400 Subject: x86: apic - skip writting ESR register if we dont have on On 82489DX we don't have ESR register so we should not write it. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 59550cc017dc..d730e8d31727 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1131,7 +1131,7 @@ void __cpuinit setup_local_APIC(void) #ifdef CONFIG_X86_32 /* Pound the ESR really hard over the head with a big hammer - mbligh */ - if (esr_disable) { + if (lapic_is_integrated() && esr_disable) { apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); apic_write(APIC_ESR, 0); -- cgit v1.2.2 From b189892de4da4634560657aedd774752094dbfa0 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 12 Sep 2008 23:58:24 +0400 Subject: x86: apic - fix unused vars warning in calibrate_APIC_clock If we don't have CONFIG_X86_PM_TIMER=y compiler warns about unused variables. Move PM timer based calibration into a separate function and make the code cleaner and the compiler happy as well. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 81 +++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 34 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index d730e8d31727..784116933c70 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -538,14 +538,51 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) } } +static int __init calibrate_by_pmtimer(long deltapm, long *delta) +{ + const long pm_100ms = PMTMR_TICKS_PER_SEC / 10; + const long pm_thresh = pm_100ms / 100; + unsigned long mult; + u64 res; + +#ifndef CONFIG_X86_PM_TIMER + return -1; +#endif + + apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); + + /* Check, if the PM timer is available */ + if (!deltapm) + return -1; + + mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); + + if (deltapm > (pm_100ms - pm_thresh) && + deltapm < (pm_100ms + pm_thresh)) { + apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); + } else { + res = (((u64)deltapm) * mult) >> 22; + do_div(res, 1000000); + printk(KERN_WARNING "APIC calibration not consistent " + "with PM Timer: %ldms instead of 100ms\n", + (long)res); + /* Correct the lapic counter value */ + res = (((u64)(*delta)) * pm_100ms); + do_div(res, deltapm); + printk(KERN_INFO "APIC delta adjusted to PM-Timer: " + "%lu (%ld)\n", (unsigned long)res, *delta); + *delta = (long)res; + } + + return 0; +} + static int __init calibrate_APIC_clock(void) { struct clock_event_device *levt = &__get_cpu_var(lapic_events); - const long pm_100ms = PMTMR_TICKS_PER_SEC/10; - const long pm_thresh = pm_100ms/100; void (*real_handler)(struct clock_event_device *dev); unsigned long deltaj; - long delta, deltapm; + long delta; int pm_referenced = 0; local_irq_disable(); @@ -575,36 +612,9 @@ static int __init calibrate_APIC_clock(void) delta = lapic_cal_t1 - lapic_cal_t2; apic_printk(APIC_VERBOSE, "... lapic delta = %ld\n", delta); -#ifdef CONFIG_X86_PM_TIMER - /* Check, if the PM timer is available */ - deltapm = lapic_cal_pm2 - lapic_cal_pm1; - apic_printk(APIC_VERBOSE, "... PM timer delta = %ld\n", deltapm); - - if (deltapm) { - unsigned long mult; - u64 res; - - mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC, 22); - - if (deltapm > (pm_100ms - pm_thresh) && - deltapm < (pm_100ms + pm_thresh)) { - apic_printk(APIC_VERBOSE, "... PM timer result ok\n"); - } else { - res = (((u64) deltapm) * mult) >> 22; - do_div(res, 1000000); - printk(KERN_WARNING "APIC calibration not consistent " - "with PM Timer: %ldms instead of 100ms\n", - (long)res); - /* Correct the lapic counter value */ - res = (((u64) delta) * pm_100ms); - do_div(res, deltapm); - printk(KERN_INFO "APIC delta adjusted to PM-Timer: " - "%lu (%ld)\n", (unsigned long) res, delta); - delta = (long) res; - } - pm_referenced = 1; - } -#endif + /* we trust the PM based calibration if possible */ + pm_referenced = !calibrate_by_pmtimer(lapic_cal_pm2 - lapic_cal_pm1, + &delta); /* Calculate the scaled math multiplication factor */ lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, @@ -646,7 +656,10 @@ static int __init calibrate_APIC_clock(void) levt->features &= ~CLOCK_EVT_FEAT_DUMMY; - /* We trust the pm timer based calibration */ + /* + * PM timer calibration failed or not turned on + * so lets try APIC timer based calibration + */ if (!pm_referenced) { apic_printk(APIC_VERBOSE, "... verify APIC timer\n"); -- cgit v1.2.2 From 56ffa1a028b9fce3860a247c6fe79fce7cbf425b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Sat, 13 Sep 2008 13:11:16 +0400 Subject: x86: io-apic - do not use KERN_DEBUG marker too much, fix Yinghai Lu reported: | > 0 add_pin_to_irq: irq 15 --> apic 0 pin 15 | > IOAPIC[0]: Set routing entry (8-15 -> 0x3f -> IRQ 15 Mode:0 Active:0) | > 8-16 8-17 8-18 8-19 8-20 8-21 8-22 8-23 (apicid-pin) not connected | > 9-0 9-1 9-2 9-3 9-4 9-5 9-6 9-7 9-8 9-9 9-10 9-11 9-12 9-13 9-14 9-15 | > 9-16 9-17 9-18 9-19 9-20 9-21 9-22 9-23 (apicid-pin) not connected | > | | only first one not connected at first, and ... here is a quick fix for this. Signed-off-by: Cyrill Gorcunov Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index ac47384724e3..6ce5873a406c 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1541,6 +1541,11 @@ static void __init setup_IO_APIC_irqs(void) pin); continue; } + if (notcon) { + apic_printk(APIC_VERBOSE, + " (apicid-pin) not connected\n"); + notcon = 0; + } irq = pin_2_irq(idx, apic, pin); #ifdef CONFIG_X86_32 @@ -1552,11 +1557,6 @@ static void __init setup_IO_APIC_irqs(void) setup_IO_APIC_irq(apic, pin, irq, irq_trigger(idx), irq_polarity(idx)); } - if (notcon) { - apic_printk(APIC_VERBOSE, - " (apicid-pin) not connected\n"); - notcon = 0; - } } if (notcon) -- cgit v1.2.2 From 9d98598d2fc286c8dbcd0b681168639528428db0 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 19 Sep 2008 00:12:26 -0700 Subject: sparseirq: remove some debug print out Signed-off-by: Yinghai Lu Cc: Yinghai Lu Cc: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 6ce5873a406c..01419fdc1d5d 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -277,23 +277,6 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) spin_unlock_irqrestore(&irq_cfg_lock, flags); - printk(KERN_DEBUG "found new irq_cfg for irq %d\n", cfg->irq); -#ifdef CONFIG_HAVE_SPARSE_IRQ_DEBUG - { - /* dump the results */ - struct irq_cfg *cfg; - unsigned long phys; - unsigned long bytes = sizeof(struct irq_cfg); - - printk(KERN_DEBUG "=========================== %d\n", irq); - printk(KERN_DEBUG "irq_cfg dump after get that for %d\n", irq); - for_each_irq_cfg(cfg) { - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg %d ==> [%#lx - %#lx]\n", cfg->irq, phys, phys + bytes); - } - printk(KERN_DEBUG "===========================\n"); - } -#endif return cfg; } #else @@ -597,7 +580,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) cfg->irq_2_pin = entry; entry->apic = apic; entry->pin = pin; - printk(KERN_DEBUG " 0 add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); return; } @@ -613,7 +595,6 @@ static void add_pin_to_irq(unsigned int irq, int apic, int pin) entry = entry->next; entry->apic = apic; entry->pin = pin; - printk(KERN_DEBUG " x add_pin_to_irq: irq %d --> apic %d pin %d\n", irq, apic, pin); } /* -- cgit v1.2.2 From 5ffa4eb2224343ec3fbd492ffe71e28e797435b7 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 18 Sep 2008 23:37:57 +0400 Subject: x86: io-apic - interrupt remapping fix Interrupt remapping could lead to NULL dereference in case of kzalloc failed and memory leak in other way. So fix the both cases. Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" Cc: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 13 ++++++++++--- arch/x86/kernel/io_apic.c | 19 +++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 784116933c70..1f48bd1c9f2d 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -1360,7 +1360,12 @@ void enable_IR_x2apic(void) local_irq_save(flags); mask_8259A(); - save_mask_IO_APIC_setup(); + + ret = save_mask_IO_APIC_setup(); + if (ret) { + printk(KERN_INFO "Saving IO-APIC state failed: %d\n", ret); + goto end; + } ret = enable_intr_remapping(1); @@ -1370,14 +1375,15 @@ void enable_IR_x2apic(void) } if (ret) - goto end; + goto end_restore; if (!x2apic) { x2apic = 1; apic_ops = &x2apic_ops; enable_x2apic(); } -end: + +end_restore: if (ret) /* * IR enabling failed @@ -1386,6 +1392,7 @@ end: else reinit_intr_remapped_IO_APIC(x2apic_preenabled); +end: unmask_8259A(); local_irq_restore(flags); diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 01419fdc1d5d..4cc9cb64c811 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -812,7 +812,7 @@ int save_mask_IO_APIC_setup(void) kzalloc(sizeof(struct IO_APIC_route_entry) * nr_ioapic_registers[apic], GFP_KERNEL); if (!early_ioapic_entries[apic]) - return -ENOMEM; + goto nomem; } for (apic = 0; apic < nr_ioapics; apic++) @@ -826,17 +826,32 @@ int save_mask_IO_APIC_setup(void) ioapic_write_entry(apic, pin, entry); } } + return 0; + +nomem: + for (; apic > 0; apic--) + kfree(early_ioapic_entries[apic]); + kfree(early_ioapic_entries[apic]); + memset(early_ioapic_entries, 0, + ARRAY_SIZE(early_ioapic_entries)); + + return -ENOMEM; } void restore_IO_APIC_setup(void) { int apic, pin; - for (apic = 0; apic < nr_ioapics; apic++) + for (apic = 0; apic < nr_ioapics; apic++) { + if (!early_ioapic_entries[apic]) + break; for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) ioapic_write_entry(apic, pin, early_ioapic_entries[apic][pin]); + kfree(early_ioapic_entries[apic]); + early_ioapic_entries[apic] = NULL; + } } void reinit_intr_remapped_IO_APIC(int intr_remapping) -- cgit v1.2.2 From 8da077d6f31da291ee3a7dd559671cb8ca48cbe2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 23 Sep 2008 08:42:05 -0500 Subject: x86, uv: fix ordering of calls to uv_system_init & uv_cpu_init Fix problem caused by reordering of the calls to uv_cpu_init() & uv_system_init. Originally, uv_cpu_init() was called AFTER uv_system_init. This order was recently broken as a side-effect of other patches. With this patch, initialization of cpu 0 is now done by the system_init call. Signed-off-by: Jack Steiner Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index 33581d94a90e..b689075bbe2f 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -356,7 +356,22 @@ static __init void uv_rtc_init(void) sn_rtc_cycles_per_second = ticks_per_sec; } -static bool uv_system_inited; +/* + * Called on each cpu to initialize the per_cpu UV data area. + * ZZZ hotplug not supported yet + */ +void __cpuinit uv_cpu_init(void) +{ + /* CPU 0 initilization will be done via uv_system_init. */ + if (!uv_blade_info) + return; + + uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; + + if (get_uv_system_type() == UV_NON_UNIQUE_APIC) + set_x2apic_extra_bits(uv_hub_info->pnode); +} + void __init uv_system_init(void) { @@ -448,21 +463,6 @@ void __init uv_system_init(void) map_mmr_high(max_pnode); map_config_high(max_pnode); map_mmioh_high(max_pnode); - uv_system_inited = true; -} -/* - * Called on each cpu to initialize the per_cpu UV data area. - * ZZZ hotplug not supported yet - */ -void __cpuinit uv_cpu_init(void) -{ - BUG_ON(!uv_system_inited); - - uv_blade_info[uv_numa_blade_id()].nr_online_cpus++; - - if (get_uv_system_type() == UV_NON_UNIQUE_APIC) - set_x2apic_extra_bits(uv_hub_info->pnode); + uv_cpu_init(); } - - -- cgit v1.2.2 From 7564676813780e0ba4dacf95338202cb72d2172d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Sep 2008 19:04:36 -0700 Subject: x86: irq no should not use hex in /proc/interrupts Arjan van de Ven noticed that we changed IRQ numbers from decimal to hex in /proc/interrupts - that can break user-space utilities like irqbalanced. Reported-by: Arjan van de Ven Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index b2e1082cf5ad..001772ffc918 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -307,7 +307,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%#x: ",i); + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index c2fca89a3f7e..ec2661091283 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -112,7 +112,7 @@ int show_interrupts(struct seq_file *p, void *v) action = desc->action; if (!action && !any_count) goto skip; - seq_printf(p, "%#x: ",i); + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP seq_printf(p, "%10u ", kstat_irqs(i)); #else -- cgit v1.2.2 From c1370b49cc4f09de5b447ddf688a3988a297dbfc Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 23 Sep 2008 23:00:02 +0400 Subject: x86: io-apic - interrupt remapping fix Clean up obscure for() cycle with straight while() form Signed-off-by: Cyrill Gorcunov Cc: "Maciej W. Rozycki" Acked-by: Suresh Siddha Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4cc9cb64c811..ed68a6f99dc2 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -830,9 +830,8 @@ int save_mask_IO_APIC_setup(void) return 0; nomem: - for (; apic > 0; apic--) - kfree(early_ioapic_entries[apic]); - kfree(early_ioapic_entries[apic]); + while (apic >= 0) + kfree(early_ioapic_entries[apic--]); memset(early_ioapic_entries, 0, ARRAY_SIZE(early_ioapic_entries)); -- cgit v1.2.2 From c81bba49a13cb3376654d0cc93dc069fd619ed76 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 25 Sep 2008 11:53:11 -0700 Subject: x86: print out irq nr for msi/ht, v3 v2: fix hpet compiling error v3: Bjorn want to use dev_printk instead Signed-off-by: Yinghai Lu Cc: Bjorn Helgaas Cc: Jesse Barnes Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 3 +++ arch/x86/kernel/io_apic.c | 5 +++++ 2 files changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 422c577ef691..2913913f4a46 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -467,6 +467,9 @@ static int hpet_setup_irq(struct hpet_dev *dev) irq_set_affinity(dev->irq, cpumask_of_cpu(dev->cpu)); enable_irq(dev->irq); + printk(KERN_DEBUG "hpet: %s irq %d for MSI\n", + dev->name, dev->irq); + return 0; } diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index ed68a6f99dc2..4ee270d30358 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -3354,6 +3354,8 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *desc, int irq) #endif set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); + return 0; } @@ -3586,6 +3588,7 @@ int arch_setup_hpet_msi(unsigned int irq) hpet_msi_write(irq, &msg); set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, "edge"); + return 0; } #endif @@ -3682,6 +3685,8 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); + + dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); } return err; } -- cgit v1.2.2 From 5f79f2f2ad39b5177c52ed08ffd066ea0c1da924 Mon Sep 17 00:00:00 2001 From: Venki Pallipadi Date: Wed, 24 Sep 2008 10:03:17 -0700 Subject: hpet: clean up warning Fix the below compile warnings due to recent HPET MSI changes arch/x86/kernel/hpet.c:48: warning: 'hpet_devs' defined but not used arch/x86/kernel/hpet.c:50: warning: 'per_cpu__cpu_hpet_dev' defined but not used Reported-by: Andrew Morton Signed-off-by: Venkatesh Pallipadi Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 57 +++++++++++++++++++++++++++++++------------------- 1 file changed, 35 insertions(+), 22 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2913913f4a46..77017e834cf7 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -45,10 +45,6 @@ struct hpet_dev { char name[10]; }; -static struct hpet_dev *hpet_devs; - -static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); - unsigned long hpet_readl(unsigned long a) { return readl(hpet_virt_address + a); @@ -126,23 +122,8 @@ EXPORT_SYMBOL_GPL(is_hpet_enabled); * timer 0 and timer 1 in case of RTC emulation. */ #ifdef CONFIG_HPET -static void hpet_reserve_msi_timers(struct hpet_data *hd) -{ - int i; - if (!hpet_devs) - return; - - for (i = 0; i < hpet_num_timers; i++) { - struct hpet_dev *hdev = &hpet_devs[i]; - - if (!(hdev->flags & HPET_DEV_VALID)) - continue; - - hd->hd_irq[hdev->num] = hdev->irq; - hpet_reserve_timer(hd, hdev->num); - } -} +static void hpet_reserve_msi_timers(struct hpet_data *hd); static void hpet_reserve_platform_timers(unsigned long id) { @@ -362,6 +343,10 @@ static int hpet_legacy_next_event(unsigned long delta, * HPET MSI Support */ #ifdef CONFIG_PCI_MSI + +static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); +static struct hpet_dev *hpet_devs; + void hpet_msi_unmask(unsigned int irq) { struct hpet_dev *hdev = get_irq_data(irq); @@ -525,7 +510,8 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) #else #define RESERVE_TIMERS 0 #endif -void hpet_msi_capability_lookup(unsigned int start_timer) + +static void hpet_msi_capability_lookup(unsigned int start_timer) { unsigned int id; unsigned int num_timers; @@ -571,6 +557,26 @@ void hpet_msi_capability_lookup(unsigned int start_timer) num_timers, num_timers_used); } +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) +{ + int i; + + if (!hpet_devs) + return; + + for (i = 0; i < hpet_num_timers; i++) { + struct hpet_dev *hdev = &hpet_devs[i]; + + if (!(hdev->flags & HPET_DEV_VALID)) + continue; + + hd->hd_irq[hdev->num] = hdev->irq; + hpet_reserve_timer(hd, hdev->num); + } +} +#endif + static struct hpet_dev *hpet_get_unused_timer(void) { int i; @@ -642,10 +648,17 @@ static int hpet_setup_msi_irq(unsigned int irq) { return 0; } -void hpet_msi_capability_lookup(unsigned int start_timer) +static void hpet_msi_capability_lookup(unsigned int start_timer) +{ + return; +} + +#ifdef CONFIG_HPET +static void hpet_reserve_msi_timers(struct hpet_data *hd) { return; } +#endif static int hpet_cpuhp_notify(struct notifier_block *n, unsigned long action, void *hcpu) -- cgit v1.2.2 From 4173a0e7371ece227559b44943c6fd456ee470d1 Mon Sep 17 00:00:00 2001 From: Dean Nelson Date: Thu, 2 Oct 2008 12:18:21 -0500 Subject: x86, UV: add uv_setup_irq() and uv_teardown_irq() functions, v3 Provide a means for UV interrupt MMRs to be setup with the message to be sent when an MSI is raised. Signed-off-by: Dean Nelson Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/io_apic.c | 68 +++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/uv_irq.c | 77 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_irq.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 45cecc59d894..acc0e8bf043e 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o - obj-y += bios_uv.o + obj-y += bios_uv.o uv_irq.o obj-y += genx2apic_cluster.o obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 4ee270d30358..260c95a5e6dc 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -58,6 +58,8 @@ #include #include #include +#include +#include #include #include @@ -3692,6 +3694,72 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) } #endif /* CONFIG_HT_IRQ */ +#ifdef CONFIG_X86_64 +/* + * Re-target the irq to the specified CPU and enable the specified MMR located + * on the specified blade to allow the sending of MSIs to the specified CPU. + */ +int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + const cpumask_t *eligible_cpu = get_cpu_mask(cpu); + struct irq_cfg *cfg; + int mmr_pnode; + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + unsigned long flags; + int err; + + err = assign_irq_vector(irq, *eligible_cpu); + if (err != 0) + return err; + + spin_lock_irqsave(&vector_lock, flags); + set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, + irq_name); + spin_unlock_irqrestore(&vector_lock, flags); + + cfg = irq_cfg(irq); + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->vector = cfg->vector; + entry->delivery_mode = INT_DELIVERY_MODE; + entry->dest_mode = INT_DEST_MODE; + entry->polarity = 0; + entry->trigger = 0; + entry->mask = 0; + entry->dest = cpu_mask_to_apicid(*eligible_cpu); + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); + + return irq; +} + +/* + * Disable the specified MMR located on the specified blade so that MSIs are + * longer allowed to be sent. + */ +void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset) +{ + unsigned long mmr_value; + struct uv_IO_APIC_route_entry *entry; + int mmr_pnode; + + mmr_value = 0; + entry = (struct uv_IO_APIC_route_entry *)&mmr_value; + BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long)); + + entry->mask = 1; + + mmr_pnode = uv_blade_to_pnode(mmr_blade); + uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); +} +#endif /* CONFIG_X86_64 */ + int __init io_apic_get_redir_entries (int ioapic) { union IO_APIC_reg_01 reg_01; diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c new file mode 100644 index 000000000000..6bd26c91c30b --- /dev/null +++ b/arch/x86/kernel/uv_irq.c @@ -0,0 +1,77 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * SGI UV IRQ functions + * + * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. + */ + +#include +#include +#include + +static void uv_noop(unsigned int irq) +{ +} + +static unsigned int uv_noop_ret(unsigned int irq) +{ + return 0; +} + +static void uv_ack_apic(unsigned int irq) +{ + ack_APIC_irq(); +} + +struct irq_chip uv_irq_chip = { + .name = "UV-CORE", + .startup = uv_noop_ret, + .shutdown = uv_noop, + .enable = uv_noop, + .disable = uv_noop, + .ack = uv_noop, + .mask = uv_noop, + .unmask = uv_noop, + .eoi = uv_ack_apic, + .end = uv_noop, +}; + +/* + * Set up a mapping of an available irq and vector, and enable the specified + * MMR that defines the MSI that is to be sent to the specified CPU when an + * interrupt is raised. + */ +int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, + unsigned long mmr_offset) +{ + int irq; + int ret; + + irq = create_irq(); + if (irq <= 0) + return -EBUSY; + + ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); + if (ret != irq) + destroy_irq(irq); + + return ret; +} +EXPORT_SYMBOL_GPL(uv_setup_irq); + +/* + * Tear down a mapping of an irq and vector, and disable the specified MMR that + * defined the MSI that was to be sent to the specified CPU when an interrupt + * was raised. + * + * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). + */ +void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) +{ + arch_disable_uv_irq(mmr_blade, mmr_offset); + destroy_irq(irq); +} +EXPORT_SYMBOL_GPL(uv_teardown_irq); -- cgit v1.2.2 From 37762b6ffb37f9e21cbc6f80902aa06b7a053fd7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Oct 2008 11:38:37 +0200 Subject: x86, UV: add uv_setup_irq() and uv_teardown_irq() functions, v3, fix fix: arch/x86/kernel/uv_irq.c: In function 'uv_ack_apic': arch/x86/kernel/uv_irq.c:26: error: implicit declaration of function 'ack_APIC_irq' Signed-off-by: Ingo Molnar --- arch/x86/kernel/uv_irq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 6bd26c91c30b..aeef529917e4 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -10,6 +10,8 @@ #include #include + +#include #include static void uv_noop(unsigned int irq) -- cgit v1.2.2 From a50f70b17541c0060967c6df61133e968bad3652 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:58:54 -0500 Subject: x86: Add UV EFI table entry v4 Look for a UV entry in the EFI tables. Signed-off-by: Russ Anderson Signed-off-by: Paul Jackson Acked-by: Huang Ying Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/efi.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c index 945a31cdd81f..1119d247fe11 100644 --- a/arch/x86/kernel/efi.c +++ b/arch/x86/kernel/efi.c @@ -366,6 +366,10 @@ void __init efi_init(void) SMBIOS_TABLE_GUID)) { efi.smbios = config_tables[i].table; printk(" SMBIOS=0x%lx ", config_tables[i].table); + } else if (!efi_guidcmp(config_tables[i].guid, + UV_SYSTEM_TABLE_GUID)) { + efi.uv_systab = config_tables[i].table; + printk(" UVsystab=0x%lx ", config_tables[i].table); } else if (!efi_guidcmp(config_tables[i].guid, HCDP_TABLE_GUID)) { efi.hcdp = config_tables[i].table; -- cgit v1.2.2 From 7f5942329e0787087a5e4dced838cee711ac2b58 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:15 -0500 Subject: x86: Add UV bios call infrastructure v4 Add the EFI callback function and associated wrapper code. Initialize SAL system table entry info at boot time. Signed-off-by: Russ Anderson Signed-off-by: Paul Jackson Acked-by: Huang Ying Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 101 +++++++++++++++++++++++++++++++-------- arch/x86/kernel/genx2apic_uv_x.c | 1 + 2 files changed, 81 insertions(+), 21 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index fdd585f9c53d..5481eb59f783 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -1,8 +1,6 @@ /* * BIOS run time interface routines. * - * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. - * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -16,33 +14,94 @@ * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson */ +#include +#include +#include #include -const char * -x86_bios_strerror(long status) +struct uv_systab uv_systab; + +s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) { - const char *str; - switch (status) { - case 0: str = "Call completed without error"; break; - case -1: str = "Not implemented"; break; - case -2: str = "Invalid argument"; break; - case -3: str = "Call completed with error"; break; - default: str = "Unknown BIOS status code"; break; - } - return str; + struct uv_systab *tab = &uv_systab; + + if (!tab->function) + /* + * BIOS does not support UV systab + */ + return BIOS_STATUS_UNIMPLEMENTED; + + return efi_call6((void *)__va(tab->function), + (u64)which, a1, a2, a3, a4, a5); } -long -x86_bios_freq_base(unsigned long which, unsigned long *ticks_per_second, - unsigned long *drift_info) +s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) { - struct uv_bios_retval isrv; + unsigned long bios_flags; + s64 ret; + + local_irq_save(bios_flags); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + local_irq_restore(bios_flags); + + return ret; +} + +s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, + u64 a4, u64 a5) +{ + s64 ret; + + preempt_disable(); + ret = uv_bios_call(which, a1, a2, a3, a4, a5); + preempt_enable(); - BIOS_CALL(isrv, BIOS_FREQ_BASE, which, 0, 0, 0, 0, 0, 0); - *ticks_per_second = isrv.v0; - *drift_info = isrv.v1; - return isrv.status; + return ret; +} + +long +x86_bios_freq_base(unsigned long clock_type, unsigned long *ticks_per_second, + unsigned long *drift_info) +{ + return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, + (u64)ticks_per_second, 0, 0, 0); } EXPORT_SYMBOL_GPL(x86_bios_freq_base); + + +#ifdef CONFIG_EFI +void uv_bios_init(void) +{ + struct uv_systab *tab; + + if ((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || + (efi.uv_systab == (unsigned long)NULL)) { + printk(KERN_CRIT "No EFI UV System Table.\n"); + uv_systab.function = (unsigned long)NULL; + return; + } + + tab = (struct uv_systab *)ioremap(efi.uv_systab, + sizeof(struct uv_systab)); + if (strncmp(tab->signature, "UVST", 4) != 0) + printk(KERN_ERR "bad signature in UV system table!"); + + /* + * Copy table to permanent spot for later use. + */ + memcpy(&uv_systab, tab, sizeof(struct uv_systab)); + iounmap(tab); + + printk(KERN_INFO "EFI UV System Table Revision %d\n", tab->revision); +} +#else /* !CONFIG_EFI */ + +void uv_bios_init(void) { } +#endif + diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index b689075bbe2f..aa2e5e15bf69 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -427,6 +427,7 @@ void __init uv_system_init(void) gnode_upper = (((unsigned long)node_id.s.node_id) & ~((1 << n_val) - 1)) << m_val; + uv_bios_init(); uv_rtc_init(); for_each_present_cpu(cpu) { -- cgit v1.2.2 From 922402f15a85f7a064926eb1db68cc52bc4d4a91 Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:33 -0500 Subject: x86: Add UV partition call v4 Add a bios call to return partitioning related info. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/bios_uv.c | 44 +++++++++++++++++++++++++++++++++++----- arch/x86/kernel/genx2apic_uv_x.c | 14 +++++++------ 2 files changed, 47 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c index 5481eb59f783..f0dfe6f17e7e 100644 --- a/arch/x86/kernel/bios_uv.c +++ b/arch/x86/kernel/bios_uv.c @@ -23,6 +23,7 @@ #include #include #include +#include struct uv_systab uv_systab; @@ -65,14 +66,47 @@ s64 uv_bios_call_reentrant(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, return ret; } -long -x86_bios_freq_base(unsigned long clock_type, unsigned long *ticks_per_second, - unsigned long *drift_info) + +long sn_partition_id; +EXPORT_SYMBOL_GPL(sn_partition_id); +long uv_coherency_id; +EXPORT_SYMBOL_GPL(uv_coherency_id); +long uv_region_size; +EXPORT_SYMBOL_GPL(uv_region_size); +int uv_type; + + +s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, + long *region) +{ + s64 ret; + u64 v0, v1; + union partition_info_u part; + + ret = uv_bios_call_irqsave(UV_BIOS_GET_SN_INFO, fc, + (u64)(&v0), (u64)(&v1), 0, 0); + if (ret != BIOS_STATUS_SUCCESS) + return ret; + + part.val = v0; + if (uvtype) + *uvtype = part.hub_version; + if (partid) + *partid = part.partition_id; + if (coher) + *coher = part.coherence_id; + if (region) + *region = part.region_size; + return ret; +} + + +s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second) { return uv_bios_call(UV_BIOS_FREQ_BASE, clock_type, - (u64)ticks_per_second, 0, 0, 0); + (u64)ticks_per_second, 0, 0, 0); } -EXPORT_SYMBOL_GPL(x86_bios_freq_base); +EXPORT_SYMBOL_GPL(uv_bios_freq_base); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index aa2e5e15bf69..bfd532843df6 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -341,12 +341,12 @@ static __init void map_mmioh_high(int max_pnode) static __init void uv_rtc_init(void) { - long status, ticks_per_sec, drift; + long status; + u64 ticks_per_sec; - status = - x86_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, &ticks_per_sec, - &drift); - if (status != 0 || ticks_per_sec < 100000) { + status = uv_bios_freq_base(BIOS_FREQ_BASE_REALTIME_CLOCK, + &ticks_per_sec); + if (status != BIOS_STATUS_SUCCESS || ticks_per_sec < 100000) { printk(KERN_WARNING "unable to determine platform RTC clock frequency, " "guessing.\n"); @@ -428,6 +428,8 @@ void __init uv_system_init(void) ~((1 << n_val) - 1)) << m_val; uv_bios_init(); + uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, + &uv_coherency_id, &uv_region_size); uv_rtc_init(); for_each_present_cpu(cpu) { @@ -449,7 +451,7 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1; uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; - uv_cpu_hub_info(cpu)->coherency_domain_number = 0;/* ZZZ */ + uv_cpu_hub_info(cpu)->coherency_domain_number = uv_coherency_id; uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); -- cgit v1.2.2 From fc8c2d763bacc02962048fa042e287debb1416aa Mon Sep 17 00:00:00 2001 From: Russ Anderson Date: Fri, 3 Oct 2008 11:59:50 -0500 Subject: x86: Add sysfs entries for UV v4 Create /sys/firmware/sgi_uv sysfs entries for partition_id and coherence_id. Signed-off-by: Russ Anderson Signed-off-by: H. Peter Anvin Signed-off-by: Ingo Molnar --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/uv_sysfs.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/uv_sysfs.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index acc0e8bf043e..cb6ade443f00 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -108,7 +108,7 @@ obj-$(CONFIG_MICROCODE) += microcode.o # 64 bit specific files ifeq ($(CONFIG_X86_64),y) obj-y += genapic_64.o genapic_flat_64.o genx2apic_uv_x.o tlb_uv.o - obj-y += bios_uv.o uv_irq.o + obj-y += bios_uv.o uv_irq.o uv_sysfs.o obj-y += genx2apic_cluster.o obj-y += genx2apic_phys.o obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c new file mode 100644 index 000000000000..67f9b9dbf800 --- /dev/null +++ b/arch/x86/kernel/uv_sysfs.c @@ -0,0 +1,72 @@ +/* + * This file supports the /sys/firmware/sgi_uv interfaces for SGI UV. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. + * Copyright (c) Russ Anderson + */ + +#include +#include + +struct kobject *sgi_uv_kobj; + +static ssize_t partition_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", sn_partition_id); +} + +static ssize_t coherence_id_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%ld\n", partition_coherence_id()); +} + +static struct kobj_attribute partition_id_attr = + __ATTR(partition_id, S_IRUGO, partition_id_show, NULL); + +static struct kobj_attribute coherence_id_attr = + __ATTR(coherence_id, S_IRUGO, coherence_id_show, NULL); + + +static int __init sgi_uv_sysfs_init(void) +{ + unsigned long ret; + + if (!sgi_uv_kobj) + sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); + if (!sgi_uv_kobj) { + printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); + return -EINVAL; + } + + ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); + return ret; + } + + ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); + if (ret) { + printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); + return ret; + } + + return 0; +} + +device_initcall(sgi_uv_sysfs_init); -- cgit v1.2.2 From 4c66a73f0796dacc2ff0d4af75794ec843ceb3d1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 4 Oct 2008 15:52:15 -0700 Subject: x86: sparse_irq: fix typo in debug print out Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 260c95a5e6dc..f959acbc0db2 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -257,7 +257,7 @@ static struct irq_cfg *irq_cfg_alloc(unsigned int irq) panic("please boot with nr_irq_cfg= %d\n", count * 2); phys = __pa(cfg); - printk(KERN_DEBUG "irq_irq ==> [%#lx - %#lx]\n", phys, phys + total_bytes); + printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes); for (i = 0; i < nr_irq_cfg; i++) init_one_irq_cfg(&cfg[i]); -- cgit v1.2.2 From 81608f3c254512b906ab78082ec5966b376aacd5 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Fri, 10 Oct 2008 19:00:17 +0400 Subject: x86: apic - unify APIC_DIVISOR Use APIC_DIVISOR being set to 16 for both 32/64bit mode. To escape APIC timer underflow during calibration set it to the maximum possible value. Also typo error (CONFG instead of proper CONFIG) fixed. The error was caught by Venkatesh Pallipadi, thanks a lot Venkatesh! See details on http://lkml.org/lkml/2008/10/9/425 Reported-by: Venkatesh Pallipad Signed-off-by: Cyrill Gorcunov Acked-by: "Maciej W. Rozycki" Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 1f48bd1c9f2d..04a7f960bbc0 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -332,11 +332,7 @@ int lapic_get_maxlvt(void) */ /* Clock divisor */ -#ifdef CONFG_X86_64 -#define APIC_DIVISOR 1 -#else #define APIC_DIVISOR 16 -#endif /* * This function sets up the local APIC timer, with a timeout of @@ -592,10 +588,10 @@ static int __init calibrate_APIC_clock(void) global_clock_event->event_handler = lapic_cal_handler; /* - * Setup the APIC counter to 1e9. There is no way the lapic + * Setup the APIC counter to maximum. There is no way the lapic * can underflow in the 100ms detection time frame */ - __setup_APIC_LVTT(1000000000, 0, 0); + __setup_APIC_LVTT(0xffffffff, 0, 0); /* Let the interrupts run */ local_irq_enable(); -- cgit v1.2.2 From 3235e936c0cc3589309280b6f59e5096779adae3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 13:16:00 +0200 Subject: x86: remove sparse irq from Kconfig This code is not ready yet. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 600584b7a497..8636ddf2f4a4 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -237,17 +237,6 @@ config SMP If you don't know what to do here, say N. -config HAVE_SPARSE_IRQ - bool "Support sparse irq numbering" - depends on PCI_MSI || HT_IRQ - default y - help - This enables support for sparse irq, esp for msi/msi-x. the irq - number will be bus/dev/fn + 12bit. You may need if you have lots of - cards supports msi-x installed. - - If you don't know what to do here, say Y. - config X86_FIND_SMP_CONFIG def_bool y depends on X86_MPPARSE || X86_VOYAGER -- cgit v1.2.2 From 2cc21ef843d4fb7da122239b644a1f6f0aca60a6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:16:55 +0200 Subject: genirq: remove sparse irq code This code is not ready, but we need to rip it out instead of rebasing as we would lose the APIC/IO_APIC unification otherwise. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic.c | 130 ++-------------------------------------------- arch/x86/kernel/irq_32.c | 8 --- arch/x86/kernel/irq_64.c | 8 --- 3 files changed, 4 insertions(+), 142 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index f959acbc0db2..683610517d2a 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -111,9 +111,6 @@ struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; -#ifdef CONFIG_HAVE_SPARSE_IRQ - struct irq_cfg *next; -#endif struct irq_pin_list *irq_2_pin; cpumask_t domain; cpumask_t old_domain; @@ -151,15 +148,6 @@ static void init_one_irq_cfg(struct irq_cfg *cfg) static struct irq_cfg *irq_cfgx; -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* - * Protect the irq_cfgx_free freelist: - */ -static DEFINE_SPINLOCK(irq_cfg_lock); - -static struct irq_cfg *irq_cfgx_free; -#endif - static void __init init_work(void *data) { struct dyn_array *da = data; @@ -174,114 +162,7 @@ static void __init init_work(void *data) legacy_count = ARRAY_SIZE(irq_cfg_legacy); for (i = legacy_count; i < *da->nr; i++) init_one_irq_cfg(&cfg[i]); - -#ifdef CONFIG_HAVE_SPARSE_IRQ - for (i = 1; i < *da->nr; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = &irq_cfgx[legacy_count]; - irq_cfgx[legacy_count - 1].next = NULL; -#endif -} - -#ifdef CONFIG_HAVE_SPARSE_IRQ -/* need to be biger than size of irq_cfg_legacy */ -static int nr_irq_cfg = 32; - -static int __init parse_nr_irq_cfg(char *arg) -{ - if (arg) { - nr_irq_cfg = simple_strtoul(arg, NULL, 0); - if (nr_irq_cfg < 32) - nr_irq_cfg = 32; - } - return 0; -} - -early_param("nr_irq_cfg", parse_nr_irq_cfg); - -#define for_each_irq_cfg(irqX, cfg) \ - for (cfg = irq_cfgx, irqX = cfg->irq; cfg; cfg = cfg->next, irqX = cfg ? cfg->irq : -1U) - - -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irq_cfg, PAGE_SIZE, init_work); - -static struct irq_cfg *irq_cfg(unsigned int irq) -{ - struct irq_cfg *cfg; - - cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg = cfg->next; - } - - return NULL; -} - -static struct irq_cfg *irq_cfg_alloc(unsigned int irq) -{ - struct irq_cfg *cfg, *cfg_pri; - unsigned long flags; - int count = 0; - int i; - - cfg_pri = cfg = irq_cfgx; - while (cfg) { - if (cfg->irq == irq) - return cfg; - - cfg_pri = cfg; - cfg = cfg->next; - count++; - } - - spin_lock_irqsave(&irq_cfg_lock, flags); - if (!irq_cfgx_free) { - unsigned long phys; - unsigned long total_bytes; - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_cfg %d\n", nr_irq_cfg); - - total_bytes = sizeof(struct irq_cfg) * nr_irq_cfg; - if (after_bootmem) - cfg = kzalloc(total_bytes, GFP_ATOMIC); - else - cfg = __alloc_bootmem_nopanic(total_bytes, PAGE_SIZE, 0); - - if (!cfg) - panic("please boot with nr_irq_cfg= %d\n", count * 2); - - phys = __pa(cfg); - printk(KERN_DEBUG "irq_cfg ==> [%#lx - %#lx]\n", phys, phys + total_bytes); - - for (i = 0; i < nr_irq_cfg; i++) - init_one_irq_cfg(&cfg[i]); - - for (i = 1; i < nr_irq_cfg; i++) - cfg[i-1].next = &cfg[i]; - - irq_cfgx_free = cfg; - } - - cfg = irq_cfgx_free; - irq_cfgx_free = irq_cfgx_free->next; - cfg->next = NULL; - if (cfg_pri) - cfg_pri->next = cfg; - else - irq_cfgx = cfg; - cfg->irq = irq; - - spin_unlock_irqrestore(&irq_cfg_lock, flags); - - return cfg; } -#else #define for_each_irq_cfg(irq, cfg) \ for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) @@ -290,17 +171,16 @@ DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; + if (irq < nr_irqs) + return &irq_cfgx[irq]; - return NULL; + return NULL; } struct irq_cfg *irq_cfg_alloc(unsigned int irq) { - return irq_cfg(irq); + return irq_cfg(irq); } -#endif /* * This is performance-critical, we want to do it O(1) * @@ -3068,9 +2948,7 @@ unsigned int create_irq_nr(unsigned int irq_want) unsigned long flags; struct irq_cfg *cfg_new; -#ifndef CONFIG_HAVE_SPARSE_IRQ irq_want = nr_irqs - 1; -#endif irq = 0; spin_lock_irqsave(&vector_lock, flags); diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 001772ffc918..ccf6c1bf7120 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -272,20 +272,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index ec2661091283..21f53b911113 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -77,20 +77,12 @@ int show_interrupts(struct seq_file *p, void *v) struct irq_desc *desc = NULL; int tail = 0; -#ifdef CONFIG_HAVE_SPARSE_IRQ - desc = (struct irq_desc *)v; - entries = -1U; - i = desc->irq; - if (!desc->next) - tail = 1; -#else entries = nr_irqs - 1; i = *(loff_t *) v; if (i == nr_irqs) tail = 1; else desc = irq_to_desc(i); -#endif if (i == 0) { seq_printf(p, " "); -- cgit v1.2.2 From ee32c9732244bde4b9b59eeac2814c23e2b71f8d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 14:34:09 +0200 Subject: genirq: remove irq_to_desc_alloc Remove the leftover of sparseirqs. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/io_apic.c | 6 +----- arch/x86/kernel/irqinit_32.c | 2 +- arch/x86/kernel/irqinit_64.c | 2 +- 3 files changed, 3 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 683610517d2a..e03bc0f87eef 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -1257,11 +1257,7 @@ static void ioapic_register_intr(int irq, unsigned long trigger) { struct irq_desc *desc; - /* first time to use this irq_desc */ - if (irq < 16) - desc = irq_to_desc(irq); - else - desc = irq_to_desc_alloc(irq); + desc = irq_to_desc(irq); if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || trigger == IOAPIC_LEVEL) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 9092103a18eb..a8d35998d308 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -70,7 +70,7 @@ void __init init_ISA_irqs (void) */ for (i = 0; i < 16; i++) { /* first time call this irq_desc */ - struct irq_desc *desc = irq_to_desc_alloc(i); + struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index d17fbc26d96f..ff0235391285 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -144,7 +144,7 @@ void __init init_ISA_irqs(void) for (i = 0; i < 16; i++) { /* first time call this irq_desc */ - struct irq_desc *desc = irq_to_desc_alloc(i); + struct irq_desc *desc = irq_to_desc(i); desc->status = IRQ_DISABLED; desc->action = NULL; -- cgit v1.2.2 From d6c88a507ef0b6afdb013cba4e7804ba7324d99a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 15 Oct 2008 15:27:23 +0200 Subject: genirq: revert dynarray Revert the dynarray changes. They need more thought and polishing. Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 1 - arch/x86/kernel/io_apic.c | 199 +++++++++++++++------------------------ arch/x86/kernel/setup_percpu.c | 8 +- arch/x86/kernel/visws_quirks.c | 2 +- arch/x86/kernel/vmlinux_32.lds.S | 1 - arch/x86/kernel/vmlinux_64.lds.S | 2 - arch/x86/xen/spinlock.c | 2 +- 7 files changed, 78 insertions(+), 137 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8636ddf2f4a4..8da6123a60d0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -33,7 +33,6 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_EFFICIENT_UNALIGNED_ACCESS - select HAVE_DYN_ARRAY config ARCH_DEFCONFIG string diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index e03bc0f87eef..6f80dc2f137e 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -107,7 +107,6 @@ static int __init parse_noapic(char *str) } early_param("noapic", parse_noapic); -struct irq_cfg; struct irq_pin_list; struct irq_cfg { unsigned int irq; @@ -120,7 +119,7 @@ struct irq_cfg { }; /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ -static struct irq_cfg irq_cfg_legacy[] __initdata = { +static struct irq_cfg irq_cfgx[NR_IRQS] = { [0] = { .irq = 0, .domain = CPU_MASK_ALL, .vector = IRQ0_VECTOR, }, [1] = { .irq = 1, .domain = CPU_MASK_ALL, .vector = IRQ1_VECTOR, }, [2] = { .irq = 2, .domain = CPU_MASK_ALL, .vector = IRQ2_VECTOR, }, @@ -139,48 +138,26 @@ static struct irq_cfg irq_cfg_legacy[] __initdata = { [15] = { .irq = 15, .domain = CPU_MASK_ALL, .vector = IRQ15_VECTOR, }, }; -static struct irq_cfg irq_cfg_init = { .irq = -1U, }; - -static void init_one_irq_cfg(struct irq_cfg *cfg) -{ - memcpy(cfg, &irq_cfg_init, sizeof(struct irq_cfg)); -} - -static struct irq_cfg *irq_cfgx; - -static void __init init_work(void *data) -{ - struct dyn_array *da = data; - struct irq_cfg *cfg; - int legacy_count; - int i; - - cfg = *da->name; - - memcpy(cfg, irq_cfg_legacy, sizeof(irq_cfg_legacy)); - - legacy_count = ARRAY_SIZE(irq_cfg_legacy); - for (i = legacy_count; i < *da->nr; i++) - init_one_irq_cfg(&cfg[i]); -} - #define for_each_irq_cfg(irq, cfg) \ - for (irq = 0, cfg = &irq_cfgx[irq]; irq < nr_irqs; irq++, cfg = &irq_cfgx[irq]) + for (irq = 0, cfg = irq_cfgx; irq < nr_irqs; irq++, cfg++) -DEFINE_DYN_ARRAY(irq_cfgx, sizeof(struct irq_cfg), nr_irqs, PAGE_SIZE, init_work); - -struct irq_cfg *irq_cfg(unsigned int irq) +static struct irq_cfg *irq_cfg(unsigned int irq) { - if (irq < nr_irqs) - return &irq_cfgx[irq]; - - return NULL; + return irq < nr_irqs ? irq_cfgx + irq : NULL; } -struct irq_cfg *irq_cfg_alloc(unsigned int irq) + +static struct irq_cfg *irq_cfg_alloc(unsigned int irq) { return irq_cfg(irq); } +/* + * Rough estimation of how many shared IRQs there are, can be changed + * anytime. + */ +#define MAX_PLUS_SHARED_IRQS NR_IRQS +#define PIN_MAP_SIZE (MAX_PLUS_SHARED_IRQS + NR_IRQS) + /* * This is performance-critical, we want to do it O(1) * @@ -193,59 +170,29 @@ struct irq_pin_list { struct irq_pin_list *next; }; -static struct irq_pin_list *irq_2_pin_head; -/* fill one page ? */ -static int nr_irq_2_pin = 0x100; +static struct irq_pin_list irq_2_pin_head[PIN_MAP_SIZE]; static struct irq_pin_list *irq_2_pin_ptr; -static void __init irq_2_pin_init_work(void *data) + +static void __init irq_2_pin_init(void) { - struct dyn_array *da = data; - struct irq_pin_list *pin; + struct irq_pin_list *pin = irq_2_pin_head; int i; - pin = *da->name; - - for (i = 1; i < *da->nr; i++) + for (i = 1; i < PIN_MAP_SIZE; i++) pin[i-1].next = &pin[i]; irq_2_pin_ptr = &pin[0]; } -DEFINE_DYN_ARRAY(irq_2_pin_head, sizeof(struct irq_pin_list), nr_irq_2_pin, PAGE_SIZE, irq_2_pin_init_work); static struct irq_pin_list *get_one_free_irq_2_pin(void) { - struct irq_pin_list *pin; - int i; - - pin = irq_2_pin_ptr; - - if (pin) { - irq_2_pin_ptr = pin->next; - pin->next = NULL; - return pin; - } - - /* - * we run out of pre-allocate ones, allocate more - */ - printk(KERN_DEBUG "try to get more irq_2_pin %d\n", nr_irq_2_pin); - - if (after_bootmem) - pin = kzalloc(sizeof(struct irq_pin_list)*nr_irq_2_pin, - GFP_ATOMIC); - else - pin = __alloc_bootmem_nopanic(sizeof(struct irq_pin_list) * - nr_irq_2_pin, PAGE_SIZE, 0); + struct irq_pin_list *pin = irq_2_pin_ptr; if (!pin) panic("can not get more irq_2_pin\n"); - for (i = 1; i < nr_irq_2_pin; i++) - pin[i-1].next = &pin[i]; - irq_2_pin_ptr = pin->next; pin->next = NULL; - return pin; } @@ -284,8 +231,9 @@ static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned i static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value) { struct io_apic __iomem *io_apic = io_apic_base(apic); - if (sis_apic_bug) - writel(reg, &io_apic->index); + + if (sis_apic_bug) + writel(reg, &io_apic->index); writel(value, &io_apic->data); } @@ -1044,11 +992,11 @@ static int pin_2_irq(int idx, int apic, int pin) while (i < apic) irq += nr_ioapic_registers[i++]; irq += pin; - /* + /* * For MPS mode, so far only needed by ES7000 platform */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + if (ioapic_renumber_irq) + irq = ioapic_renumber_irq(apic, irq); } #ifdef CONFIG_X86_32 @@ -1232,19 +1180,19 @@ static struct irq_chip ir_ioapic_chip; #ifdef CONFIG_X86_32 static inline int IO_APIC_irq_trigger(int irq) { - int apic, idx, pin; + int apic, idx, pin; - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { - idx = find_irq_entry(apic, pin, mp_INT); - if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) - return irq_trigger(idx); - } - } - /* + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + idx = find_irq_entry(apic, pin, mp_INT); + if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) + return irq_trigger(idx); + } + } + /* * nonexistent IRQs are edge default */ - return 0; + return 0; } #else static inline int IO_APIC_irq_trigger(int irq) @@ -1509,8 +1457,8 @@ __apicdebuginit(void) print_IO_APIC(void) reg_01.raw = io_apic_read(apic, 1); if (reg_01.bits.version >= 0x10) reg_02.raw = io_apic_read(apic, 2); - if (reg_01.bits.version >= 0x20) - reg_03.raw = io_apic_read(apic, 3); + if (reg_01.bits.version >= 0x20) + reg_03.raw = io_apic_read(apic, 3); spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); @@ -2089,9 +2037,9 @@ static int ioapic_retrigger_irq(unsigned int irq) #else static int ioapic_retrigger_irq(unsigned int irq) { - send_IPI_self(irq_cfg(irq)->vector); + send_IPI_self(irq_cfg(irq)->vector); - return 1; + return 1; } #endif @@ -2189,7 +2137,7 @@ static int migrate_irq_remapped_level(int irq) if (io_apic_level_ack_pending(irq)) { /* - * Interrupt in progress. Migrating irq now will change the + * Interrupt in progress. Migrating irq now will change the * vector information in the IO-APIC RTE and that will confuse * the EOI broadcast performed by cpu. * So, delay the irq migration to the next instance. @@ -2426,28 +2374,28 @@ static void ack_apic_level(unsigned int irq) } static struct irq_chip ioapic_chip __read_mostly = { - .name = "IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_apic_edge, - .eoi = ack_apic_level, + .name = "IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_apic_edge, + .eoi = ack_apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ioapic_affinity_irq, + .set_affinity = set_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; #ifdef CONFIG_INTR_REMAP static struct irq_chip ir_ioapic_chip __read_mostly = { - .name = "IR-IO-APIC", - .startup = startup_ioapic_irq, - .mask = mask_IO_APIC_irq, - .unmask = unmask_IO_APIC_irq, - .ack = ack_x2apic_edge, - .eoi = ack_x2apic_level, + .name = "IR-IO-APIC", + .startup = startup_ioapic_irq, + .mask = mask_IO_APIC_irq, + .unmask = unmask_IO_APIC_irq, + .ack = ack_x2apic_edge, + .eoi = ack_x2apic_level, #ifdef CONFIG_SMP - .set_affinity = set_ir_ioapic_affinity_irq, + .set_affinity = set_ir_ioapic_affinity_irq, #endif .retrigger = ioapic_retrigger_irq, }; @@ -2636,8 +2584,8 @@ static inline void __init check_timer(void) local_irq_save(flags); - ver = apic_read(APIC_LVR); - ver = GET_APIC_VERSION(ver); + ver = apic_read(APIC_LVR); + ver = GET_APIC_VERSION(ver); /* * get/set the timer IRQ vector: @@ -2822,12 +2770,12 @@ void __init setup_IO_APIC(void) io_apic_irqs = ~PIC_IRQS; apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); - /* + /* * Set up IO-APIC IRQ routing. */ #ifdef CONFIG_X86_32 - if (!acpi_ioapic) - setup_ioapic_ids_from_mpc(); + if (!acpi_ioapic) + setup_ioapic_ids_from_mpc(); #endif sync_Arb_IDs(); setup_IO_APIC_irqs(); @@ -2842,9 +2790,9 @@ void __init setup_IO_APIC(void) static int __init io_apic_bug_finalize(void) { - if (sis_apic_bug == -1) - sis_apic_bug = 0; - return 0; + if (sis_apic_bug == -1) + sis_apic_bug = 0; + return 0; } late_initcall(io_apic_bug_finalize); @@ -3199,7 +3147,7 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec) if (index < 0) { printk(KERN_ERR "Unable to allocate %d IRTE for PCI %s\n", nvec, - pci_name(dev)); + pci_name(dev)); return -ENOSPC; } return index; @@ -3885,23 +3833,24 @@ static struct resource * __init ioapic_setup_resources(void) void __init ioapic_init_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; - int i; struct resource *ioapic_res; + int i; + irq_2_pin_init(); ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mp_apicaddr; #ifdef CONFIG_X86_32 - if (!ioapic_phys) { - printk(KERN_ERR - "WARNING: bogus zero IO-APIC " - "address found in MPTABLE, " - "disabling IO/APIC support!\n"); - smp_found_config = 0; - skip_ioapic_setup = 1; - goto fake_ioapic_page; - } + if (!ioapic_phys) { + printk(KERN_ERR + "WARNING: bogus zero IO-APIC " + "address found in MPTABLE, " + "disabling IO/APIC support!\n"); + smp_found_config = 0; + skip_ioapic_setup = 1; + goto fake_ioapic_page; + } #endif } else { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2b7dab699e83..410c88f0bfeb 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -140,7 +140,7 @@ static void __init setup_cpu_pda_map(void) */ void __init setup_per_cpu_areas(void) { - ssize_t size, old_size, da_size; + ssize_t size, old_size; char *ptr; int cpu; unsigned long align = 1; @@ -150,9 +150,8 @@ void __init setup_per_cpu_areas(void) /* Copy section for each CPU (we discard the original) */ old_size = PERCPU_ENOUGH_ROOM; - da_size = per_cpu_dyn_array_size(&align); align = max_t(unsigned long, PAGE_SIZE, align); - size = roundup(old_size + da_size, align); + size = roundup(old_size, align); printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n", size); @@ -182,9 +181,6 @@ void __init setup_per_cpu_areas(void) #endif per_cpu_offset(cpu) = ptr - __per_cpu_start; memcpy(ptr, __per_cpu_start, __per_cpu_end - __per_cpu_start); - - per_cpu_alloc_dyn_array(cpu, ptr + old_size); - } printk(KERN_DEBUG "NR_CPUS: %d, nr_cpu_ids: %d, nr_node_ids %d\n", diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c index 817aa55a1209..0c9667f0752a 100644 --- a/arch/x86/kernel/visws_quirks.c +++ b/arch/x86/kernel/visws_quirks.c @@ -633,7 +633,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id) /* * handle this 'virtual interrupt' as a Cobalt one now. */ - kstat_irqs_this_cpu(desc)++; + kstat_incr_irqs_this_cpu(realirq, desc); if (likely(desc->action != NULL)) handle_IRQ_event(realirq, desc->action); diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index c36007ab3940..a9b8560adbc2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -145,7 +145,6 @@ SECTIONS *(.x86_cpu_dev.init) __x86_cpu_dev_end = .; } - DYN_ARRAY_INIT(8) SECURITY_INIT . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 30973dbac8c2..3245ad72594a 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -174,8 +174,6 @@ SECTIONS } __x86_cpu_dev_end = .; - DYN_ARRAY_INIT(8) - SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index bb6bc721b13d..5601506f2dd9 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -241,7 +241,7 @@ static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enabl ADD_STATS(taken_slow_spurious, !xen_test_irq_pending(irq)); } while (!xen_test_irq_pending(irq)); /* check for spurious wakeups */ - kstat_irqs_this_cpu(irq_to_desc(irq))++; + kstat_incr_irqs_this_cpu(irq, irq_to_desc(irq)); out: raw_local_irq_restore(flags); -- cgit v1.2.2 From a1aca5de08a0cb840a90fb3f729a5940f8d21185 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 15 Oct 2008 19:29:15 +0200 Subject: genirq: remove artifacts from sparseirq removal Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 1 - arch/x86/kernel/irqinit_32.c | 1 - arch/x86/kernel/vmlinux_64.lds.S | 1 - arch/x86/mm/init_32.c | 3 --- 4 files changed, 6 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 5fef4fece4a5..0d1c26a583c5 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1254,7 +1254,6 @@ static int __init acpi_parse_madt_ioapic_entries(void) return count; } - count = acpi_table_parse_madt(ACPI_MADT_TYPE_INTERRUPT_OVERRIDE, acpi_parse_int_src_ovr, nr_irqs); diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index a8d35998d308..845aa9803e80 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -184,4 +184,3 @@ void __init native_init_IRQ(void) irq_ctx_init(smp_processor_id()); } - diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 3245ad72594a..46e05447405b 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -173,7 +173,6 @@ SECTIONS *(.x86_cpu_dev.init) } __x86_cpu_dev_end = .; - SECURITY_INIT . = ALIGN(8); diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 91343c2694b4..8396868e82c5 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -66,7 +66,6 @@ static unsigned long __meminitdata table_end; static unsigned long __meminitdata table_top; static int __initdata after_init_bootmem; -int after_bootmem; static __init void *alloc_low_page(unsigned long *phys) { @@ -989,8 +988,6 @@ void __init mem_init(void) set_highmem_pages_init(); - after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; -- cgit v1.2.2 From c0c168ca26b54a4a6ad34fc813fe00f275fbc94c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 11:15:28 +0200 Subject: x86: cleanup show_interrupts The sparseirq patches introduced some more ugliness in show_interrupts(). Clean it up all together and make the code easier to read by splitting out the "tail" function which prints the special interrupts. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq_32.c | 160 +++++++++++++++++++++++------------------------ arch/x86/kernel/irq_64.c | 153 ++++++++++++++++++++++---------------------- 2 files changed, 154 insertions(+), 159 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index ccf6c1bf7120..8d525765a6c4 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -263,22 +263,67 @@ atomic_t irq_err_count; * /proc/interrupts printing: */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", nmi_count(j)); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + int show_interrupts(struct seq_file *p, void *v) { + unsigned long flags, any_count = 0; int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; - unsigned int entries; - struct irq_desc *desc = NULL; - int tail = 0; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; - entries = nr_irqs - 1; - i = *(loff_t *) v; if (i == nr_irqs) - tail = 1; - else - desc = irq_to_desc(i); + return show_other_interrupts(p); + /* print header */ if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) @@ -286,88 +331,37 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i <= entries) { - unsigned any_count = 0; - - spin_lock_irqsave(&desc->lock, flags); + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP - any_count = kstat_irqs(i); + any_count = kstat_irqs(i); #else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); #endif - action = desc->action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ", i); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); + seq_printf(p, "%10u ", kstat_irqs(i)); #else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&desc->lock, flags); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); } - if (tail) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", - per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - } + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); return 0; } diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 21f53b911113..4f374294f292 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -68,22 +68,65 @@ static inline void stack_overflow_check(struct pt_regs *regs) * Generic, controller-independent functions: */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +#endif + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + + return 0; +} + int show_interrupts(struct seq_file *p, void *v) { - int i, j; - struct irqaction * action; - unsigned long flags; - unsigned int entries; - struct irq_desc *desc = NULL; - int tail = 0; - - entries = nr_irqs - 1; - i = *(loff_t *) v; + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + if (i == nr_irqs) - tail = 1; - else - desc = irq_to_desc(i); + return show_other_interrupts(p); + /* print header */ if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) @@ -91,79 +134,37 @@ int show_interrupts(struct seq_file *p, void *v) seq_putc(p, '\n'); } - if (i <= entries) { - unsigned any_count = 0; - - spin_lock_irqsave(&desc->lock, flags); + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); #ifndef CONFIG_SMP - any_count = kstat_irqs(i); + any_count = kstat_irqs(i); #else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); #endif - action = desc->action; - if (!action && !any_count) - goto skip; - seq_printf(p, "%3d: ", i); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); #ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); + seq_printf(p, "%10u ", kstat_irqs(i)); #else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); #endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - seq_putc(p, '\n'); -skip: - spin_unlock_irqrestore(&desc->lock, flags); - } - - if (tail) { - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); } + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); return 0; } -- cgit v1.2.2 From 6b39ba771e3c78d00e0abcebad270bd4212b28bc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 11:32:24 +0200 Subject: x86: unify show_interrupts() and proc helpers show_interrupts() and proc helpers are basically the same for 32 and 64 bit. Move them to a shared source file. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 2 +- arch/x86/kernel/irq.c | 166 +++++++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 146 ----------------------------------------- arch/x86/kernel/irq_64.c | 132 ------------------------------------- 4 files changed, 167 insertions(+), 279 deletions(-) create mode 100644 arch/x86/kernel/irq.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cb6ade443f00..d7e5a58ee22f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,7 +23,7 @@ CFLAGS_hpet.o := $(nostackp) CFLAGS_tsc.o := $(nostackp) obj-y := process_$(BITS).o signal_$(BITS).o entry_$(BITS).o -obj-y += traps.o irq_$(BITS).o dumpstack_$(BITS).o +obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time_$(BITS).o ioport.o ldt.o obj-y += setup.o i8259.o irqinit_$(BITS).o setup_percpu.o obj-$(CONFIG_X86_VISWS) += visws_quirks.o diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c new file mode 100644 index 000000000000..3b9128498976 --- /dev/null +++ b/arch/x86/kernel/irq.c @@ -0,0 +1,166 @@ +/* + * Common interrupt code for 32 and 64 bit + */ +#include +#include +#include +#include + +#include +#include +#include + +atomic_t irq_err_count; + +#ifdef CONFIG_X86_32 +# define irq_stats(x) (&per_cpu(irq_stat,x)) +#else +# define irq_stats(x) cpu_pda(x) +#endif +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p) +{ + int j; + + seq_printf(p, "NMI: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->__nmi_count); + seq_printf(p, " Non-maskable interrupts\n"); +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "LOC: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); + seq_printf(p, " Local timer interrupts\n"); +#endif +#ifdef CONFIG_SMP + seq_printf(p, "RES: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_resched_count); + seq_printf(p, " Rescheduling interrupts\n"); + seq_printf(p, "CAL: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); + seq_printf(p, " Function call interrupts\n"); + seq_printf(p, "TLB: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); + seq_printf(p, " TLB shootdowns\n"); +#endif +#ifdef CONFIG_X86_MCE + seq_printf(p, "TRM: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); + seq_printf(p, " Thermal event interrupts\n"); +# ifdef CONFIG_X86_64 + seq_printf(p, "THR: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); + seq_printf(p, " Threshold APIC interrupts\n"); +# endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + seq_printf(p, "SPU: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); + seq_printf(p, " Spurious interrupts\n"); +#endif + seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); +#if defined(CONFIG_X86_IO_APIC) + seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); +#endif + return 0; +} + +int show_interrupts(struct seq_file *p, void *v) +{ + unsigned long flags, any_count = 0; + int i = *(loff_t *) v, j; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + if (i == nr_irqs) + return show_other_interrupts(p); + + /* print header */ + if (i == 0) { + seq_printf(p, " "); + for_each_online_cpu(j) + seq_printf(p, "CPU%-8d",j); + seq_putc(p, '\n'); + } + + desc = irq_to_desc(i); + spin_lock_irqsave(&desc->lock, flags); +#ifndef CONFIG_SMP + any_count = kstat_irqs(i); +#else + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); +#endif + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%3d: ", i); +#ifndef CONFIG_SMP + seq_printf(p, "%10u ", kstat_irqs(i)); +#else + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); +#endif + seq_printf(p, " %8s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { + seq_printf(p, " %s", action->name); + while ((action = action->next) != NULL) + seq_printf(p, ", %s", action->name); + } + + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); + return 0; +} + +/* + * /proc/stat helpers + */ +u64 arch_irq_stat_cpu(unsigned int cpu) +{ + u64 sum = irq_stats(cpu)->__nmi_count; + +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->apic_timer_irqs; +#endif +#ifdef CONFIG_SMP + sum += irq_stats(cpu)->irq_resched_count; + sum += irq_stats(cpu)->irq_call_count; + sum += irq_stats(cpu)->irq_tlb_count; +#endif +#ifdef CONFIG_X86_MCE + sum += irq_stats(cpu)->irq_thermal_count; +# ifdef CONFIG_X86_64 + sum += irq_stats(cpu)->irq_threshold_count; +#endif +#endif +#ifdef CONFIG_X86_LOCAL_APIC + sum += irq_stats(cpu)->irq_spurious_count; +#endif + return sum; +} + +u64 arch_irq_stat(void) +{ + u64 sum = atomic_read(&irq_err_count); + +#ifdef CONFIG_X86_IO_APIC + sum += atomic_read(&irq_mis_count); +#endif + return sum; +} diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 8d525765a6c4..6d9bf3936c78 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -253,152 +253,6 @@ unsigned int do_IRQ(struct pt_regs *regs) return 1; } -/* - * Interrupt statistics: - */ - -atomic_t irq_err_count; - -/* - * /proc/interrupts printing: - */ - -static int show_other_interrupts(struct seq_file *p) -{ - int j; - - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); - seq_printf(p, " Non-maskable interrupts\n"); -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#endif -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); -#endif -#ifdef CONFIG_X86_LOCAL_APIC - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(irq_stat,j).irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); -#endif - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); -#if defined(CONFIG_X86_IO_APIC) - seq_printf(p, "MIS: %10u\n", atomic_read(&irq_mis_count)); -#endif - return 0; -} - -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - if (i == nr_irqs) - return show_other_interrupts(p); - - /* print header */ - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); -#endif - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = nmi_count(cpu); - -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).apic_timer_irqs; -#endif -#ifdef CONFIG_SMP - sum += per_cpu(irq_stat, cpu).irq_resched_count; - sum += per_cpu(irq_stat, cpu).irq_call_count; - sum += per_cpu(irq_stat, cpu).irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += per_cpu(irq_stat, cpu).irq_thermal_count; -#endif -#ifdef CONFIG_X86_LOCAL_APIC - sum += per_cpu(irq_stat, cpu).irq_spurious_count; -#endif - return sum; -} - -u64 arch_irq_stat(void) -{ - u64 sum = atomic_read(&irq_err_count); - -#ifdef CONFIG_X86_IO_APIC - sum += atomic_read(&irq_mis_count); -#endif - return sum; -} - #ifdef CONFIG_HOTPLUG_CPU #include diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4f374294f292..39ef7feb9ea4 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,8 +18,6 @@ #include #include -atomic_t irq_err_count; - /* * 'what should we do if we get a hw irq event on an illegal vector'. * each architecture has to answer this themselves. @@ -64,136 +62,6 @@ static inline void stack_overflow_check(struct pt_regs *regs) } #endif -/* - * Generic, controller-independent functions: - */ - -static int show_other_interrupts(struct seq_file *p) -{ - int j; - - seq_printf(p, "NMI: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->__nmi_count); - seq_printf(p, " Non-maskable interrupts\n"); - seq_printf(p, "LOC: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->apic_timer_irqs); - seq_printf(p, " Local timer interrupts\n"); -#ifdef CONFIG_SMP - seq_printf(p, "RES: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_resched_count); - seq_printf(p, " Rescheduling interrupts\n"); - seq_printf(p, "CAL: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_call_count); - seq_printf(p, " Function call interrupts\n"); - seq_printf(p, "TLB: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_tlb_count); - seq_printf(p, " TLB shootdowns\n"); -#endif -#ifdef CONFIG_X86_MCE - seq_printf(p, "TRM: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_thermal_count); - seq_printf(p, " Thermal event interrupts\n"); - seq_printf(p, "THR: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_threshold_count); - seq_printf(p, " Threshold APIC interrupts\n"); -#endif - seq_printf(p, "SPU: "); - for_each_online_cpu(j) - seq_printf(p, "%10u ", cpu_pda(j)->irq_spurious_count); - seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "ERR: %10u\n", atomic_read(&irq_err_count)); - - return 0; -} - -int show_interrupts(struct seq_file *p, void *v) -{ - unsigned long flags, any_count = 0; - int i = *(loff_t *) v, j; - struct irqaction *action; - struct irq_desc *desc; - - if (i > nr_irqs) - return 0; - - if (i == nr_irqs) - return show_other_interrupts(p); - - /* print header */ - if (i == 0) { - seq_printf(p, " "); - for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); - seq_putc(p, '\n'); - } - - desc = irq_to_desc(i); - spin_lock_irqsave(&desc->lock, flags); -#ifndef CONFIG_SMP - any_count = kstat_irqs(i); -#else - for_each_online_cpu(j) - any_count |= kstat_irqs_cpu(i, j); -#endif - action = desc->action; - if (!action && !any_count) - goto out; - - seq_printf(p, "%3d: ", i); -#ifndef CONFIG_SMP - seq_printf(p, "%10u ", kstat_irqs(i)); -#else - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); -#endif - seq_printf(p, " %8s", desc->chip->name); - seq_printf(p, "-%-8s", desc->name); - - if (action) { - seq_printf(p, " %s", action->name); - while ((action = action->next) != NULL) - seq_printf(p, ", %s", action->name); - } - - seq_putc(p, '\n'); -out: - spin_unlock_irqrestore(&desc->lock, flags); - return 0; -} - -/* - * /proc/stat helpers - */ -u64 arch_irq_stat_cpu(unsigned int cpu) -{ - u64 sum = cpu_pda(cpu)->__nmi_count; - - sum += cpu_pda(cpu)->apic_timer_irqs; -#ifdef CONFIG_SMP - sum += cpu_pda(cpu)->irq_resched_count; - sum += cpu_pda(cpu)->irq_call_count; - sum += cpu_pda(cpu)->irq_tlb_count; -#endif -#ifdef CONFIG_X86_MCE - sum += cpu_pda(cpu)->irq_thermal_count; - sum += cpu_pda(cpu)->irq_threshold_count; -#endif - sum += cpu_pda(cpu)->irq_spurious_count; - return sum; -} - -u64 arch_irq_stat(void) -{ - return atomic_read(&irq_err_count); -} - /* * do_IRQ handles all normal device IRQ's (the special * SMP cross-CPU interrupts have their own specific -- cgit v1.2.2 From 249f6d9eab372790579ada8991bba3384c204e06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 16 Oct 2008 12:18:50 +0200 Subject: x86: move ack_bad_irq() to irq.c Share more duplicated code. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 23 +++++++++++++++++++++++ arch/x86/kernel/irq_32.c | 23 ----------------------- arch/x86/kernel/irq_64.c | 20 -------------------- 3 files changed, 23 insertions(+), 43 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3b9128498976..ccf6c503fc3b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,29 @@ atomic_t irq_err_count; +/* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. + */ +void ack_bad_irq(unsigned int irq) +{ + printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); + +#ifdef CONFIG_X86_LOCAL_APIC + /* + * Currently unexpected vectors happen only on SMP and APIC. + * We _must_ ack these because every local APIC has only N + * irq slots per priority level, and a 'hanging, unacked' IRQ + * holds up an irq slot - in excessive cases (when multiple + * unexpected vectors occur) that might lock up the APIC + * completely. + * But only ack when the APIC is enabled -AK + */ + if (cpu_has_apic) + ack_APIC_irq(); +#endif +} + #ifdef CONFIG_X86_32 # define irq_stats(x) (&per_cpu(irq_stat,x)) #else diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 6d9bf3936c78..a51382672de0 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -25,29 +25,6 @@ EXPORT_PER_CPU_SYMBOL(irq_stat); DEFINE_PER_CPU(struct pt_regs *, irq_regs); EXPORT_PER_CPU_SYMBOL(irq_regs); -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq); - -#ifdef CONFIG_X86_LOCAL_APIC - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But only ack when the APIC is enabled -AK - */ - if (cpu_has_apic) - ack_APIC_irq(); -#endif -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ static int check_stack_overflow(void) diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 39ef7feb9ea4..60eb84eb77a0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -18,26 +18,6 @@ #include #include -/* - * 'what should we do if we get a hw irq event on an illegal vector'. - * each architecture has to answer this themselves. - */ -void ack_bad_irq(unsigned int irq) -{ - printk(KERN_WARNING "unexpected IRQ trap at vector %02x\n", irq); - /* - * Currently unexpected vectors happen only on SMP and APIC. - * We _must_ ack these because every local APIC has only N - * irq slots per priority level, and a 'hanging, unacked' IRQ - * holds up an irq slot - in excessive cases (when multiple - * unexpected vectors occur) that might lock up the APIC - * completely. - * But don't ack when the APIC is disabled. -AK - */ - if (!disable_apic) - ack_APIC_irq(); -} - #ifdef CONFIG_DEBUG_STACKOVERFLOW /* * Probabilistic stack overflow check: -- cgit v1.2.2 From 10e0298686be6249b0665465737a6b83446c4d35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 16 Oct 2008 17:04:22 +0200 Subject: io_apic: make irq_mis_count available on 64-bit too Signed-off-by: Ingo Molnar --- arch/x86/kernel/io_apic.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c index 6f80dc2f137e..b764d7429c61 100644 --- a/arch/x86/kernel/io_apic.c +++ b/arch/x86/kernel/io_apic.c @@ -2277,9 +2277,7 @@ static void ack_apic_edge(unsigned int irq) ack_APIC_irq(); } -#ifdef CONFIG_X86_32 atomic_t irq_mis_count; -#endif static void ack_apic_level(unsigned int irq) { -- cgit v1.2.2 From 0f019cc477b494dfc472f2a98eb64d02d4937741 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 16 Oct 2008 15:01:40 +0200 Subject: oprofile: fixing whitespaces in arch/x86/oprofile/* Signed-off-by: Robert Richter --- arch/x86/oprofile/backtrace.c | 3 +-- arch/x86/oprofile/op_counter.h | 18 +++++++++--------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 36e324139f77..04df67f8a7ba 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -52,8 +52,7 @@ struct frame_head { unsigned long ret; } __attribute__((packed)); -static struct frame_head * -dump_user_backtrace(struct frame_head *head) +static struct frame_head *dump_user_backtrace(struct frame_head *head) { struct frame_head bufhead[2]; diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h index 2880b15c4675..91b6a116165e 100644 --- a/arch/x86/oprofile/op_counter.h +++ b/arch/x86/oprofile/op_counter.h @@ -6,22 +6,22 @@ * * @author John Levon */ - + #ifndef OP_COUNTER_H #define OP_COUNTER_H - + #define OP_MAX_COUNTER 8 - + /* Per-perfctr configuration as set via * oprofilefs. */ struct op_counter_config { - unsigned long count; - unsigned long enabled; - unsigned long event; - unsigned long kernel; - unsigned long user; - unsigned long unit_mask; + unsigned long count; + unsigned long enabled; + unsigned long event; + unsigned long kernel; + unsigned long user; + unsigned long unit_mask; }; extern struct op_counter_config counter_config[]; -- cgit v1.2.2 From ae87221d3ce49d9de1e43756da834fd0bf05a2ad Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 24 Aug 2007 16:11:54 -0700 Subject: sysfs: crash debugging Print the name of the last-accessed sysfs file when we oops, to help track down oopses which occur in sysfs store/read handlers. Because these oopses tend to not leave any trace of the offending code in the stack traces. Cc: Kay Sievers Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/dumpstack_32.c | 2 ++ arch/x86/kernel/dumpstack_64.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 201ee359a1a9..1a78180f08d3 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -343,6 +344,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); + sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 086cc8118e39..96a5db7da8a7 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -489,6 +490,7 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err) printk("DEBUG_PAGEALLOC"); #endif printk("\n"); + sysfs_printk_last_file(); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) return 1; -- cgit v1.2.2 From a9b12619f7b6f19c871437ec24a088787a04b1de Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 21 Jul 2008 20:03:34 -0700 Subject: device create: misc: convert device_create_drvdata to device_create Now that device_create() has been audited, rename things back to the original call to be sane. Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpuid.c | 4 ++-- arch/x86/kernel/msr.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 6a44d6465991..72cefd1e649b 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -147,8 +147,8 @@ static __cpuinit int cpuid_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), - NULL, "cpu%d", cpu); + dev = device_create(cpuid_class, NULL, MKDEV(CPUID_MAJOR, cpu), NULL, + "cpu%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 2e2af5d18191..82a7c7ed6d45 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -163,8 +163,8 @@ static int __cpuinit msr_device_create(int cpu) { struct device *dev; - dev = device_create_drvdata(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), - NULL, "msr%d", cpu); + dev = device_create(msr_class, NULL, MKDEV(MSR_MAJOR, cpu), NULL, + "msr%d", cpu); return IS_ERR(dev) ? PTR_ERR(dev) : 0; } -- cgit v1.2.2 From 80a914dc05683ecfc98f9e1887fd6564846ffbec Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Wed, 15 Oct 2008 22:01:25 -0700 Subject: misc: replace __FUNCTION__ with __func__ __FUNCTION__ is gcc-specific, use __func__ Signed-off-by: Harvey Harrison Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 0d682fc6aeb3..19afbb644c7f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -564,7 +564,7 @@ static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info * hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", - __FUNCTION__, tsc_khz, hv_clock->tsc_shift, + __func__, tsc_khz, hv_clock->tsc_shift, hv_clock->tsc_to_system_mul); } -- cgit v1.2.2 From 9ba16087d9f996a93ab6f4453a52a4b24bc1f25c Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Wed, 15 Oct 2008 22:01:38 -0700 Subject: Kconfig: eliminate "def_bool n" constructs Using "def_bool n" is pointless, simply using bool here appears more appropriate. Further, retaining such options that don't have a prompt and aren't selected by anything seems also at least questionable. Signed-off-by: Jan Beulich Cc: Ingo Molnar Cc: Tony Luck Cc: Thomas Gleixner Cc: Bartlomiej Zolnierkiewicz Cc: Sam Ravnborg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f65c2744d573..7ccb6e60e60c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -39,10 +39,6 @@ config ARCH_DEFCONFIG default "arch/x86/configs/i386_defconfig" if X86_32 default "arch/x86/configs/x86_64_defconfig" if X86_64 - -config GENERIC_LOCKBREAK - def_bool n - config GENERIC_TIME def_bool y @@ -95,7 +91,7 @@ config GENERIC_HWEIGHT def_bool y config GENERIC_GPIO - def_bool n + bool config ARCH_MAY_HAVE_PC_FDC def_bool y @@ -106,12 +102,6 @@ config RWSEM_GENERIC_SPINLOCK config RWSEM_XCHGADD_ALGORITHM def_bool X86_XADD -config ARCH_HAS_ILOG2_U32 - def_bool n - -config ARCH_HAS_ILOG2_U64 - def_bool n - config ARCH_HAS_CPU_IDLE_WAIT def_bool y @@ -758,9 +748,8 @@ config I8K Say N otherwise. config X86_REBOOTFIXUPS - def_bool n - prompt "Enable X86 board specific fixups for reboot" - depends on X86_32 && X86 + bool "Enable X86 board specific fixups for reboot" + depends on X86_32 ---help--- This enables chipset and/or board specific fixups to be done in order to get reboot to work correctly. This is only needed on @@ -944,8 +933,7 @@ config HIGHMEM depends on X86_32 && (HIGHMEM64G || HIGHMEM4G) config X86_PAE - def_bool n - prompt "PAE (Physical Address Extension) Support" + bool "PAE (Physical Address Extension) Support" depends on X86_32 && !HIGHMEM4G select RESOURCES_64BIT help @@ -1238,8 +1226,7 @@ config X86_PAT If unsure, say Y. config EFI - def_bool n - prompt "EFI runtime service support" + bool "EFI runtime service support" depends on ACPI ---help--- This enables the kernel to use EFI runtime services that are -- cgit v1.2.2 From 25ddbb18aae33ad255eb9f35aacebe3af01e1e9c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 15 Oct 2008 22:01:41 -0700 Subject: Make the taint flags reliable It's somewhat unlikely that it happens, but right now a race window between interrupts or machine checks or oopses could corrupt the tainted bitmap because it is modified in a non atomic fashion. Convert the taint variable to an unsigned long and use only atomic bit operations on it. Unfortunately this means the intvec sysctl functions cannot be used on it anymore. It turned out the taint sysctl handler could actually be simplified a bit (since it only increases capabilities) so this patch actually removes code. [akpm@linux-foundation.org: remove unneeded include] Signed-off-by: Andi Kleen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/smpboot.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8c3aca7cb343..7ed9e070a6e9 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -282,6 +282,8 @@ static void __cpuinit smp_callin(void) cpu_set(cpuid, cpu_callin_map); } +static int __cpuinitdata unsafe_smp; + /* * Activate a secondary processor. */ @@ -397,7 +399,7 @@ static void __cpuinit smp_apply_quirks(struct cpuinfo_x86 *c) goto valid_k7; /* If we get here, not a certified SMP capable AMD system. */ - add_taint(TAINT_UNSAFE_SMP); + unsafe_smp = 1; } valid_k7: @@ -414,12 +416,10 @@ static void __cpuinit smp_checks(void) * Don't taint if we are running SMP kernel on a single non-MP * approved Athlon */ - if (tainted & TAINT_UNSAFE_SMP) { - if (num_online_cpus()) - printk(KERN_INFO "WARNING: This combination of AMD" - "processors is not suitable for SMP.\n"); - else - tainted &= ~TAINT_UNSAFE_SMP; + if (unsafe_smp && num_online_cpus() > 1) { + printk(KERN_INFO "WARNING: This combination of AMD" + "processors is not suitable for SMP.\n"); + add_taint(TAINT_UNSAFE_SMP); } } -- cgit v1.2.2 From f7a5000f7a8924e9c5fad1801616601d6dc65a17 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:05 -0700 Subject: compat: move cp_compat_stat to common code struct stat / compat_stat is the same on all architectures, so cp_compat_stat should be, too. Turns out it is, except that various architectures have slightly and some high2lowuid/high2lowgid or the direct assignment instead of the SET_UID/SET_GID that expands to the correct one anyway. This patch replaces the arch-specific cp_compat_stat implementations with a common one based on the x86-64 one. Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Acked-by: Kyle McMartin [ parisc bits ] Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/sys_ia32.c | 35 ----------------------------------- 1 file changed, 35 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index beda4232ce69..4d3ad8d78a4d 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -49,41 +49,6 @@ #define AA(__x) ((unsigned long)(__x)) -int cp_compat_stat(struct kstat *kbuf, struct compat_stat __user *ubuf) -{ - compat_ino_t ino; - - typeof(ubuf->st_uid) uid = 0; - typeof(ubuf->st_gid) gid = 0; - SET_UID(uid, kbuf->uid); - SET_GID(gid, kbuf->gid); - if (!old_valid_dev(kbuf->dev) || !old_valid_dev(kbuf->rdev)) - return -EOVERFLOW; - if (kbuf->size >= 0x7fffffff) - return -EOVERFLOW; - ino = kbuf->ino; - if (sizeof(ino) < sizeof(kbuf->ino) && ino != kbuf->ino) - return -EOVERFLOW; - if (!access_ok(VERIFY_WRITE, ubuf, sizeof(struct compat_stat)) || - __put_user(old_encode_dev(kbuf->dev), &ubuf->st_dev) || - __put_user(ino, &ubuf->st_ino) || - __put_user(kbuf->mode, &ubuf->st_mode) || - __put_user(kbuf->nlink, &ubuf->st_nlink) || - __put_user(uid, &ubuf->st_uid) || - __put_user(gid, &ubuf->st_gid) || - __put_user(old_encode_dev(kbuf->rdev), &ubuf->st_rdev) || - __put_user(kbuf->size, &ubuf->st_size) || - __put_user(kbuf->atime.tv_sec, &ubuf->st_atime) || - __put_user(kbuf->atime.tv_nsec, &ubuf->st_atime_nsec) || - __put_user(kbuf->mtime.tv_sec, &ubuf->st_mtime) || - __put_user(kbuf->mtime.tv_nsec, &ubuf->st_mtime_nsec) || - __put_user(kbuf->ctime.tv_sec, &ubuf->st_ctime) || - __put_user(kbuf->ctime.tv_nsec, &ubuf->st_ctime_nsec) || - __put_user(kbuf->blksize, &ubuf->st_blksize) || - __put_user(kbuf->blocks, &ubuf->st_blocks)) - return -EFAULT; - return 0; -} asmlinkage long sys32_truncate64(char __user *filename, unsigned long offset_low, -- cgit v1.2.2 From b418da16dd44810e5d5a22bba377cca80512a524 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 15 Oct 2008 22:02:06 -0700 Subject: compat: generic compat get/settimeofday Nothing arch specific in get/settimeofday. The details of the timeval conversion varied a little from arch to arch, but all with the same results. Also add an extern declaration for sys_tz to linux/time.h because externs in .c files are fowned upon. I'll kill the externs in various other files in a sparate patch. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Christoph Hellwig Acked-by: David S. Miller [ sparc bits ] Cc: "Luck, Tony" Cc: Ralf Baechle Acked-by: Kyle McMartin Cc: Matthew Wilcox Cc: Grant Grundler Cc: Paul Mackerras Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/ia32/ia32entry.S | 4 +-- arch/x86/ia32/sys_ia32.c | 64 ----------------------------------------------- 2 files changed, 2 insertions(+), 66 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index eb4314768bf7..256b00b61892 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -571,8 +571,8 @@ ia32_sys_call_table: .quad compat_sys_setrlimit /* 75 */ .quad compat_sys_old_getrlimit /* old_getrlimit */ .quad compat_sys_getrusage - .quad sys32_gettimeofday - .quad sys32_settimeofday + .quad compat_sys_gettimeofday + .quad compat_sys_settimeofday .quad sys_getgroups16 /* 80 */ .quad sys_setgroups16 .quad sys32_old_select diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 4d3ad8d78a4d..2e09dcd3c0a6 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -367,75 +367,11 @@ asmlinkage long sys32_rt_sigprocmask(int how, compat_sigset_t __user *set, return 0; } -static inline long get_tv32(struct timeval *o, struct compat_timeval __user *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_READ, i, sizeof(*i))) { - err = __get_user(o->tv_sec, &i->tv_sec); - err |= __get_user(o->tv_usec, &i->tv_usec); - } - return err; -} - -static inline long put_tv32(struct compat_timeval __user *o, struct timeval *i) -{ - int err = -EFAULT; - - if (access_ok(VERIFY_WRITE, o, sizeof(*o))) { - err = __put_user(i->tv_sec, &o->tv_sec); - err |= __put_user(i->tv_usec, &o->tv_usec); - } - return err; -} - asmlinkage long sys32_alarm(unsigned int seconds) { return alarm_setitimer(seconds); } -/* - * Translations due to time_t size differences. Which affects all - * sorts of things, like timeval and itimerval. - */ -asmlinkage long sys32_gettimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - if (tv) { - struct timeval ktv; - - do_gettimeofday(&ktv); - if (put_tv32(tv, &ktv)) - return -EFAULT; - } - if (tz) { - if (copy_to_user(tz, &sys_tz, sizeof(sys_tz))) - return -EFAULT; - } - return 0; -} - -asmlinkage long sys32_settimeofday(struct compat_timeval __user *tv, - struct timezone __user *tz) -{ - struct timeval ktv; - struct timespec kts; - struct timezone ktz; - - if (tv) { - if (get_tv32(&ktv, tv)) - return -EFAULT; - kts.tv_sec = ktv.tv_sec; - kts.tv_nsec = ktv.tv_usec * NSEC_PER_USEC; - } - if (tz) { - if (copy_from_user(&ktz, tz, sizeof(ktz))) - return -EFAULT; - } - - return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL); -} - struct sel_arg_struct { unsigned int n; unsigned int inp; -- cgit v1.2.2 From bdab0ba3d9ad8de257ee6236daf314723748fde6 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:07 -0700 Subject: x86: rename iommu_num_pages function to iommu_nr_pages This series of patches re-introduces the iommu_num_pages function so that it can be used by each architecture specific IOMMU implementations. The series also changes IOMMU implementations for X86, Alpha, PowerPC and UltraSparc. The other implementations are not yet changed because the modifications required are not obvious and I can't test them on real hardware. This patch: This is a preparation patch for introducing a generic iommu_num_pages function. Signed-off-by: Joerg Roedel Cc: "David S. Miller" Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 8 ++++---- arch/x86/kernel/pci-dma.c | 4 ++-- arch/x86/kernel/pci-gart_64.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 34e4d112b1ef..10646acba9be 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = iommu_num_pages(address, size); + unsigned pages = iommu_nr_pages(address, size); address &= PAGE_MASK; @@ -679,7 +679,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_num_pages(iommu->exclusion_start, + int pages = iommu_nr_pages(iommu->exclusion_start, iommu->exclusion_length); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -935,7 +935,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned long align_mask = 0; int i; - pages = iommu_num_pages(paddr, size); + pages = iommu_nr_pages(paddr, size); paddr &= PAGE_MASK; if (align) @@ -980,7 +980,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = iommu_num_pages(dma_addr, size); + pages = iommu_nr_pages(dma_addr, size); dma_addr &= PAGE_MASK; start = dma_addr; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 0a3824e837b4..192624820217 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -125,13 +125,13 @@ void __init pci_iommu_alloc(void) pci_swiotlb_init(); } -unsigned long iommu_num_pages(unsigned long addr, unsigned long len) +unsigned long iommu_nr_pages(unsigned long addr, unsigned long len) { unsigned long size = roundup((addr & ~PAGE_MASK) + len, PAGE_SIZE); return size >> PAGE_SHIFT; } -EXPORT_SYMBOL(iommu_num_pages); +EXPORT_SYMBOL(iommu_nr_pages); #endif void *dma_generic_alloc_coherent(struct device *dev, size_t size, diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 145f1c83369f..14f1b41348fd 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { - unsigned long npages = iommu_num_pages(phys_mem, size); + unsigned long npages = iommu_nr_pages(phys_mem, size); unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; @@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_num_pages(dma_addr, size); + npages = iommu_nr_pages(dma_addr, size); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = iommu_num_pages(s->offset, s->length); + pages = iommu_nr_pages(s->offset, s->length); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += iommu_num_pages(s->offset, s->length); + pages += iommu_nr_pages(s->offset, s->length); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) -- cgit v1.2.2 From 1477b8e5f13266bbf0389baafd39a90ff29c5f61 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:11 -0700 Subject: x86: convert GART driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-gart_64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index 14f1b41348fd..e3f75bbcedea 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -231,7 +231,7 @@ nonforced_iommu(struct device *dev, unsigned long addr, size_t size) static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, size_t size, int dir, unsigned long align_mask) { - unsigned long npages = iommu_nr_pages(phys_mem, size); + unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); int i; @@ -285,7 +285,7 @@ static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, return; iommu_page = (dma_addr - iommu_bus_base)>>PAGE_SHIFT; - npages = iommu_nr_pages(dma_addr, size); + npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; CLEAR_LEAK(iommu_page + i); @@ -368,7 +368,7 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, } addr = phys_addr; - pages = iommu_nr_pages(s->offset, s->length); + pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); SET_LEAK(iommu_page); @@ -451,7 +451,7 @@ gart_map_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) seg_size += s->length; need = nextneed; - pages += iommu_nr_pages(s->offset, s->length); + pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE); ps = s; } if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0) -- cgit v1.2.2 From e3c449f526cebb8d287241c7e82faafd9709668b Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:11 -0700 Subject: x86, AMD IOMMU: convert driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Cc: Muli Ben-Yehuda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/amd_iommu.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 10646acba9be..a8fd9ebdc8e2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -295,7 +295,7 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, u64 address, size_t size) { int s = 0; - unsigned pages = iommu_nr_pages(address, size); + unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); address &= PAGE_MASK; @@ -679,8 +679,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu, if (iommu->exclusion_start && iommu->exclusion_start < dma_dom->aperture_size) { unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; - int pages = iommu_nr_pages(iommu->exclusion_start, - iommu->exclusion_length); + int pages = iommu_num_pages(iommu->exclusion_start, + iommu->exclusion_length, + PAGE_SIZE); dma_ops_reserve_addresses(dma_dom, startpage, pages); } @@ -935,7 +936,7 @@ static dma_addr_t __map_single(struct device *dev, unsigned long align_mask = 0; int i; - pages = iommu_nr_pages(paddr, size); + pages = iommu_num_pages(paddr, size, PAGE_SIZE); paddr &= PAGE_MASK; if (align) @@ -980,7 +981,7 @@ static void __unmap_single(struct amd_iommu *iommu, if ((dma_addr == 0) || (dma_addr + size > dma_dom->aperture_size)) return; - pages = iommu_nr_pages(dma_addr, size); + pages = iommu_num_pages(dma_addr, size, PAGE_SIZE); dma_addr &= PAGE_MASK; start = dma_addr; -- cgit v1.2.2 From 036b4c50fe99a2f308f36561335b9904ab507972 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 15 Oct 2008 22:02:12 -0700 Subject: x86: convert Calgary IOMMU driver to generic iommu_num_pages function Signed-off-by: Joerg Roedel Cc: Ingo Molnar Cc: Thomas Gleixner Cc: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/pci-calgary_64.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 080d1d27f37a..e1e731d78f38 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -217,16 +217,6 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, #endif /* CONFIG_IOMMU_DEBUG */ -static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) -{ - unsigned int npages; - - npages = PAGE_ALIGN(dma + dmalen) - (dma & PAGE_MASK); - npages >>= PAGE_SHIFT; - - return npages; -} - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -408,7 +398,7 @@ static void calgary_unmap_sg(struct device *dev, if (dmalen == 0) break; - npages = num_dma_pages(dma, dmalen); + npages = iommu_num_pages(dma, dmalen, PAGE_SIZE); iommu_free(tbl, dma, npages); } } @@ -427,7 +417,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg, BUG_ON(!sg_page(s)); vaddr = (unsigned long) sg_virt(s); - npages = num_dma_pages(vaddr, s->length); + npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); entry = iommu_range_alloc(dev, tbl, npages); if (entry == bad_dma_address) { @@ -464,7 +454,7 @@ static dma_addr_t calgary_map_single(struct device *dev, phys_addr_t paddr, struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; - npages = num_dma_pages(uaddr, size); + npages = iommu_num_pages(uaddr, size, PAGE_SIZE); return iommu_alloc(dev, tbl, vaddr, npages, direction); } @@ -475,7 +465,7 @@ static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; - npages = num_dma_pages(dma_handle, size); + npages = iommu_num_pages(dma_handle, size, PAGE_SIZE); iommu_free(tbl, dma_handle, npages); } -- cgit v1.2.2 From 4d31a2b74c6d063362ae10ce3be3e80d8713bf23 Mon Sep 17 00:00:00 2001 From: Michal Januszewski Date: Wed, 15 Oct 2008 22:03:51 -0700 Subject: fbdev: ignore VESA modes if framebuffer does not support them Currently, it is possible to set a graphics VESA mode at boot time via the vga= parameter even when no framebuffer driver supporting this is configured. This could lead to the system booting with a black screen, without a usable console. Fix this problem by only allowing to set graphics modes at boot time if a supporting framebuffer driver is configured. Signed-off-by: Michal Januszewski Acked-by: Krzysztof Helt Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/boot/video-vesa.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 1e6fe0214c85..99b3079dc6ab 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -88,14 +88,11 @@ static int vesa_probe(void) (vminfo.memory_layout == 4 || vminfo.memory_layout == 6) && vminfo.memory_planes == 1) { -#ifdef CONFIG_FB +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT /* Graphics mode, color, linear frame buffer supported. Only register the mode if if framebuffer is configured, however, - otherwise the user will be left without a screen. - We don't require CONFIG_FB_VESA, however, since - some of the other framebuffer drivers can use - this mode-setting, too. */ + otherwise the user will be left without a screen. */ mi = GET_HEAP(struct mode_info, 1); mi->mode = mode + VIDEO_FIRST_VESA; mi->depth = vminfo.bpp; @@ -133,10 +130,12 @@ static int vesa_set_mode(struct mode_info *mode) if ((vminfo.mode_attr & 0x15) == 0x05) { /* It's a supported text mode */ is_graphic = 0; +#ifdef CONFIG_FB_BOOT_VESA_SUPPORT } else if ((vminfo.mode_attr & 0x99) == 0x99) { /* It's a graphics mode with linear frame buffer */ is_graphic = 1; vesa_mode |= 0x4000; /* Request linear frame buffer */ +#endif } else { return -1; /* Invalid mode */ } -- cgit v1.2.2 From 26adcfbf00e0726b4469070aa2f530dcf963f484 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Tue, 14 Oct 2008 21:01:15 +0200 Subject: x86: SB600: skip ACPI IRQ0 override if it is not routed to INT2 of IOAPIC On some more HP laptops BIOS reports an IRQ0 override but the SB600 chipset is configured such that timer interrupts go to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. http://bugzilla.kernel.org/show_bug.cgi?id=11715 http://bugzilla.kernel.org/show_bug.cgi?id=11516 Signed-off-by: Andreas Herrmann Signed-off-by: Len Brown --- arch/x86/kernel/early-quirks.c | 55 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 733c4f8d42ea..3ce029ffaa55 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,7 +95,8 @@ static void __init nvidia_bugs(int num, int slot, int func) } -static u32 ati_ixp4x0_rev(int num, int slot, int func) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; u8 b; @@ -115,7 +116,6 @@ static u32 ati_ixp4x0_rev(int num, int slot, int func) static void __init ati_bugs(int num, int slot, int func) { -#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) u32 d; u8 b; @@ -138,9 +138,56 @@ static void __init ati_bugs(int num, int slot, int func) printk(KERN_INFO "If you got timer trouble " "try acpi_use_timer_override\n"); } -#endif } +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 old, d; + + d = read_pci_config(num, slot, func, 0x70); + old = d; + d &= ~(1<<8); + write_pci_config(num, slot, func, 0x70, d); + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + write_pci_config(num, slot, func, 0x70, old); + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + if (acpi_use_timer_override) + return; + + rev = ati_sbx00_rev(num, slot, func); + if (rev > 0x13) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + #ifdef CONFIG_DMAR static void __init intel_g33_dmar(int num, int slot, int func) { @@ -176,6 +223,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, #ifdef CONFIG_DMAR { PCI_VENDOR_ID_INTEL, 0x29c0, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, -- cgit v1.2.2 From 89cedfefca1d446ee2598fd3bcbb23ee3802e26a Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 16 Oct 2008 19:00:08 -0400 Subject: cpuidle: upon BIOS bug, default to default_idle rather than polling http://bugzilla.kernel.org/show_bug.cgi?id=11345 Signed-off-by: Venkatesh Pallipadi Signed-off-by: Len Brown --- arch/x86/Kconfig | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..f8caf040650e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -123,6 +123,9 @@ config GENERIC_TIME_VSYSCALL config ARCH_HAS_CPU_RELAX def_bool y +config ARCH_HAS_DEFAULT_IDLE + def_bool y + config ARCH_HAS_CACHE_LINE_SIZE def_bool y -- cgit v1.2.2 From 3038edabf48f01421c621cb77a712b446d3a5d67 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 17 Oct 2008 01:26:27 +0200 Subject: x86 ACPI: fix breakage of resume on 64-bit UP systems with SMP kernel x86 ACPI: Fix breakage of resume on 64-bit UP systems with SMP kernel We are now using per CPU GDT tables in head_64.S and the original early_gdt_descr.address is invalidated after boot by setup_per_cpu_areas(). This breaks resume from suspend to RAM on x86_64 UP systems using SMP kernels, because this part of head_64.S is also executed during the resume and the invalid GDT address causes the system to crash. It doesn't break on 'true' SMP systems, because early_gdt_descr.address is modified every time native_cpu_up() runs. However, during resume it should point to the GDT of the boot CPU rather than to another CPU's GDT. For this reason, during suspend to RAM always make early_gdt_descr.address point to the boot CPU's GDT. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=11568, which is a regression from 2.6.26. Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Signed-off-by: Ingo Molnar Reported-and-tested-by: Andy Wettstein --- arch/x86/kernel/acpi/sleep.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 426e5d91b63a..c44cd6dbfa14 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -10,6 +10,7 @@ #include #include #include +#include #include "realmode/wakeup.h" #include "sleep.h" @@ -98,6 +99,8 @@ int acpi_save_state_mem(void) header->trampoline_segment = setup_trampoline() >> 4; #ifdef CONFIG_SMP stack_start.sp = temp_stack + 4096; + early_gdt_descr.address = + (unsigned long)get_cpu_gdt_table(smp_processor_id()); #endif initial_code = (unsigned long)wakeup_long64; saved_magic = 0x123456789abcdef0; -- cgit v1.2.2 From d1d8c925b71dd6753bf438f9e14a9e5c5183bcc6 Mon Sep 17 00:00:00 2001 From: Eric Anholt Date: Thu, 21 Aug 2008 12:53:33 -0700 Subject: Export kmap_atomic_pfn for DRM-GEM. The driver would like to map IO space directly for copying data in when appropriate, to avoid CPU cache flushing for streaming writes. kmap_atomic_pfn lets us avoid IPIs associated with ioremap for this process. Signed-off-by: Eric Anholt Signed-off-by: Dave Airlie --- arch/x86/mm/highmem_32.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 165c871ba9af..bcc079c282dd 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -137,6 +137,7 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type) return (void*) vaddr; } +EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */ struct page *kmap_atomic_to_page(void *ptr) { -- cgit v1.2.2 From 5b6985ce8ec7127b4d60ad450b64ca8b82748a3b Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Thu, 16 Oct 2008 18:02:32 -0700 Subject: intel-iommu: IA64 support The current Intel IOMMU code assumes that both host page size and Intel IOMMU page size are 4KiB. The first patch supports variable page size. This provides support for IA64 which has multiple page sizes. This patch also adds some other code hooks for IA64 platform including DMAR_OPERATION_TIMEOUT definition. [dwmw2: some cleanup] Signed-off-by: Fenghua Yu Signed-off-by: Tony Luck Signed-off-by: David Woodhouse --- arch/x86/kernel/pci-dma.c | 16 ---------------- 1 file changed, 16 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 192624820217..1972266e8ba5 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -9,8 +9,6 @@ #include #include -static int forbid_dac __read_mostly; - struct dma_mapping_ops *dma_ops; EXPORT_SYMBOL(dma_ops); @@ -293,17 +291,3 @@ void pci_iommu_shutdown(void) } /* Must execute after PCI subsystem */ fs_initcall(pci_iommu_init); - -#ifdef CONFIG_PCI -/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ - -static __devinit void via_no_dac(struct pci_dev *dev) -{ - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { - printk(KERN_INFO "PCI: VIA PCI bridge detected." - "Disabling DAC.\n"); - forbid_dac = 1; - } -} -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); -#endif -- cgit v1.2.2 From f609891f428e1c20e270e7c350daf8c93cc459d7 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 16 Oct 2008 16:27:36 +0200 Subject: amd_iommu: fix nasty bug that caused ILLEGAL_DEVICE_TABLE_ENTRY errors We are on 64-bit so better use u64 instead of u32 to deal with addresses: static void __init iommu_set_device_table(struct amd_iommu *iommu) { u64 entry; ... entry = virt_to_phys(amd_iommu_dev_table); ... (I am wondering why gcc 4.2.x did not warn about the assignment between u32 and unsigned long.) Cc: iommu@lists.linux-foundation.org Cc: stable@kernel.org Signed-off-by: Andreas Herrmann Signed-off-by: Joerg Roedel Signed-off-by: David Woodhouse --- arch/x86/kernel/amd_iommu_init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 4cd8083c58be..0cdcda35a05f 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -212,7 +212,7 @@ static void __init iommu_set_exclusion_range(struct amd_iommu *iommu) /* Programs the physical address of the device table into the IOMMU hardware */ static void __init iommu_set_device_table(struct amd_iommu *iommu) { - u32 entry; + u64 entry; BUG_ON(iommu->mmio_base == NULL); -- cgit v1.2.2 From db64fe02258f1507e13fe5212a989922323685ce Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Sat, 18 Oct 2008 20:27:03 -0700 Subject: mm: rewrite vmap layer Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and provide a fast, scalable percpu frontend for small vmaps (requires a slightly different API, though). The biggest problem with vmap is actually vunmap. Presently this requires a global kernel TLB flush, which on most architectures is a broadcast IPI to all CPUs to flush the cache. This is all done under a global lock. As the number of CPUs increases, so will the number of vunmaps a scaled workload will want to perform, and so will the cost of a global TLB flush. This gives terrible quadratic scalability characteristics. Another problem is that the entire vmap subsystem works under a single lock. It is a rwlock, but it is actually taken for write in all the fast paths, and the read locking would likely never be run concurrently anyway, so it's just pointless. This is a rewrite of vmap subsystem to solve those problems. The existing vmalloc API is implemented on top of the rewritten subsystem. The TLB flushing problem is solved by using lazy TLB unmapping. vmap addresses do not have to be flushed immediately when they are vunmapped, because the kernel will not reuse them again (would be a use-after-free) until they are reallocated. So the addresses aren't allocated again until a subsequent TLB flush. A single TLB flush then can flush multiple vunmaps from each CPU. XEN and PAT and such do not like deferred TLB flushing because they can't always handle multiple aliasing virtual addresses to a physical address. They now call vm_unmap_aliases() in order to flush any deferred mappings. That call is very expensive (well, actually not a lot more expensive than a single vunmap under the old scheme), however it should be OK if not called too often. The virtual memory extent information is stored in an rbtree rather than a linked list to improve the algorithmic scalability. There is a per-CPU allocator for small vmaps, which amortizes or avoids global locking. To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces must be used in place of vmap and vunmap. Vmalloc does not use these interfaces at the moment, so it will not be quite so scalable (although it will use lazy TLB flushing). As a quick test of performance, I ran a test that loops in the kernel, linearly mapping then touching then unmapping 4 pages. Different numbers of tests were run in parallel on an 4 core, 2 socket opteron. Results are in nanoseconds per map+touch+unmap. threads vanilla vmap rewrite 1 14700 2900 2 33600 3000 4 49500 2800 8 70631 2900 So with a 8 cores, the rewritten version is already 25x faster. In a slightly more realistic test (although with an older and less scalable version of the patch), I ripped the not-very-good vunmap batching code out of XFS, and implemented the large buffer mapping with vm_map_ram and vm_unmap_ram... along with a couple of other tricks, I was able to speed up a large directory workload by 20x on a 64 CPU system. I believe vmap/vunmap is actually sped up a lot more than 20x on such a system, but I'm running into other locks now. vmap is pretty well blown off the profiles. Before: 1352059 total 0.1401 798784 _write_lock 8320.6667 <- vmlist_lock 529313 default_idle 1181.5022 15242 smp_call_function 15.8771 <- vmap tlb flushing 2472 __get_vm_area_node 1.9312 <- vmap 1762 remove_vm_area 4.5885 <- vunmap 316 map_vm_area 0.2297 <- vmap 312 kfree 0.1950 300 _spin_lock 3.1250 252 sn_send_IPI_phys 0.4375 <- tlb flushing 238 vmap 0.8264 <- vmap 216 find_lock_page 0.5192 196 find_next_bit 0.3603 136 sn2_send_IPI 0.2024 130 pio_phys_write_mmr 2.0312 118 unmap_kernel_range 0.1229 After: 78406 total 0.0081 40053 default_idle 89.4040 33576 ia64_spinlock_contention 349.7500 1650 _spin_lock 17.1875 319 __reg_op 0.5538 281 _atomic_dec_and_lock 1.0977 153 mutex_unlock 1.5938 123 iget_locked 0.1671 117 xfs_dir_lookup 0.1662 117 dput 0.1406 114 xfs_iget_core 0.0268 92 xfs_da_hashname 0.1917 75 d_alloc 0.0670 68 vmap_page_range 0.0462 <- vmap 58 kmem_cache_alloc 0.0604 57 memset 0.0540 52 rb_next 0.1625 50 __copy_user 0.0208 49 bitmap_find_free_region 0.2188 <- vmap 46 ia64_sn_udelay 0.1106 45 find_inode_fast 0.1406 42 memcmp 0.2188 42 finish_task_switch 0.1094 42 __d_lookup 0.0410 40 radix_tree_lookup_slot 0.1250 37 _spin_unlock_irqrestore 0.3854 36 xfs_bmapi 0.0050 36 kmem_cache_free 0.0256 35 xfs_vn_getattr 0.0322 34 radix_tree_lookup 0.1062 33 __link_path_walk 0.0035 31 xfs_da_do_buf 0.0091 30 _xfs_buf_find 0.0204 28 find_get_page 0.0875 27 xfs_iread 0.0241 27 __strncpy_from_user 0.2812 26 _xfs_buf_initialize 0.0406 24 _xfs_buf_lookup_pages 0.0179 24 vunmap_page_range 0.0250 <- vunmap 23 find_lock_page 0.0799 22 vm_map_ram 0.0087 <- vmap 20 kfree 0.0125 19 put_page 0.0330 18 __kmalloc 0.0176 17 xfs_da_node_lookup_int 0.0086 17 _read_lock 0.0885 17 page_waitqueue 0.0664 vmap has gone from being the top 5 on the profiles and flushing the crap out of all TLBs, to using less than 1% of kernel time. [akpm@linux-foundation.org: cleanups, section fix] [akpm@linux-foundation.org: fix build on alpha] Signed-off-by: Nick Piggin Cc: Jeremy Fitzhardinge Cc: Krzysztof Helt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pageattr.c | 2 ++ arch/x86/xen/enlighten.c | 1 + arch/x86/xen/mmu.c | 1 + 3 files changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a9ec89c3fbca..407d8784f669 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, /* Must avoid aliasing mappings in the highmem code */ kmap_flush_unused(); + vm_unmap_aliases(); + cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0013a729b41d..b61534c7a4c4 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l /* make sure there are no stray mappings of this page */ kmap_flush_unused(); + vm_unmap_aliases(); } } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index ae173f6edd8b..d4d52f5a1cf7 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) /* re-enable interrupts for kmap_flush_unused */ xen_mc_issue(0); kmap_flush_unused(); + vm_unmap_aliases(); xen_mc_batch(); } -- cgit v1.2.2 From dc52ddc0e6f45b04780b26fc0813509f8e798c42 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Sat, 18 Oct 2008 20:27:21 -0700 Subject: container freezer: implement freezer cgroup subsystem This patch implements a new freezer subsystem in the control groups framework. It provides a way to stop and resume execution of all tasks in a cgroup by writing in the cgroup filesystem. The freezer subsystem in the container filesystem defines a file named freezer.state. Writing "FROZEN" to the state file will freeze all tasks in the cgroup. Subsequently writing "RUNNING" will unfreeze the tasks in the cgroup. Reading will return the current state. * Examples of usage : # mkdir /containers/freezer # mount -t cgroup -ofreezer freezer /containers # mkdir /containers/0 # echo $some_pid > /containers/0/tasks to get status of the freezer subsystem : # cat /containers/0/freezer.state RUNNING to freeze all tasks in the container : # echo FROZEN > /containers/0/freezer.state # cat /containers/0/freezer.state FREEZING # cat /containers/0/freezer.state FROZEN to unfreeze all tasks in the container : # echo RUNNING > /containers/0/freezer.state # cat /containers/0/freezer.state RUNNING This is the basic mechanism which should do the right thing for user space task in a simple scenario. It's important to note that freezing can be incomplete. In that case we return EBUSY. This means that some tasks in the cgroup are busy doing something that prevents us from completely freezing the cgroup at this time. After EBUSY, the cgroup will remain partially frozen -- reflected by freezer.state reporting "FREEZING" when read. The state will remain "FREEZING" until one of these things happens: 1) Userspace cancels the freezing operation by writing "RUNNING" to the freezer.state file 2) Userspace retries the freezing operation by writing "FROZEN" to the freezer.state file (writing "FREEZING" is not legal and returns EIO) 3) The tasks that blocked the cgroup from entering the "FROZEN" state disappear from the cgroup's set of tasks. [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: export thaw_process] Signed-off-by: Cedric Le Goater Signed-off-by: Matt Helsley Acked-by: Serge E. Hallyn Tested-by: Matt Helsley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bd3c2c53873e..49349ba77d80 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -193,6 +193,7 @@ config X86_TRAMPOLINE config KTIME_SCALAR def_bool X86_32 source "init/Kconfig" +source "kernel/Kconfig.freezer" menu "Processor type and features" -- cgit v1.2.2 From 57cac4d1880527e0baf6c2fda529d2ad1d815aec Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Sat, 18 Oct 2008 20:28:25 -0700 Subject: kdump: make elfcorehdr_addr independent of CONFIG_PROC_VMCORE o elfcorehdr_addr is used by not only the code under CONFIG_PROC_VMCORE but also by the code which is not inside CONFIG_PROC_VMCORE. For example, is_kdump_kernel() is used by powerpc code to determine if kernel is booting after a panic then use previous kernel's TCE table. So even if CONFIG_PROC_VMCORE is not set in second kernel, one should be able to correctly determine that we are booting after a panic and setup calgary iommu accordingly. o So remove the assumption that elfcorehdr_addr is under CONFIG_PROC_VMCORE. o Move definition of elfcorehdr_addr to arch dependent crash files. (Unfortunately crash dump does not have an arch independent file otherwise that would have been the best place). o kexec.c is not the right place as one can Have CRASH_DUMP enabled in second kernel without KEXEC being enabled. o I don't see sh setup code parsing the command line for elfcorehdr_addr. I am wondering how does vmcore interface work on sh. Anyway, I am atleast defining elfcoredhr_addr so that compilation is not broken on sh. Signed-off-by: Vivek Goyal Acked-by: "Eric W. Biederman" Acked-by: Simon Horman Acked-by: Paul Mundt Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/crash_dump_32.c | 3 +++ arch/x86/kernel/crash_dump_64.c | 3 +++ arch/x86/kernel/setup.c | 8 +++++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 72d0c56c1b48..f7cdb3b457aa 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,6 +13,9 @@ static void *kdump_buf_page; +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c index e90a60ef10c2..045b36cada65 100644 --- a/arch/x86/kernel/crash_dump_64.c +++ b/arch/x86/kernel/crash_dump_64.c @@ -10,6 +10,9 @@ #include #include +/* Stores the physical address of elf header of crash image. */ +unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; + /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2255782e8d4b..b2c97874ec0f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -561,7 +561,13 @@ static void __init reserve_standard_io_resources(void) } -#ifdef CONFIG_PROC_VMCORE +/* + * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by + * is_kdump_kernel() to determine if we are booting after a panic. Hence + * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE. + */ + +#ifdef CONFIG_CRASH_DUMP /* elfcorehdr= specifies the location of elf core header * stored by the crashed kernel. This option will be passed * by kexec loader to the capture kernel. -- cgit v1.2.2 From 357c6e63590895dc87cc9300f5a1c27544ea69e8 Mon Sep 17 00:00:00 2001 From: Adrian Bunk Date: Sat, 18 Oct 2008 20:28:42 -0700 Subject: rtc: use bcd2bin/bin2bcd Change various rtc related code to use the new bcd2bin/bin2bcd functions instead of the obsolete BCD_TO_BIN/BIN_TO_BCD/BCD2BIN/BIN2BCD macros. Signed-off-by: Adrian Bunk Acked-by: Alessandro Zummo Cc: David Brownell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/rtc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 0a23b5795b25..dd6f2b71561b 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -52,7 +52,7 @@ int mach_set_rtc_mmss(unsigned long nowtime) cmos_minutes = CMOS_READ(RTC_MINUTES); if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) - BCD_TO_BIN(cmos_minutes); + cmos_minutes = bcd2bin(cmos_minutes); /* * since we're only adjusting minutes and seconds, @@ -69,8 +69,8 @@ int mach_set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) < 30) { if (!(save_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - BIN_TO_BCD(real_seconds); - BIN_TO_BCD(real_minutes); + real_seconds = bin2bcd(real_seconds); + real_minutes = bin2bcd(real_minutes); } CMOS_WRITE(real_seconds,RTC_SECONDS); CMOS_WRITE(real_minutes,RTC_MINUTES); @@ -124,16 +124,16 @@ unsigned long mach_get_cmos_time(void) WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY)); if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) { - BCD_TO_BIN(sec); - BCD_TO_BIN(min); - BCD_TO_BIN(hour); - BCD_TO_BIN(day); - BCD_TO_BIN(mon); - BCD_TO_BIN(year); + sec = bcd2bin(sec); + min = bcd2bin(min); + hour = bcd2bin(hour); + day = bcd2bin(day); + mon = bcd2bin(mon); + year = bcd2bin(year); } if (century) { - BCD_TO_BIN(century); + century = bcd2bin(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); } else -- cgit v1.2.2 From d768cb6929773060171eee8397a63883f60ddc07 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Mon, 25 Aug 2008 15:44:59 -0600 Subject: x86/PCI: follow lspci device/vendor style Use "[%04x:%04x]" for PCI vendor/device IDs to follow the format used by lspci(8). Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 006599db0dc7..52a1de1128c1 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -493,7 +493,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq if (pirq <= 4) irq = read_config_nybble(router, 0x56, pirq - 1); dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d get IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d get IRQ %d\n", dev->vendor, dev->device, pirq, irq); return irq; } @@ -501,7 +501,7 @@ static int pirq_amd756_get(struct pci_dev *router, struct pci_dev *dev, int pirq static int pirq_amd756_set(struct pci_dev *router, struct pci_dev *dev, int pirq, int irq) { dev_info(&dev->dev, - "AMD756: dev [%04x/%04x], router PIRQ %d set IRQ %d\n", + "AMD756: dev [%04x:%04x], router PIRQ %d set IRQ %d\n", dev->vendor, dev->device, pirq, irq); if (pirq <= 4) write_config_nybble(router, 0x56, pirq - 1, irq); @@ -823,7 +823,7 @@ static void __init pirq_find_router(struct irq_router *r) r->get = NULL; r->set = NULL; - DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for %04x:%04x\n", + DBG(KERN_DEBUG "PCI: Attempting to find IRQ router for [%04x:%04x]\n", rt->rtr_vendor, rt->rtr_device); pirq_router_dev = pci_get_bus_and_slot(rt->rtr_bus, rt->rtr_devfn); @@ -843,7 +843,7 @@ static void __init pirq_find_router(struct irq_router *r) h->probe(r, pirq_router_dev, pirq_router_dev->device)) break; } - dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x/%04x]\n", + dev_info(&pirq_router_dev->dev, "%s IRQ router [%04x:%04x]\n", pirq_router.name, pirq_router_dev->vendor, pirq_router_dev->device); -- cgit v1.2.2 From 37a84ec668ba251ae02cf2c2c664baf6b247ae1f Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Thu, 28 Aug 2008 15:40:59 -0700 Subject: x86/PCI: irq and pci_ids patch for Intel Ibex Peak DeviceIDs This patch updates the Intel Ibex Peak (PCH) LPC and SMBus Controller DeviceIDs. The LPC Controller ID is set by Firmware within the range of 0x3b00-3b1f. This range is included in pci_ids.h using min and max values, and irq.c now has code to handle the range (in lieu of 32 additions to a SWITCH statement). The SMBus Controller ID is a fixed-value and will not change. Signed-off-by: Seth Heasley Acked-by: Jean Delvare Signed-off-by: Jesse Barnes --- arch/x86/pci/irq.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 52a1de1128c1..bf69dbe08bff 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -590,13 +590,20 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route case PCI_DEVICE_ID_INTEL_ICH10_1: case PCI_DEVICE_ID_INTEL_ICH10_2: case PCI_DEVICE_ID_INTEL_ICH10_3: - case PCI_DEVICE_ID_INTEL_PCH_0: - case PCI_DEVICE_ID_INTEL_PCH_1: r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; return 1; } + + if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && + (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { + r->name = "PIIX/ICH"; + r->get = pirq_piix_get; + r->set = pirq_piix_set; + return 1; + } + return 0; } -- cgit v1.2.2 From db7a6d8d01b21829d28638258cbbc9553210bac1 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Oct 2008 11:24:31 -0700 Subject: Update .gitignore files for generated targets The generated 'capflags.c' file wasn't properly ignored, and the list of files in scripts/basic/ wasn't up-to-date. Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/.gitignore | 1 + 1 file changed, 1 insertion(+) create mode 100644 arch/x86/kernel/cpu/.gitignore (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/.gitignore b/arch/x86/kernel/cpu/.gitignore new file mode 100644 index 000000000000..667df55a4399 --- /dev/null +++ b/arch/x86/kernel/cpu/.gitignore @@ -0,0 +1 @@ +capflags.c -- cgit v1.2.2 From f4432c5caec5fa95ea7eefd00f8e6cee17e2e023 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Mon, 20 Oct 2008 13:31:45 -0400 Subject: Update email addresses. Update assorted email addresses and related info to point to a single current, valid address. additionally - trivial CREDITS entry updates. (Not that this file means much any more) - remove arjans dead redhat.com address from powernow driver Signed-off-by: Dave Jones Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/cpufreq/longhaul.c | 4 ++-- arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 2 +- arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 4 ++-- arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 2 +- arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 2 +- arch/x86/kernel/cpu/mcheck/k7.c | 4 ++-- arch/x86/kernel/cpu/mcheck/mce_32.c | 2 +- arch/x86/kernel/cpu/mcheck/non-fatal.c | 2 +- 8 files changed, 11 insertions(+), 11 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 06fcce516d51..b0461856acfb 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -1,5 +1,5 @@ /* - * (C) 2001-2004 Dave Jones. + * (C) 2001-2004 Dave Jones. * (C) 2002 Padraig Brady. * * Licensed under the terms of the GNU GPL License version 2. @@ -1019,7 +1019,7 @@ MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor"); module_param(revid_errata, int, 0644); MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID"); -MODULE_AUTHOR ("Dave Jones "); +MODULE_AUTHOR ("Dave Jones "); MODULE_DESCRIPTION ("Longhaul driver for VIA Cyrix processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c index b5ced806a316..c1ac5790c63e 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c @@ -246,7 +246,7 @@ static void __exit powernow_k6_exit(void) } -MODULE_AUTHOR("Arjan van de Ven , Dave Jones , Dominik Brodowski "); +MODULE_AUTHOR("Arjan van de Ven, Dave Jones , Dominik Brodowski "); MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors."); MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 0a61159d7b71..7c7d56b43136 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -1,6 +1,6 @@ /* * AMD K7 Powernow driver. - * (C) 2003 Dave Jones on behalf of SuSE Labs. + * (C) 2003 Dave Jones on behalf of SuSE Labs. * (C) 2003-2004 Dave Jones * * Licensed under the terms of the GNU GPL License version 2. @@ -692,7 +692,7 @@ static void __exit powernow_exit (void) module_param(acpi_force, int, 0444); MODULE_PARM_DESC(acpi_force, "Force ACPI to be used."); -MODULE_AUTHOR ("Dave Jones "); +MODULE_AUTHOR ("Dave Jones "); MODULE_DESCRIPTION ("Powernow driver for AMD K7 processors."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index 84bb395038d8..008d23ba491b 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -7,7 +7,7 @@ * Support : mark.langsdorf@amd.com * * Based on the powernow-k7.c module written by Dave Jones. - * (C) 2003 Dave Jones on behalf of SuSE Labs + * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski * (C) 2004 Pavel Machek * Licensed under the terms of the GNU GPL License version 2. diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c index 191f7263c61d..04d0376b64b0 100644 --- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c +++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c @@ -431,7 +431,7 @@ static void __exit speedstep_exit(void) } -MODULE_AUTHOR ("Dave Jones , Dominik Brodowski "); +MODULE_AUTHOR ("Dave Jones , Dominik Brodowski "); MODULE_DESCRIPTION ("Speedstep driver for Intel mobile processors on chipsets with ICH-M southbridges."); MODULE_LICENSE ("GPL"); diff --git a/arch/x86/kernel/cpu/mcheck/k7.c b/arch/x86/kernel/cpu/mcheck/k7.c index f390c9f66351..dd3af6e7b39a 100644 --- a/arch/x86/kernel/cpu/mcheck/k7.c +++ b/arch/x86/kernel/cpu/mcheck/k7.c @@ -1,6 +1,6 @@ /* - * Athlon/Hammer specific Machine Check Exception Reporting - * (C) Copyright 2002 Dave Jones + * Athlon specific Machine Check Exception Reporting + * (C) Copyright 2002 Dave Jones */ #include diff --git a/arch/x86/kernel/cpu/mcheck/mce_32.c b/arch/x86/kernel/cpu/mcheck/mce_32.c index 774d87cfd8cd..0ebf3fc6a610 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_32.c +++ b/arch/x86/kernel/cpu/mcheck/mce_32.c @@ -1,6 +1,6 @@ /* * mce.c - x86 Machine Check Exception Reporting - * (c) 2002 Alan Cox , Dave Jones + * (c) 2002 Alan Cox , Dave Jones */ #include diff --git a/arch/x86/kernel/cpu/mcheck/non-fatal.c b/arch/x86/kernel/cpu/mcheck/non-fatal.c index cc1fccdd31e0..a74af128efc9 100644 --- a/arch/x86/kernel/cpu/mcheck/non-fatal.c +++ b/arch/x86/kernel/cpu/mcheck/non-fatal.c @@ -1,7 +1,7 @@ /* * Non Fatal Machine Check Exception Reporting * - * (C) Copyright 2002 Dave Jones. + * (C) Copyright 2002 Dave Jones. * * This file contains routines to check for non-fatal MCEs every 15s * -- cgit v1.2.2 From e9f95e637320efe1936b647308ddf4ec5b8e0311 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 21 Oct 2008 15:49:59 +0200 Subject: genirq: fix off by one and coding style Fix off-by-one in for_each_irq_desc_reverse(). Impact is near zero in practice, because nothing substantial wants to iterate down to IRQ#0 - but fix it nevertheless. Signed-off-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- arch/x86/kernel/irq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index ccf6c503fc3b..d1d4dc52f649 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -36,7 +36,7 @@ void ack_bad_irq(unsigned int irq) } #ifdef CONFIG_X86_32 -# define irq_stats(x) (&per_cpu(irq_stat,x)) +# define irq_stats(x) (&per_cpu(irq_stat, x)) #else # define irq_stats(x) cpu_pda(x) #endif @@ -113,7 +113,7 @@ int show_interrupts(struct seq_file *p, void *v) if (i == 0) { seq_printf(p, " "); for_each_online_cpu(j) - seq_printf(p, "CPU%-8d",j); + seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } -- cgit v1.2.2 From c7d87d79d14cecab7a34dedf1b133377cf5a0203 Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Thu, 16 Oct 2008 16:34:43 -0400 Subject: x86 allow modules to register idle notifiers needed if the i7300_idle driver is to be modular. Signed-off-by: Venkatesh Pallipadi Acked-by: Ingo Molnar Signed-off-by: Len Brown --- arch/x86/kernel/process_64.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e12e0e4dd256..3e3d503eadcf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -62,6 +62,13 @@ void idle_notifier_register(struct notifier_block *n) { atomic_notifier_chain_register(&idle_notifier, n); } +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); void enter_idle(void) { -- cgit v1.2.2 From 27471fdb32e77ecb92f09d4ac5757785b4dc33bc Mon Sep 17 00:00:00 2001 From: Andy Henroid Date: Thu, 9 Oct 2008 11:45:22 -0700 Subject: i7300_idle driver v1.55 The Intel 7300 Memory Controller supports dynamic throttling of memory which can be used to save power when system is idle. This driver does the memory throttling when all CPUs are idle on such a system. Refer to "Intel 7300 Memory Controller Hub (MCH)" datasheet for the config space description. Signed-off-by: Andy Henroid Signed-off-by: Len Brown Signed-off-by: Venkatesh Pallipadi --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ed92864d1325..19cdfe1f237a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1536,6 +1536,8 @@ source "arch/x86/kernel/cpu/cpufreq/Kconfig" source "drivers/cpuidle/Kconfig" +source "drivers/idle/Kconfig" + endmenu -- cgit v1.2.2 From 8bcad30f2e6d4c20f7e71d2e2ac77acc0f0931e5 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Tue, 21 Oct 2008 19:49:09 -0400 Subject: x86: make variables static These variables are only used in their source files, so make them static. Signed-off-by: Roel Kluin Signed-off-by: Ingo Molnar --- arch/x86/boot/video-bios.c | 4 ++-- arch/x86/boot/video-vesa.c | 4 ++-- arch/x86/kernel/xsave.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 49f26aaaebc8..3fa979c9c363 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -17,7 +17,7 @@ #include "boot.h" #include "video.h" -__videocard video_bios; +static __videocard video_bios; /* Set a conventional BIOS mode */ static int set_bios_mode(u8 mode); @@ -119,7 +119,7 @@ static int bios_probe(void) return nmodes; } -__videocard video_bios = +static __videocard video_bios = { .card_name = "BIOS", .probe = bios_probe, diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 99b3079dc6ab..75115849af33 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -20,7 +20,7 @@ static struct vesa_general_info vginfo; static struct vesa_mode_info vminfo; -__videocard video_vesa; +static __videocard video_vesa; #ifndef _WAKEUP static void vesa_store_mode_params_graphics(void); @@ -293,7 +293,7 @@ void vesa_store_edid(void) #endif /* not _WAKEUP */ -__videocard video_vesa = +static __videocard video_vesa = { .card_name = "VESA", .probe = vesa_probe, diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 9abac8a9d823..b13acb75e822 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -248,7 +248,7 @@ clear: * This will be saved when ever the FP and extended state context is * saved on the user stack during the signal handler delivery to the user. */ -void prepare_fx_sw_frame(void) +static void prepare_fx_sw_frame(void) { int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + FP_XSTATE_MAGIC2_SIZE; -- cgit v1.2.2 From 2bfef69d9e8cc056aa4dbc13f2136747340b4515 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 15 Oct 2008 11:51:53 +0200 Subject: x86: SB600: skip IRQ0 override if it is not routed to INT2 of IOAPIC Impact: fix hung bootup and other misbehavior on certain laptops On some more HP laptops BIOS reports an IRQ0 override but the SB600 chipset is configured such that timer interrupts go to INT0 of IOAPIC. Check IRQ0 routing and if it is routed to INT0 of IOAPIC skip the timer override. See following bug reports: http://bugzilla.kernel.org/show_bug.cgi?id=11715 http://bugzilla.kernel.org/show_bug.cgi?id=11516 Signed-off-by: Andreas Herrmann Cc: Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 55 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 733c4f8d42ea..3ce029ffaa55 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -95,7 +95,8 @@ static void __init nvidia_bugs(int num, int slot, int func) } -static u32 ati_ixp4x0_rev(int num, int slot, int func) +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) +static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { u32 d; u8 b; @@ -115,7 +116,6 @@ static u32 ati_ixp4x0_rev(int num, int slot, int func) static void __init ati_bugs(int num, int slot, int func) { -#if defined(CONFIG_ACPI) && defined (CONFIG_X86_IO_APIC) u32 d; u8 b; @@ -138,9 +138,56 @@ static void __init ati_bugs(int num, int slot, int func) printk(KERN_INFO "If you got timer trouble " "try acpi_use_timer_override\n"); } -#endif } +static u32 __init ati_sbx00_rev(int num, int slot, int func) +{ + u32 old, d; + + d = read_pci_config(num, slot, func, 0x70); + old = d; + d &= ~(1<<8); + write_pci_config(num, slot, func, 0x70, d); + d = read_pci_config(num, slot, func, 0x8); + d &= 0xff; + write_pci_config(num, slot, func, 0x70, old); + + return d; +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ + u32 d, rev; + + if (acpi_use_timer_override) + return; + + rev = ati_sbx00_rev(num, slot, func); + if (rev > 0x13) + return; + + /* check for IRQ0 interrupt swap */ + d = read_pci_config(num, slot, func, 0x64); + if (!(d & (1<<14))) + acpi_skip_timer_override = 1; + + if (acpi_skip_timer_override) { + printk(KERN_INFO "SB600 revision 0x%x\n", rev); + printk(KERN_INFO "Ignoring ACPI timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); + } +} +#else +static void __init ati_bugs(int num, int slot, int func) +{ +} + +static void __init ati_bugs_contd(int num, int slot, int func) +{ +} +#endif + #ifdef CONFIG_DMAR static void __init intel_g33_dmar(int num, int slot, int func) { @@ -176,6 +223,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, fix_hypertransport_config }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_IXP400_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, + { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, #ifdef CONFIG_DMAR { PCI_VENDOR_ID_INTEL, 0x29c0, PCI_CLASS_BRIDGE_HOST, PCI_ANY_ID, 0, intel_g33_dmar }, -- cgit v1.2.2 From d2f6f7aeee890df445be29a60e34925ec15f620c Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 21 Oct 2008 22:45:22 +0200 Subject: MCE: Don't run 32bit machine checks with interrupts on Running machine checks with interrupt on is a extremly bad idea. The machine check handler only runs when the system is broken and needs to finish as quickly as possible. Remove the respective bogus post 2.6.27 regression and call the machine check vector directly again. This removes only code. Signed-off-by: Andi Kleen [Cherry-picked from x86/mce] Signed-off-by: Thomas Gleixner --- arch/x86/kernel/entry_32.S | 2 +- arch/x86/kernel/traps.c | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index c356423a6026..dd65143941a8 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1024,7 +1024,7 @@ ENTRY(machine_check) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 - pushl $do_machine_check + pushl machine_check_vector CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index e062974cce34..04d242ab0161 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -931,14 +931,6 @@ do_device_not_available(struct pt_regs *regs, long error) } #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_MCE -dotraplinkage void __kprobes do_machine_check(struct pt_regs *regs, long error) -{ - conditional_sti(regs); - machine_check_vector(regs, error); -} -#endif - dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; -- cgit v1.2.2 From cf52ebedba77ee494b495dedd3a1f55944611275 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Fri, 17 Oct 2008 17:00:13 -0400 Subject: x86, kexec: fix hang on i386 when panic occurs while console_sem is held There's a corner case in 32 bit x86 kdump at the moment. When the box panics via nmi, we call bust_spinlocks(1) to disable sensitivity to the console_sem (allowing us to print to the console in all cases), but we don't call crash_kexec, until after we call bust_spinlocks(0), which re-enables console_sem sensitivity. The result is that, if we get an nmi while the console_sem is held and kdump is configured, and we try to print something to the console during kdump shutdown (which we often do) we deadlock the box. The fix is to simply do what 64 bit die_nmi does which is to not call bust_spinlocks(0) until after we call crash_kexec. Patch below tested successfully by me. Signed-off-by: Neil Horman Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 1a78180f08d3..b3614752197b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -405,7 +405,6 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) panic("Non maskable interrupt"); console_silent(); spin_unlock(&nmi_print_lock); - bust_spinlocks(0); /* * If we are in kernel we are probably nested up pretty bad @@ -416,6 +415,7 @@ die_nmi(char *str, struct pt_regs *regs, int do_panic) crash_kexec(regs); } + bust_spinlocks(0); do_exit(SIGSEGV); } -- cgit v1.2.2 From 35af28219e684a36cc8b1ff456c370ce22be157d Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 22 Oct 2008 13:08:31 +0200 Subject: x86: call dmi-quirks for HP Laptops after early-quirks are executed Impact: make warning message disappear - functionality unchanged Problems with bogus IRQ0 override of those laptops should be fixed with commits x86: SB600: skip IRQ0 override if it is not routed to INT2 of IOAPIC x86: SB450: skip IRQ0 override if it is not routed to INT2 of IOAPIC that introduce early-quirks based on chipset configuration. For further information, see http://bugzilla.kernel.org/show_bug.cgi?id=11516 Instead of removing the related dmi-quirks completely we'd like to keep them for (at least) one kernel version -- to double-check whether the early-quirks really took effect. But the dmi-quirks need to be called after early-quirks are executed. With this patch calling sequence for dmi-quriks is changed as follows: acpi_boot_table_init() (dmi-quirks) ... early_quirks() (detect bogus IRQ0 override) ... acpi_boot_init() (late dmi-quirks and setup IO APIC) Note: Plan is to remove the "late dmi-quirks" with next kernel version. Signed-off-by: Andreas Herrmann Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 0d1c26a583c5..8072edb62478 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1598,6 +1598,11 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "TravelMate 360"), }, }, + {} +}; + +/* second table for DMI checks that should run after early-quirks */ +static struct dmi_system_id __initdata acpi_dmi_table_late[] = { /* * HP laptops which use a DSDT reporting as HP/SB400/10000, * which includes some code which overrides all temperature @@ -1726,6 +1731,9 @@ int __init early_acpi_boot_init(void) int __init acpi_boot_init(void) { + /* those are executed after early-quirks are executed */ + dmi_check_system(acpi_dmi_table_late); + /* * If acpi_disabled, bail out * One exception: acpi=ht continues far enough to enumerate LAPICs -- cgit v1.2.2 From bc8bcc79ea4203c7d04309f1307ab88c86f0b0cf Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Wed, 22 Oct 2008 12:42:30 +0800 Subject: x86/proc: fix /proc/cpuinfo cpu offline bug Impact: fix missing CPUs in /proc/cpuinfo after CPU hotunplug/hotreplug In my test, I found that if a cpu has been offline, the next cpus may not be shown in the /proc/cpuinfo. if one read() cannot consume the whole /proc/cpuinfo, c_start() will be called again in the next read() calls. And *pos has been increased by 1 by the caller(seq_read()). if this time the cpu#*pos is offline, c_start() will return NULL, and the next cpus can not be shown. this fix use next_cpu_nr(*pos - 1, cpu_online_map) to search the next unshown cpu. the most easy way to reproduce this bug: 1) offline cpu#1 (cpu#0 is online) 2) dd ibs=2 if=/proc/cpuinfo the result is that only cpu#0 is shown. cpu#2 and cpu#3 .... cannot be shown in /proc/cpuinfo it's bug. Signed-off-by: Lai Jiangshan Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/proc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index a26c480b9491..01b1244ef1c0 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -160,14 +160,16 @@ static void *c_start(struct seq_file *m, loff_t *pos) { if (*pos == 0) /* just in case, cpu 0 is not the first */ *pos = first_cpu(cpu_online_map); - if ((*pos) < nr_cpu_ids && cpu_online(*pos)) + else + *pos = next_cpu_nr(*pos - 1, cpu_online_map); + if ((*pos) < nr_cpu_ids) return &cpu_data(*pos); return NULL; } static void *c_next(struct seq_file *m, void *v, loff_t *pos) { - *pos = next_cpu(*pos, cpu_online_map); + (*pos)++; return c_start(m, pos); } -- cgit v1.2.2 From db96b0a0e4158806fcf03945a870f9320e6bac9b Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 22 Oct 2008 18:00:09 +0400 Subject: x86: do_boot_cpu - check if we have ESR register Impact: fix APIC IRQ irregularities on certain older boxes We should touch the APIC ESR register if only we have it. The patch fixes the problem mentioned by Max Kellermann: http://lkml.org/lkml/2008/10/17/147 Bisected-by: Max Kellermann Signed-off-by: Cyrill Gorcunov [ mingo@elte.hu: build fix ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7ece815ea637..7b1093397319 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -893,9 +893,11 @@ do_rest: smpboot_setup_warm_reset_vector(start_ip); /* * Be paranoid about clearing APIC errors. - */ - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid])) { + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } } /* -- cgit v1.2.2 From 55410791c91f448737c90585d2e280edd3c4d5c7 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Wed, 15 Oct 2008 10:37:04 -0300 Subject: x86: remove redundant KERN_DEBUG on pr_debug pr_debug don't need KERN_DEBUG. Signed-off-by: Gustavo F. Padovan Signed-off-by: Ingo Molnar --- arch/x86/kernel/acpi/boot.c | 2 +- arch/x86/kernel/setup_percpu.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 8072edb62478..dc3eac353db2 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1136,7 +1136,7 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity) return gsi; } if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) { - pr_debug(KERN_DEBUG "Pin %d-%d already programmed\n", + pr_debug("Pin %d-%d already programmed\n", mp_ioapic_routing[ioapic].apic_id, ioapic_pin); #ifdef CONFIG_X86_32 return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]); diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 410c88f0bfeb..ae0c0d3bb770 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -218,7 +218,7 @@ static void __init setup_node_to_cpumask_map(void) /* allocate the map */ map = alloc_bootmem_low(nr_node_ids * sizeof(cpumask_t)); - pr_debug(KERN_DEBUG "Node to cpumask map at %p for %d nodes\n", + pr_debug("Node to cpumask map at %p for %d nodes\n", map, nr_node_ids); /* node_to_cpumask() will now work */ -- cgit v1.2.2 From aef8f5b8c2da706cb3efe701e2a35e11221ea5bd Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 14 Oct 2008 21:43:43 -0700 Subject: x86/tlb_uv: remove strange mc146818rtc include For some reason tlb_uv was including linux/mc146818rtc.h. It really just needs linux/seq_file.h Signed-off-by: Jeremy Fitzhardinge Cc: Cliff Wickman Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 8b8c0d6640fa..04431f34fd16 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -6,7 +6,7 @@ * This code is released under the GNU General Public License version 2 or * later. */ -#include +#include #include #include -- cgit v1.2.2 From 2cb0ebeeb664e6eefe06951709949203e04f7c7b Mon Sep 17 00:00:00 2001 From: Daniele Calore Date: Mon, 13 Oct 2008 10:34:12 +0200 Subject: x86: memtest fix use of reserve_early() Hi all, Wrong usage of 2nd parameter in reserve_early call. 66/75: reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); ^^^^^^^^^^^^^^^^^^^^ The correct way is to use 'end' address and not 'size'. As a bonus a fix to the printk format. Signed-off-by: Daniele Calore Acked-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/mm/memtest.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 672e17f8262a..9cab18b0b857 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -61,9 +61,9 @@ static void __init memtest(unsigned long start_phys, unsigned long size, last_bad += incr; } else { if (start_bad) { - printk(KERN_CONT "\n %010lx bad mem addr %010lx - %010lx reserved", + printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); } start_bad = last_bad = start_phys_aligned; } @@ -72,9 +72,8 @@ static void __init memtest(unsigned long start_phys, unsigned long size, if (start_bad) { printk(KERN_CONT "\n %016lx bad mem addr %010lx - %010lx reserved", val, start_bad, last_bad + incr); - reserve_early(start_bad, last_bad - start_bad, "BAD RAM"); + reserve_early(start_bad, last_bad + incr, "BAD RAM"); } - } /* default is disabled */ -- cgit v1.2.2 From 983f91ff949d9008eb7fb73ba4e35f9decc182b3 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:08 +0200 Subject: x86: fix section mismatch warning - apic_flat Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbe08): Section mismatch in reference from the variable apic_flat to the function .init.text:flat_acpi_madt_oem_check() The variable apic_flat references the function __init flat_acpi_madt_oem_check() This is harmless, because the .acpi_madt_oem_check is only called during init time. But we keep the function pointer around in a .data function pointer template, so it's better we do not keep that stale - so mark this function non-__init. Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index 2ec2de8d8c46..a10ed42a35e0 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -25,7 +25,7 @@ #include #endif -static int __init flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 1; } -- cgit v1.2.2 From fae1721601a008fc83a809fdd76085a56f2275f0 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:09 +0200 Subject: x86: fix section mismatch warning - apic_physflat Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbe88): Section mismatch in reference from the variable apic_physflat to the function .init.text:physflat_acpi_madt_oem_check() The variable apic_physflat references the function __init physflat_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genapic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genapic_flat_64.c b/arch/x86/kernel/genapic_flat_64.c index a10ed42a35e0..c0262791bda4 100644 --- a/arch/x86/kernel/genapic_flat_64.c +++ b/arch/x86/kernel/genapic_flat_64.c @@ -170,7 +170,7 @@ struct genapic apic_flat = { * We cannot use logical delivery in this case because the mask * overflows, so use physical mode. */ -static int __init physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { #ifdef CONFIG_ACPI /* -- cgit v1.2.2 From f8827c017f19e1cd8834821f4303d5f163feab84 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:10 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_uv_x Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbf08): Section mismatch in reference from the variable apic_x2apic_uv_x to the function .init.text:uv_acpi_madt_oem_check() The variable apic_x2apic_uv_x references the function __init uv_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_uv_x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_uv_x.c b/arch/x86/kernel/genx2apic_uv_x.c index bfd532843df6..680a06557c5e 100644 --- a/arch/x86/kernel/genx2apic_uv_x.c +++ b/arch/x86/kernel/genx2apic_uv_x.c @@ -30,7 +30,7 @@ DEFINE_PER_CPU(int, x2apic_extra_bits); static enum uv_system_type uv_system_type; -static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (!strcmp(oem_id, "SGI")) { if (!strcmp(oem_table_id, "UVL")) -- cgit v1.2.2 From 2caa37150d7aa110b6155cb77b07e581c103fef4 Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:44:11 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_cluster Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xbf88): Section mismatch in reference from the variable apic_x2apic_cluster to the function .init.text:x2apic_acpi_madt_oem_check() The variable apic_x2apic_cluster references the function __init x2apic_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_cluster.c b/arch/x86/kernel/genx2apic_cluster.c index e4bf2cc0d743..f6a2c8eb48a6 100644 --- a/arch/x86/kernel/genx2apic_cluster.c +++ b/arch/x86/kernel/genx2apic_cluster.c @@ -12,7 +12,7 @@ DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); -static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (cpu_has_x2apic) return 1; -- cgit v1.2.2 From 3cfba0892585d4c8e7b4122b5dc0d206a76936de Mon Sep 17 00:00:00 2001 From: Marcin Slusarz Date: Sun, 12 Oct 2008 11:46:23 +0200 Subject: x86: fix section mismatch warning - apic_x2apic_phys Impact: cleanup only, no functionality changed WARNING: vmlinux.o(.data+0xc008): Section mismatch in reference from the variable apic_x2apic_phys to the function .init.text:x2apic_acpi_madt_oem_check() The variable apic_x2apic_phys references the function __init x2apic_acpi_madt_oem_check() Signed-off-by: Marcin Slusarz Signed-off-by: Ingo Molnar --- arch/x86/kernel/genx2apic_phys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/genx2apic_phys.c b/arch/x86/kernel/genx2apic_phys.c index 8f1343df2627..d042211768b7 100644 --- a/arch/x86/kernel/genx2apic_phys.c +++ b/arch/x86/kernel/genx2apic_phys.c @@ -19,7 +19,7 @@ static int set_x2apic_phys_mode(char *arg) } early_param("x2apic_phys", set_x2apic_phys_mode); -static int __init x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { if (cpu_has_x2apic && x2apic_phys) return 1; -- cgit v1.2.2 From bb8985586b7a906e116db835c64773b7a7d51663 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 17 Aug 2008 21:05:42 -0400 Subject: x86, um: ... and asm-x86 move Signed-off-by: Al Viro Signed-off-by: H. Peter Anvin --- arch/x86/Makefile | 6 +- arch/x86/include/asm/Kbuild | 24 + arch/x86/include/asm/a.out-core.h | 73 + arch/x86/include/asm/a.out.h | 20 + arch/x86/include/asm/acpi.h | 178 +++ arch/x86/include/asm/agp.h | 35 + arch/x86/include/asm/alternative-asm.h | 22 + arch/x86/include/asm/alternative.h | 183 +++ arch/x86/include/asm/amd_iommu.h | 35 + arch/x86/include/asm/amd_iommu_types.h | 404 +++++ arch/x86/include/asm/apic.h | 199 +++ arch/x86/include/asm/apicdef.h | 417 ++++++ arch/x86/include/asm/arch_hooks.h | 26 + arch/x86/include/asm/asm.h | 47 + arch/x86/include/asm/atomic.h | 5 + arch/x86/include/asm/atomic_32.h | 259 ++++ arch/x86/include/asm/atomic_64.h | 473 ++++++ arch/x86/include/asm/auxvec.h | 12 + arch/x86/include/asm/bigsmp/apic.h | 139 ++ arch/x86/include/asm/bigsmp/apicdef.h | 13 + arch/x86/include/asm/bigsmp/ipi.h | 25 + arch/x86/include/asm/bios_ebda.h | 36 + arch/x86/include/asm/bitops.h | 451 ++++++ arch/x86/include/asm/boot.h | 26 + arch/x86/include/asm/bootparam.h | 111 ++ arch/x86/include/asm/bug.h | 39 + arch/x86/include/asm/bugs.h | 12 + arch/x86/include/asm/byteorder.h | 81 + arch/x86/include/asm/cache.h | 20 + arch/x86/include/asm/cacheflush.h | 118 ++ arch/x86/include/asm/calgary.h | 72 + arch/x86/include/asm/calling.h | 170 +++ arch/x86/include/asm/checksum.h | 5 + arch/x86/include/asm/checksum_32.h | 189 +++ arch/x86/include/asm/checksum_64.h | 191 +++ arch/x86/include/asm/cmpxchg.h | 5 + arch/x86/include/asm/cmpxchg_32.h | 344 +++++ arch/x86/include/asm/cmpxchg_64.h | 185 +++ arch/x86/include/asm/compat.h | 218 +++ arch/x86/include/asm/cpu.h | 20 + arch/x86/include/asm/cpufeature.h | 271 ++++ arch/x86/include/asm/cputime.h | 1 + arch/x86/include/asm/current.h | 39 + arch/x86/include/asm/debugreg.h | 70 + arch/x86/include/asm/delay.h | 31 + arch/x86/include/asm/desc.h | 409 +++++ arch/x86/include/asm/desc_defs.h | 95 ++ arch/x86/include/asm/device.h | 16 + arch/x86/include/asm/div64.h | 60 + arch/x86/include/asm/dma-mapping.h | 308 ++++ arch/x86/include/asm/dma.h | 318 ++++ arch/x86/include/asm/dmi.h | 26 + arch/x86/include/asm/ds.h | 238 +++ arch/x86/include/asm/dwarf2.h | 61 + arch/x86/include/asm/e820.h | 146 ++ arch/x86/include/asm/edac.h | 18 + arch/x86/include/asm/efi.h | 110 ++ arch/x86/include/asm/elf.h | 336 +++++ arch/x86/include/asm/emergency-restart.h | 18 + arch/x86/include/asm/errno.h | 1 + arch/x86/include/asm/es7000/apic.h | 193 +++ arch/x86/include/asm/es7000/apicdef.h | 13 + arch/x86/include/asm/es7000/ipi.h | 24 + arch/x86/include/asm/es7000/mpparse.h | 30 + arch/x86/include/asm/es7000/wakecpu.h | 59 + arch/x86/include/asm/fb.h | 21 + arch/x86/include/asm/fcntl.h | 1 + arch/x86/include/asm/fixmap.h | 68 + arch/x86/include/asm/fixmap_32.h | 123 ++ arch/x86/include/asm/fixmap_64.h | 83 ++ arch/x86/include/asm/floppy.h | 281 ++++ arch/x86/include/asm/frame.h | 27 + arch/x86/include/asm/ftrace.h | 24 + arch/x86/include/asm/futex.h | 140 ++ arch/x86/include/asm/gart.h | 73 + arch/x86/include/asm/genapic.h | 5 + arch/x86/include/asm/genapic_32.h | 126 ++ arch/x86/include/asm/genapic_64.h | 58 + arch/x86/include/asm/geode.h | 253 ++++ arch/x86/include/asm/gpio.h | 56 + arch/x86/include/asm/hardirq.h | 11 + arch/x86/include/asm/hardirq_32.h | 28 + arch/x86/include/asm/hardirq_64.h | 23 + arch/x86/include/asm/highmem.h | 82 + arch/x86/include/asm/hpet.h | 114 ++ arch/x86/include/asm/hugetlb.h | 93 ++ arch/x86/include/asm/hw_irq.h | 131 ++ arch/x86/include/asm/hypertransport.h | 45 + arch/x86/include/asm/i387.h | 400 +++++ arch/x86/include/asm/i8253.h | 18 + arch/x86/include/asm/i8259.h | 63 + arch/x86/include/asm/ia32.h | 170 +++ arch/x86/include/asm/ia32_unistd.h | 18 + arch/x86/include/asm/idle.h | 15 + arch/x86/include/asm/intel_arch_perfmon.h | 31 + arch/x86/include/asm/io.h | 91 ++ arch/x86/include/asm/io_32.h | 284 ++++ arch/x86/include/asm/io_64.h | 244 +++ arch/x86/include/asm/io_apic.h | 204 +++ arch/x86/include/asm/ioctl.h | 1 + arch/x86/include/asm/ioctls.h | 94 ++ arch/x86/include/asm/iommu.h | 46 + arch/x86/include/asm/ipcbuf.h | 28 + arch/x86/include/asm/ipi.h | 138 ++ arch/x86/include/asm/irq.h | 50 + arch/x86/include/asm/irq_regs.h | 5 + arch/x86/include/asm/irq_regs_32.h | 29 + arch/x86/include/asm/irq_regs_64.h | 1 + arch/x86/include/asm/irq_remapping.h | 8 + arch/x86/include/asm/irq_vectors.h | 164 ++ arch/x86/include/asm/irqflags.h | 211 +++ arch/x86/include/asm/ist.h | 34 + arch/x86/include/asm/k8.h | 15 + arch/x86/include/asm/kdebug.h | 37 + arch/x86/include/asm/kexec.h | 175 +++ arch/x86/include/asm/kgdb.h | 79 + arch/x86/include/asm/kmap_types.h | 29 + arch/x86/include/asm/kprobes.h | 88 ++ arch/x86/include/asm/kvm.h | 211 +++ arch/x86/include/asm/kvm_host.h | 752 ++++++++++ arch/x86/include/asm/kvm_para.h | 147 ++ arch/x86/include/asm/kvm_x86_emulate.h | 184 +++ arch/x86/include/asm/ldt.h | 40 + arch/x86/include/asm/lguest.h | 94 ++ arch/x86/include/asm/lguest_hcall.h | 71 + arch/x86/include/asm/linkage.h | 61 + arch/x86/include/asm/local.h | 235 +++ arch/x86/include/asm/mach-default/apm.h | 73 + arch/x86/include/asm/mach-default/do_timer.h | 16 + arch/x86/include/asm/mach-default/entry_arch.h | 36 + arch/x86/include/asm/mach-default/mach_apic.h | 156 ++ arch/x86/include/asm/mach-default/mach_apicdef.h | 24 + arch/x86/include/asm/mach-default/mach_ipi.h | 64 + arch/x86/include/asm/mach-default/mach_mpparse.h | 17 + arch/x86/include/asm/mach-default/mach_mpspec.h | 12 + arch/x86/include/asm/mach-default/mach_timer.h | 48 + arch/x86/include/asm/mach-default/mach_traps.h | 33 + arch/x86/include/asm/mach-default/mach_wakecpu.h | 42 + arch/x86/include/asm/mach-default/pci-functions.h | 19 + arch/x86/include/asm/mach-default/setup_arch.h | 3 + arch/x86/include/asm/mach-default/smpboot_hooks.h | 59 + arch/x86/include/asm/mach-generic/gpio.h | 15 + arch/x86/include/asm/mach-generic/mach_apic.h | 33 + arch/x86/include/asm/mach-generic/mach_apicdef.h | 11 + arch/x86/include/asm/mach-generic/mach_ipi.h | 10 + arch/x86/include/asm/mach-generic/mach_mpparse.h | 10 + arch/x86/include/asm/mach-generic/mach_mpspec.h | 12 + arch/x86/include/asm/mach-rdc321x/gpio.h | 60 + arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h | 12 + arch/x86/include/asm/mach-voyager/do_timer.h | 17 + arch/x86/include/asm/mach-voyager/entry_arch.h | 26 + arch/x86/include/asm/mach-voyager/setup_arch.h | 12 + arch/x86/include/asm/math_emu.h | 31 + arch/x86/include/asm/mc146818rtc.h | 104 ++ arch/x86/include/asm/mca.h | 43 + arch/x86/include/asm/mca_dma.h | 201 +++ arch/x86/include/asm/mce.h | 130 ++ arch/x86/include/asm/microcode.h | 47 + arch/x86/include/asm/mman.h | 20 + arch/x86/include/asm/mmconfig.h | 12 + arch/x86/include/asm/mmu.h | 26 + arch/x86/include/asm/mmu_context.h | 37 + arch/x86/include/asm/mmu_context_32.h | 56 + arch/x86/include/asm/mmu_context_64.h | 54 + arch/x86/include/asm/mmx.h | 14 + arch/x86/include/asm/mmzone.h | 5 + arch/x86/include/asm/mmzone_32.h | 134 ++ arch/x86/include/asm/mmzone_64.h | 51 + arch/x86/include/asm/module.h | 80 + arch/x86/include/asm/mpspec.h | 145 ++ arch/x86/include/asm/mpspec_def.h | 180 +++ arch/x86/include/asm/msgbuf.h | 39 + arch/x86/include/asm/msidef.h | 55 + arch/x86/include/asm/msr-index.h | 332 +++++ arch/x86/include/asm/msr.h | 247 +++ arch/x86/include/asm/mtrr.h | 173 +++ arch/x86/include/asm/mutex.h | 5 + arch/x86/include/asm/mutex_32.h | 125 ++ arch/x86/include/asm/mutex_64.h | 100 ++ arch/x86/include/asm/nmi.h | 81 + arch/x86/include/asm/nops.h | 118 ++ arch/x86/include/asm/numa.h | 5 + arch/x86/include/asm/numa_32.h | 11 + arch/x86/include/asm/numa_64.h | 43 + arch/x86/include/asm/numaq.h | 169 +++ arch/x86/include/asm/numaq/apic.h | 136 ++ arch/x86/include/asm/numaq/apicdef.h | 14 + arch/x86/include/asm/numaq/ipi.h | 25 + arch/x86/include/asm/numaq/mpparse.h | 7 + arch/x86/include/asm/numaq/wakecpu.h | 43 + arch/x86/include/asm/olpc.h | 132 ++ arch/x86/include/asm/page.h | 209 +++ arch/x86/include/asm/page_32.h | 136 ++ arch/x86/include/asm/page_64.h | 105 ++ arch/x86/include/asm/param.h | 22 + arch/x86/include/asm/paravirt.h | 1650 +++++++++++++++++++++ arch/x86/include/asm/parport.h | 10 + arch/x86/include/asm/pat.h | 22 + arch/x86/include/asm/pci-direct.h | 21 + arch/x86/include/asm/pci.h | 114 ++ arch/x86/include/asm/pci_32.h | 34 + arch/x86/include/asm/pci_64.h | 66 + arch/x86/include/asm/pda.h | 137 ++ arch/x86/include/asm/percpu.h | 218 +++ arch/x86/include/asm/pgalloc.h | 114 ++ arch/x86/include/asm/pgtable-2level-defs.h | 20 + arch/x86/include/asm/pgtable-2level.h | 79 + arch/x86/include/asm/pgtable-3level-defs.h | 28 + arch/x86/include/asm/pgtable-3level.h | 175 +++ arch/x86/include/asm/pgtable.h | 561 +++++++ arch/x86/include/asm/pgtable_32.h | 191 +++ arch/x86/include/asm/pgtable_64.h | 285 ++++ arch/x86/include/asm/poll.h | 1 + arch/x86/include/asm/posix_types.h | 13 + arch/x86/include/asm/posix_types_32.h | 85 ++ arch/x86/include/asm/posix_types_64.h | 119 ++ arch/x86/include/asm/prctl.h | 10 + arch/x86/include/asm/processor-cyrix.h | 38 + arch/x86/include/asm/processor-flags.h | 100 ++ arch/x86/include/asm/processor.h | 936 ++++++++++++ arch/x86/include/asm/proto.h | 32 + arch/x86/include/asm/ptrace-abi.h | 145 ++ arch/x86/include/asm/ptrace.h | 280 ++++ arch/x86/include/asm/pvclock-abi.h | 42 + arch/x86/include/asm/pvclock.h | 14 + arch/x86/include/asm/reboot.h | 21 + arch/x86/include/asm/reboot_fixups.h | 6 + arch/x86/include/asm/required-features.h | 82 + arch/x86/include/asm/resource.h | 1 + arch/x86/include/asm/resume-trace.h | 21 + arch/x86/include/asm/rio.h | 63 + arch/x86/include/asm/rtc.h | 1 + arch/x86/include/asm/rwlock.h | 8 + arch/x86/include/asm/rwsem.h | 265 ++++ arch/x86/include/asm/scatterlist.h | 33 + arch/x86/include/asm/seccomp.h | 5 + arch/x86/include/asm/seccomp_32.h | 17 + arch/x86/include/asm/seccomp_64.h | 25 + arch/x86/include/asm/sections.h | 1 + arch/x86/include/asm/segment.h | 209 +++ arch/x86/include/asm/sembuf.h | 24 + arch/x86/include/asm/serial.h | 29 + arch/x86/include/asm/setup.h | 105 ++ arch/x86/include/asm/shmbuf.h | 51 + arch/x86/include/asm/shmparam.h | 6 + arch/x86/include/asm/sigcontext.h | 284 ++++ arch/x86/include/asm/sigcontext32.h | 75 + arch/x86/include/asm/siginfo.h | 10 + arch/x86/include/asm/signal.h | 262 ++++ arch/x86/include/asm/smp.h | 229 +++ arch/x86/include/asm/socket.h | 57 + arch/x86/include/asm/sockios.h | 13 + arch/x86/include/asm/sparsemem.h | 34 + arch/x86/include/asm/spinlock.h | 364 +++++ arch/x86/include/asm/spinlock_types.h | 20 + arch/x86/include/asm/srat.h | 39 + arch/x86/include/asm/stacktrace.h | 21 + arch/x86/include/asm/stat.h | 114 ++ arch/x86/include/asm/statfs.h | 12 + arch/x86/include/asm/string.h | 5 + arch/x86/include/asm/string_32.h | 326 ++++ arch/x86/include/asm/string_64.h | 60 + arch/x86/include/asm/summit/apic.h | 184 +++ arch/x86/include/asm/summit/apicdef.h | 13 + arch/x86/include/asm/summit/ipi.h | 25 + arch/x86/include/asm/summit/mpparse.h | 109 ++ arch/x86/include/asm/suspend.h | 5 + arch/x86/include/asm/suspend_32.h | 51 + arch/x86/include/asm/suspend_64.h | 52 + arch/x86/include/asm/swiotlb.h | 58 + arch/x86/include/asm/sync_bitops.h | 130 ++ arch/x86/include/asm/syscall.h | 211 +++ arch/x86/include/asm/syscalls.h | 93 ++ arch/x86/include/asm/system.h | 425 ++++++ arch/x86/include/asm/system_64.h | 22 + arch/x86/include/asm/tce.h | 48 + arch/x86/include/asm/termbits.h | 198 +++ arch/x86/include/asm/termios.h | 113 ++ arch/x86/include/asm/therm_throt.h | 9 + arch/x86/include/asm/thread_info.h | 264 ++++ arch/x86/include/asm/time.h | 63 + arch/x86/include/asm/timer.h | 66 + arch/x86/include/asm/timex.h | 19 + arch/x86/include/asm/tlb.h | 11 + arch/x86/include/asm/tlbflush.h | 178 +++ arch/x86/include/asm/topology.h | 258 ++++ arch/x86/include/asm/trampoline.h | 21 + arch/x86/include/asm/traps.h | 81 + arch/x86/include/asm/tsc.h | 62 + arch/x86/include/asm/types.h | 36 + arch/x86/include/asm/uaccess.h | 454 ++++++ arch/x86/include/asm/uaccess_32.h | 218 +++ arch/x86/include/asm/uaccess_64.h | 202 +++ arch/x86/include/asm/ucontext.h | 18 + arch/x86/include/asm/unaligned.h | 14 + arch/x86/include/asm/unistd.h | 13 + arch/x86/include/asm/unistd_32.h | 379 +++++ arch/x86/include/asm/unistd_64.h | 693 +++++++++ arch/x86/include/asm/unwind.h | 13 + arch/x86/include/asm/user.h | 5 + arch/x86/include/asm/user32.h | 70 + arch/x86/include/asm/user_32.h | 131 ++ arch/x86/include/asm/user_64.h | 137 ++ arch/x86/include/asm/uv/bios.h | 94 ++ arch/x86/include/asm/uv/uv_bau.h | 332 +++++ arch/x86/include/asm/uv/uv_hub.h | 354 +++++ arch/x86/include/asm/uv/uv_irq.h | 36 + arch/x86/include/asm/uv/uv_mmrs.h | 1295 ++++++++++++++++ arch/x86/include/asm/vdso.h | 47 + arch/x86/include/asm/vga.h | 20 + arch/x86/include/asm/vgtod.h | 29 + arch/x86/include/asm/vic.h | 61 + arch/x86/include/asm/visws/cobalt.h | 125 ++ arch/x86/include/asm/visws/lithium.h | 53 + arch/x86/include/asm/visws/piix4.h | 107 ++ arch/x86/include/asm/visws/sgivw.h | 5 + arch/x86/include/asm/vm86.h | 208 +++ arch/x86/include/asm/vmi.h | 263 ++++ arch/x86/include/asm/vmi_time.h | 98 ++ arch/x86/include/asm/voyager.h | 528 +++++++ arch/x86/include/asm/vsyscall.h | 44 + arch/x86/include/asm/xcr.h | 49 + arch/x86/include/asm/xen/events.h | 24 + arch/x86/include/asm/xen/grant_table.h | 7 + arch/x86/include/asm/xen/hypercall.h | 527 +++++++ arch/x86/include/asm/xen/hypervisor.h | 82 + arch/x86/include/asm/xen/interface.h | 175 +++ arch/x86/include/asm/xen/interface_32.h | 97 ++ arch/x86/include/asm/xen/interface_64.h | 159 ++ arch/x86/include/asm/xen/page.h | 165 +++ arch/x86/include/asm/xor.h | 5 + arch/x86/include/asm/xor_32.h | 888 +++++++++++ arch/x86/include/asm/xor_64.h | 361 +++++ arch/x86/include/asm/xsave.h | 118 ++ arch/x86/kernel/cpu/Makefile | 2 +- 335 files changed, 38676 insertions(+), 4 deletions(-) create mode 100644 arch/x86/include/asm/Kbuild create mode 100644 arch/x86/include/asm/a.out-core.h create mode 100644 arch/x86/include/asm/a.out.h create mode 100644 arch/x86/include/asm/acpi.h create mode 100644 arch/x86/include/asm/agp.h create mode 100644 arch/x86/include/asm/alternative-asm.h create mode 100644 arch/x86/include/asm/alternative.h create mode 100644 arch/x86/include/asm/amd_iommu.h create mode 100644 arch/x86/include/asm/amd_iommu_types.h create mode 100644 arch/x86/include/asm/apic.h create mode 100644 arch/x86/include/asm/apicdef.h create mode 100644 arch/x86/include/asm/arch_hooks.h create mode 100644 arch/x86/include/asm/asm.h create mode 100644 arch/x86/include/asm/atomic.h create mode 100644 arch/x86/include/asm/atomic_32.h create mode 100644 arch/x86/include/asm/atomic_64.h create mode 100644 arch/x86/include/asm/auxvec.h create mode 100644 arch/x86/include/asm/bigsmp/apic.h create mode 100644 arch/x86/include/asm/bigsmp/apicdef.h create mode 100644 arch/x86/include/asm/bigsmp/ipi.h create mode 100644 arch/x86/include/asm/bios_ebda.h create mode 100644 arch/x86/include/asm/bitops.h create mode 100644 arch/x86/include/asm/boot.h create mode 100644 arch/x86/include/asm/bootparam.h create mode 100644 arch/x86/include/asm/bug.h create mode 100644 arch/x86/include/asm/bugs.h create mode 100644 arch/x86/include/asm/byteorder.h create mode 100644 arch/x86/include/asm/cache.h create mode 100644 arch/x86/include/asm/cacheflush.h create mode 100644 arch/x86/include/asm/calgary.h create mode 100644 arch/x86/include/asm/calling.h create mode 100644 arch/x86/include/asm/checksum.h create mode 100644 arch/x86/include/asm/checksum_32.h create mode 100644 arch/x86/include/asm/checksum_64.h create mode 100644 arch/x86/include/asm/cmpxchg.h create mode 100644 arch/x86/include/asm/cmpxchg_32.h create mode 100644 arch/x86/include/asm/cmpxchg_64.h create mode 100644 arch/x86/include/asm/compat.h create mode 100644 arch/x86/include/asm/cpu.h create mode 100644 arch/x86/include/asm/cpufeature.h create mode 100644 arch/x86/include/asm/cputime.h create mode 100644 arch/x86/include/asm/current.h create mode 100644 arch/x86/include/asm/debugreg.h create mode 100644 arch/x86/include/asm/delay.h create mode 100644 arch/x86/include/asm/desc.h create mode 100644 arch/x86/include/asm/desc_defs.h create mode 100644 arch/x86/include/asm/device.h create mode 100644 arch/x86/include/asm/div64.h create mode 100644 arch/x86/include/asm/dma-mapping.h create mode 100644 arch/x86/include/asm/dma.h create mode 100644 arch/x86/include/asm/dmi.h create mode 100644 arch/x86/include/asm/ds.h create mode 100644 arch/x86/include/asm/dwarf2.h create mode 100644 arch/x86/include/asm/e820.h create mode 100644 arch/x86/include/asm/edac.h create mode 100644 arch/x86/include/asm/efi.h create mode 100644 arch/x86/include/asm/elf.h create mode 100644 arch/x86/include/asm/emergency-restart.h create mode 100644 arch/x86/include/asm/errno.h create mode 100644 arch/x86/include/asm/es7000/apic.h create mode 100644 arch/x86/include/asm/es7000/apicdef.h create mode 100644 arch/x86/include/asm/es7000/ipi.h create mode 100644 arch/x86/include/asm/es7000/mpparse.h create mode 100644 arch/x86/include/asm/es7000/wakecpu.h create mode 100644 arch/x86/include/asm/fb.h create mode 100644 arch/x86/include/asm/fcntl.h create mode 100644 arch/x86/include/asm/fixmap.h create mode 100644 arch/x86/include/asm/fixmap_32.h create mode 100644 arch/x86/include/asm/fixmap_64.h create mode 100644 arch/x86/include/asm/floppy.h create mode 100644 arch/x86/include/asm/frame.h create mode 100644 arch/x86/include/asm/ftrace.h create mode 100644 arch/x86/include/asm/futex.h create mode 100644 arch/x86/include/asm/gart.h create mode 100644 arch/x86/include/asm/genapic.h create mode 100644 arch/x86/include/asm/genapic_32.h create mode 100644 arch/x86/include/asm/genapic_64.h create mode 100644 arch/x86/include/asm/geode.h create mode 100644 arch/x86/include/asm/gpio.h create mode 100644 arch/x86/include/asm/hardirq.h create mode 100644 arch/x86/include/asm/hardirq_32.h create mode 100644 arch/x86/include/asm/hardirq_64.h create mode 100644 arch/x86/include/asm/highmem.h create mode 100644 arch/x86/include/asm/hpet.h create mode 100644 arch/x86/include/asm/hugetlb.h create mode 100644 arch/x86/include/asm/hw_irq.h create mode 100644 arch/x86/include/asm/hypertransport.h create mode 100644 arch/x86/include/asm/i387.h create mode 100644 arch/x86/include/asm/i8253.h create mode 100644 arch/x86/include/asm/i8259.h create mode 100644 arch/x86/include/asm/ia32.h create mode 100644 arch/x86/include/asm/ia32_unistd.h create mode 100644 arch/x86/include/asm/idle.h create mode 100644 arch/x86/include/asm/intel_arch_perfmon.h create mode 100644 arch/x86/include/asm/io.h create mode 100644 arch/x86/include/asm/io_32.h create mode 100644 arch/x86/include/asm/io_64.h create mode 100644 arch/x86/include/asm/io_apic.h create mode 100644 arch/x86/include/asm/ioctl.h create mode 100644 arch/x86/include/asm/ioctls.h create mode 100644 arch/x86/include/asm/iommu.h create mode 100644 arch/x86/include/asm/ipcbuf.h create mode 100644 arch/x86/include/asm/ipi.h create mode 100644 arch/x86/include/asm/irq.h create mode 100644 arch/x86/include/asm/irq_regs.h create mode 100644 arch/x86/include/asm/irq_regs_32.h create mode 100644 arch/x86/include/asm/irq_regs_64.h create mode 100644 arch/x86/include/asm/irq_remapping.h create mode 100644 arch/x86/include/asm/irq_vectors.h create mode 100644 arch/x86/include/asm/irqflags.h create mode 100644 arch/x86/include/asm/ist.h create mode 100644 arch/x86/include/asm/k8.h create mode 100644 arch/x86/include/asm/kdebug.h create mode 100644 arch/x86/include/asm/kexec.h create mode 100644 arch/x86/include/asm/kgdb.h create mode 100644 arch/x86/include/asm/kmap_types.h create mode 100644 arch/x86/include/asm/kprobes.h create mode 100644 arch/x86/include/asm/kvm.h create mode 100644 arch/x86/include/asm/kvm_host.h create mode 100644 arch/x86/include/asm/kvm_para.h create mode 100644 arch/x86/include/asm/kvm_x86_emulate.h create mode 100644 arch/x86/include/asm/ldt.h create mode 100644 arch/x86/include/asm/lguest.h create mode 100644 arch/x86/include/asm/lguest_hcall.h create mode 100644 arch/x86/include/asm/linkage.h create mode 100644 arch/x86/include/asm/local.h create mode 100644 arch/x86/include/asm/mach-default/apm.h create mode 100644 arch/x86/include/asm/mach-default/do_timer.h create mode 100644 arch/x86/include/asm/mach-default/entry_arch.h create mode 100644 arch/x86/include/asm/mach-default/mach_apic.h create mode 100644 arch/x86/include/asm/mach-default/mach_apicdef.h create mode 100644 arch/x86/include/asm/mach-default/mach_ipi.h create mode 100644 arch/x86/include/asm/mach-default/mach_mpparse.h create mode 100644 arch/x86/include/asm/mach-default/mach_mpspec.h create mode 100644 arch/x86/include/asm/mach-default/mach_timer.h create mode 100644 arch/x86/include/asm/mach-default/mach_traps.h create mode 100644 arch/x86/include/asm/mach-default/mach_wakecpu.h create mode 100644 arch/x86/include/asm/mach-default/pci-functions.h create mode 100644 arch/x86/include/asm/mach-default/setup_arch.h create mode 100644 arch/x86/include/asm/mach-default/smpboot_hooks.h create mode 100644 arch/x86/include/asm/mach-generic/gpio.h create mode 100644 arch/x86/include/asm/mach-generic/mach_apic.h create mode 100644 arch/x86/include/asm/mach-generic/mach_apicdef.h create mode 100644 arch/x86/include/asm/mach-generic/mach_ipi.h create mode 100644 arch/x86/include/asm/mach-generic/mach_mpparse.h create mode 100644 arch/x86/include/asm/mach-generic/mach_mpspec.h create mode 100644 arch/x86/include/asm/mach-rdc321x/gpio.h create mode 100644 arch/x86/include/asm/mach-rdc321x/rdc321x_defs.h create mode 100644 arch/x86/include/asm/mach-voyager/do_timer.h create mode 100644 arch/x86/include/asm/mach-voyager/entry_arch.h create mode 100644 arch/x86/include/asm/mach-voyager/setup_arch.h create mode 100644 arch/x86/include/asm/math_emu.h create mode 100644 arch/x86/include/asm/mc146818rtc.h create mode 100644 arch/x86/include/asm/mca.h create mode 100644 arch/x86/include/asm/mca_dma.h create mode 100644 arch/x86/include/asm/mce.h create mode 100644 arch/x86/include/asm/microcode.h create mode 100644 arch/x86/include/asm/mman.h create mode 100644 arch/x86/include/asm/mmconfig.h create mode 100644 arch/x86/include/asm/mmu.h create mode 100644 arch/x86/include/asm/mmu_context.h create mode 100644 arch/x86/include/asm/mmu_context_32.h create mode 100644 arch/x86/include/asm/mmu_context_64.h create mode 100644 arch/x86/include/asm/mmx.h create mode 100644 arch/x86/include/asm/mmzone.h create mode 100644 arch/x86/include/asm/mmzone_32.h create mode 100644 arch/x86/include/asm/mmzone_64.h create mode 100644 arch/x86/include/asm/module.h create mode 100644 arch/x86/include/asm/mpspec.h create mode 100644 arch/x86/include/asm/mpspec_def.h create mode 100644 arch/x86/include/asm/msgbuf.h create mode 100644 arch/x86/include/asm/msidef.h create mode 100644 arch/x86/include/asm/msr-index.h create mode 100644 arch/x86/include/asm/msr.h create mode 100644 arch/x86/include/asm/mtrr.h create mode 100644 arch/x86/include/asm/mutex.h create mode 100644 arch/x86/include/asm/mutex_32.h create mode 100644 arch/x86/include/asm/mutex_64.h create mode 100644 arch/x86/include/asm/nmi.h create mode 100644 arch/x86/include/asm/nops.h create mode 100644 arch/x86/include/asm/numa.h create mode 100644 arch/x86/include/asm/numa_32.h create mode 100644 arch/x86/include/asm/numa_64.h create mode 100644 arch/x86/include/asm/numaq.h create mode 100644 arch/x86/include/asm/numaq/apic.h create mode 100644 arch/x86/include/asm/numaq/apicdef.h create mode 100644 arch/x86/include/asm/numaq/ipi.h create mode 100644 arch/x86/include/asm/numaq/mpparse.h create mode 100644 arch/x86/include/asm/numaq/wakecpu.h create mode 100644 arch/x86/include/asm/olpc.h create mode 100644 arch/x86/include/asm/page.h create mode 100644 arch/x86/include/asm/page_32.h create mode 100644 arch/x86/include/asm/page_64.h create mode 100644 arch/x86/include/asm/param.h create mode 100644 arch/x86/include/asm/paravirt.h create mode 100644 arch/x86/include/asm/parport.h create mode 100644 arch/x86/include/asm/pat.h create mode 100644 arch/x86/include/asm/pci-direct.h create mode 100644 arch/x86/include/asm/pci.h create mode 100644 arch/x86/include/asm/pci_32.h create mode 100644 arch/x86/include/asm/pci_64.h create mode 100644 arch/x86/include/asm/pda.h create mode 100644 arch/x86/include/asm/percpu.h create mode 100644 arch/x86/include/asm/pgalloc.h create mode 100644 arch/x86/include/asm/pgtable-2level-defs.h create mode 100644 arch/x86/include/asm/pgtable-2level.h create mode 100644 arch/x86/include/asm/pgtable-3level-defs.h create mode 100644 arch/x86/include/asm/pgtable-3level.h create mode 100644 arch/x86/include/asm/pgtable.h create mode 100644 arch/x86/include/asm/pgtable_32.h create mode 100644 arch/x86/include/asm/pgtable_64.h create mode 100644 arch/x86/include/asm/poll.h create mode 100644 arch/x86/include/asm/posix_types.h create mode 100644 arch/x86/include/asm/posix_types_32.h create mode 100644 arch/x86/include/asm/posix_types_64.h create mode 100644 arch/x86/include/asm/prctl.h create mode 100644 arch/x86/include/asm/processor-cyrix.h create mode 100644 arch/x86/include/asm/processor-flags.h create mode 100644 arch/x86/include/asm/processor.h create mode 100644 arch/x86/include/asm/proto.h create mode 100644 arch/x86/include/asm/ptrace-abi.h create mode 100644 arch/x86/include/asm/ptrace.h create mode 100644 arch/x86/include/asm/pvclock-abi.h create mode 100644 arch/x86/include/asm/pvclock.h create mode 100644 arch/x86/include/asm/reboot.h create mode 100644 arch/x86/include/asm/reboot_fixups.h create mode 100644 arch/x86/include/asm/required-features.h create mode 100644 arch/x86/include/asm/resource.h create mode 100644 arch/x86/include/asm/resume-trace.h create mode 100644 arch/x86/include/asm/rio.h create mode 100644 arch/x86/include/asm/rtc.h create mode 100644 arch/x86/include/asm/rwlock.h create mode 100644 arch/x86/include/asm/rwsem.h create mode 100644 arch/x86/include/asm/scatterlist.h create mode 100644 arch/x86/include/asm/seccomp.h create mode 100644 arch/x86/include/asm/seccomp_32.h create mode 100644 arch/x86/include/asm/seccomp_64.h create mode 100644 arch/x86/include/asm/sections.h create mode 100644 arch/x86/include/asm/segment.h create mode 100644 arch/x86/include/asm/sembuf.h create mode 100644 arch/x86/include/asm/serial.h create mode 100644 arch/x86/include/asm/setup.h create mode 100644 arch/x86/include/asm/shmbuf.h create mode 100644 arch/x86/include/asm/shmparam.h create mode 100644 arch/x86/include/asm/sigcontext.h create mode 100644 arch/x86/include/asm/sigcontext32.h create mode 100644 arch/x86/include/asm/siginfo.h create mode 100644 arch/x86/include/asm/signal.h create mode 100644 arch/x86/include/asm/smp.h create mode 100644 arch/x86/include/asm/socket.h create mode 100644 arch/x86/include/asm/sockios.h create mode 100644 arch/x86/include/asm/sparsemem.h create mode 100644 arch/x86/include/asm/spinlock.h create mode 100644 arch/x86/include/asm/spinlock_types.h create mode 100644 arch/x86/include/asm/srat.h create mode 100644 arch/x86/include/asm/stacktrace.h create mode 100644 arch/x86/include/asm/stat.h create mode 100644 arch/x86/include/asm/statfs.h create mode 100644 arch/x86/include/asm/string.h create mode 100644 arch/x86/include/asm/string_32.h create mode 100644 arch/x86/include/asm/string_64.h create mode 100644 arch/x86/include/asm/summit/apic.h create mode 100644 arch/x86/include/asm/summit/apicdef.h create mode 100644 arch/x86/include/asm/summit/ipi.h create mode 100644 arch/x86/include/asm/summit/mpparse.h create mode 100644 arch/x86/include/asm/suspend.h create mode 100644 arch/x86/include/asm/suspend_32.h create mode 100644 arch/x86/include/asm/suspend_64.h create mode 100644 arch/x86/include/asm/swiotlb.h create mode 100644 arch/x86/include/asm/sync_bitops.h create mode 100644 arch/x86/include/asm/syscall.h create mode 100644 arch/x86/include/asm/syscalls.h create mode 100644 arch/x86/include/asm/system.h create mode 100644 arch/x86/include/asm/system_64.h create mode 100644 arch/x86/include/asm/tce.h create mode 100644 arch/x86/include/asm/termbits.h create mode 100644 arch/x86/include/asm/termios.h create mode 100644 arch/x86/include/asm/therm_throt.h create mode 100644 arch/x86/include/asm/thread_info.h create mode 100644 arch/x86/include/asm/time.h create mode 100644 arch/x86/include/asm/timer.h create mode 100644 arch/x86/include/asm/timex.h create mode 100644 arch/x86/include/asm/tlb.h create mode 100644 arch/x86/include/asm/tlbflush.h create mode 100644 arch/x86/include/asm/topology.h create mode 100644 arch/x86/include/asm/trampoline.h create mode 100644 arch/x86/include/asm/traps.h create mode 100644 arch/x86/include/asm/tsc.h create mode 100644 arch/x86/include/asm/types.h create mode 100644 arch/x86/include/asm/uaccess.h create mode 100644 arch/x86/include/asm/uaccess_32.h create mode 100644 arch/x86/include/asm/uaccess_64.h create mode 100644 arch/x86/include/asm/ucontext.h create mode 100644 arch/x86/include/asm/unaligned.h create mode 100644 arch/x86/include/asm/unistd.h create mode 100644 arch/x86/include/asm/unistd_32.h create mode 100644 arch/x86/include/asm/unistd_64.h create mode 100644 arch/x86/include/asm/unwind.h create mode 100644 arch/x86/include/asm/user.h create mode 100644 arch/x86/include/asm/user32.h create mode 100644 arch/x86/include/asm/user_32.h create mode 100644 arch/x86/include/asm/user_64.h create mode 100644 arch/x86/include/asm/uv/bios.h create mode 100644 arch/x86/include/asm/uv/uv_bau.h create mode 100644 arch/x86/include/asm/uv/uv_hub.h create mode 100644 arch/x86/include/asm/uv/uv_irq.h create mode 100644 arch/x86/include/asm/uv/uv_mmrs.h create mode 100644 arch/x86/include/asm/vdso.h create mode 100644 arch/x86/include/asm/vga.h create mode 100644 arch/x86/include/asm/vgtod.h create mode 100644 arch/x86/include/asm/vic.h create mode 100644 arch/x86/include/asm/visws/cobalt.h create mode 100644 arch/x86/include/asm/visws/lithium.h create mode 100644 arch/x86/include/asm/visws/piix4.h create mode 100644 arch/x86/include/asm/visws/sgivw.h create mode 100644 arch/x86/include/asm/vm86.h create mode 100644 arch/x86/include/asm/vmi.h create mode 100644 arch/x86/include/asm/vmi_time.h create mode 100644 arch/x86/include/asm/voyager.h create mode 100644 arch/x86/include/asm/vsyscall.h create mode 100644 arch/x86/include/asm/xcr.h create mode 100644 arch/x86/include/asm/xen/events.h create mode 100644 arch/x86/include/asm/xen/grant_table.h create mode 100644 arch/x86/include/asm/xen/hypercall.h create mode 100644 arch/x86/include/asm/xen/hypervisor.h create mode 100644 arch/x86/include/asm/xen/interface.h create mode 100644 arch/x86/include/asm/xen/interface_32.h create mode 100644 arch/x86/include/asm/xen/interface_64.h create mode 100644 arch/x86/include/asm/xen/page.h create mode 100644 arch/x86/include/asm/xor.h create mode 100644 arch/x86/include/asm/xor_32.h create mode 100644 arch/x86/include/asm/xor_64.h create mode 100644 arch/x86/include/asm/xsave.h (limited to 'arch/x86') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f5631da585b6..d1a47adb5aec 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -110,16 +110,16 @@ KBUILD_CFLAGS += $(call cc-option,-mno-sse -mno-mmx -mno-sse2 -mno-3dnow,) mcore-y := arch/x86/mach-default/ # Voyager subarch support -mflags-$(CONFIG_X86_VOYAGER) := -Iinclude/asm-x86/mach-voyager +mflags-$(CONFIG_X86_VOYAGER) := -Iarch/x86/include/asm/mach-voyager mcore-$(CONFIG_X86_VOYAGER) := arch/x86/mach-voyager/ # generic subarchitecture -mflags-$(CONFIG_X86_GENERICARCH):= -Iinclude/asm-x86/mach-generic +mflags-$(CONFIG_X86_GENERICARCH):= -Iarch/x86/include/asm/mach-generic fcore-$(CONFIG_X86_GENERICARCH) += arch/x86/mach-generic/ mcore-$(CONFIG_X86_GENERICARCH) := arch/x86/mach-default/ # default subarch .h files -mflags-y += -Iinclude/asm-x86/mach-default +mflags-y += -Iarch/x86/include/asm/mach-default # 64 bit does not support subarch support - clear sub arch variables fcore-$(CONFIG_X86_64) := diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild new file mode 100644 index 000000000000..4a8e80cdcfa5 --- /dev/null +++ b/arch/x86/include/asm/Kbuild @@ -0,0 +1,24 @@ +include include/asm-generic/Kbuild.asm + +header-y += boot.h +header-y += bootparam.h +header-y += debugreg.h +header-y += ldt.h +header-y += msr-index.h +header-y += prctl.h +header-y += ptrace-abi.h +header-y += sigcontext32.h +header-y += ucontext.h +header-y += processor-flags.h + +unifdef-y += e820.h +unifdef-y += ist.h +unifdef-y += mce.h +unifdef-y += msr.h +unifdef-y += mtrr.h +unifdef-y += posix_types_32.h +unifdef-y += posix_types_64.h +unifdef-y += unistd_32.h +unifdef-y += unistd_64.h +unifdef-y += vm86.h +unifdef-y += vsyscall.h diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h new file mode 100644 index 000000000000..f5705761a37b --- /dev/null +++ b/arch/x86/include/asm/a.out-core.h @@ -0,0 +1,73 @@ +/* a.out coredump register dumper + * + * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public Licence + * as published by the Free Software Foundation; either version + * 2 of the Licence, or (at your option) any later version. + */ + +#ifndef ASM_X86__A_OUT_CORE_H +#define ASM_X86__A_OUT_CORE_H + +#ifdef __KERNEL__ +#ifdef CONFIG_X86_32 + +#include +#include + +/* + * fill in the user structure for an a.out core dump + */ +static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump) +{ + u16 gs; + +/* changed the size calculations - should hopefully work better. lbt */ + dump->magic = CMAGIC; + dump->start_code = 0; + dump->start_stack = regs->sp & ~(PAGE_SIZE - 1); + dump->u_tsize = ((unsigned long)current->mm->end_code) >> PAGE_SHIFT; + dump->u_dsize = ((unsigned long)(current->mm->brk + (PAGE_SIZE - 1))) + >> PAGE_SHIFT; + dump->u_dsize -= dump->u_tsize; + dump->u_ssize = 0; + dump->u_debugreg[0] = current->thread.debugreg0; + dump->u_debugreg[1] = current->thread.debugreg1; + dump->u_debugreg[2] = current->thread.debugreg2; + dump->u_debugreg[3] = current->thread.debugreg3; + dump->u_debugreg[4] = 0; + dump->u_debugreg[5] = 0; + dump->u_debugreg[6] = current->thread.debugreg6; + dump->u_debugreg[7] = current->thread.debugreg7; + + if (dump->start_stack < TASK_SIZE) + dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) + >> PAGE_SHIFT; + + dump->regs.bx = regs->bx; + dump->regs.cx = regs->cx; + dump->regs.dx = regs->dx; + dump->regs.si = regs->si; + dump->regs.di = regs->di; + dump->regs.bp = regs->bp; + dump->regs.ax = regs->ax; + dump->regs.ds = (u16)regs->ds; + dump->regs.es = (u16)regs->es; + dump->regs.fs = (u16)regs->fs; + savesegment(gs, gs); + dump->regs.orig_ax = regs->orig_ax; + dump->regs.ip = regs->ip; + dump->regs.cs = (u16)regs->cs; + dump->regs.flags = regs->flags; + dump->regs.sp = regs->sp; + dump->regs.ss = (u16)regs->ss; + + dump->u_fpvalid = dump_fpu(regs, &dump->i387); +} + +#endif /* CONFIG_X86_32 */ +#endif /* __KERNEL__ */ +#endif /* ASM_X86__A_OUT_CORE_H */ diff --git a/arch/x86/include/asm/a.out.h b/arch/x86/include/asm/a.out.h new file mode 100644 index 000000000000..0948748bc69c --- /dev/null +++ b/arch/x86/include/asm/a.out.h @@ -0,0 +1,20 @@ +#ifndef ASM_X86__A_OUT_H +#define ASM_X86__A_OUT_H + +struct exec +{ + unsigned int a_info; /* Use macros N_MAGIC, etc for access */ + unsigned a_text; /* length of text, in bytes */ + unsigned a_data; /* length of data, in bytes */ + unsigned a_bss; /* length of uninitialized data area for file, in bytes */ + unsigned a_syms; /* length of symbol table data in file, in bytes */ + unsigned a_entry; /* start address */ + unsigned a_trsize; /* length of relocation info for text, in bytes */ + unsigned a_drsize; /* length of relocation info for data, in bytes */ +}; + +#define N_TRSIZE(a) ((a).a_trsize) +#define N_DRSIZE(a) ((a).a_drsize) +#define N_SYMSIZE(a) ((a).a_syms) + +#endif /* ASM_X86__A_OUT_H */ diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h new file mode 100644 index 000000000000..392e17336be1 --- /dev/null +++ b/arch/x86/include/asm/acpi.h @@ -0,0 +1,178 @@ +#ifndef ASM_X86__ACPI_H +#define ASM_X86__ACPI_H + +/* + * Copyright (C) 2001 Paul Diefenbaugh + * Copyright (C) 2001 Patrick Mochel + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + */ +#include + +#include +#include +#include +#include + +#define COMPILER_DEPENDENT_INT64 long long +#define COMPILER_DEPENDENT_UINT64 unsigned long long + +/* + * Calling conventions: + * + * ACPI_SYSTEM_XFACE - Interfaces to host OS (handlers, threads) + * ACPI_EXTERNAL_XFACE - External ACPI interfaces + * ACPI_INTERNAL_XFACE - Internal ACPI interfaces + * ACPI_INTERNAL_VAR_XFACE - Internal variable-parameter list interfaces + */ +#define ACPI_SYSTEM_XFACE +#define ACPI_EXTERNAL_XFACE +#define ACPI_INTERNAL_XFACE +#define ACPI_INTERNAL_VAR_XFACE + +/* Asm macros */ + +#define ACPI_ASM_MACROS +#define BREAKPOINT3 +#define ACPI_DISABLE_IRQS() local_irq_disable() +#define ACPI_ENABLE_IRQS() local_irq_enable() +#define ACPI_FLUSH_CPU_CACHE() wbinvd() + +int __acpi_acquire_global_lock(unsigned int *lock); +int __acpi_release_global_lock(unsigned int *lock); + +#define ACPI_ACQUIRE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_acquire_global_lock(&facs->global_lock)) + +#define ACPI_RELEASE_GLOBAL_LOCK(facs, Acq) \ + ((Acq) = __acpi_release_global_lock(&facs->global_lock)) + +/* + * Math helper asm macros + */ +#define ACPI_DIV_64_BY_32(n_hi, n_lo, d32, q32, r32) \ + asm("divl %2;" \ + : "=a"(q32), "=d"(r32) \ + : "r"(d32), \ + "0"(n_lo), "1"(n_hi)) + + +#define ACPI_SHIFT_RIGHT_64(n_hi, n_lo) \ + asm("shrl $1,%2 ;" \ + "rcrl $1,%3;" \ + : "=r"(n_hi), "=r"(n_lo) \ + : "0"(n_hi), "1"(n_lo)) + +#ifdef CONFIG_ACPI +extern int acpi_lapic; +extern int acpi_ioapic; +extern int acpi_noirq; +extern int acpi_strict; +extern int acpi_disabled; +extern int acpi_ht; +extern int acpi_pci_disabled; +extern int acpi_skip_timer_override; +extern int acpi_use_timer_override; + +extern u8 acpi_sci_flags; +extern int acpi_sci_override_gsi; +void acpi_pic_sci_set_trigger(unsigned int, u16); + +static inline void disable_acpi(void) +{ + acpi_disabled = 1; + acpi_ht = 0; + acpi_pci_disabled = 1; + acpi_noirq = 1; +} + +/* Fixmap pages to reserve for ACPI boot-time tables (see fixmap.h) */ +#define FIX_ACPI_PAGES 4 + +extern int acpi_gsi_to_irq(u32 gsi, unsigned int *irq); + +static inline void acpi_noirq_set(void) { acpi_noirq = 1; } +static inline void acpi_disable_pci(void) +{ + acpi_pci_disabled = 1; + acpi_noirq_set(); +} +extern int acpi_irq_balance_set(char *str); + +/* routines for saving/restoring kernel state */ +extern int acpi_save_state_mem(void); +extern void acpi_restore_state_mem(void); + +extern unsigned long acpi_wakeup_address; + +/* early initialization routine */ +extern void acpi_reserve_bootmem(void); + +/* + * Check if the CPU can handle C2 and deeper + */ +static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) +{ + /* + * Early models (<=5) of AMD Opterons are not supposed to go into + * C2 state. + * + * Steppings 0x0A and later are good + */ + if (boot_cpu_data.x86 == 0x0F && + boot_cpu_data.x86_vendor == X86_VENDOR_AMD && + boot_cpu_data.x86_model <= 0x05 && + boot_cpu_data.x86_mask < 0x0A) + return 1; + else if (boot_cpu_has(X86_FEATURE_AMDC1E)) + return 1; + else + return max_cstate; +} + +#else /* !CONFIG_ACPI */ + +#define acpi_lapic 0 +#define acpi_ioapic 0 +static inline void acpi_noirq_set(void) { } +static inline void acpi_disable_pci(void) { } +static inline void disable_acpi(void) { } + +#endif /* !CONFIG_ACPI */ + +#define ARCH_HAS_POWER_INIT 1 + +struct bootnode; + +#ifdef CONFIG_ACPI_NUMA +extern int acpi_numa; +extern int acpi_scan_nodes(unsigned long start, unsigned long end); +#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) +extern void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes); +#else +static inline void acpi_fake_nodes(const struct bootnode *fake_nodes, + int num_nodes) +{ +} +#endif + +#define acpi_unlazy_tlb(x) leave_mm(x) + +#endif /* ASM_X86__ACPI_H */ diff --git a/arch/x86/include/asm/agp.h b/arch/x86/include/asm/agp.h new file mode 100644 index 000000000000..3617fd4fcdf9 --- /dev/null +++ b/arch/x86/include/asm/agp.h @@ -0,0 +1,35 @@ +#ifndef ASM_X86__AGP_H +#define ASM_X86__AGP_H + +#include +#include + +/* + * Functions to keep the agpgart mappings coherent with the MMU. The + * GART gives the CPU a physical alias of pages in memory. The alias + * region is mapped uncacheable. Make sure there are no conflicting + * mappings with different cachability attributes for the same + * page. This avoids data corruption on some CPUs. + */ + +#define map_page_into_agp(page) set_pages_uc(page, 1) +#define unmap_page_from_agp(page) set_pages_wb(page, 1) + +/* + * Could use CLFLUSH here if the cpu supports it. But then it would + * need to be called for each cacheline of the whole page so it may + * not be worth it. Would need a page for it. + */ +#define flush_agp_cache() wbinvd() + +/* Convert a physical address to an address suitable for the GART. */ +#define phys_to_gart(x) (x) +#define gart_to_phys(x) (x) + +/* GATT allocation. Returns/accepts GATT kernel virtual address. */ +#define alloc_gatt_pages(order) \ + ((char *)__get_free_pages(GFP_KERNEL, (order))) +#define free_gatt_pages(table, order) \ + free_pages((unsigned long)(table), (order)) + +#endif /* ASM_X86__AGP_H */ diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h new file mode 100644 index 000000000000..e2077d343c33 --- /dev/null +++ b/arch/x86/include/asm/alternative-asm.h @@ -0,0 +1,22 @@ +#ifdef __ASSEMBLY__ + +#ifdef CONFIG_X86_32 +# define X86_ALIGN .long +#else +# define X86_ALIGN .quad +#endif + +#ifdef CONFIG_SMP + .macro LOCK_PREFIX +1: lock + .section .smp_locks,"a" + .align 4 + X86_ALIGN 1b + .previous + .endm +#else + .macro LOCK_PREFIX + .endm +#endif + +#endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h new file mode 100644 index 000000000000..22d3c9862bf3 --- /dev/null +++ b/arch/x86/include/asm/alternative.h @@ -0,0 +1,183 @@ +#ifndef ASM_X86__ALTERNATIVE_H +#define ASM_X86__ALTERNATIVE_H + +#include +#include +#include + +/* + * Alternative inline assembly for SMP. + * + * The LOCK_PREFIX macro defined here replaces the LOCK and + * LOCK_PREFIX macros used everywhere in the source tree. + * + * SMP alternatives use the same data structures as the other + * alternatives and the X86_FEATURE_UP flag to indicate the case of a + * UP system running a SMP kernel. The existing apply_alternatives() + * works fine for patching a SMP kernel for UP. + * + * The SMP alternative tables can be kept after boot and contain both + * UP and SMP versions of the instructions to allow switching back to + * SMP at runtime, when hotplugging in a new CPU, which is especially + * useful in virtualized environments. + * + * The very common lock prefix is handled as special case in a + * separate table which is a pure address list without replacement ptr + * and size information. That keeps the table sizes small. + */ + +#ifdef CONFIG_SMP +#define LOCK_PREFIX \ + ".section .smp_locks,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661f\n" /* address */ \ + ".previous\n" \ + "661:\n\tlock; " + +#else /* ! CONFIG_SMP */ +#define LOCK_PREFIX "" +#endif + +/* This must be included *after* the definition of LOCK_PREFIX */ +#include + +struct alt_instr { + u8 *instr; /* original instruction */ + u8 *replacement; + u8 cpuid; /* cpuid bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction, <= instrlen */ + u8 pad1; +#ifdef CONFIG_X86_64 + u32 pad2; +#endif +}; + +extern void alternative_instructions(void); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + +struct module; + +#ifdef CONFIG_SMP +extern void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end); +extern void alternatives_smp_module_del(struct module *mod); +extern void alternatives_smp_switch(int smp); +#else +static inline void alternatives_smp_module_add(struct module *mod, char *name, + void *locks, void *locks_end, + void *text, void *text_end) {} +static inline void alternatives_smp_module_del(struct module *mod) {} +static inline void alternatives_smp_switch(int smp) {} +#endif /* CONFIG_SMP */ + +const unsigned char *const *find_nop_table(void); + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * length of oldinstr must be longer or equal the length of newinstr + * It can be padded with nops as needed. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, newinstr, feature) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature) : "memory") + +/* + * Alternative inline assembly with input. + * + * Pecularities: + * No memory clobber here. + * Argument numbers start with 1. + * Best is to use constraints that are fixed size (like (%1) ... "r") + * If you use variable sized constraints like "m" or "g" in the + * replacement make sure to pad to the worst case length. + */ +#define alternative_input(oldinstr, newinstr, feature, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c0\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" :: "i" (feature), ##input) + +/* Like alternative_input, but with a single output argument */ +#define alternative_io(oldinstr, newinstr, feature, output, input...) \ + asm volatile ("661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte %c[feat]\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement,\"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" : output : [feat] "i" (feature), ##input) + +/* + * use this macro(s) if you need more than one output parameter + * in alternative_io + */ +#define ASM_OUTPUT2(a, b) a, b + +struct paravirt_patch_site; +#ifdef CONFIG_PARAVIRT +void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end); +#else +static inline void apply_paravirt(struct paravirt_patch_site *start, + struct paravirt_patch_site *end) +{} +#define __parainstructions NULL +#define __parainstructions_end NULL +#endif + +extern void add_nops(void *insns, unsigned int len); + +/* + * Clear and restore the kernel write-protection flag on the local CPU. + * Allows the kernel to edit read-only pages. + * Side-effect: any interrupt handler running between save and restore will have + * the ability to write to read-only pages. + * + * Warning: + * Code patching in the UP case is safe if NMIs and MCE handlers are stopped and + * no thread can be preempted in the instructions being modified (no iret to an + * invalid instruction possible) or if the instructions are changed from a + * consistent state to another consistent state atomically. + * More care must be taken when modifying code in the SMP case because of + * Intel's errata. + * On the local CPU you need to be protected again NMI or MCE handlers seeing an + * inconsistent instruction while you patch. + * The _early version expects the memory to already be RW. + */ + +extern void *text_poke(void *addr, const void *opcode, size_t len); +extern void *text_poke_early(void *addr, const void *opcode, size_t len); + +#endif /* ASM_X86__ALTERNATIVE_H */ diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h new file mode 100644 index 000000000000..041d0db7da27 --- /dev/null +++ b/arch/x86/include/asm/amd_iommu.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef ASM_X86__AMD_IOMMU_H +#define ASM_X86__AMD_IOMMU_H + +#include + +#ifdef CONFIG_AMD_IOMMU +extern int amd_iommu_init(void); +extern int amd_iommu_init_dma_ops(void); +extern void amd_iommu_detect(void); +extern irqreturn_t amd_iommu_int_handler(int irq, void *data); +#else +static inline int amd_iommu_init(void) { return -ENODEV; } +static inline void amd_iommu_detect(void) { } +#endif + +#endif /* ASM_X86__AMD_IOMMU_H */ diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h new file mode 100644 index 000000000000..b3085869a17b --- /dev/null +++ b/arch/x86/include/asm/amd_iommu_types.h @@ -0,0 +1,404 @@ +/* + * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. + * Author: Joerg Roedel + * Leo Duran + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#ifndef ASM_X86__AMD_IOMMU_TYPES_H +#define ASM_X86__AMD_IOMMU_TYPES_H + +#include +#include +#include + +/* + * some size calculation constants + */ +#define DEV_TABLE_ENTRY_SIZE 32 +#define ALIAS_TABLE_ENTRY_SIZE 2 +#define RLOOKUP_TABLE_ENTRY_SIZE (sizeof(void *)) + +/* Length of the MMIO region for the AMD IOMMU */ +#define MMIO_REGION_LENGTH 0x4000 + +/* Capability offsets used by the driver */ +#define MMIO_CAP_HDR_OFFSET 0x00 +#define MMIO_RANGE_OFFSET 0x0c +#define MMIO_MISC_OFFSET 0x10 + +/* Masks, shifts and macros to parse the device range capability */ +#define MMIO_RANGE_LD_MASK 0xff000000 +#define MMIO_RANGE_FD_MASK 0x00ff0000 +#define MMIO_RANGE_BUS_MASK 0x0000ff00 +#define MMIO_RANGE_LD_SHIFT 24 +#define MMIO_RANGE_FD_SHIFT 16 +#define MMIO_RANGE_BUS_SHIFT 8 +#define MMIO_GET_LD(x) (((x) & MMIO_RANGE_LD_MASK) >> MMIO_RANGE_LD_SHIFT) +#define MMIO_GET_FD(x) (((x) & MMIO_RANGE_FD_MASK) >> MMIO_RANGE_FD_SHIFT) +#define MMIO_GET_BUS(x) (((x) & MMIO_RANGE_BUS_MASK) >> MMIO_RANGE_BUS_SHIFT) +#define MMIO_MSI_NUM(x) ((x) & 0x1f) + +/* Flag masks for the AMD IOMMU exclusion range */ +#define MMIO_EXCL_ENABLE_MASK 0x01ULL +#define MMIO_EXCL_ALLOW_MASK 0x02ULL + +/* Used offsets into the MMIO space */ +#define MMIO_DEV_TABLE_OFFSET 0x0000 +#define MMIO_CMD_BUF_OFFSET 0x0008 +#define MMIO_EVT_BUF_OFFSET 0x0010 +#define MMIO_CONTROL_OFFSET 0x0018 +#define MMIO_EXCL_BASE_OFFSET 0x0020 +#define MMIO_EXCL_LIMIT_OFFSET 0x0028 +#define MMIO_CMD_HEAD_OFFSET 0x2000 +#define MMIO_CMD_TAIL_OFFSET 0x2008 +#define MMIO_EVT_HEAD_OFFSET 0x2010 +#define MMIO_EVT_TAIL_OFFSET 0x2018 +#define MMIO_STATUS_OFFSET 0x2020 + +/* MMIO status bits */ +#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 + +/* event logging constants */ +#define EVENT_ENTRY_SIZE 0x10 +#define EVENT_TYPE_SHIFT 28 +#define EVENT_TYPE_MASK 0xf +#define EVENT_TYPE_ILL_DEV 0x1 +#define EVENT_TYPE_IO_FAULT 0x2 +#define EVENT_TYPE_DEV_TAB_ERR 0x3 +#define EVENT_TYPE_PAGE_TAB_ERR 0x4 +#define EVENT_TYPE_ILL_CMD 0x5 +#define EVENT_TYPE_CMD_HARD_ERR 0x6 +#define EVENT_TYPE_IOTLB_INV_TO 0x7 +#define EVENT_TYPE_INV_DEV_REQ 0x8 +#define EVENT_DEVID_MASK 0xffff +#define EVENT_DEVID_SHIFT 0 +#define EVENT_DOMID_MASK 0xffff +#define EVENT_DOMID_SHIFT 0 +#define EVENT_FLAGS_MASK 0xfff +#define EVENT_FLAGS_SHIFT 0x10 + +/* feature control bits */ +#define CONTROL_IOMMU_EN 0x00ULL +#define CONTROL_HT_TUN_EN 0x01ULL +#define CONTROL_EVT_LOG_EN 0x02ULL +#define CONTROL_EVT_INT_EN 0x03ULL +#define CONTROL_COMWAIT_EN 0x04ULL +#define CONTROL_PASSPW_EN 0x08ULL +#define CONTROL_RESPASSPW_EN 0x09ULL +#define CONTROL_COHERENT_EN 0x0aULL +#define CONTROL_ISOC_EN 0x0bULL +#define CONTROL_CMDBUF_EN 0x0cULL +#define CONTROL_PPFLOG_EN 0x0dULL +#define CONTROL_PPFINT_EN 0x0eULL + +/* command specific defines */ +#define CMD_COMPL_WAIT 0x01 +#define CMD_INV_DEV_ENTRY 0x02 +#define CMD_INV_IOMMU_PAGES 0x03 + +#define CMD_COMPL_WAIT_STORE_MASK 0x01 +#define CMD_COMPL_WAIT_INT_MASK 0x02 +#define CMD_INV_IOMMU_PAGES_SIZE_MASK 0x01 +#define CMD_INV_IOMMU_PAGES_PDE_MASK 0x02 + +#define CMD_INV_IOMMU_ALL_PAGES_ADDRESS 0x7fffffffffffffffULL + +/* macros and definitions for device table entries */ +#define DEV_ENTRY_VALID 0x00 +#define DEV_ENTRY_TRANSLATION 0x01 +#define DEV_ENTRY_IR 0x3d +#define DEV_ENTRY_IW 0x3e +#define DEV_ENTRY_NO_PAGE_FAULT 0x62 +#define DEV_ENTRY_EX 0x67 +#define DEV_ENTRY_SYSMGT1 0x68 +#define DEV_ENTRY_SYSMGT2 0x69 +#define DEV_ENTRY_INIT_PASS 0xb8 +#define DEV_ENTRY_EINT_PASS 0xb9 +#define DEV_ENTRY_NMI_PASS 0xba +#define DEV_ENTRY_LINT0_PASS 0xbe +#define DEV_ENTRY_LINT1_PASS 0xbf +#define DEV_ENTRY_MODE_MASK 0x07 +#define DEV_ENTRY_MODE_SHIFT 0x09 + +/* constants to configure the command buffer */ +#define CMD_BUFFER_SIZE 8192 +#define CMD_BUFFER_ENTRIES 512 +#define MMIO_CMD_SIZE_SHIFT 56 +#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) + +/* constants for event buffer handling */ +#define EVT_BUFFER_SIZE 8192 /* 512 entries */ +#define EVT_LEN_MASK (0x9ULL << 56) + +#define PAGE_MODE_1_LEVEL 0x01 +#define PAGE_MODE_2_LEVEL 0x02 +#define PAGE_MODE_3_LEVEL 0x03 + +#define IOMMU_PDE_NL_0 0x000ULL +#define IOMMU_PDE_NL_1 0x200ULL +#define IOMMU_PDE_NL_2 0x400ULL +#define IOMMU_PDE_NL_3 0x600ULL + +#define IOMMU_PTE_L2_INDEX(address) (((address) >> 30) & 0x1ffULL) +#define IOMMU_PTE_L1_INDEX(address) (((address) >> 21) & 0x1ffULL) +#define IOMMU_PTE_L0_INDEX(address) (((address) >> 12) & 0x1ffULL) + +#define IOMMU_MAP_SIZE_L1 (1ULL << 21) +#define IOMMU_MAP_SIZE_L2 (1ULL << 30) +#define IOMMU_MAP_SIZE_L3 (1ULL << 39) + +#define IOMMU_PTE_P (1ULL << 0) +#define IOMMU_PTE_TV (1ULL << 1) +#define IOMMU_PTE_U (1ULL << 59) +#define IOMMU_PTE_FC (1ULL << 60) +#define IOMMU_PTE_IR (1ULL << 61) +#define IOMMU_PTE_IW (1ULL << 62) + +#define IOMMU_L1_PDE(address) \ + ((address) | IOMMU_PDE_NL_1 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) +#define IOMMU_L2_PDE(address) \ + ((address) | IOMMU_PDE_NL_2 | IOMMU_PTE_P | IOMMU_PTE_IR | IOMMU_PTE_IW) + +#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) +#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) +#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) +#define IOMMU_PTE_MODE(pte) (((pte) >> 9) & 0x07) + +#define IOMMU_PROT_MASK 0x03 +#define IOMMU_PROT_IR 0x01 +#define IOMMU_PROT_IW 0x02 + +/* IOMMU capabilities */ +#define IOMMU_CAP_IOTLB 24 +#define IOMMU_CAP_NPCACHE 26 + +#define MAX_DOMAIN_ID 65536 + +/* FIXME: move this macro to */ +#define PCI_BUS(x) (((x) >> 8) & 0xff) + +/* + * This structure contains generic data for IOMMU protection domains + * independent of their use. + */ +struct protection_domain { + spinlock_t lock; /* mostly used to lock the page table*/ + u16 id; /* the domain id written to the device table */ + int mode; /* paging mode (0-6 levels) */ + u64 *pt_root; /* page table root pointer */ + void *priv; /* private data */ +}; + +/* + * Data container for a dma_ops specific protection domain + */ +struct dma_ops_domain { + struct list_head list; + + /* generic protection domain information */ + struct protection_domain domain; + + /* size of the aperture for the mappings */ + unsigned long aperture_size; + + /* address we start to search for free addresses */ + unsigned long next_bit; + + /* address allocation bitmap */ + unsigned long *bitmap; + + /* + * Array of PTE pages for the aperture. In this array we save all the + * leaf pages of the domain page table used for the aperture. This way + * we don't need to walk the page table to find a specific PTE. We can + * just calculate its address in constant time. + */ + u64 **pte_pages; + + /* This will be set to true when TLB needs to be flushed */ + bool need_flush; + + /* + * if this is a preallocated domain, keep the device for which it was + * preallocated in this variable + */ + u16 target_dev; +}; + +/* + * Structure where we save information about one hardware AMD IOMMU in the + * system. + */ +struct amd_iommu { + struct list_head list; + + /* locks the accesses to the hardware */ + spinlock_t lock; + + /* Pointer to PCI device of this IOMMU */ + struct pci_dev *dev; + + /* + * Capability pointer. There could be more than one IOMMU per PCI + * device function if there are more than one AMD IOMMU capability + * pointers. + */ + u16 cap_ptr; + + /* physical address of MMIO space */ + u64 mmio_phys; + /* virtual address of MMIO space */ + u8 *mmio_base; + + /* capabilities of that IOMMU read from ACPI */ + u32 cap; + + /* pci domain of this IOMMU */ + u16 pci_seg; + + /* first device this IOMMU handles. read from PCI */ + u16 first_device; + /* last device this IOMMU handles. read from PCI */ + u16 last_device; + + /* start of exclusion range of that IOMMU */ + u64 exclusion_start; + /* length of exclusion range of that IOMMU */ + u64 exclusion_length; + + /* command buffer virtual address */ + u8 *cmd_buf; + /* size of command buffer */ + u32 cmd_buf_size; + + /* event buffer virtual address */ + u8 *evt_buf; + /* size of event buffer */ + u32 evt_buf_size; + /* MSI number for event interrupt */ + u16 evt_msi_num; + + /* if one, we need to send a completion wait command */ + int need_sync; + + /* true if interrupts for this IOMMU are already enabled */ + bool int_enabled; + + /* default dma_ops domain for that IOMMU */ + struct dma_ops_domain *default_dom; +}; + +/* + * List with all IOMMUs in the system. This list is not locked because it is + * only written and read at driver initialization or suspend time + */ +extern struct list_head amd_iommu_list; + +/* + * Structure defining one entry in the device table + */ +struct dev_table_entry { + u32 data[8]; +}; + +/* + * One entry for unity mappings parsed out of the ACPI table. + */ +struct unity_map_entry { + struct list_head list; + + /* starting device id this entry is used for (including) */ + u16 devid_start; + /* end device id this entry is used for (including) */ + u16 devid_end; + + /* start address to unity map (including) */ + u64 address_start; + /* end address to unity map (including) */ + u64 address_end; + + /* required protection */ + int prot; +}; + +/* + * List of all unity mappings. It is not locked because as runtime it is only + * read. It is created at ACPI table parsing time. + */ +extern struct list_head amd_iommu_unity_map; + +/* + * Data structures for device handling + */ + +/* + * Device table used by hardware. Read and write accesses by software are + * locked with the amd_iommu_pd_table lock. + */ +extern struct dev_table_entry *amd_iommu_dev_table; + +/* + * Alias table to find requestor ids to device ids. Not locked because only + * read on runtime. + */ +extern u16 *amd_iommu_alias_table; + +/* + * Reverse lookup table to find the IOMMU which translates a specific device. + */ +extern struct amd_iommu **amd_iommu_rlookup_table; + +/* size of the dma_ops aperture as power of 2 */ +extern unsigned amd_iommu_aperture_order; + +/* largest PCI device id we expect translation requests for */ +extern u16 amd_iommu_last_bdf; + +/* data structures for protection domain handling */ +extern struct protection_domain **amd_iommu_pd_table; + +/* allocation bitmap for domain ids */ +extern unsigned long *amd_iommu_pd_alloc_bitmap; + +/* will be 1 if device isolation is enabled */ +extern int amd_iommu_isolate; + +/* + * If true, the addresses will be flushed on unmap time, not when + * they are reused + */ +extern bool amd_iommu_unmap_flush; + +/* takes a PCI device id and prints it out in a readable form */ +static inline void print_devid(u16 devid, int nl) +{ + int bus = devid >> 8; + int dev = devid >> 3 & 0x1f; + int fn = devid & 0x07; + + printk("%02x:%02x.%x", bus, dev, fn); + if (nl) + printk("\n"); +} + +/* takes bus and device/function and returns the device id + * FIXME: should that be in generic PCI code? */ +static inline u16 calc_devid(u8 bus, u8 devfn) +{ + return (((u16)bus) << 8) | devfn; +} + +#endif /* ASM_X86__AMD_IOMMU_TYPES_H */ diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h new file mode 100644 index 000000000000..ef1d72dbdfe0 --- /dev/null +++ b/arch/x86/include/asm/apic.h @@ -0,0 +1,199 @@ +#ifndef ASM_X86__APIC_H +#define ASM_X86__APIC_H + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define ARCH_APICTIMER_STOPS_ON_C3 1 + +/* + * Debugging macros + */ +#define APIC_QUIET 0 +#define APIC_VERBOSE 1 +#define APIC_DEBUG 2 + +/* + * Define the default level of output to be very little + * This can be turned up by using apic=verbose for more + * information and apic=debug for _lots_ of information. + * apic_verbosity is defined in apic.c + */ +#define apic_printk(v, s, a...) do { \ + if ((v) <= apic_verbosity) \ + printk(s, ##a); \ + } while (0) + + +extern void generic_apic_probe(void); + +#ifdef CONFIG_X86_LOCAL_APIC + +extern unsigned int apic_verbosity; +extern int local_apic_timer_c2_ok; + +extern int disable_apic; +/* + * Basic functions accessing APICs. + */ +#ifdef CONFIG_PARAVIRT +#include +#else +#define setup_boot_clock setup_boot_APIC_clock +#define setup_secondary_clock setup_secondary_APIC_clock +#endif + +extern int is_vsmp_box(void); +extern void xapic_wait_icr_idle(void); +extern u32 safe_xapic_wait_icr_idle(void); +extern u64 xapic_icr_read(void); +extern void xapic_icr_write(u32, u32); +extern int setup_profiling_timer(unsigned int); + +static inline void native_apic_mem_write(u32 reg, u32 v) +{ + volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); + + alternative_io("movl %0, %1", "xchgl %0, %1", X86_FEATURE_11AP, + ASM_OUTPUT2("=r" (v), "=m" (*addr)), + ASM_OUTPUT2("0" (v), "m" (*addr))); +} + +static inline u32 native_apic_mem_read(u32 reg) +{ + return *((volatile u32 *)(APIC_BASE + reg)); +} + +static inline void native_apic_msr_write(u32 reg, u32 v) +{ + if (reg == APIC_DFR || reg == APIC_ID || reg == APIC_LDR || + reg == APIC_LVR) + return; + + wrmsr(APIC_BASE_MSR + (reg >> 4), v, 0); +} + +static inline u32 native_apic_msr_read(u32 reg) +{ + u32 low, high; + + if (reg == APIC_DFR) + return -1; + + rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); + return low; +} + +#ifndef CONFIG_X86_32 +extern int x2apic, x2apic_preenabled; +extern void check_x2apic(void); +extern void enable_x2apic(void); +extern void enable_IR_x2apic(void); +extern void x2apic_icr_write(u32 low, u32 id); +static inline int x2apic_enabled(void) +{ + int msr, msr2; + + if (!cpu_has_x2apic) + return 0; + + rdmsr(MSR_IA32_APICBASE, msr, msr2); + if (msr & X2APIC_ENABLE) + return 1; + return 0; +} +#else +#define x2apic_enabled() 0 +#endif + +struct apic_ops { + u32 (*read)(u32 reg); + void (*write)(u32 reg, u32 v); + u64 (*icr_read)(void); + void (*icr_write)(u32 low, u32 high); + void (*wait_icr_idle)(void); + u32 (*safe_wait_icr_idle)(void); +}; + +extern struct apic_ops *apic_ops; + +#define apic_read (apic_ops->read) +#define apic_write (apic_ops->write) +#define apic_icr_read (apic_ops->icr_read) +#define apic_icr_write (apic_ops->icr_write) +#define apic_wait_icr_idle (apic_ops->wait_icr_idle) +#define safe_apic_wait_icr_idle (apic_ops->safe_wait_icr_idle) + +extern int get_physical_broadcast(void); + +#ifdef CONFIG_X86_64 +static inline void ack_x2APIC_irq(void) +{ + /* Docs say use 0 for future compatibility */ + native_apic_msr_write(APIC_EOI, 0); +} +#endif + + +static inline void ack_APIC_irq(void) +{ + /* + * ack_APIC_irq() actually gets compiled as a single instruction + * ... yummie. + */ + + /* Docs say use 0 for future compatibility */ + apic_write(APIC_EOI, 0); +} + +extern int lapic_get_maxlvt(void); +extern void clear_local_APIC(void); +extern void connect_bsp_APIC(void); +extern void disconnect_bsp_APIC(int virt_wire_setup); +extern void disable_local_APIC(void); +extern void lapic_shutdown(void); +extern int verify_local_APIC(void); +extern void cache_APIC_registers(void); +extern void sync_Arb_IDs(void); +extern void init_bsp_APIC(void); +extern void setup_local_APIC(void); +extern void end_local_APIC_setup(void); +extern void init_apic_mappings(void); +extern void setup_boot_APIC_clock(void); +extern void setup_secondary_APIC_clock(void); +extern int APIC_init_uniprocessor(void); +extern void enable_NMI_through_LVT0(void); + +/* + * On 32bit this is mach-xxx local + */ +#ifdef CONFIG_X86_64 +extern void early_init_lapic_mapping(void); +extern int apic_is_clustered_box(void); +#else +static inline int apic_is_clustered_box(void) +{ + return 0; +} +#endif + +extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); +extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask); + + +#else /* !CONFIG_X86_LOCAL_APIC */ +static inline void lapic_shutdown(void) { } +#define local_apic_timer_c2_ok 1 +static inline void init_apic_mappings(void) { } + +#endif /* !CONFIG_X86_LOCAL_APIC */ + +#endif /* ASM_X86__APIC_H */ diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h new file mode 100644 index 000000000000..b922c85ac91d --- /dev/null +++ b/arch/x86/include/asm/apicdef.h @@ -0,0 +1,417 @@ +#ifndef ASM_X86__APICDEF_H +#define ASM_X86__APICDEF_H + +/* + * Constants for various Intel APICs. (local APIC, IOAPIC, etc.) + * + * Alan Cox , 1995. + * Ingo Molnar , 1999, 2000 + */ + +#define APIC_DEFAULT_PHYS_BASE 0xfee00000 + +#define APIC_ID 0x20 + +#define APIC_LVR 0x30 +#define APIC_LVR_MASK 0xFF00FF +#define GET_APIC_VERSION(x) ((x) & 0xFFu) +#define GET_APIC_MAXLVT(x) (((x) >> 16) & 0xFFu) +#ifdef CONFIG_X86_32 +# define APIC_INTEGRATED(x) ((x) & 0xF0u) +#else +# define APIC_INTEGRATED(x) (1) +#endif +#define APIC_XAPIC(x) ((x) >= 0x14) +#define APIC_TASKPRI 0x80 +#define APIC_TPRI_MASK 0xFFu +#define APIC_ARBPRI 0x90 +#define APIC_ARBPRI_MASK 0xFFu +#define APIC_PROCPRI 0xA0 +#define APIC_EOI 0xB0 +#define APIC_EIO_ACK 0x0 +#define APIC_RRR 0xC0 +#define APIC_LDR 0xD0 +#define APIC_LDR_MASK (0xFFu << 24) +#define GET_APIC_LOGICAL_ID(x) (((x) >> 24) & 0xFFu) +#define SET_APIC_LOGICAL_ID(x) (((x) << 24)) +#define APIC_ALL_CPUS 0xFFu +#define APIC_DFR 0xE0 +#define APIC_DFR_CLUSTER 0x0FFFFFFFul +#define APIC_DFR_FLAT 0xFFFFFFFFul +#define APIC_SPIV 0xF0 +#define APIC_SPIV_FOCUS_DISABLED (1 << 9) +#define APIC_SPIV_APIC_ENABLED (1 << 8) +#define APIC_ISR 0x100 +#define APIC_ISR_NR 0x8 /* Number of 32 bit ISR registers. */ +#define APIC_TMR 0x180 +#define APIC_IRR 0x200 +#define APIC_ESR 0x280 +#define APIC_ESR_SEND_CS 0x00001 +#define APIC_ESR_RECV_CS 0x00002 +#define APIC_ESR_SEND_ACC 0x00004 +#define APIC_ESR_RECV_ACC 0x00008 +#define APIC_ESR_SENDILL 0x00020 +#define APIC_ESR_RECVILL 0x00040 +#define APIC_ESR_ILLREGA 0x00080 +#define APIC_ICR 0x300 +#define APIC_DEST_SELF 0x40000 +#define APIC_DEST_ALLINC 0x80000 +#define APIC_DEST_ALLBUT 0xC0000 +#define APIC_ICR_RR_MASK 0x30000 +#define APIC_ICR_RR_INVALID 0x00000 +#define APIC_ICR_RR_INPROG 0x10000 +#define APIC_ICR_RR_VALID 0x20000 +#define APIC_INT_LEVELTRIG 0x08000 +#define APIC_INT_ASSERT 0x04000 +#define APIC_ICR_BUSY 0x01000 +#define APIC_DEST_LOGICAL 0x00800 +#define APIC_DEST_PHYSICAL 0x00000 +#define APIC_DM_FIXED 0x00000 +#define APIC_DM_LOWEST 0x00100 +#define APIC_DM_SMI 0x00200 +#define APIC_DM_REMRD 0x00300 +#define APIC_DM_NMI 0x00400 +#define APIC_DM_INIT 0x00500 +#define APIC_DM_STARTUP 0x00600 +#define APIC_DM_EXTINT 0x00700 +#define APIC_VECTOR_MASK 0x000FF +#define APIC_ICR2 0x310 +#define GET_APIC_DEST_FIELD(x) (((x) >> 24) & 0xFF) +#define SET_APIC_DEST_FIELD(x) ((x) << 24) +#define APIC_LVTT 0x320 +#define APIC_LVTTHMR 0x330 +#define APIC_LVTPC 0x340 +#define APIC_LVT0 0x350 +#define APIC_LVT_TIMER_BASE_MASK (0x3 << 18) +#define GET_APIC_TIMER_BASE(x) (((x) >> 18) & 0x3) +#define SET_APIC_TIMER_BASE(x) (((x) << 18)) +#define APIC_TIMER_BASE_CLKIN 0x0 +#define APIC_TIMER_BASE_TMBASE 0x1 +#define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_MASKED (1 << 16) +#define APIC_LVT_LEVEL_TRIGGER (1 << 15) +#define APIC_LVT_REMOTE_IRR (1 << 14) +#define APIC_INPUT_POLARITY (1 << 13) +#define APIC_SEND_PENDING (1 << 12) +#define APIC_MODE_MASK 0x700 +#define GET_APIC_DELIVERY_MODE(x) (((x) >> 8) & 0x7) +#define SET_APIC_DELIVERY_MODE(x, y) (((x) & ~0x700) | ((y) << 8)) +#define APIC_MODE_FIXED 0x0 +#define APIC_MODE_NMI 0x4 +#define APIC_MODE_EXTINT 0x7 +#define APIC_LVT1 0x360 +#define APIC_LVTERR 0x370 +#define APIC_TMICT 0x380 +#define APIC_TMCCT 0x390 +#define APIC_TDCR 0x3E0 +#define APIC_SELF_IPI 0x3F0 +#define APIC_TDR_DIV_TMBASE (1 << 2) +#define APIC_TDR_DIV_1 0xB +#define APIC_TDR_DIV_2 0x0 +#define APIC_TDR_DIV_4 0x1 +#define APIC_TDR_DIV_8 0x2 +#define APIC_TDR_DIV_16 0x3 +#define APIC_TDR_DIV_32 0x8 +#define APIC_TDR_DIV_64 0x9 +#define APIC_TDR_DIV_128 0xA +#define APIC_EILVT0 0x500 +#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ +#define APIC_EILVT_NR_AMD_10H 4 +#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) +#define APIC_EILVT_MSG_FIX 0x0 +#define APIC_EILVT_MSG_SMI 0x2 +#define APIC_EILVT_MSG_NMI 0x4 +#define APIC_EILVT_MSG_EXT 0x7 +#define APIC_EILVT_MASKED (1 << 16) +#define APIC_EILVT1 0x510 +#define APIC_EILVT2 0x520 +#define APIC_EILVT3 0x530 + +#define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) +#define APIC_BASE_MSR 0x800 +#define X2APIC_ENABLE (1UL << 10) + +#ifdef CONFIG_X86_32 +# define MAX_IO_APICS 64 +#else +# define MAX_IO_APICS 128 +# define MAX_LOCAL_APIC 32768 +#endif + +/* + * All x86-64 systems are xAPIC compatible. + * In the following, "apicid" is a physical APIC ID. + */ +#define XAPIC_DEST_CPUS_SHIFT 4 +#define XAPIC_DEST_CPUS_MASK ((1u << XAPIC_DEST_CPUS_SHIFT) - 1) +#define XAPIC_DEST_CLUSTER_MASK (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT) +#define APIC_CLUSTER(apicid) ((apicid) & XAPIC_DEST_CLUSTER_MASK) +#define APIC_CLUSTERID(apicid) (APIC_CLUSTER(apicid) >> XAPIC_DEST_CPUS_SHIFT) +#define APIC_CPUID(apicid) ((apicid) & XAPIC_DEST_CPUS_MASK) +#define NUM_APIC_CLUSTERS ((BAD_APICID + 1) >> XAPIC_DEST_CPUS_SHIFT) + +/* + * the local APIC register structure, memory mapped. Not terribly well + * tested, but we might eventually use this one in the future - the + * problem why we cannot use it right now is the P5 APIC, it has an + * errata which cannot take 8-bit reads and writes, only 32-bit ones ... + */ +#define u32 unsigned int + +struct local_apic { + +/*000*/ struct { u32 __reserved[4]; } __reserved_01; + +/*010*/ struct { u32 __reserved[4]; } __reserved_02; + +/*020*/ struct { /* APIC ID Register */ + u32 __reserved_1 : 24, + phys_apic_id : 4, + __reserved_2 : 4; + u32 __reserved[3]; + } id; + +/*030*/ const + struct { /* APIC Version Register */ + u32 version : 8, + __reserved_1 : 8, + max_lvt : 8, + __reserved_2 : 8; + u32 __reserved[3]; + } version; + +/*040*/ struct { u32 __reserved[4]; } __reserved_03; + +/*050*/ struct { u32 __reserved[4]; } __reserved_04; + +/*060*/ struct { u32 __reserved[4]; } __reserved_05; + +/*070*/ struct { u32 __reserved[4]; } __reserved_06; + +/*080*/ struct { /* Task Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } tpr; + +/*090*/ const + struct { /* Arbitration Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } apr; + +/*0A0*/ const + struct { /* Processor Priority Register */ + u32 priority : 8, + __reserved_1 : 24; + u32 __reserved_2[3]; + } ppr; + +/*0B0*/ struct { /* End Of Interrupt Register */ + u32 eoi; + u32 __reserved[3]; + } eoi; + +/*0C0*/ struct { u32 __reserved[4]; } __reserved_07; + +/*0D0*/ struct { /* Logical Destination Register */ + u32 __reserved_1 : 24, + logical_dest : 8; + u32 __reserved_2[3]; + } ldr; + +/*0E0*/ struct { /* Destination Format Register */ + u32 __reserved_1 : 28, + model : 4; + u32 __reserved_2[3]; + } dfr; + +/*0F0*/ struct { /* Spurious Interrupt Vector Register */ + u32 spurious_vector : 8, + apic_enabled : 1, + focus_cpu : 1, + __reserved_2 : 22; + u32 __reserved_3[3]; + } svr; + +/*100*/ struct { /* In Service Register */ +/*170*/ u32 bitfield; + u32 __reserved[3]; + } isr [8]; + +/*180*/ struct { /* Trigger Mode Register */ +/*1F0*/ u32 bitfield; + u32 __reserved[3]; + } tmr [8]; + +/*200*/ struct { /* Interrupt Request Register */ +/*270*/ u32 bitfield; + u32 __reserved[3]; + } irr [8]; + +/*280*/ union { /* Error Status Register */ + struct { + u32 send_cs_error : 1, + receive_cs_error : 1, + send_accept_error : 1, + receive_accept_error : 1, + __reserved_1 : 1, + send_illegal_vector : 1, + receive_illegal_vector : 1, + illegal_register_address : 1, + __reserved_2 : 24; + u32 __reserved_3[3]; + } error_bits; + struct { + u32 errors; + u32 __reserved_3[3]; + } all_errors; + } esr; + +/*290*/ struct { u32 __reserved[4]; } __reserved_08; + +/*2A0*/ struct { u32 __reserved[4]; } __reserved_09; + +/*2B0*/ struct { u32 __reserved[4]; } __reserved_10; + +/*2C0*/ struct { u32 __reserved[4]; } __reserved_11; + +/*2D0*/ struct { u32 __reserved[4]; } __reserved_12; + +/*2E0*/ struct { u32 __reserved[4]; } __reserved_13; + +/*2F0*/ struct { u32 __reserved[4]; } __reserved_14; + +/*300*/ struct { /* Interrupt Command Register 1 */ + u32 vector : 8, + delivery_mode : 3, + destination_mode : 1, + delivery_status : 1, + __reserved_1 : 1, + level : 1, + trigger : 1, + __reserved_2 : 2, + shorthand : 2, + __reserved_3 : 12; + u32 __reserved_4[3]; + } icr1; + +/*310*/ struct { /* Interrupt Command Register 2 */ + union { + u32 __reserved_1 : 24, + phys_dest : 4, + __reserved_2 : 4; + u32 __reserved_3 : 24, + logical_dest : 8; + } dest; + u32 __reserved_4[3]; + } icr2; + +/*320*/ struct { /* LVT - Timer */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + timer_mode : 1, + __reserved_3 : 14; + u32 __reserved_4[3]; + } lvt_timer; + +/*330*/ struct { /* LVT - Thermal Sensor */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_thermal; + +/*340*/ struct { /* LVT - Performance Counter */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_pc; + +/*350*/ struct { /* LVT - LINT0 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint0; + +/*360*/ struct { /* LVT - LINT1 */ + u32 vector : 8, + delivery_mode : 3, + __reserved_1 : 1, + delivery_status : 1, + polarity : 1, + remote_irr : 1, + trigger : 1, + mask : 1, + __reserved_2 : 15; + u32 __reserved_3[3]; + } lvt_lint1; + +/*370*/ struct { /* LVT - Error */ + u32 vector : 8, + __reserved_1 : 4, + delivery_status : 1, + __reserved_2 : 3, + mask : 1, + __reserved_3 : 15; + u32 __reserved_4[3]; + } lvt_error; + +/*380*/ struct { /* Timer Initial Count Register */ + u32 initial_count; + u32 __reserved_2[3]; + } timer_icr; + +/*390*/ const + struct { /* Timer Current Count Register */ + u32 curr_count; + u32 __reserved_2[3]; + } timer_ccr; + +/*3A0*/ struct { u32 __reserved[4]; } __reserved_16; + +/*3B0*/ struct { u32 __reserved[4]; } __reserved_17; + +/*3C0*/ struct { u32 __reserved[4]; } __reserved_18; + +/*3D0*/ struct { u32 __reserved[4]; } __reserved_19; + +/*3E0*/ struct { /* Timer Divide Configuration Register */ + u32 divisor : 4, + __reserved_1 : 28; + u32 __reserved_2[3]; + } timer_dcr; + +/*3F0*/ struct { u32 __reserved[4]; } __reserved_20; + +} __attribute__ ((packed)); + +#undef u32 + +#ifdef CONFIG_X86_32 + #define BAD_APICID 0xFFu +#else + #define BAD_APICID 0xFFFFu +#endif +#endif /* ASM_X86__APICDEF_H */ diff --git a/arch/x86/include/asm/arch_hooks.h b/arch/x86/include/asm/arch_hooks.h new file mode 100644 index 000000000000..de4596b24c23 --- /dev/null +++ b/arch/x86/include/asm/arch_hooks.h @@ -0,0 +1,26 @@ +#ifndef ASM_X86__ARCH_HOOKS_H +#define ASM_X86__ARCH_HOOKS_H + +#include + +/* + * linux/include/asm/arch_hooks.h + * + * define the architecture specific hooks + */ + +/* these aren't arch hooks, they are generic routines + * that can be used by the hooks */ +extern void init_ISA_irqs(void); +extern irqreturn_t timer_interrupt(int irq, void *dev_id); + +/* these are the defined hooks */ +extern void intr_init_hook(void); +extern void pre_intr_init_hook(void); +extern void pre_setup_arch_hook(void); +extern void trap_init_hook(void); +extern void pre_time_init_hook(void); +extern void time_init_hook(void); +extern void mca_nmi_hook(void); + +#endif /* ASM_X86__ARCH_HOOKS_H */ diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h new file mode 100644 index 000000000000..e1355f44d7c3 --- /dev/null +++ b/arch/x86/include/asm/asm.h @@ -0,0 +1,47 @@ +#ifndef ASM_X86__ASM_H +#define ASM_X86__ASM_H + +#ifdef __ASSEMBLY__ +# define __ASM_FORM(x) x +# define __ASM_EX_SEC .section __ex_table +#else +# define __ASM_FORM(x) " " #x " " +# define __ASM_EX_SEC " .section __ex_table,\"a\"\n" +#endif + +#ifdef CONFIG_X86_32 +# define __ASM_SEL(a,b) __ASM_FORM(a) +#else +# define __ASM_SEL(a,b) __ASM_FORM(b) +#endif + +#define __ASM_SIZE(inst) __ASM_SEL(inst##l, inst##q) +#define __ASM_REG(reg) __ASM_SEL(e##reg, r##reg) + +#define _ASM_PTR __ASM_SEL(.long, .quad) +#define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) + +#define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_INC __ASM_SIZE(inc) +#define _ASM_DEC __ASM_SIZE(dec) +#define _ASM_ADD __ASM_SIZE(add) +#define _ASM_SUB __ASM_SIZE(sub) +#define _ASM_XADD __ASM_SIZE(xadd) + +#define _ASM_AX __ASM_REG(ax) +#define _ASM_BX __ASM_REG(bx) +#define _ASM_CX __ASM_REG(cx) +#define _ASM_DX __ASM_REG(dx) +#define _ASM_SP __ASM_REG(sp) +#define _ASM_BP __ASM_REG(bp) +#define _ASM_SI __ASM_REG(si) +#define _ASM_DI __ASM_REG(di) + +/* Exception table entry */ +# define _ASM_EXTABLE(from,to) \ + __ASM_EX_SEC \ + _ASM_ALIGN "\n" \ + _ASM_PTR #from "," #to "\n" \ + " .previous\n" + +#endif /* ASM_X86__ASM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h new file mode 100644 index 000000000000..4e1b8873c474 --- /dev/null +++ b/arch/x86/include/asm/atomic.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "atomic_32.h" +#else +# include "atomic_64.h" +#endif diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h new file mode 100644 index 000000000000..14d3f0beb889 --- /dev/null +++ b/arch/x86/include/asm/atomic_32.h @@ -0,0 +1,259 @@ +#ifndef ASM_X86__ATOMIC_32_H +#define ASM_X86__ATOMIC_32_H + +#include +#include +#include + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub - subtract integer from atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "+m" (v->counter) + : "ir" (i)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "+m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "+m" (v->counter), "=qm" (c) + : : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "+m" (v->counter), "=qm" (c) + : "ir" (i) : "memory"); + return c; +} + +/** + * atomic_add_return - add integer and return + * @v: pointer of type atomic_t + * @i: integer value to add + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i; +#ifdef CONFIG_M386 + unsigned long flags; + if (unlikely(boot_cpu_data.x86 <= 3)) + goto no_xadd; +#endif + /* Modern 486+ processor */ + __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; + +#ifdef CONFIG_M386 +no_xadd: /* Legacy 386 processor */ + local_irq_save(flags); + __i = atomic_read(v); + atomic_set(v, i + __i); + local_irq_restore(flags); + return i + __i; +#endif +} + +/** + * atomic_sub_return - subtract integer and return + * @v: pointer of type atomic_t + * @i: integer value to subtract + * + * Atomically subtracts @i from @v and returns @v - @i + */ +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is already a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as @v was not already @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" (mask), "m" (*(addr)) : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include +#endif /* ASM_X86__ATOMIC_32_H */ diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h new file mode 100644 index 000000000000..2cb218c4a356 --- /dev/null +++ b/arch/x86/include/asm/atomic_64.h @@ -0,0 +1,473 @@ +#ifndef ASM_X86__ATOMIC_64_H +#define ASM_X86__ATOMIC_64_H + +#include +#include + +/* atomic_t should be 32 bit signed type */ + +/* + * Atomic operations that C can't guarantee us. Useful for + * resource counting etc.. + */ + +/* + * Make sure gcc doesn't try to be clever and move things around + * on us. We need to use _exactly_ the address the user gave us, + * not some alias that contains the same information. + */ +typedef struct { + int counter; +} atomic_t; + +#define ATOMIC_INIT(i) { (i) } + +/** + * atomic_read - read atomic variable + * @v: pointer of type atomic_t + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * atomic_set - set atomic variable + * @v: pointer of type atomic_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic_set(v, i) (((v)->counter) = (i)) + +/** + * atomic_add - add integer to atomic variable + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v. + */ +static inline void atomic_add(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "addl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub - subtract the atomic variable + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic_sub(int i, atomic_t *v) +{ + asm volatile(LOCK_PREFIX "subl %1,%0" + : "=m" (v->counter) + : "ir" (i), "m" (v->counter)); +} + +/** + * atomic_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer of type atomic_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_sub_and_test(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subl %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_inc - increment atomic variable + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1. + */ +static inline void atomic_inc(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "incl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec - decrement atomic variable + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic_dec(atomic_t *v) +{ + asm volatile(LOCK_PREFIX "decl %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic_dec_and_test - decrement and test + * @v: pointer of type atomic_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic_dec_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_inc_and_test - increment and test + * @v: pointer of type atomic_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic_inc_and_test(atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incl %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic_add_negative(int i, atomic_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addl %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "ir" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic_add_return - add and return + * @i: integer value to add + * @v: pointer of type atomic_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline int atomic_add_return(int i, atomic_t *v) +{ + int __i = i; + asm volatile(LOCK_PREFIX "xaddl %0, %1" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline int atomic_sub_return(int i, atomic_t *v) +{ + return atomic_add_return(-i, v); +} + +#define atomic_inc_return(v) (atomic_add_return(1, v)) +#define atomic_dec_return(v) (atomic_sub_return(1, v)) + +/* An 64bit atomic type */ + +typedef struct { + long counter; +} atomic64_t; + +#define ATOMIC64_INIT(i) { (i) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +#define atomic64_read(v) ((v)->counter) + +/** + * atomic64_set - set atomic64 variable + * @v: pointer to type atomic64_t + * @i: required value + * + * Atomically sets the value of @v to @i. + */ +#define atomic64_set(v, i) (((v)->counter) = (i)) + +/** + * atomic64_add - add integer to atomic64 variable + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v. + */ +static inline void atomic64_add(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "addq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v. + */ +static inline void atomic64_sub(long i, atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "subq %1,%0" + : "=m" (v->counter) + : "er" (i), "m" (v->counter)); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @i: integer value to subtract + * @v: pointer to type atomic64_t + * + * Atomically subtracts @i from @v and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_sub_and_test(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "subq %2,%0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_inc - increment atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1. + */ +static inline void atomic64_inc(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "incq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1. + */ +static inline void atomic64_dec(atomic64_t *v) +{ + asm volatile(LOCK_PREFIX "decq %0" + : "=m" (v->counter) + : "m" (v->counter)); +} + +/** + * atomic64_dec_and_test - decrement and test + * @v: pointer to type atomic64_t + * + * Atomically decrements @v by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "decq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_inc_and_test - increment and test + * @v: pointer to type atomic64_t + * + * Atomically increments @v by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "incq %0; sete %1" + : "=m" (v->counter), "=qm" (c) + : "m" (v->counter) : "memory"); + return c != 0; +} + +/** + * atomic64_add_negative - add and test if negative + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int atomic64_add_negative(long i, atomic64_t *v) +{ + unsigned char c; + + asm volatile(LOCK_PREFIX "addq %2,%0; sets %1" + : "=m" (v->counter), "=qm" (c) + : "er" (i), "m" (v->counter) : "memory"); + return c; +} + +/** + * atomic64_add_return - add and return + * @i: integer value to add + * @v: pointer to type atomic64_t + * + * Atomically adds @i to @v and returns @i + @v + */ +static inline long atomic64_add_return(long i, atomic64_t *v) +{ + long __i = i; + asm volatile(LOCK_PREFIX "xaddq %0, %1;" + : "+r" (i), "+m" (v->counter) + : : "memory"); + return i + __i; +} + +static inline long atomic64_sub_return(long i, atomic64_t *v) +{ + return atomic64_add_return(-i, v); +} + +#define atomic64_inc_return(v) (atomic64_add_return(1, (v))) +#define atomic64_dec_return(v) (atomic64_sub_return(1, (v))) + +#define atomic64_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) + +#define atomic_cmpxchg(v, old, new) (cmpxchg(&((v)->counter), (old), (new))) +#define atomic_xchg(v, new) (xchg(&((v)->counter), (new))) + +/** + * atomic_add_unless - add unless the number is a given value + * @v: pointer of type atomic_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic_add_unless(atomic_t *v, int a, int u) +{ + int c, old; + c = atomic_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0) + +/** + * atomic64_add_unless - add unless the number is a given value + * @v: pointer of type atomic64_t + * @a: the amount to add to v... + * @u: ...unless v is equal to u. + * + * Atomically adds @a to @v, so long as it was not @u. + * Returns non-zero if @v was not @u, and zero otherwise. + */ +static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +{ + long c, old; + c = atomic64_read(v); + for (;;) { + if (unlikely(c == (u))) + break; + old = atomic64_cmpxchg((v), c, c + (a)); + if (likely(old == c)) + break; + c = old; + } + return c != (u); +} + +/** + * atomic_inc_short - increment of a short integer + * @v: pointer to type int + * + * Atomically adds 1 to @v + * Returns the new value of @u + */ +static inline short int atomic_inc_short(short int *v) +{ + asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v)); + return *v; +} + +/** + * atomic_or_long - OR of two long integers + * @v1: pointer to type unsigned long + * @v2: pointer to type unsigned long + * + * Atomically ORs @v1 and @v2 + * Returns the result of the OR + */ +static inline void atomic_or_long(unsigned long *v1, unsigned long v2) +{ + asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2)); +} + +#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) + +/* These are x86-specific, used by some header files */ +#define atomic_clear_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "andl %0,%1" \ + : : "r" (~(mask)), "m" (*(addr)) : "memory") + +#define atomic_set_mask(mask, addr) \ + asm volatile(LOCK_PREFIX "orl %0,%1" \ + : : "r" ((unsigned)(mask)), "m" (*(addr)) \ + : "memory") + +/* Atomic operations are already serializing on x86 */ +#define smp_mb__before_atomic_dec() barrier() +#define smp_mb__after_atomic_dec() barrier() +#define smp_mb__before_atomic_inc() barrier() +#define smp_mb__after_atomic_inc() barrier() + +#include +#endif /* ASM_X86__ATOMIC_64_H */ diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h new file mode 100644 index 000000000000..12c7cac74202 --- /dev/null +++ b/arch/x86/include/asm/auxvec.h @@ -0,0 +1,12 @@ +#ifndef ASM_X86__AUXVEC_H +#define ASM_X86__AUXVEC_H +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#ifdef __i386__ +#define AT_SYSINFO 32 +#endif +#define AT_SYSINFO_EHDR 33 + +#endif /* ASM_X86__AUXVEC_H */ diff --git a/arch/x86/include/asm/bigsmp/apic.h b/arch/x86/include/asm/bigsmp/apic.h new file mode 100644 index 000000000000..1d9543b9d358 --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apic.h @@ -0,0 +1,139 @@ +#ifndef __ASM_MACH_APIC_H +#define __ASM_MACH_APIC_H + +#define xapic_phys_to_log_apicid(cpu) (per_cpu(x86_bios_cpu_apicid, cpu)) +#define esr_disable (1) + +static inline int apic_id_registered(void) +{ + return (1); +} + +static inline cpumask_t target_cpus(void) +{ +#ifdef CONFIG_SMP + return cpu_online_map; +#else + return cpumask_of_cpu(0); +#endif +} + +#undef APIC_DEST_LOGICAL +#define APIC_DEST_LOGICAL 0 +#define APIC_DFR_VALUE (APIC_DFR_FLAT) +#define INT_DELIVERY_MODE (dest_Fixed) +#define INT_DEST_MODE (0) /* phys delivery to target proc */ +#define NO_BALANCE_IRQ (0) +#define WAKE_SECONDARY_VIA_INIT + + +static inline unsigned long check_apicid_used(physid_mask_t bitmap, int apicid) +{ + return (0); +} + +static inline unsigned long check_apicid_present(int bit) +{ + return (1); +} + +static inline unsigned long calculate_ldr(int cpu) +{ + unsigned long val, id; + val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; + id = xapic_phys_to_log_apicid(cpu); + val |= SET_APIC_LOGICAL_ID(id); + return val; +} + +/* + * Set up the logical destination ID. + * + * Intel recommends to set DFR, LDR and TPR before enabling + * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel + * document number 292116). So here it goes... + */ +static inline void init_apic_ldr(void) +{ + unsigned long val; + int cpu = smp_processor_id(); + + apic_write(APIC_DFR, APIC_DFR_VALUE); + val = calculate_ldr(cpu); + apic_write(APIC_LDR, val); +} + +static inline void setup_apic_routing(void) +{ + printk("Enabling APIC mode: %s. Using %d I/O APICs\n", + "Physflat", nr_ioapics); +} + +static inline int multi_timer_check(int apic, int irq) +{ + return (0); +} + +static inline int apicid_to_node(int logical_apicid) +{ + return apicid_2_node[hard_smp_processor_id()]; +} + +static inline int cpu_present_to_apicid(int mps_cpu) +{ + if (mps_cpu < NR_CPUS) + return (int) per_cpu(x86_bios_cpu_apicid, mps_cpu); + + return BAD_APICID; +} + +static inline physid_mask_t apicid_to_cpu_present(int phys_apicid) +{ + return physid_mask_of_physid(phys_apicid); +} + +extern u8 cpu_2_logical_apicid[]; +/* Mapping from cpu number to logical apicid */ +static inline int cpu_to_logical_apicid(int cpu) +{ + if (cpu >= NR_CPUS) + return BAD_APICID; + return cpu_physical_id(cpu); +} + +static inline physid_mask_t ioapic_phys_id_map(physid_mask_t phys_map) +{ + /* For clustered we don't have a good way to do this yet - hack */ + return physids_promote(0xFFL); +} + +static inline void setup_portio_remap(void) +{ +} + +static inline void enable_apic_mode(void) +{ +} + +static inline int check_phys_apicid_present(int boot_cpu_physical_apicid) +{ + return (1); +} + +/* As we are using single CPU as destination, pick only one CPU here */ +static inline unsigned int cpu_mask_to_apicid(cpumask_t cpumask) +{ + int cpu; + int apicid; + + cpu = first_cpu(cpumask); + apicid = cpu_to_logical_apicid(cpu); + return apicid; +} + +static inline u32 phys_pkg_id(u32 cpuid_apic, int index_msb) +{ + return cpuid_apic >> index_msb; +} + +#endif /* __ASM_MACH_APIC_H */ diff --git a/arch/x86/include/asm/bigsmp/apicdef.h b/arch/x86/include/asm/bigsmp/apicdef.h new file mode 100644 index 000000000000..392c3f5ef2fe --- /dev/null +++ b/arch/x86/include/asm/bigsmp/apicdef.h @@ -0,0 +1,13 @@ +#ifndef __ASM_MACH_APICDEF_H +#define __ASM_MACH_APICDEF_H + +#define APIC_ID_MASK (0xFF<<24) + +static inline unsigned get_apic_id(unsigned long x) +{ + return (((x)>>24)&0xFF); +} + +#define GET_APIC_ID(x) get_apic_id(x) + +#endif diff --git a/arch/x86/include/asm/bigsmp/ipi.h b/arch/x86/include/asm/bigsmp/ipi.h new file mode 100644 index 000000000000..9404c535b7ec --- /dev/null +++ b/arch/x86/include/asm/bigsmp/ipi.h @@ -0,0 +1,25 @@ +#ifndef __ASM_MACH_IPI_H +#define __ASM_MACH_IPI_H + +void send_IPI_mask_sequence(cpumask_t mask, int vector); + +static inline void send_IPI_mask(cpumask_t mask, int vector) +{ + send_IPI_mask_sequence(mask, vector); +} + +static inline void send_IPI_allbutself(int vector) +{ + cpumask_t mask = cpu_online_map; + cpu_clear(smp_processor_id(), mask); + + if (!cpus_empty(mask)) + send_IPI_mask(mask, vector); +} + +static inline void send_IPI_all(int vector) +{ + send_IPI_mask(cpu_online_map, vector); +} + +#endif /* __ASM_MACH_IPI_H */ diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h new file mode 100644 index 000000000000..79b4b88505d7 --- /dev/null +++ b/arch/x86/include/asm/bios_ebda.h @@ -0,0 +1,36 @@ +#ifndef ASM_X86__BIOS_EBDA_H +#define ASM_X86__BIOS_EBDA_H + +#include + +/* + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E. + */ +static inline unsigned int get_bios_ebda(void) +{ + unsigned int address = *(unsigned short *)phys_to_virt(0x40E); + address <<= 4; + return address; /* 0 means none */ +} + +void reserve_ebda_region(void); + +#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION +/* + * This is obviously not a great place for this, but we want to be + * able to scatter it around anywhere in the kernel. + */ +void check_for_bios_corruption(void); +void start_periodic_check_for_corruption(void); +#else +static inline void check_for_bios_corruption(void) +{ +} + +static inline void start_periodic_check_for_corruption(void) +{ +} +#endif + +#endif /* ASM_X86__BIOS_EBDA_H */ diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h new file mode 100644 index 000000000000..451a74762bd4 --- /dev/null +++ b/arch/x86/include/asm/bitops.h @@ -0,0 +1,451 @@ +#ifndef ASM_X86__BITOPS_H +#define ASM_X86__BITOPS_H + +/* + * Copyright 1992, Linus Torvalds. + */ + +#ifndef _LINUX_BITOPS_H +#error only can be included directly +#endif + +#include +#include + +/* + * These have to be done with inline assembly: that way the bit-setting + * is guaranteed to be atomic. All bit operations return 0 if the bit + * was cleared before the operation and != 0 if it was not. + * + * bit 0 is the LSB of addr; bit 32 is the LSB of (addr+1). + */ + +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 1) +/* Technically wrong, but this avoids compilation errors on some gcc + versions. */ +#define BITOP_ADDR(x) "=m" (*(volatile long *) (x)) +#else +#define BITOP_ADDR(x) "+m" (*(volatile long *) (x)) +#endif + +#define ADDR BITOP_ADDR(addr) + +/* + * We do the locked ops that don't return the old value as + * a mask operation on a byte. + */ +#define IS_IMMEDIATE(nr) (__builtin_constant_p(nr)) +#define CONST_MASK_ADDR(nr, addr) BITOP_ADDR((void *)(addr) + ((nr)>>3)) +#define CONST_MASK(nr) (1 << ((nr) & 7)) + +/** + * set_bit - Atomically set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * This function is atomic and may not be reordered. See __set_bit() + * if you do not require the atomic guarantees. + * + * Note: there are no guarantees that this function will not be reordered + * on non x86 architectures, so if you are writing portable code, + * make sure not to rely on its reordering guarantees. + * + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void set_bit(unsigned int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "orb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)CONST_MASK(nr)) + : "memory"); + } else { + asm volatile(LOCK_PREFIX "bts %1,%0" + : BITOP_ADDR(addr) : "Ir" (nr) : "memory"); + } +} + +/** + * __set_bit - Set a bit in memory + * @nr: the bit to set + * @addr: the address to start counting from + * + * Unlike set_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __set_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("bts %1,%0" : ADDR : "Ir" (nr) : "memory"); +} + +/** + * clear_bit - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and may not be reordered. However, it does + * not contain a memory barrier, so if it is used for locking purposes, + * you should call smp_mb__before_clear_bit() and/or smp_mb__after_clear_bit() + * in order to ensure changes are visible on other processors. + */ +static inline void clear_bit(int nr, volatile unsigned long *addr) +{ + if (IS_IMMEDIATE(nr)) { + asm volatile(LOCK_PREFIX "andb %1,%0" + : CONST_MASK_ADDR(nr, addr) + : "iq" ((u8)~CONST_MASK(nr))); + } else { + asm volatile(LOCK_PREFIX "btr %1,%0" + : BITOP_ADDR(addr) + : "Ir" (nr)); + } +} + +/* + * clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * clear_bit() is atomic and implies release semantics before the memory + * operation. It can be used for an unlock. + */ +static inline void clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + clear_bit(nr, addr); +} + +static inline void __clear_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btr %1,%0" : ADDR : "Ir" (nr)); +} + +/* + * __clear_bit_unlock - Clears a bit in memory + * @nr: Bit to clear + * @addr: Address to start counting from + * + * __clear_bit() is non-atomic and implies release semantics before the memory + * operation. It can be used for an unlock if no other CPUs can concurrently + * modify other bits in the word. + * + * No memory barrier is required here, because x86 cannot reorder stores past + * older loads. Same principle as spin_unlock. + */ +static inline void __clear_bit_unlock(unsigned nr, volatile unsigned long *addr) +{ + barrier(); + __clear_bit(nr, addr); +} + +#define smp_mb__before_clear_bit() barrier() +#define smp_mb__after_clear_bit() barrier() + +/** + * __change_bit - Toggle a bit in memory + * @nr: the bit to change + * @addr: the address to start counting from + * + * Unlike change_bit(), this function is non-atomic and may be reordered. + * If it's called on the same region of memory simultaneously, the effect + * may be that only one operation succeeds. + */ +static inline void __change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile("btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * change_bit - Toggle a bit in memory + * @nr: Bit to change + * @addr: Address to start counting from + * + * change_bit() is atomic and may not be reordered. + * Note that @nr may be almost arbitrarily large; this function is not + * restricted to acting on a single-word quantity. + */ +static inline void change_bit(int nr, volatile unsigned long *addr) +{ + asm volatile(LOCK_PREFIX "btc %1,%0" : ADDR : "Ir" (nr)); +} + +/** + * test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "bts %2,%1\n\t" + "sbb %0,%0" : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_set_bit_lock - Set a bit and return its old value for lock + * @nr: Bit to set + * @addr: Address to count from + * + * This is the same as test_and_set_bit on x86. + */ +static inline int test_and_set_bit_lock(int nr, volatile unsigned long *addr) +{ + return test_and_set_bit(nr, addr); +} + +/** + * __test_and_set_bit - Set a bit and return its old value + * @nr: Bit to set + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_set_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm("bts %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/** + * test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * __test_and_clear_bit - Clear a bit and return its old value + * @nr: Bit to clear + * @addr: Address to count from + * + * This operation is non-atomic and can be reordered. + * If two examples of this operation race, one can appear to succeed + * but actually fail. You must protect multiple accesses with a lock. + */ +static inline int __test_and_clear_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btr %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr)); + return oldbit; +} + +/* WARNING: non atomic and it can be reordered! */ +static inline int __test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile("btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR + : "Ir" (nr) : "memory"); + + return oldbit; +} + +/** + * test_and_change_bit - Change a bit and return its old value + * @nr: Bit to change + * @addr: Address to count from + * + * This operation is atomic and cannot be reordered. + * It also implies a memory barrier. + */ +static inline int test_and_change_bit(int nr, volatile unsigned long *addr) +{ + int oldbit; + + asm volatile(LOCK_PREFIX "btc %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit), ADDR : "Ir" (nr) : "memory"); + + return oldbit; +} + +static inline int constant_test_bit(int nr, const volatile unsigned long *addr) +{ + return ((1UL << (nr % BITS_PER_LONG)) & + (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; +} + +static inline int variable_test_bit(int nr, volatile const unsigned long *addr) +{ + int oldbit; + + asm volatile("bt %2,%1\n\t" + "sbb %0,%0" + : "=r" (oldbit) + : "m" (*(unsigned long *)addr), "Ir" (nr)); + + return oldbit; +} + +#if 0 /* Fool kernel-doc since it doesn't do macros yet */ +/** + * test_bit - Determine whether a bit is set + * @nr: bit number to test + * @addr: Address to start counting from + */ +static int test_bit(int nr, const volatile unsigned long *addr); +#endif + +#define test_bit(nr, addr) \ + (__builtin_constant_p((nr)) \ + ? constant_test_bit((nr), (addr)) \ + : variable_test_bit((nr), (addr))) + +/** + * __ffs - find first set bit in word + * @word: The word to search + * + * Undefined if no bit exists, so code should check against 0 first. + */ +static inline unsigned long __ffs(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +/** + * ffz - find first zero bit in word + * @word: The word to search + * + * Undefined if no zero exists, so code should check against ~0UL first. + */ +static inline unsigned long ffz(unsigned long word) +{ + asm("bsf %1,%0" + : "=r" (word) + : "r" (~word)); + return word; +} + +/* + * __fls: find last set bit in word + * @word: The word to search + * + * Undefined if no set bit exists, so code should check against 0 first. + */ +static inline unsigned long __fls(unsigned long word) +{ + asm("bsr %1,%0" + : "=r" (word) + : "rm" (word)); + return word; +} + +#ifdef __KERNEL__ +/** + * ffs - find first set bit in word + * @x: the word to search + * + * This is defined the same way as the libc and compiler builtin ffs + * routines, therefore differs in spirit from the other bitops. + * + * ffs(value) returns 0 if value is 0 or the position of the first + * set bit if value is nonzero. The first (least significant) bit + * is at position 1. + */ +static inline int ffs(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsfl %1,%0\n\t" + "cmovzl %2,%0" + : "=r" (r) : "rm" (x), "r" (-1)); +#else + asm("bsfl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} + +/** + * fls - find last set bit in word + * @x: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffs, but returns the position of the most significant set bit. + * + * fls(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 32. + */ +static inline int fls(int x) +{ + int r; +#ifdef CONFIG_X86_CMOV + asm("bsrl %1,%0\n\t" + "cmovzl %2,%0" + : "=&r" (r) : "rm" (x), "rm" (-1)); +#else + asm("bsrl %1,%0\n\t" + "jnz 1f\n\t" + "movl $-1,%0\n" + "1:" : "=r" (r) : "rm" (x)); +#endif + return r + 1; +} +#endif /* __KERNEL__ */ + +#undef ADDR + +#ifdef __KERNEL__ + +#include + +#define ARCH_HAS_FAST_MULTIPLIER 1 + +#include + +#endif /* __KERNEL__ */ + +#include + +#ifdef __KERNEL__ + +#include + +#define ext2_set_bit_atomic(lock, nr, addr) \ + test_and_set_bit((nr), (unsigned long *)(addr)) +#define ext2_clear_bit_atomic(lock, nr, addr) \ + test_and_clear_bit((nr), (unsigned long *)(addr)) + +#include + +#endif /* __KERNEL__ */ +#endif /* ASM_X86__BITOPS_H */ diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h new file mode 100644 index 000000000000..1d63bd5d5946 --- /dev/null +++ b/arch/x86/include/asm/boot.h @@ -0,0 +1,26 @@ +#ifndef ASM_X86__BOOT_H +#define ASM_X86__BOOT_H + +/* Don't touch these, unless you really know what you're doing. */ +#define DEF_SYSSEG 0x1000 +#define DEF_SYSSIZE 0x7F00 + +/* Internal svga startup constants */ +#define NORMAL_VGA 0xffff /* 80x25 mode */ +#define EXTENDED_VGA 0xfffe /* 80x50 mode */ +#define ASK_VGA 0xfffd /* ask for it at bootup */ + +/* Physical address where kernel should be loaded. */ +#define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + + (CONFIG_PHYSICAL_ALIGN - 1)) \ + & ~(CONFIG_PHYSICAL_ALIGN - 1)) + +#ifdef CONFIG_X86_64 +#define BOOT_HEAP_SIZE 0x7000 +#define BOOT_STACK_SIZE 0x4000 +#else +#define BOOT_HEAP_SIZE 0x4000 +#define BOOT_STACK_SIZE 0x1000 +#endif + +#endif /* ASM_X86__BOOT_H */ diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h new file mode 100644 index 000000000000..ccf027e2d97d --- /dev/null +++ b/arch/x86/include/asm/bootparam.h @@ -0,0 +1,111 @@ +#ifndef ASM_X86__BOOTPARAM_H +#define ASM_X86__BOOTPARAM_H + +#include +#include +#include +#include +#include +#include +#include