From db4315d6f53edc2cc0b0b06fce1beffebb119c71 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Tue, 5 Feb 2008 00:48:13 +0100 Subject: timer_list: print relative expiry time signed Relative expiry time can get negative, so it should be signed. Signed-off-by: Pavel Machek Signed-off-by: Thomas Gleixner --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index d3d94c1a0fd2..67fe8fc21fb1 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -65,9 +65,9 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); #endif SEQ_printf(m, "\n"); - SEQ_printf(m, " # expires at %Lu nsecs [in %Lu nsecs]\n", + SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", (unsigned long long)ktime_to_ns(timer->expires), - (unsigned long long)(ktime_to_ns(timer->expires) - now)); + (long long)(ktime_to_ns(timer->expires) - now)); } static void -- cgit v1.2.2 From b0abcfc14605b2a8c686bd8e193ab05b01a7980b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 18 Feb 2008 18:23:16 -0500 Subject: Audit: use == not = in if statements Clearly this was supposed to be an == not an = in the if statement. This patch also causes us to stop processing execve args once we have failed rather than continuing to loop on failure over and over and over. Signed-off-by: Eric Paris Acked-by: Al Viro Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/auditsc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index ac6d9b23b018..2087d6de67ea 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1000,9 +1000,10 @@ static int audit_log_single_execve_arg(struct audit_context *context, * for strings that are too long, we should not have created * any. 
*/ - if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) { + if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } /* walk the whole argument looking for non-ascii chars */ @@ -1020,6 +1021,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; has_cntl = audit_string_contains_control(buf, to_send); @@ -1083,6 +1085,7 @@ static int audit_log_single_execve_arg(struct audit_context *context, if (ret) { WARN_ON(1); send_sig(SIGKILL, current, 0); + return -1; } buf[to_send] = '\0'; -- cgit v1.2.2 From 188fd89d539d899bfca2bc83534e5508e0161139 Mon Sep 17 00:00:00 2001 From: "S.Caglar Onur" Date: Thu, 14 Feb 2008 17:36:51 +0200 Subject: genirq: spurious.c: use time_* macros The functions time_before, time_before_eq, time_after, and time_after_eq are more robust for comparing jiffies against other values. So following patch implements usage of the time_after() macro, defined at linux/jiffies.h, which deals with wrapping correctly Signed-off-by: S.Caglar Onur Acked-by: Ingo Molnar Signed-off-by: Thomas Gleixner --- kernel/irq/spurious.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index a6b2bc831dd0..088dabbf2d6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -6,6 +6,7 @@ * This file contains spurious interrupt handling. 
*/ +#include #include #include #include @@ -179,7 +180,7 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc, * otherwise the couter becomes a doomsday timer for otherwise * working systems */ - if (jiffies - desc->last_unhandled > HZ/10) + if (time_after(jiffies, desc->last_unhandled + HZ/10)) desc->irqs_unhandled = 1; else desc->irqs_unhandled++; -- cgit v1.2.2 From 89d694b9dbe769ca1004e01db0ca43964806a611 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 18 Feb 2008 18:25:17 +0100 Subject: genirq: do not leave interrupts enabled on free_irq The default_disable() function was changed in commit: 76d2160147f43f982dfe881404cfde9fd0a9da21 genirq: do not mask interrupts by default It removed the mask function in favour of the default delayed interrupt disabling. Unfortunately this also broke the shutdown in free_irq() when the last handler is removed from the interrupt for those architectures which rely on the default implementations. Now we can end up with an enabled interrupt line after the last handler was removed, which can result in spurious interrupts. Fix this by adding a default_shutdown function, which is only installed when the irqchip implementation provides neither a shutdown nor a disable function.
[@stable: affected versions: .21 - .24 ] Pointed-out-by: Michael Hennerich Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: stable@kernel.org Tested-by: Michael Hennerich --- kernel/irq/chip.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index cc54c6276356..fdb3fbe2b0c4 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -245,6 +245,17 @@ static unsigned int default_startup(unsigned int irq) return 0; } +/* + * default shutdown function + */ +static void default_shutdown(unsigned int irq) +{ + struct irq_desc *desc = irq_desc + irq; + + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; +} + /* * Fixup enable/disable function pointers */ @@ -256,8 +267,15 @@ void irq_chip_set_defaults(struct irq_chip *chip) chip->disable = default_disable; if (!chip->startup) chip->startup = default_startup; + /* + * We use chip->disable, when the user provided its own. When + * we have default_disable set for chip->disable, then we need + * to use default_shutdown, otherwise the irq line is not + * disabled on free_irq(): + */ if (!chip->shutdown) - chip->shutdown = chip->disable; + chip->shutdown = chip->disable != default_disable ? + chip->disable : default_shutdown; if (!chip->name) chip->name = chip->typename; if (!chip->end) -- cgit v1.2.2 From 8a235efad548abd2ab5ebea45a9ffa750c814375 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 20 Feb 2008 01:47:44 +0100 Subject: Hibernation: Handle DEBUG_PAGEALLOC on x86 Make hibernation work with CONFIG_DEBUG_PAGEALLOC set on x86, by checking if the pages to be copied are marked as present in the kernel mapping and temporarily marking them as present if that's not the case. No functional modifications are introduced if CONFIG_DEBUG_PAGEALLOC is unset. Signed-off-by: Rafael J. 
Wysocki Signed-off-by: Len Brown --- kernel/power/snapshot.c | 42 +++++++++++++++++++++++++++++------------- 1 file changed, 29 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 95250d7c8d91..72a020cabb4c 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -875,8 +875,8 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } #endif /* CONFIG_HIGHMEM */ /** - * saveable - Determine whether a non-highmem page should be included in - * the suspend image. + * saveable_page - Determine whether a non-highmem page should be included + * in the suspend image. * * We should save the page if it isn't Nosave, and is not in the range * of pages statically defined as 'unsaveable', and it isn't a part of @@ -897,7 +897,8 @@ static struct page *saveable_page(unsigned long pfn) if (swsusp_page_is_forbidden(page) || swsusp_page_is_free(page)) return NULL; - if (PageReserved(page) && pfn_is_nosave(pfn)) + if (PageReserved(page) + && (!kernel_page_present(page) || pfn_is_nosave(pfn))) return NULL; return page; @@ -938,6 +939,25 @@ static inline void do_copy_page(long *dst, long *src) *dst++ = *src++; } + +/** + * safe_copy_page - check if the page we are going to copy is marked as + * present in the kernel page tables (this always is the case if + * CONFIG_DEBUG_PAGEALLOC is not set and in that case + * kernel_page_present() always returns 'true'). 
+ */ +static void safe_copy_page(void *dst, struct page *s_page) +{ + if (kernel_page_present(s_page)) { + do_copy_page(dst, page_address(s_page)); + } else { + kernel_map_pages(s_page, 1, 1); + do_copy_page(dst, page_address(s_page)); + kernel_map_pages(s_page, 1, 0); + } +} + + #ifdef CONFIG_HIGHMEM static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) @@ -946,8 +966,7 @@ page_is_saveable(struct zone *zone, unsigned long pfn) saveable_highmem_page(pfn) : saveable_page(pfn); } -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { struct page *s_page, *d_page; void *src, *dst; @@ -961,29 +980,26 @@ copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) kunmap_atomic(src, KM_USER0); kunmap_atomic(dst, KM_USER1); } else { - src = page_address(s_page); if (PageHighMem(d_page)) { /* Page pointed to by src may contain some kernel * data modified by kmap_atomic() */ - do_copy_page(buffer, src); + safe_copy_page(buffer, s_page); dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { - dst = page_address(d_page); - do_copy_page(dst, src); + safe_copy_page(page_address(d_page), s_page); } } } #else #define page_is_saveable(zone, pfn) saveable_page(pfn) -static inline void -copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) +static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { - do_copy_page(page_address(pfn_to_page(dst_pfn)), - page_address(pfn_to_page(src_pfn))); + safe_copy_page(page_address(pfn_to_page(dst_pfn)), + pfn_to_page(src_pfn)); } #endif /* CONFIG_HIGHMEM */ -- cgit v1.2.2 From 120fc3d77acfd91f3521737a440d42839c475982 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 21 Feb 2008 00:33:20 +0100 Subject: modules: do not try to add sysfs attributes if !CONFIG_SYSFS Thanks to Alexey for the testing and the fix of the fix. 
Cc: Alexey Dobriyan Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 92595bad3812..901cd6ac2f11 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -987,12 +987,11 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, return ret; } - /* * /sys/module/foo/sections stuff * J. Corbet */ -#ifdef CONFIG_KALLSYMS +#if defined(CONFIG_KALLSYMS) && defined(CONFIG_SYSFS) static ssize_t module_sect_show(struct module_attribute *mattr, struct module *mod, char *buf) { @@ -1188,7 +1187,7 @@ static inline void add_notes_attrs(struct module *mod, unsigned int nsect, static inline void remove_notes_attrs(struct module *mod) { } -#endif /* CONFIG_KALLSYMS */ +#endif #ifdef CONFIG_SYSFS int module_add_modinfo_attrs(struct module *mod) @@ -1231,9 +1230,7 @@ void module_remove_modinfo_attrs(struct module *mod) } kfree(mod->modinfo_attrs); } -#endif -#ifdef CONFIG_SYSFS int mod_sysfs_init(struct module *mod) { int err; -- cgit v1.2.2 From 3a2d5b700132f35401f1d9e22fe3c2cab02c2549 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 23 Feb 2008 19:13:25 +0100 Subject: PM: Introduce PM_EVENT_HIBERNATE callback state During the last step of hibernation in the "platform" mode (with the help of ACPI) we use the suspend code, including the devices' ->suspend() methods, to prepare the system for entering the ACPI S4 system sleep state. But at least for some devices the operations performed by the ->suspend() callback in that case must be different from its operations during regular suspend. For this reason, introduce the new PM event type PM_EVENT_HIBERNATE and pass it to the device drivers' ->suspend() methods during the last phase of hibernation, so that they can distinguish this case and handle it as appropriate. 
Modify the drivers that handle PM_EVENT_SUSPEND in a special way and need to handle PM_EVENT_HIBERNATE in the same way. These changes are necessary to fix a hibernation regression related to the i915 driver (ref. http://lkml.org/lkml/2008/2/22/488). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Tested-by: Jeff Chua Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 859a8e59773a..14a656cdc652 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -391,7 +391,7 @@ int hibernation_platform_enter(void) goto Close; suspend_console(); - error = device_suspend(PMSG_SUSPEND); + error = device_suspend(PMSG_HIBERNATE); if (error) goto Resume_console; @@ -404,7 +404,7 @@ int hibernation_platform_enter(void) goto Finish; local_irq_disable(); - error = device_power_down(PMSG_SUSPEND); + error = device_power_down(PMSG_HIBERNATE); if (!error) { hibernation_ops->enter(); /* We should never get here */ -- cgit v1.2.2 From de4fc64f0f2a4efbaad3e7c1e1e05a28f69b45e5 Mon Sep 17 00:00:00 2001 From: Harvey Harrison Date: Sat, 23 Feb 2008 15:23:33 -0800 Subject: markers: fix sparse warnings in markers.c char can be unsigned kernel/marker.c:64:20: error: dubious one-bit signed bitfield kernel/marker.c:65:14: error: dubious one-bit signed bitfield Signed-off-by: Harvey Harrison Acked-by: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/marker.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/marker.c b/kernel/marker.c index c4c2cd8b61f5..50effc01d9a2 100644 --- a/kernel/marker.c +++ b/kernel/marker.c @@ -61,8 +61,8 @@ struct marker_entry { int refcount; /* Number of times armed. 0 if disarmed. 
*/ struct rcu_head rcu; void *oldptr; - char rcu_pending:1; - char ptype:1; + unsigned char rcu_pending:1; + unsigned char ptype:1; char name[0]; /* Contains name'\0'format'\0' */ }; -- cgit v1.2.2 From 3e4ab747efa8e78562ec6782b08bbf21a00aba1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:55 -0800 Subject: futex: fix init order When the futex init code fails to initialize the futex pseudo file system it returns early without initializing the hash queues. Should the boot succeed then a futex syscall which tries to enqueue a waiter on the hash queue will crash due to the uninitialized plist heads. Initialize the hash queues before the filesystem. Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index 221f2128a437..c21f667c63f6 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2145,8 +2145,14 @@ static struct file_system_type futex_fs_type = { static int __init init(void) { - int i = register_filesystem(&futex_fs_type); + int i; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { + plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); + spin_lock_init(&futex_queues[i].lock); + } + + i = register_filesystem(&futex_fs_type); if (i) return i; @@ -2156,10 +2162,6 @@ static int __init init(void) return PTR_ERR(futex_mnt); } - for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { - plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); - spin_lock_init(&futex_queues[i].lock); - } return 0; } __initcall(init); -- cgit v1.2.2 From a0c1e9073ef7428a14309cba010633a6cd6719ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 23 Feb 2008 15:23:57 -0800 Subject: futex: runtime enable pi and robust functionality Not all architectures implement futex_atomic_cmpxchg_inatomic().
The default implementation returns -ENOSYS, which is currently not handled inside of the futex guts. Futex PI calls and robust list exits with a held futex result in an endless loop in the futex code on architectures which have no support. Fixing up every place where futex_atomic_cmpxchg_inatomic() is called would add a fair amount of extra if/else constructs to the already complex code. It is also not possible to disable the robust feature before user space tries to register robust lists. Compile time disabling is not a good idea either, as there are already architectures with runtime detection of futex_atomic_cmpxchg_inatomic support. Detect the functionality at runtime instead by calling cmpxchg_futex_value_locked() with a NULL pointer from the futex initialization code. This is guaranteed to fail, but the call of futex_atomic_cmpxchg_inatomic() happens with pagefaults disabled. On architectures, which use the asm-generic implementation or have a runtime CPU feature detection, a -ENOSYS return value disables the PI/robust features. On architectures with a working implementation the call returns -EFAULT and the PI/robust features are enabled. The relevant syscalls return -ENOSYS and the robust list exit code is blocked, when the detection fails. Fixes http://lkml.org/lkml/2008/2/11/149 Originally reported by: Lennart Buytenhek Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Lennert Buytenhek Cc: Riku Voipio Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/futex.c | 38 ++++++++++++++++++++++++++++++++++---- kernel/futex_compat.c | 9 +++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index c21f667c63f6..06968cd79200 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -60,6 +60,8 @@ #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 
4 : 8) /* @@ -469,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -1870,6 +1874,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1894,6 +1900,9 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, struct robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->robust_list; else { @@ -1997,6 +2006,9 @@ void exit_robust_list(struct task_struct *curr) unsigned long futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -2051,7 +2063,7 @@ void exit_robust_list(struct task_struct *curr) long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2083,13 +2095,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2145,8 +2160,23 @@ static struct file_system_type futex_fs_type = { static int 
__init init(void) { + u32 curval; int i; + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. + */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; + for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 7d5e4b016f39..ff90f049f8f6 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -54,6 +54,9 @@ void compat_exit_robust_list(struct task_struct *curr) compat_long_t futex_offset; int rc; + if (!futex_cmpxchg_enabled) + return; + /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): @@ -115,6 +118,9 @@ asmlinkage long compat_sys_set_robust_list(struct compat_robust_list_head __user *head, compat_size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (unlikely(len != sizeof(*head))) return -EINVAL; @@ -130,6 +136,9 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr, struct compat_robust_list_head __user *head; unsigned long ret; + if (!futex_cmpxchg_enabled) + return -ENOSYS; + if (!pid) head = current->compat_robust_list; else { -- cgit v1.2.2 From 43627582799db317e966ecb0002c2c3c9805ec0f Mon Sep 17 00:00:00 2001 From: Srinivasa Ds Date: Sat, 23 Feb 2008 15:24:04 -0800 Subject: kprobes: refuse kprobe insertion on add/sub_preempt_count() Kprobes makes use of preempt_disable(), preempt_enable_noresched() and these functions in turn call add/sub_preempt_count().
So we need to refuse user from inserting probe in to these functions. This patch disallows user from probing add/sub_preempt_count(). Signed-off-by: Srinivasa DS Acked-by: Ananth N Mavinakayanahalli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index f28f19e65b59..c4bc8c210958 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3766,7 +3766,7 @@ void scheduler_tick(void) #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) -void add_preempt_count(int val) +void __kprobes add_preempt_count(int val) { /* * Underflow? @@ -3782,7 +3782,7 @@ void add_preempt_count(int val) } EXPORT_SYMBOL(add_preempt_count); -void sub_preempt_count(int val) +void __kprobes sub_preempt_count(int val) { /* * Underflow? -- cgit v1.2.2 From a043e3b2c63445512c5592cbe3c8694f3c655e81 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:09 -0800 Subject: cgroup: fix comments fix: - comments about need_forkexit_callback - comments about release agent - typo and comment style, etc. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 142 +++++++++++++++++++++++++++++++------------------------- 1 file changed, 79 insertions(+), 63 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4766bb65e4d9..36066d8a4911 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -113,9 +113,9 @@ static int root_count; #define dummytop (&rootnode.top_cgroup) /* This flag indicates whether tasks in the fork and exit paths should - * take callback_mutex and check for fork/exit handlers to call. This - * avoids us having to do extra work in the fork/exit path if none of the - * subsystems need to be called. + * check for fork/exit handlers to call. 
This avoids us having to do + * extra work in the fork/exit path if none of the subsystems need to + * be called. */ static int need_forkexit_callback; @@ -307,7 +307,6 @@ static inline void put_css_set_taskexit(struct css_set *cg) * template: location in which to build the desired set of subsystem * state objects for the new cgroup group */ - static struct css_set *find_existing_css_set( struct css_set *oldcg, struct cgroup *cgrp, @@ -354,7 +353,6 @@ static struct css_set *find_existing_css_set( * and chains them on tmp through their cgrp_link_list fields. Returns 0 on * success or a negative error */ - static int allocate_cg_links(int count, struct list_head *tmp) { struct cg_cgroup_link *link; @@ -396,7 +394,6 @@ static void free_cg_links(struct list_head *tmp) * substituted into the appropriate hierarchy. Must be called with * cgroup_mutex held */ - static struct css_set *find_css_set( struct css_set *oldcg, struct cgroup *cgrp) { @@ -507,8 +504,8 @@ static struct css_set *find_css_set( * critical pieces of code here. The exception occurs on cgroup_exit(), * when a task in a notify_on_release cgroup exits. Then cgroup_mutex * is taken, and if the cgroup count is zero, a usermode call made - * to /sbin/cgroup_release_agent with the name of the cgroup (path - * relative to the root of cgroup file system) as the argument. + * to the release agent with the name of the cgroup (path relative to + * the root of cgroup file system) as the argument. * * A cgroup can only be deleted if both its 'count' of using tasks * is zero, and its list of 'children' cgroups is empty. Since all @@ -521,7 +518,7 @@ static struct css_set *find_css_set( * * The need for this exception arises from the action of * cgroup_attach_task(), which overwrites one tasks cgroup pointer with - * another. It does so using cgroup_mutexe, however there are + * another. 
It does so using cgroup_mutex, however there are * several performance critical places that need to reference * task->cgroup without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as @@ -537,7 +534,6 @@ static struct css_set *find_css_set( * cgroup_lock - lock out any changes to cgroup structures * */ - void cgroup_lock(void) { mutex_lock(&cgroup_mutex); @@ -548,7 +544,6 @@ void cgroup_lock(void) * * Undo the lock taken in a previous cgroup_lock() call. */ - void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); @@ -590,7 +585,6 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) * Call subsys's pre_destroy handler. * This is called before css refcnt check. */ - static void cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; @@ -600,7 +594,6 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp) return; } - static void cgroup_diput(struct dentry *dentry, struct inode *inode) { /* is dentry a directory ? if so, kfree() associated cgroup */ @@ -1129,8 +1122,13 @@ static inline struct cftype *__d_cft(struct dentry *dentry) return dentry->d_fsdata; } -/* - * Called with cgroup_mutex held. Writes path of cgroup into buf. +/** + * cgroup_path - generate the path of a cgroup + * @cgrp: the cgroup in question + * @buf: the buffer to write the path into + * @buflen: the length of the buffer + * + * Called with cgroup_mutex held. Writes path of cgroup into buf. * Returns 0 on success, -errno on error. */ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) @@ -1188,11 +1186,13 @@ static void get_first_subsys(const struct cgroup *cgrp, *subsys_id = test_ss->subsys_id; } -/* - * Attach task 'tsk' to cgroup 'cgrp' +/** + * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' + * @cgrp: the cgroup the task is attaching to + * @tsk: the task to be attached * - * Call holding cgroup_mutex. May take task_lock of - * the task 'pid' during call. 
+ * Call holding cgroup_mutex. May take task_lock of + * the task 'tsk' during call. */ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { @@ -1293,7 +1293,6 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) } /* The various types of files and directories in a cgroup file system */ - enum cgroup_filetype { FILE_ROOT, FILE_DIR, @@ -1584,12 +1583,11 @@ static int cgroup_create_file(struct dentry *dentry, int mode, } /* - * cgroup_create_dir - create a directory for an object. - * cgrp: the cgroup we create the directory for. - * It must have a valid ->parent field - * And we are going to fill its ->dentry field. - * dentry: dentry of the new cgroup - * mode: mode to set on new directory. + * cgroup_create_dir - create a directory for an object. + * @cgrp: the cgroup we create the directory for. It must have a valid + * ->parent field. And we are going to fill its ->dentry field. + * @dentry: dentry of the new cgroup + * @mode: mode to set on new directory. */ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry, int mode) @@ -1651,8 +1649,12 @@ int cgroup_add_files(struct cgroup *cgrp, return 0; } -/* Count the number of tasks in a cgroup. */ - +/** + * cgroup_task_count - count the number of tasks in a cgroup. + * @cgrp: the cgroup in question + * + * Return the number of tasks in the cgroup. + */ int cgroup_task_count(const struct cgroup *cgrp) { int count = 0; @@ -1962,12 +1964,13 @@ static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp) } /** - * Build and fill cgroupstats so that taskstats can export it to user - * space. - * + * cgroupstats_build - build and fill cgroupstats * @stats: cgroupstats to fill information into * @dentry: A dentry entry belonging to the cgroup for which stats have * been requested. + * + * Build and fill cgroupstats so that taskstats can export it to user + * space. 
*/ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { @@ -2199,14 +2202,13 @@ static void init_cgroup_css(struct cgroup_subsys_state *css, } /* - * cgroup_create - create a cgroup - * parent: cgroup that will be parent of the new cgroup. - * name: name of the new cgroup. Will be strcpy'ed. - * mode: mode to set on new inode + * cgroup_create - create a cgroup + * @parent: cgroup that will be parent of the new cgroup + * @dentry: dentry of the new cgroup + * @mode: mode to set on new inode * - * Must be called with the mutex on the parent inode held + * Must be called with the mutex on the parent inode held */ - static long cgroup_create(struct cgroup *parent, struct dentry *dentry, int mode) { @@ -2349,13 +2351,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) parent = cgrp->parent; root = cgrp->root; sb = root->sb; + /* - * Call pre_destroy handlers of subsys + * Call pre_destroy handlers of subsys. Notify subsystems + * that rmdir() request comes. */ cgroup_call_pre_destroy(cgrp); - /* - * Notify subsyses that rmdir() request comes. - */ if (cgroup_has_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); @@ -2431,8 +2432,10 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) } /** - * cgroup_init_early - initialize cgroups at system boot, and - * initialize any subsystems that request early init. + * cgroup_init_early - cgroup initialization at system boot + * + * Initialize cgroups at system boot, and initialize any + * subsystems that request early init. */ int __init cgroup_init_early(void) { @@ -2474,8 +2477,10 @@ int __init cgroup_init_early(void) } /** - * cgroup_init - register cgroup filesystem and /proc file, and - * initialize any subsystems that didn't request early init. + * cgroup_init - cgroup initialization + * + * Register cgroup filesystem and /proc file, and initialize + * any subsystems that didn't request early init. 
*/ int __init cgroup_init(void) { @@ -2618,7 +2623,7 @@ static struct file_operations proc_cgroupstats_operations = { /** * cgroup_fork - attach newly forked task to its parents cgroup. - * @tsk: pointer to task_struct of forking parent process. + * @child: pointer to task_struct of forking parent process. * * Description: A task inherits its parent's cgroup at fork(). * @@ -2642,9 +2647,12 @@ void cgroup_fork(struct task_struct *child) } /** - * cgroup_fork_callbacks - called on a new task very soon before - * adding it to the tasklist. No need to take any locks since no-one - * can be operating on this task + * cgroup_fork_callbacks - run fork callbacks + * @child: the new task + * + * Called on a new task very soon before adding it to the + * tasklist. No need to take any locks since no-one can + * be operating on this task. */ void cgroup_fork_callbacks(struct task_struct *child) { @@ -2659,11 +2667,14 @@ void cgroup_fork_callbacks(struct task_struct *child) } /** - * cgroup_post_fork - called on a new task after adding it to the - * task list. Adds the task to the list running through its css_set - * if necessary. Has to be after the task is visible on the task list - * in case we race with the first call to cgroup_iter_start() - to - * guarantee that the new task ends up on its list. */ + * cgroup_post_fork - called on a new task after adding it to the task list + * @child: the task in question + * + * Adds the task to the list running through its css_set if necessary. + * Has to be after the task is visible on the task list in case we race + * with the first call to cgroup_iter_start() - to guarantee that the + * new task ends up on its list. + */ void cgroup_post_fork(struct task_struct *child) { if (use_task_css_set_links) { @@ -2676,6 +2687,7 @@ void cgroup_post_fork(struct task_struct *child) /** * cgroup_exit - detach cgroup from exiting task * @tsk: pointer to task_struct of exiting process + * @run_callbacks: run exit callbacks?
* * Description: Detach cgroup from @tsk and release it. * @@ -2706,7 +2718,6 @@ void cgroup_post_fork(struct task_struct *child) * top_cgroup isn't going away, and either task has PF_EXITING set, * which wards off any cgroup_attach_task() attempts, or task is a failed * fork, never visible to cgroup_attach_task. - * */ void cgroup_exit(struct task_struct *tsk, int run_callbacks) { @@ -2743,9 +2754,13 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) } /** - * cgroup_clone - duplicate the current cgroup in the hierarchy - * that the given subsystem is attached to, and move this task into - * the new child + * cgroup_clone - clone the cgroup the given subsystem is attached to + * @tsk: the task to be moved + * @subsys: the given subsystem + * + * Duplicate the current cgroup in the hierarchy that the given + * subsystem is attached to, and move this task into the new + * child. */ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) { @@ -2858,9 +2873,12 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) return ret; } -/* - * See if "cgrp" is a descendant of the current task's cgroup in - * the appropriate hierarchy +/** + * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp + * @cgrp: the cgroup in question + * + * See if @cgrp is a descendant of the current task's cgroup in + * the appropriate hierarchy. * * If we are sending in dummytop, then presumably we are creating * the top cgroup in the subsystem. @@ -2939,9 +2957,7 @@ void __css_put(struct cgroup_subsys_state *css) * release agent task. We don't bother to wait because the caller of * this routine has no use for the exit status of the release agent * task, so no sense holding our caller up for that. 
- * */ - static void cgroup_release_agent(struct work_struct *work) { BUG_ON(work != &release_agent_work); -- cgit v1.2.2 From f777073848ba3708d68d87e43f104f83316187d7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:10 -0800 Subject: cgroup: fix memory leak in cgroup_get_sb() opts.release_agent is not kfree()ed in all necessary places. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 36066d8a4911..947fe3b22182 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -954,8 +954,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, } root = kzalloc(sizeof(*root), GFP_KERNEL); - if (!root) + if (!root) { + if (opts.release_agent) + kfree(opts.release_agent); return -ENOMEM; + } init_cgroup_root(root); root->subsys_bits = opts.subsys_bits; -- cgit v1.2.2 From 8d53d55d27754508e58e9ac18a4a445b110434bf Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: fix subsys bitops Cgroup uses unsigned long for subsys bitops, not unsigned long long. Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 947fe3b22182..841259361724 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -319,7 +319,7 @@ static struct css_set *find_existing_css_set( /* Built the set of subsystem state objects that we want to * see in the new css_set */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - if (root->subsys_bits & (1ull << i)) { + if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. 
So we want * the subsystem state from the new * cgroup */ @@ -689,7 +689,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { - unsigned long long bit = 1ull << i; + unsigned long bit = 1UL << i; struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; -- cgit v1.2.2 From 68db38f1537a44097e264f28bda751d6b919cd53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:11 -0800 Subject: cgroup: remove duplicate code in find_css_set() The list head res->tasks gets initialized twice in find_css_set(). Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 841259361724..2aa408201aa5 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -470,7 +470,6 @@ static struct css_set *find_css_set( /* Link this cgroup group into the list */ list_add(&res->list, &init_css_set.list); css_set_count++; - INIT_LIST_HEAD(&res->tasks); write_unlock(&css_set_lock); return res; -- cgit v1.2.2 From bc231d2a048010d5e0b49ac7fddbfa822fc41109 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 23 Feb 2008 15:24:12 -0800 Subject: cgroup: remove dead code in cgroup_get_rootdir() Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2aa408201aa5..d8abe996e009 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -919,7 +919,6 @@ static int cgroup_get_rootdir(struct super_block *sb) if (!inode) return -ENOMEM; - inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; inode->i_op = &cgroup_dir_inode_operations; /* directories 
start off with i_nlink == 2 (for "." entry) */ -- cgit v1.2.2 From 04e2f1741d235ba599037734878d72e57cb302b5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 23 Feb 2008 18:05:03 -0800 Subject: Add memory barrier semantics to wake_up() & co Oleg Nesterov and others have pointed out that on some architectures, the traditional sequence of set_current_state(TASK_INTERRUPTIBLE); if (CONDITION) return; schedule(); is racy wrt another CPU doing CONDITION = 1; wake_up_process(p); because while set_current_state() has a memory barrier separating setting of the TASK_INTERRUPTIBLE state from reading of the CONDITION variable, there is no such memory barrier on the wakeup side. Now, wake_up_process() does actually take a spinlock before it reads and sets the task state on the waking side, and on x86 (and many other architectures) that spinlock is in fact equivalent to a memory barrier, but that is not generally guaranteed. The write that sets CONDITION could move into the critical region protected by the runqueue spinlock. However, adding a smp_wmb() before the spinlock should now order the writing of CONDITION wrt the lock itself, which in turn is ordered wrt the accesses within the spinlock (which includes the reading of the old state). This should thus close the race (which probably has never been seen in practice, but since smp_wmb() is a no-op on x86, it's not like this will make anything worse either on the most common architecture where the spinlock already gave the required protection). 
Acked-by: Oleg Nesterov Acked-by: Dmitry Adamushko Cc: Andrew Morton Cc: Nick Piggin Signed-off-by: Linus Torvalds --- kernel/sched.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index c4bc8c210958..b387a8de26a5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1831,6 +1831,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) long old_state; struct rq *rq; + smp_wmb(); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) -- cgit v1.2.2