From 36cd3c9f925b9307236505ae7ad1ad7ac4d4357c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Apr 2009 18:48:34 +0200 Subject: mutex: have non-spinning mutexes on s390 by default Impact: performance regression fix for s390 The adaptive spinning mutexes will not always do what one would expect on virtualized architectures like s390. Especially the cpu_relax() loop in mutex_spin_on_owner might hurt if the mutex holding cpu has been scheduled away by the hypervisor. We would end up in a cpu_relax() loop when there is no chance that the state of the mutex changes until the target cpu has been scheduled again by the hypervisor. For that reason we should change the default behaviour to no-spin on s390. We do have an instruction which allows to yield the current cpu in favour of a different target cpu. Also we have an instruction which allows us to figure out if the target cpu is physically backed. However we need to do some performance tests until we can come up with a solution that will do the right thing on s390. Signed-off-by: Heiko Carstens Acked-by: Peter Zijlstra Cc: Martin Schwidefsky Cc: Christian Borntraeger LKML-Reference: <20090409184834.7a0df7b2@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/mutex.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a3..507cf2b5e9f1 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -148,7 +148,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire(&lock->dep_map, subclass, 0, ip); -#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) +#if defined(CONFIG_SMP) && !defined(CONFIG_DEBUG_MUTEXES) && \ + !defined(CONFIG_HAVE_DEFAULT_NO_SPIN_MUTEXES) /* * Optimistic spinning. 
* -- cgit v1.2.2 From 4d1f4372dbea068ba4ee3d98231133a4a4ee15bd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 10 Apr 2009 08:48:36 +0800 Subject: tracing: fix document references When moving documents to Documentation/trace/, I forgot to grep Kconfig to find those references. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Pekka Enberg Cc: Pekka Paalanen Cc: eduard.munteanu@linux360.ro LKML-Reference: <49DE97EF.7080208@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 2246141bda4d..417d1985e299 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -312,7 +312,7 @@ config KMEMTRACE and profile kernel code. This requires an userspace application to use. See - Documentation/vm/kmemtrace.txt for more information. + Documentation/trace/kmemtrace.txt for more information. Saying Y will make the kernel somewhat larger and slower. However, if you disable kmemtrace at run-time or boot-time, the performance @@ -403,7 +403,7 @@ config MMIOTRACE implementation and works via page faults. Tracing is disabled by default and can be enabled at run-time. - See Documentation/tracers/mmiotrace.txt. + See Documentation/trace/mmiotrace.txt. If you are not helping to develop drivers, say N. config MMIOTRACE_TEST -- cgit v1.2.2 From 0462b5664b2bda5a18fef7efb5bb32ce36590c1a Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Wed, 8 Apr 2009 17:00:13 +0800 Subject: ftrace: Output REC->var instead of __entry->var for trace format print fmt: "irq=%d return=%s", __entry->irq, __entry->ret ? \"handled\" : \"unhandled\" "__entry" should be converted to "REC" by the __stringify() macro.
Signed-off-by: Zhao Lei Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49DC679D.2090901@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_stage_2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d4110..d363c6672c6c 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -105,10 +105,10 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return 0; #undef __entry -#define __entry "REC" +#define __entry REC #undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) #undef TP_fast_assign #define TP_fast_assign(args...) args -- cgit v1.2.2 From d6de2c80e9d758d2e36c21699117db6178c0f517 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 10 Apr 2009 12:17:41 -0700 Subject: async: Fix module loading async-work regression Several drivers use asynchronous work to do device discovery, and we synchronize with them in the compiled-in case before we actually try to mount root filesystems etc. However, when compiled as modules, that synchronization is missing - the module loading completes, but the driver hasn't actually finished probing for devices, and that means that any user mode that expects to use the devices after the 'insmod' is now potentially broken. We already saw one case of a similar issue in the ACPI battery code, where the kernel itself expected the module to be all done, and unmapped the init memory - but the async device discovery was still running. That got hacked around by just removing the "__init" (see commit 5d38258ec026921a7b266f4047ebeaa75db358e5 "ACPI battery: fix async boot oops"), but the real fix is to just make the module loading wait for all async work to be completed. 
It will slow down module loading, but since common devices should be built in anyway, and since the bug is really annoying and hard to handle from user space (and caused several S3 resume regressions), the simple fix to wait is the right one. This fixes at least http://bugzilla.kernel.org/show_bug.cgi?id=13063 but probably a few other bugzilla entries too (12936, for example), and is confirmed to fix Rafael's storage driver breakage after resume bug report (no bugzilla entry). We should also be able to now revert that ACPI battery fix. Reported-and-tested-by: Rafael J. Wysocki Tested-by: Heinz Diehl Acked-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 05f014efa32c..e797812a4d95 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2388,6 +2388,9 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); + /* We need to finish all async code before the module init sequence is done */ + async_synchronize_full(); + mutex_lock(&module_mutex); /* Drop initial reference. */ module_put(mod); -- cgit v1.2.2 From 8433a40eb7f2c4883ad57f9900f63e4d59240eb7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 11 Apr 2009 15:52:18 +0800 Subject: tracing/filters: NIL-terminate user input filter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make sure messages from user space are NIL-terminated strings, otherwise we could dump random memory while reading filter file. 
Try this: # echo 'parent_comm ==' > events/sched/sched_process_fork/filter # cat events/sched/sched_process_fork/filter parent_comm == � Signed-off-by: Li Zefan Acked-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E04C32.6060508@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ffb..054bc1802bcd 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -503,6 +503,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) @@ -569,6 +570,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (copy_from_user(&buf, ubuf, cnt)) return -EFAULT; + buf[cnt] = '\0'; pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) -- cgit v1.2.2 From bcabd91c271e50eebc0cb9220ac92700332b452e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 11 Apr 2009 15:52:35 +0800 Subject: tracing/filters: fix NULL pointer dereference Try this, and you'll see NULL pointer dereference bug: # echo -n 'parent_comm ==' > sched/sched_process_fork/filter Because we passed NULL ptr to simple_strtoull(). 
Signed-off-by: Li Zefan Acked-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E04C43.1050504@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f356..9d2162fd2305 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -410,6 +410,11 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } + if (!val_str) { + pred->field_name = NULL; + return -EINVAL; + } + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; -- cgit v1.2.2 From a3e0ab050774117d4a6173087c8bf3888662a83f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 11 Apr 2009 15:52:51 +0800 Subject: tracing/filters: allow user input integer to be oct or hex Before patch: # echo 'parent_pid == 0x10' > events/sched/sched_process_fork/filter # cat sched/sched_process_fork/filter parent_pid == 0 After patch: # cat sched/sched_process_fork/filter parent_pid == 16 Also check the input more strictly. 
Signed-off-by: Li Zefan Acked-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E04C53.4010600@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9d2162fd2305..49b3ef54ec46 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -419,12 +419,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred) if (!pred->field_name) return -ENOMEM; - pred->val = simple_strtoull(val_str, &tmp, 10); + pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { pred->str_val = kstrdup(val_str, GFP_KERNEL); if (!pred->str_val) return -ENOMEM; - } + } else if (*tmp != '\0') + return -EINVAL; return 0; } -- cgit v1.2.2 From 44e9c8b7adc52079f0535f9de0c2c2477831389b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Sat, 11 Apr 2009 15:55:28 +0800 Subject: tracing/filters: return proper error code when writing filter file - propagate return value of filter_add_pred() to the user - return -ENOSPC but not -ENOMEM or -EINVAL when the filter array is full Signed-off-by: Li Zefan Acked-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E04CF0.3010105@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 10 ++++++---- kernel/trace/trace_events_filter.c | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 054bc1802bcd..576f4fa2af0d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -521,9 +521,10 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - if (filter_add_pred(call, pred)) { + err = filter_add_pred(call, pred); + if (err < 0) { filter_free_pred(pred); - return -EINVAL; + return err; } 
*ppos += cnt; @@ -588,10 +589,11 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } - if (filter_add_subsystem_pred(system, pred)) { + err = filter_add_subsystem_pred(system, pred); + if (err < 0) { filter_free_subsystem_preds(system); filter_free_pred(pred); - return -EINVAL; + return err; } *ppos += cnt; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 49b3ef54ec46..e03cbf1e38f3 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -215,7 +215,7 @@ static int __filter_add_pred(struct ftrace_event_call *call, } } - return -ENOMEM; + return -ENOSPC; } static int is_string_field(const char *type) @@ -319,7 +319,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, } if (i == MAX_FILTER_PRED) - return -EINVAL; + return -ENOSPC; events_for_each(call) { int err; -- cgit v1.2.2 From 9eeba6138cefc0435695463ddadb0d95e0a6bcd2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 11 Apr 2009 03:17:17 +0200 Subject: lockdep: warn about lockdep disabling after kernel taint Impact: provide useful missing info for developers Kernel taint can occur in several situations such as warnings, load of proprietary or staging modules, bad page, etc... But when such taint happens, a developer might still be working on the kernel, expecting that lockdep is still enabled. But a taint disables lockdep without ever warning about it. Such a kernel behaviour doesn't really help for kernel development. This patch adds this missing warning. Since the taint is done most of the time after the main message that explains the real source issue, it seems safe to warn about it inside add_taint() so that it appears last, without hurting the main information. v2: Use a generic helper to disable lockdep instead of an open coded xchg().
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra LKML-Reference: <1239412638-6739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 3fd8c5bf8b39..940ca14f6dbf 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -213,8 +213,14 @@ unsigned long get_taint(void) void add_taint(unsigned flag) { - /* can't trust the integrity of the kernel anymore: */ - debug_locks = 0; + /* + * Can't trust the integrity of the kernel anymore. + * We don't call directly debug_locks_off() because the issue + * is not necessarily serious enough to set oops_in_progress to 1 + */ + if (__debug_locks_off()) + printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); + set_bit(flag, &tainted_mask); } EXPORT_SYMBOL(add_taint); -- cgit v1.2.2 From 574bbe782057fdf0490dc7dec906a2dc26363e20 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 11 Apr 2009 03:17:18 +0200 Subject: lockdep: continue lock debugging despite some taints Impact: broaden lockdep checks Lockdep is disabled after any kernel taint. This might be convenient to ignore bad locking issues whose sources come from outside the kernel tree. Nevertheless, it might be a frustrating experience for the staging developers or those who experience a warning but are focused on other things that require lockdep. The v2 of this patch simply doesn't disable lockdep anymore in case of TAINT_CRAP and TAINT_WARN events.
Signed-off-by: Frederic Weisbecker Cc: LTP Cc: Peter Zijlstra Cc: Greg KH LKML-Reference: <1239412638-6739-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/panic.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 940ca14f6dbf..934fb377f4b3 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -217,8 +217,10 @@ void add_taint(unsigned flag) * Can't trust the integrity of the kernel anymore. * We don't call directly debug_locks_off() because the issue * is not necessarily serious enough to set oops_in_progress to 1 + * Also we want to keep up lockdep for staging development and + * post-warning case. */ - if (__debug_locks_off()) + if (flag != TAINT_CRAP && flag != TAINT_WARN && __debug_locks_off()) printk(KERN_WARNING "Disabling lockdep due to kernel taint\n"); set_bit(flag, &tainted_mask); -- cgit v1.2.2 From c751085943362143f84346d274e0011419c84202 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 12 Apr 2009 20:06:56 +0200 Subject: PM/Hibernate: Wait for SCSI devices scan to complete during resume There is a race between resume from hibernation and the asynchronous scanning of SCSI devices and to prevent it from happening we need to call scsi_complete_async_scans() during resume from hibernation. In addition, if the resume from hibernation is userland-driven, it's better to wait for all device probes in the kernel to complete before attempting to open the resume device. Signed-off-by: Rafael J. 
Wysocki Acked-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 8 ++++++++ kernel/power/user.c | 9 +++++++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 5f21ab2bbcdf..0854770b63b9 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "power.h" @@ -644,6 +645,13 @@ static int software_resume(void) if (noresume) return 0; + /* + * We can't depend on SCSI devices being available after loading one of + * their modules if scsi_complete_async_scans() is not called and the + * resume device usually is a SCSI one. + */ + scsi_complete_async_scans(); + /* * name_to_dev_t() below takes a sysfs buffer mutex when sysfs * is configured into the kernel. Since the regular hibernate diff --git a/kernel/power/user.c b/kernel/power/user.c index 6c85359364f2..ed97375daae9 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -24,6 +24,7 @@ #include #include #include +#include #include @@ -92,6 +93,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) filp->private_data = data; memset(&data->handle, 0, sizeof(struct snapshot_handle)); if ((filp->f_flags & O_ACCMODE) == O_RDONLY) { + /* Hibernating. The image device should be accessible. */ data->swap = swsusp_resume_device ? swap_type_of(swsusp_resume_device, 0, NULL) : -1; data->mode = O_RDONLY; @@ -99,6 +101,13 @@ static int snapshot_open(struct inode *inode, struct file *filp) if (error) pm_notifier_call_chain(PM_POST_HIBERNATION); } else { + /* + * Resuming. We may need to wait for the image device to + * appear. + */ + wait_for_device_probe(); + scsi_complete_async_scans(); + data->swap = -1; data->mode = O_WRONLY; error = pm_notifier_call_chain(PM_RESTORE_PREPARE); -- cgit v1.2.2