From dd17c8f72993f9461e9c19250e3f155d6d99df22 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 29 Oct 2009 22:34:15 +0900 Subject: percpu: remove per_cpu__ prefix. Now that the return from alloc_percpu is compatible with the address of per-cpu vars, it makes sense to hand around the address of per-cpu variables. To make this sane, we remove the per_cpu__ prefix we used created to stop people accidentally using these vars directly. Now we have sparse, we can use that (next patch). tj: * Updated to convert stuff which were missed by or added after the original patch. * Kill per_cpu_var() macro. Signed-off-by: Rusty Russell Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter --- kernel/rcutorture.c | 8 ++++---- kernel/trace/trace.c | 6 +++--- kernel/trace/trace_functions_graph.c | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 178967b6434e..e339ab349121 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -731,13 +731,13 @@ static void rcu_torture_timer(unsigned long unused) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); } @@ -786,13 +786,13 @@ rcu_torture_reader(void *arg) /* Should not happen, but... */ pipe_count = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_count)[pipe_count]); + __this_cpu_inc(rcu_torture_count[pipe_count]); completed = cur_ops->completed() - completed; if (completed > RCU_TORTURE_PIPE_LEN) { /* Should not happen, but... */ completed = RCU_TORTURE_PIPE_LEN; } - __this_cpu_inc(per_cpu_var(rcu_torture_batch)[completed]); + __this_cpu_inc(rcu_torture_batch[completed]); preempt_enable(); cur_ops->readunlock(idx); schedule(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 85a5ed70b5b2..b808177af816 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -91,12 +91,12 @@ DEFINE_PER_CPU(int, ftrace_cpu_disabled); static inline void ftrace_disable_cpu(void) { preempt_disable(); - __this_cpu_inc(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_inc(ftrace_cpu_disabled); } static inline void ftrace_enable_cpu(void) { - __this_cpu_dec(per_cpu_var(ftrace_cpu_disabled)); + __this_cpu_dec(ftrace_cpu_disabled); preempt_enable(); } @@ -1085,7 +1085,7 @@ trace_function(struct trace_array *tr, struct ftrace_entry *entry; /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 90a6daa10962..8614e3241ff8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -176,7 +176,7 @@ static int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return 0; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, @@ -240,7 +240,7 @@ static void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(per_cpu_var(ftrace_cpu_disabled)))) + if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) return; event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, -- cgit v1.2.2 From 86fc80f16e8a2449d5827bf1a9838b7fd9f70097 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 9 Dec 2009 17:13:31 +0100 Subject: capabilities: Use RCU to protect task lookup in sys_capget cap_get_target_pid() protects the task lookup with tasklist_lock. security_capget() is called under tasklist_lock as well but tasklist_lock does not protect anything there. The capabilities are protected by RCU already. So tasklist_lock only protects the lookup and prevents the task going away, which can be done with rcu_read_lock() as well. Signed-off-by: Thomas Gleixner Signed-off-by: James Morris --- kernel/capability.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 7f876e60521f..9e4697e9b276 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -135,7 +135,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; - read_lock(&tasklist_lock); + rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) @@ -143,7 +143,7 @@ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, else ret = security_capget(target, pEp, pIp, pPp); - read_unlock(&tasklist_lock); + rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); -- cgit v1.2.2 From e1783a240f491fb233f04edc042e16b18a7a79ba Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: module: Use this_cpu_xx to dynamically allocate counters Use cpu ops to deal with the per cpu data instead of a local_t. Reduces memory requirements, cache footprint and decreases cycle counts. The this_cpu_xx operations are also used for !SMP mode. Otherwise we could not drop the use of __module_ref_addr() which would make per cpu data handling complicated. this_cpu_xx operations have their own fallback for !SMP. V8-V9: - Leave include asm/module.h since ringbuffer.c depends on it. Nothing else does though. Another patch will deal with that. - Remove spurious free. Signed-off-by: Christoph Lameter Acked-by: Rusty Russell Signed-off-by: Tejun Heo --- kernel/module.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e96b8ed1cb6a..9bf228052ec5 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -474,9 +474,10 @@ static void module_unload_init(struct module *mod) INIT_LIST_HEAD(&mod->modules_which_use_me); for_each_possible_cpu(cpu) - local_set(__module_ref_addr(mod, cpu), 0); + per_cpu_ptr(mod->refptr, cpu)->count = 0; + /* Hold reference count during initialization. */ - local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1); + __this_cpu_write(mod->refptr->count, 1); /* Backwards compatibility macros put refcount during init. */ mod->waiter = current; } @@ -619,7 +620,7 @@ unsigned int module_refcount(struct module *mod) int cpu; for_each_possible_cpu(cpu) - total += local_read(__module_ref_addr(mod, cpu)); + total += per_cpu_ptr(mod->refptr, cpu)->count; return total; } EXPORT_SYMBOL(module_refcount); @@ -796,14 +797,15 @@ static struct module_attribute refcnt = { void module_put(struct module *module) { if (module) { - unsigned int cpu = get_cpu(); - local_dec(__module_ref_addr(module, cpu)); + preempt_disable(); + __this_cpu_dec(module->refptr->count); + trace_module_put(module, _RET_IP_, - local_read(__module_ref_addr(module, cpu))); + __this_cpu_read(module->refptr->count)); /* Maybe they're waiting for us to drop reference? */ if (unlikely(!module_is_live(module))) wake_up_process(module->waiter); - put_cpu(); + preempt_enable(); } } EXPORT_SYMBOL(module_put); @@ -1394,9 +1396,9 @@ static void free_module(struct module *mod) kfree(mod->args); if (mod->percpu) percpu_modfree(mod->percpu); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) +#if defined(CONFIG_MODULE_UNLOAD) if (mod->refptr) - percpu_modfree(mod->refptr); + free_percpu(mod->refptr); #endif /* Free lock-classes: */ lockdep_free_key_range(mod->module_core, mod->core_size); @@ -2159,9 +2161,8 @@ static noinline struct module *load_module(void __user *umod, mod = (void *)sechdrs[modindex].sh_addr; kmemleak_load_module(mod, hdr, sechdrs, secstrings); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t), - mod->name); +#if defined(CONFIG_MODULE_UNLOAD) + mod->refptr = alloc_percpu(struct module_ref); if (!mod->refptr) { err = -ENOMEM; goto free_init; @@ -2393,8 +2394,8 @@ static noinline struct module *load_module(void __user *umod, kobject_put(&mod->mkobj.kobj); free_unload: module_unload_free(mod); -#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) - percpu_modfree(mod->refptr); +#if defined(CONFIG_MODULE_UNLOAD) + free_percpu(mod->refptr); free_init: #endif module_free(mod, mod->module_init); -- cgit v1.2.2 From 79615760f380ec86cd58204744e774c33fab9211 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 5 Jan 2010 15:34:50 +0900 Subject: local_t: Move local.h include to ringbuffer.c and ring_buffer_benchmark.c ringbuffer*.c are the last users of local.h. Remove the include from modules.h and add it to ringbuffer files. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo --- kernel/trace/ring_buffer.c | 1 + kernel/trace/ring_buffer_benchmark.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 2326b04c95c4..eb6c8988c31a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -20,6 +20,7 @@ #include #include +#include #include "trace.h" /* diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index b2477caf09c2..df74c7982255 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -8,6 +8,7 @@ #include #include #include +#include struct rb_page { u64 ts; -- cgit v1.2.2 From 002345925e6c45861f60db6f4fc6236713fd8847 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:36:43 -0800 Subject: syslog: distinguish between /proc/kmsg and syscalls This allows the LSM to distinguish between syslog functions originating from /proc/kmsg access and direct syscalls. By default, the commoncaps will now no longer require CAP_SYS_ADMIN to read an opened /proc/kmsg file descriptor. For example the kernel syslog reader can now drop privileges after opening /proc/kmsg, instead of staying privileged with CAP_SYS_ADMIN. MAC systems that implement security_syslog have unchanged behavior. Signed-off-by: Kees Cook Acked-by: Serge Hallyn Acked-by: John Johansen Signed-off-by: James Morris --- kernel/printk.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 17463ca2e229..809cf9a258a0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -273,14 +274,14 @@ static inline void boot_delay_msec(void) * 9 -- Return number of unread characters in the log buffer * 10 -- Return size of the log buffer */ -int do_syslog(int type, char __user *buf, int len) +int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; int do_clear = 0; char c; int error = 0; - error = security_syslog(type); + error = security_syslog(type, from_file); if (error) return error; @@ -417,7 +418,7 @@ out: SYSCALL_DEFINE3(syslog, int, type, char __user *, buf, int, len) { - return do_syslog(type, buf, len); + return do_syslog(type, buf, len, SYSLOG_FROM_CALL); } /* -- cgit v1.2.2 From d78ca3cd733d8a2c3dcd88471beb1a15d973eed8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Feb 2010 15:37:13 -0800 Subject: syslog: use defined constants instead of raw numbers Right now the syslog "type" action are just raw numbers which makes the source difficult to follow. This patch replaces the raw numbers with defined constants for some level of sanity. Signed-off-by: Kees Cook Acked-by: John Johansen Acked-by: Serge Hallyn Signed-off-by: James Morris --- kernel/printk.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 809cf9a258a0..3e162d867098 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -259,21 +259,6 @@ static inline void boot_delay_msec(void) } #endif -/* - * Commands to do_syslog: - * - * 0 -- Close the log. Currently a NOP. - * 1 -- Open the log. Currently a NOP. - * 2 -- Read from the log. - * 3 -- Read all messages remaining in the ring buffer. - * 4 -- Read and clear all messages remaining in the ring buffer - * 5 -- Clear ring buffer. - * 6 -- Disable printk's to console - * 7 -- Enable printk's to console - * 8 -- Set level of messages printed to console - * 9 -- Return number of unread characters in the log buffer - * 10 -- Return size of the log buffer - */ int do_syslog(int type, char __user *buf, int len, bool from_file) { unsigned i, j, limit, count; @@ -286,11 +271,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) return error; switch (type) { - case 0: /* Close log */ + case SYSLOG_ACTION_CLOSE: /* Close log */ break; - case 1: /* Open log */ + case SYSLOG_ACTION_OPEN: /* Open log */ break; - case 2: /* Read from log */ + case SYSLOG_ACTION_READ: /* Read from log */ error = -EINVAL; if (!buf || len < 0) goto out; @@ -321,10 +306,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) if (!error) error = i; break; - case 4: /* Read/clear last kernel messages */ + /* Read/clear last kernel messages */ + case SYSLOG_ACTION_READ_CLEAR: do_clear = 1; /* FALL THRU */ - case 3: /* Read last kernel messages */ + /* Read last kernel messages */ + case SYSLOG_ACTION_READ_ALL: error = -EINVAL; if (!buf || len < 0) goto out; @@ -377,21 +364,25 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) } } break; - case 5: /* Clear ring buffer */ + /* Clear ring buffer */ + case SYSLOG_ACTION_CLEAR: logged_chars = 0; break; - case 6: /* Disable logging to console */ + /* Disable logging to console */ + case SYSLOG_ACTION_CONSOLE_OFF: if (saved_console_loglevel == -1) saved_console_loglevel = console_loglevel; console_loglevel = minimum_console_loglevel; break; - case 7: /* Enable logging to console */ + /* Enable logging to console */ + case SYSLOG_ACTION_CONSOLE_ON: if (saved_console_loglevel != -1) { console_loglevel = saved_console_loglevel; saved_console_loglevel = -1; } break; - case 8: /* Set level of messages printed to console */ + /* Set level of messages printed to console */ + case SYSLOG_ACTION_CONSOLE_LEVEL: error = -EINVAL; if (len < 1 || len > 8) goto out; @@ -402,10 +393,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file) saved_console_loglevel = -1; error = 0; break; - case 9: /* Number of chars in the log buffer */ + /* Number of chars in the log buffer */ + case SYSLOG_ACTION_SIZE_UNREAD: error = log_end - log_start; break; - case 10: /* Size of the log buffer */ + /* Size of the log buffer */ + case SYSLOG_ACTION_SIZE_BUFFER: error = log_buf_len; break; default: -- cgit v1.2.2 From c41b20e721ea4f6f20f66a66e7f0c3c97a2ca9c2 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:39 -0500 Subject: Fix misspellings of "truly" in comments. Some comments misspell "truly"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- kernel/trace/ring_buffer.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..9ab578f1bb65 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2541,7 +2541,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable); * @buffer: The ring buffer to enable writes * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable(struct ring_buffer *buffer) { @@ -2577,7 +2577,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu); * @cpu: The CPU to enable. * * Note, multiple disables will need the same number of enables - * to truely enable the writing (much like preempt_disable). + * to truly enable the writing (much like preempt_disable). */ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) { -- cgit v1.2.2 From 2a61aa401638529cd4231f6106980d307fba98fa Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 11 Dec 2009 16:35:40 -0500 Subject: Fix misspellings of "invocation" in comments. Some comments misspell "invocation"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- kernel/sched_cpupri.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index 597b33099dfa..3db4b1a0e921 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -58,7 +58,7 @@ static int convert_prio(int prio) * @lowest_mask: A mask to fill in with selected CPUs (or NULL) * * Note: This function returns the recommended CPUs as calculated during the - * current invokation. By the time the call returns, the CPUs may have in + * current invocation. By the time the call returns, the CPUs may have in * fact changed priorities any number of times. While not ideal, it is not * an issue of correctness since the normal rebalancer logic will correct * any discrepancies created by racing against the uncertainty of the current -- cgit v1.2.2 From c9404c9c392d557a4687c4cbda022b03cb787ce9 Mon Sep 17 00:00:00 2001 From: Adam Buchbinder Date: Fri, 18 Dec 2009 15:40:42 -0500 Subject: Fix misspelling of "should" and "shouldn't" in comments. Some comments misspell "should" or "shouldn't"; this fixes them. No code changes. Signed-off-by: Adam Buchbinder Signed-off-by: Jiri Kosina --- kernel/audit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/audit.c b/kernel/audit.c index 5feed232be9d..78f7f86aa238 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -398,7 +398,7 @@ static void kauditd_send_skb(struct sk_buff *skb) skb_get(skb); err = netlink_unicast(audit_sock, skb, audit_nlk_pid, 0); if (err < 0) { - BUG_ON(err != -ECONNREFUSED); /* Shoudn't happen */ + BUG_ON(err != -ECONNREFUSED); /* Shouldn't happen */ printk(KERN_ERR "audit: *NO* daemon at audit_pid=%d\n", audit_pid); audit_log_lost("auditd dissapeared\n"); audit_pid = 0; -- cgit v1.2.2 From af66585270ef99aa6097faf3bd7344855077e75d Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Sun, 17 Jan 2010 19:14:26 -0200 Subject: fix comment typo boo -> boot in ksysfs.c Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: Jiri Kosina --- kernel/ksysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 3feaf5a74514..ac08efca54c3 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -33,7 +33,7 @@ static ssize_t uevent_seqnum_show(struct kobject *kobj, } KERNEL_ATTR_RO(uevent_seqnum); -/* uevent helper program, used during early boo */ +/* uevent helper program, used during early boot */ static ssize_t uevent_helper_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { -- cgit v1.2.2 From 350f82586b7554240bee18c41cc5c842f63265ae Mon Sep 17 00:00:00 2001 From: "Edward Z. Yang" Date: Mon, 1 Feb 2010 18:26:59 -0500 Subject: Remove redundant trailing semicolons from macros Signed-off-by: Edward Z. Yang Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/params.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..2278ce244cf8 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -402,8 +402,8 @@ int param_get_string(char *buffer, struct kernel_param *kp) } /* sysfs output in /sys/modules/XYZ/parameters/ */ -#define to_module_attr(n) container_of(n, struct module_attribute, attr); -#define to_module_kobject(n) container_of(n, struct module_kobject, kobj); +#define to_module_attr(n) container_of(n, struct module_attribute, attr) +#define to_module_kobject(n) container_of(n, struct module_kobject, kobj) extern struct kernel_param __start___param[], __stop___param[]; @@ -421,7 +421,7 @@ struct module_param_attrs }; #ifdef CONFIG_SYSFS -#define to_param_attr(n) container_of(n, struct param_attribute, mattr); +#define to_param_attr(n) container_of(n, struct param_attribute, mattr) static ssize_t param_attr_show(struct module_attribute *mattr, struct module *mod, char *buf) -- cgit v1.2.2 From 9ce8e498ee58bb8a866a6c3c08fcb385ed66e9d2 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 2 Feb 2010 08:54:51 +0200 Subject: devres: typo fix s/dev/devm/ Signed-off-by: Baruch Siach Signed-off-by: Jiri Kosina --- kernel/irq/devres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index d06df9c41cba..30d56bafc9c2 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -81,7 +81,7 @@ EXPORT_SYMBOL(devm_request_threaded_irq); * Except for the extra @dev argument, this function takes the * same arguments and performs the same function as free_irq(). * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). + * free IRQs allocated with devm_request_irq(). */ void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) { -- cgit v1.2.2 From 1537a3638cbf741d3826c1002026cce487a6bee0 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Fri, 29 Jan 2010 15:57:49 +0800 Subject: tree-wide: fix 'lenght' typo in comments and code Some misspelled occurences of 'octet' and some comments were also fixed as I was on it. Signed-off-by: Daniel Mack Cc: Jiri Kosina Cc: Joe Perches Cc: Junio C Hamano Signed-off-by: Jiri Kosina --- kernel/trace/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..e4b32c8aa85f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -549,7 +549,7 @@ static inline int ftrace_trace_task(struct task_struct *task) * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input - * @idx: user input lenght + * @idx: user input length * @size: buffer size */ struct trace_parser { -- cgit v1.2.2 From cccc6bba3f771ef29b33e4f79e70ebc3dba245b0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 25 Dec 2009 05:07:33 -0500 Subject: Lose the first argument of audit_inode_child() it's always equal to ->d_name.name of the second argument Signed-off-by: Al Viro --- kernel/auditsc.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/auditsc.c b/kernel/auditsc.c index fc0f928167e7..f3a461c0970a 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1988,7 +1988,6 @@ void __audit_inode(const char *name, const struct dentry *dentry) /** * audit_inode_child - collect inode info for created/removed objects - * @dname: inode's dentry name * @dentry: dentry being audited * @parent: inode of dentry parent * @@ -2000,13 +1999,14 @@ void __audit_inode(const char *name, const struct dentry *dentry) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const char *dname, const struct dentry *dentry, +void __audit_inode_child(const struct dentry *dentry, const struct inode *parent) { int idx; struct audit_context *context = current->audit_context; const char *found_parent = NULL, *found_child = NULL; const struct inode *inode = dentry->d_inode; + const char *dname = dentry->d_name.name; int dirlen = 0; if (!context->in_syscall) @@ -2014,9 +2014,6 @@ void __audit_inode_child(const char *dname, const struct dentry *dentry, if (inode) handle_one(inode); - /* determine matching parent */ - if (!dname) - goto add_names; /* parent is more likely, look for it first */ for (idx = 0; idx < context->name_count; idx++) { -- cgit v1.2.2 From b80109e256bc17ed66c9d559175f087b03ca2a8e Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 9 Feb 2010 15:07:40 +1100 Subject: Remove reference to kthread_create_on_cpu kthread_create_on_cpu doesn't exist so update a comment in kthread.c to reflect this. Signed-off-by: Anton Blanchard Acked-by: Rusty Russell Signed-off-by: Jiri Kosina --- kernel/kthread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kthread.c b/kernel/kthread.c index fbb6222fe7e0..82ed0ea15194 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -101,7 +101,7 @@ static void create_kthread(struct kthread_create_info *create) * * Description: This helper function creates and names a kernel * thread. The thread will be stopped: use wake_up_process() to start - * it. See also kthread_run(), kthread_create_on_cpu(). + * it. See also kthread_run(). * * When woken, the thread will run @threadfn() with @data as its * argument. @threadfn() can either call do_exit() directly if it is a -- cgit v1.2.2 From ced5b697a76d325e7a7ac7d382dbbb632c765093 Mon Sep 17 00:00:00 2001 From: Brandon Phiilps Date: Wed, 10 Feb 2010 01:20:06 -0800 Subject: x86: Avoid race condition in pci_enable_msix() Keep chip_data in create_irq_nr and destroy_irq. When two drivers are setting up MSI-X at the same time via pci_enable_msix() there is a race. See this dmesg excerpt: [ 85.170610] ixgbe 0000:02:00.1: irq 97 for MSI/MSI-X [ 85.170611] alloc irq_desc for 99 on node -1 [ 85.170613] igb 0000:08:00.1: irq 98 for MSI/MSI-X [ 85.170614] alloc kstat_irqs on node -1 [ 85.170616] alloc irq_2_iommu on node -1 [ 85.170617] alloc irq_desc for 100 on node -1 [ 85.170619] alloc kstat_irqs on node -1 [ 85.170621] alloc irq_2_iommu on node -1 [ 85.170625] ixgbe 0000:02:00.1: irq 99 for MSI/MSI-X [ 85.170626] alloc irq_desc for 101 on node -1 [ 85.170628] igb 0000:08:00.1: irq 100 for MSI/MSI-X [ 85.170630] alloc kstat_irqs on node -1 [ 85.170631] alloc irq_2_iommu on node -1 [ 85.170635] alloc irq_desc for 102 on node -1 [ 85.170636] alloc kstat_irqs on node -1 [ 85.170639] alloc irq_2_iommu on node -1 [ 85.170646] BUG: unable to handle kernel NULL pointer dereference at 0000000000000088 As you can see igb and ixgbe are both alternating on create_irq_nr() via pci_enable_msix() in their probe function. ixgbe: While looping through irq_desc_ptrs[] via create_irq_nr() ixgbe choses irq_desc_ptrs[102] and exits the loop, drops vector_lock and calls dynamic_irq_init. Then it sets irq_desc_ptrs[102]->chip_data = NULL via dynamic_irq_init(). igb: Grabs the vector_lock now and starts looping over irq_desc_ptrs[] via create_irq_nr(). It gets to irq_desc_ptrs[102] and does this: cfg_new = irq_desc_ptrs[102]->chip_data; if (cfg_new->vector != 0) continue; This hits the NULL deref. Another possible race exists via pci_disable_msix() in a driver or in the number of error paths that call free_msi_irqs(): destroy_irq() dynamic_irq_cleanup() which sets desc->chip_data = NULL ...race window... desc->chip_data = cfg; Remove the save and restore code for cfg in create_irq_nr() and destroy_irq() and take the desc->lock when checking the irq_cfg. Reported-and-analyzed-by: Brandon Philips Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-3-git-send-email-yinghai@kernel.org> Signed-off-by: Brandon Phililps Cc: stable@kernel.org Signed-off-by: H. Peter Anvin --- kernel/irq/chip.c | 52 +++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..d70394f12ee9 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -18,11 +18,7 @@ #include "internals.h" -/** - * dynamic_irq_init - initialize a dynamically allocated irq - * @irq: irq number to initialize - */ -void dynamic_irq_init(unsigned int irq) +static void dynamic_irq_init_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc; unsigned long flags; @@ -41,7 +37,8 @@ void dynamic_irq_init(unsigned int irq) desc->depth = 1; desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->action = NULL; desc->irq_count = 0; desc->irqs_unhandled = 0; @@ -55,10 +52,26 @@ void dynamic_irq_init(unsigned int irq) } /** - * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * dynamic_irq_init - initialize a dynamically allocated irq * @irq: irq number to initialize */ -void dynamic_irq_cleanup(unsigned int irq) +void dynamic_irq_init(unsigned int irq) +{ + dynamic_irq_init_x(irq, false); +} + +/** + * dynamic_irq_init_keep_chip_data - initialize a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_init_keep_chip_data(unsigned int irq) +{ + dynamic_irq_init_x(irq, true); +} + +static void dynamic_irq_cleanup_x(unsigned int irq, bool keep_chip_data) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -77,7 +90,8 @@ void dynamic_irq_cleanup(unsigned int irq) } desc->msi_desc = NULL; desc->handler_data = NULL; - desc->chip_data = NULL; + if (!keep_chip_data) + desc->chip_data = NULL; desc->handle_irq = handle_bad_irq; desc->chip = &no_irq_chip; desc->name = NULL; @@ -85,6 +99,26 @@ void dynamic_irq_cleanup(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } +/** + * dynamic_irq_cleanup - cleanup a dynamically allocated irq + * @irq: irq number to initialize + */ +void dynamic_irq_cleanup(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, false); +} + +/** + * dynamic_irq_cleanup_keep_chip_data - cleanup a dynamically allocated irq + * @irq: irq number to initialize + * + * does not set irq_to_desc(irq)->chip_data to NULL + */ +void dynamic_irq_cleanup_keep_chip_data(unsigned int irq) +{ + dynamic_irq_cleanup_x(irq, true); +} + /** * set_irq_chip - set the irq chip for an irq -- cgit v1.2.2 From 27811d8cabe56e0c3622251b049086f49face4ff Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:07 -0800 Subject: x86: Move range related operation to one file We have almost the same code for mtrr cleanup and amd_bus checkup, and this code will also be used in replacing bootmem with early_res, so try to move them together and reuse it from different parts. Also rename update_range to subtract_range as that is what the function is actually doing. -v2: update comments as Christoph requested Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-4-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/Makefile | 2 +- kernel/range.c | 163 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 164 insertions(+), 1 deletion(-) create mode 100644 kernel/range.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..ad47330ccf32 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o + async.o range.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/range.c b/kernel/range.c new file mode 100644 index 000000000000..71e0021281fe --- /dev/null +++ b/kernel/range.c @@ -0,0 +1,163 @@ +/* + * Range add and subtract + */ +#include +#include +#include + +#include + +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#endif + +int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) +{ + if (start > end) + return nr_range; + + /* Out of slots: */ + if (nr_range >= az) + return nr_range; + + range[nr_range].start = start; + range[nr_range].end = end; + + nr_range++; + + return nr_range; +} + +int add_range_with_merge(struct range *range, int az, int nr_range, + u64 start, u64 end) +{ + int i; + + if (start > end) + return nr_range; + + /* Try to merge it with old one: */ + for (i = 0; i < nr_range; i++) { + u64 final_start, final_end; + u64 common_start, common_end; + + if (!range[i].end) + continue; + + common_start = max(range[i].start, start); + common_end = min(range[i].end, end); + if (common_start > common_end + 1) + continue; + + final_start = min(range[i].start, start); + final_end = max(range[i].end, end); + + range[i].start = final_start; + range[i].end = final_end; + return nr_range; + } + + /* Need to add it: */ + return add_range(range, az, nr_range, start, end); +} + +void subtract_range(struct range *range, int az, u64 start, u64 end) +{ + int i, j; + + if (start > end) + return; + + for (j = 0; j < az; j++) { + if (!range[j].end) + continue; + + if (start <= range[j].start && end >= range[j].end) { + range[j].start = 0; + range[j].end = 0; + continue; + } + + if (start <= range[j].start && end < range[j].end && + range[j].start < end + 1) { + range[j].start = end + 1; + continue; + } + + + if (start > range[j].start && end >= range[j].end && + range[j].end > start - 1) { + range[j].end = start - 1; + continue; + } + + if (start > range[j].start && end < range[j].end) { + /* Find the new spare: */ + for (i = 0; i < az; i++) { + if (range[i].end == 0) + break; + } + if (i < az) { + range[i].end = range[j].end; + range[i].start = end + 1; + } else { + printk(KERN_ERR "run of slot in ranges\n"); + } + range[j].end = start - 1; + continue; + } + } +} + +static int cmp_range(const void *x1, const void *x2) +{ + const struct range *r1 = x1; + const struct range *r2 = x2; + s64 start1, start2; + + start1 = r1->start; + start2 = r2->start; + + return start1 - start2; +} + +int clean_sort_range(struct range *range, int az) +{ + int i, j, k = az - 1, nr_range = 0; + + for (i = 0; i < k; i++) { + if (range[i].end) + continue; + for (j = k; j > i; j--) { + if (range[j].end) { + k = j; + break; + } + } + if (j == i) + break; + range[i].start = range[k].start; + range[i].end = range[k].end; + range[k].start = 0; + range[k].end = 0; + k--; + } + /* count it */ + for (i = 0; i < az; i++) { + if (!range[i].end) { + nr_range = i; + break; + } + } + + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); + + return nr_range; +} + +void sort_range(struct range *range, int nr_range) +{ + /* sort them */ + sort(range, nr_range, sizeof(struct range), cmp_range, NULL); +} -- cgit v1.2.2 From e9a0064ad03b899938059bb576615ad9ed0f27f9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:13 -0800 Subject: x86: Change range end to start+size So make interface more consistent with early_res. Later we can share some code with early_res. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-10-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/range.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 71e0021281fe..74e2e6114927 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -13,7 +13,7 @@ int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) { - if (start > end) + if (start >= end) return nr_range; /* Out of slots: */ @@ -33,7 +33,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range, { int i; - if (start > end) + if (start >= end) return nr_range; /* Try to merge it with old one: */ @@ -46,7 +46,7 @@ int add_range_with_merge(struct range *range, int az, int nr_range, common_start = max(range[i].start, start); common_end = min(range[i].end, end); - if (common_start > common_end + 1) + if (common_start > common_end) continue; final_start = min(range[i].start, start); @@ -65,7 +65,7 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) { int i, j; - if (start > end) + if (start >= end) return; for (j = 0; j < az; j++) { @@ -79,15 +79,15 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) } if (start <= range[j].start && end < range[j].end && - range[j].start < end + 1) { - range[j].start = end + 1; + range[j].start < end) { + range[j].start = end; continue; } if (start > range[j].start && end >= range[j].end && - range[j].end > start - 1) { - range[j].end = start - 1; + range[j].end > start) { + range[j].end = start; continue; } @@ -99,11 +99,11 @@ void subtract_range(struct range *range, int az, u64 start, u64 end) } if (i < az) { range[i].end = range[j].end; - range[i].start = end + 1; + range[i].start = end; } else { printk(KERN_ERR "run of slot in ranges\n"); } - range[j].end = start - 1; + range[j].end = start; continue; } } -- cgit v1.2.2 From 5c42dc7070c94622ca914b5a2e227f3744e857e7 Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Thu, 11 Feb 2010 15:04:36 +0100 Subject: devres/irq: Fix devm_irq_match comment Fix the reference (in comment). Signed-off-by: Jean Delvare Signed-off-by: Jiri Kosina --- kernel/irq/devres.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c index 30d56bafc9c2..1ef4ffcdfa55 100644 --- a/kernel/irq/devres.c +++ b/kernel/irq/devres.c @@ -42,7 +42,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data) * automatically freed on driver detach. * * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. + * separately, devm_free_irq() must be used. */ int devm_request_threaded_irq(struct device *dev, unsigned int irq, irq_handler_t handler, irq_handler_t thread_fn, -- cgit v1.2.2 From dfff0615d28bdb3e8d213e5537dd069265912667 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Fri, 12 Feb 2010 21:58:11 +0100 Subject: tree-wide: fix typos "ass?o[sc]iac?te" -> "associate" in comments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Uwe Kleine-König Signed-off-by: Jiri Kosina --- kernel/irq/chip.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index ecc3fa28f666..ec8a96382461 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -520,7 +520,7 @@ out: * signal. The occurence is latched into the irq controller hardware * and must be acked in order to be reenabled. After the ack another * interrupt can happen on the same source even before the first one - * is handled by the assosiacted event handler. If this happens it + * is handled by the associated event handler. If this happens it * might be necessary to disable (mask) the interrupt depending on the * controller hardware. This requires to reenable the interrupt inside * of the loop which handles the interrupts which have arrived while -- cgit v1.2.2 From 43cf38eb5cea91245502df3fcee4dbfc1c74dd1c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 2 Feb 2010 14:38:57 +0900 Subject: percpu: add __percpu sparse annotations to core kernel subsystems Add __percpu sparse annotations to core subsystems. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. Signed-off-by: Tejun Heo Reviewed-by: Christoph Lameter Acked-by: Paul E. McKenney Cc: Jens Axboe Cc: linux-mm@kvack.org Cc: Rusty Russell Cc: Dipankar Sarma Cc: Peter Zijlstra Cc: Andrew Morton Cc: Eric Biederman --- kernel/kexec.c | 2 +- kernel/sched.c | 4 ++-- kernel/stop_machine.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index ef077fb73155..87ebe8adc474 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -41,7 +41,7 @@ #include /* Per cpu memory for storing cpu states in case of system crash. */ -note_buf_t* crash_notes; +note_buf_t __percpu *crash_notes; /* vmcoreinfo stuff */ static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; diff --git a/kernel/sched.c b/kernel/sched.c index 3a8fb30a91b1..978edfd35a96 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1566,7 +1566,7 @@ static unsigned long cpu_avg_load_per_task(int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED -static __read_mostly unsigned long *update_shares_data; +static __read_mostly unsigned long __percpu *update_shares_data; static void __set_se_shares(struct sched_entity *se, unsigned long shares); @@ -10683,7 +10683,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { struct cpuacct { struct cgroup_subsys_state css; /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 *cpuusage; + u64 __percpu *cpuusage; struct percpu_counter cpustat[CPUACCT_STAT_NSTATS]; struct cpuacct *parent; }; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 912823e2a11b..9bb9fb1bd79c 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -45,7 +45,7 @@ static int refcount; static struct workqueue_struct *stop_machine_wq; static struct stop_machine_data active, idle; static const struct cpumask *active_cpus; -static void *stop_machine_work; +static void __percpu *stop_machine_work; static void set_state(enum stopmachine_state newstate) { -- cgit v1.2.2 From 580e0ad21d6d6f932461d24b47041e3dd499c23f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Tue, 16 Feb 2010 18:40:35 -0800 Subject: core: Move early_res from arch/x86 to kernel/ This makes the range reservation feature available to other architectures. -v2: add get_max_mapped, max_pfn_mapped only defined in x86... to fix PPC compiling -v3: according to hpa, add CONFIG_HAVE_EARLY_RES -v4: fix typo about EARLY_RES in config Signed-off-by: Yinghai Lu LKML-Reference: <4B7B5723.4070009@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/Makefile | 1 + kernel/early_res.c | 513 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 514 insertions(+) create mode 100644 kernel/early_res.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index ad47330ccf32..1292b863d667 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -11,6 +11,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ async.o range.o +obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o obj-y += groups.o ifdef CONFIG_FUNCTION_TRACER diff --git a/kernel/early_res.c b/kernel/early_res.c new file mode 100644 index 000000000000..aa5494ac4462 --- /dev/null +++ b/kernel/early_res.c @@ -0,0 +1,513 @@ +/* + * early_res, could be used to replace bootmem + */ +#include +#include +#include +#include +#include +#include + +/* + * Early reserved memory areas. + */ +/* + * need to make sure this one is bigger enough before + * find_fw_memmap_area could be used + */ +#define MAX_EARLY_RES_X 32 + +struct early_res { + u64 start, end; + char name[15]; + char overlap_ok; +}; +static struct early_res early_res_x[MAX_EARLY_RES_X] __initdata; + +static int max_early_res __initdata = MAX_EARLY_RES_X; +static struct early_res *early_res __initdata = &early_res_x[0]; +static int early_res_count __initdata; + +static int __init find_overlapped_early(u64 start, u64 end) +{ + int i; + struct early_res *r; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + if (end > r->start && start < r->end) + break; + } + + return i; +} + +/* + * Drop the i-th range from the early reservation map, + * by copying any higher ranges down one over it, and + * clearing what had been the last slot. + */ +static void __init drop_range(int i) +{ + int j; + + for (j = i + 1; j < max_early_res && early_res[j].end; j++) + ; + + memmove(&early_res[i], &early_res[i + 1], + (j - 1 - i) * sizeof(struct early_res)); + + early_res[j - 1].end = 0; + early_res_count--; +} + +/* + * Split any existing ranges that: + * 1) are marked 'overlap_ok', and + * 2) overlap with the stated range [start, end) + * into whatever portion (if any) of the existing range is entirely + * below or entirely above the stated range. Drop the portion + * of the existing range that overlaps with the stated range, + * which will allow the caller of this routine to then add that + * stated range without conflicting with any existing range. + */ +static void __init drop_overlaps_that_are_ok(u64 start, u64 end) +{ + int i; + struct early_res *r; + u64 lower_start, lower_end; + u64 upper_start, upper_end; + char name[15]; + + for (i = 0; i < max_early_res && early_res[i].end; i++) { + r = &early_res[i]; + + /* Continue past non-overlapping ranges */ + if (end <= r->start || start >= r->end) + continue; + + /* + * Leave non-ok overlaps as is; let caller + * panic "Overlapping early reservations" + * when it hits this overlap. + */ + if (!r->overlap_ok) + return; + + /* + * We have an ok overlap. We will drop it from the early + * reservation map, and add back in any non-overlapping + * portions (lower or upper) as separate, overlap_ok, + * non-overlapping ranges. + */ + + /* 1. Note any non-overlapping (lower or upper) ranges. */ + strncpy(name, r->name, sizeof(name) - 1); + + lower_start = lower_end = 0; + upper_start = upper_end = 0; + if (r->start < start) { + lower_start = r->start; + lower_end = start; + } + if (r->end > end) { + upper_start = end; + upper_end = r->end; + } + + /* 2. Drop the original ok overlapping range */ + drop_range(i); + + i--; /* resume for-loop on copied down entry */ + + /* 3. Add back in any non-overlapping ranges. */ + if (lower_end) + reserve_early_overlap_ok(lower_start, lower_end, name); + if (upper_end) + reserve_early_overlap_ok(upper_start, upper_end, name); + } +} + +static void __init __reserve_early(u64 start, u64 end, char *name, + int overlap_ok) +{ + int i; + struct early_res *r; + + i = find_overlapped_early(start, end); + if (i >= max_early_res) + panic("Too many early reservations"); + r = &early_res[i]; + if (r->end) + panic("Overlapping early reservations " + "%llx-%llx %s to %llx-%llx %s\n", + start, end - 1, name ? name : "", r->start, + r->end - 1, r->name); + r->start = start; + r->end = end; + r->overlap_ok = overlap_ok; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +/* + * A few early reservtations come here. + * + * The 'overlap_ok' in the name of this routine does -not- mean it + * is ok for these reservations to overlap an earlier reservation. + * Rather it means that it is ok for subsequent reservations to + * overlap this one. + * + * Use this entry point to reserve early ranges when you are doing + * so out of "Paranoia", reserving perhaps more memory than you need, + * just in case, and don't mind a subsequent overlapping reservation + * that is known to be needed. + * + * The drop_overlaps_that_are_ok() call here isn't really needed. + * It would be needed if we had two colliding 'overlap_ok' + * reservations, so that the second such would not panic on the + * overlap with the first. We don't have any such as of this + * writing, but might as well tolerate such if it happens in + * the future. + */ +void __init reserve_early_overlap_ok(u64 start, u64 end, char *name) +{ + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 1); +} + +static void __init __check_and_double_early_res(u64 ex_start, u64 ex_end) +{ + u64 start, end, size, mem; + struct early_res *new; + + /* do we have enough slots left ? */ + if ((max_early_res - early_res_count) > max(max_early_res/8, 2)) + return; + + /* double it */ + mem = -1ULL; + size = sizeof(struct early_res) * max_early_res * 2; + if (early_res == early_res_x) + start = 0; + else + start = early_res[0].end; + end = ex_start; + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + if (mem == -1ULL) { + start = ex_end; + end = get_max_mapped(); + if (start + size < end) + mem = find_fw_memmap_area(start, end, size, + sizeof(struct early_res)); + } + if (mem == -1ULL) + panic("can not find more space for early_res array"); + + new = __va(mem); + /* save the first one for own */ + new[0].start = mem; + new[0].end = mem + size; + new[0].overlap_ok = 0; + /* copy old to new */ + if (early_res == early_res_x) { + memcpy(&new[1], &early_res[0], + sizeof(struct early_res) * max_early_res); + memset(&new[max_early_res+1], 0, + sizeof(struct early_res) * (max_early_res - 1)); + early_res_count++; + } else { + memcpy(&new[1], &early_res[1], + sizeof(struct early_res) * (max_early_res - 1)); + memset(&new[max_early_res], 0, + sizeof(struct early_res) * max_early_res); + } + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = new; + max_early_res *= 2; + printk(KERN_DEBUG "early_res array is doubled to %d at [%llx - %llx]\n", + max_early_res, mem, mem + size - 1); +} + +/* + * Most early reservations come here. + * + * We first have drop_overlaps_that_are_ok() drop any pre-existing + * 'overlap_ok' ranges, so that we can then reserve this memory + * range without risk of panic'ing on an overlapping overlap_ok + * early reservation. + */ +void __init reserve_early(u64 start, u64 end, char *name) +{ + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + drop_overlaps_that_are_ok(start, end); + __reserve_early(start, end, name, 0); +} + +void __init reserve_early_without_check(u64 start, u64 end, char *name) +{ + struct early_res *r; + + if (start >= end) + return; + + __check_and_double_early_res(start, end); + + r = &early_res[early_res_count]; + + r->start = start; + r->end = end; + r->overlap_ok = 0; + if (name) + strncpy(r->name, name, sizeof(r->name) - 1); + early_res_count++; +} + +void __init free_early(u64 start, u64 end) +{ + struct early_res *r; + int i; + + i = find_overlapped_early(start, end); + r = &early_res[i]; + if (i >= max_early_res || r->end != end || r->start != start) + panic("free_early on not reserved area: %llx-%llx!", + start, end - 1); + + drop_range(i); +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init subtract_early_res(struct range *range, int az) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + +#define DEBUG_PRINT_EARLY_RES 1 + +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO "Subtract (%d early reservations)\n", count); +#endif + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; +#if DEBUG_PRINT_EARLY_RES + printk(KERN_INFO " #%d [%010llx - %010llx] %15s\n", i, + r->start, r->end, r->name); +#endif + final_start = PFN_DOWN(r->start); + final_end = PFN_UP(r->end); + if (final_start >= final_end) + continue; + subtract_range(range, az, final_start, final_end); + } + +} + +int __init get_free_all_memory_range(struct range **rangep, int nodeid) +{ + int i, count; + u64 start = 0, end; + u64 size; + u64 mem; + struct range *range; + int nr_range; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + count *= 2; + + size = sizeof(struct range) * count; + end = get_max_mapped(); +#ifdef MAX_DMA32_PFN + if (end > (MAX_DMA32_PFN << PAGE_SHIFT)) + start = MAX_DMA32_PFN << PAGE_SHIFT; +#endif + mem = find_fw_memmap_area(start, end, size, sizeof(struct range)); + if (mem == -1ULL) + panic("can not find more space for range free"); + + range = __va(mem); + /* use early_node_map[] and early_res to get range array at first */ + memset(range, 0, size); + nr_range = 0; + + /* need to go over early_node_map to find out good range for node */ + nr_range = add_from_early_node_map(range, count, nr_range, nodeid); +#ifdef CONFIG_X86_32 + subtract_range(range, count, max_low_pfn, -1ULL); +#endif + subtract_early_res(range, count); + nr_range = clean_sort_range(range, count); + + /* need to clear it ? */ + if (nodeid == MAX_NUMNODES) { + memset(&early_res[0], 0, + sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + } + + *rangep = range; + return nr_range; +} +#else +void __init early_res_to_bootmem(u64 start, u64 end) +{ + int i, count; + u64 final_start, final_end; + int idx = 0; + + count = 0; + for (i = 0; i < max_early_res && early_res[i].end; i++) + count++; + + /* need to skip first one ?*/ + if (early_res != early_res_x) + idx = 1; + + printk(KERN_INFO "(%d/%d early reservations) ==> bootmem [%010llx - %010llx]\n", + count - idx, max_early_res, start, end); + for (i = idx; i < count; i++) { + struct early_res *r = &early_res[i]; + printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i, + r->start, r->end, r->name); + final_start = max(start, r->start); + final_end = min(end, r->end); + if (final_start >= final_end) { + printk(KERN_CONT "\n"); + continue; + } + printk(KERN_CONT " ==> [%010llx - %010llx]\n", + final_start, final_end); + reserve_bootmem_generic(final_start, final_end - final_start, + BOOTMEM_DEFAULT); + } + /* clear them */ + memset(&early_res[0], 0, sizeof(struct early_res) * max_early_res); + early_res = NULL; + max_early_res = 0; + early_res_count = 0; +} +#endif + +/* Check for already reserved areas */ +static inline int __init bad_addr(u64 *addrp, u64 size, u64 align) +{ + int i; + u64 addr = *addrp; + int changed = 0; + struct early_res *r; +again: + i = find_overlapped_early(addr, addr + size); + r = &early_res[i]; + if (i < max_early_res && r->end) { + *addrp = addr = round_up(r->end, align); + changed = 1; + goto again; + } + return changed; +} + +/* Check for already reserved areas */ +static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align) +{ + int i; + u64 addr = *addrp, last; + u64 size = *sizep; + int changed = 0; +again: + last = addr + size; + for (i = 0; i < max_early_res && early_res[i].end; i++) { + struct early_res *r = &early_res[i]; + if (last > r->start && addr < r->start) { + size = r->start - addr; + changed = 1; + goto again; + } + if (last > r->end && addr < r->end) { + addr = round_up(r->end, align); + size = last - addr; + changed = 1; + goto again; + } + if (last <= r->end && addr >= r->start) { + (*sizep)++; + return 0; + } + } + if (changed) { + *addrp = addr; + *sizep = size; + } + return changed; +} + +/* + * Find a free area with specified alignment in a specific range. + * only with the area.between start to end is active range from early_node_map + * so they are good as RAM + */ +u64 __init find_early_area(u64 ei_start, u64 ei_last, u64 start, u64 end, + u64 size, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + while (bad_addr(&addr, size, align) && addr+size <= ei_last) + ; + last = addr + size; + if (last > ei_last) + goto out; + if (last > end) + goto out; + + return addr; + +out: + return -1ULL; +} + +u64 __init find_early_area_size(u64 ei_start, u64 ei_last, u64 start, + u64 *sizep, u64 align) +{ + u64 addr, last; + + addr = round_up(ei_start, align); + if (addr < start) + addr = round_up(start, align); + if (addr >= ei_last) + goto out; + *sizep = ei_last - addr; + while (bad_addr_size(&addr, sizep, align) && addr + *sizep <= ei_last) + ; + last = addr + *sizep; + if (last > ei_last) + goto out; + + return addr; + +out: + return -1ULL; +} -- cgit v1.2.2 From febcb0c59ac19fef2081a30e371e7af3619b5e91 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:32 -0800 Subject: irq: Remove unnecessary bootmem code mem_init is moved early already. Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-29-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/irq/handle.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 814940e7f485..0e823c0d1c9c 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include "internals.h" @@ -87,12 +86,8 @@ void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr) { void *ptr; - if (slab_is_available()) - ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), - GFP_ATOMIC, node); - else - ptr = alloc_bootmem_node(NODE_DATA(node), - nr * sizeof(*desc->kstat_irqs)); + ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), + GFP_ATOMIC, node); /* * don't overwite if can not get new one @@ -219,10 +214,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) if (desc) goto out_unlock; - if (slab_is_available()) - desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); - else - desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc)); + desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node); printk(KERN_DEBUG " alloc irq_desc for %d on node %d\n", irq, node); if (!desc) { -- cgit v1.2.2 From 99558f0bbe68cb09799ec38adbaa3f3b2dc7ba63 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:34 -0800 Subject: sparseirq: Change irq_desc_ptrs to static Add replace_irq_desc() instead of poking at the array directly. -v2: remove unneeded boundary check in replace_irq_desc Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-31-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/irq/handle.c | 7 ++++++- kernel/irq/internals.h | 6 +----- kernel/irq/numa_migrate.c | 4 ++-- 3 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 0e823c0d1c9c..266f7986aa08 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -127,7 +127,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) */ DEFINE_RAW_SPINLOCK(sparse_irq_lock); -struct irq_desc **irq_desc_ptrs __read_mostly; +static struct irq_desc **irq_desc_ptrs __read_mostly; static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -192,6 +192,11 @@ struct irq_desc *irq_to_desc(unsigned int irq) return NULL; } +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + irq_desc_ptrs[irq] = desc; +} + struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index b2821f070a3d..c63f3bc88f0b 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -21,11 +21,7 @@ extern void clear_kstat_irqs(struct irq_desc *desc); extern raw_spinlock_t sparse_irq_lock; #ifdef CONFIG_SPARSE_IRQ -/* irq_desc_ptrs allocated at boot time */ -extern struct irq_desc **irq_desc_ptrs; -#else -/* irq_desc_ptrs is a fixed size array */ -extern struct irq_desc *irq_desc_ptrs[NR_IRQS]; +void replace_irq_desc(unsigned int irq, struct irq_desc *desc); #endif #ifdef CONFIG_PROC_FS diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c index 26bac9d8f860..963559dbd858 100644 --- a/kernel/irq/numa_migrate.c +++ b/kernel/irq/numa_migrate.c @@ -70,7 +70,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc && old_desc != desc) goto out_unlock; @@ -90,7 +90,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc, goto out_unlock; } - irq_desc_ptrs[irq] = desc; + replace_irq_desc(irq, desc); raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); /* free the old one */ -- cgit v1.2.2 From b5eb78f76ddfa7caf4340cf6893b032f45d8114a Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 10 Feb 2010 01:20:35 -0800 Subject: sparseirq: Use radix_tree instead of ptrs array Use radix_tree irq_desc_tree instead of irq_desc_ptrs. -v2: according to Eric and cyrill to use radix_tree_lookup_slot and radix_tree_replace_slot Signed-off-by: Yinghai Lu LKML-Reference: <1265793639-15071-32-git-send-email-yinghai@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/irq/handle.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 266f7986aa08..76d5a671bfe1 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "internals.h" @@ -127,7 +128,26 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int node) */ DEFINE_RAW_SPINLOCK(sparse_irq_lock); -static struct irq_desc **irq_desc_ptrs __read_mostly; +static RADIX_TREE(irq_desc_tree, GFP_ATOMIC); + +static void set_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + radix_tree_insert(&irq_desc_tree, irq, desc); +} + +struct irq_desc *irq_to_desc(unsigned int irq) +{ + return radix_tree_lookup(&irq_desc_tree, irq); +} + +void replace_irq_desc(unsigned int irq, struct irq_desc *desc) +{ + void **ptr; + + ptr = radix_tree_lookup_slot(&irq_desc_tree, irq); + if (ptr) + radix_tree_replace_slot(ptr, desc); +} static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = { [0 ... NR_IRQS_LEGACY-1] = { @@ -159,9 +179,6 @@ int __init early_irq_init(void) legacy_count = ARRAY_SIZE(irq_desc_legacy); node = first_online_node; - /* allocate irq_desc_ptrs array based on nr_irqs */ - irq_desc_ptrs = kcalloc(nr_irqs, sizeof(void *), GFP_NOWAIT); - /* allocate based on nr_cpu_ids */ kstat_irqs_legacy = kzalloc_node(NR_IRQS_LEGACY * nr_cpu_ids * sizeof(int), GFP_NOWAIT, node); @@ -175,28 +192,12 @@ int __init early_irq_init(void) lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); alloc_desc_masks(&desc[i], node, true); init_desc_masks(&desc[i]); - irq_desc_ptrs[i] = desc + i; + set_irq_desc(i, &desc[i]); } - for (i = legacy_count; i < nr_irqs; i++) - irq_desc_ptrs[i] = NULL; - return arch_early_irq_init(); } -struct irq_desc *irq_to_desc(unsigned int irq) -{ - if (irq_desc_ptrs && irq < nr_irqs) - return irq_desc_ptrs[irq]; - - return NULL; -} - -void replace_irq_desc(unsigned int irq, struct irq_desc *desc) -{ - irq_desc_ptrs[irq] = desc; -} - struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) { struct irq_desc *desc; @@ -208,14 +209,14 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) return NULL; } - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) return desc; raw_spin_lock_irqsave(&sparse_irq_lock, flags); /* We have to check it to avoid races with another CPU */ - desc = irq_desc_ptrs[irq]; + desc = irq_to_desc(irq); if (desc) goto out_unlock; @@ -228,7 +229,7 @@ struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node) } init_one_irq_desc(irq, desc, node); - irq_desc_ptrs[irq] = desc; + set_irq_desc(irq, desc); out_unlock: raw_spin_unlock_irqrestore(&sparse_irq_lock, flags); -- cgit v1.2.2 From b54452b07a7b1b8cc1385edba3ef2ef6d4679d5a Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 18 Feb 2010 08:14:31 +0000 Subject: const: struct nla_policy Make remaining netlink policies as const. Fixup coding style where needed. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- kernel/taskstats.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/taskstats.c b/kernel/taskstats.c index ea8384d3caa7..899ca51be5e8 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -46,15 +46,13 @@ static struct genl_family family = { .maxattr = TASKSTATS_CMD_ATTR_MAX, }; -static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] -__read_mostly = { +static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static struct nla_policy -cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] __read_mostly = { +static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; -- cgit v1.2.2 From 4610ee1d3638fa05ba8e87ccfa971db8e4033ae7 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:33:59 -0500 Subject: kprobes: Introduce generic insn_slot framework Make insn_slot framework support various size slots. Current insn_slot just supports one-size instruction buffer slot. However, kprobes jump optimization needs larger size buffers. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133358.6725.82430.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers --- kernel/kprobes.c | 104 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 65 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ccec774c716d..78105623d739 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -105,57 +105,74 @@ static struct kprobe_blackpoint kprobe_blacklist[] = { * stepping on the instruction on a vmalloced/kmalloced/data page * is a recipe for disaster */ -#define INSNS_PER_PAGE (PAGE_SIZE/(MAX_INSN_SIZE * sizeof(kprobe_opcode_t))) - struct kprobe_insn_page { struct list_head list; kprobe_opcode_t *insns; /* Page of instruction slots */ - char slot_used[INSNS_PER_PAGE]; int nused; int ngarbage; + char slot_used[]; +}; + +#define KPROBE_INSN_PAGE_SIZE(slots) \ + (offsetof(struct kprobe_insn_page, slot_used) + \ + (sizeof(char) * (slots))) + +struct kprobe_insn_cache { + struct list_head pages; /* list of kprobe_insn_page */ + size_t insn_size; /* size of instruction slot */ + int nr_garbage; }; +static int slots_per_page(struct kprobe_insn_cache *c) +{ + return PAGE_SIZE/(c->insn_size * sizeof(kprobe_opcode_t)); +} + enum kprobe_slot_state { SLOT_CLEAN = 0, SLOT_DIRTY = 1, SLOT_USED = 2, }; -static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_pages */ -static LIST_HEAD(kprobe_insn_pages); -static int kprobe_garbage_slots; -static int collect_garbage_slots(void); +static DEFINE_MUTEX(kprobe_insn_mutex); /* Protects kprobe_insn_slots */ +static struct kprobe_insn_cache kprobe_insn_slots = { + .pages = LIST_HEAD_INIT(kprobe_insn_slots.pages), + .insn_size = MAX_INSN_SIZE, + .nr_garbage = 0, +}; +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c); /** * __get_insn_slot() - Find a slot on an executable page for an instruction. * We allocate an executable page if there's no room on existing ones. */ -static kprobe_opcode_t __kprobes *__get_insn_slot(void) +static kprobe_opcode_t __kprobes *__get_insn_slot(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip; retry: - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->nused < INSNS_PER_PAGE) { + list_for_each_entry(kip, &c->pages, list) { + if (kip->nused < slots_per_page(c)) { int i; - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_CLEAN) { kip->slot_used[i] = SLOT_USED; kip->nused++; - return kip->insns + (i * MAX_INSN_SIZE); + return kip->insns + (i * c->insn_size); } } - /* Surprise! No unused slots. Fix kip->nused. */ - kip->nused = INSNS_PER_PAGE; + /* kip->nused is broken. Fix it. */ + kip->nused = slots_per_page(c); + WARN_ON(1); } } /* If there are any garbage slots, collect it and try again. */ - if (kprobe_garbage_slots && collect_garbage_slots() == 0) { + if (c->nr_garbage && collect_garbage_slots(c) == 0) goto retry; - } - /* All out of space. Need to allocate a new page. Use slot 0. */ - kip = kmalloc(sizeof(struct kprobe_insn_page), GFP_KERNEL); + + /* All out of space. Need to allocate a new page. */ + kip = kmalloc(KPROBE_INSN_PAGE_SIZE(slots_per_page(c)), GFP_KERNEL); if (!kip) return NULL; @@ -170,20 +187,23 @@ static kprobe_opcode_t __kprobes *__get_insn_slot(void) return NULL; } INIT_LIST_HEAD(&kip->list); - list_add(&kip->list, &kprobe_insn_pages); - memset(kip->slot_used, SLOT_CLEAN, INSNS_PER_PAGE); + memset(kip->slot_used, SLOT_CLEAN, slots_per_page(c)); kip->slot_used[0] = SLOT_USED; kip->nused = 1; kip->ngarbage = 0; + list_add(&kip->list, &c->pages); return kip->insns; } + kprobe_opcode_t __kprobes *get_insn_slot(void) { - kprobe_opcode_t *ret; + kprobe_opcode_t *ret = NULL; + mutex_lock(&kprobe_insn_mutex); - ret = __get_insn_slot(); + ret = __get_insn_slot(&kprobe_insn_slots); mutex_unlock(&kprobe_insn_mutex); + return ret; } @@ -199,7 +219,7 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) * so as not to have to set it up again the * next time somebody inserts a probe. */ - if (!list_is_singular(&kprobe_insn_pages)) { + if (!list_is_singular(&kip->list)) { list_del(&kip->list); module_free(NULL, kip->insns); kfree(kip); @@ -209,49 +229,55 @@ static int __kprobes collect_one_slot(struct kprobe_insn_page *kip, int idx) return 0; } -static int __kprobes collect_garbage_slots(void) +static int __kprobes collect_garbage_slots(struct kprobe_insn_cache *c) { struct kprobe_insn_page *kip, *next; /* Ensure no-one is interrupted on the garbages */ synchronize_sched(); - list_for_each_entry_safe(kip, next, &kprobe_insn_pages, list) { + list_for_each_entry_safe(kip, next, &c->pages, list) { int i; if (kip->ngarbage == 0) continue; kip->ngarbage = 0; /* we will collect all garbages */ - for (i = 0; i < INSNS_PER_PAGE; i++) { + for (i = 0; i < slots_per_page(c); i++) { if (kip->slot_used[i] == SLOT_DIRTY && collect_one_slot(kip, i)) break; } } - kprobe_garbage_slots = 0; + c->nr_garbage = 0; return 0; } -void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, + kprobe_opcode_t *slot, int dirty) { struct kprobe_insn_page *kip; - mutex_lock(&kprobe_insn_mutex); - list_for_each_entry(kip, &kprobe_insn_pages, list) { - if (kip->insns <= slot && - slot < kip->insns + (INSNS_PER_PAGE * MAX_INSN_SIZE)) { - int i = (slot - kip->insns) / MAX_INSN_SIZE; + list_for_each_entry(kip, &c->pages, list) { + long idx = ((long)slot - (long)kip->insns) / c->insn_size; + if (idx >= 0 && idx < slots_per_page(c)) { + WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { - kip->slot_used[i] = SLOT_DIRTY; + kip->slot_used[idx] = SLOT_DIRTY; kip->ngarbage++; + if (++c->nr_garbage > slots_per_page(c)) + collect_garbage_slots(c); } else - collect_one_slot(kip, i); - break; + collect_one_slot(kip, idx); + return; } } + /* Could not free this slot. */ + WARN_ON(1); +} - if (dirty && ++kprobe_garbage_slots > INSNS_PER_PAGE) - collect_garbage_slots(); - +void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_insn_mutex); + __free_insn_slot(&kprobe_insn_slots, slot, dirty); mutex_unlock(&kprobe_insn_mutex); } #endif -- cgit v1.2.2 From afd66255b9a48f5851326ddae50e2203fbf71dc9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:07 -0500 Subject: kprobes: Introduce kprobes jump optimization Introduce kprobes jump optimization arch-independent parts. Kprobes uses breakpoint instruction for interrupting execution flow, on some architectures, it can be replaced by a jump instruction and interruption emulation code. This gains kprobs' performance drastically. To enable this feature, set CONFIG_OPTPROBES=y (default y if the arch supports OPTPROBE). Changes in v9: - Fix a bug to optimize probe when enabling. - Check nearby probes can be optimize/unoptimize when disarming/arming kprobes, instead of registering/unregistering. This will help kprobe-tracer because most of probes on it are usually disabled. Changes in v6: - Cleanup coding style for readability. - Add comments around get/put_online_cpus(). Changes in v5: - Use get_online_cpus()/put_online_cpus() for avoiding text_mutex deadlock. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133407.6725.81992.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 461 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 410 insertions(+), 51 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 78105623d739..612af2d61614 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -280,6 +281,33 @@ void __kprobes free_insn_slot(kprobe_opcode_t * slot, int dirty) __free_insn_slot(&kprobe_insn_slots, slot, dirty); mutex_unlock(&kprobe_insn_mutex); } +#ifdef CONFIG_OPTPROBES +/* For optimized_kprobe buffer */ +static DEFINE_MUTEX(kprobe_optinsn_mutex); /* Protects kprobe_optinsn_slots */ +static struct kprobe_insn_cache kprobe_optinsn_slots = { + .pages = LIST_HEAD_INIT(kprobe_optinsn_slots.pages), + /* .insn_size is initialized later */ + .nr_garbage = 0, +}; +/* Get a slot for optimized_kprobe buffer */ +kprobe_opcode_t __kprobes *get_optinsn_slot(void) +{ + kprobe_opcode_t *ret = NULL; + + mutex_lock(&kprobe_optinsn_mutex); + ret = __get_insn_slot(&kprobe_optinsn_slots); + mutex_unlock(&kprobe_optinsn_mutex); + + return ret; +} + +void __kprobes free_optinsn_slot(kprobe_opcode_t * slot, int dirty) +{ + mutex_lock(&kprobe_optinsn_mutex); + __free_insn_slot(&kprobe_optinsn_slots, slot, dirty); + mutex_unlock(&kprobe_optinsn_mutex); +} +#endif #endif /* We have preemption disabled.. so it is safe to use __ versions */ @@ -310,23 +338,324 @@ struct kprobe __kprobes *get_kprobe(void *addr) if (p->addr == addr) return p; } + return NULL; } +static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs); + +/* Return true if the kprobe is an aggregator */ +static inline int kprobe_aggrprobe(struct kprobe *p) +{ + return p->pre_handler == aggr_pre_handler; +} + +/* + * Keep all fields in the kprobe consistent + */ +static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) +{ + memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); + memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); +} + +#ifdef CONFIG_OPTPROBES +/* + * Call all pre_handler on the list, but ignores its return value. + * This must be called from arch-dep optimized caller. + */ +void __kprobes opt_pre_handler(struct kprobe *p, struct pt_regs *regs) +{ + struct kprobe *kp; + + list_for_each_entry_rcu(kp, &p->list, list) { + if (kp->pre_handler && likely(!kprobe_disabled(kp))) { + set_kprobe_instance(kp); + kp->pre_handler(kp, regs); + } + reset_kprobe_instance(); + } +} + +/* Return true(!0) if the kprobe is ready for optimization. */ +static inline int kprobe_optready(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if (kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + return arch_prepared_optinsn(&op->optinsn); + } + + return 0; +} + +/* + * Return an optimized kprobe whose optimizing code replaces + * instructions including addr (exclude breakpoint). + */ +struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +{ + int i; + struct kprobe *p = NULL; + struct optimized_kprobe *op; + + /* Don't check i == 0, since that is a breakpoint case. */ + for (i = 1; !p && i < MAX_OPTIMIZED_LENGTH; i++) + p = get_kprobe((void *)(addr - i)); + + if (p && kprobe_optready(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (arch_within_optimized_kprobe(op, addr)) + return p; + } + + return NULL; +} + +/* Optimization staging list, protected by kprobe_mutex */ +static LIST_HEAD(optimizing_list); + +static void kprobe_optimizer(struct work_struct *work); +static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); +#define OPTIMIZE_DELAY 5 + +/* Kprobe jump optimizer */ +static __kprobes void kprobe_optimizer(struct work_struct *work) +{ + struct optimized_kprobe *op, *tmp; + + /* Lock modules while optimizing kprobes */ + mutex_lock(&module_mutex); + mutex_lock(&kprobe_mutex); + if (kprobes_all_disarmed) + goto end; + + /* + * Wait for quiesence period to ensure all running interrupts + * are done. Because optprobe may modify multiple instructions + * there is a chance that Nth instruction is interrupted. In that + * case, running interrupt can return to 2nd-Nth byte of jump + * instruction. This wait is for avoiding it. + */ + synchronize_sched(); + + /* + * The optimization/unoptimization refers online_cpus via + * stop_machine() and cpu-hotplug modifies online_cpus. + * And same time, text_mutex will be held in cpu-hotplug and here. + * This combination can cause a deadlock (cpu-hotplug try to lock + * text_mutex but stop_machine can not be done because online_cpus + * has been changed) + * To avoid this deadlock, we need to call get_online_cpus() + * for preventing cpu-hotplug outside of text_mutex locking. + */ + get_online_cpus(); + mutex_lock(&text_mutex); + list_for_each_entry_safe(op, tmp, &optimizing_list, list) { + WARN_ON(kprobe_disabled(&op->kp)); + if (arch_optimize_kprobe(op) < 0) + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + list_del_init(&op->list); + } + mutex_unlock(&text_mutex); + put_online_cpus(); +end: + mutex_unlock(&kprobe_mutex); + mutex_unlock(&module_mutex); +} + +/* Optimize kprobe if p is ready to be optimized */ +static __kprobes void optimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + /* Check if the kprobe is disabled or not ready for optimization. */ + if (!kprobe_optready(p) || + (kprobe_disabled(p) || kprobes_all_disarmed)) + return; + + /* Both of break_handler and post_handler are not supported. */ + if (p->break_handler || p->post_handler) + return; + + op = container_of(p, struct optimized_kprobe, kp); + + /* Check there is no other kprobes at the optimized instructions */ + if (arch_check_optimized_kprobe(op) < 0) + return; + + /* Check if it is already optimized. */ + if (op->kp.flags & KPROBE_FLAG_OPTIMIZED) + return; + + op->kp.flags |= KPROBE_FLAG_OPTIMIZED; + list_add(&op->list, &optimizing_list); + if (!delayed_work_pending(&optimizing_work)) + schedule_delayed_work(&optimizing_work, OPTIMIZE_DELAY); +} + +/* Unoptimize a kprobe if p is optimized */ +static __kprobes void unoptimize_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + if ((p->flags & KPROBE_FLAG_OPTIMIZED) && kprobe_aggrprobe(p)) { + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + else + /* Replace jump with break */ + arch_unoptimize_kprobe(op); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } +} + +/* Remove optimized instructions */ +static void __kprobes kill_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + if (!list_empty(&op->list)) { + /* Dequeue from the optimization queue */ + list_del_init(&op->list); + op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + } + /* Don't unoptimize, because the target code will be freed. */ + arch_remove_optimized_kprobe(op); +} + +/* Try to prepare optimized instructions */ +static __kprobes void prepare_optimized_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_prepare_optimized_kprobe(op); +} + +/* Free optimized instructions and optimized_kprobe */ +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = container_of(p, struct optimized_kprobe, kp); + arch_remove_optimized_kprobe(op); + kfree(op); +} + +/* Allocate new optimized_kprobe and try to prepare optimized instructions */ +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + struct optimized_kprobe *op; + + op = kzalloc(sizeof(struct optimized_kprobe), GFP_KERNEL); + if (!op) + return NULL; + + INIT_LIST_HEAD(&op->list); + op->kp.addr = p->addr; + arch_prepare_optimized_kprobe(op); + + return &op->kp; +} + +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p); + +/* + * Prepare an optimized_kprobe and optimize it + * NOTE: p must be a normal registered kprobe + */ +static __kprobes void try_to_optimize_kprobe(struct kprobe *p) +{ + struct kprobe *ap; + struct optimized_kprobe *op; + + ap = alloc_aggr_kprobe(p); + if (!ap) + return; + + op = container_of(ap, struct optimized_kprobe, kp); + if (!arch_prepared_optinsn(&op->optinsn)) { + /* If failed to setup optimizing, fallback to kprobe */ + free_aggr_kprobe(ap); + return; + } + + init_aggr_kprobe(ap, p); + optimize_kprobe(ap); +} + +static void __kprobes __arm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + /* Check collision with other optimized kprobes */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + unoptimize_kprobe(old_p); /* Fallback to unoptimized kprobe */ + + arch_arm_kprobe(p); + optimize_kprobe(p); /* Try to optimize (add kprobe to a list) */ +} + +static void __kprobes __disarm_kprobe(struct kprobe *p) +{ + struct kprobe *old_p; + + unoptimize_kprobe(p); /* Try to unoptimize */ + arch_disarm_kprobe(p); + + /* If another kprobe was blocked, optimize it. */ + old_p = get_optimized_kprobe((unsigned long)p->addr); + if (unlikely(old_p)) + optimize_kprobe(old_p); +} + +#else /* !CONFIG_OPTPROBES */ + +#define optimize_kprobe(p) do {} while (0) +#define unoptimize_kprobe(p) do {} while (0) +#define kill_optimized_kprobe(p) do {} while (0) +#define prepare_optimized_kprobe(p) do {} while (0) +#define try_to_optimize_kprobe(p) do {} while (0) +#define __arm_kprobe(p) arch_arm_kprobe(p) +#define __disarm_kprobe(p) arch_disarm_kprobe(p) + +static __kprobes void free_aggr_kprobe(struct kprobe *p) +{ + kfree(p); +} + +static __kprobes struct kprobe *alloc_aggr_kprobe(struct kprobe *p) +{ + return kzalloc(sizeof(struct kprobe), GFP_KERNEL); +} +#endif /* CONFIG_OPTPROBES */ + /* Arm a kprobe with text_mutex */ static void __kprobes arm_kprobe(struct kprobe *kp) { + /* + * Here, since __arm_kprobe() doesn't use stop_machine(), + * this doesn't cause deadlock on text_mutex. So, we don't + * need get_online_cpus(). + */ mutex_lock(&text_mutex); - arch_arm_kprobe(kp); + __arm_kprobe(kp); mutex_unlock(&text_mutex); } /* Disarm a kprobe with text_mutex */ static void __kprobes disarm_kprobe(struct kprobe *kp) { + get_online_cpus(); /* For avoiding text_mutex deadlock */ mutex_lock(&text_mutex); - arch_disarm_kprobe(kp); + __disarm_kprobe(kp); mutex_unlock(&text_mutex); + put_online_cpus(); } /* @@ -395,7 +724,7 @@ static int __kprobes aggr_break_handler(struct kprobe *p, struct pt_regs *regs) void __kprobes kprobes_inc_nmissed_count(struct kprobe *p) { struct kprobe *kp; - if (p->pre_handler != aggr_pre_handler) { + if (!kprobe_aggrprobe(p)) { p->nmissed++; } else { list_for_each_entry_rcu(kp, &p->list, list) @@ -518,15 +847,6 @@ static void __kprobes cleanup_rp_inst(struct kretprobe *rp) free_rp_inst(rp); } -/* - * Keep all fields in the kprobe consistent - */ -static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) -{ - memcpy(&p->opcode, &old_p->opcode, sizeof(kprobe_opcode_t)); - memcpy(&p->ainsn, &old_p->ainsn, sizeof(struct arch_specific_insn)); -} - /* * Add the new probe to ap->list. Fail if this is the * second jprobe at the address - two jprobes can't coexist @@ -534,6 +854,10 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) { BUG_ON(kprobe_gone(ap) || kprobe_gone(p)); + + if (p->break_handler || p->post_handler) + unoptimize_kprobe(ap); /* Fall back to normal kprobe */ + if (p->break_handler) { if (ap->break_handler) return -EEXIST; @@ -548,7 +872,7 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) ap->flags &= ~KPROBE_FLAG_DISABLED; if (!kprobes_all_disarmed) /* Arm the breakpoint again. */ - arm_kprobe(ap); + __arm_kprobe(ap); } return 0; } @@ -557,12 +881,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p) * Fill in the required fields of the "manager kprobe". Replace the * earlier kprobe in the hlist with the manager kprobe */ -static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) +static void __kprobes init_aggr_kprobe(struct kprobe *ap, struct kprobe *p) { + /* Copy p's insn slot to ap */ copy_kprobe(p, ap); flush_insn_slot(ap); ap->addr = p->addr; - ap->flags = p->flags; + ap->flags = p->flags & ~KPROBE_FLAG_OPTIMIZED; ap->pre_handler = aggr_pre_handler; ap->fault_handler = aggr_fault_handler; /* We don't care the kprobe which has gone. */ @@ -572,8 +897,9 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) ap->break_handler = aggr_break_handler; INIT_LIST_HEAD(&ap->list); - list_add_rcu(&p->list, &ap->list); + INIT_HLIST_NODE(&ap->hlist); + list_add_rcu(&p->list, &ap->list); hlist_replace_rcu(&p->hlist, &ap->hlist); } @@ -587,12 +913,12 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, int ret = 0; struct kprobe *ap = old_p; - if (old_p->pre_handler != aggr_pre_handler) { - /* If old_p is not an aggr_probe, create new aggr_kprobe. */ - ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL); + if (!kprobe_aggrprobe(old_p)) { + /* If old_p is not an aggr_kprobe, create new aggr_kprobe. */ + ap = alloc_aggr_kprobe(old_p); if (!ap) return -ENOMEM; - add_aggr_kprobe(ap, old_p); + init_aggr_kprobe(ap, old_p); } if (kprobe_gone(ap)) { @@ -611,6 +937,9 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, */ return ret; + /* Prepare optimized instructions if possible. */ + prepare_optimized_kprobe(ap); + /* * Clear gone flag to prevent allocating new slot again, and * set disabled flag because it is not armed yet. @@ -619,6 +948,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p, | KPROBE_FLAG_DISABLED; } + /* Copy ap's insn slot to p */ copy_kprobe(ap, p); return add_new_kprobe(ap, p); } @@ -769,27 +1099,34 @@ int __kprobes register_kprobe(struct kprobe *p) p->nmissed = 0; INIT_LIST_HEAD(&p->list); mutex_lock(&kprobe_mutex); + + get_online_cpus(); /* For avoiding text_mutex deadlock. */ + mutex_lock(&text_mutex); + old_p = get_kprobe(p->addr); if (old_p) { + /* Since this may unoptimize old_p, locking text_mutex. */ ret = register_aggr_kprobe(old_p, p); goto out; } - mutex_lock(&text_mutex); ret = arch_prepare_kprobe(p); if (ret) - goto out_unlock_text; + goto out; INIT_HLIST_NODE(&p->hlist); hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); + + /* Try to optimize kprobe */ + try_to_optimize_kprobe(p); -out_unlock_text: - mutex_unlock(&text_mutex); out: + mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); if (probed_mod) @@ -811,7 +1148,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) return -EINVAL; if (old_p == p || - (old_p->pre_handler == aggr_pre_handler && + (kprobe_aggrprobe(old_p) && list_is_singular(&old_p->list))) { /* * Only probe on the hash list. Disarm only if kprobes are @@ -819,7 +1156,7 @@ static int __kprobes __unregister_kprobe_top(struct kprobe *p) * already have been removed. We save on flushing icache. */ if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) - disarm_kprobe(p); + disarm_kprobe(old_p); hlist_del_rcu(&old_p->hlist); } else { if (p->break_handler && !kprobe_gone(p)) @@ -835,8 +1172,13 @@ noclean: list_del_rcu(&p->list); if (!kprobe_disabled(old_p)) { try_to_disable_aggr_kprobe(old_p); - if (!kprobes_all_disarmed && kprobe_disabled(old_p)) - disarm_kprobe(old_p); + if (!kprobes_all_disarmed) { + if (kprobe_disabled(old_p)) + disarm_kprobe(old_p); + else + /* Try to optimize this probe again */ + optimize_kprobe(old_p); + } } } return 0; @@ -853,7 +1195,7 @@ static void __kprobes __unregister_kprobe_bottom(struct kprobe *p) old_p = list_entry(p->list.next, struct kprobe, list); list_del(&p->list); arch_remove_kprobe(old_p); - kfree(old_p); + free_aggr_kprobe(old_p); } } @@ -1149,7 +1491,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) struct kprobe *kp; p->flags |= KPROBE_FLAG_GONE; - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { /* * If this is an aggr_kprobe, we have to list all the * chained probes and mark them GONE. @@ -1158,6 +1500,7 @@ static void __kprobes kill_kprobe(struct kprobe *p) kp->flags |= KPROBE_FLAG_GONE; p->post_handler = NULL; p->break_handler = NULL; + kill_optimized_kprobe(p); } /* * Here, we can remove insn_slot safely, because no thread calls @@ -1267,6 +1610,11 @@ static int __init init_kprobes(void) } } +#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) + /* Init kprobe_optinsn_slots */ + kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; +#endif + /* By default, kprobes are armed */ kprobes_all_disarmed = false; @@ -1285,7 +1633,7 @@ static int __init init_kprobes(void) #ifdef CONFIG_DEBUG_FS static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, - const char *sym, int offset,char *modname) + const char *sym, int offset, char *modname, struct kprobe *pp) { char *kprobe_type; @@ -1295,19 +1643,21 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p, kprobe_type = "j"; else kprobe_type = "k"; + if (sym) - seq_printf(pi, "%p %s %s+0x%x %s %s%s\n", + seq_printf(pi, "%p %s %s+0x%x %s ", p->addr, kprobe_type, sym, offset, - (modname ? modname : " "), - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + (modname ? modname : " ")); else - seq_printf(pi, "%p %s %p %s%s\n", - p->addr, kprobe_type, p->addr, - (kprobe_gone(p) ? "[GONE]" : ""), - ((kprobe_disabled(p) && !kprobe_gone(p)) ? - "[DISABLED]" : "")); + seq_printf(pi, "%p %s %p ", + p->addr, kprobe_type, p->addr); + + if (!pp) + pp = p; + seq_printf(pi, "%s%s%s\n", + (kprobe_gone(p) ? "[GONE]" : ""), + ((kprobe_disabled(p) && !kprobe_gone(p)) ? "[DISABLED]" : ""), + (kprobe_optimized(pp) ? "[OPTIMIZED]" : "")); } static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos) @@ -1343,11 +1693,11 @@ static int __kprobes show_kprobe_addr(struct seq_file *pi, void *v) hlist_for_each_entry_rcu(p, node, head, hlist) { sym = kallsyms_lookup((unsigned long)p->addr, NULL, &offset, &modname, namebuf); - if (p->pre_handler == aggr_pre_handler) { + if (kprobe_aggrprobe(p)) { list_for_each_entry_rcu(kp, &p->list, list) - report_probe(pi, kp, sym, offset, modname); + report_probe(pi, kp, sym, offset, modname, p); } else - report_probe(pi, p, sym, offset, modname); + report_probe(pi, p, sym, offset, modname, NULL); } preempt_enable(); return 0; @@ -1425,12 +1775,13 @@ int __kprobes enable_kprobe(struct kprobe *kp) goto out; } - if (!kprobes_all_disarmed && kprobe_disabled(p)) - arm_kprobe(p); - - p->flags &= ~KPROBE_FLAG_DISABLED; if (p != kp) kp->flags &= ~KPROBE_FLAG_DISABLED; + + if (!kprobes_all_disarmed && kprobe_disabled(p)) { + p->flags &= ~KPROBE_FLAG_DISABLED; + arm_kprobe(p); + } out: mutex_unlock(&kprobe_mutex); return ret; @@ -1450,12 +1801,13 @@ static void __kprobes arm_all_kprobes(void) if (!kprobes_all_disarmed) goto already_enabled; + /* Arming kprobes doesn't optimize kprobe itself */ mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) if (!kprobe_disabled(p)) - arch_arm_kprobe(p); + __arm_kprobe(p); } mutex_unlock(&text_mutex); @@ -1482,16 +1834,23 @@ static void __kprobes disarm_all_kprobes(void) kprobes_all_disarmed = true; printk(KERN_INFO "Kprobes globally disabled\n"); + + /* + * Here we call get_online_cpus() for avoiding text_mutex deadlock, + * because disarming may also unoptimize kprobes. + */ + get_online_cpus(); mutex_lock(&text_mutex); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; hlist_for_each_entry_rcu(p, node, head, hlist) { if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - arch_disarm_kprobe(p); + __disarm_kprobe(p); } } mutex_unlock(&text_mutex); + put_online_cpus(); mutex_unlock(&kprobe_mutex); /* Allow all currently running kprobes to complete */ synchronize_sched(); -- cgit v1.2.2 From b2be84df99ebc93599c69e931a3c4a5105abfabc Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 25 Feb 2010 08:34:15 -0500 Subject: kprobes: Jump optimization sysctl interface Add /proc/sys/debug/kprobes-optimization sysctl which enables and disables kprobes jump optimization on the fly for debugging. Changes in v7: - Remove ctl_name = CTL_UNNUMBERED for upstream compatibility. Changes in v6: - Update comments and coding style. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Ananth N Mavinakayanahalli Cc: Jim Keniston Cc: Srikar Dronamraju Cc: Christoph Hellwig Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Anders Kaseorg Cc: Tim Abbott Cc: Andi Kleen Cc: Jason Baron Cc: Mathieu Desnoyers Cc: Frederic Weisbecker Cc: Ananth N Mavinakayanahalli LKML-Reference: <20100225133415.6725.8274.stgit@localhost6.localdomain6> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- kernel/sysctl.c | 12 ++++++++ 2 files changed, 97 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 612af2d61614..fa034d29cf73 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -360,6 +361,9 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) } #ifdef CONFIG_OPTPROBES +/* NOTE: change this value only with kprobe_mutex held */ +static bool kprobes_allow_optimization; + /* * Call all pre_handler on the list, but ignores its return value. * This must be called from arch-dep optimized caller. @@ -428,7 +432,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); mutex_lock(&kprobe_mutex); - if (kprobes_all_disarmed) + if (kprobes_all_disarmed || !kprobes_allow_optimization) goto end; /* @@ -471,7 +475,7 @@ static __kprobes void optimize_kprobe(struct kprobe *p) struct optimized_kprobe *op; /* Check if the kprobe is disabled or not ready for optimization. */ - if (!kprobe_optready(p) || + if (!kprobe_optready(p) || !kprobes_allow_optimization || (kprobe_disabled(p) || kprobes_all_disarmed)) return; @@ -588,6 +592,80 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p) optimize_kprobe(ap); } +#ifdef CONFIG_SYSCTL +static void __kprobes optimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already allowed, just return */ + if (kprobes_allow_optimization) + return; + + kprobes_allow_optimization = true; + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) + if (!kprobe_disabled(p)) + optimize_kprobe(p); + } + mutex_unlock(&text_mutex); + printk(KERN_INFO "Kprobes globally optimized\n"); +} + +static void __kprobes unoptimize_all_kprobes(void) +{ + struct hlist_head *head; + struct hlist_node *node; + struct kprobe *p; + unsigned int i; + + /* If optimization is already prohibited, just return */ + if (!kprobes_allow_optimization) + return; + + kprobes_allow_optimization = false; + printk(KERN_INFO "Kprobes globally unoptimized\n"); + get_online_cpus(); /* For avoiding text_mutex deadlock */ + mutex_lock(&text_mutex); + for (i = 0; i < KPROBE_TABLE_SIZE; i++) { + head = &kprobe_table[i]; + hlist_for_each_entry_rcu(p, node, head, hlist) { + if (!kprobe_disabled(p)) + unoptimize_kprobe(p); + } + } + + mutex_unlock(&text_mutex); + put_online_cpus(); + /* Allow all currently running kprobes to complete */ + synchronize_sched(); +} + +int sysctl_kprobes_optimization; +int proc_kprobes_optimization_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, + loff_t *ppos) +{ + int ret; + + mutex_lock(&kprobe_mutex); + sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + + if (sysctl_kprobes_optimization) + optimize_all_kprobes(); + else + unoptimize_all_kprobes(); + mutex_unlock(&kprobe_mutex); + + return ret; +} +#endif /* CONFIG_SYSCTL */ + static void __kprobes __arm_kprobe(struct kprobe *p) { struct kprobe *old_p; @@ -1610,10 +1688,14 @@ static int __init init_kprobes(void) } } -#if defined(CONFIG_OPTPROBES) && defined(__ARCH_WANT_KPROBES_INSN_SLOT) +#if defined(CONFIG_OPTPROBES) +#if defined(__ARCH_WANT_KPROBES_INSN_SLOT) /* Init kprobe_optinsn_slots */ kprobe_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; #endif + /* By default, kprobes can be optimized */ + kprobes_allow_optimization = true; +#endif /* By default, kprobes are armed */ kprobes_all_disarmed = false; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..40d791d616b5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include @@ -1449,6 +1450,17 @@ static struct ctl_table debug_table[] = { .mode = 0644, .proc_handler = proc_dointvec }, +#endif +#if defined(CONFIG_OPTPROBES) + { + .procname = "kprobes-optimization", + .data = &sysctl_kprobes_optimization, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_kprobes_optimization_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { } }; -- cgit v1.2.2 From fb90ef93df654f2678933efbbf864adac0ae490e Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 24 Feb 2010 18:36:53 -0800 Subject: early_res: Add free_early_partial() To free partial areas in pcpu_setup... Reported-by: Peter Zijlstra Signed-off-by: Yinghai Lu Cc: Tejun Heo Cc: Christoph Lameter Cc: Stephen Rothwell Cc: Linus Torvalds Cc: Jesse Barnes Cc: Pekka Enberg LKML-Reference: <4B85E245.5030001@kernel.org> Signed-off-by: Ingo Molnar --- kernel/early_res.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index aa5494ac4462..9ab11cd84853 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -61,6 +61,40 @@ static void __init drop_range(int i) early_res_count--; } +static void __init drop_range_partial(int i, u64 start, u64 end) +{ + u64 common_start, common_end; + u64 old_start, old_end; + + old_start = early_res[i].start; + old_end = early_res[i].end; + common_start = max(old_start, start); + common_end = min(old_end, end); + + /* no overlap ? */ + if (common_start >= common_end) + return; + + if (old_start < common_start) { + /* make head segment */ + early_res[i].end = common_start; + if (old_end > common_end) { + /* add another for left over on tail */ + reserve_early_without_check(common_end, old_end, + early_res[i].name); + } + return; + } else { + if (old_end > common_end) { + /* reuse the entry for tail left */ + early_res[i].start = common_end; + return; + } + /* all covered */ + drop_range(i); + } +} + /* * Split any existing ranges that: * 1) are marked 'overlap_ok', and @@ -284,6 +318,27 @@ void __init free_early(u64 start, u64 end) drop_range(i); } +void __init free_early_partial(u64 start, u64 end) +{ + struct early_res *r; + int i; + +try_next: + i = find_overlapped_early(start, end); + if (i >= max_early_res) + return; + + r = &early_res[i]; + /* hole ? */ + if (r->end >= end && r->start <= start) { + drop_range_partial(i, start, end); + return; + } + + drop_range_partial(i, start, end); + goto try_next; +} + #ifdef CONFIG_NO_BOOTMEM static void __init subtract_early_res(struct range *range, int az) { -- cgit v1.2.2 From 44ee63587dce85593c22497140db16f4e5027860 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 17 Feb 2010 10:50:50 +0900 Subject: percpu: Add __percpu sparse annotations to hw_breakpoint Add __percpu sparse annotations to hw_breakpoint. These annotations are to make sparse consider percpu variables to be in a different address space and warn if accessed without going through percpu accessors. This patch doesn't affect normal builds. In kernel/hw_breakpoint.c, per_cpu(nr_task_bp_pinned, cpu)'s will trigger spurious noderef related warnings from sparse. Changing it to &per_cpu(nr_task_bp_pinned[0], cpu) will work around the problem but deemed to ugly by the maintainer. Leave it alone until better solution can be found. Signed-off-by: Tejun Heo Cc: Stephen Rothwell Cc: K.Prasad LKML-Reference: <4B7B4B7A.9050902@kernel.org> Signed-off-by: Frederic Weisbecker --- kernel/hw_breakpoint.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..6542eacb3fa5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -413,17 +413,17 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); * * @return a set of per_cpu pointers to perf events */ -struct perf_event ** +struct perf_event * __percpu * register_wide_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered) { - struct perf_event **cpu_events, **pevent, *bp; + struct perf_event * __percpu *cpu_events, **pevent, *bp; long err; int cpu; cpu_events = alloc_percpu(typeof(*cpu_events)); if (!cpu_events) - return ERR_PTR(-ENOMEM); + return (void __percpu __force *)ERR_PTR(-ENOMEM); get_online_cpus(); for_each_online_cpu(cpu) { @@ -451,7 +451,7 @@ fail: put_online_cpus(); free_percpu(cpu_events); - return ERR_PTR(err); + return (void __percpu __force *)ERR_PTR(err); } EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); @@ -459,7 +459,7 @@ EXPORT_SYMBOL_GPL(register_wide_hw_breakpoint); * unregister_wide_hw_breakpoint - unregister a wide breakpoint in the kernel * @cpu_events: the per cpu set of events to unregister */ -void unregister_wide_hw_breakpoint(struct perf_event **cpu_events) +void unregister_wide_hw_breakpoint(struct perf_event * __percpu *cpu_events) { int cpu; struct perf_event **pevent; -- cgit v1.2.2 From 622ea685f1fafdf84d612440535c84341f0860b8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 27 Feb 2010 14:53:07 -0800 Subject: rcu: Fix holdoff for accelerated GPs for last non-dynticked CPU Make the holdoff only happen when the full number of attempts have been made. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267311188-16603-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree_plugin.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 464ad2cdee00..79b53bda8943 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1010,6 +1010,10 @@ int rcu_needs_cpu(int cpu) int c = 0; int thatcpu; + /* Check for being in the holdoff period. */ + if (per_cpu(rcu_dyntick_holdoff, cpu) == jiffies) + return rcu_needs_cpu_quick_check(cpu); + /* Don't bother unless we are the last non-dyntick-idle CPU. */ for_each_cpu_not(thatcpu, nohz_cpu_mask) if (thatcpu != cpu) { @@ -1041,10 +1045,8 @@ int rcu_needs_cpu(int cpu) } /* If RCU callbacks are still pending, RCU still needs this CPU. */ - if (c) { + if (c) raise_softirq(RCU_SOFTIRQ); - per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - } return c; } -- cgit v1.2.2 From ae1f30384baef4056438d81b305a6a5199b0d16c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 19:42:38 +0100 Subject: tracing: Include irqflags headers from trace clock trace_clock.c includes spinlock.h, which ends up including asm/system.h, which in turn includes linux/irqflags.h in x86. So the definition of raw_local_irq_save is luckily covered there, but this is not the case in parisc: tip/kernel/trace/trace_clock.c:86: error: implicit declaration of function 'raw_local_irq_save' tip/kernel/trace/trace_clock.c:112: error: implicit declaration of function 'raw_local_irq_restore' We need to include linux/irqflags.h directly from trace_clock.c to avoid such build error. Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Robert Richter Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/trace/trace_clock.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index 84a3a7ba072a..6fbfb8f417b9 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -13,6 +13,7 @@ * Tracer plugins will chose a default from these clocks. */ #include +#include #include #include #include -- cgit v1.2.2 From 1e259e0a9982078896f3404240096cbea01daca4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 28 Feb 2010 20:51:15 +0100 Subject: hw-breakpoints: Remove stub unthrottle callback We support event unthrottling in breakpoint events. It means that if we have more than sysctl_perf_event_sample_rate/HZ, perf will throttle, ignoring subsequent events until the next tick. So if ptrace exceeds this max rate, it will omit events, which breaks the ptrace determinism that is supposed to report every triggered breakpoints. This is likely to happen if we set sysctl_perf_event_sample_rate to 1. This patch removes support for unthrottling in breakpoint events to break throttling and restore ptrace determinism. Signed-off-by: Frederic Weisbecker Cc: 2.6.33.x Cc: Peter Zijlstra Cc: K.Prasad Cc: Paul Mackerras --- kernel/hw_breakpoint.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 967e66143e11..4d99512ee149 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -489,5 +489,4 @@ struct pmu perf_ops_bp = { .enable = arch_install_hw_breakpoint, .disable = arch_uninstall_hw_breakpoint, .read = hw_breakpoint_pmu_read, - .unthrottle = hw_breakpoint_pmu_unthrottle }; -- cgit v1.2.2 From 4b17764737bb4ee3364b8bfa2059f51ebc19ccd6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 1 Mar 2010 00:02:23 -0800 Subject: sparc: Support show_unhandled_signals. Just faults right now, will add other traps later. Signed-off-by: David S. Miller --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8a68b2448468..33e7a38b6eb9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1441,7 +1441,7 @@ static struct ctl_table fs_table[] = { }; static struct ctl_table debug_table[] = { -#if defined(CONFIG_X86) || defined(CONFIG_PPC) +#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) { .procname = "exception-trace", .data = &show_unhandled_signals, -- cgit v1.2.2 From 90a6501f94aedd7fb40f5556334843194fb598be Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 28 Feb 2010 08:32:18 -0800 Subject: sched, rcu: Fix rcu_dereference() for RCU-lockdep Make rcu_dereference() of runqueue data structures be rcu_dereference_sched(). Located-by: Ingo Molnar Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20100228163218.GD6846@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3e1fd96c6cf9..5a5ea2cd924f 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -3476,7 +3476,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /* -- cgit v1.2.2 From 37b99dd5372cff42f83210c280f314f10f99138e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Mon, 1 Mar 2010 21:55:51 +0800 Subject: resource: Fix generic page_is_ram() for partial RAM pages The System RAM walk shall skip partial RAM pages and avoid calling func() on them. So that page_is_ram() return 0 for a partial RAM page. In particular, it shall not call func() with len=0. This fixes a boot time bug reported by Sachin and root caused by Thomas: > >>> WARNING: at arch/x86/mm/ioremap.c:111 __ioremap_caller+0x169/0x2f1() > >>> Hardware name: BladeCenter LS21 -[79716AA]- > >>> Modules linked in: > >>> Pid: 0, comm: swapper Not tainted 2.6.33-git6-autotest #1 > >>> Call Trace: > >>> [] ? __ioremap_caller+0x169/0x2f1 > >>> [] warn_slowpath_common+0x77/0xa4 > >>> [] warn_slowpath_null+0xf/0x11 > >>> [] __ioremap_caller+0x169/0x2f1 > >>> [] ? acpi_os_map_memory+0x12/0x1b > >>> [] ioremap_nocache+0x12/0x14 > >>> [] acpi_os_map_memory+0x12/0x1b > >>> [] acpi_tb_verify_table+0x29/0x5b > >>> [] acpi_load_tables+0x39/0x15a > >>> [] acpi_early_init+0x60/0xf5 > >>> [] start_kernel+0x397/0x3a7 > >>> [] x86_64_start_reservations+0xa5/0xa9 > >>> [] x86_64_start_kernel+0xe1/0xe8 > >>> ---[ end trace 4eaa2a86a8e2da22 ]--- > >>> ioremap reserve_memtype failed -22 The return code is -EINVAL, so it failed in the is_ram check, which is not too surprising > BIOS-provided physical RAM map: > BIOS-e820: 0000000000000000 - 000000000009c000 (usable) > BIOS-e820: 000000000009c000 - 00000000000a0000 (reserved) > BIOS-e820: 00000000000e0000 - 0000000000100000 (reserved) > BIOS-e820: 0000000000100000 - 00000000cffa3900 (usable) > BIOS-e820: 00000000cffa3900 - 00000000cffa7400 (ACPI data) The ACPI data is not starting on a page boundary and neither does the usable RAM area end on a page boundary. Very useful ! > ACPI: DSDT 00000000cffa3900 036CE (v01 IBM SERLEWIS 00001000 INTL 20060912) ACPI is trying to map DSDT at cffa3900, which results in a check vs. cffa3000 which is the relevant page boundary. The generic is_ram check correctly identifies that as RAM because it's in the usable resource area. The old e820 based is_ram check does not take overlapping resource areas into account. That's why it works. CC: Sachin Sant CC: Thomas Gleixner CC: KAMEZAWA Hiroyuki Signed-off-by: Wu Fengguang LKML-Reference: <20100301135551.GA9998@localhost> Signed-off-by: H. Peter Anvin --- kernel/resource.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 03c897f7935e..8f0e3d0f4bff 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -274,7 +274,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) { struct resource res; - unsigned long pfn, len; + unsigned long pfn, end_pfn; u64 orig_end; int ret = -1; @@ -284,9 +284,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, orig_end = res.end; while ((res.start < res.end) && (find_next_system_ram(&res, "System RAM") >= 0)) { - pfn = (unsigned long)(res.start >> PAGE_SHIFT); - len = (unsigned long)((res.end + 1 - res.start) >> PAGE_SHIFT); - ret = (*func)(pfn, len, arg); + pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; + end_pfn = (res.end + 1) >> PAGE_SHIFT; + if (end_pfn > pfn) + ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; res.start = res.end + 1; -- cgit v1.2.2 From dce46a04d55d6358d2d4ab44a4946a19f9425fe2 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 1 Mar 2010 18:48:52 -0800 Subject: early_res: Need to save the allocation name in drop_range_partial() During free_early_partial(), reserve_early_without_check() could end extending the early_res area from __check_and_double_early_res(); as a result, the location of the name for the current reservation could change. Therefore, we need to save a local copy of the name. [ hpa: rewrote comment and checkin description ] Signed-off-by: Yinghai Lu LKML-Reference: <4B8C7C94.7070000@kernel.org> Signed-off-by: H. Peter Anvin --- kernel/early_res.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index 9ab11cd84853..3cb2c661bb78 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -79,9 +79,19 @@ static void __init drop_range_partial(int i, u64 start, u64 end) /* make head segment */ early_res[i].end = common_start; if (old_end > common_end) { + char name[15]; + + /* + * Save a local copy of the name, since the + * early_res array could get resized inside + * reserve_early_without_check() -> + * __check_and_double_early_res(), which would + * make the current name pointer invalid. + */ + strncpy(name, early_res[i].name, + sizeof(early_res[i].name) - 1); /* add another for left over on tail */ - reserve_early_without_check(common_end, old_end, - early_res[i].name); + reserve_early_without_check(common_end, old_end, name); } return; } else { -- cgit v1.2.2 From ad6759fbf35d104dbf573cd6f4c6784ad6823f7e Mon Sep 17 00:00:00 2001 From: john stultz Date: Mon, 1 Mar 2010 12:34:43 -0800 Subject: timekeeping: Prevent oops when GENERIC_TIME=n Aaro Koskinen reported an issue in kernel.org bugzilla #15366, where on non-GENERIC_TIME systems, accessing /sys/devices/system/clocksource/clocksource0/current_clocksource results in an oops. It seems the timekeeper/clocksource rework missed initializing the curr_clocksource value in the !GENERIC_TIME case. Thanks to Aaro for reporting and diagnosing the issue as well as testing the fix! Reported-by: Aaro Koskinen Signed-off-by: John Stultz Cc: Martin Schwidefsky Cc: stable@kernel.org LKML-Reference: <1267475683.4216.61.camel@localhost.localdomain> Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 1f663d23e85e..1f5dde637457 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -592,6 +592,10 @@ static inline void clocksource_select(void) { } */ static int __init clocksource_done_booting(void) { + mutex_lock(&clocksource_mutex); + curr_clocksource = clocksource_default_clock(); + mutex_unlock(&clocksource_mutex); + finished_booting = 1; /* -- cgit v1.2.2 From 320ebf09cbb6d01954c9a060266aa8e0d27f4638 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 2 Mar 2010 12:35:37 +0100 Subject: perf, x86: Restrict the ANY flag The ANY flag can show SMT data of another task (like 'top'), so we want to disable it when system-wide profiling is disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..482d5e1d3764 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -56,21 +56,6 @@ static atomic_t nr_task_events __read_mostly; */ int sysctl_perf_event_paranoid __read_mostly = 1; -static inline bool perf_paranoid_tracepoint_raw(void) -{ - return sysctl_perf_event_paranoid > -1; -} - -static inline bool perf_paranoid_cpu(void) -{ - return sysctl_perf_event_paranoid > 0; -} - -static inline bool perf_paranoid_kernel(void) -{ - return sysctl_perf_event_paranoid > 1; -} - int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */ /* -- cgit v1.2.2 From f41496607e03ab99f263b8e26689ad0fc853007f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 2 Mar 2010 11:21:09 -0800 Subject: resource: Fix broken indentation Fix broken indentation in patch 37b99dd5372cff42f83210c280f314f10f99138e. Reported-by: Linus Torvalds Cc: Wu Fengguang LKML-Reference: <20100301135551.GA9998@localhost> Signed-off-by: H. Peter Anvin --- kernel/resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 8f0e3d0f4bff..91f430fd467e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -287,7 +287,7 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) - ret = (*func)(pfn, end_pfn - pfn, arg); + ret = (*func)(pfn, end_pfn - pfn, arg); if (ret) break; res.start = res.end + 1; -- cgit v1.2.2 From ac91d85456372a90af5b85eb6620fd2efb1e431b Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 2 Mar 2010 17:54:50 +0800 Subject: tracing: Fix warning in s_next of trace file ops This warning in s_next() can be triggered by lseek(): [] ? s_next+0x77/0x80 [] warn_slowpath_common+0x81/0xa0 [] ? s_next+0x77/0x80 [] warn_slowpath_null+0x1a/0x20 [] s_next+0x77/0x80 [] traverse+0x117/0x200 [] seq_lseek+0xa3/0x120 [] ? seq_lseek+0x0/0x120 [] vfs_llseek+0x41/0x50 [] sys_llseek+0x66/0xa0 [] sysenter_do_call+0x12/0x26 The iterator "leftover" variable is zeroed in the opening of the trace file. But lseek can call s_start() which will call s_next() without reseting the "leftover" variable back to zero, which might trigger the WARN_ON_ONCE(iter->leftover) that is in s_next(). Cc: stable@kernel.org Signed-off-by: Lai Jiangshan LKML-Reference: <4B8CE06A.9090207@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 032c57ca6502..5edf410bc540 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1703,6 +1703,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) ftrace_enable_cpu(); + iter->leftover = 0; for (p = iter; p && l < *pos; p = s_next(m, p, &l)) ; -- cgit v1.2.2 From 8737c9305bd5602b11f7eb4655d5695d4a42a0c6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Dec 2009 06:47:55 -0500 Subject: Switch may_open() and break_lease() to passing O_... ... instead of mixing FMODE_ and O_ Signed-off-by: Al Viro --- kernel/sysctl_binary.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 8f5d16e0707a..8cd50d8f9bde 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1331,7 +1331,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, ssize_t result; char *pathname; int flags; - int acc_mode, fmode; + int acc_mode; pathname = sysctl_getname(name, nlen, &table); result = PTR_ERR(pathname); @@ -1342,15 +1342,12 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (oldval && oldlen && newval && newlen) { flags = O_RDWR; acc_mode = MAY_READ | MAY_WRITE; - fmode = FMODE_READ | FMODE_WRITE; } else if (newval && newlen) { flags = O_WRONLY; acc_mode = MAY_WRITE; - fmode = FMODE_WRITE; } else if (oldval && oldlen) { flags = O_RDONLY; acc_mode = MAY_READ; - fmode = FMODE_READ; } else { result = 0; goto out_putname; @@ -1361,7 +1358,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, if (result) goto out_putname; - result = may_open(&nd.path, acc_mode, fmode); + result = may_open(&nd.path, acc_mode, flags); if (result) goto out_putpath; -- cgit v1.2.2 From 2096f759abcb42200a81d776f597362fd9265024 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Jan 2010 13:16:21 -0500 Subject: New helper: path_is_under(path1, path2) Analog of is_subdir for vfsmount,dentry pairs, moved from audit_tree.c Signed-off-by: Al Viro --- kernel/audit_tree.c | 51 ++++++++++++--------------------------------------- 1 file changed, 12 insertions(+), 39 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 4b05bd9479db..f09b42d9c32d 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -603,22 +603,6 @@ skip_it: mutex_unlock(&audit_filter_mutex); } -static int is_under(struct vfsmount *mnt, struct dentry *dentry, - struct path *path) -{ - if (mnt != path->mnt) { - for (;;) { - if (mnt->mnt_parent == mnt) - return 0; - if (mnt->mnt_parent == path->mnt) - break; - mnt = mnt->mnt_parent; - } - dentry = mnt->mnt_mountpoint; - } - return is_subdir(dentry, path->dentry); -} - int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) { @@ -714,29 +698,24 @@ int audit_tag_tree(char *old, char *new) { struct list_head cursor, barrier; int failed = 0; - struct path path; + struct path path1, path2; struct vfsmount *tagged; struct list_head list; - struct vfsmount *mnt; - struct dentry *dentry; int err; - err = kern_path(new, 0, &path); + err = kern_path(new, 0, &path2); if (err) return err; - tagged = collect_mounts(&path); - path_put(&path); + tagged = collect_mounts(&path2); + path_put(&path2); if (!tagged) return -ENOMEM; - err = kern_path(old, 0, &path); + err = kern_path(old, 0, &path1); if (err) { drop_collected_mounts(tagged); return err; } - mnt = mntget(path.mnt); - dentry = dget(path.dentry); - path_put(&path); list_add_tail(&list, &tagged->mnt_list); @@ -747,6 +726,7 @@ int audit_tag_tree(char *old, char *new) while (cursor.next != &tree_list) { struct audit_tree *tree; struct vfsmount *p; + int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); get_tree(tree); @@ -754,23 +734,17 @@ int audit_tag_tree(char *old, char *new) list_add(&cursor, &tree->list); mutex_unlock(&audit_filter_mutex); - err = kern_path(tree->pathname, 0, &path); - if (err) { - put_tree(tree); - mutex_lock(&audit_filter_mutex); - continue; + err = kern_path(tree->pathname, 0, &path2); + if (!err) { + good_one = path_is_under(&path1, &path2); + path_put(&path2); } - spin_lock(&vfsmount_lock); - if (!is_under(mnt, dentry, &path)) { - spin_unlock(&vfsmount_lock); - path_put(&path); + if (!good_one) { put_tree(tree); mutex_lock(&audit_filter_mutex); continue; } - spin_unlock(&vfsmount_lock); - path_put(&path); list_for_each_entry(p, &list, mnt_list) { failed = tag_chunk(p->mnt_root->d_inode, tree); @@ -820,8 +794,7 @@ int audit_tag_tree(char *old, char *new) list_del(&cursor); list_del(&list); mutex_unlock(&audit_filter_mutex); - dput(dentry); - mntput(mnt); + path_put(&path1); drop_collected_mounts(tagged); return failed; } -- cgit v1.2.2 From 1f707137b55764740981d022d29c622832a61880 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 30 Jan 2010 22:51:25 -0500 Subject: new helper: iterate_mounts() apply function to vfsmounts in set returned by collect_mounts(), stop if it returns non-zero. Signed-off-by: Al Viro --- kernel/audit_tree.c | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index f09b42d9c32d..028e85663f27 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -548,6 +548,11 @@ int audit_remove_tree_rule(struct audit_krule *rule) return 0; } +static int compare_root(struct vfsmount *mnt, void *arg) +{ + return mnt->mnt_root->d_inode == arg; +} + void audit_trim_trees(void) { struct list_head cursor; @@ -559,7 +564,6 @@ void audit_trim_trees(void) struct path path; struct vfsmount *root_mnt; struct node *node; - struct list_head list; int err; tree = container_of(cursor.next, struct audit_tree, list); @@ -577,24 +581,16 @@ void audit_trim_trees(void) if (!root_mnt) goto skip_it; - list_add_tail(&list, &root_mnt->mnt_list); spin_lock(&hash_lock); list_for_each_entry(node, &tree->chunks, list) { - struct audit_chunk *chunk = find_chunk(node); - struct inode *inode = chunk->watch.inode; - struct vfsmount *mnt; + struct inode *inode = find_chunk(node)->watch.inode; node->index |= 1U<<31; - list_for_each_entry(mnt, &list, mnt_list) { - if (mnt->mnt_root->d_inode == inode) { - node->index &= ~(1U<<31); - break; - } - } + if (iterate_mounts(compare_root, inode, root_mnt)) + node->index &= ~(1U<<31); } spin_unlock(&hash_lock); trim_marked(tree); put_tree(tree); - list_del_init(&list); drop_collected_mounts(root_mnt); skip_it: mutex_lock(&audit_filter_mutex); @@ -622,13 +618,17 @@ void audit_put_tree(struct audit_tree *tree) put_tree(tree); } +static int tag_mount(struct vfsmount *mnt, void *arg) +{ + return tag_chunk(mnt->mnt_root->d_inode, arg); +} + /* called with audit_filter_mutex */ int audit_add_tree_rule(struct audit_krule *rule) { struct audit_tree *seed = rule->tree, *tree; struct path path; - struct vfsmount *mnt, *p; - struct list_head list; + struct vfsmount *mnt; int err; list_for_each_entry(tree, &tree_list, list) { @@ -654,16 +654,9 @@ int audit_add_tree_rule(struct audit_krule *rule) err = -ENOMEM; goto Err; } - list_add_tail(&list, &mnt->mnt_list); get_tree(tree); - list_for_each_entry(p, &list, mnt_list) { - err = tag_chunk(p->mnt_root->d_inode, tree); - if (err) - break; - } - - list_del(&list); + err = iterate_mounts(tag_mount, tree, mnt); drop_collected_mounts(mnt); if (!err) { @@ -700,7 +693,6 @@ int audit_tag_tree(char *old, char *new) int failed = 0; struct path path1, path2; struct vfsmount *tagged; - struct list_head list; int err; err = kern_path(new, 0, &path2); @@ -717,15 +709,12 @@ int audit_tag_tree(char *old, char *new) return err; } - list_add_tail(&list, &tagged->mnt_list); - mutex_lock(&audit_filter_mutex); list_add(&barrier, &tree_list); list_add(&cursor, &barrier); while (cursor.next != &tree_list) { struct audit_tree *tree; - struct vfsmount *p; int good_one = 0; tree = container_of(cursor.next, struct audit_tree, list); @@ -746,12 +735,7 @@ int audit_tag_tree(char *old, char *new) continue; } - list_for_each_entry(p, &list, mnt_list) { - failed = tag_chunk(p->mnt_root->d_inode, tree); - if (failed) - break; - } - + failed = iterate_mounts(tag_mount, tree, tagged); if (failed) { put_tree(tree); mutex_lock(&audit_filter_mutex); @@ -792,7 +776,6 @@ int audit_tag_tree(char *old, char *new) } list_del(&barrier); list_del(&cursor); - list_del(&list); mutex_unlock(&audit_filter_mutex); path_put(&path1); drop_collected_mounts(tagged); -- cgit v1.2.2 From a27341cd5fcb7cf2d2d4726e9f324009f7162c00 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 2 Mar 2010 08:36:46 -0800 Subject: Prioritize synchronous signals over 'normal' signals This makes sure that we pick the synchronous signals caused by a processor fault over any pending regular asynchronous signals sent to use by [t]kill(). This is not strictly required semantics, but it makes it _much_ easier for programs like Wine that expect to find the fault information in the signal stack. Without this, if a non-synchronous signal gets picked first, the delayed asynchronous signal will have its signal context pointing to the new signal invocation, rather than the instruction that caused the SIGSEGV or SIGBUS in the first place. This is not all that pretty, and we're discussing making the synchronous signals more explicit rather than have these kinds of implicit preferences of SIGSEGV and friends. See for example http://bugzilla.kernel.org/show_bug.cgi?id=15395 for some of the discussion. But in the meantime this is a simple and fairly straightforward work-around, and the whole if (x & Y) x &= Y; thing can be compiled into (and gcc does do it) just three instructions: movq %rdx, %rax andl $Y, %eax cmovne %rax, %rdx so it is at least a simple solution to a subtle issue. Reported-and-tested-by: Pavel Vilim Acked-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/signal.c | 43 ++++++++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 934ae5e687b9..5bb9baffa4f1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -159,6 +159,10 @@ void recalc_sigpending(void) /* Given the mask, find the first available signal that should be serviced. */ +#define SYNCHRONOUS_MASK \ + (sigmask(SIGSEGV) | sigmask(SIGBUS) | sigmask(SIGILL) | \ + sigmask(SIGTRAP) | sigmask(SIGFPE)) + int next_signal(struct sigpending *pending, sigset_t *mask) { unsigned long i, *s, *m, x; @@ -166,26 +170,39 @@ int next_signal(struct sigpending *pending, sigset_t *mask) s = pending->signal.sig; m = mask->sig; + + /* + * Handle the first word specially: it contains the + * synchronous signals that need to be dequeued first. + */ + x = *s &~ *m; + if (x) { + if (x & SYNCHRONOUS_MASK) + x &= SYNCHRONOUS_MASK; + sig = ffz(~x) + 1; + return sig; + } + switch (_NSIG_WORDS) { default: - for (i = 0; i < _NSIG_WORDS; ++i, ++s, ++m) - if ((x = *s &~ *m) != 0) { - sig = ffz(~x) + i*_NSIG_BPW + 1; - break; - } + for (i = 1; i < _NSIG_WORDS; ++i) { + x = *++s &~ *++m; + if (!x) + continue; + sig = ffz(~x) + i*_NSIG_BPW + 1; + break; + } break; - case 2: if ((x = s[0] &~ m[0]) != 0) - sig = 1; - else if ((x = s[1] &~ m[1]) != 0) - sig = _NSIG_BPW + 1; - else + case 2: + x = s[1] &~ m[1]; + if (!x) break; - sig += ffz(~x); + sig = ffz(~x) + _NSIG_BPW + 1; break; - case 1: if ((x = *s &~ *m) != 0) - sig = ffz(~x) + 1; + case 1: + /* Nothing to do */ break; } -- cgit v1.2.2 From 74781387822cd7a549123ae2b35862bf802689be Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Thu, 4 Mar 2010 13:30:22 +0800 Subject: padata: Allocate the cpumask for the padata instance The cpumask of the padata instance was used without allocated. This caused boot crashes if CONFIG_CPUMASK_OFFSTACK is enabled. This patch fixes this by doing proper allocation for this cpumask. Signed-off-by: Steffen Klassert Signed-off-by: Herbert Xu --- kernel/padata.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/padata.c b/kernel/padata.c index 6f9bcb8313d6..93caf65ff57c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -642,6 +642,9 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, if (!pd) goto err_free_inst; + if (!alloc_cpumask_var(&pinst->cpumask, GFP_KERNEL)) + goto err_free_pd; + rcu_assign_pointer(pinst->pd, pd); pinst->wq = wq; @@ -654,12 +657,14 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask, pinst->cpu_notifier.priority = 0; err = register_hotcpu_notifier(&pinst->cpu_notifier); if (err) - goto err_free_pd; + goto err_free_cpumask; mutex_init(&pinst->lock); return pinst; +err_free_cpumask: + free_cpumask_var(pinst->cpumask); err_free_pd: padata_free_pd(pd); err_free_inst: @@ -685,6 +690,7 @@ void padata_free(struct padata_instance *pinst) unregister_hotcpu_notifier(&pinst->cpu_notifier); padata_free_pd(pinst->pd); + free_cpumask_var(pinst->cpumask); kfree(pinst); } EXPORT_SYMBOL(padata_free); -- cgit v1.2.2 From db1466b3e1bd1727375cdbfcbea4bcce2f860f61 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:56 -0800 Subject: rcu: Use wrapper function instead of exporting tasklist_lock Lockdep-RCU commit d11c563d exported tasklist_lock, which is not a good thing. This patch instead exports a function that uses lockdep to check whether tasklist_lock is held. Suggested-by: Christoph Hellwig Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Christoph Hellwig LKML-Reference: <1267631219-8713-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/exit.c | 2 +- kernel/fork.c | 9 ++++++++- kernel/pid.c | 4 +++- 3 files changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf5..fed3a4db6f04 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -87,7 +87,7 @@ static void __exit_signal(struct task_struct *tsk) sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || - lockdep_is_held(&tasklist_lock)); + lockdep_tasklist_lock_is_held()); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..8691c540a470 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,7 +86,14 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -EXPORT_SYMBOL_GPL(tasklist_lock); + +#ifdef CONFIG_PROVE_RCU +int lockdep_tasklist_lock_is_held(void) +{ + return lockdep_is_held(&tasklist_lock); +} +EXPORT_SYMBOL_GPL(lockdep_tasklist_lock_is_held); +#endif /* #ifdef CONFIG_PROVE_RCU */ int nr_processes(void) { diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83f..b6064405f367 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -367,7 +367,9 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); + first = rcu_dereference_check(pid->tasks[type].first, + rcu_read_lock_held() || + lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } -- cgit v1.2.2 From cc5b83a9f884fe8722a275069a5a6fde39988455 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 07:46:59 -0800 Subject: rcu: Add control variables to lockdep_rcu_dereference() diagnostics Add the values of rcu_scheduler_active() and debug_locks() to the lockdep_rcu_dereference() output to help diagnose RCU lockdep splats that occur shortly after the scheduler starts. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267631219-8713-4-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/lockdep.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0c30d0455de1..681bc2e1e187 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3822,6 +3822,7 @@ void lockdep_rcu_dereference(const char *file, const int line) printk("%s:%d invoked rcu_dereference_check() without protection!\n", file, line); printk("\nother info that might help us debug this:\n\n"); + printk("\nrcu_scheduler_active = %d, debug_locks = %d\n", rcu_scheduler_active, debug_locks); lockdep_print_held_locks(curr); printk("\nstack backtrace:\n"); dump_stack(); -- cgit v1.2.2 From 8d53dd546f36073e0d29b0cfc24c665db301e3e7 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Mar 2010 17:50:18 -0800 Subject: rcu, ftrace: Fix RCU lockdep splat in ftrace_perf_buf_prepare() Change the pair of rcu_dereference() calls in ftrace_perf_buf_prepare() to rcu_dereference_sched(). Signed-off-by: Paul E. McKenney Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: Frederic Weisbecker LKML-Reference: <1267667418-32233-3-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_profile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index f0d693005075..c1cc3ab633de 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -138,9 +138,9 @@ __kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, cpu = smp_processor_id(); if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); + trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); else - trace_buf = rcu_dereference(perf_trace_buf); + trace_buf = rcu_dereference_sched(perf_trace_buf); if (!trace_buf) goto err; -- cgit v1.2.2 From 801c29fd1fdeb84f60241beb445ff5db154450ae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Mar 2010 20:02:19 -0500 Subject: function-graph: Fix unused reference to ftrace_set_func() The declaration of ftrace_set_func() is at the start of the ftrace.c file and wrapped with a #ifdef CONFIG_FUNCTION_GRAPH condition. If function graph tracing is enabled but CONFIG_DYNAMIC_FTRACE is not, a warning about that function being declared static and unused is given. This really should have been placed within the CONFIG_FUNCTION_GRAPH condition that uses ftrace_set_func(). Moving the declaration down fixes the warning and makes the code cleaner. Reported-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d996353473fd..d0407c9f368c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -85,10 +85,6 @@ ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub; ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); -#endif - static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { struct ftrace_ops *op = ftrace_list; @@ -2300,6 +2296,8 @@ __setup("ftrace_filter=", set_ftrace_filter); #ifdef CONFIG_FUNCTION_GRAPH_TRACER static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; +static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); + static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); -- cgit v1.2.2 From a094fe04c751698a18c3a0d376a3bdb117f1e0d8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 5 Mar 2010 20:08:58 -0500 Subject: function-graph: Use comment notation for func names of dangling '}' When a '}' does not have a matching function start, the name is printed within parenthesis. But this makes it confusing between ending '}' and function starts. This patch makes the function name appear in C comment notation. Old view: 3) 1.281 us | } (might_fault) 3) 3.620 us | } (filldir) 3) 5.251 us | } (call_filldir) 3) | call_filldir() { 3) | filldir() { New view: 3) 1.281 us | } /* might_fault */ 3) 3.620 us | } /* filldir */ 3) 5.251 us | } /* call_filldir */ 3) | call_filldir() { 3) | filldir() { Requested-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index e998a824e9db..7b1f24618d97 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -920,7 +920,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, if (!ret) return TRACE_TYPE_PARTIAL_LINE; } else { - ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); + ret = trace_seq_printf(s, "} /* %ps */\n", (void *)trace->func); if (!ret) return TRACE_TYPE_PARTIAL_LINE; } -- cgit v1.2.2 From 1acaa1b2d9b5904c9cce06122990a2d71046ce16 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 5 Mar 2010 18:23:50 -0300 Subject: tracing: Update the comm field in the right variable in update_max_tr The latency output showed: # | task: -3 (uid:0 nice:0 policy:1 rt_prio:99) The comm is missing in the "task:" and it looks like a minus 3 is the output. The correct display should be: # | task: migration/0-3 (uid:0 nice:0 policy:1 rt_prio:99) The problem is that the comm is being stored in the wrong data structure. The max_tr.data[cpu] is what stores the comm, not the tr->data[cpu]. Before this patch the max_tr.data[cpu]->comm was zeroed and the /debug/trace ended up showing just the '-' sign followed by the pid. Also remove a needless initialization of max_data. Signed-off-by: Arnaldo Carvalho de Melo LKML-Reference: <1267824230-23861-1-git-send-email-acme@infradead.org> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 032c57ca6502..6efd5cb3c252 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -592,7 +592,7 @@ static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data = tr->data[cpu]; + struct trace_array_cpu *max_data; max_tr.cpu = cpu; max_tr.time_start = data->preempt_timestamp; @@ -602,7 +602,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; - memcpy(data->comm, tsk->comm, TASK_COMM_LEN); + memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN); max_data->pid = tsk->pid; max_data->uid = task_uid(tsk); max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO; -- cgit v1.2.2 From 0e95017355dcf43031da6d0e360a748717e56df1 Mon Sep 17 00:00:00 2001 From: Tim Bird Date: Thu, 25 Feb 2010 15:36:43 -0800 Subject: function-graph: Add tracing_thresh support to function_graph tracer Add support for tracing_thresh to the function_graph tracer. This version of this feature isolates the checks into new entry and return functions, to avoid adding more conditional code into the main function_graph paths. When the tracing_thresh is set and the function graph tracer is enabled, only the functions that took longer than the time in microseconds that was set in tracing_thresh are recorded. To do this efficiently, only the function exits are recorded: [tracing]# echo 100 > tracing_thresh [tracing]# echo function_graph > current_tracer [tracing]# cat trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 1) ! 119.214 us | } /* smp_apic_timer_interrupt */ 1) <========== | 0) ! 101.527 us | } /* __rcu_process_callbacks */ 0) ! 126.461 us | } /* rcu_process_callbacks */ 0) ! 145.111 us | } /* __do_softirq */ 0) ! 149.667 us | } /* do_softirq */ 0) ! 168.817 us | } /* irq_exit */ 0) ! 248.254 us | } /* smp_apic_timer_interrupt */ Also, add support for specifying tracing_thresh on the kernel command line. When used like so: "tracing_thresh=200 ftrace=function_graph" this can be used to analyse system startup. It is important to disable tracing soon after boot, in order to avoid losing the trace data. Acked-by: Frederic Weisbecker Signed-off-by: Tim Bird LKML-Reference: <4B87098B.4040308@am.sony.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 20 ++++++++++++++++++-- kernel/trace/trace.h | 3 ++- kernel/trace/trace_functions_graph.c | 25 +++++++++++++++++++++++-- 3 files changed, 43 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6efd5cb3c252..ababedb4e87f 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -374,6 +374,21 @@ static int __init set_buf_size(char *str) } __setup("trace_buf_size=", set_buf_size); +static int __init set_tracing_thresh(char *str) +{ + unsigned long threshhold; + int ret; + + if (!str) + return 0; + ret = strict_strtoul(str, 0, &threshhold); + if (ret < 0) + return 0; + tracing_thresh = threshhold * 1000; + return 1; +} +__setup("tracing_thresh=", set_tracing_thresh); + unsigned long nsecs_to_usecs(unsigned long nsecs) { return nsecs / 1000; @@ -579,9 +594,10 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) static arch_spinlock_t ftrace_max_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +unsigned long __read_mostly tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE unsigned long __read_mostly tracing_max_latency; -unsigned long __read_mostly tracing_thresh; /* * Copy the new maximum trace into the separate maximum-trace @@ -4248,10 +4264,10 @@ static __init int tracer_init_debugfs(void) #ifdef CONFIG_TRACER_MAX_TRACE trace_create_file("tracing_max_latency", 0644, d_tracer, &tracing_max_latency, &tracing_max_lat_fops); +#endif trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); -#endif trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index fd05bcaf91b0..1bc8cd1431d7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -396,9 +396,10 @@ extern int process_new_ksym_entry(char *ksymname, int op, unsigned long addr); extern unsigned long nsecs_to_usecs(unsigned long nsecs); +extern unsigned long tracing_thresh; + #ifdef CONFIG_TRACER_MAX_TRACE extern unsigned long tracing_max_latency; -extern unsigned long tracing_thresh; void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu); void update_max_tr_single(struct trace_array *tr, diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7b1f24618d97..e9df04b60267 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -237,6 +237,14 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return ret; } +int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) +{ + if (tracing_thresh) + return 1; + else + return trace_graph_entry(trace); +} + static void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -290,13 +298,26 @@ void set_graph_array(struct trace_array *tr) smp_mb(); } +void trace_graph_thresh_return(struct ftrace_graph_ret *trace) +{ + if (tracing_thresh && + (trace->rettime - trace->calltime < tracing_thresh)) + return; + else + trace_graph_return(trace); +} + static int graph_trace_init(struct trace_array *tr) { int ret; set_graph_array(tr); - ret = register_ftrace_graph(&trace_graph_return, - &trace_graph_entry); + if (tracing_thresh) + ret = register_ftrace_graph(&trace_graph_thresh_return, + &trace_graph_thresh_entry); + else + ret = register_ftrace_graph(&trace_graph_return, + &trace_graph_entry); if (ret) return ret; tracing_start_cmdline_record(); -- cgit v1.2.2 From 984b3f5746ed2cde3d184651dabf26980f2b66e5 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 5 Mar 2010 13:41:37 -0800 Subject: bitops: rename for_each_bit() to for_each_set_bit() Rename for_each_bit to for_each_set_bit in the kernel source tree. To permit for_each_clear_bit(), should that ever be added. The patch includes a macro to map the old for_each_bit() onto the new for_each_set_bit(). This is a (very) temporary thing to ease the migration. [akpm@linux-foundation.org: add temporary for_each_bit()] Suggested-by: Alexey Dobriyan Suggested-by: Andrew Morton Signed-off-by: Akinobu Mita Cc: "David S. Miller" Cc: Russell King Cc: David Woodhouse Cc: Artem Bityutskiy Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched_cpupri.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c index eeb3506c4834..82095bf2099f 100644 --- a/kernel/sched_cpupri.c +++ b/kernel/sched_cpupri.c @@ -47,7 +47,7 @@ static int convert_prio(int prio) } #define for_each_cpupri_active(array, idx) \ - for_each_bit(idx, array, CPUPRI_NR_PRIORITIES) + for_each_set_bit(idx, array, CPUPRI_NR_PRIORITIES) /** * cpupri_find - find the best (lowest-pri) CPU in the system -- cgit v1.2.2 From d559db086ff5be9bcc259e5aa50bf3d881eaf1d1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:39 -0800 Subject: mm: clean up mm_counter Presently, per-mm statistics counter is defined by macro in sched.h This patch modifies it to - defined in mm.h as inlinf functions - use array instead of macro's name creation. This patch is for reducing patch size in future patch to modify implementation of per-mm counter. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 3 +-- kernel/tsacct.c | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 17bbf093356d..7616bcf107b9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -455,8 +455,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) (current->mm->flags & MMF_INIT_MASK) : default_dump_filter; mm->core_state = NULL; mm->nr_ptes = 0; - set_mm_counter(mm, file_rss, 0); - set_mm_counter(mm, anon_rss, 0); + memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); mm->free_area_cache = TASK_UNMAPPED_BASE; mm->cached_hole_size = ~0UL; diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 00d59d048edf..0a67e041edf8 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -21,6 +21,7 @@ #include #include #include +#include /* * fill in basic accounting fields -- cgit v1.2.2 From 34e55232e59f7b19050267a05ff1226e5cd122a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Fri, 5 Mar 2010 13:41:40 -0800 Subject: mm: avoid false sharing of mm_counter Considering the nature of per mm stats, it's the shared object among threads and can be a cache-miss point in the page fault path. This patch adds per-thread cache for mm_counter. RSS value will be counted into a struct in task_struct and synchronized with mm's one at events. Now, in this patch, the event is the number of calls to handle_mm_fault. Per-thread value is added to mm at each 64 calls. rough estimation with small benchmark on parallel thread (2threads) shows [before] 4.5 cache-miss/faults [after] 4.0 cache-miss/faults Anyway, the most contended object is mmap_sem if the number of threads grows. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Christoph Lameter Cc: Lee Schermerhorn Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 45ed043b8bf5..10d3c5d5ae44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -952,7 +952,8 @@ NORET_TYPE void do_exit(long code) preempt_count()); acct_update_integrals(tsk); - + /* sync mm's RSS info before statistics gathering */ + sync_mm_rss(tsk, tsk->mm); group_dead = atomic_dec_and_test(&tsk->signal->live); if (group_dead) { hrtimer_cancel(&tsk->signal->real_timer); -- cgit v1.2.2 From 5beb49305251e5669852ed541e8e2f2f7696c53e Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Fri, 5 Mar 2010 13:42:07 -0800 Subject: mm: change anon_vma linking to fix multi-process server scalability issue The old anon_vma code can lead to scalability issues with heavily forking workloads. Specifically, each anon_vma will be shared between the parent process and all its child processes. In a workload with 1000 child processes and a VMA with 1000 anonymous pages per process that get COWed, this leads to a system with a million anonymous pages in the same anon_vma, each of which is mapped in just one of the 1000 processes. However, the current rmap code needs to walk them all, leading to O(N) scanning complexity for each page. This can result in systems where one CPU is walking the page tables of 1000 processes in page_referenced_one, while all other CPUs are stuck on the anon_vma lock. This leads to catastrophic failure for a benchmark like AIM7, where the total number of processes can reach in the tens of thousands. Real workloads are still a factor 10 less process intensive than AIM7, but they are catching up. This patch changes the way anon_vmas and VMAs are linked, which allows us to associate multiple anon_vmas with a VMA. At fork time, each child process gets its own anon_vmas, in which its COWed pages will be instantiated. The parents' anon_vma is also linked to the VMA, because non-COWed pages could be present in any of the children. This reduces rmap scanning complexity to O(1) for the pages of the 1000 child processes, with O(N) complexity for at most 1/N pages in the system. This reduces the average scanning cost in heavily forking workloads from O(N) to 2. The only real complexity in this patch stems from the fact that linking a VMA to anon_vmas now involves memory allocations. This means vma_adjust can fail, if it needs to attach a VMA to anon_vma structures. This in turn means error handling needs to be added to the calling functions. A second source of complexity is that, because there can be multiple anon_vmas, the anon_vma linking in vma_adjust can no longer be done under "the" anon_vma lock. To prevent the rmap code from walking up an incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h to make sure it is impossible to compile a kernel that needs both symbolic values for the same bitflag. Some test results: Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test box with 16GB RAM and not quite enough IO), the system ends up running >99% in system time, with every CPU on the same anon_vma lock in the pageout code. With these changes, AIM7 hits the cross-over point around 29.7k users. This happens with ~99% IO wait time, there never seems to be any spike in system time. The anon_vma lock contention appears to be resolved. [akpm@linux-foundation.org: cleanups] Signed-off-by: Rik van Riel Cc: KOSAKI Motohiro Cc: Larry Woodman Cc: Lee Schermerhorn Cc: Minchan Kim Cc: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 7616bcf107b9..bab7b254ad39 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -329,15 +329,17 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) if (!tmp) goto fail_nomem; *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); pol = mpol_dup(vma_policy(mpnt)); retval = PTR_ERR(pol); if (IS_ERR(pol)) goto fail_nomem_policy; vma_set_policy(tmp, pol); + if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; tmp->vm_flags &= ~VM_LOCKED; tmp->vm_mm = mm; tmp->vm_next = NULL; - anon_vma_link(tmp); file = tmp->vm_file; if (file) { struct inode *inode = file->f_path.dentry->d_inode; @@ -392,6 +394,8 @@ out: flush_tlb_mm(oldmm); up_write(&oldmm->mmap_sem); return retval; +fail_nomem_anon_vma_fork: + mpol_put(pol); fail_nomem_policy: kmem_cache_free(vm_area_cachep, tmp); fail_nomem: -- cgit v1.2.2 From 452aa6999e6703ffbddd7f6ea124d3968915f3e3 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 5 Mar 2010 13:42:13 -0800 Subject: mm/pm: force GFP_NOIO during suspend/hibernation and resume There are quite a few GFP_KERNEL memory allocations made during suspend/hibernation and resume that may cause the system to hang, because the I/O operations they depend on cannot be completed due to the underlying devices being suspended. Avoid this problem by clearing the __GFP_IO and __GFP_FS bits in gfp_allowed_mask before suspend/hibernation and restoring the original values of these bits in gfp_allowed_mask durig the subsequent resume. [akpm@linux-foundation.org: fix CONFIG_PM=n linkage] Signed-off-by: Rafael J. Wysocki Reported-by: Maxim Levitsky Cc: Sebastian Ott Cc: Benjamin Herrenschmidt Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/hibernate.c | 9 +++++++++ kernel/power/suspend.c | 3 +++ 2 files changed, 12 insertions(+) (limited to 'kernel') diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index bbfe472d7524..da5288ec2392 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -323,6 +323,7 @@ static int create_image(int platform_mode) int hibernation_snapshot(int platform_mode) { int error; + gfp_t saved_mask; error = platform_begin(platform_mode); if (error) @@ -334,6 +335,7 @@ int hibernation_snapshot(int platform_mode) goto Close; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_FREEZE); if (error) goto Recover_platform; @@ -351,6 +353,7 @@ int hibernation_snapshot(int platform_mode) dpm_resume_end(in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: platform_end(platform_mode); @@ -445,14 +448,17 @@ static int resume_target_kernel(bool platform_mode) int hibernation_restore(int platform_mode) { int error; + gfp_t saved_mask; pm_prepare_console(); suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_QUIESCE); if (!error) { error = resume_target_kernel(platform_mode); dpm_resume_end(PMSG_RECOVER); } + set_gfp_allowed_mask(saved_mask); resume_console(); pm_restore_console(); return error; @@ -466,6 +472,7 @@ int hibernation_restore(int platform_mode) int hibernation_platform_enter(void) { int error; + gfp_t saved_mask; if (!hibernation_ops) return -ENOSYS; @@ -481,6 +488,7 @@ int hibernation_platform_enter(void) entering_platform_hibernation = true; suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); error = dpm_suspend_start(PMSG_HIBERNATE); if (error) { if (hibernation_ops->recover) @@ -518,6 +526,7 @@ int hibernation_platform_enter(void) Resume_devices: entering_platform_hibernation = false; dpm_resume_end(PMSG_RESTORE); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 6f10dfc2d3e9..44cce10b582d 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -189,6 +189,7 @@ static int suspend_enter(suspend_state_t state) int suspend_devices_and_enter(suspend_state_t state) { int error; + gfp_t saved_mask; if (!suspend_ops) return -ENOSYS; @@ -199,6 +200,7 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); + saved_mask = clear_gfp_allowed_mask(GFP_IOFS); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -215,6 +217,7 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); + set_gfp_allowed_mask(saved_mask); resume_console(); Close: if (suspend_ops->end) -- cgit v1.2.2 From 87d5e0236d9d688fb575e9e12232764ac617617c Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Fri, 5 Mar 2010 13:42:38 -0800 Subject: kernel/cpu.c: delete deprecated definition in cpu_up() Additional_cpus is only supported for IA64 now. X86_64 should not be included. Signed-off-by: Chen Gong Cc: Ingo Molnar Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 677f25376a38..f8cced2692b3 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -338,7 +338,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); -#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) +#if defined(CONFIG_IA64) printk(KERN_ERR "please check additional_cpus= boot " "parameter\n"); #endif -- cgit v1.2.2 From 5f1664f92b2247111b7d37e454a050b76ac61b7f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 5 Mar 2010 13:42:51 -0800 Subject: splice: comparing unsigned int < 0 "ret" needs to be signed or the error handling for splice_to_pipe() won't work correctly. Signed-off-by: Dan Carpenter Cc: Tom Zanussi Cc: Jens Axboe Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index c705a41b4ba3..3d97f2821611 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1215,14 +1215,14 @@ static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i) /* * subbuf_splice_actor - splice up to one subbuf's worth of data */ -static int subbuf_splice_actor(struct file *in, +static ssize_t subbuf_splice_actor(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags, int *nonpad_ret) { - unsigned int pidx, poff, total_len, subbuf_pages, nr_pages, ret; + unsigned int pidx, poff, total_len, subbuf_pages, nr_pages; struct rchan_buf *rbuf = in->private_data; unsigned int subbuf_size = rbuf->chan->subbuf_size; uint64_t pos = (uint64_t) *ppos; @@ -1241,6 +1241,7 @@ static int subbuf_splice_actor(struct file *in, .ops = &relay_pipe_buf_ops, .spd_release = relay_page_release, }; + ssize_t ret; if (rbuf->subbufs_produced == rbuf->subbufs_consumed) return 0; -- cgit v1.2.2 From 9c03c383563f147907f1a90cf16f1e190e2f4aae Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 5 Mar 2010 13:42:52 -0800 Subject: includecheck fix for kernel/params.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following 'make includecheck' warning: kernel/params.c: linux/string.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Cc: André Goddard Rosa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/params.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index cf1b69183127..8d95f5451b22 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -24,7 +24,6 @@ #include #include #include -#include #if 0 #define DEBUGP printk -- cgit v1.2.2 From f3abd4f9531becb71626bd206955d47d5ea54f06 Mon Sep 17 00:00:00 2001 From: Thiago Farina Date: Fri, 5 Mar 2010 13:42:52 -0800 Subject: kernel/exit.c: fix shadows sparse warning kernel/exit.c:1183:26: warning: symbol 'status' shadows an earlier one kernel/exit.c:1173:21: originally declared here Signed-off-by: Thiago Farina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 10d3c5d5ae44..ce1e48c2d93d 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1189,7 +1189,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) if (unlikely(wo->wo_flags & WNOWAIT)) { int exit_code = p->exit_code; - int why, status; + int why; get_task_struct(p); read_unlock(&tasklist_lock); -- cgit v1.2.2 From d4bb527438b4181cd3c564ae04dd344c381283a1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 5 Mar 2010 13:42:53 -0800 Subject: posix-cpu-timers: cleanup rlimits usage Fetch rlimit (both hard and soft) values only once and work on them. It removes many accesses through sig structure and makes the code cleaner. Mostly a preparation for writable resource limits support. Signed-off-by: Jiri Slaby Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-cpu-timers.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..dbb16bf15c45 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -982,6 +982,7 @@ static void check_thread_timers(struct task_struct *tsk, int maxfire; struct list_head *timers = tsk->cpu_timers; struct signal_struct *const sig = tsk->signal; + unsigned long soft; maxfire = 20; tsk->cputime_expires.prof_exp = cputime_zero; @@ -1030,9 +1031,9 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { + soft = sig->rlim[RLIMIT_RTTIME].rlim_cur; + if (soft != RLIM_INFINITY) { unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; - unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -1043,14 +1044,13 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { + if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { /* * At the soft limit, send a SIGXCPU every second. */ - if (sig->rlim[RLIMIT_RTTIME].rlim_cur - < sig->rlim[RLIMIT_RTTIME].rlim_max) { - sig->rlim[RLIMIT_RTTIME].rlim_cur += - USEC_PER_SEC; + if (soft < hard) { + soft += USEC_PER_SEC; + sig->rlim[RLIMIT_RTTIME].rlim_cur = soft; } printk(KERN_INFO "RT Watchdog Timeout: %s[%d]\n", @@ -1121,6 +1121,7 @@ static void check_process_timers(struct task_struct *tsk, unsigned long long sum_sched_runtime, sched_expires; struct list_head *timers = sig->cpu_timers; struct task_cputime cputime; + unsigned long soft; /* * Don't sample the current process CPU clocks if there are no timers. @@ -1193,11 +1194,12 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { + soft = sig->rlim[RLIMIT_CPU].rlim_cur; + if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); + unsigned long hard = sig->rlim[RLIMIT_CPU].rlim_max; cputime_t x; - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_max) { + if (psecs >= hard) { /* * At the hard limit, we just die. * No need to calculate anything else now. @@ -1205,17 +1207,17 @@ static void check_process_timers(struct task_struct *tsk, __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); return; } - if (psecs >= sig->rlim[RLIMIT_CPU].rlim_cur) { + if (psecs >= soft) { /* * At the soft limit, send a SIGXCPU every second. */ __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); - if (sig->rlim[RLIMIT_CPU].rlim_cur - < sig->rlim[RLIMIT_CPU].rlim_max) { - sig->rlim[RLIMIT_CPU].rlim_cur++; + if (soft < hard) { + soft++; + sig->rlim[RLIMIT_CPU].rlim_cur = soft; } } - x = secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + x = secs_to_cputime(soft); if (cputime_eq(prof_expires, cputime_zero) || cputime_lt(x, prof_expires)) { prof_expires = x; -- cgit v1.2.2 From 78d7d407b62a021e6d2e8dc24c0b90e390ab58a1 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 5 Mar 2010 13:42:54 -0800 Subject: kernel core: use helpers for rlimits Make sure compiler won't do weird things with limits. E.g. fetching them twice may return 2 different values after writable limits are implemented. I.e. either use rlimit helpers added in commit 3e10e716abf3 ("resource: add helpers for fetching rlimits") or ACCESS_ONCE if not applicable. Signed-off-by: Jiri Slaby Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: john stultz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++++---- kernel/perf_event.c | 2 +- kernel/posix-cpu-timers.c | 10 ++++++---- kernel/sched.c | 4 ++-- kernel/sched_rt.c | 5 +++-- kernel/signal.c | 2 +- kernel/sys.c | 3 +-- 7 files changed, 20 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index bab7b254ad39..b0ec34abc0bb 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -828,6 +828,8 @@ void __cleanup_sighand(struct sighand_struct *sighand) */ static void posix_cpu_timers_init_group(struct signal_struct *sig) { + unsigned long cpu_limit; + /* Thread group counters. */ thread_group_cputime_init(sig); @@ -842,9 +844,9 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) sig->cputime_expires.virt_exp = cputime_zero; sig->cputime_expires.sched_exp = 0; - if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { - sig->cputime_expires.prof_exp = - secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); + cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); + if (cpu_limit != RLIM_INFINITY) { + sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); sig->cputimer.running = 1; } @@ -1037,7 +1039,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #endif retval = -EAGAIN; if (atomic_read(&p->real_cred->user->processes) >= - p->signal->rlim[RLIMIT_NPROC].rlim_cur) { + task_rlimit(p, RLIMIT_NPROC)) { if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) && p->real_cred->user != INIT_USER) goto bad_fork_free; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a661e7991865..8e352c756ba7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2610,7 +2610,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (user_locked > user_lock_limit) extra = user_locked - user_lock_limit; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; locked = vma->vm_mm->locked_vm + extra; diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index dbb16bf15c45..1a22dfd42df9 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1031,9 +1031,10 @@ static void check_thread_timers(struct task_struct *tsk, /* * Check for the special case thread timers. */ - soft = sig->rlim[RLIMIT_RTTIME].rlim_cur; + soft = ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_cur); if (soft != RLIM_INFINITY) { - unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_RTTIME].rlim_max); if (hard != RLIM_INFINITY && tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { @@ -1194,10 +1195,11 @@ static void check_process_timers(struct task_struct *tsk, SIGPROF); check_cpu_itimer(tsk, &sig->it[CPUCLOCK_VIRT], &virt_expires, utime, SIGVTALRM); - soft = sig->rlim[RLIMIT_CPU].rlim_cur; + soft = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (soft != RLIM_INFINITY) { unsigned long psecs = cputime_to_secs(ptime); - unsigned long hard = sig->rlim[RLIMIT_CPU].rlim_max; + unsigned long hard = + ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_max); cputime_t x; if (psecs >= hard) { /* diff --git a/kernel/sched.c b/kernel/sched.c index abb36b16b93b..b47ceeec1a91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4353,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice) /* convert nice value [19,-20] to rlimit style value [1,40] */ int nice_rlim = 20 - nice; - return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || + return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE)); } @@ -4530,7 +4530,7 @@ recheck: if (!lock_task_sighand(p, &flags)) return -ESRCH; - rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur; + rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); unlock_task_sighand(p, &flags); /* can't set/change the rt policy */ diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index bf3e38fdbe6d..5a6ed1f0990a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1662,8 +1662,9 @@ static void watchdog(struct rq *rq, struct task_struct *p) if (!p->signal) return; - soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; - hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; + /* max may change after cur was read, this will be fixed next tick */ + soft = task_rlimit(p, RLIMIT_RTTIME); + hard = task_rlimit_max(p, RLIMIT_RTTIME); if (soft != RLIM_INFINITY) { unsigned long next; diff --git a/kernel/signal.c b/kernel/signal.c index 5bb9baffa4f1..dbd7fe073c55 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -245,7 +245,7 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi if (override_rlimit || atomic_read(&user->sigpending) <= - t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) { + task_rlimit(t, RLIMIT_SIGPENDING)) { q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); diff --git a/kernel/sys.c b/kernel/sys.c index 877fe4f8e05e..9814e43fb23b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -571,8 +571,7 @@ static int set_user(struct cred *new) if (!new_user) return -EAGAIN; - if (atomic_read(&new_user->processes) >= - current->signal->rlim[RLIMIT_NPROC].rlim_cur && + if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && new_user != INIT_USER) { free_uid(new_user); return -EAGAIN; -- cgit v1.2.2 From 8aeee85a29e27e043db582bf2ae8e5f42767934f Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 5 Mar 2010 13:42:55 -0800 Subject: panic: fix panic_timeout accuracy when running on a hypervisor I've had some complaints about panic_timeout being wildly innacurate on shared processor PowerPC partitions (a 3 minute panic_timeout taking 30 minutes). The problem is we loop on mdelay(1) and with a 1ms in 10ms hypervisor timeslice each of these will take 10ms (ie 10x) longer. I expect other platforms with shared processor hypervisors will see the same issue. This patch keeps the old behaviour if we have a panic_blink (only keyboard LEDs right now) and does 1 second mdelays if we don't. Signed-off-by: Anton Blanchard Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 46 ++++++++++++++++++++++++++++++---------------- 1 file changed, 30 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index c787333282b8..13d966b4c14a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,15 +36,36 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list); EXPORT_SYMBOL(panic_notifier_list); -static long no_blink(long time) -{ - return 0; -} - /* Returns how long it waited in ms */ long (*panic_blink)(long time); EXPORT_SYMBOL(panic_blink); +static void panic_blink_one_second(void) +{ + static long i = 0, end; + + if (panic_blink) { + end = i + MSEC_PER_SEC; + + while (i < end) { + i += panic_blink(i); + mdelay(1); + i++; + } + } else { + /* + * When running under a hypervisor a small mdelay may get + * rounded up to the hypervisor timeslice. For example, with + * a 1ms in 10ms hypervisor timeslice we might inflate a + * mdelay(1) loop by 10x. + * + * If we have nothing to blink, spin on 1 second calls to + * mdelay to avoid this. + */ + mdelay(MSEC_PER_SEC); + } +} + /** * panic - halt the system * @fmt: The text string to print @@ -95,9 +116,6 @@ NORET_TYPE void panic(const char * fmt, ...) bust_spinlocks(0); - if (!panic_blink) - panic_blink = no_blink; - if (panic_timeout > 0) { /* * Delay timeout seconds before rebooting the machine. @@ -105,11 +123,9 @@ NORET_TYPE void panic(const char * fmt, ...) */ printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout); - for (i = 0; i < panic_timeout*1000; ) { + for (i = 0; i < panic_timeout; i++) { touch_nmi_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + panic_blink_one_second(); } /* * This will not be a clean reboot, with everything @@ -135,11 +151,9 @@ NORET_TYPE void panic(const char * fmt, ...) } #endif local_irq_enable(); - for (i = 0; ; ) { + while (1) { touch_softlockup_watchdog(); - i += panic_blink(i); - mdelay(1); - i++; + panic_blink_one_second(); } } -- cgit v1.2.2 From 9728e5d6e6c432ee8487c63ce6e479e2474d9945 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 5 Mar 2010 13:42:56 -0800 Subject: kernel/pid.c: update comment on find_task_by_pid_ns tasklist_lock does protect the task and its pid, it can't go away. The problem is that find_pid_ns() itself is unsafe without rcu lock, it can race with copy_process()->free_pid(any_pid). Protecting copy_process()->free_pid(any_pid) with tasklist_lock would make it possible to call find_task_by_pid_ns() under tasklist safely, but we don't do so because we are trying to get rid of the read_lock sites of tasklist_lock. Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Cc: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid.c b/kernel/pid.c index b08e697cd83f..86b296943e5f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -376,7 +376,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) EXPORT_SYMBOL(pid_task); /* - * Must be called under rcu_read_lock() or with tasklist_lock read-held. + * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { -- cgit v1.2.2 From cea83886dde49fd7524e9f4a246dd5dff4ad236a Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Fri, 5 Mar 2010 13:42:58 -0800 Subject: printk: avoid warning when CONFIG_PRINTK is disabled kernel/printk.c:72: warning: `saved_console_loglevel' defined but not used Signed-off-by: Gustavo F. Padovan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 40674122ecf2..75077ad0b537 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -70,8 +70,6 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; -static int saved_console_loglevel = -1; - /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -146,6 +144,7 @@ static char __log_buf[__LOG_BUF_LEN]; static char *log_buf = __log_buf; static int log_buf_len = __LOG_BUF_LEN; static unsigned logged_chars; /* Number of chars produced since last read+clear operation */ +static int saved_console_loglevel = -1; #ifdef CONFIG_KEXEC /* -- cgit v1.2.2 From 1fcccbac89f5bbc5e41aa72086960059fce372da Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA Date: Fri, 5 Mar 2010 13:44:07 -0800 Subject: elf coredump: replace ELF_CORE_EXTRA_* macros by functions elf_core_dump() and elf_fdpic_core_dump() use #ifdef and the corresponding macro for hiding _multiline_ logics in functions. This patch removes #ifdef and replaces ELF_CORE_EXTRA_* by corresponding functions. For architectures not implemeonting ELF_CORE_EXTRA_*, we use weak functions in order to reduce a range of modification. This cleanup is for my next patches, but I think this cleanup itself is worth doing regardless of my firnal purpose. Signed-off-by: Daisuke HATAYAMA Cc: "Luck, Tony" Cc: Jeff Dike Cc: David Howells Cc: Greg Ungerer Cc: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Alexander Viro Cc: Andi Kleen Cc: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/Makefile | 3 +++ kernel/elfcore.c | 23 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 kernel/elfcore.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 7b974699f8c2..a987aa1676b5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -91,6 +91,9 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o obj-$(CONFIG_TRACEPOINTS) += tracepoint.o obj-$(CONFIG_LATENCYTOP) += latencytop.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o +obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ diff --git a/kernel/elfcore.c b/kernel/elfcore.c new file mode 100644 index 000000000000..5445741f4b4c --- /dev/null +++ b/kernel/elfcore.c @@ -0,0 +1,23 @@ +#include +#include +#include + +#include + + +Elf_Half __weak elf_core_extra_phdrs(void) +{ + return 0; +} + +int __weak elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + return 1; +} + +int __weak elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + return 1; +} -- cgit v1.2.2 From 8d9032bbe4671dc481261ccd4e161cd96e54b118 Mon Sep 17 00:00:00 2001 From: Daisuke HATAYAMA Date: Fri, 5 Mar 2010 13:44:10 -0800 Subject: elf coredump: add extended numbering support The current ELF dumper implementation can produce broken corefiles if program headers exceed 65535. This number is determined by the number of vmas which the process have. In particular, some extreme programs may use more than 65535 vmas. (If you google max_map_count, you can find some users facing this problem.) This kind of program never be able to generate correct coredumps. This patch implements ``extended numbering'' that uses sh_info field of the first section header instead of e_phnum field in order to represent upto 4294967295 vmas. This is supported by AMD64-ABI(http://www.x86-64.org/documentation.html) and Solaris(http://docs.sun.com/app/docs/doc/817-1984/). Of course, we are preparing patches for gdb and binutils. Signed-off-by: Daisuke HATAYAMA Cc: "Luck, Tony" Cc: Jeff Dike Cc: David Howells Cc: Greg Ungerer Cc: Roland McGrath Cc: Oleg Nesterov Cc: Ingo Molnar Cc: Alexander Viro Cc: Andi Kleen Cc: Alan Cox Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/elfcore.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/elfcore.c b/kernel/elfcore.c index 5445741f4b4c..ff915efef66d 100644 --- a/kernel/elfcore.c +++ b/kernel/elfcore.c @@ -21,3 +21,8 @@ int __weak elf_core_write_extra_data(struct file *file, size_t *size, { return 1; } + +size_t __weak elf_core_extra_data_size(void) +{ + return 0; +} -- cgit v1.2.2 From c9be0a36f9bf392a7984473124a67a12964df11f Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 5 Jan 2010 12:47:58 +0100 Subject: sysdev: Pass attribute in sysdev_class attributes show/store Passing the attribute to the low level IO functions allows all kinds of cleanups, by sharing low level IO code without requiring an own function for every piece of data. Also drivers can extend the attributes with own data fields and use that in the low level function. Similar to sysdev_attributes and normal attributes. This is a tree-wide sweep, converting everything in one go. No functional changes in this patch other than passing the new argument everywhere. Tested on x86, the non x86 parts are uncompiled. Signed-off-by: Andi Kleen Signed-off-by: Greg Kroah-Hartman --- kernel/perf_event.c | 13 ++++++++++--- kernel/sched.c | 4 ++++ 2 files changed, 14 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8e352c756ba7..f40560b86544 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5481,13 +5481,16 @@ void __init perf_event_init(void) register_cpu_notifier(&perf_cpu_nb); } -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_reserved_percpu); } static ssize_t perf_set_reserve_percpu(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { @@ -5516,13 +5519,17 @@ perf_set_reserve_percpu(struct sysdev_class *class, return count; } -static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +static ssize_t perf_show_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + char *buf) { return sprintf(buf, "%d\n", perf_overcommit); } static ssize_t -perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +perf_set_overcommit(struct sysdev_class *class, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) { unsigned long val; int err; diff --git a/kernel/sched.c b/kernel/sched.c index b47ceeec1a91..150b6988de49 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7406,11 +7406,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_mc_power_savings); } static ssize_t sched_mc_power_savings_store(struct sysdev_class *class, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 0); @@ -7422,11 +7424,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644, #ifdef CONFIG_SCHED_SMT static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, char *page) { return sprintf(page, "%u\n", sched_smt_power_savings); } static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev, + struct sysdev_class_attribute *attr, const char *buf, size_t count) { return sched_power_savings_store(buf, count, 1); -- cgit v1.2.2 From 9cd43611ccfb46632bfa7d19f688924ea93f1613 Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Thu, 31 Dec 2009 14:52:51 +0100 Subject: kobject: Constify struct kset_uevent_ops Constify struct kset_uevent_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 8d95f5451b22..48370be3c0a1 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -736,7 +736,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops module_uevent_ops = { +static const struct kset_uevent_ops module_uevent_ops = { .filter = uevent_filter, }; -- cgit v1.2.2 From 52cf25d0ab7f78eeecc59ac652ed5090f69b619e Mon Sep 17 00:00:00 2001 From: Emese Revfy Date: Tue, 19 Jan 2010 02:58:23 +0100 Subject: Driver core: Constify struct sysfs_ops in struct kobj_type Constify struct sysfs_ops. This is part of the ops structure constification effort started by Arjan van de Ven et al. Benefits of this constification: * prevents modification of data that is shared (referenced) by many other structure instances at runtime * detects/prevents accidental (but not intentional) modification attempts on archs that enforce read-only kernel data at runtime * potentially better optimized code as the compiler can assume that the const data cannot be changed * the compiler/linker move const data into .rodata and therefore exclude them from false sharing Signed-off-by: Emese Revfy Acked-by: David Teigland Acked-by: Matt Domsch Acked-by: Maciej Sosnowski Acked-by: Hans J. Koch Acked-by: Pekka Enberg Acked-by: Jens Axboe Acked-by: Stephen Hemminger Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 48370be3c0a1..68396d73c838 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -722,7 +722,7 @@ static ssize_t module_attr_store(struct kobject *kobj, return ret; } -static struct sysfs_ops module_sysfs_ops = { +static const struct sysfs_ops module_sysfs_ops = { .show = module_attr_show, .store = module_attr_store, }; -- cgit v1.2.2 From a07e4156a2ee6359d31a44946d7ee7f85dbf6bca Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 11 Feb 2010 15:23:05 -0800 Subject: sysfs: Use sysfs_attr_init and sysfs_bin_attr_init on dynamic attributes These are the non-static sysfs attributes that exist on my test machine. Fix them to use sysfs_attr_init or sysfs_bin_attr_init as appropriate. It simply requires making a sysfs attribute present to see this. So this is a little bit tedious but otherwise not too bad. Signed-off-by: Eric W. Biederman Acked-by: WANG Cong Cc: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 68396d73c838..d55a53ec9234 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -516,6 +516,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, new->grp.attrs = attrs; /* Tack new one on the end. */ + sysfs_attr_init(&new->attrs[num].mattr.attr); new->attrs[num].param = kp; new->attrs[num].mattr.show = param_attr_show; new->attrs[num].mattr.store = param_attr_store; -- cgit v1.2.2 From 361795b1eb7c08e9e65a2ebb4a4e536294d378a2 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 12 Feb 2010 13:41:56 -0800 Subject: sysfs: Use sysfs_attr_init and sysfs_bin_attr_init on module dynamic attributes A little more whack-a-mole annotating the dynamic sysfs attributes. I had everything built into my earlier test kernel, and so I missed these. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- kernel/module.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index e5538d5f00ad..c968d3606dca 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1085,6 +1085,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect, if (sattr->name == NULL) goto out; sect_attrs->nsections++; + sysfs_attr_init(&sattr->mattr.attr); sattr->mattr.show = module_sect_show; sattr->mattr.store = NULL; sattr->mattr.attr.name = sattr->name; @@ -1180,6 +1181,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, if (sect_empty(&sechdrs[i])) continue; if (sechdrs[i].sh_type == SHT_NOTE) { + sysfs_bin_attr_init(nattr); nattr->attr.name = mod->sect_attrs->attrs[loaded].name; nattr->attr.mode = S_IRUGO; nattr->size = sechdrs[i].sh_size; @@ -1252,6 +1254,7 @@ int module_add_modinfo_attrs(struct module *mod) if (!attr->test || (attr->test && attr->test(mod))) { memcpy(temp_attr, attr, sizeof(*temp_attr)); + sysfs_attr_init(&temp_attr->attr); error = sysfs_create_file(&mod->mkobj.kobj,&temp_attr->attr); ++temp_attr; } -- cgit v1.2.2 From dc1d628a67a8f042e711ea5accc0beedc3ef0092 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 3 Mar 2010 15:55:04 +0100 Subject: perf: Provide generic perf_sample_data initialization This makes it easier to extend perf_sample_data and fixes a bug on arm and sparc, which failed to set ->raw to NULL, which can cause crashes when combined with PERF_SAMPLE_RAW. It also optimizes PowerPC and tracepoint, because the struct initialization is forced to zero out the whole structure. Signed-off-by: Peter Zijlstra Acked-by: Jean Pihet Reviewed-by: Frederic Weisbecker Acked-by: David S. Miller Cc: Jamie Iles Cc: Paul Mackerras Cc: Stephane Eranian Cc: stable@kernel.org LKML-Reference: <20100304140100.315416040@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e68745053013..4393b9e73740 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4108,8 +4108,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, if (rctx < 0) return; - data.addr = addr; - data.raw = NULL; + perf_sample_data_init(&data, addr); do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); @@ -4154,11 +4153,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) struct perf_event *event; u64 period; - event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); event->pmu->read(event); - data.addr = 0; - data.raw = NULL; + perf_sample_data_init(&data, 0); data.period = event->hw.last_period; regs = get_irq_regs(); /* @@ -4322,17 +4320,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tp_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { + struct pt_regs *regs = get_irq_regs(); + struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, .data = record, }; - struct perf_sample_data data = { - .addr = addr, - .raw = &raw, - }; - - struct pt_regs *regs = get_irq_regs(); + perf_sample_data_init(&data, addr); + data.raw = &raw; if (!regs) regs = task_pt_regs(current); @@ -4448,8 +4444,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; - sample.raw = NULL; - sample.addr = bp->attr.bp_addr; + perf_sample_data_init(&sample, bp->attr.bp_addr); if (!perf_exclude_event(bp, regs)) perf_swevent_add(bp, 1, 1, &sample, regs); -- cgit v1.2.2 From 3f6da3905398826d85731247e7fbcf53400c18bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 5 Mar 2010 13:01:18 +0100 Subject: perf: Rework and fix the arch CPU-hotplug hooks Remove the hw_perf_event_*() hotplug hooks in favour of per PMU hotplug notifiers. This has the advantage of reducing the static weak interface as well as exposing all hotplug actions to the PMU. Use this to fix x86 hotplug usage where we did things in ONLINE which should have been done in UP_PREPARE or STARTING. Signed-off-by: Peter Zijlstra Cc: Paul Mundt Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: <20100305154128.736225361@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 4393b9e73740..73329dedb5ad 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -81,10 +81,6 @@ extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } -void __weak hw_perf_event_setup(int cpu) { barrier(); } -void __weak hw_perf_event_setup_online(int cpu) { barrier(); } -void __weak hw_perf_event_setup_offline(int cpu) { barrier(); } - int __weak hw_perf_group_sched_in(struct perf_event *group_leader, struct perf_cpu_context *cpuctx, @@ -5382,8 +5378,6 @@ static void __cpuinit perf_event_init_cpu(int cpu) spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; spin_unlock(&perf_resource_lock); - - hw_perf_event_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU @@ -5423,20 +5417,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_event_init_cpu(cpu); break; - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - hw_perf_event_setup_online(cpu); - break; - case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_event_exit_cpu(cpu); break; - case CPU_DEAD: - hw_perf_event_setup_offline(cpu); - break; - default: break; } -- cgit v1.2.2 From 32975a4f114be52286f9a5bf6c230dbb8c0e1903 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 6 Mar 2010 19:49:19 +0100 Subject: perf: Optimize perf_disable Currently we always call hw_perf_disable(), even if its already disabled, this seems superflous, esp. since it cannot be made NMI safe (see further patches). Signed-off-by: Peter Zijlstra Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 73329dedb5ad..d8108465397d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -93,25 +93,15 @@ void __weak perf_event_print_debug(void) { } static DEFINE_PER_CPU(int, perf_disable_count); -void __perf_disable(void) -{ - __get_cpu_var(perf_disable_count)++; -} - -bool __perf_enable(void) -{ - return !--__get_cpu_var(perf_disable_count); -} - void perf_disable(void) { - __perf_disable(); - hw_perf_disable(); + if (!__get_cpu_var(perf_disable_count)++) + hw_perf_disable(); } void perf_enable(void) { - if (__perf_enable()) + if (!--__get_cpu_var(perf_disable_count)) hw_perf_enable(); } -- cgit v1.2.2 From d4944a06666054707d23e11888e480af239e5abf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 8 Mar 2010 13:51:20 +0100 Subject: perf: Provide better condition for event rotation Try to avoid useless rotation and PMU disables. [ Could be improved by keeping a nr_runnable count to better account for the < PERF_STAT_INACTIVE counters ] Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: paulus@samba.org Cc: eranian@google.com Cc: robert.richter@amd.com Cc: fweisbec@gmail.com LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d8108465397d..52c69a34d697 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1524,12 +1524,15 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); + perf_disable(); event->pmu->unthrottle(event); + perf_enable(); } if (!event->attr.freq || !event->attr.sample_freq) continue; + perf_disable(); event->pmu->read(event); now = atomic64_read(&event->count); delta = now - hwc->freq_count_stamp; @@ -1537,6 +1540,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (delta > 0) perf_adjust_period(event, TICK_NSEC, delta); + perf_enable(); } raw_spin_unlock(&ctx->lock); } @@ -1546,9 +1550,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ static void rotate_ctx(struct perf_event_context *ctx) { - if (!ctx->nr_events) - return; - raw_spin_lock(&ctx->lock); /* Rotate the first entry last of non-pinned groups */ @@ -1561,19 +1562,28 @@ void perf_event_task_tick(struct task_struct *curr) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + int rotate = 0; if (!atomic_read(&nr_events)) return; cpuctx = &__get_cpu_var(perf_cpu_context); - ctx = curr->perf_event_ctxp; + if (cpuctx->ctx.nr_events && + cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; - perf_disable(); + ctx = curr->perf_event_ctxp; + if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) + rotate = 1; perf_ctx_adjust_freq(&cpuctx->ctx); if (ctx) perf_ctx_adjust_freq(ctx); + if (!rotate) + return; + + perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1585,7 +1595,6 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); } -- cgit v1.2.2 From db2c4c7791cd04512093d05afc693c3511a65fd7 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 2 Feb 2010 23:34:40 +0100 Subject: lockdep: Move lock events under lockdep recursion protection There are rcu locked read side areas in the path where we submit a trace event. And these rcu_read_(un)lock() trigger lock events, which create recursive events. One pair in do_perf_sw_event: __lock_acquire | |--96.11%-- lock_acquire | | | |--27.21%-- do_perf_sw_event | | perf_tp_event | | | | | |--49.62%-- ftrace_profile_lock_release | | | lock_release | | | | | | | |--33.85%-- _raw_spin_unlock Another pair in perf_output_begin/end: __lock_acquire |--23.40%-- perf_output_begin | | __perf_event_overflow | | perf_swevent_overflow | | perf_swevent_add | | perf_swevent_ctx_event | | do_perf_sw_event | | perf_tp_event | | | | | |--55.37%-- ftrace_profile_lock_acquire | | | lock_acquire | | | | | | | |--37.31%-- _raw_spin_lock The problem is not that much the trace recursion itself, as we have a recursion protection already (though it's always wasteful to recurse). But the trace events are outside the lockdep recursion protection, then each lockdep event triggers a lock trace, which will trigger two other lockdep events. Here the recursive lock trace event won't be taken because of the trace recursion, so the recursion stops there but lockdep will still analyse these new events: To sum up, for each lockdep events we have: lock_*() | trace lock_acquire | ----- rcu_read_lock() | | | lock_acquire() | | | trace_lock_acquire() (stopped) | | | lockdep analyze | ----- rcu_read_unlock() | lock_release | trace_lock_release() (stopped) | lockdep analyze And you can repeat the above two times as we have two rcu read side sections when we submit an event. This is fixed in this patch by moving the lock trace event under the lockdep recursion protection. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Paul Mackerras Cc: Hitoshi Mitake Cc: Li Zefan Cc: Lai Jiangshan Cc: Masami Hiramatsu Cc: Jens Axboe --- kernel/lockdep.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 0c30d0455de1..65b5f5b7c298 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3211,8 +3211,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, { unsigned long flags; - trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); - if (unlikely(current->lockdep_recursion)) return; @@ -3220,6 +3218,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, check_flags(flags); current->lockdep_recursion = 1; + trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip); __lock_acquire(lock, subclass, trylock, read, check, irqs_disabled_flags(flags), nest_lock, ip, 0); current->lockdep_recursion = 0; @@ -3232,14 +3231,13 @@ void lock_release(struct lockdep_map *lock, int nested, { unsigned long flags; - trace_lock_release(lock, nested, ip); - if (unlikely(current->lockdep_recursion)) return; raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_release(lock, nested, ip); __lock_release(lock, nested, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); @@ -3413,8 +3411,6 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_contended(lock, ip); - if (unlikely(!lock_stat)) return; @@ -3424,6 +3420,7 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) raw_local_irq_save(flags); check_flags(flags); current->lockdep_recursion = 1; + trace_lock_contended(lock, ip); __lock_contended(lock, ip); current->lockdep_recursion = 0; raw_local_irq_restore(flags); -- cgit v1.2.2 From 5331d7b84613b8325362dde53dc2bff2fb87d351 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 4 Mar 2010 21:15:56 +0100 Subject: perf: Introduce new perf_fetch_caller_regs() for hot regs snapshot Events that trigger overflows by interrupting a context can use get_irq_regs() or task_pt_regs() to retrieve the state when the event triggered. But this is not the case for some other class of events like trace events as tracepoints are executed in the same context than the code that triggered the event. It means we need a different api to capture the regs there, namely we need a hot snapshot to get the most important informations for perf: the instruction pointer to get the event origin, the frame pointer for the callchain, the code segment for user_mode() tests (we always use __KERNEL_CS as trace events always occur from the kernel) and the eflags for further purposes. v2: rename perf_save_regs to perf_fetch_caller_regs as per Masami's suggestion. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Jason Baron Cc: Archs --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 52c69a34d697..359d7f690c2b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2786,6 +2786,11 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +__weak +void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +{ +} + /* * Output */ -- cgit v1.2.2 From c530665c31c0140b74ca7689e7f836177796e5bd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 3 Mar 2010 07:16:16 +0100 Subject: perf: Take a hot regs snapshot for trace events We are taking a wrong regs snapshot when a trace event triggers. Either we use get_irq_regs(), which gives us the interrupted registers if we are in an interrupt, or we use task_pt_regs() which gives us the state before we entered the kernel, assuming we are lucky enough to be no kernel thread, in which case task_pt_regs() returns the initial set of regs when the kernel thread was started. What we want is different. We need a hot snapshot of the regs, so that we can get the instruction pointer to record in the sample, the frame pointer for the callchain, and some other things. Let's use the new perf_fetch_caller_regs() for that. Comparison with perf record -e lock: -R -a -f -g Before: perf [kernel] [k] __do_softirq | --- __do_softirq | |--55.16%-- __open | --44.84%-- __write_nocancel After: perf [kernel] [k] perf_tp_event | --- perf_tp_event | |--41.07%-- lock_acquire | | | |--39.36%-- _raw_spin_lock | | | | | |--7.81%-- hrtimer_interrupt | | | smp_apic_timer_interrupt | | | apic_timer_interrupt The old case was producing unreliable callchains. Now having right frame and instruction pointers, we have the trace we want. Also syscalls and kprobe events already have the right regs, let's use them instead of wasting a retrieval. v2: Follow the rename perf_save_regs() -> perf_fetch_caller_regs() Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Arnaldo Carvalho de Melo Cc: Masami Hiramatsu Cc: Jason Baron Cc: Archs --- kernel/perf_event.c | 8 ++------ kernel/trace/trace_event_profile.c | 3 ++- kernel/trace/trace_kprobe.c | 5 +++-- kernel/trace/trace_syscalls.c | 4 ++-- 4 files changed, 9 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 359d7f690c2b..45b4b6e55891 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4318,9 +4318,8 @@ static const struct pmu perf_ops_task_clock = { #ifdef CONFIG_EVENT_TRACING void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size) + int entry_size, struct pt_regs *regs) { - struct pt_regs *regs = get_irq_regs(); struct perf_sample_data data; struct perf_raw_record raw = { .size = entry_size, @@ -4330,12 +4329,9 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, perf_sample_data_init(&data, addr); data.raw = &raw; - if (!regs) - regs = task_pt_regs(current); - /* Trace events already protected against recursion */ do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + &data, regs); } EXPORT_SYMBOL_GPL(perf_tp_event); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index f0d693005075..e66d21e15a0f 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -2,13 +2,14 @@ * trace event based perf counter profiling * * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra - * + * Copyright (C) 2009-2010 Frederic Weisbecker */ #include #include #include "trace.h" +DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); static char *perf_trace_buf; static char *perf_trace_buf_nmi; diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 505c92273b1a..f7a20a8bfb31 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1240,7 +1240,7 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } /* Kretprobe profile handler */ @@ -1271,7 +1271,8 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags); + ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, + irq_flags, regs); } static int probe_profile_enable(struct ftrace_event_call *call) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index cba47d7935cc..7e6e84fb7b6c 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -467,7 +467,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs); } int prof_sysenter_enable(struct ftrace_event_call *call) @@ -542,7 +542,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags); + ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs); } int prof_sysexit_enable(struct ftrace_event_call *call) -- cgit v1.2.2 From 97d5a22005f38057b4bc0d95f81cd26510268794 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 5 Mar 2010 05:35:37 +0100 Subject: perf: Drop the obsolete profile naming for trace events Drop the obsolete "profile" naming used by perf for trace events. Perf can now do more than simple events counting, so generalize the API naming. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Jason Baron --- kernel/perf_event.c | 4 +- kernel/trace/Makefile | 2 +- kernel/trace/trace_event_perf.c | 165 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_event_profile.c | 165 ------------------------------------- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_kprobe.c | 28 +++---- kernel/trace/trace_syscalls.c | 72 ++++++++-------- 7 files changed, 219 insertions(+), 219 deletions(-) create mode 100644 kernel/trace/trace_event_perf.c delete mode 100644 kernel/trace/trace_event_profile.c (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 45b4b6e55891..c502b18594cc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4347,7 +4347,7 @@ static int perf_tp_event_match(struct perf_event *event, static void tp_perf_event_destroy(struct perf_event *event) { - ftrace_profile_disable(event->attr.config); + perf_trace_disable(event->attr.config); } static const struct pmu *tp_perf_event_init(struct perf_event *event) @@ -4361,7 +4361,7 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (ftrace_profile_enable(event->attr.config)) + if (perf_trace_enable(event->attr.config)) return NULL; event->destroy = tp_perf_event_destroy; diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index d00c6fe23f54..78edc6490038 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -52,7 +52,7 @@ obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o ifeq ($(CONFIG_PERF_EVENTS),y) -obj-$(CONFIG_EVENT_TRACING) += trace_event_profile.o +obj-$(CONFIG_EVENT_TRACING) += trace_event_perf.o endif obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o obj-$(CONFIG_KPROBE_EVENT) += trace_kprobe.o diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c new file mode 100644 index 000000000000..f315b12a41d8 --- /dev/null +++ b/kernel/trace/trace_event_perf.c @@ -0,0 +1,165 @@ +/* + * trace event based perf event profiling/tracing + * + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra + * Copyright (C) 2009-2010 Frederic Weisbecker + */ + +#include +#include +#include "trace.h" + +DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); + +static char *perf_trace_buf; +static char *perf_trace_buf_nmi; + +typedef typeof(char [PERF_MAX_TRACE_SIZE]) perf_trace_t ; + +/* Count the events in use (per event id, not per instance) */ +static int total_ref_count; + +static int perf_trace_event_enable(struct ftrace_event_call *event) +{ + char *buf; + int ret = -ENOMEM; + + if (event->perf_refcount++ > 0) + return 0; + + if (!total_ref_count) { + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail_buf; + + rcu_assign_pointer(perf_trace_buf, buf); + + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail_buf_nmi; + + rcu_assign_pointer(perf_trace_buf_nmi, buf); + } + + ret = event->perf_event_enable(event); + if (!ret) { + total_ref_count++; + return 0; + } + +fail_buf_nmi: + if (!total_ref_count) { + free_percpu(perf_trace_buf_nmi); + free_percpu(perf_trace_buf); + perf_trace_buf_nmi = NULL; + perf_trace_buf = NULL; + } +fail_buf: + event->perf_refcount--; + + return ret; +} + +int perf_trace_enable(int event_id) +{ + struct ftrace_event_call *event; + int ret = -EINVAL; + + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id && event->perf_event_enable && + try_module_get(event->mod)) { + ret = perf_trace_event_enable(event); + break; + } + } + mutex_unlock(&event_mutex); + + return ret; +} + +static void perf_trace_event_disable(struct ftrace_event_call *event) +{ + char *buf, *nmi_buf; + + if (--event->perf_refcount > 0) + return; + + event->perf_event_disable(event); + + if (!--total_ref_count) { + buf = perf_trace_buf; + rcu_assign_pointer(perf_trace_buf, NULL); + + nmi_buf = perf_trace_buf_nmi; + rcu_assign_pointer(perf_trace_buf_nmi, NULL); + + /* + * Ensure every events in profiling have finished before + * releasing the buffers + */ + synchronize_sched(); + + free_percpu(buf); + free_percpu(nmi_buf); + } +} + +void perf_trace_disable(int event_id) +{ + struct ftrace_event_call *event; + + mutex_lock(&event_mutex); + list_for_each_entry(event, &ftrace_events, list) { + if (event->id == event_id) { + perf_trace_event_disable(event); + module_put(event->mod); + break; + } + } + mutex_unlock(&event_mutex); +} + +__kprobes void *perf_trace_buf_prepare(int size, unsigned short type, + int *rctxp, unsigned long *irq_flags) +{ + struct trace_entry *entry; + char *trace_buf, *raw_data; + int pc, cpu; + + pc = preempt_count(); + + /* Protect the per cpu buffer, begin the rcu read side */ + local_irq_save(*irq_flags); + + *rctxp = perf_swevent_get_recursion_context(); + if (*rctxp < 0) + goto err_recursion; + + cpu = smp_processor_id(); + + if (in_nmi()) + trace_buf = rcu_dereference(perf_trace_buf_nmi); + else + trace_buf = rcu_dereference(perf_trace_buf); + + if (!trace_buf) + goto err; + + raw_data = per_cpu_ptr(trace_buf, cpu); + + /* zero the dead bytes from align to not leak stack to user */ + *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; + + entry = (struct trace_entry *)raw_data; + tracing_generic_entry_update(entry, *irq_flags, pc); + entry->type = type; + + return raw_data; +err: + perf_swevent_put_recursion_context(*rctxp); +err_recursion: + local_irq_restore(*irq_flags); + return NULL; +} +EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c deleted file mode 100644 index e66d21e15a0f..000000000000 --- a/kernel/trace/trace_event_profile.c +++ /dev/null @@ -1,165 +0,0 @@ -/* - * trace event based perf counter profiling - * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra - * Copyright (C) 2009-2010 Frederic Weisbecker - */ - -#include -#include -#include "trace.h" - -DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); - -static char *perf_trace_buf; -static char *perf_trace_buf_nmi; - -typedef typeof(char [FTRACE_MAX_PROFILE_SIZE]) perf_trace_t ; - -/* Count the events in use (per event id, not per instance) */ -static int total_profile_count; - -static int ftrace_profile_enable_event(struct ftrace_event_call *event) -{ - char *buf; - int ret = -ENOMEM; - - if (event->profile_count++ > 0) - return 0; - - if (!total_profile_count) { - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf; - - rcu_assign_pointer(perf_trace_buf, buf); - - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf_nmi; - - rcu_assign_pointer(perf_trace_buf_nmi, buf); - } - - ret = event->profile_enable(event); - if (!ret) { - total_profile_count++; - return 0; - } - -fail_buf_nmi: - if (!total_profile_count) { - free_percpu(perf_trace_buf_nmi); - free_percpu(perf_trace_buf); - perf_trace_buf_nmi = NULL; - perf_trace_buf = NULL; - } -fail_buf: - event->profile_count--; - - return ret; -} - -int ftrace_profile_enable(int event_id) -{ - struct ftrace_event_call *event; - int ret = -EINVAL; - - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->profile_enable && - try_module_get(event->mod)) { - ret = ftrace_profile_enable_event(event); - break; - } - } - mutex_unlock(&event_mutex); - - return ret; -} - -static void ftrace_profile_disable_event(struct ftrace_event_call *event) -{ - char *buf, *nmi_buf; - - if (--event->profile_count > 0) - return; - - event->profile_disable(event); - - if (!--total_profile_count) { - buf = perf_trace_buf; - rcu_assign_pointer(perf_trace_buf, NULL); - - nmi_buf = perf_trace_buf_nmi; - rcu_assign_pointer(perf_trace_buf_nmi, NULL); - - /* - * Ensure every events in profiling have finished before - * releasing the buffers - */ - synchronize_sched(); - - free_percpu(buf); - free_percpu(nmi_buf); - } -} - -void ftrace_profile_disable(int event_id) -{ - struct ftrace_event_call *event; - - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - ftrace_profile_disable_event(event); - module_put(event->mod); - break; - } - } - mutex_unlock(&event_mutex); -} - -__kprobes void *ftrace_perf_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) -{ - struct trace_entry *entry; - char *trace_buf, *raw_data; - int pc, cpu; - - pc = preempt_count(); - - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(*irq_flags); - - *rctxp = perf_swevent_get_recursion_context(); - if (*rctxp < 0) - goto err_recursion; - - cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference(perf_trace_buf); - - if (!trace_buf) - goto err; - - raw_data = per_cpu_ptr(trace_buf, cpu); - - /* zero the dead bytes from align to not leak stack to user */ - *(u64 *)(&raw_data[size - sizeof(u64)]) = 0ULL; - - entry = (struct trace_entry *)raw_data; - tracing_generic_entry_update(entry, *irq_flags, pc); - entry->type = type; - - return raw_data; -err: - perf_swevent_put_recursion_context(*rctxp); -err_recursion: - local_irq_restore(*irq_flags); - return NULL; -} -EXPORT_SYMBOL_GPL(ftrace_perf_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3f972ad98d04..beab8bf2f310 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -938,7 +938,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->profile_enable) + if (call->id && call->perf_event_enable) trace_create_file("id", 0444, call->dir, call, id); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index f7a20a8bfb31..1251e367bae9 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1214,7 +1214,7 @@ static int set_print_fmt(struct trace_probe *tp) #ifdef CONFIG_PERF_EVENTS /* Kprobe profile handler */ -static __kprobes void kprobe_profile_func(struct kprobe *kp, +static __kprobes void kprobe_perf_func(struct kprobe *kp, struct pt_regs *regs) { struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); @@ -1227,11 +1227,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, __size = SIZEOF_KPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1240,11 +1240,11 @@ static __kprobes void kprobe_profile_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); } /* Kretprobe profile handler */ -static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, +static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); @@ -1257,11 +1257,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, __size = SIZEOF_KRETPROBE_TRACE_ENTRY(tp->nr_args); size = ALIGN(__size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough")) return; - entry = ftrace_perf_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); if (!entry) return; @@ -1271,11 +1271,11 @@ static __kprobes void kretprobe_profile_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) entry->args[i] = call_fetch(&tp->args[i].fetch, regs); - ftrace_perf_buf_submit(entry, size, rctx, entry->ret_ip, 1, + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, irq_flags, regs); } -static int probe_profile_enable(struct ftrace_event_call *call) +static int probe_perf_enable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1287,7 +1287,7 @@ static int probe_profile_enable(struct ftrace_event_call *call) return enable_kprobe(&tp->rp.kp); } -static void probe_profile_disable(struct ftrace_event_call *call) +static void probe_perf_disable(struct ftrace_event_call *call) { struct trace_probe *tp = (struct trace_probe *)call->data; @@ -1312,7 +1312,7 @@ int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) kprobe_trace_func(kp, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kprobe_profile_func(kp, regs); + kprobe_perf_func(kp, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1326,7 +1326,7 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) kretprobe_trace_func(ri, regs); #ifdef CONFIG_PERF_EVENTS if (tp->flags & TP_FLAG_PROFILE) - kretprobe_profile_func(ri, regs); + kretprobe_perf_func(ri, regs); #endif return 0; /* We don't tweek kernel, so just return 0 */ } @@ -1359,8 +1359,8 @@ static int register_probe_event(struct trace_probe *tp) call->unregfunc = probe_event_disable; #ifdef CONFIG_PERF_EVENTS - call->profile_enable = probe_profile_enable; - call->profile_disable = probe_profile_disable; + call->perf_event_enable = probe_perf_enable; + call->perf_event_disable = probe_perf_disable; #endif call->data = tp; ret = trace_add_event_call(call); diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7e6e84fb7b6c..33c2a5b769dc 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -428,12 +428,12 @@ core_initcall(init_ftrace_syscalls); #ifdef CONFIG_PERF_EVENTS -static DECLARE_BITMAP(enabled_prof_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_prof_exit_syscalls, NR_syscalls); -static int sys_prof_refcount_enter; -static int sys_prof_refcount_exit; +static DECLARE_BITMAP(enabled_perf_enter_syscalls, NR_syscalls); +static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); +static int sys_perf_refcount_enter; +static int sys_perf_refcount_exit; -static void prof_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; @@ -443,7 +443,7 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_enter_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -455,11 +455,11 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) size = ALIGN(size + sizeof(u32), sizeof(u64)); size -= sizeof(u32); - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "profile buffer not large enough")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "perf buffer not large enough")) return; - rec = (struct syscall_trace_enter *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, sys_data->enter_event->id, &rctx, &flags); if (!rec) return; @@ -467,10 +467,10 @@ static void prof_syscall_enter(struct pt_regs *regs, long id) rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysenter_enable(struct ftrace_event_call *call) +int perf_sysenter_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -478,34 +478,34 @@ int prof_sysenter_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_enter) - ret = register_trace_sys_enter(prof_syscall_enter); + if (!sys_perf_refcount_enter) + ret = register_trace_sys_enter(perf_syscall_enter); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); } else { - set_bit(num, enabled_prof_enter_syscalls); - sys_prof_refcount_enter++; + set_bit(num, enabled_perf_enter_syscalls); + sys_perf_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysenter_disable(struct ftrace_event_call *call) +void perf_sysenter_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_enter--; - clear_bit(num, enabled_prof_enter_syscalls); - if (!sys_prof_refcount_enter) - unregister_trace_sys_enter(prof_syscall_enter); + sys_perf_refcount_enter--; + clear_bit(num, enabled_perf_enter_syscalls); + if (!sys_perf_refcount_enter) + unregister_trace_sys_enter(perf_syscall_enter); mutex_unlock(&syscall_trace_lock); } -static void prof_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; @@ -515,7 +515,7 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) int size; syscall_nr = syscall_get_nr(current, regs); - if (!test_bit(syscall_nr, enabled_prof_exit_syscalls)) + if (!test_bit(syscall_nr, enabled_perf_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -530,11 +530,11 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) * Impossible, but be paranoid with the future * How to put this check outside runtime? */ - if (WARN_ONCE(size > FTRACE_MAX_PROFILE_SIZE, - "exit event has grown above profile buffer size")) + if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, + "exit event has grown above perf buffer size")) return; - rec = (struct syscall_trace_exit *)ftrace_perf_buf_prepare(size, + rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, sys_data->exit_event->id, &rctx, &flags); if (!rec) return; @@ -542,10 +542,10 @@ static void prof_syscall_exit(struct pt_regs *regs, long ret) rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - ftrace_perf_buf_submit(rec, size, rctx, 0, 1, flags, regs); + perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); } -int prof_sysexit_enable(struct ftrace_event_call *call) +int perf_sysexit_enable(struct ftrace_event_call *call) { int ret = 0; int num; @@ -553,30 +553,30 @@ int prof_sysexit_enable(struct ftrace_event_call *call) num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - if (!sys_prof_refcount_exit) - ret = register_trace_sys_exit(prof_syscall_exit); + if (!sys_perf_refcount_exit) + ret = register_trace_sys_exit(perf_syscall_exit); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); } else { - set_bit(num, enabled_prof_exit_syscalls); - sys_prof_refcount_exit++; + set_bit(num, enabled_perf_exit_syscalls); + sys_perf_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -void prof_sysexit_disable(struct ftrace_event_call *call) +void perf_sysexit_disable(struct ftrace_event_call *call) { int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; mutex_lock(&syscall_trace_lock); - sys_prof_refcount_exit--; - clear_bit(num, enabled_prof_exit_syscalls); - if (!sys_prof_refcount_exit) - unregister_trace_sys_exit(prof_syscall_exit); + sys_perf_refcount_exit--; + clear_bit(num, enabled_perf_exit_syscalls); + if (!sys_perf_refcount_exit) + unregister_trace_sys_exit(perf_syscall_exit); mutex_unlock(&syscall_trace_lock); } -- cgit v1.2.2 From 0b1adaa031a55e44f5dd942f234bf09d28e8a0d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Mar 2010 19:45:54 +0100 Subject: genirq: Prevent oneshot irq thread race Lars-Peter pointed out that the oneshot threaded interrupt handler code has the following race: CPU0 CPU1 hande_level_irq(irq X) mask_ack_irq(irq X) handle_IRQ_event(irq X) wake_up(thread_handler) thread handler(irq X) runs finalize_oneshot(irq X) does not unmask due to !(desc->status & IRQ_MASKED) return from irq does not unmask due to (desc->status & IRQ_ONESHOT) This leaves the interrupt line masked forever. The reason for this is the inconsistent handling of the IRQ_MASKED flag. Instead of setting it in the mask function the oneshot support sets the flag after waking up the irq thread. The solution for this is to set/clear the IRQ_MASKED status whenever we mask/unmask an interrupt line. That's the easy part, but that cleanup opens another race: CPU0 CPU1 hande_level_irq(irq) mask_ack_irq(irq) handle_IRQ_event(irq) wake_up(thread_handler) thread handler(irq) runs finalize_oneshot_irq(irq) unmask(irq) irq triggers again handle_level_irq(irq) mask_ack_irq(irq) return from irq due to IRQ_INPROGRESS return from irq does not unmask due to (desc->status & IRQ_ONESHOT) This requires that we synchronize finalize_oneshot_irq() with the primary handler. If IRQ_INPROGESS is set we wait until the primary handler on the other CPU has returned before unmasking the interrupt line again. We probably have never seen that problem because it does not happen on UP and on SMP the irqbalancer protects us by pinning the primary handler and the thread to the same CPU. Reported-by: Lars-Peter Clausen Signed-off-by: Thomas Gleixner Cc: stable@kernel.org --- kernel/irq/chip.c | 31 ++++++++++++++++++++++--------- kernel/irq/manage.c | 18 ++++++++++++++++++ 2 files changed, 40 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index d70394f12ee9..71eba24a39a2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -359,6 +359,23 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq) if (desc->chip->ack) desc->chip->ack(irq); } + desc->status |= IRQ_MASKED; +} + +static inline void mask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->mask) { + desc->chip->mask(irq); + desc->status |= IRQ_MASKED; + } +} + +static inline void unmask_irq(struct irq_desc *desc, int irq) +{ + if (desc->chip->unmask) { + desc->chip->unmask(irq); + desc->status &= ~IRQ_MASKED; + } } /* @@ -484,10 +501,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc) raw_spin_lock(&desc->lock); desc->status &= ~IRQ_INPROGRESS; - if (unlikely(desc->status & IRQ_ONESHOT)) - desc->status |= IRQ_MASKED; - else if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask) - desc->chip->unmask(irq); + if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT))) + unmask_irq(desc, irq); out_unlock: raw_spin_unlock(&desc->lock); } @@ -524,8 +539,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) action = desc->action; if (unlikely(!action || (desc->status & IRQ_DISABLED))) { desc->status |= IRQ_PENDING; - if (desc->chip->mask) - desc->chip->mask(irq); + mask_irq(desc, irq); goto out; } @@ -593,7 +607,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) irqreturn_t action_ret; if (unlikely(!action)) { - desc->chip->mask(irq); + mask_irq(desc, irq); goto out_unlock; } @@ -605,8 +619,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc) if (unlikely((desc->status & (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) == (IRQ_PENDING | IRQ_MASKED))) { - desc->chip->unmask(irq); - desc->status &= ~IRQ_MASKED; + unmask_irq(desc, irq); } desc->status &= ~IRQ_PENDING; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index eb6078ca60c7..69a3d7b9414c 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -483,8 +483,26 @@ static int irq_wait_for_interrupt(struct irqaction *action) */ static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc) { +again: chip_bus_lock(irq, desc); raw_spin_lock_irq(&desc->lock); + + /* + * Implausible though it may be we need to protect us against + * the following scenario: + * + * The thread is faster done than the hard interrupt handler + * on the other CPU. If we unmask the irq line then the + * interrupt can come in again and masks the line, leaves due + * to IRQ_INPROGRESS and the irq line is masked forever. + */ + if (unlikely(desc->status & IRQ_INPROGRESS)) { + raw_spin_unlock_irq(&desc->lock); + chip_bus_sync_unlock(irq, desc); + cpu_relax(); + goto again; + } + if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) { desc->status &= ~IRQ_MASKED; desc->chip->unmask(irq); -- cgit v1.2.2 From 220b140b52ab6cc133f674a7ffec8fa792054f25 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 10 Mar 2010 20:45:52 +1100 Subject: perf_event: Fix oops triggered by cpu offline/online Anton Blanchard found that he could reliably make the kernel hit a BUG_ON in the slab allocator by taking a cpu offline and then online while a system-wide perf record session was running. The reason is that when the cpu comes up, we completely reinitialize the ctx field of the struct perf_cpu_context for the cpu. If there is a system-wide perf record session running, then there will be a struct perf_event that has a reference to the context, so its refcount will be 2. (The perf_event has been removed from the context's group_entry and event_entry lists by perf_event_exit_cpu(), but that doesn't remove the perf_event's reference to the context and doesn't decrement the context's refcount.) When the cpu comes up, perf_event_init_cpu() gets called, and it calls __perf_event_init_context() on the cpu's context. That resets the refcount to 1. Then when the perf record session finishes and the perf_event is closed, the refcount gets decremented to 0 and the context gets kfreed after an RCU grace period. Since the context wasn't kmalloced -- it's part of a per-cpu variable -- bad things happen. In fact we don't need to completely reinitialize the context when the cpu comes up. It's sufficient to initialize the context once at boot, but we need to do it for all possible cpus. This moves the context initialization to happen at boot time. With this, we don't trash the refcount and the context never gets kfreed, and we don't hit the BUG_ON. Reported-by: Anton Blanchard Signed-off-by: Paul Mackerras Tested-by: Anton Blanchard Acked-by: Peter Zijlstra Cc: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c502b18594cc..fb3031cf9f17 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5368,12 +5368,22 @@ int perf_event_init_task(struct task_struct *child) return ret; } +static void __init perf_event_init_all_cpus(void) +{ + int cpu; + struct perf_cpu_context *cpuctx; + + for_each_possible_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + } +} + static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; @@ -5439,6 +5449,7 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { void __init perf_event_init(void) { + perf_event_init_all_cpus(); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, -- cgit v1.2.2 From 3f379b03fbfddd20536389a85c6456f8233d1f8d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2010 15:03:25 -0800 Subject: ftrace: Replace read_barrier_depends() with rcu_dereference_raw() Replace the calls to read_barrier_depends() in ftrace_list_func() with rcu_dereference_raw() to improve readability. The reason that we use rcu_dereference_raw() here is that removed entries are never freed, instead they are simply leaked. This is one of a very few cases where use of rcu_dereference_raw() is the long-term right answer. And I don't yet know of any others. ;-) Signed-off-by: Paul E. McKenney Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83783579378f..8c5adc0e5db3 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -88,18 +89,22 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub; static int ftrace_set_func(unsigned long *array, int *idx, char *buffer); #endif +/* + * Traverse the ftrace_list, invoking all entries. The reason that we + * can use rcu_dereference_raw() is that elements removed from this list + * are simply leaked, so there is no need to interact with a grace-period + * mechanism. The rcu_dereference_raw() calls are needed to handle + * concurrent insertions into the ftrace_list. + * + * Silly Alpha and silly pointer-speculation compiler optimizations! + */ static void ftrace_list_func(unsigned long ip, unsigned long parent_ip) { - struct ftrace_ops *op = ftrace_list; - - /* in case someone actually ports this to alpha! */ - read_barrier_depends(); + struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/ while (op != &ftrace_list_end) { - /* silly alpha */ - read_barrier_depends(); op->func(ip, parent_ip); - op = op->next; + op = rcu_dereference_raw(op->next); /*see above*/ }; } @@ -154,8 +159,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops) * the ops->next pointer is valid before another CPU sees * the ops pointer included into the ftrace_list. */ - smp_wmb(); - ftrace_list = ops; + rcu_assign_pointer(ftrace_list, ops); if (ftrace_enabled) { ftrace_func_t func; -- cgit v1.2.2 From 007b09243b099811124f69d492adeebe9e439f96 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 5 Mar 2010 15:03:26 -0800 Subject: rcu: Increase RCU CPU stall timeouts if PROVE_RCU CONFIG_PROVE_RCU imposes additional overhead on the kernel, so increase the RCU CPU stall timeouts in an attempt to allow for this effect. Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <1267830207-9474-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcutree.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/rcutree.h b/kernel/rcutree.h index 1439eb504c22..4a525a30e08e 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -246,12 +246,21 @@ struct rcu_data { #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ -#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ -#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ - /* to take at least one */ - /* scheduling clock irq */ - /* before ratting on them. */ + +#ifdef CONFIG_PROVE_RCU +#define RCU_STALL_DELAY_DELTA (5 * HZ) +#else +#define RCU_STALL_DELAY_DELTA 0 +#endif + +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ + RCU_STALL_DELAY_DELTA) + /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -- cgit v1.2.2 From 83ff56f46a8532488ee364bb93a9cb2a59490d33 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 9 Mar 2010 10:22:19 -0500 Subject: kprobes: Calculate the index correctly when freeing the out-of-line execution slot From : Ananth N Mavinakayanahalli When freeing the instruction slot, the arithmetic to calculate the index of the slot in the page needs to account for the total size of the instruction on the various architectures. Calculate the index correctly when freeing the out-of-line execution slot. Reported-by: Sachin Sant Reported-by: Heiko Carstens Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Masami Hiramatsu LKML-Reference: <4B9667AB.9050507@redhat.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index fa034d29cf73..0ed46f3e51e9 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -259,7 +259,8 @@ static void __kprobes __free_insn_slot(struct kprobe_insn_cache *c, struct kprobe_insn_page *kip; list_for_each_entry(kip, &c->pages, list) { - long idx = ((long)slot - (long)kip->insns) / c->insn_size; + long idx = ((long)slot - (long)kip->insns) / + (c->insn_size * sizeof(kprobe_opcode_t)); if (idx >= 0 && idx < slots_per_page(c)) { WARN_ON(kip->slot_used[idx] != SLOT_USED); if (dirty) { -- cgit v1.2.2 From 639fe4b12f92b54c9c3b38c82cdafaa38cfd3e63 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 11 Mar 2010 15:30:35 +0800 Subject: perf: export perf_trace_regs and perf_arch_fetch_caller_regs Export perf_trace_regs and perf_arch_fetch_caller_regs since module will use these. Signed-off-by: Xiao Guangrong [ use EXPORT_PER_CPU_SYMBOL_GPL() ] Signed-off-by: Peter Zijlstra LKML-Reference: <4B989C1B.2090407@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index f315b12a41d8..0709e4f75114 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -10,6 +10,7 @@ #include "trace.h" DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); +EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); static char *perf_trace_buf; static char *perf_trace_buf_nmi; -- cgit v1.2.2 From 80a05b9ffa7dc13f6693902dd8999a2b61a3a0d7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 12 Mar 2010 17:34:14 +0100 Subject: clockevents: Sanitize min_delta_ns adjustment and prevent overflows The current logic which handles clock events programming failures can increase min_delta_ns unlimited and even can cause overflows. Sanitize it by: - prevent zero increase when min_delta_ns == 1 - limiting min_delta_ns to a jiffie - bail out if the jiffie limit is hit - add retries stats for /proc/timer_list so we can gather data Reported-by: Uwe Kleine-Koenig Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 52 +++++++++++++++++++++++++++++++++++----------- kernel/time/timer_list.c | 3 ++- 2 files changed, 42 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 0a8a213016f0..aada0e52680a 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -22,6 +22,29 @@ #include "tick-internal.h" +/* Limit min_delta to a jiffie */ +#define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) + +static int tick_increase_min_delta(struct clock_event_device *dev) +{ + /* Nothing to do if we already reached the limit */ + if (dev->min_delta_ns >= MIN_DELTA_LIMIT) + return -ETIME; + + if (dev->min_delta_ns < 5000) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns += dev->min_delta_ns >> 1; + + if (dev->min_delta_ns > MIN_DELTA_LIMIT) + dev->min_delta_ns = MIN_DELTA_LIMIT; + + printk(KERN_WARNING "CE: %s increased min_delta_ns to %llu nsec\n", + dev->name ? dev->name : "?", + (unsigned long long) dev->min_delta_ns); + return 0; +} + /** * tick_program_event internal worker function */ @@ -37,23 +60,28 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, if (!ret || !force) return ret; + dev->retries++; /* - * We tried 2 times to program the device with the given - * min_delta_ns. If that's not working then we double it + * We tried 3 times to program the device with the given + * min_delta_ns. If that's not working then we increase it * and emit a warning. */ if (++i > 2) { /* Increase the min. delta and try again */ - if (!dev->min_delta_ns) - dev->min_delta_ns = 5000; - else - dev->min_delta_ns += dev->min_delta_ns >> 1; - - printk(KERN_WARNING - "CE: %s increasing min_delta_ns to %llu nsec\n", - dev->name ? dev->name : "?", - (unsigned long long) dev->min_delta_ns << 1); - + if (tick_increase_min_delta(dev)) { + /* + * Get out of the loop if min_delta_ns + * hit the limit already. That's + * better than staying here forever. + * + * We clear next_event so we have a + * chance that the box survives. + */ + printk(KERN_WARNING + "CE: Reprogramming failure. Giving up\n"); + dev->next_event.tv64 = KTIME_MAX; + return -ETIME; + } i = 0; } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index bdfb8dd1050c..1a4a7dd78777 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -228,6 +228,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) SEQ_printf(m, " event_handler: "); print_name_offset(m, dev->event_handler); SEQ_printf(m, "\n"); + SEQ_printf(m, " retries: %lu\n", dev->retries); } static void timer_list_show_tickdevices(struct seq_file *m) @@ -257,7 +258,7 @@ static int timer_list_show(struct seq_file *m, void *v) u64 now = ktime_to_ns(ktime_get()); int cpu; - SEQ_printf(m, "Timer List Version: v0.5\n"); + SEQ_printf(m, "Timer List Version: v0.6\n"); SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); -- cgit v1.2.2 From 829b6c1ef488856c6a46a2f705f5068062d5f34c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Mar 2010 14:04:30 -0800 Subject: timer stats: Fix del_timer_sync() and try_to_del_timer_sync() These functions forgot to run timer_stats_timer_clear_start_info(). It's unobvious what effect this has and whether it matters much - we won't be printing it out anyway if the timer's detached. Untested, just an Ingo trollpatch. [ Nevertheless correct - tglx ] Signed-off-by: Andrew Morton Cc: Ingo Molnar Cc: johnstul@us.ibm.com Signed-off-by: Thomas Gleixner --- kernel/timer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c61a7949387f..fc965eae0e87 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -880,6 +880,7 @@ int try_to_del_timer_sync(struct timer_list *timer) if (base->running_timer == timer) goto out; + timer_stats_timer_clear_start_info(timer); ret = 0; if (timer_pending(timer)) { detach_timer(timer, 1); -- cgit v1.2.2 From 15365c108ea27598e265f8c13e7051d99ca5b0b9 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 11 Mar 2010 14:04:31 -0800 Subject: posix-cpu-timers: Reset expire cache when no timer is running When a process deletes cpu timer or a timer expires we do not clear the expiration cache sig->cputimer_expires. As a result the fastpath_timer_check() which prevents us to loop over all threads in case no timer is active is not working and we run the slow path needlessly on every tick. Zero sig->cputimer_expires in stop_process_timers(). Signed-off-by: Stanislaw Gruszka Cc: Ingo Molnar Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Hidetoshi Seto Cc: Spencer Candland Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- kernel/posix-cpu-timers.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 438ff4523513..edec25a68e9e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1060,9 +1060,9 @@ static void check_thread_timers(struct task_struct *tsk, } } -static void stop_process_timers(struct task_struct *tsk) +static void stop_process_timers(struct signal_struct *sig) { - struct thread_group_cputimer *cputimer = &tsk->signal->cputimer; + struct thread_group_cputimer *cputimer = &sig->cputimer; unsigned long flags; if (!cputimer->running) @@ -1071,6 +1071,10 @@ static void stop_process_timers(struct task_struct *tsk) spin_lock_irqsave(&cputimer->lock, flags); cputimer->running = 0; spin_unlock_irqrestore(&cputimer->lock, flags); + + sig->cputime_expires.prof_exp = cputime_zero; + sig->cputime_expires.virt_exp = cputime_zero; + sig->cputime_expires.sched_exp = 0; } static u32 onecputick; @@ -1131,7 +1135,7 @@ static void check_process_timers(struct task_struct *tsk, list_empty(&timers[CPUCLOCK_VIRT]) && cputime_eq(sig->it[CPUCLOCK_VIRT].expires, cputime_zero) && list_empty(&timers[CPUCLOCK_SCHED])) { - stop_process_timers(tsk); + stop_process_timers(sig); return; } -- cgit v1.2.2 From baed7fc9b580bd3fb8252ff1d9b36eaf1f86b670 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:18 -0800 Subject: Add generic sys_ipc wrapper Add a generic implementation of the ipc demultiplexer syscall. Except for s390 and sparc64 all implementations of the sys_ipc are nearly identical. There are slight differences in the types of the parameters, where mips and powerpc as the only 64-bit architectures with sys_ipc use unsigned long for the "third" argument as it gets casted to a pointer later, while it traditionally is an "int" like most other paramters. frv goes even further and uses unsigned long for all parameters execept for "ptr" which is a pointer type everywhere. The change from int to unsigned long for "third" and back to "int" for the others on frv should be fine due to the in-register calling conventions for syscalls (we already had a similar issue with the generic sys_ptrace), but I'd prefer to have the arch maintainers looks over this in details. Except for that h8300, m68k and m68knommu lack an impplementation of the semtimedop sub call which this patch adds, and various architectures have gets used - at least on i386 it seems superflous as the compat code on x86-64 and ia64 doesn't even bother to implement it. [akpm@linux-foundation.org: add sys_ipc to sys_ni.c] Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Reviewed-by: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Acked-by: Jesper Nilsson Acked-by: Russell King Acked-by: David Howells Acked-by: Kyle McMartin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 695384f12a7d..70f2ea758ffe 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -126,6 +126,7 @@ cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); cond_syscall(sys_flock); -- cgit v1.2.2 From e28cbf22933d0c0ccaf3c4c27a1a263b41f73859 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:19 -0800 Subject: improve sys_newuname() for compat architectures On an architecture that supports 32-bit compat we need to override the reported machine in uname with the 32-bit value. Instead of doing this separately in every architecture introduce a COMPAT_UTS_MACHINE define in and apply it directly in sys_newuname(). Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9814e43fb23b..e483eb5530e4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -1114,6 +1115,15 @@ out: DECLARE_RWSEM(uts_sem); +#ifdef COMPAT_UTS_MACHINE +#define override_architecture(name) \ + (current->personality == PER_LINUX32 && \ + copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ + sizeof(COMPAT_UTS_MACHINE))) +#else +#define override_architecture(name) 0 +#endif + SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) { int errno = 0; @@ -1122,6 +1132,9 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) if (copy_to_user(name, utsname(), sizeof *name)) errno = -EFAULT; up_read(&uts_sem); + + if (!errno && override_architecture(name)) + errno = -EFAULT; return errno; } -- cgit v1.2.2 From 5cacdb4add1b1e50fe75edc50ebbb7bddd9cf5e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 10 Mar 2010 15:21:21 -0800 Subject: Add generic sys_olduname() Add generic implementations of the old and really old uname system calls. Note that sh only implements sys_olduname but not sys_oldolduname, but I'm not going to bother with another ifdef for that special case. m32r implemented an old uname but never wired it up, so kill it, too. Signed-off-by: Christoph Hellwig Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mundt Cc: Jeff Dike Cc: Hirokazu Takata Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Al Viro Cc: Arnd Bergmann Cc: Heiko Carstens Cc: Martin Schwidefsky Cc: "Luck, Tony" Cc: James Morris Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index e483eb5530e4..8298878f4f71 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1138,6 +1138,60 @@ SYSCALL_DEFINE1(newuname, struct new_utsname __user *, name) return errno; } +#ifdef __ARCH_WANT_SYS_OLD_UNAME +/* + * Old cruft + */ +SYSCALL_DEFINE1(uname, struct old_utsname __user *, name) +{ + int error = 0; + + if (!name) + return -EFAULT; + + down_read(&uts_sem); + if (copy_to_user(name, utsname(), sizeof(*name))) + error = -EFAULT; + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error; +} + +SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name) +{ + int error; + + if (!name) + return -EFAULT; + if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname))) + return -EFAULT; + + down_read(&uts_sem); + error = __copy_to_user(&name->sysname, &utsname()->sysname, + __OLD_UTS_LEN); + error |= __put_user(0, name->sysname + __OLD_UTS_LEN); + error |= __copy_to_user(&name->nodename, &utsname()->nodename, + __OLD_UTS_LEN); + error |= __put_user(0, name->nodename + __OLD_UTS_LEN); + error |= __copy_to_user(&name->release, &utsname()->release, + __OLD_UTS_LEN); + error |= __put_user(0, name->release + __OLD_UTS_LEN); + error |= __copy_to_user(&name->version, &utsname()->version, + __OLD_UTS_LEN); + error |= __put_user(0, name->version + __OLD_UTS_LEN); + error |= __copy_to_user(&name->machine, &utsname()->machine, + __OLD_UTS_LEN); + error |= __put_user(0, name->machine + __OLD_UTS_LEN); + up_read(&uts_sem); + + if (!error && override_architecture(name)) + error = -EFAULT; + return error ? -EFAULT : 0; +} +#endif + SYSCALL_DEFINE2(sethostname, char __user *, name, int, len) { int errno; -- cgit v1.2.2 From 2468c7234b366eeb799ee0648cb58f9cba394a54 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 10 Mar 2010 15:22:03 -0800 Subject: cgroup: introduce cancel_attach() Add cancel_attach() operation to struct cgroup_subsys. cancel_attach() can be used when can_attach() operation prepares something for the subsys, but we should rollback what can_attach() operation has prepared if attach task fails after we've succeeded in can_attach(). Signed-off-by: Daisuke Nishimura Acked-by: Li Zefan Reviewed-by: Paul Menage Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4fd90e129772..be45d2f6008a 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1554,7 +1554,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) { int retval = 0; - struct cgroup_subsys *ss; + struct cgroup_subsys *ss, *failed_ss = NULL; struct cgroup *oldcgrp; struct css_set *cg; struct css_set *newcg; @@ -1568,8 +1568,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) for_each_subsys(root, ss) { if (ss->can_attach) { retval = ss->can_attach(ss, cgrp, tsk, false); - if (retval) - return retval; + if (retval) { + /* + * Remember on which subsystem the can_attach() + * failed, so that we only call cancel_attach() + * against the subsystems whose can_attach() + * succeeded. (See below) + */ + failed_ss = ss; + goto out; + } } } @@ -1583,14 +1591,17 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) */ newcg = find_css_set(cg, cgrp); put_css_set(cg); - if (!newcg) - return -ENOMEM; + if (!newcg) { + retval = -ENOMEM; + goto out; + } task_lock(tsk); if (tsk->flags & PF_EXITING) { task_unlock(tsk); put_css_set(newcg); - return -ESRCH; + retval = -ESRCH; + goto out; } rcu_assign_pointer(tsk->cgroups, newcg); task_unlock(tsk); @@ -1616,7 +1627,22 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * is no longer empty. */ cgroup_wakeup_rmdir_waiter(cgrp); - return 0; +out: + if (retval) { + for_each_subsys(root, ss) { + if (ss == failed_ss) + /* + * This subsystem was the one that failed the + * can_attach() check earlier, so we don't need + * to call cancel_attach() against it or any + * remaining subsystems. + */ + break; + if (ss->cancel_attach) + ss->cancel_attach(ss, cgrp, tsk, false); + } + } + return retval; } /* -- cgit v1.2.2 From d7b9fff711d5e8db8c844161c684017e556c38a0 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Wed, 10 Mar 2010 15:22:05 -0800 Subject: cgroup: introduce coalesce css_get() and css_put() Current css_get() and css_put() increment/decrement css->refcnt one by one. This patch add a new function __css_get(), which takes "count" as a arg and increment the css->refcnt by "count". And this patch also add a new arg("count") to __css_put() and change the function to decrement the css->refcnt by "count". These coalesce version of __css_get()/__css_put() will be used to improve performance of memcg's moving charge feature later, where instead of calling css_get()/css_put() repeatedly, these new functions will be used. No change is needed for current users of css_get()/css_put(). Signed-off-by: Daisuke Nishimura Acked-by: Paul Menage Cc: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Li Zefan Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index be45d2f6008a..cace83ddbcdc 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3746,12 +3746,13 @@ static void check_for_release(struct cgroup *cgrp) } } -void __css_put(struct cgroup_subsys_state *css) +/* Caller must verify that the css is not for root cgroup */ +void __css_put(struct cgroup_subsys_state *css, int count) { struct cgroup *cgrp = css->cgroup; int val; rcu_read_lock(); - val = atomic_dec_return(&css->refcnt); + val = atomic_sub_return(count, &css->refcnt); if (val == 1) { if (notify_on_release(cgrp)) { set_bit(CGRP_RELEASABLE, &cgrp->flags); -- cgit v1.2.2 From aae8aab40367036931608fdaf9e2dc568b516f19 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:07 -0800 Subject: cgroups: revamp subsys array This patch series provides the ability for cgroup subsystems to be compiled as modules both within and outside the kernel tree. This is mainly useful for classifiers and subsystems that hook into components that are already modules. cls_cgroup and blkio-cgroup serve as the example use cases for this feature. It provides an interface cgroup_load_subsys() and cgroup_unload_subsys() which modular subsystems can use to register and depart during runtime. The net_cls classifier subsystem serves as the example for a subsystem which can be converted into a module using these changes. Patch #1 sets up the subsys[] array so its contents can be dynamic as modules appear and (eventually) disappear. Iterations over the array are modified to handle when subsystems are absent, and the dynamic section of the array is protected by cgroup_mutex. Patch #2 implements an interface for modules to load subsystems, called cgroup_load_subsys, similar to cgroup_init_subsys, and adds a module pointer in struct cgroup_subsys. Patch #3 adds a mechanism for unloading modular subsystems, which includes a more advanced rework of the rudimentary reference counting introduced in patch 2. Patch #4 modifies the net_cls subsystem, which already had some module declarations, to be configurable as a module, which also serves as a simple proof-of-concept. Part of implementing patches 2 and 4 involved updating css pointers in each css_set when the module appears or leaves. In doing this, it was discovered that css_sets always remain linked to the dummy cgroup, regardless of whether or not any subsystems are actually bound to it (i.e., not mounted on an actual hierarchy). The subsystem loading and unloading code therefore should keep in mind the special cases where the added subsystem is the only one in the dummy cgroup (and therefore all css_sets need to be linked back into it) and where the removed subsys was the only one in the dummy cgroup (and therefore all css_sets should be unlinked from it) - however, as all css_sets always stay attached to the dummy cgroup anyway, these cases are ignored. Any fix that addresses this issue should also make sure these cases are addressed in the subsystem loading and unloading code. This patch: Make subsys[] able to be dynamically populated to support modular subsystems This patch reworks the way the subsys[] array is used so that subsystems can register themselves after boot time, and enables the internals of cgroups to be able to handle when subsystems are not present or may appear/disappear. Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 80 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index cace83ddbcdc..c92fb9549358 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,10 +57,14 @@ static DEFINE_MUTEX(cgroup_mutex); -/* Generate an array of cgroup subsystem pointers */ +/* + * Generate an array of cgroup subsystem pointers. At boot time, this is + * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are + * registered after that. The mutable section of this array is protected by + * cgroup_mutex. + */ #define SUBSYS(_x) &_x ## _subsys, - -static struct cgroup_subsys *subsys[] = { +static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { #include }; @@ -448,8 +452,11 @@ static struct css_set *find_existing_css_set( struct hlist_node *node; struct css_set *cg; - /* Built the set of subsystem state objects that we want to - * see in the new css_set */ + /* + * Build the set of subsystem state objects that we want to see in the + * new css_set. while subsystems can change globally, the entries here + * won't change, so no need for locking. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { if (root->subsys_bits & (1UL << i)) { /* Subsystem is in this hierarchy. So we want @@ -884,7 +891,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) css_put(css); } - +/* + * Call with cgroup_mutex held. + */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -892,6 +901,8 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup *cgrp = &root->top_cgroup; int i; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + removed_bits = root->actual_subsys_bits & ~final_bits; added_bits = final_bits & ~root->actual_subsys_bits; /* Check that any added subsystems are currently free */ @@ -900,6 +911,12 @@ static int rebind_subsystems(struct cgroupfs_root *root, struct cgroup_subsys *ss = subsys[i]; if (!(bit & added_bits)) continue; + /* + * Nobody should tell us to do a subsys that doesn't exist: + * parse_cgroupfs_options should catch that case and refcounts + * ensure that subsystems won't disappear once selected. + */ + BUG_ON(ss == NULL); if (ss->root != &rootnode) { /* Subsystem isn't free */ return -EBUSY; @@ -919,6 +936,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long bit = 1UL << i; if (bit & added_bits) { /* We're binding this subsystem to this hierarchy */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i]); BUG_ON(!dummytop->subsys[i]); BUG_ON(dummytop->subsys[i]->cgroup != dummytop); @@ -932,6 +950,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, mutex_unlock(&ss->hierarchy_mutex); } else if (bit & removed_bits) { /* We're removing this subsystem */ + BUG_ON(ss == NULL); BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); BUG_ON(cgrp->subsys[i]->cgroup != cgrp); mutex_lock(&ss->hierarchy_mutex); @@ -944,6 +963,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, mutex_unlock(&ss->hierarchy_mutex); } else if (bit & final_bits) { /* Subsystem state should already exist */ + BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); } else { /* Subsystem state shouldn't exist */ @@ -986,14 +1006,18 @@ struct cgroup_sb_opts { }; -/* Convert a hierarchy specifier into a bitmask of subsystems and - * flags. */ +/* + * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call + * with cgroup_mutex held to protect the subsys[] array. + */ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + BUG_ON(!mutex_is_locked(&cgroup_mutex)); + #ifdef CONFIG_CPUSETS mask = ~(1UL << cpuset_subsys_id); #endif @@ -1009,6 +1033,8 @@ static int parse_cgroupfs_options(char *data, opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (!ss->disabled) opts->subsys_bits |= 1ul << i; } @@ -1053,6 +1079,8 @@ static int parse_cgroupfs_options(char *data, int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; + if (ss == NULL) + continue; if (!strcmp(token, ss->name)) { if (!ss->disabled) set_bit(i, &opts->subsys_bits); @@ -1306,7 +1334,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type, struct cgroupfs_root *new_root; /* First find the desired set of subsystems */ + mutex_lock(&cgroup_mutex); ret = parse_cgroupfs_options(data, &opts); + mutex_unlock(&cgroup_mutex); if (ret) goto out_err; @@ -2918,8 +2948,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root) /* We need to take each hierarchy_mutex in a consistent order */ int i; + /* + * No worry about a race with rebind_subsystems that might mess up the + * locking order, since both parties are under cgroup_mutex. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_lock(&ss->hierarchy_mutex); } @@ -2931,6 +2967,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root) for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; if (ss->root == root) mutex_unlock(&ss->hierarchy_mutex); } @@ -3054,11 +3092,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp) * synchronization other than RCU, and the subsystem linked * list isn't RCU-safe */ int i; + /* + * We won't need to lock the subsys array, because the subsystems + * we're concerned about aren't going anywhere since our cgroup root + * has a reference on them. + */ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; struct cgroup_subsys_state *css; - /* Skip subsystems not in this hierarchy */ - if (ss->root != cgrp->root) + /* Skip subsystems not present or not in this hierarchy */ + if (ss == NULL || ss->root != cgrp->root) continue; css = cgrp->subsys[ss->subsys_id]; /* When called from check_for_release() it's possible @@ -3279,7 +3322,8 @@ int __init cgroup_init_early(void) for (i = 0; i < CSS_SET_TABLE_SIZE; i++) INIT_HLIST_HEAD(&css_set_table[i]); - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; BUG_ON(!ss->name); @@ -3314,7 +3358,8 @@ int __init cgroup_init(void) if (err) return err; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* at bootup time, we don't worry about modular subsystems */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!ss->early_init) cgroup_init_subsys(ss); @@ -3423,9 +3468,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v) int i; seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); + /* + * ideally we don't want subsystems moving around while we do this. + * cgroup_mutex is also necessary to guarantee an atomic snapshot of + * subsys/hierarchy state. + */ mutex_lock(&cgroup_mutex); for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; + if (ss == NULL) + continue; seq_printf(m, "%s\t%d\t%d\t%d\n", ss->name, ss->root->hierarchy_id, ss->root->number_of_cgroups, !ss->disabled); @@ -3483,7 +3535,12 @@ void cgroup_fork_callbacks(struct task_struct *child) { if (need_forkexit_callback) { int i; - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * forkexit callbacks are only supported for builtin + * subsystems, and the builtin section of the subsys array is + * immutable, so we don't need to lock the subsys array here. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->fork) ss->fork(ss, child); @@ -3552,7 +3609,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) struct css_set *cg; if (run_callbacks && need_forkexit_callback) { - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * modular subsystems can't use callbacks, so no need to lock + * the subsys array + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (ss->exit) ss->exit(ss, tsk); @@ -3844,8 +3905,11 @@ static int __init cgroup_disable(char *str) while ((token = strsep(&str, ",")) != NULL) { if (!*token) continue; - - for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { + /* + * cgroup_disable, being at boot time, can't know about module + * subsystems, so we don't worry about them. + */ + for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; if (!strcmp(token, ss->name)) { -- cgit v1.2.2 From e6a1105ba08b265023dd71a4174fb4a29ebc7083 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:09 -0800 Subject: cgroups: subsystem module loading interface Add interface between cgroups subsystem management and module loading This patch implements rudimentary module-loading support for cgroups - namely, a cgroup_load_subsys (similar to cgroup_init_subsys) for use as a module initcall, and a struct module pointer in struct cgroup_subsys. Several functions that might be wanted by modules have had EXPORT_SYMBOL added to them, but it's unclear exactly which functions want it and which won't. Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 145 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c92fb9549358..2cae38e64c59 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include #include @@ -254,7 +255,8 @@ struct cg_cgroup_link { static struct css_set init_css_set; static struct cg_cgroup_link init_css_set_link; -static int cgroup_subsys_init_idr(struct cgroup_subsys *ss); +static int cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *css); /* css_set_lock protects the list of css_set objects, and the * chain of tasks off each css_set. Nests outside task->alloc_lock @@ -2125,6 +2127,7 @@ int cgroup_add_file(struct cgroup *cgrp, error = PTR_ERR(dentry); return error; } +EXPORT_SYMBOL_GPL(cgroup_add_file); int cgroup_add_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, @@ -2139,6 +2142,7 @@ int cgroup_add_files(struct cgroup *cgrp, } return 0; } +EXPORT_SYMBOL_GPL(cgroup_add_files); /** * cgroup_task_count - count the number of tasks in a cgroup. @@ -3292,7 +3296,144 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss) mutex_init(&ss->hierarchy_mutex); lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; + + /* this function shouldn't be used with modular subsystems, since they + * need to register a subsys_id, among other things */ + BUG_ON(ss->module); +} + +/** + * cgroup_load_subsys: load and register a modular subsystem at runtime + * @ss: the subsystem to load + * + * This function should be called in a modular subsystem's initcall. If the + * subsytem is built as a module, it will be assigned a new subsys_id and set + * up for use. If the subsystem is built-in anyway, work is delegated to the + * simpler cgroup_init_subsys. + */ +int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) +{ + int i; + struct cgroup_subsys_state *css; + + /* check name and function validity */ + if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || + ss->create == NULL || ss->destroy == NULL) + return -EINVAL; + + /* + * we don't support callbacks in modular subsystems. this check is + * before the ss->module check for consistency; a subsystem that could + * be a module should still have no callbacks even if the user isn't + * compiling it as one. + */ + if (ss->fork || ss->exit) + return -EINVAL; + + /* + * an optionally modular subsystem is built-in: we want to do nothing, + * since cgroup_init_subsys will have already taken care of it. + */ + if (ss->module == NULL) { + /* a few sanity checks */ + BUG_ON(ss->subsys_id >= CGROUP_BUILTIN_SUBSYS_COUNT); + BUG_ON(subsys[ss->subsys_id] != ss); + return 0; + } + + /* + * need to register a subsys id before anything else - for example, + * init_cgroup_css needs it. + */ + mutex_lock(&cgroup_mutex); + /* find the first empty slot in the array */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + if (subsys[i] == NULL) + break; + } + if (i == CGROUP_SUBSYS_COUNT) { + /* maximum number of subsystems already registered! */ + mutex_unlock(&cgroup_mutex); + return -EBUSY; + } + /* assign ourselves the subsys_id */ + ss->subsys_id = i; + subsys[i] = ss; + + /* + * no ss->create seems to need anything important in the ss struct, so + * this can happen first (i.e. before the rootnode attachment). + */ + css = ss->create(ss, dummytop); + if (IS_ERR(css)) { + /* failure case - need to deassign the subsys[] slot. */ + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return PTR_ERR(css); + } + + list_add(&ss->sibling, &rootnode.subsys_list); + ss->root = &rootnode; + + /* our new subsystem will be attached to the dummy hierarchy. */ + init_cgroup_css(css, ss, dummytop); + /* init_idr must be after init_cgroup_css because it sets css->id. */ + if (ss->use_id) { + int ret = cgroup_init_idr(ss, css); + if (ret) { + dummytop->subsys[ss->subsys_id] = NULL; + ss->destroy(ss, dummytop); + subsys[i] = NULL; + mutex_unlock(&cgroup_mutex); + return ret; + } + } + + /* + * Now we need to entangle the css into the existing css_sets. unlike + * in cgroup_init_subsys, there are now multiple css_sets, so each one + * will need a new pointer to it; done by iterating the css_set_table. + * furthermore, modifying the existing css_sets will corrupt the hash + * table state, so each changed css_set will need its hash recomputed. + * this is all done under the css_set_lock. + */ + write_lock(&css_set_lock); + for (i = 0; i < CSS_SET_TABLE_SIZE; i++) { + struct css_set *cg; + struct hlist_node *node, *tmp; + struct hlist_head *bucket = &css_set_table[i], *new_bucket; + + hlist_for_each_entry_safe(cg, node, tmp, bucket, hlist) { + /* skip entries that we already rehashed */ + if (cg->subsys[ss->subsys_id]) + continue; + /* remove existing entry */ + hlist_del(&cg->hlist); + /* set new value */ + cg->subsys[ss->subsys_id] = css; + /* recompute hash and restore entry */ + new_bucket = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, new_bucket); + } + } + write_unlock(&css_set_lock); + + mutex_init(&ss->hierarchy_mutex); + lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); + ss->active = 1; + + /* + * pin the subsystem's module so it doesn't go away. this shouldn't + * fail, since the module's initcall calls us. + * TODO: with module unloading, move this elsewhere + */ + BUG_ON(!try_module_get(ss->module)); + + /* success! */ + mutex_unlock(&cgroup_mutex); + return 0; } +EXPORT_SYMBOL_GPL(cgroup_load_subsys); /** * cgroup_init_early - cgroup initialization at system boot @@ -3364,7 +3505,7 @@ int __init cgroup_init(void) if (!ss->early_init) cgroup_init_subsys(ss); if (ss->use_id) - cgroup_subsys_init_idr(ss); + cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); } /* Add init_css_set to the hash table */ @@ -4033,15 +4174,14 @@ err_out: } -static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss) +static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, + struct cgroup_subsys_state *rootcss) { struct css_id *newid; - struct cgroup_subsys_state *rootcss; spin_lock_init(&ss->id_lock); idr_init(&ss->idr); - rootcss = init_css_set.subsys[ss->subsys_id]; newid = get_new_cssid(ss, 0); if (IS_ERR(newid)) return PTR_ERR(newid); -- cgit v1.2.2 From cf5d5941fda647fe3d2f2d00cf9e0245236a5f08 Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:09 -0800 Subject: cgroups: subsystem module unloading Provides support for unloading modular subsystems. This patch adds a new function cgroup_unload_subsys which is to be used for removing a loaded subsystem during module deletion. Reference counting of the subsystems' modules is moved from once (at load time) to once per attached hierarchy (in parse_cgroupfs_options and rebind_subsystems) (i.e., 0 or 1). Signed-off-by: Ben Blum Acked-by: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 167 +++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 142 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 2cae38e64c59..aa889c96cc74 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -894,7 +894,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) } /* - * Call with cgroup_mutex held. + * Call with cgroup_mutex held. Drops reference counts on modules, including + * any duplicate ones that parse_cgroupfs_options took. If this function + * returns an error, no reference counts are touched. */ static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) @@ -950,6 +952,7 @@ static int rebind_subsystems(struct cgroupfs_root *root, if (ss->bind) ss->bind(ss, cgrp); mutex_unlock(&ss->hierarchy_mutex); + /* refcount was already taken, and we're keeping it */ } else if (bit & removed_bits) { /* We're removing this subsystem */ BUG_ON(ss == NULL); @@ -963,10 +966,20 @@ static int rebind_subsystems(struct cgroupfs_root *root, subsys[i]->root = &rootnode; list_move(&ss->sibling, &rootnode.subsys_list); mutex_unlock(&ss->hierarchy_mutex); + /* subsystem is now free - drop reference on module */ + module_put(ss->module); } else if (bit & final_bits) { /* Subsystem state should already exist */ BUG_ON(ss == NULL); BUG_ON(!cgrp->subsys[i]); + /* + * a refcount was taken, but we already had one, so + * drop the extra reference. + */ + module_put(ss->module); +#ifdef CONFIG_MODULE_UNLOAD + BUG_ON(ss->module && !module_refcount(ss->module)); +#endif } else { /* Subsystem state shouldn't exist */ BUG_ON(cgrp->subsys[i]); @@ -1010,13 +1023,16 @@ struct cgroup_sb_opts { /* * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call - * with cgroup_mutex held to protect the subsys[] array. + * with cgroup_mutex held to protect the subsys[] array. This function takes + * refcounts on subsystems to be used, unless it returns error, in which case + * no refcounts are taken. */ -static int parse_cgroupfs_options(char *data, - struct cgroup_sb_opts *opts) +static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data ?: "all"; unsigned long mask = (unsigned long)-1; + int i; + bool module_pin_failed = false; BUG_ON(!mutex_is_locked(&cgroup_mutex)); @@ -1031,7 +1047,6 @@ static int parse_cgroupfs_options(char *data, return -EINVAL; if (!strcmp(token, "all")) { /* Add all non-disabled subsystems */ - int i; opts->subsys_bits = 0; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { struct cgroup_subsys *ss = subsys[i]; @@ -1054,7 +1069,6 @@ static int parse_cgroupfs_options(char *data, if (!opts->release_agent) return -ENOMEM; } else if (!strncmp(token, "name=", 5)) { - int i; const char *name = token + 5; /* Can't specify an empty name */ if (!strlen(name)) @@ -1078,7 +1092,6 @@ static int parse_cgroupfs_options(char *data, return -ENOMEM; } else { struct cgroup_subsys *ss; - int i; for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { ss = subsys[i]; if (ss == NULL) @@ -1117,9 +1130,54 @@ static int parse_cgroupfs_options(char *data, if (!opts->subsys_bits && !opts->name) return -EINVAL; + /* + * Grab references on all the modules we'll need, so the subsystems + * don't dance around before rebind_subsystems attaches them. This may + * take duplicate reference counts on a subsystem that's already used, + * but rebind_subsystems handles this case. + */ + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + if (!try_module_get(subsys[i]->module)) { + module_pin_failed = true; + break; + } + } + if (module_pin_failed) { + /* + * oops, one of the modules was going away. this means that we + * raced with a module_delete call, and to the user this is + * essentially a "subsystem doesn't exist" case. + */ + for (i--; i >= CGROUP_BUILTIN_SUBSYS_COUNT; i--) { + /* drop refcounts only on the ones we took */ + unsigned long bit = 1UL << i; + + if (!(bit & opts->subsys_bits)) + continue; + module_put(subsys[i]->module); + } + return -ENOENT; + } + return 0; } +static void drop_parsed_module_refcounts(unsigned long subsys_bits) +{ + int i; + for (i = CGROUP_BUILTIN_SUBSYS_COUNT; i < CGROUP_SUBSYS_COUNT; i++) { + unsigned long bit = 1UL << i; + + if (!(bit & subsys_bits)) + continue; + module_put(subsys[i]->module); + } +} + static int cgroup_remount(struct super_block *sb, int *flags, char *data) { int ret = 0; @@ -1136,21 +1194,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data) if (ret) goto out_unlock; - /* Don't allow flags to change at remount */ - if (opts.flags != root->flags) { - ret = -EINVAL; - goto out_unlock; - } - - /* Don't allow name to change at remount */ - if (opts.name && strcmp(opts.name, root->name)) { + /* Don't allow flags or name to change at remount */ + if (opts.flags != root->flags || + (opts.name && strcmp(opts.name, root->name))) { ret = -EINVAL; + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; } ret = rebind_subsystems(root, opts.subsys_bits); - if (ret) + if (ret) { + drop_parsed_module_refcounts(opts.subsys_bits); goto out_unlock; + } /* (re)populate subsystem files */ cgroup_populate_dir(cgrp); @@ -1349,7 +1405,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, new_root = cgroup_root_from_opts(&opts); if (IS_ERR(new_root)) { ret = PTR_ERR(new_root); - goto out_err; + goto drop_modules; } opts.new_root = new_root; @@ -1358,7 +1414,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type, if (IS_ERR(sb)) { ret = PTR_ERR(sb); cgroup_drop_root(opts.new_root); - goto out_err; + goto drop_modules; } root = sb->s_fs_info; @@ -1414,6 +1470,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type, free_cg_links(&tmp_cg_links); goto drop_new_super; } + /* + * There must be no failure case after here, since rebinding + * takes care of subsystems' refcounts, which are explicitly + * dropped in the failure exit path. + */ /* EBUSY should be the only error here */ BUG_ON(ret); @@ -1452,6 +1513,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, * any) is not needed */ cgroup_drop_root(opts.new_root); + /* no subsys rebinding, so refcounts don't change */ + drop_parsed_module_refcounts(opts.subsys_bits); } simple_set_mnt(mnt, sb); @@ -1461,6 +1524,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type, drop_new_super: deactivate_locked_super(sb); + drop_modules: + drop_parsed_module_refcounts(opts.subsys_bits); out_err: kfree(opts.release_agent); kfree(opts.name); @@ -3422,19 +3487,71 @@ int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key); ss->active = 1; - /* - * pin the subsystem's module so it doesn't go away. this shouldn't - * fail, since the module's initcall calls us. - * TODO: with module unloading, move this elsewhere - */ - BUG_ON(!try_module_get(ss->module)); - /* success! */ mutex_unlock(&cgroup_mutex); return 0; } EXPORT_SYMBOL_GPL(cgroup_load_subsys); +/** + * cgroup_unload_subsys: unload a modular subsystem + * @ss: the subsystem to unload + * + * This function should be called in a modular subsystem's exitcall. When this + * function is invoked, the refcount on the subsystem's module will be 0, so + * the subsystem will not be attached to any hierarchy. + */ +void cgroup_unload_subsys(struct cgroup_subsys *ss) +{ + struct cg_cgroup_link *link; + struct hlist_head *hhead; + + BUG_ON(ss->module == NULL); + + /* + * we shouldn't be called if the subsystem is in use, and the use of + * try_module_get in parse_cgroupfs_options should ensure that it + * doesn't start being used while we're killing it off. + */ + BUG_ON(ss->root != &rootnode); + + mutex_lock(&cgroup_mutex); + /* deassign the subsys_id */ + BUG_ON(ss->subsys_id < CGROUP_BUILTIN_SUBSYS_COUNT); + subsys[ss->subsys_id] = NULL; + + /* remove subsystem from rootnode's list of subsystems */ + list_del(&ss->sibling); + + /* + * disentangle the css from all css_sets attached to the dummytop. as + * in loading, we need to pay our respects to the hashtable gods. + */ + write_lock(&css_set_lock); + list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + + hlist_del(&cg->hlist); + BUG_ON(!cg->subsys[ss->subsys_id]); + cg->subsys[ss->subsys_id] = NULL; + hhead = css_set_hash(cg->subsys); + hlist_add_head(&cg->hlist, hhead); + } + write_unlock(&css_set_lock); + + /* + * remove subsystem's css from the dummytop and free it - need to free + * before marking as null because ss->destroy needs the cgrp->subsys + * pointer to find their state. note that this also takes care of + * freeing the css_id. + */ + ss->destroy(ss, dummytop); + dummytop->subsys[ss->subsys_id] = NULL; + + mutex_unlock(&cgroup_mutex); +} +EXPORT_SYMBOL_GPL(cgroup_unload_subsys); + /** * cgroup_init_early - cgroup initialization at system boot * -- cgit v1.2.2 From 67523c48aa74d5637848edeccf285af1c60bf14a Mon Sep 17 00:00:00 2001 From: Ben Blum Date: Wed, 10 Mar 2010 15:22:11 -0800 Subject: cgroups: blkio subsystem as module Modify the Block I/O cgroup subsystem to be able to be built as a module. As the CFQ disk scheduler optionally depends on blk-cgroup, config options in block/Kconfig, block/Kconfig.iosched, and block/blk-cgroup.h are enhanced to support the new module dependency. Signed-off-by: Ben Blum Cc: Li Zefan Cc: Paul Menage Cc: "David S. Miller" Cc: KAMEZAWA Hiroyuki Cc: Lai Jiangshan Cc: Vivek Goyal Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index aa889c96cc74..521591dbab2f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -705,6 +705,7 @@ void cgroup_lock(void) { mutex_lock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_lock); /** * cgroup_unlock - release lock on cgroup changes @@ -715,6 +716,7 @@ void cgroup_unlock(void) { mutex_unlock(&cgroup_mutex); } +EXPORT_SYMBOL_GPL(cgroup_unlock); /* * A couple of forward declarations required, due to cyclic reference loop: @@ -1639,6 +1641,7 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) memmove(buf, start, buf + buflen - start); return 0; } +EXPORT_SYMBOL_GPL(cgroup_path); /** * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' @@ -1805,6 +1808,7 @@ bool cgroup_lock_live_group(struct cgroup *cgrp) } return true; } +EXPORT_SYMBOL_GPL(cgroup_lock_live_group); static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, const char *buffer) @@ -4082,6 +4086,7 @@ void __css_put(struct cgroup_subsys_state *css, int count) rcu_read_unlock(); WARN_ON_ONCE(val < 1); } +EXPORT_SYMBOL_GPL(__css_put); /* * Notify userspace when a cgroup is released, by running the @@ -4197,6 +4202,7 @@ unsigned short css_id(struct cgroup_subsys_state *css) return cssid->id; return 0; } +EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { @@ -4206,6 +4212,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css) return cssid->depth; return 0; } +EXPORT_SYMBOL_GPL(css_depth); bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) @@ -4242,6 +4249,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) spin_unlock(&ss->id_lock); call_rcu(&id->rcu_head, __free_css_id_cb); } +EXPORT_SYMBOL_GPL(free_css_id); /* * This is called by init or create(). Then, calls to this function are @@ -4358,6 +4366,7 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) return rcu_dereference(cssid->css); } +EXPORT_SYMBOL_GPL(css_lookup); /** * css_get_next - lookup next cgroup under specified hierarchy. -- cgit v1.2.2 From b70cc5fdb445a6929a01e9c406593265b136c99d Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 10 Mar 2010 15:22:12 -0800 Subject: cgroups: clean up cgroup_pidlist_find() a bit Don't call get_pid_ns() before we locate/alloc the ns. Signed-off-by: Li Zefan Cc: Serge Hallyn Acked-by: Paul Menage Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 521591dbab2f..1bf4d6db54ab 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2597,7 +2597,8 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, { struct cgroup_pidlist *l; /* don't need task_nsproxy() if we're looking at ourself */ - struct pid_namespace *ns = get_pid_ns(current->nsproxy->pid_ns); + struct pid_namespace *ns = current->nsproxy->pid_ns; + /* * We can't drop the pidlist_mutex before taking the l->mutex in case * the last ref-holder is trying to remove l from the list at the same @@ -2607,8 +2608,6 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, mutex_lock(&cgrp->pidlist_mutex); list_for_each_entry(l, &cgrp->pidlists, links) { if (l->key.type == type && l->key.ns == ns) { - /* found a matching list - drop the extra refcount */ - put_pid_ns(ns); /* make sure l doesn't vanish out from under us */ down_write(&l->mutex); mutex_unlock(&cgrp->pidlist_mutex); @@ -2619,13 +2618,12 @@ static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); if (!l) { mutex_unlock(&cgrp->pidlist_mutex); - put_pid_ns(ns); return l; } init_rwsem(&l->mutex); down_write(&l->mutex); l->key.type = type; - l->key.ns = ns; + l->key.ns = get_pid_ns(ns); l->use_count = 0; /* don't increment here */ l->list = NULL; l->owner = cgrp; -- cgit v1.2.2 From 0dea116876eefc9c7ca9c5d74fe665481e499fa3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:20 -0800 Subject: cgroup: implement eventfd-based generic API for notifications This patchset introduces eventfd-based API for notifications in cgroups and implements memory notifications on top of it. It uses statistics in memory controler to track memory usage. Output of time(1) on building kernel on tmpfs: Root cgroup before changes: make -j2 506.37 user 60.93s system 193% cpu 4:52.77 total Non-root cgroup before changes: make -j2 507.14 user 62.66s system 193% cpu 4:54.74 total Root cgroup after changes (0 thresholds): make -j2 507.13 user 62.20s system 193% cpu 4:53.55 total Non-root cgroup after changes (0 thresholds): make -j2 507.70 user 64.20s system 193% cpu 4:55.70 total Root cgroup after changes (1 thresholds, never crossed): make -j2 506.97 user 62.20s system 193% cpu 4:53.90 total Non-root cgroup after changes (1 thresholds, never crossed): make -j2 507.55 user 64.08s system 193% cpu 4:55.63 total This patch: Introduce the write-only file "cgroup.event_control" in every cgroup. To register new notification handler you need: - create an eventfd; - open a control file to be monitored. Callbacks register_event() and unregister_event() must be defined for the control file; - write " " to cgroup.event_control. Interpretation of args is defined by control file implementation; eventfd will be woken up by control file implementation or when the cgroup is removed. To unregister notification handler just close eventfd. If you need notification functionality for a control file you have to implement callbacks register_event() and unregister_event() in the struct cftype. [kamezawa.hiroyu@jp.fujitsu.com: Kconfig fix] Signed-off-by: Kirill A. Shutemov Reviewed-by: KAMEZAWA Hiroyuki Paul Menage Cc: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Vladislav Buzov Cc: Daisuke Nishimura Cc: Alexander Shishkin Cc: Davide Libenzi Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 227 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1bf4d6db54ab..ea94984a3895 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4,6 +4,10 @@ * Based originally on the cpuset system, extracted by Paul Menage * Copyright (C) 2006 Google, Inc * + * Notifications support + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * Copyright notices from the original cpuset code: * -------------------------------------------------- * Copyright (C) 2003 BULL SA. @@ -53,6 +57,8 @@ #include #include #include /* TODO: replace with more sophisticated array */ +#include +#include #include @@ -152,6 +158,35 @@ struct css_id { unsigned short stack[0]; /* Array of Length (depth+1) */ }; +/* + * cgroup_event represents events which userspace want to recieve. + */ +struct cgroup_event { + /* + * Cgroup which the event belongs to. + */ + struct cgroup *cgrp; + /* + * Control file which the event associated. + */ + struct cftype *cft; + /* + * eventfd to signal userspace about the event. + */ + struct eventfd_ctx *eventfd; + /* + * Each of these stored in a list by the cgroup. + */ + struct list_head list; + /* + * All fields below needed to unregister event when + * userspace closes eventfd. + */ + poll_table pt; + wait_queue_head_t *wqh; + wait_queue_t wait; + struct work_struct remove; +}; /* The list of hierarchy roots */ @@ -760,14 +795,28 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; + struct cgroup_event *event, *tmp; int ret = 0; for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { ret = ss->pre_destroy(ss, cgrp); if (ret) - break; + goto out; } + + /* + * Unregister events and notify userspace. + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + +out: return ret; } @@ -1239,6 +1288,8 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->release_list); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); + INIT_LIST_HEAD(&cgrp->event_list); + spin_lock_init(&cgrp->event_list_lock); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2077,6 +2128,16 @@ static const struct inode_operations cgroup_dir_inode_operations = { .rename = cgroup_rename, }; +/* + * Check if a file is a control file + */ +static inline struct cftype *__file_cft(struct file *file) +{ + if (file->f_dentry->d_inode->i_fop != &cgroup_file_operations) + return ERR_PTR(-EINVAL); + return __d_cft(file->f_dentry); +} + static int cgroup_create_file(struct dentry *dentry, mode_t mode, struct super_block *sb) { @@ -2930,6 +2991,166 @@ static int cgroup_write_notify_on_release(struct cgroup *cgrp, return 0; } +/* + * Unregister event and free resources. + * + * Gets called from workqueue. + */ +static void cgroup_event_remove(struct work_struct *work) +{ + struct cgroup_event *event = container_of(work, struct cgroup_event, + remove); + struct cgroup *cgrp = event->cgrp; + + /* TODO: check return code */ + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + + eventfd_ctx_put(event->eventfd); + remove_wait_queue(event->wqh, &event->wait); + kfree(event); +} + +/* + * Gets called on POLLHUP on eventfd when user closes it. + * + * Called with wqh->lock held and interrupts disabled. + */ +static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, + int sync, void *key) +{ + struct cgroup_event *event = container_of(wait, + struct cgroup_event, wait); + struct cgroup *cgrp = event->cgrp; + unsigned long flags = (unsigned long)key; + + if (flags & POLLHUP) { + spin_lock(&cgrp->event_list_lock); + list_del(&event->list); + spin_unlock(&cgrp->event_list_lock); + /* + * We are in atomic context, but cgroup_event_remove() may + * sleep, so we have to call it in workqueue. + */ + schedule_work(&event->remove); + } + + return 0; +} + +static void cgroup_event_ptable_queue_proc(struct file *file, + wait_queue_head_t *wqh, poll_table *pt) +{ + struct cgroup_event *event = container_of(pt, + struct cgroup_event, pt); + + event->wqh = wqh; + add_wait_queue(wqh, &event->wait); +} + +/* + * Parse input and register new cgroup event handler. + * + * Input must be in format ' '. + * Interpretation of args is defined by control file implementation. + */ +static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, + const char *buffer) +{ + struct cgroup_event *event = NULL; + unsigned int efd, cfd; + struct file *efile = NULL; + struct file *cfile = NULL; + char *endp; + int ret; + + efd = simple_strtoul(buffer, &endp, 10); + if (*endp != ' ') + return -EINVAL; + buffer = endp + 1; + + cfd = simple_strtoul(buffer, &endp, 10); + if ((*endp != ' ') && (*endp != '\0')) + return -EINVAL; + buffer = endp + 1; + + event = kzalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + event->cgrp = cgrp; + INIT_LIST_HEAD(&event->list); + init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); + init_waitqueue_func_entry(&event->wait, cgroup_event_wake); + INIT_WORK(&event->remove, cgroup_event_remove); + + efile = eventfd_fget(efd); + if (IS_ERR(efile)) { + ret = PTR_ERR(efile); + goto fail; + } + + event->eventfd = eventfd_ctx_fileget(efile); + if (IS_ERR(event->eventfd)) { + ret = PTR_ERR(event->eventfd); + goto fail; + } + + cfile = fget(cfd); + if (!cfile) { + ret = -EBADF; + goto fail; + } + + /* the process need read permission on control file */ + ret = file_permission(cfile, MAY_READ); + if (ret < 0) + goto fail; + + event->cft = __file_cft(cfile); + if (IS_ERR(event->cft)) { + ret = PTR_ERR(event->cft); + goto fail; + } + + if (!event->cft->register_event || !event->cft->unregister_event) { + ret = -EINVAL; + goto fail; + } + + ret = event->cft->register_event(cgrp, event->cft, + event->eventfd, buffer); + if (ret) + goto fail; + + if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { + event->cft->unregister_event(cgrp, event->cft, event->eventfd); + ret = 0; + goto fail; + } + + spin_lock(&cgrp->event_list_lock); + list_add(&event->list, &cgrp->event_list); + spin_unlock(&cgrp->event_list_lock); + + fput(cfile); + fput(efile); + + return 0; + +fail: + if (cfile) + fput(cfile); + + if (event && event->eventfd && !IS_ERR(event->eventfd)) + eventfd_ctx_put(event->eventfd); + + if (!IS_ERR_OR_NULL(efile)) + fput(efile); + + kfree(event); + + return ret; +} + /* * for the common functions, 'private' gives the type of file */ @@ -2955,6 +3176,11 @@ static struct cftype files[] = { .read_u64 = cgroup_read_notify_on_release, .write_u64 = cgroup_write_notify_on_release, }, + { + .name = CGROUP_FILE_GENERIC_PREFIX "event_control", + .write_string = cgroup_write_event_control, + .mode = S_IWUGO, + }, }; static struct cftype cft_release_agent = { -- cgit v1.2.2 From 4ab78683c17d739c2a2077141dcf81a02b7fb57e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:34 -0800 Subject: cgroups: fix race between userspace and kernelspace Notify userspace about cgroup removing only after rmdir of cgroup directory to avoid race between userspace and kernelspace. eventfd are used to notify about two types of event: - control file-specific, like crossing memory threshold; - cgroup removing. To understand what really happen, userspace can check if the cgroup still exists. To avoid race beetween userspace and kernelspace we have to notify userspace about cgroup removing only after rmdir of cgroup directory. Signed-off-by: Kirill A. Shutemov Reviewed-by: KAMEZAWA Hiroyuki Cc: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ea94984a3895..87441fc75663 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -795,28 +795,15 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb) static int cgroup_call_pre_destroy(struct cgroup *cgrp) { struct cgroup_subsys *ss; - struct cgroup_event *event, *tmp; int ret = 0; for_each_subsys(cgrp->root, ss) if (ss->pre_destroy) { ret = ss->pre_destroy(ss, cgrp); if (ret) - goto out; + break; } - /* - * Unregister events and notify userspace. - */ - spin_lock(&cgrp->event_list_lock); - list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { - list_del(&event->list); - eventfd_signal(event->eventfd, 1); - schedule_work(&event->remove); - } - spin_unlock(&cgrp->event_list_lock); - -out: return ret; } @@ -3006,7 +2993,6 @@ static void cgroup_event_remove(struct work_struct *work) event->cft->unregister_event(cgrp, event->cft, event->eventfd); eventfd_ctx_put(event->eventfd); - remove_wait_queue(event->wqh, &event->wait); kfree(event); } @@ -3024,6 +3010,7 @@ static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, unsigned long flags = (unsigned long)key; if (flags & POLLHUP) { + remove_wait_queue_locked(event->wqh, &event->wait); spin_lock(&cgrp->event_list_lock); list_del(&event->list); spin_unlock(&cgrp->event_list_lock); @@ -3472,6 +3459,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) struct dentry *d; struct cgroup *parent; DEFINE_WAIT(wait); + struct cgroup_event *event, *tmp; int ret; /* the vfs holds both inode->i_mutex already */ @@ -3555,6 +3543,20 @@ again: set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); + /* + * Unregister events and notify userspace. + * Notify userspace about cgroup removing only after rmdir of cgroup + * directory to avoid race between userspace and kernelspace + */ + spin_lock(&cgrp->event_list_lock); + list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { + list_del(&event->list); + remove_wait_queue(event->wqh, &event->wait); + eventfd_signal(event->eventfd, 1); + schedule_work(&event->remove); + } + spin_unlock(&cgrp->event_list_lock); + mutex_unlock(&cgroup_mutex); return 0; } -- cgit v1.2.2 From a0a4db548edcce067c1201ef25cf2bc29f32dca4 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 10 Mar 2010 15:22:34 -0800 Subject: cgroups: remove events before destroying subsystem state objects Events should be removed after rmdir of cgroup directory, but before destroying subsystem state objects. Let's take reference to cgroup directory dentry to do that. Signed-off-by: Kirill A. Shutemov Acked-by: KAMEZAWA Hiroyuki Cc: Paul Menage Acked-by: Li Zefan Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Dan Malek Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 87441fc75663..ef909a329750 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2994,6 +2994,7 @@ static void cgroup_event_remove(struct work_struct *work) eventfd_ctx_put(event->eventfd); kfree(event); + dput(cgrp->dentry); } /* @@ -3114,6 +3115,13 @@ static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, goto fail; } + /* + * Events should be removed after rmdir of cgroup directory, but before + * destroying subsystem state objects. Let's take reference to cgroup + * directory dentry to do that. + */ + dget(cgrp->dentry); + spin_lock(&cgrp->event_list_lock); list_add(&event->list, &cgrp->event_list); spin_unlock(&cgrp->event_list_lock); -- cgit v1.2.2 From a56704ef6b0c5796c9ff38cc78aa232dfb9644d7 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:01 -0800 Subject: copy_signal() cleanup: use zalloc and remove initializations Use kmem_cache_zalloc() on signal creation and remove unneeded initialization lines in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index b0ec34abc0bb..ce2666f84d85 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -863,7 +863,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (clone_flags & CLONE_THREAD) return 0; - sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); + sig = kmem_cache_zalloc(signal_cachep, GFP_KERNEL); tsk->signal = sig; if (!sig) return -ENOMEM; @@ -871,46 +871,21 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) atomic_set(&sig->count, 1); atomic_set(&sig->live, 1); init_waitqueue_head(&sig->wait_chldexit); - sig->flags = 0; if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; - sig->group_exit_code = 0; - sig->group_exit_task = NULL; - sig->group_stop_count = 0; sig->curr_target = tsk; init_sigpending(&sig->shared_pending); INIT_LIST_HEAD(&sig->posix_timers); hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - sig->it_real_incr.tv64 = 0; sig->real_timer.function = it_real_fn; - sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = NULL; - sig->tty = NULL; - - sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; - sig->gtime = cputime_zero; - sig->cgtime = cputime_zero; -#ifndef CONFIG_VIRT_CPU_ACCOUNTING - sig->prev_utime = sig->prev_stime = cputime_zero; -#endif - sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; - sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; - sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; - sig->maxrss = sig->cmaxrss = 0; - task_io_accounting_init(&sig->ioac); - sig->sum_sched_runtime = 0; - taskstats_tgid_init(sig); - task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); task_unlock(current->group_leader); posix_cpu_timers_init_group(sig); - acct_init_pacct(&sig->pacct); - tty_audit_fork(sig); sig->oom_adj = current->signal->oom_adj; -- cgit v1.2.2 From 4dd66e69d472f0ba5355a2529364d0db9a18a02b Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:02 -0800 Subject: copy_signal() cleanup: kill taskstats_tgid_init() and acct_init_pacct() Kill unused functions taskstats_tgid_init() and acct_init_pacct() because we don't use them anywhere after using kmem_cache_zalloc() in copy_signal(). Signed-off-by: Veaceslav Falico Cc: Roland McGrath Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index a6605ca921b6..24f8c81fc48d 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -587,16 +587,6 @@ out: revert_creds(orig_cred); } -/** - * acct_init_pacct - initialize a new pacct_struct - * @pacct: per-process accounting info struct to initialize - */ -void acct_init_pacct(struct pacct_struct *pacct) -{ - memset(pacct, 0, sizeof(struct pacct_struct)); - pacct->ac_utime = pacct->ac_stime = cputime_zero; -} - /** * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code -- cgit v1.2.2 From 93c59907c6f247d09239135caecf294a106a2ae0 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Wed, 10 Mar 2010 15:23:03 -0800 Subject: copy_signal() cleanup: clean thread_group_cputime_init() Remove unneeded initializations in thread_group_cputime_init() and in posix_cpu_timers_init_group(). They are useless after kmem_cache_zalloc() was used in copy_signal(). Signed-off-by: Veaceslav Falico Acked-by: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 11 ----------- 1 file changed, 11 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index ce2666f84d85..1beb6c303c41 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -833,17 +833,6 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig) /* Thread group counters. */ thread_group_cputime_init(sig); - /* Expiration times and increments. */ - sig->it[CPUCLOCK_PROF].expires = cputime_zero; - sig->it[CPUCLOCK_PROF].incr = cputime_zero; - sig->it[CPUCLOCK_VIRT].expires = cputime_zero; - sig->it[CPUCLOCK_VIRT].incr = cputime_zero; - - /* Cached expiration times. */ - sig->cputime_expires.prof_exp = cputime_zero; - sig->cputime_expires.virt_exp = cputime_zero; - sig->cputime_expires.sched_exp = 0; - cpu_limit = ACCESS_ONCE(sig->rlim[RLIMIT_CPU].rlim_cur); if (cpu_limit != RLIM_INFINITY) { sig->cputime_expires.prof_exp = secs_to_cputime(cpu_limit); -- cgit v1.2.2 From 13aa9a6b0f2371d2ce0de57c2ede62ab7a787157 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Mar 2010 15:23:09 -0800 Subject: pid_ns: zap_pid_ns_processes: use SEND_SIG_NOINFO instead of force_sig() zap_pid_ns_processes() uses force_sig(SIGKILL) to ensure SIGKILL will be delivered to sub-namespace inits as well. This is correct, but we are going to change force_sig_info() semantics. See http://bugzilla.kernel.org/show_bug.cgi?id=15395#c31 We can use send_sig_info(SEND_SIG_NOINFO) instead, since 614c517d7c00af1b26ded20646b329397d6f51a1 ("signals: SEND_SIG_NOINFO should be considered as SI_FROMUSER()") SEND_SIG_NOINFO means "from user" and therefore send_signal() will get the correct from_ancestor_ns = T flag. Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Acked-by: Linus Torvalds Acked-by: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 86b3796b0436..79aac93acf99 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -161,13 +161,12 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) rcu_read_lock(); /* - * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring - * any nested-container's init processes don't ignore the - * signal + * Any nested-container's init processes won't ignore the + * SEND_SIG_NOINFO signal, see send_signal()->si_fromuser(). */ task = pid_task(find_vpid(nr), PIDTYPE_PID); if (task) - force_sig(SIGKILL, task); + send_sig_info(SIGKILL, SEND_SIG_NOINFO, task); rcu_read_unlock(); -- cgit v1.2.2 From 8467005da3ef6104b89a4cc5e9c9d9445b75565f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 10 Mar 2010 15:23:10 -0800 Subject: nsproxy: remove INIT_NSPROXY() Remove INIT_NSPROXY(), use C99 initializer. Remove INIT_IPC_NS(), INIT_NET_NS() while I'm at it. Note: headers trim will be done later, now it's quite pointless because results will be invalidated by merge window. Signed-off-by: Alexey Dobriyan Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/nsproxy.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 09b4ff9711b2..2ab67233ee8f 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -24,7 +24,18 @@ static struct kmem_cache *nsproxy_cachep; -struct nsproxy init_nsproxy = INIT_NSPROXY(init_nsproxy); +struct nsproxy init_nsproxy = { + .count = ATOMIC_INIT(1), + .uts_ns = &init_uts_ns, +#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC) + .ipc_ns = &init_ipc_ns, +#endif + .mnt_ns = NULL, + .pid_ns = &init_pid_ns, +#ifdef CONFIG_NET + .net_ns = &init_net, +#endif +}; static inline struct nsproxy *create_nsproxy(void) { -- cgit v1.2.2 From eb5572fed55f4c2b7dbc42582bc82dcb47632380 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: C_A_D Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move C_A_D extern variable declaration to linux/reboot.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0ef19c614f6d..72c3b1e80d7b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -65,7 +65,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int C_A_D; extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; -- cgit v1.2.2 From d33ed52d57e794eba55cea3f5eab3c8f80b6cb5a Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:23:59 -0800 Subject: sysctl extern cleanup: signal Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move print_fatal_signals extern declaration to linux/signal.h Signed-off-by: Dave Young Cc: Oleg Nesterov Cc: Roland McGrath Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 72c3b1e80d7b..a8fd10a9a501 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -65,7 +66,6 @@ #if defined(CONFIG_SYSCTL) /* External variables not in a header file. */ -extern int print_fatal_signals; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern int sysctl_panic_on_oom; -- cgit v1.2.2 From e5ab67726f33b50f40db0ccf271ceb3c658554d5 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:05 -0800 Subject: sysctl extern cleanup: rcu Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move rcutorture_runnable extern declaration to linux/rcupdate.h Signed-off-by: Dave Young Acked-by: Josh Triplett Reviewed-by: "Paul E. McKenney" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index a8fd10a9a501..f18aaa7b0d65 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -87,9 +87,6 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max; #ifndef CONFIG_MMU extern int sysctl_nr_trim_pages; #endif -#ifdef CONFIG_RCU_TORTURE_TEST -extern int rcutorture_runnable; -#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ #ifdef CONFIG_BLOCK extern int blk_iopoll_enabled; #endif -- cgit v1.2.2 From 5ed109103d73b0bafc92e860cead56725231384d Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:06 -0800 Subject: sysctl extern cleanup: module Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move modprobe_path extern declaration to linux/kmod.h Move modules_disabled extern declaration to linux/module.h Signed-off-by: Dave Young Cc: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f18aaa7b0d65..44e9492368fd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -116,10 +116,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_MODULES -extern char modprobe_path[]; -extern int modules_disabled; -#endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; #endif -- cgit v1.2.2 From 15485a4682d1d3bfee2aa78b4b1a5d36f5746b64 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:07 -0800 Subject: sysctl extern cleanup: sg Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move sg_big_buff extern declaration to scsi/sg.h Signed-off-by: Dave Young Acked-by: Doug Gilbert Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 44e9492368fd..5290c437f151 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_CHR_DEV_SG +#include +#endif #if defined(CONFIG_SYSCTL) @@ -116,10 +119,6 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; -#ifdef CONFIG_CHR_DEV_SG -extern int sg_big_buff; -#endif - #ifdef CONFIG_SPARC #include #endif -- cgit v1.2.2 From c55b7c3e82d0ad58f35a0785faaaf2f70b9b6cd3 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:08 -0800 Subject: sysctl extern cleanup: acct Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move acct_parm extern declaration to linux/acct.h Signed-off-by: Dave Young Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 5290c437f151..7635bb15f5af 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -61,6 +61,9 @@ #include #include #endif +#ifdef CONFIG_BSD_PROCESS_ACCT +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -140,10 +143,6 @@ extern int sysctl_userprocess_debug; extern int spin_retry; #endif -#ifdef CONFIG_BSD_PROCESS_ACCT -extern int acct_parm[]; -#endif - #ifdef CONFIG_IA64 extern int no_unaligned_warning; extern int unaligned_dump_stack; -- cgit v1.2.2 From 4f0e056fdebc15d3f4724ebc7bbf323158add1d7 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:09 -0800 Subject: sysctl extern cleanup: rtmutex Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move max_lock_depth extern declaration to linux/rtmutex.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7635bb15f5af..622029ba5103 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,9 @@ #ifdef CONFIG_BSD_PROCESS_ACCT #include #endif +#ifdef CONFIG_RT_MUTEXES +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -150,10 +153,6 @@ extern int unaligned_dump_stack; extern struct ratelimit_state printk_ratelimit_state; -#ifdef CONFIG_RT_MUTEXES -extern int max_lock_depth; -#endif - #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.2 From 2edf5e49800846a2b2b6461d99cdae18067c440f Mon Sep 17 00:00:00 2001 From: Dave Young Date: Wed, 10 Mar 2010 15:24:10 -0800 Subject: sysctl extern cleanup: lockdep Extern declarations in sysctl.c should be moved to their own header file, and then include them in relavant .c files. Move lockdep extern declarations to linux/lockdep.h Signed-off-by: Dave Young Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 622029ba5103..8686b0f5fc12 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -67,6 +67,9 @@ #ifdef CONFIG_RT_MUTEXES #include #endif +#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_LOCK_STAT) +#include +#endif #ifdef CONFIG_CHR_DEV_SG #include #endif @@ -191,9 +194,6 @@ extern struct ctl_table epoll_table[]; int sysctl_legacy_va_layout; #endif -extern int prove_locking; -extern int lock_stat; - /* The default sysctl tables: */ static struct ctl_table root_table[] = { -- cgit v1.2.2 From 52fbe9cde7fdb5c6fac196d7ebd2d92d05ef3cd4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 8 Mar 2010 14:50:43 +0800 Subject: ring-buffer: Move disabled check into preempt disable section The ring buffer resizing and resetting relies on a schedule RCU action. The buffers are disabled, a synchronize_sched() is called and then the resize or reset takes place. But this only works if the disabling of the buffers are within the preempt disabled section, otherwise a window exists that the buffers can be written to while a reset or resize takes place. Cc: stable@kernel.org Reported-by: Li Zefan Signed-off-by: Lai Jiangshan LKML-Reference: <4B949E43.2010906@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 8c1b2d290718..54191d6ed195 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2232,12 +2232,12 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (ring_buffer_flags != RB_BUFFERS_ON) return NULL; - if (atomic_read(&buffer->record_disabled)) - return NULL; - /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out_nocheck; + if (trace_recursive_lock()) goto out_nocheck; @@ -2469,11 +2469,11 @@ int ring_buffer_write(struct ring_buffer *buffer, if (ring_buffer_flags != RB_BUFFERS_ON) return -EBUSY; - if (atomic_read(&buffer->record_disabled)) - return -EBUSY; - resched = ftrace_preempt_disable(); + if (atomic_read(&buffer->record_disabled)) + goto out; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) -- cgit v1.2.2 From ea14eb714041d40fcc5180b5a586034503650149 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:41:23 -0500 Subject: function-graph: Init curr_ret_stack with ret_stack If the graph tracer is active, and a task is forked but the allocating of the processes graph stack fails, it can cause crash later on. This is due to the temporary stack being NULL, but the curr_ret_stack variable is copied from the parent. If it is not -1, then in ftrace_graph_probe_sched_switch() the following: for (index = next->curr_ret_stack; index >= 0; index--) next->ret_stack[index].calltime += timestamp; Will cause a kernel OOPS. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d4d1238b096b..bb53edbb5c8c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3349,6 +3349,7 @@ void ftrace_graph_init_task(struct task_struct *t) { /* Make sure we do not use the parent ret_stack */ t->ret_stack = NULL; + t->curr_ret_stack = -1; if (ftrace_graph_active) { struct ftrace_ret_stack *ret_stack; @@ -3358,7 +3359,6 @@ void ftrace_graph_init_task(struct task_struct *t) GFP_KERNEL); if (!ret_stack) return; - t->curr_ret_stack = -1; atomic_set(&t->tracing_graph_pause, 0); atomic_set(&t->trace_overrun, 0); t->ftrace_timestamp = 0; -- cgit v1.2.2 From 283740c619d211e34572cc93c8cdba92ccbdb9cc Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:48:41 -0500 Subject: tracing: Use same local variable when resetting the ring buffer In the ftrace code that resets the ring buffer it references the buffer with a local variable, but then uses the tr->buffer as the parameter to reset. If the wakeup tracer is running, which can switch the tr->buffer with the max saved buffer, this can break the requirement of disabling the buffer before the reset. buffer = tr->buffer; ring_buffer_record_disable(buffer); synchronize_sched(); __tracing_reset(tr->buffer, cpu); If the tr->buffer is swapped, then the reset is not happening to the buffer that was disabled. This will cause the ring buffer to fail. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Reported-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6af8d7bc953b..60de37bd0f75 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -840,10 +840,10 @@ out: mutex_unlock(&trace_types_lock); } -static void __tracing_reset(struct trace_array *tr, int cpu) +static void __tracing_reset(struct ring_buffer *buffer, int cpu) { ftrace_disable_cpu(); - ring_buffer_reset_cpu(tr->buffer, cpu); + ring_buffer_reset_cpu(buffer, cpu); ftrace_enable_cpu(); } @@ -855,7 +855,7 @@ void tracing_reset(struct trace_array *tr, int cpu) /* Make sure all commits have finished */ synchronize_sched(); - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } @@ -873,7 +873,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) tr->time_start = ftrace_now(tr->cpu); for_each_online_cpu(cpu) - __tracing_reset(tr, cpu); + __tracing_reset(buffer, cpu); ring_buffer_record_enable(buffer); } -- cgit v1.2.2 From a2f8071428ed9a0f06865f417c962421c9a6b488 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 19:56:00 -0500 Subject: tracing: Disable buffer switching when starting or stopping trace When the trace iterator is read, tracing_start() and tracing_stop() is called to stop tracing while the iterator is processing the trace output. These functions disable both the standard buffer and the max latency buffer. But if the wakeup tracer is running, it can switch these buffers between the two disables: buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); <<<--------- swap happens here buffer = max_tr.buffer; if (buffer) ring_buffer_record_disable(buffer); What happens is that we disabled the same buffer twice. On tracing_start() we can enable the same buffer twice. All ring_buffer_record_disable() must be matched with a ring_buffer_record_enable() or the buffer can be disable permanently, or enable prematurely, and cause a bug where a reset happens while a trace is commiting. This patch protects these two by taking the ftrace_max_lock to prevent a switch from occurring. Found with Li Zefan's ftrace_stress_test. Cc: stable@kernel.org Reported-by: Lai Jiangshan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 60de37bd0f75..484337d33959 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -950,6 +950,8 @@ void tracing_start(void) goto out; } + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); buffer = global_trace.buffer; if (buffer) @@ -959,6 +961,8 @@ void tracing_start(void) if (buffer) ring_buffer_record_enable(buffer); + arch_spin_unlock(&ftrace_max_lock); + ftrace_start(); out: spin_unlock_irqrestore(&tracing_start_lock, flags); @@ -980,6 +984,9 @@ void tracing_stop(void) if (trace_stop_count++) goto out; + /* Prevent the buffers from switching */ + arch_spin_lock(&ftrace_max_lock); + buffer = global_trace.buffer; if (buffer) ring_buffer_record_disable(buffer); @@ -988,6 +995,8 @@ void tracing_stop(void) if (buffer) ring_buffer_record_disable(buffer); + arch_spin_unlock(&ftrace_max_lock); + out: spin_unlock_irqrestore(&tracing_start_lock, flags); } -- cgit v1.2.2 From b6345879ccbd9b92864fbd7eb8ac48acdb4d6b15 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 12 Mar 2010 20:03:30 -0500 Subject: tracing: Do not record user stack trace from NMI context A bug was found with Li Zefan's ftrace_stress_test that caused applications to segfault during the test. Placing a tracing_off() in the segfault code, and examining several traces, I found that the following was always the case. The lock tracer was enabled (lockdep being required) and userstack was enabled. Testing this out, I just enabled the two, but that was not good enough. I needed to run something else that could trigger it. Running a load like hackbench did not work, but executing a new program would. The following would trigger the segfault within seconds: # echo 1 > /debug/tracing/options/userstacktrace # echo 1 > /debug/tracing/events/lock/enable # while :; do ls > /dev/null ; done Enabling the function graph tracer and looking at what was happening I finally noticed that all cashes happened just after an NMI. 1) | copy_user_handle_tail() { 1) | bad_area_nosemaphore() { 1) | __bad_area_nosemaphore() { 1) | no_context() { 1) | fixup_exception() { 1) 0.319 us | search_exception_tables(); 1) 0.873 us | } [...] 1) 0.314 us | __rcu_read_unlock(); 1) 0.325 us | native_apic_mem_write(); 1) 0.943 us | } 1) 0.304 us | rcu_nmi_exit(); [...] 1) 0.479 us | find_vma(); 1) | bad_area() { 1) | __bad_area() { After capturing several traces of failures, all of them happened after an NMI. Curious about this, I added a trace_printk() to the NMI handler to read the regs->ip to see where the NMI happened. In which I found out it was here: ffffffff8135b660 : ffffffff8135b660: 48 83 ec 78 sub $0x78,%rsp ffffffff8135b664: e8 97 01 00 00 callq ffffffff8135b800 What was happening is that the NMI would happen at the place that a page fault occurred. It would call rcu_read_lock() which was traced by the lock events, and the user_stack_trace would run. This would trigger a page fault inside the NMI. I do not see where the CR2 register is saved or restored in NMI handling. This means that it would corrupt the page fault handling that the NMI interrupted. The reason the while loop of ls helped trigger the bug, was that each execution of ls would cause lots of pages to be faulted in, and increase the chances of the race happening. The simple solution is to not allow user stack traces in NMI context. After this patch, I ran the above "ls" test for a couple of hours without any issues. Without this patch, the bug would trigger in less than a minute. Cc: stable@kernel.org Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 484337d33959..e52683f7c3b2 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1284,6 +1284,13 @@ ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) if (!(trace_flags & TRACE_ITER_USERSTACKTRACE)) return; + /* + * NMIs can not handle page faults, even with fix ups. + * The save user stack can (and often does) fault. + */ + if (unlikely(in_nmi())) + return; + event = trace_buffer_lock_reserve(buffer, TRACE_USER_STACK, sizeof(*entry), flags, pc); if (!event) -- cgit v1.2.2 From cd3d8031eb4311e516329aee03c79a08333141f1 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Fri, 12 Mar 2010 16:15:36 +0900 Subject: sched: sched_getaffinity(): Allow less than NR_CPUS length [ Note, this commit changes the syscall ABI for > 1024 CPUs systems. ] Recently, some distro decided to use NR_CPUS=4096 for mysterious reasons. Unfortunately, glibc sched interface has the following definition: # define __CPU_SETSIZE 1024 # define __NCPUBITS (8 * sizeof (__cpu_mask)) typedef unsigned long int __cpu_mask; typedef struct { __cpu_mask __bits[__CPU_SETSIZE / __NCPUBITS]; } cpu_set_t; It mean, if NR_CPUS is bigger than 1024, cpu_set_t makes an ABI issue ... More recently, Sharyathi Nagesh reported following test program makes misterious syscall failure: ----------------------------------------------------------------------- #define _GNU_SOURCE #include #include #include int main() { cpu_set_t set; if (sched_getaffinity(0, sizeof(cpu_set_t), &set) < 0) printf("\n Call is failing with:%d", errno); } ----------------------------------------------------------------------- Because the kernel assumes len argument of sched_getaffinity() is bigger than NR_CPUS. But now it is not correct. Now we are faced with the following annoying dilemma, due to the limitations of the glibc interface built in years ago: (1) if we change glibc's __CPU_SETSIZE definition, we lost binary compatibility of _all_ application. (2) if we don't change it, we also lost binary compatibility of Sharyathi's use case. Then, I would propse to change the rule of the len argument of sched_getaffinity(). Old: len should be bigger than NR_CPUS New: len should be bigger than maximum possible cpu id This creates the following behavior: (A) In the real 4096 cpus machine, the above test program still return -EINVAL. (B) NR_CPUS=4096 but the machine have less than 1024 cpus (almost all machines in the world), the above can run successfully. Fortunatelly, BIG SGI machine is mainly used for HPC use case. It means they can rebuild their programs. IOW we hope they are not annoyed by this issue ... Reported-by: Sharyathi Nagesh Signed-off-by: KOSAKI Motohiro Acked-by: Ulrich Drepper Acked-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Cc: Jack Steiner Cc: Russ Anderson Cc: Mike Travis LKML-Reference: <20100312161316.9520.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 9ab3cd7858d3..6eaef3df2d19 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4902,7 +4902,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, int ret; cpumask_var_t mask; - if (len < cpumask_size()) + if (len < nr_cpu_ids) + return -EINVAL; + if (len & (sizeof(unsigned long)-1)) return -EINVAL; if (!alloc_cpumask_var(&mask, GFP_KERNEL)) @@ -4910,10 +4912,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - if (copy_to_user(user_mask_ptr, mask, cpumask_size())) + int retlen = min(len, cpumask_size()); + + if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; else - ret = cpumask_size(); + ret = retlen; } free_cpumask_var(mask); -- cgit v1.2.2 From e3818b8dce2a934cd1521dbc4827e5238d8f45d8 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 15 Mar 2010 17:03:43 -0700 Subject: rcu: Make rcu_read_lock_bh_held() allow for disabled BH Disabling BH can stand in for rcu_read_lock_bh(), and this patch updates rcu_read_lock_bh_held() to allow for this. In order to avoid include-file hell, this function is moved out of line to kernel/rcupdate.c. This fixes a false positive RCU warning. Reported-by: Arnd Bergmann Reported-by: Eric Dumazet Signed-off-by: Paul E. McKenney Acked-by: Lai Jiangshan Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com LKML-Reference: <20100316000343.GA25857@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/rcupdate.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'kernel') diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index f1125c1a6321..63fe25433980 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -45,6 +45,7 @@ #include #include #include +#include #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; @@ -66,6 +67,28 @@ EXPORT_SYMBOL_GPL(rcu_sched_lock_map); int rcu_scheduler_active __read_mostly; EXPORT_SYMBOL_GPL(rcu_scheduler_active); +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +/** + * rcu_read_lock_bh_held - might we be in RCU-bh read-side critical section? + * + * Check for bottom half being disabled, which covers both the + * CONFIG_PROVE_RCU and not cases. Note that if someone uses + * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) + * will show the situation. + * + * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. + */ +int rcu_read_lock_bh_held(void) +{ + if (!debug_lockdep_rcu_enabled()) + return 1; + return in_softirq(); +} +EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); + +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * This function is invoked towards the end of the scheduler's initialization * process. Before this is called, the idle task might contain -- cgit v1.2.2 From c890692bf37671b5b78a1870d55d6d87e1c8a509 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Mar 2010 14:08:43 -0800 Subject: kernel/sched.c: Suppress unused var warning On UP: kernel/sched.c: In function 'wake_up_new_task': kernel/sched.c:2631: warning: unused variable 'cpu' Signed-off-by: Andrew Morton Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6eaef3df2d19..82975b5b42f7 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2650,7 +2650,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; - int cpu = get_cpu(); + int cpu __maybe_unused = get_cpu(); #ifdef CONFIG_SMP /* -- cgit v1.2.2 From 8bc037fb89bb3104b9ae290d18c877624cd7d9cc Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 17 Mar 2010 09:36:58 +0900 Subject: sched: Use proper type in sched_getaffinity() Using the proper type fixes the following compiler warning: kernel/sched.c:4850: warning: comparison of distinct pointer types lacks a cast Signed-off-by: KOSAKI Motohiro Cc: torvalds@linux-foundation.org Cc: travis@sgi.com Cc: peterz@infradead.org Cc: drepper@redhat.com Cc: rja@sgi.com Cc: sharyath@in.ibm.com Cc: steiner@sgi.com LKML-Reference: <20100317090046.4C79.A69D9226@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 82975b5b42f7..49d2fa7b687a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4912,7 +4912,7 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ret = sched_getaffinity(pid, mask); if (ret == 0) { - int retlen = min(len, cpumask_size()); + size_t retlen = min_t(size_t, len, cpumask_size()); if (copy_to_user(user_mask_ptr, mask, retlen)) ret = -EFAULT; -- cgit v1.2.2 From dcd5c1662db59a6b82942f47fb6ac9dd63f6d3dd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 16 Mar 2010 01:05:02 +0100 Subject: perf: Fix unexported generic perf_arch_fetch_caller_regs perf_arch_fetch_caller_regs() is exported for the overriden x86 version, but not for the generic weak version. As a general rule, weak functions should not have their symbol exported in the same file they are defined. So let's export it on trace_event_perf.c as it is used by trace events only. This fixes: ERROR: ".perf_arch_fetch_caller_regs" [fs/xfs/xfs.ko] undefined! ERROR: ".perf_arch_fetch_caller_regs" [arch/powerpc/platforms/cell/spufs/spufs.ko] undefined! -v2: And also only build it if trace events are enabled. -v3: Fix changelog mistake Reported-by: Stephen Rothwell Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Xiao Guangrong Cc: Paul Mackerras LKML-Reference: <1268697902-9518-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 ++ kernel/trace/trace_event_perf.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fb3031cf9f17..574ee58a3046 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2786,10 +2786,12 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return NULL; } +#ifdef CONFIG_EVENT_TRACING __weak void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) { } +#endif /* * Output diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0709e4f75114..7d79a10c3cde 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -12,6 +12,8 @@ DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); +EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); + static char *perf_trace_buf; static char *perf_trace_buf_nmi; -- cgit v1.2.2 From 2271048d1b3b0aabf83d25b29c20646dcabedc05 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 18 Mar 2010 17:54:19 -0400 Subject: ring-buffer: Do 8 byte alignment for 64 bit that can not handle 4 byte align The ring buffer uses 4 byte alignment while recording events into the buffer, even on 64bit machines. This saves space when there are lots of events being recorded at 4 byte boundaries. The ring buffer has a zero copy method to write into the buffer, with the reserving of space and then committing it. This may cause problems when writing an 8 byte word into a 4 byte alignment (not 8). For x86 and PPC this is not an issue, but on some architectures this would cause an out-of-alignment exception. This patch uses CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS to determine if it is OK to use 4 byte alignments on 64 bit machines. If it is not, it forces the ring buffer event header to be 8 bytes and not 4, and will align the length of the data to be 8 byte aligned. This keeps the data payload at 8 byte alignments and will allow these machines to run without issue. The trick to this is that the header can be either 4 bytes or 8 bytes depending on the length of the data payload. The 4 byte header has a length field that supports up to 112 bytes. If the length of the data is more than 112, the length field is set to zero, and the actual length is stored in the next 4 bytes after the header. When CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is not set, the code forces zero in the 4 byte header forcing the length to be stored in the 4 byte array, even with a small data load. It also forces the length of the data load to be 8 byte aligned. The combination of these two guarantee that the data is always at 8 byte alignment. Tested-by: Frederic Weisbecker (on sparc64) Reported-by: Frederic Weisbecker Acked-by: David S. Miller Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 05a9f83b8819..d1187ef20caf 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -207,6 +207,14 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) #define RB_EVNT_MIN_SIZE 8U /* two 32bit words */ +#if !defined(CONFIG_64BIT) || defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +# define RB_FORCE_8BYTE_ALIGNMENT 0 +# define RB_ARCH_ALIGNMENT RB_ALIGNMENT +#else +# define RB_FORCE_8BYTE_ALIGNMENT 1 +# define RB_ARCH_ALIGNMENT 8U +#endif + /* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ #define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX @@ -1547,7 +1555,7 @@ rb_update_event(struct ring_buffer_event *event, case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) event->array[0] = length; else event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); @@ -1722,11 +1730,11 @@ static unsigned rb_calculate_event_length(unsigned length) if (!length) length = 1; - if (length > RB_MAX_SMALL_DATA) + if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) length += sizeof(event.array[0]); length += RB_EVNT_HDR_SIZE; - length = ALIGN(length, RB_ALIGNMENT); + length = ALIGN(length, RB_ARCH_ALIGNMENT); return length; } -- cgit v1.2.2 From 8c2eb4805d422bdbf60ba00ff233c794d23c3c00 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 19 Mar 2010 10:28:02 +0000 Subject: softlockup: Stop spurious softlockup messages due to overflow Ensure additions on touch_ts do not overflow. This can occur when the top 32 bits of the TSC reach 0xffffffff causing additions to touch_ts to overflow and this in turn generates spurious softlockup warnings. Signed-off-by: Colin Ian King Cc: Peter Zijlstra Cc: Eric Dumazet Cc: LKML-Reference: <1268994482.1798.6.camel@lenovo> Signed-off-by: Ingo Molnar --- kernel/softlockup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 0d4c7898ab80..4b493f67dcb5 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -155,11 +155,11 @@ void softlockup_tick(void) * Wake up the high-prio watchdog task twice per * threshold timespan. */ - if (now > touch_ts + softlockup_thresh/2) + if (time_after(now - softlockup_thresh/2, touch_ts)) wake_up_process(per_cpu(softlockup_watchdog, this_cpu)); /* Warn about unreasonable delays: */ - if (now <= (touch_ts + softlockup_thresh)) + if (time_before_eq(now - softlockup_thresh, touch_ts)) return; per_cpu(softlockup_print_ts, this_cpu) = touch_ts; -- cgit v1.2.2 From 830ec0458c390f29c6c99e1ff7feab9e36368d12 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 18 Mar 2010 14:47:30 -0700 Subject: time: Fix accumulation bug triggered by long delay. The logarithmic accumulation done in the timekeeping has some overflow protection that limits the max shift value. That means it will take more then shift loops to accumulate all of the cycles. This causes the shift decrement to underflow, which causes the loop to never exit. The simplest fix would be simply to do a: if (shift) shift--; However that is not optimal, as we know the cycle offset is larger then the interval << shift, the above would make shift drop to zero, then we would be spinning for quite awhile accumulating at interval chunks at a time. Instead, this patch only decreases shift if the offset is smaller then cycle_interval << shift. This makes sure we accumulate using the largest chunks possible without overflowing tick_length, and limits the number of iterations through the loop. This issue was found and reported by Sonic Zhang, who also tested the fix. Many thanks your explanation and testing! Reported-by: Sonic Zhang Signed-off-by: John Stultz Tested-by: Sonic Zhang LKML-Reference: <1268948850-5225-1-git-send-email-johnstul@us.ibm.com> Signed-off-by: Thomas Gleixner --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 16736379a9ca..39f6177fafac 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -818,7 +818,8 @@ void update_wall_time(void) shift = min(shift, maxshift); while (offset >= timekeeper.cycle_interval) { offset = logarithmic_accumulation(offset, shift); - shift--; + if(offset < timekeeper.cycle_interval< Date: Thu, 11 Mar 2010 17:01:09 -0700 Subject: resources: add interfaces that return conflict information request_resource() and insert_resource() only return success or failure, which no information about what existing resource conflicted with the proposed new reservation. This patch adds request_resource_conflict() and insert_resource_conflict(), which return the conflicting resource. Callers may use this for better error messages or to adjust the new resource and retry the request. Signed-off-by: Bjorn Helgaas Signed-off-by: Jesse Barnes --- kernel/resource.c | 44 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 2d5be5d9bf5f..9c358e263534 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -219,19 +219,34 @@ void release_child_resources(struct resource *r) } /** - * request_resource - request and reserve an I/O or memory resource + * request_resource_conflict - request and reserve an I/O or memory resource * @root: root resource descriptor * @new: resource descriptor desired by caller * - * Returns 0 for success, negative error code on error. + * Returns 0 for success, conflict resource on error. */ -int request_resource(struct resource *root, struct resource *new) +struct resource *request_resource_conflict(struct resource *root, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __request_resource(root, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * request_resource - request and reserve an I/O or memory resource + * @root: root resource descriptor + * @new: resource descriptor desired by caller + * + * Returns 0 for success, negative error code on error. + */ +int request_resource(struct resource *root, struct resource *new) +{ + struct resource *conflict; + + conflict = request_resource_conflict(root, new); return conflict ? -EBUSY : 0; } @@ -474,25 +489,40 @@ static struct resource * __insert_resource(struct resource *parent, struct resou } /** - * insert_resource - Inserts a resource in the resource tree + * insert_resource_conflict - Inserts resource in the resource tree * @parent: parent of the new resource * @new: new resource to insert * - * Returns 0 on success, -EBUSY if the resource can't be inserted. + * Returns 0 on success, conflict resource if the resource can't be inserted. * - * This function is equivalent to request_resource when no conflict + * This function is equivalent to request_resource_conflict when no conflict * happens. If a conflict happens, and the conflicting resources * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. */ -int insert_resource(struct resource *parent, struct resource *new) +struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { struct resource *conflict; write_lock(&resource_lock); conflict = __insert_resource(parent, new); write_unlock(&resource_lock); + return conflict; +} + +/** + * insert_resource - Inserts a resource in the resource tree + * @parent: parent of the new resource + * @new: new resource to insert + * + * Returns 0 on success, -EBUSY if the resource can't be inserted. + */ +int insert_resource(struct resource *parent, struct resource *new) +{ + struct resource *conflict; + + conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } -- cgit v1.2.2 From cc8c3b78433222e5dbc1fdfcfdde29e1743f181a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 23 Mar 2010 22:40:53 +0100 Subject: genirq: Protect access to irq_desc->action in can_request_irq() can_request_irq() accesses and dereferences irq_desc->action w/o holding irq_desc->lock. So action can be freed on another CPU before it's dereferenced. Unlikely, but ... Protect it with desc->lock. Signed-off-by: Thomas Gleixner --- kernel/irq/manage.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 69a3d7b9414c..398fda155f6e 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -382,6 +382,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) { struct irq_desc *desc = irq_to_desc(irq); struct irqaction *action; + unsigned long flags; if (!desc) return 0; @@ -389,11 +390,14 @@ int can_request_irq(unsigned int irq, unsigned long irqflags) if (desc->status & IRQ_NOREQUEST) return 0; + raw_spin_lock_irqsave(&desc->lock, flags); action = desc->action; if (action) if (irqflags & action->flags & IRQF_SHARED) action = NULL; + raw_spin_unlock_irqrestore(&desc->lock, flags); + return !action; } -- cgit v1.2.2 From 860652bfb890bd861c999ec39fcffabe5b712f85 Mon Sep 17 00:00:00 2001 From: Henrik Kretzschmar Date: Wed, 24 Mar 2010 12:59:20 +0100 Subject: genirq: Move two IRQ functions from .init.text to .text Both functions should not be marked as __init, since they be called from modules after the init section is freed. Signed-off-by: Henrik Kretzschmar Cc: Yinghai Lu Cc: Peter Zijlstra Cc: Jiri Kosina LKML-Reference: <1269431961-5731-1-git-send-email-henne@nachtwindheim.de> Signed-off-by: Thomas Gleixner --- kernel/irq/chip.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 71eba24a39a2..3c2d6e7737f0 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -729,7 +729,7 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip, __set_irq_handler(irq, handle, 0, name); } -void __init set_irq_noprobe(unsigned int irq) +void set_irq_noprobe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; @@ -744,7 +744,7 @@ void __init set_irq_noprobe(unsigned int irq) raw_spin_unlock_irqrestore(&desc->lock, flags); } -void __init set_irq_probe(unsigned int irq) +void set_irq_probe(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); unsigned long flags; -- cgit v1.2.2 From 9d34706f42f9b8c15185423d9af98d37ba21d011 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 23 Mar 2010 13:35:12 -0700 Subject: cgroups: remove duplicate include commit e6a1105b ("cgroups: subsystem module loading interface") and commit c50cc752 ("sched, cgroups: Fix module export") result in duplicate including of module.h Signed-off-by: Li Zefan Acked-by: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ef909a329750..e2769e13980c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -27,7 +27,6 @@ */ #include -#include #include #include #include -- cgit v1.2.2 From 5ab116c9349ef52d6fbd2e2917a53f13194b048e Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:34 -0700 Subject: cpuset: fix the problem that cpuset_mem_spread_node() returns an offline node cpuset_mem_spread_node() returns an offline node, and causes an oops. This patch fixes it by initializing task->mems_allowed to node_states[N_HIGH_MEMORY], and updating task->mems_allowed when doing memory hotplug. Signed-off-by: Miao Xie Acked-by: David Rientjes Reported-by: Nick Piggin Tested-by: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 20 ++++++++++++-------- kernel/kthread.c | 2 +- 2 files changed, 13 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index ba401fab459f..5d38bd74483c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -920,9 +920,6 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, * call to guarantee_online_mems(), as we know no one is changing * our task's cpuset. * - * Hold callback_mutex around the two modifications of our tasks - * mems_allowed to synchronize with cpuset_mems_allowed(). - * * While the mm_struct we are migrating is typically from some * other task, the task_struct mems_allowed that we are hacking * is for our current task, which must allocate new pages for that @@ -1391,11 +1388,10 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); - to = node_possible_map; } else { guarantee_online_cpus(cs, cpus_attach); - guarantee_online_mems(cs, &to); } + guarantee_online_mems(cs, &to); /* do per-task migration stuff possibly for each in the threadgroup */ cpuset_attach_task(tsk, &to, cs); @@ -2090,15 +2086,23 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { + nodemask_t oldmems; + cgroup_lock(); switch (action) { case MEM_ONLINE: - case MEM_OFFLINE: + oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - if (action == MEM_OFFLINE) - scan_for_empty_cpusets(&top_cpuset); + update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + break; + case MEM_OFFLINE: + /* + * needn't update top_cpuset.mems_allowed explicitly because + * scan_for_empty_cpusets() will update it. + */ + scan_for_empty_cpusets(&top_cpuset); break; default: break; diff --git a/kernel/kthread.c b/kernel/kthread.c index 82ed0ea15194..83911c780175 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -219,7 +219,7 @@ int kthreadd(void *unused) set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); set_cpus_allowed_ptr(tsk, cpu_all_mask); - set_mems_allowed(node_possible_map); + set_mems_allowed(node_states[N_HIGH_MEMORY]); current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; -- cgit v1.2.2 From 53feb29767c29c877f9d47dcfe14211b5b0f7ebd Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 23 Mar 2010 13:35:35 -0700 Subject: cpuset: alloc nodemask_t on the heap rather than the stack Signed-off-by: Miao Xie Acked-by: David Rientjes Cc: Nick Piggin Cc: Paul Menage Cc: Li Zefan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 94 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 66 insertions(+), 28 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 5d38bd74483c..d10946748ec2 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -970,15 +970,20 @@ static void cpuset_change_nodemask(struct task_struct *p, struct cpuset *cs; int migrate; const nodemask_t *oldmem = scan->data; - nodemask_t newmems; + NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL); + + if (!newmems) + return; cs = cgroup_cs(scan->cg); - guarantee_online_mems(cs, &newmems); + guarantee_online_mems(cs, newmems); task_lock(p); - cpuset_change_task_nodemask(p, &newmems); + cpuset_change_task_nodemask(p, newmems); task_unlock(p); + NODEMASK_FREE(newmems); + mm = get_task_mm(p); if (!mm) return; @@ -1048,16 +1053,21 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem, static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, const char *buf) { - nodemask_t oldmem; + NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL); int retval; struct ptr_heap heap; + if (!oldmem) + return -ENOMEM; + /* * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; * it's read-only */ - if (cs == &top_cpuset) - return -EACCES; + if (cs == &top_cpuset) { + retval = -EACCES; + goto done; + } /* * An empty mems_allowed is ok iff there are no tasks in the cpuset. @@ -1073,11 +1083,13 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, goto done; if (!nodes_subset(trialcs->mems_allowed, - node_states[N_HIGH_MEMORY])) - return -EINVAL; + node_states[N_HIGH_MEMORY])) { + retval = -EINVAL; + goto done; + } } - oldmem = cs->mems_allowed; - if (nodes_equal(oldmem, trialcs->mems_allowed)) { + *oldmem = cs->mems_allowed; + if (nodes_equal(*oldmem, trialcs->mems_allowed)) { retval = 0; /* Too easy - nothing to do */ goto done; } @@ -1093,10 +1105,11 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, cs->mems_allowed = trialcs->mems_allowed; mutex_unlock(&callback_mutex); - update_tasks_nodemask(cs, &oldmem, &heap); + update_tasks_nodemask(cs, oldmem, &heap); heap_free(&heap); done: + NODEMASK_FREE(oldmem); return retval; } @@ -1381,39 +1394,47 @@ static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *oldcont, struct task_struct *tsk, bool threadgroup) { - nodemask_t from, to; struct mm_struct *mm; struct cpuset *cs = cgroup_cs(cont); struct cpuset *oldcs = cgroup_cs(oldcont); + NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL); + NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL); + + if (from == NULL || to == NULL) + goto alloc_fail; if (cs == &top_cpuset) { cpumask_copy(cpus_attach, cpu_possible_mask); } else { guarantee_online_cpus(cs, cpus_attach); } - guarantee_online_mems(cs, &to); + guarantee_online_mems(cs, to); /* do per-task migration stuff possibly for each in the threadgroup */ - cpuset_attach_task(tsk, &to, cs); + cpuset_attach_task(tsk, to, cs); if (threadgroup) { struct task_struct *c; rcu_read_lock(); list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) { - cpuset_attach_task(c, &to, cs); + cpuset_attach_task(c, to, cs); } rcu_read_unlock(); } /* change mm; only needs to be done once even if threadgroup */ - from = oldcs->mems_allowed; - to = cs->mems_allowed; + *from = oldcs->mems_allowed; + *to = cs->mems_allowed; mm = get_task_mm(tsk); if (mm) { - mpol_rebind_mm(mm, &to); + mpol_rebind_mm(mm, to); if (is_memory_migrate(cs)) - cpuset_migrate_mm(mm, &from, &to); + cpuset_migrate_mm(mm, from, to); mmput(mm); } + +alloc_fail: + NODEMASK_FREE(from); + NODEMASK_FREE(to); } /* The various types of files and directories in a cpuset file system */ @@ -1558,13 +1579,21 @@ static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs) static int cpuset_sprintf_memlist(char *page, struct cpuset *cs) { - nodemask_t mask; + NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL); + int retval; + + if (mask == NULL) + return -ENOMEM; mutex_lock(&callback_mutex); - mask = cs->mems_allowed; + *mask = cs->mems_allowed; mutex_unlock(&callback_mutex); - return nodelist_scnprintf(page, PAGE_SIZE, mask); + retval = nodelist_scnprintf(page, PAGE_SIZE, *mask); + + NODEMASK_FREE(mask); + + return retval; } static ssize_t cpuset_common_file_read(struct cgroup *cont, @@ -1993,7 +2022,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) struct cpuset *cp; /* scans cpusets being updated */ struct cpuset *child; /* scans child cpusets of cp */ struct cgroup *cont; - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return; list_add_tail((struct list_head *)&root->stack_list, &queue); @@ -2010,7 +2042,7 @@ static void scan_for_empty_cpusets(struct cpuset *root) nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) continue; - oldmems = cp->mems_allowed; + *oldmems = cp->mems_allowed; /* Remove offline cpus and mems from this cpuset. */ mutex_lock(&callback_mutex); @@ -2026,9 +2058,10 @@ static void scan_for_empty_cpusets(struct cpuset *root) remove_tasks_in_empty_cpuset(cp); else { update_tasks_cpumask(cp, NULL); - update_tasks_nodemask(cp, &oldmems, NULL); + update_tasks_nodemask(cp, oldmems, NULL); } } + NODEMASK_FREE(oldmems); } /* @@ -2086,16 +2119,19 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb, static int cpuset_track_online_nodes(struct notifier_block *self, unsigned long action, void *arg) { - nodemask_t oldmems; + NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL); + + if (oldmems == NULL) + return NOTIFY_DONE; cgroup_lock(); switch (action) { case MEM_ONLINE: - oldmems = top_cpuset.mems_allowed; + *oldmems = top_cpuset.mems_allowed; mutex_lock(&callback_mutex); top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; mutex_unlock(&callback_mutex); - update_tasks_nodemask(&top_cpuset, &oldmems, NULL); + update_tasks_nodemask(&top_cpuset, oldmems, NULL); break; case MEM_OFFLINE: /* @@ -2108,6 +2144,8 @@ static int cpuset_track_online_nodes(struct notifier_block *self, break; } cgroup_unlock(); + + NODEMASK_FREE(oldmems); return NOTIFY_OK; } #endif -- cgit v1.2.2 From 88be12c440cfa2fa3f5be83507360aac9ea1c54e Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 29 Mar 2010 12:01:50 +0100 Subject: slow-work: use get_ref wrapper instead of directly calling get_ref Otherwise we can get an oops if the user has no get_ref/put_ref requirement. Signed-off-by: Dave Airlie Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/slow-work.c b/kernel/slow-work.c index 7494bbf5a270..7d3f4fa9ef4f 100644 --- a/kernel/slow-work.c +++ b/kernel/slow-work.c @@ -637,7 +637,7 @@ int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, goto cancelled; /* the timer holds a reference whilst it is pending */ - ret = work->ops->get_ref(work); + ret = slow_work_get_ref(work); if (ret < 0) goto cant_get_ref; -- cgit v1.2.2 From a53f4f9efaeb1d87cfae066346979d4d70e1abe9 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 29 Mar 2010 13:08:52 +0100 Subject: SLOW_WORK: CONFIG_SLOW_WORK_PROC should be CONFIG_SLOW_WORK_DEBUG CONFIG_SLOW_WORK_PROC was changed to CONFIG_SLOW_WORK_DEBUG, but not in all instances. Change the remaining instances. This makes the debugfs file display the time mark and the owner's description again. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- kernel/slow-work.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/slow-work.h b/kernel/slow-work.h index 321f3c59d732..a29ebd1ef41d 100644 --- a/kernel/slow-work.h +++ b/kernel/slow-work.h @@ -43,28 +43,28 @@ extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *); */ static inline void slow_work_set_thread_pid(int id, pid_t pid) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_pids[id] = pid; #endif } static inline void slow_work_mark_time(struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG work->mark = CURRENT_TIME; #endif } static inline void slow_work_begin_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG slow_work_execs[id] = work; #endif } static inline void slow_work_end_exec(int id, struct slow_work *work) { -#ifdef CONFIG_SLOW_WORK_PROC +#ifdef CONFIG_SLOW_WORK_DEBUG write_lock(&slow_work_execs_lock); slow_work_execs[id] = NULL; write_unlock(&slow_work_execs_lock); -- cgit v1.2.2 From eed63519e3e74d515d2007ecd895338d0ba2a85c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Sun, 28 Mar 2010 19:42:56 -0700 Subject: x86: Do not free zero sized per cpu areas This avoids an infinite loop in free_early_partial(). Add a warning to free_early_partial() to catch future problems. -v5: put back start > end back into WARN_ONCE() -v6: use one line for warning, suggested by Linus -v7: more tests -v8: remove the function name as suggested by Johannes WARN_ONCE() will print out that function name. Signed-off-by: Ian Campbell Signed-off-by: Yinghai Lu Tested-by: Konrad Rzeszutek Wilk Tested-by: Joel Becker Tested-by: Stanislaw Gruszka Acked-by: Johannes Weiner Cc: Peter Zijlstra Cc: David Miller Cc: Benjamin Herrenschmidt Cc: Linus Torvalds LKML-Reference: <1269830604-26214-4-git-send-email-yinghai@kernel.org> Signed-off-by: Ingo Molnar --- kernel/early_res.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/early_res.c b/kernel/early_res.c index 3cb2c661bb78..31aa9332ef3f 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -333,6 +333,12 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + if (start == end) + return; + + if (WARN_ONCE(start > end, " wrong range [%#llx, %#llx]\n", start, end)) + return; + try_next: i = find_overlapped_early(start, end); if (i >= max_early_res) -- cgit v1.2.2 From 570b8fb505896e007fd3bb07573ba6640e51851d Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 30 Mar 2010 00:04:00 +0100 Subject: CRED: Fix memory leak in error handling Fix a memory leak on an OOM condition in prepare_usermodehelper_creds(). Signed-off-by: Mathieu Desnoyers Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 1ed8ca18790c..1b1129d0cce8 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -364,7 +364,7 @@ struct cred *prepare_usermodehelper_creds(void) new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); if (!new) - return NULL; + goto free_tgcred; kdebug("prepare_usermodehelper_creds() alloc %p", new); @@ -397,6 +397,10 @@ struct cred *prepare_usermodehelper_creds(void) error: put_cred(new); +free_tgcred: +#ifdef CONFIG_KEYS + kfree(tgcred); +#endif return NULL; } -- cgit v1.2.2