From 47f61f397cc08b5a9a815bd03cb10c48dab66034 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:21:38 -0600 Subject: [PATCH] Add missing device_suspsend(PMSG_FREEZE) calls. In the recent addition of device_suspend calls into sys_reboot two code paths were missed. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 9a24374c23bc..5fc10d3e3891 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -391,6 +391,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user case LINUX_REBOOT_CMD_RESTART: notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Restarting system.\n"); machine_restart(NULL); @@ -452,6 +453,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user } notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.2.2 From 4a00ea1e18228e5ef99d4780671fda97226bda30 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:24:14 -0600 Subject: [PATCH] Refactor sys_reboot into reusable parts Because the factors of sys_reboot don't exist people calling into the reboot path duplicate the code badly, leading to inconsistent expectations of code in the reboot path. This patch should is just code motion. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 106 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 64 insertions(+), 42 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 5fc10d3e3891..7e033809ef5f 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -361,6 +361,62 @@ out_unlock: return retval; } +void kernel_restart(char *cmd) +{ + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); + system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); + device_shutdown(); + if (!cmd) { + printk(KERN_EMERG "Restarting system.\n"); + } else { + printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); + } + printk(".\n"); + machine_restart(cmd); +} +EXPORT_SYMBOL_GPL(kernel_restart); + +void kernel_kexec(void) +{ +#ifdef CONFIG_KEXEC + struct kimage *image; + image = xchg(&kexec_image, 0); + if (!image) { + return; + } + notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); + system_state = SYSTEM_RESTART; + device_suspend(PMSG_FREEZE); + device_shutdown(); + printk(KERN_EMERG "Starting new kernel\n"); + machine_shutdown(); + machine_kexec(image); +#endif +} +EXPORT_SYMBOL_GPL(kernel_kexec); + +void kernel_halt(void) +{ + notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); + system_state = SYSTEM_HALT; + device_suspend(PMSG_SUSPEND); + device_shutdown(); + printk(KERN_EMERG "System halted.\n"); + machine_halt(); +} +EXPORT_SYMBOL_GPL(kernel_halt); + +void kernel_power_off(void) +{ + notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); + system_state = SYSTEM_POWER_OFF; + device_suspend(PMSG_SUSPEND); + device_shutdown(); + printk(KERN_EMERG "Power down.\n"); + machine_power_off(); +} +EXPORT_SYMBOL_GPL(kernel_power_off); /* * Reboot system call: for obvious reasons only root may call it, @@ -389,12 +445,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user lock_kernel(); switch (cmd) { case LINUX_REBOOT_CMD_RESTART: - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); - device_shutdown(); - printk(KERN_EMERG "Restarting system.\n"); - machine_restart(NULL); + kernel_restart(NULL); break; case LINUX_REBOOT_CMD_CAD_ON: @@ -406,23 +457,13 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user break; case LINUX_REBOOT_CMD_HALT: - notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); - system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); - device_shutdown(); - printk(KERN_EMERG "System halted.\n"); - machine_halt(); + kernel_halt(); unlock_kernel(); do_exit(0); break; case LINUX_REBOOT_CMD_POWER_OFF: - notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); - system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); - device_shutdown(); - printk(KERN_EMERG "Power down.\n"); - machine_power_off(); + kernel_power_off(); unlock_kernel(); do_exit(0); break; @@ -434,33 +475,14 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user } buffer[sizeof(buffer) - 1] = '\0'; - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, buffer); - system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); - device_shutdown(); - printk(KERN_EMERG "Restarting system with command '%s'.\n", buffer); - machine_restart(buffer); + kernel_restart(buffer); break; -#ifdef CONFIG_KEXEC case LINUX_REBOOT_CMD_KEXEC: - { - struct kimage *image; - image = xchg(&kexec_image, 0); - if (!image) { - unlock_kernel(); - return -EINVAL; - } - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); - device_shutdown(); - printk(KERN_EMERG "Starting new kernel\n"); - machine_shutdown(); - machine_kexec(image); - break; - } -#endif + kernel_kexec(); + unlock_kernel(); + return -EINVAL; + #ifdef CONFIG_SOFTWARE_SUSPEND case LINUX_REBOOT_CMD_SW_SUSPEND: { -- cgit v1.2.2 From abcd9e51f5b832439b119d530db1353c12fd4073 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:27:34 -0600 Subject: [PATCH] Make ctrl_alt_del call kernel_restart to get a proper reboot. It is obvious we wanted to call kernel_restart here but since we don't have it the code was expanded inline and hasn't been correct since sometime in 2.4. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 7e033809ef5f..31ac41a73329 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -502,8 +502,7 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user static void deferred_cad(void *dummy) { - notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); - machine_restart(NULL); + kernel_restart(NULL); } /* -- cgit v1.2.2 From 7c9034735eccbf82608a4602c59aaf6053ea9416 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:29:55 -0600 Subject: [PATCH] Add emergency_restart() When the kernel is working well and we want to restart cleanly kernel_restart is the function to use. But in many instances the kernel wants to reboot when thing are expected to be working very badly such as from panic or a software watchdog handler. This patch adds the function emergency_restart() so that callers can be clear what semantics they expect when calling restart. emergency_restart() is expected to be callable from interrupt context and possibly reliable in even more trying circumstances. This is an initial generic implementation for all architectures. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 31ac41a73329..a74039036fb4 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -361,6 +361,12 @@ out_unlock: return retval; } +void emergency_restart(void) +{ + machine_emergency_restart(); +} +EXPORT_SYMBOL_GPL(emergency_restart); + void kernel_restart(char *cmd) { notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); -- cgit v1.2.2 From ff31977782a05504f2586ec9e3e5ab4b09a4c893 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:47:32 -0600 Subject: [PATCH] Use kernel_power_off in sysrq-o We already do all of the gymnastics to run from process context to call the power off code so call into the power off code cleanly. This especially helps acpi as part of it's shutdown logic should run acpi_shutdown called from device_shutdown which was not being called from here. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/power/poweroff.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c index 715081b2d829..7a4144ba3afd 100644 --- a/kernel/power/poweroff.c +++ b/kernel/power/poweroff.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * When the user hits Sys-Rq o to power down the machine this is the @@ -17,8 +18,7 @@ static void do_poweroff(void *dummy) { - if (pm_power_off) - pm_power_off(); + kernel_power_off(); } static DECLARE_WORK(poweroff_work, do_poweroff, NULL); -- cgit v1.2.2 From 2f048ea81df94f72dee0d42b3d9b941c03b8c9c5 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 11:49:23 -0600 Subject: [PATCH] Call emergency_reboot from panic We know the system is in trouble so there is no question if this is an emergecy :) Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/panic.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 74ba5f3e46c7..aabc5f86fa3f 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -111,12 +111,11 @@ NORET_TYPE void panic(const char * fmt, ...) mdelay(1); i++; } - /* - * Should we run the reboot notifier. For the moment Im - * choosing not too. It might crash, be corrupt or do - * more harm than good for other reasons. + /* This will not be a clean reboot, with everything + * shutting down. But if there is a chance of + * rebooting the system it will be rebooted. */ - machine_restart(NULL); + emergency_restart(); } #ifdef __sparc__ { -- cgit v1.2.2 From fdde86ac50357b6a811e3574e47d189e81a21444 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 26 Jul 2005 12:01:17 -0600 Subject: [PATCH] swpsuspend: Have suspend to disk use factors of sys_reboot The suspend to disk code was a poor copy of the code in sys_reboot now that we have kernel_power_off, kernel_restart and kernel_halt use them instead of poorly duplicating them inline. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/power/disk.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 3ec789c6b537..664eb0469b6e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -59,16 +59,13 @@ static void power_down(suspend_disk_method_t mode) error = pm_ops->enter(PM_SUSPEND_DISK); break; case PM_DISK_SHUTDOWN: - printk("Powering off system\n"); - device_shutdown(); - machine_power_off(); + kernel_power_off(); break; case PM_DISK_REBOOT: - device_shutdown(); - machine_restart(NULL); + kernel_restart(NULL); break; } - machine_halt(); + kernel_halt(); /* Valid image is on the disk, if we continue we risk serious data corruption after resume. */ printk(KERN_CRIT "Please power me down manually\n"); -- cgit v1.2.2 From 18586e721636527cb5177467fb17e2350615978a Mon Sep 17 00:00:00 2001 From: Andreas Steinmetz Date: Sat, 23 Jul 2005 13:42:04 +0200 Subject: [PATCH] Fix RLIMIT_RTPRIO breakage RLIMIT_RTPRIO is supposed to grant non privileged users the right to use SCHED_FIFO/SCHED_RR scheduling policies with priorites bounded by the RLIMIT_RTPRIO value via sched_setscheduler(). This is usually used by audio users. Unfortunately this is broken in 2.6.13rc3 as you can see in the excerpt from sched_setscheduler below: /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { /* can't change policy */ if (policy != p->policy) return -EPERM; After the above unconditional test which causes sched_setscheduler to fail with no regard to the RLIMIT_RTPRIO value the following check is made: /* can't increase priority */ if (policy != SCHED_NORMAL && param->sched_priority > p->rt_priority && param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) return -EPERM; Thus I do believe that the RLIMIT_RTPRIO value must be taken into account for the policy check, especially as the RLIMIT_RTPRIO limit is of no use without this change. The attached patch fixes this problem. Signed-off-by: Andreas Steinmetz Acked-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 4107db0dc091..a5fb654ea590 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3528,7 +3528,8 @@ recheck: */ if (!capable(CAP_SYS_NICE)) { /* can't change policy */ - if (policy != p->policy) + if (policy != p->policy && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) return -EPERM; /* can't increase priority */ if (policy != SCHED_NORMAL && -- cgit v1.2.2 From d46523ea32a79fbc8cd1237f9441f45cc3f02456 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 25 Jul 2005 16:28:39 -0400 Subject: [PATCH] fix MAX_USER_RT_PRIO and MAX_RT_PRIO Here's the patch again to fix the code to handle if the values between MAX_USER_RT_PRIO and MAX_RT_PRIO are different. Without this patch, an SMP system will crash if the values are different. Signed-off-by: Steven Rostedt Cc: Ingo Molnar Signed-off-by: Dean Nelson Signed-off-by: Linus Torvalds --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index a5fb654ea590..a646e4f36c41 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3486,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->prio = MAX_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; } @@ -3518,7 +3518,8 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ if (param->sched_priority < 0 || - param->sched_priority > MAX_USER_RT_PRIO-1) + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; -- cgit v1.2.2 From bba0e4670a4e1841a96b561dcc60ebe335049891 Mon Sep 17 00:00:00 2001 From: Nigel Cunningham Date: Wed, 27 Jul 2005 11:43:41 -0700 Subject: [PATCH] Address BUG: using smp_processor_id() in preemptible [00000001] code This patch fixes a warning in the disable_nonboot_cpus call in kernel/power/smp.c. Signed-off by: Nigel Cunningham Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/smp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/smp.c b/kernel/power/smp.c index bbe23079c62c..911fc62b8225 100644 --- a/kernel/power/smp.c +++ b/kernel/power/smp.c @@ -38,7 +38,7 @@ void disable_nonboot_cpus(void) } printk("Error taking cpu %d down: %d\n", cpu, error); } - BUG_ON(smp_processor_id() != 0); + BUG_ON(raw_smp_processor_id() != 0); if (error) panic("cpus not sleeping"); } -- cgit v1.2.2 From d912d1ff218195c248c770eb677726695e07aa40 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Wed, 27 Jul 2005 11:43:44 -0700 Subject: [PATCH] itimer fixes Fix the recent off-by-one fix in the itimer code: 1. The repeating timer is figured using the requested time (not +1 as we know where we are in the jiffie). 2. The tests for interval too large are left to the time_val to jiffie code. Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/itimer.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/itimer.c b/kernel/itimer.c index a72cb0e5aa4b..7c1b25e25e47 100644 --- a/kernel/itimer.c +++ b/kernel/itimer.c @@ -112,28 +112,11 @@ asmlinkage long sys_getitimer(int which, struct itimerval __user *value) return error; } -/* - * Called with P->sighand->siglock held and P->signal->real_timer inactive. - * If interval is nonzero, arm the timer for interval ticks from now. - */ -static inline void it_real_arm(struct task_struct *p, unsigned long interval) -{ - p->signal->it_real_value = interval; /* XXX unnecessary field?? */ - if (interval == 0) - return; - if (interval > (unsigned long) LONG_MAX) - interval = LONG_MAX; - /* the "+ 1" below makes sure that the timer doesn't go off before - * the interval requested. This could happen if - * time requested % (usecs per jiffy) is more than the usecs left - * in the current jiffy */ - p->signal->real_timer.expires = jiffies + interval + 1; - add_timer(&p->signal->real_timer); -} void it_real_fn(unsigned long __data) { struct task_struct * p = (struct task_struct *) __data; + unsigned long inc = p->signal->it_real_incr; send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p); @@ -141,14 +124,23 @@ void it_real_fn(unsigned long __data) * Now restart the timer if necessary. We don't need any locking * here because do_setitimer makes sure we have finished running * before it touches anything. + * Note, we KNOW we are (or should be) at a jiffie edge here so + * we don't need the +1 stuff. Also, we want to use the prior + * expire value so as to not "slip" a jiffie if we are late. + * Deal with requesting a time prior to "now" here rather than + * in add_timer. */ - it_real_arm(p, p->signal->it_real_incr); + if (!inc) + return; + while (time_before_eq(p->signal->real_timer.expires, jiffies)) + p->signal->real_timer.expires += inc; + add_timer(&p->signal->real_timer); } int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue) { struct task_struct *tsk = current; - unsigned long val, interval; + unsigned long val, interval, expires; cputime_t cval, cinterval, nval, ninterval; switch (which) { @@ -164,7 +156,10 @@ again: } tsk->signal->it_real_incr = timeval_to_jiffies(&value->it_interval); - it_real_arm(tsk, timeval_to_jiffies(&value->it_value)); + expires = timeval_to_jiffies(&value->it_value); + if (expires) + mod_timer(&tsk->signal->real_timer, + jiffies + 1 + expires); spin_unlock_irq(&tsk->sighand->siglock); if (ovalue) { jiffies_to_timeval(val, &ovalue->it_value); -- cgit v1.2.2 From 951f22d5b1f0eaae35dafc669e3774a0c2084d10 Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 27 Jul 2005 11:44:57 -0700 Subject: [PATCH] s390: spin lock retry Split spin lock and r/w lock implementation into a single try which is done inline and an out of line function that repeatedly tries to get the lock before doing the cpu_relax(). Add a system control to set the number of retries before a cpu is yielded. The reason for the spin lock retry is that the diagnose 0x44 that is used to give up the virtual cpu is quite expensive. For spin locks that are held only for a short period of time the costs of the diagnoses outweights the savings for spin locks that are held for a longer timer. The default retry count is 1000. Signed-off-by: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e60b9c36f1f0..3e0bbee549ea 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -114,6 +114,7 @@ extern int unaligned_enabled; extern int sysctl_ieee_emulation_warnings; #endif extern int sysctl_userprocess_debug; +extern int spin_retry; #endif extern int sysctl_hz_timer; @@ -647,7 +648,16 @@ static ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, - +#if defined(CONFIG_ARCH_S390) + { + .ctl_name = KERN_SPIN_RETRY, + .procname = "spin_retry", + .data = &spin_retry, + .maxlen = sizeof (int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.2 From 207a7ba8dc000e1b13acac97f3736810dd86e8e2 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:10 -0700 Subject: [PATCH] kernel/capability.c: add kerneldoc Add kerneldoc to kernel/capability.c Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index 64db1ee820c2..8986a37a67ea 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -31,8 +31,14 @@ static DEFINE_SPINLOCK(task_capability_lock); * uninteresting and/or not to be changed. */ -/* +/** * sys_capget - get the capabilities of a given process. + * @header: pointer to struct that contains capability version and + * target pid data + * @dataptr: pointer to struct that contains the effective, permitted, + * and inheritable capabilities that are returned + * + * Returns 0 on success and < 0 on error. */ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr) { @@ -141,8 +147,14 @@ static inline int cap_set_all(kernel_cap_t *effective, return ret; } -/* - * sys_capset - set capabilities for a given process, all processes, or all +/** + * sys_capset - set capabilities for a process or a group of processes + * @header: pointer to struct that contains capability version and + * target pid data + * @data: pointer to struct that contains the effective, permitted, + * and inheritable capabilities + * + * Set capabilities for a given process, all processes, or all * processes in a given process group. * * The restrictions on setting capabilities are specified as: @@ -152,6 +164,8 @@ static inline int cap_set_all(kernel_cap_t *effective, * I: any raised capabilities must be a subset of the (old current) permitted * P: any raised capabilities must be a subset of the (old current) permitted * E: must be set to a subset of (new target) permitted + * + * Returns 0 on success and < 0 on error. */ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data) { -- cgit v1.2.2 From d9fd8a6d443b509147280f058d4e59f0b796a323 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:11 -0700 Subject: [PATCH] kernel/cpuset.c: add kerneldoc, fix typos Add kerneldoc to kernel/cpuset.c Fix cpuset typos in init/Kconfig Signed-off-by: Randy Dunlap Acked-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 984c0bf3807f..805fb9097318 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1440,10 +1440,10 @@ void __init cpuset_init_smp(void) /** * cpuset_fork - attach newly forked task to its parents cpuset. - * @p: pointer to task_struct of forking parent process. + * @tsk: pointer to task_struct of forking parent process. * * Description: By default, on fork, a task inherits its - * parents cpuset. The pointer to the shared cpuset is + * parent's cpuset. The pointer to the shared cpuset is * automatically copied in fork.c by dup_task_struct(). * This cpuset_fork() routine need only increment the usage * counter in that cpuset. @@ -1471,7 +1471,6 @@ void cpuset_fork(struct task_struct *tsk) * by the cpuset_sem semaphore. If you don't hold cpuset_sem, * then a zero cpuset use count is a license to any other task to * nuke the cpuset immediately. - * **/ void cpuset_exit(struct task_struct *tsk) @@ -1521,7 +1520,9 @@ void cpuset_init_current_mems_allowed(void) current->mems_allowed = NODE_MASK_ALL; } -/* +/** + * cpuset_update_current_mems_allowed - update mems parameters to new values + * * If the current tasks cpusets mems_allowed changed behind our backs, * update current->mems_allowed and mems_generation to the new value. * Do not call this routine if in_interrupt(). @@ -1540,13 +1541,20 @@ void cpuset_update_current_mems_allowed(void) } } +/** + * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed + * @nodes: pointer to a node bitmap that is and-ed with mems_allowed + */ void cpuset_restrict_to_mems_allowed(unsigned long *nodes) { bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed), MAX_NUMNODES); } -/* +/** + * cpuset_zonelist_valid_mems_allowed - check zonelist vs. curremt mems_allowed + * @zl: the zonelist to be checked + * * Are any of the nodes on zonelist zl allowed in current->mems_allowed? */ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) @@ -1562,8 +1570,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } -/* - * Is 'current' valid, and is zone z allowed in current->mems_allowed? +/** + * cpuset_zone_allowed - is zone z allowed in current->mems_allowed + * @z: zone in question + * + * Is zone z allowed in current->mems_allowed, or is + * the CPU in interrupt context? (zone is always allowed in this case) */ int cpuset_zone_allowed(struct zone *z) { -- cgit v1.2.2 From e77e17161ccb8bd877bf83b3611cd318e451c605 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 27 Jul 2005 11:45:11 -0700 Subject: [PATCH] kernel/crash_dump.c: add kerneldoc Add kerneldoc to kernel/crash_dump.c Signed-off-by: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/crash_dump.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c index 459ba49e376a..334c37f5218a 100644 --- a/kernel/crash_dump.c +++ b/kernel/crash_dump.c @@ -18,7 +18,16 @@ /* Stores the physical address of elf header of crash image. */ unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX; -/* +/** + * copy_oldmem_page - copy one page from "oldmem" + * @pfn: page frame number to be copied + * @buf: target memory address for the copy; this can be in kernel address + * space or user address space (see @userbuf) + * @csize: number of bytes to copy + * @offset: offset in bytes into the page (based on pfn) to begin the copy + * @userbuf: if set, @buf is in user address space, use copy_to_user(), + * otherwise @buf is in kernel address space, use memcpy(). + * * Copy a page from "oldmem". For this page, there is no pte mapped * in the current kernel. We stitch up a pte, similar to kmap_atomic. */ -- cgit v1.2.2 From 77933d7276ee8fa0e2947641941a6f7a100a327b Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Wed, 27 Jul 2005 11:46:09 -0700 Subject: [PATCH] clean up inline static vs static inline `gcc -W' likes to complain if the static keyword is not at the beginning of the declaration. This patch fixes all remaining occurrences of "inline static" up with "static inline" in the entire kernel tree (140 occurrences in 47 files). While making this change I came across a few lines with trailing whitespace that I also fixed up, I have also added or removed a blank line or two here and there, but there are no functional changes in the patch. Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time.c b/kernel/time.c index d4335c1c884c..dd5ae1162a8f 100644 --- a/kernel/time.c +++ b/kernel/time.c @@ -128,7 +128,7 @@ asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __us * as real UNIX machines always do it. This avoids all headaches about * daylight saving times and warping kernel clocks. */ -inline static void warp_clock(void) +static inline void warp_clock(void) { write_seqlock_irq(&xtime_lock); wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; -- cgit v1.2.2 From e4ff4d7f9d85a2bc714307eb9113617182e62845 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 27 Jul 2005 10:41:23 -0700 Subject: [PATCH] Avoid device suspend on reboot My fairly ordinary x86 test box gets stuck during reboot on the wait_for_completion() in ide_do_drive_cmd(): Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a74039036fb4..8f255259ef9e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -371,7 +371,6 @@ void kernel_restart(char *cmd) { notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); device_shutdown(); if (!cmd) { printk(KERN_EMERG "Restarting system.\n"); -- cgit v1.2.2 From ed6b676ca8b50e0b538e61c283d52fd04f007abf Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 28 Jul 2005 21:15:49 -0700 Subject: [PATCH] x86_64: Switch to the interrupt stack when running a softirq in local_bh_enable() This avoids some potential stack overflows with very deep softirq callchains. i386 does this too. TOADD CFI annotation Signed-off-by: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index b4ab6af1dea8..31007d6542cc 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -86,7 +86,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - local_irq_enable(); + //local_irq_enable(); h = softirq_vec; @@ -99,7 +99,7 @@ restart: pending >>= 1; } while (pending); - local_irq_disable(); + //local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v1.2.2 From 78fa74a23b16bdb0d944272b696915c4e0bb3ee1 Mon Sep 17 00:00:00 2001 From: George Anzinger Date: Thu, 28 Jul 2005 21:16:16 -0700 Subject: [PATCH] posix timers: fix normalization problem (We found this (after a customer complained) and it is in the kernel.org kernel. Seems that for CLOCK_MONOTONIC absolute timers and clock_nanosleep calls both the request time and wall_to_monotonic are subtracted prior to the normalize resulting in an overflow in the existing normalize test. This causes the result to be shifted ~4 seconds ahead instead of ~2 seconds back in time.) The normalize code in posix-timers.c fails when the tv_nsec member is ~1.2 seconds negative. This can happen on absolute timers (and clock_nanosleeps) requested on CLOCK_MONOTONIC (both the request time and wall_to_monotonic are subtracted resulting in the possibility of a number close to -2 seconds.) This fix uses the set_normalized_timespec() (which does not have an overflow problem) to fix the problem and as a side effect makes the code cleaner. Signed-off-by: George Anzinger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5b7b4736d82b..10b2ad749d14 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -896,21 +896,10 @@ static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, jiffies_64_f = get_jiffies_64(); } /* - * Take away now to get delta + * Take away now to get delta and normalize */ - oc.tv_sec -= now.tv_sec; - oc.tv_nsec -= now.tv_nsec; - /* - * Normalize... - */ - while ((oc.tv_nsec - NSEC_PER_SEC) >= 0) { - oc.tv_nsec -= NSEC_PER_SEC; - oc.tv_sec++; - } - while ((oc.tv_nsec) < 0) { - oc.tv_nsec += NSEC_PER_SEC; - oc.tv_sec--; - } + set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec, + oc.tv_nsec - now.tv_nsec); }else{ jiffies_64_f = get_jiffies_64(); } -- cgit v1.2.2 From 1108bae41e2ac596f46bc4cd8876b93063203d2b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 29 Jul 2005 12:50:57 -0600 Subject: [PATCH] reboot: remove device_suspend(PMSG_FREEZE) from kernel_kexec If device_suspend(PMSG_FREEZE) is not ready to be called in kernel_restart it is definitely not ready to be called in the even more fickle kernel_kexec. Signed-off-by: Eric W. Biederman Signed-off-by: Linus Torvalds --- kernel/sys.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 8f255259ef9e..000e81ad2c1d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -392,7 +392,6 @@ void kernel_kexec(void) } notifier_call_chain(&reboot_notifier_list, SYS_RESTART, NULL); system_state = SYSTEM_RESTART; - device_suspend(PMSG_FREEZE); device_shutdown(); printk(KERN_EMERG "Starting new kernel\n"); machine_shutdown(); -- cgit v1.2.2 From c70f5d6610c601ea2ae4ae4e49f66c80801e895f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 30 Jul 2005 10:22:49 -0700 Subject: [PATCH] revert bogus softirq changes This snuck in with an x86_64 change. Thanks to Richard Purdie for spotting it. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 31007d6542cc..b4ab6af1dea8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -86,7 +86,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - //local_irq_enable(); + local_irq_enable(); h = softirq_vec; @@ -99,7 +99,7 @@ restart: pending >>= 1; } while (pending); - //local_irq_disable(); + local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v1.2.2 From 6cb54819d7b1867053e2dfd8c0ca3a8dc65a7eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Aug 2005 13:39:13 +0200 Subject: [PATCH] remove sys_set_zone_reclaim() This removes sys_set_zone_reclaim() for now. While i'm sure Martin is trying to solve a real problem, we must not hard-code an incomplete and insufficient approach into a syscall, because syscalls are pretty much for eternity. I am quite strongly convinced that this syscall must not hit v2.6.13 in its current form. Firstly, the syscall lacks basic syscall design: e.g. it allows the global setting of VM policy for unprivileged users. (!) [ Imagine an Oracle installation and a SAP installation on the same NUMA box fighting over the 'optimal' setting for this flag. What will they do? Will they try to set the flag to their own preferred value every second or so? ] Secondly, it was added based on a single datapoint from Martin: http://marc.theaimsgroup.com/?l=linux-mm&m=111763597218177&w=2 where Martin characterizes the numbers the following way: ' Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time. ' in other words: the fundamental problem has likely not been solved, only a tendential move into the right direction has been observed, and a handful of numbers were picked out of a set of hugely variable results, without showing the variability data. How much variance is there run-to-run? I'd really suggest to first walk the walk and see what's needed to get stable & predictable kernel compilation numbers on that NUMA box, before adding random syscalls to tune a particular aspect of the VM ... which approach might not even matter once the whole picture has been analyzed and understood! The third, most important point is that the syscall exposes VM tuning internals in a completely unstructured way. What sense does it make to have a _GLOBAL_ per-node setting for 'should we go to another node for reclaim'? If then it might make sense to do this per-app, via numalib or so. The change is minimalistic in that it doesnt remove the syscall and the underlying infrastructure changes, only the user-visible changes. We could perhaps add a CAP_SYS_ADMIN-only sysctl for this hack, a'ka /proc/sys/vm/swappiness, but even that looks quite counterproductive when the generic approach is that we are trying to reduce the number of external factors in the VM balance picture. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 42b40ae5eada..1ab2370e2efa 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -79,7 +79,6 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); -cond_syscall(sys_set_zone_reclaim); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); -- cgit v1.2.2 From 842bbaaa7394820c8f1fe0629cd15478653caf86 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 1 Aug 2005 21:11:47 -0700 Subject: [PATCH] Module per-cpu alignment cannot always be met The module code assumes noone will ever ask for a per-cpu area more than SMP_CACHE_BYTES aligned. However, as these cases show, gcc asks sometimes asks for 32-byte alignment for the per-cpu section on a module, and if CONFIG_X86_L1_CACHE_SHIFT is 4, we hit that BUG_ON(). This is obviously an unusual combination, as there have been few reports, but better to warn than die. See: http://www.ussg.iu.edu/hypermail/linux/kernel/0409.0/0768.html And more recently: http://bugs.gentoo.org/show_bug.cgi?id=97006 Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 068e271ab3a5..c32995fbd8fd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -250,13 +250,18 @@ static inline unsigned int block_size(int val) /* Created by linker magic */ extern char __per_cpu_start[], __per_cpu_end[]; -static void *percpu_modalloc(unsigned long size, unsigned long align) +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { unsigned long extra; unsigned int i; void *ptr; - BUG_ON(align > SMP_CACHE_BYTES); + if (align > SMP_CACHE_BYTES) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", + name, align, SMP_CACHE_BYTES); + align = SMP_CACHE_BYTES; + } ptr = __per_cpu_start; for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -348,7 +353,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align) +static inline void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { return NULL; } @@ -1644,7 +1650,8 @@ static struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); + sechdrs[pcpuindex].sh_addralign, + mod->name); if (!percpu) { err = -ENOMEM; goto free_mod; -- cgit v1.2.2 From c36f19e02a96488f550fdb678c92500afca3109b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 4 Aug 2005 11:36:26 +0200 Subject: [PATCH] Remove suspend() calls from shutdown path This removes the calls to device_suspend() from the shutdown path that were added sometime during 2.6.13-rc*. They aren't working properly on a number of configs (I got reports from both ppc powerbook users and x86 users) causing the system to not shutdown anymore. I think it isn't the right approach at the moment anyway. We have already a shutdown() callback for the drivers that actually care about shutdown and the suspend() code isn't yet in a good enough shape to be so much generalized. Also, the semantics of suspend and shutdown are slightly different on a number of setups and the way this was patched in provides little way for drivers to cleanly differenciate. It should have been at least a different message. For 2.6.13, I think we should revert to 2.6.12 behaviour and have a working suspend back. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 000e81ad2c1d..0bcaed6560ac 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -404,7 +404,6 @@ void kernel_halt(void) { notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -415,7 +414,6 @@ void kernel_power_off(void) { notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); -- cgit v1.2.2 From c306895167c8384b88bc02945a0d226a04218fa5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Aug 2005 16:49:32 -0700 Subject: [PATCH] revert "timer exit cleanup" Revert this June 17 patch: it broke persistence of timers across execve(). Cc: Roland McGrath Cc: george anzinger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +++- kernel/posix-timers.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9d1b10ed0135..5b0fb9f09f21 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -829,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10b2ad749d14..38798a2ff994 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1166,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } - del_timer_sync(&sig->real_timer); } /* -- cgit v1.2.2 From 3077a260e9f316b611436b1506eec9cc5c4f8aa6 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 9 Aug 2005 10:07:59 -0700 Subject: [PATCH] cpuset release ABBA deadlock fix Fix possible cpuset_sem ABBA deadlock if 'notify_on_release' set. For a particular usage pattern, creating and destroying cpusets fairly frequently using notify_on_release, on a very large system, this deadlock can be seen every few days. If you are not using the cpuset notify_on_release feature, you will never see this deadlock. The existing code, on task exit (or cpuset deletion) did: get cpuset_sem if cpuset marked notify_on_release and is ready to release: compute cpuset path relative to /dev/cpuset mount point call_usermodehelper() forks /sbin/cpuset_release_agent with path drop cpuset_sem Unfortunately, the fork in call_usermodehelper can allocate memory, and allocating memory can require cpuset_sem, if the mems_generation values changed in the interim. This results in an ABBA deadlock, trying to obtain cpuset_sem when it is already held by the current task. To fix this, I put the cpuset path (which must be computed while holding cpuset_sem) in a temporary buffer, to be used in the call_usermodehelper call of /sbin/cpuset_release_agent only _after_ dropping cpuset_sem. So the new logic is: get cpuset_sem if cpuset marked notify_on_release and is ready to release: compute cpuset path relative to /dev/cpuset mount point stash path in kmalloc'd buffer drop cpuset_sem call_usermodehelper() forks /sbin/cpuset_release_agent with path free path The sharp eyed reader might notice that this patch does not contain any calls to kmalloc. The existing code in the check_for_release() routine was already kmalloc'ing a buffer to hold the cpuset path. In the old code, it just held the buffer for a few lines, over the cpuset_release_agent() call that in turn invoked call_usermodehelper(). In the new code, with the application of this patch, it returns that buffer via the new char **ppathbuf parameter, for later use and freeing in cpuset_release_agent(), which is called after cpuset_sem is dropped. Whereas the old code has just one call to cpuset_release_agent(), right in the check_for_release() routine, the new code has three calls to cpuset_release_agent(), from the various places that a cpuset can be released. This patch has been build and booted on SN2, and passed a stress test that previously hit the deadlock within a few seconds. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 68 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 805fb9097318..21a4e3b2cbda 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -398,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * - * Note final arg to call_usermodehelper() is 0 - that means - * don't wait. Since we are holding the global cpuset_sem here, - * and we are asking another thread (started from keventd) to rmdir a - * cpuset, we can't wait - or we'd deadlock with the removing thread - * on cpuset_sem. + * The final arg to call_usermodehelper() is 0, which means don't + * wait. The separate /sbin/cpuset_release_agent task is forked by + * call_usermodehelper(), then control in this thread returns here, + * without waiting for the release agent task. We don't bother to + * wait because the caller of this routine has no use for the exit + * status of the /sbin/cpuset_release_agent task, so no sense holding + * our caller up for that. + * + * The simple act of forking that task might require more memory, + * which might need cpuset_sem. So this routine must be called while + * cpuset_sem is not held, to avoid a possible deadlock. See also + * comments for check_for_release(), below. */ -static int cpuset_release_agent(char *cpuset_str) +static void cpuset_release_agent(const char *pathbuf) { char *argv[3], *envp[3]; int i; + if (!pathbuf) + return; + i = 0; argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = cpuset_str; + argv[i++] = (char *)pathbuf; argv[i] = NULL; i = 0; @@ -421,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - return call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, 0); + kfree(pathbuf); } /* * Either cs->count of using tasks transitioned to zero, or the * cs->children list of child cpusets just became empty. If this * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, send notice to user land. + * the list of children is empty, prepare cpuset path in a kmalloc'd + * buffer, to be returned via ppathbuf, so that the caller can invoke + * cpuset_release_agent() with it later on, once cpuset_sem is dropped. + * Call here with cpuset_sem held. + * + * This check_for_release() routine is responsible for kmalloc'ing + * pathbuf. The above cpuset_release_agent() is responsible for + * kfree'ing pathbuf. The caller of these routines is responsible + * for providing a pathbuf pointer, initialized to NULL, then + * calling check_for_release() with cpuset_sem held and the address + * of the pathbuf pointer, then dropping cpuset_sem, then calling + * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ -static void check_for_release(struct cpuset *cs) +static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && list_empty(&cs->children)) { @@ -441,10 +463,9 @@ static void check_for_release(struct cpuset *cs) if (!buf) return; if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - goto out; - cpuset_release_agent(buf); -out: - kfree(buf); + kfree(buf); + else + *ppathbuf = buf; } } @@ -727,14 +748,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } -static int attach_task(struct cpuset *cs, char *buf) +static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; - if (sscanf(buf, "%d", &pid) != 1) + if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -777,7 +798,7 @@ static int attach_task(struct cpuset *cs, char *buf) put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs); + check_for_release(oldcs, ppathbuf); return 0; } @@ -801,6 +822,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us struct cftype *cft = __d_cft(file->f_dentry); cpuset_filetype_t type = cft->private; char *buffer; + char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ @@ -841,7 +863,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_TASKLIST: - retval = attach_task(cs, buffer); + retval = attach_task(cs, buffer, &pathbuf); break; default: retval = -EINVAL; @@ -852,6 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = nbytes; out2: up(&cpuset_sem); + cpuset_release_agent(pathbuf); out1: kfree(buffer); return retval; @@ -1357,6 +1380,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *cs = dentry->d_fsdata; struct dentry *d; struct cpuset *parent; + char *pathbuf = NULL; /* the vfs holds both inode->i_sem already */ @@ -1376,7 +1400,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) - check_for_release(parent); + check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; @@ -1384,6 +1408,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) cpuset_d_remove_dir(d); dput(d); up(&cpuset_sem); + cpuset_release_agent(pathbuf); return 0; } @@ -1483,10 +1508,13 @@ void cpuset_exit(struct task_struct *tsk) task_unlock(tsk); if (notify_on_release(cs)) { + char *pathbuf = NULL; + down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) - check_for_release(cs); + check_for_release(cs, &pathbuf); up(&cpuset_sem); + cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); } -- cgit v1.2.2 From 606867443764edac5a2c542f2fa0a12ef7a7c7fd Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 10 Aug 2005 11:29:15 -0700 Subject: [PATCH] remove name length check in a workqueue We have a chek in there to make sure that the name won't overflow task_struct.comm[], but it's triggering for scsi with lots of HBAs, only scsi is using single-threaded workqueues which don't append the "/%d" anyway. All too hard. Just kill the BUG_ON. Cc: Ingo Molnar Signed-off-by: Andrew Morton [ kthread_create() uses vsnprintf() and limits the thing, so no actual overflow can actually happen regardless ] Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 259cf55da3c9..c7e36d4a70ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,8 +308,6 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - BUG_ON(strlen(name) > 10); - wq = kmalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; -- cgit v1.2.2