From c70f5d6610c601ea2ae4ae4e49f66c80801e895f Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 30 Jul 2005 10:22:49 -0700 Subject: [PATCH] revert bogus softirq changes This snuck in with an x86_64 change. Thanks to Richard Purdie for spotting it. Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/softirq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 31007d6542cc..b4ab6af1dea8 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -86,7 +86,7 @@ restart: /* Reset the pending bitmask before enabling irqs */ local_softirq_pending() = 0; - //local_irq_enable(); + local_irq_enable(); h = softirq_vec; @@ -99,7 +99,7 @@ restart: pending >>= 1; } while (pending); - //local_irq_disable(); + local_irq_disable(); pending = local_softirq_pending(); if (pending && --max_restart) -- cgit v1.2.2 From 6cb54819d7b1867053e2dfd8c0ca3a8dc65a7eff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 1 Aug 2005 13:39:13 +0200 Subject: [PATCH] remove sys_set_zone_reclaim() This removes sys_set_zone_reclaim() for now. While i'm sure Martin is trying to solve a real problem, we must not hard-code an incomplete and insufficient approach into a syscall, because syscalls are pretty much for eternity. I am quite strongly convinced that this syscall must not hit v2.6.13 in its current form. Firstly, the syscall lacks basic syscall design: e.g. it allows the global setting of VM policy for unprivileged users. (!) [ Imagine an Oracle installation and a SAP installation on the same NUMA box fighting over the 'optimal' setting for this flag. What will they do? Will they try to set the flag to their own preferred value every second or so? ] Secondly, it was added based on a single datapoint from Martin: http://marc.theaimsgroup.com/?l=linux-mm&m=111763597218177&w=2 where Martin characterizes the numbers the following way: ' Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time. ' in other words: the fundamental problem has likely not been solved, only a tendential move into the right direction has been observed, and a handful of numbers were picked out of a set of hugely variable results, without showing the variability data. How much variance is there run-to-run? I'd really suggest to first walk the walk and see what's needed to get stable & predictable kernel compilation numbers on that NUMA box, before adding random syscalls to tune a particular aspect of the VM ... which approach might not even matter once the whole picture has been analyzed and understood! The third, most important point is that the syscall exposes VM tuning internals in a completely unstructured way. What sense does it make to have a _GLOBAL_ per-node setting for 'should we go to another node for reclaim'? If then it might make sense to do this per-app, via numalib or so. The change is minimalistic in that it doesnt remove the syscall and the underlying infrastructure changes, only the user-visible changes. We could perhaps add a CAP_SYS_ADMIN-only sysctl for this hack, a'ka /proc/sys/vm/swappiness, but even that looks quite counterproductive when the generic approach is that we are trying to reduce the number of external factors in the VM balance picture. Signed-off-by: Ingo Molnar Signed-off-by: Linus Torvalds --- kernel/sys_ni.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 42b40ae5eada..1ab2370e2efa 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -79,7 +79,6 @@ cond_syscall(sys_request_key); cond_syscall(sys_keyctl); cond_syscall(compat_sys_keyctl); cond_syscall(compat_sys_socketcall); -cond_syscall(sys_set_zone_reclaim); cond_syscall(sys_inotify_init); cond_syscall(sys_inotify_add_watch); cond_syscall(sys_inotify_rm_watch); -- cgit v1.2.2 From 842bbaaa7394820c8f1fe0629cd15478653caf86 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 1 Aug 2005 21:11:47 -0700 Subject: [PATCH] Module per-cpu alignment cannot always be met The module code assumes noone will ever ask for a per-cpu area more than SMP_CACHE_BYTES aligned. However, as these cases show, gcc asks sometimes asks for 32-byte alignment for the per-cpu section on a module, and if CONFIG_X86_L1_CACHE_SHIFT is 4, we hit that BUG_ON(). This is obviously an unusual combination, as there have been few reports, but better to warn than die. See: http://www.ussg.iu.edu/hypermail/linux/kernel/0409.0/0768.html And more recently: http://bugs.gentoo.org/show_bug.cgi?id=97006 Signed-off-by: Rusty Russell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/module.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 068e271ab3a5..c32995fbd8fd 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -250,13 +250,18 @@ static inline unsigned int block_size(int val) /* Created by linker magic */ extern char __per_cpu_start[], __per_cpu_end[]; -static void *percpu_modalloc(unsigned long size, unsigned long align) +static void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { unsigned long extra; unsigned int i; void *ptr; - BUG_ON(align > SMP_CACHE_BYTES); + if (align > SMP_CACHE_BYTES) { + printk(KERN_WARNING "%s: per-cpu alignment %li > %i\n", + name, align, SMP_CACHE_BYTES); + align = SMP_CACHE_BYTES; + } ptr = __per_cpu_start; for (i = 0; i < pcpu_num_used; ptr += block_size(pcpu_size[i]), i++) { @@ -348,7 +353,8 @@ static int percpu_modinit(void) } __initcall(percpu_modinit); #else /* ... !CONFIG_SMP */ -static inline void *percpu_modalloc(unsigned long size, unsigned long align) +static inline void *percpu_modalloc(unsigned long size, unsigned long align, + const char *name) { return NULL; } @@ -1644,7 +1650,8 @@ static struct module *load_module(void __user *umod, if (pcpuindex) { /* We have a special allocation for this section. */ percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size, - sechdrs[pcpuindex].sh_addralign); + sechdrs[pcpuindex].sh_addralign, + mod->name); if (!percpu) { err = -ENOMEM; goto free_mod; -- cgit v1.2.2 From c36f19e02a96488f550fdb678c92500afca3109b Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Thu, 4 Aug 2005 11:36:26 +0200 Subject: [PATCH] Remove suspend() calls from shutdown path This removes the calls to device_suspend() from the shutdown path that were added sometime during 2.6.13-rc*. They aren't working properly on a number of configs (I got reports from both ppc powerbook users and x86 users) causing the system to not shutdown anymore. I think it isn't the right approach at the moment anyway. We have already a shutdown() callback for the drivers that actually care about shutdown and the suspend() code isn't yet in a good enough shape to be so much generalized. Also, the semantics of suspend and shutdown are slightly different on a number of setups and the way this was patched in provides little way for drivers to cleanly differenciate. It should have been at least a different message. For 2.6.13, I think we should revert to 2.6.12 behaviour and have a working suspend back. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 000e81ad2c1d..0bcaed6560ac 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -404,7 +404,6 @@ void kernel_halt(void) { notifier_call_chain(&reboot_notifier_list, SYS_HALT, NULL); system_state = SYSTEM_HALT; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "System halted.\n"); machine_halt(); @@ -415,7 +414,6 @@ void kernel_power_off(void) { notifier_call_chain(&reboot_notifier_list, SYS_POWER_OFF, NULL); system_state = SYSTEM_POWER_OFF; - device_suspend(PMSG_SUSPEND); device_shutdown(); printk(KERN_EMERG "Power down.\n"); machine_power_off(); -- cgit v1.2.2 From c306895167c8384b88bc02945a0d226a04218fa5 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 4 Aug 2005 16:49:32 -0700 Subject: [PATCH] revert "timer exit cleanup" Revert this June 17 patch: it broke persistence of timers across execve(). Cc: Roland McGrath Cc: george anzinger Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +++- kernel/posix-timers.c | 1 - 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 9d1b10ed0135..5b0fb9f09f21 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -829,8 +829,10 @@ fastcall NORET_TYPE void do_exit(long code) acct_update_integrals(tsk); update_mem_hiwater(tsk); group_dead = atomic_dec_and_test(&tsk->signal->live); - if (group_dead) + if (group_dead) { + del_timer_sync(&tsk->signal->real_timer); acct_process(code); + } exit_mm(tsk); exit_sem(tsk); diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 10b2ad749d14..38798a2ff994 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -1166,7 +1166,6 @@ void exit_itimers(struct signal_struct *sig) tmr = list_entry(sig->posix_timers.next, struct k_itimer, list); itimer_delete(tmr); } - del_timer_sync(&sig->real_timer); } /* -- cgit v1.2.2 From 3077a260e9f316b611436b1506eec9cc5c4f8aa6 Mon Sep 17 00:00:00 2001 From: Paul Jackson Date: Tue, 9 Aug 2005 10:07:59 -0700 Subject: [PATCH] cpuset release ABBA deadlock fix Fix possible cpuset_sem ABBA deadlock if 'notify_on_release' set. For a particular usage pattern, creating and destroying cpusets fairly frequently using notify_on_release, on a very large system, this deadlock can be seen every few days. If you are not using the cpuset notify_on_release feature, you will never see this deadlock. The existing code, on task exit (or cpuset deletion) did: get cpuset_sem if cpuset marked notify_on_release and is ready to release: compute cpuset path relative to /dev/cpuset mount point call_usermodehelper() forks /sbin/cpuset_release_agent with path drop cpuset_sem Unfortunately, the fork in call_usermodehelper can allocate memory, and allocating memory can require cpuset_sem, if the mems_generation values changed in the interim. This results in an ABBA deadlock, trying to obtain cpuset_sem when it is already held by the current task. To fix this, I put the cpuset path (which must be computed while holding cpuset_sem) in a temporary buffer, to be used in the call_usermodehelper call of /sbin/cpuset_release_agent only _after_ dropping cpuset_sem. So the new logic is: get cpuset_sem if cpuset marked notify_on_release and is ready to release: compute cpuset path relative to /dev/cpuset mount point stash path in kmalloc'd buffer drop cpuset_sem call_usermodehelper() forks /sbin/cpuset_release_agent with path free path The sharp eyed reader might notice that this patch does not contain any calls to kmalloc. The existing code in the check_for_release() routine was already kmalloc'ing a buffer to hold the cpuset path. In the old code, it just held the buffer for a few lines, over the cpuset_release_agent() call that in turn invoked call_usermodehelper(). In the new code, with the application of this patch, it returns that buffer via the new char **ppathbuf parameter, for later use and freeing in cpuset_release_agent(), which is called after cpuset_sem is dropped. Whereas the old code has just one call to cpuset_release_agent(), right in the check_for_release() routine, the new code has three calls to cpuset_release_agent(), from the various places that a cpuset can be released. This patch has been build and booted on SN2, and passed a stress test that previously hit the deadlock within a few seconds. Signed-off-by: Paul Jackson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 68 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 805fb9097318..21a4e3b2cbda 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -398,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * - * Note final arg to call_usermodehelper() is 0 - that means - * don't wait. Since we are holding the global cpuset_sem here, - * and we are asking another thread (started from keventd) to rmdir a - * cpuset, we can't wait - or we'd deadlock with the removing thread - * on cpuset_sem. + * The final arg to call_usermodehelper() is 0, which means don't + * wait. The separate /sbin/cpuset_release_agent task is forked by + * call_usermodehelper(), then control in this thread returns here, + * without waiting for the release agent task. We don't bother to + * wait because the caller of this routine has no use for the exit + * status of the /sbin/cpuset_release_agent task, so no sense holding + * our caller up for that. + * + * The simple act of forking that task might require more memory, + * which might need cpuset_sem. So this routine must be called while + * cpuset_sem is not held, to avoid a possible deadlock. See also + * comments for check_for_release(), below. */ -static int cpuset_release_agent(char *cpuset_str) +static void cpuset_release_agent(const char *pathbuf) { char *argv[3], *envp[3]; int i; + if (!pathbuf) + return; + i = 0; argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = cpuset_str; + argv[i++] = (char *)pathbuf; argv[i] = NULL; i = 0; @@ -421,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - return call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, 0); + kfree(pathbuf); } /* * Either cs->count of using tasks transitioned to zero, or the * cs->children list of child cpusets just became empty. If this * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, send notice to user land. + * the list of children is empty, prepare cpuset path in a kmalloc'd + * buffer, to be returned via ppathbuf, so that the caller can invoke + * cpuset_release_agent() with it later on, once cpuset_sem is dropped. + * Call here with cpuset_sem held. + * + * This check_for_release() routine is responsible for kmalloc'ing + * pathbuf. The above cpuset_release_agent() is responsible for + * kfree'ing pathbuf. The caller of these routines is responsible + * for providing a pathbuf pointer, initialized to NULL, then + * calling check_for_release() with cpuset_sem held and the address + * of the pathbuf pointer, then dropping cpuset_sem, then calling + * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ -static void check_for_release(struct cpuset *cs) +static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && list_empty(&cs->children)) { @@ -441,10 +463,9 @@ static void check_for_release(struct cpuset *cs) if (!buf) return; if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - goto out; - cpuset_release_agent(buf); -out: - kfree(buf); + kfree(buf); + else + *ppathbuf = buf; } } @@ -727,14 +748,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } -static int attach_task(struct cpuset *cs, char *buf) +static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; - if (sscanf(buf, "%d", &pid) != 1) + if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -777,7 +798,7 @@ static int attach_task(struct cpuset *cs, char *buf) put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs); + check_for_release(oldcs, ppathbuf); return 0; } @@ -801,6 +822,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us struct cftype *cft = __d_cft(file->f_dentry); cpuset_filetype_t type = cft->private; char *buffer; + char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ @@ -841,7 +863,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_TASKLIST: - retval = attach_task(cs, buffer); + retval = attach_task(cs, buffer, &pathbuf); break; default: retval = -EINVAL; @@ -852,6 +874,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = nbytes; out2: up(&cpuset_sem); + cpuset_release_agent(pathbuf); out1: kfree(buffer); return retval; @@ -1357,6 +1380,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *cs = dentry->d_fsdata; struct dentry *d; struct cpuset *parent; + char *pathbuf = NULL; /* the vfs holds both inode->i_sem already */ @@ -1376,7 +1400,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) - check_for_release(parent); + check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; @@ -1384,6 +1408,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) cpuset_d_remove_dir(d); dput(d); up(&cpuset_sem); + cpuset_release_agent(pathbuf); return 0; } @@ -1483,10 +1508,13 @@ void cpuset_exit(struct task_struct *tsk) task_unlock(tsk); if (notify_on_release(cs)) { + char *pathbuf = NULL; + down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) - check_for_release(cs); + check_for_release(cs, &pathbuf); up(&cpuset_sem); + cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); } -- cgit v1.2.2 From 606867443764edac5a2c542f2fa0a12ef7a7c7fd Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 10 Aug 2005 11:29:15 -0700 Subject: [PATCH] remove name length check in a workqueue We have a chek in there to make sure that the name won't overflow task_struct.comm[], but it's triggering for scsi with lots of HBAs, only scsi is using single-threaded workqueues which don't append the "/%d" anyway. All too hard. Just kill the BUG_ON. Cc: Ingo Molnar Signed-off-by: Andrew Morton [ kthread_create() uses vsnprintf() and limits the thing, so no actual overflow can actually happen regardless ] Signed-off-by: Linus Torvalds --- kernel/workqueue.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 259cf55da3c9..c7e36d4a70ca 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -308,8 +308,6 @@ struct workqueue_struct *__create_workqueue(const char *name, struct workqueue_struct *wq; struct task_struct *p; - BUG_ON(strlen(name) > 10); - wq = kmalloc(sizeof(*wq), GFP_KERNEL); if (!wq) return NULL; -- cgit v1.2.2