From 9ac7849e35f705830f7b016ff272b0ff1f7ff759 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 20 Jan 2007 16:00:26 +0900 Subject: devres: device resource management Implement device resource management, in short, devres. A device driver can allocate arbitrary size of devres data which is associated with a release function. On driver detach, release function is invoked on the devres data, then, devres data is freed. devreses are typed by associated release functions. Some devreses are better represented by single instance of the type while others need multiple instances sharing the same release function. Both usages are supported. devreses can be grouped using devres group such that a device driver can easily release acquired resources halfway through initialization or selectively release resources (e.g. resources for port 1 out of 4 ports). This patch adds devres core including documentation and the following managed interfaces. * alloc/free : devm_kzalloc(), devm_kzfree() * IO region : devm_request_region(), devm_release_region() * IRQ : devm_request_irq(), devm_free_irq() * DMA : dmam_alloc_coherent(), dmam_free_coherent(), dmam_declare_coherent_memory(), dmam_pool_create(), dmam_pool_destroy() * PCI : pcim_enable_device(), pcim_pin_device(), pci_is_managed() * iomap : devm_ioport_map(), devm_ioport_unmap(), devm_ioremap(), devm_ioremap_nocache(), devm_iounmap(), pcim_iomap_table(), pcim_iomap(), pcim_iounmap() Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- kernel/irq/manage.c | 86 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/resource.c | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b961adc3bd2..c4b7ed1cebf7 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -482,3 +482,89 @@ int request_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_irq); + +/* + * Device resource
management aware IRQ request/free implementation. + */ +struct irq_devres { + unsigned int irq; + void *dev_id; +}; + +static void devm_irq_release(struct device *dev, void *res) +{ + struct irq_devres *this = res; + + free_irq(this->irq, this->dev_id); +} + +static int devm_irq_match(struct device *dev, void *res, void *data) +{ + struct irq_devres *this = res, *match = data; + + return this->irq == match->irq && this->dev_id == match->dev_id; +} + +/** + * devm_request_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_irq(). IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, devm_free_irq() must be used. + */ +int devm_request_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + kfree(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_irq); + +/** + * devm_free_irq - free an interrupt + * @dev: device to free interrupt for + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as free_irq().
+ * This function instead of free_irq() should be used to manually + * free IRQs allocated with devm_request_irq(). + */ +void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) +{ + struct irq_devres match_data = { irq, dev_id }; + + free_irq(irq, dev_id); + WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + &match_data)); +} +EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/resource.c b/kernel/resource.c index 7b9a497419d9..2a3f88636580 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -17,6 +17,7 @@ #include #include #include +#include #include @@ -617,6 +618,67 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +/* + * Managed region resource + */ +struct region_devres { + struct resource *parent; + resource_size_t start; + resource_size_t n; +}; + +static void devm_region_release(struct device *dev, void *res) +{ + struct region_devres *this = res; + + __release_region(this->parent, this->start, this->n); +} + +static int devm_region_match(struct device *dev, void *res, void *match_data) +{ + struct region_devres *this = res, *match = match_data; + + return this->parent == match->parent && + this->start == match->start && this->n == match->n; +} + +struct resource * __devm_request_region(struct device *dev, + struct resource *parent, resource_size_t start, + resource_size_t n, const char *name) +{ + struct region_devres *dr = NULL; + struct resource *res; + + dr = devres_alloc(devm_region_release, sizeof(struct region_devres), + GFP_KERNEL); + if (!dr) + return NULL; + + dr->parent = parent; + dr->start = start; + dr->n = n; + + res = __request_region(parent, start, n, name); + if (res) + devres_add(dev, dr); + else + devres_free(dr); + + return res; +} +EXPORT_SYMBOL(__devm_request_region); + +void __devm_release_region(struct device *dev, struct resource *parent, + resource_size_t start, resource_size_t n) +{ + struct region_devres match_data = { parent, start, n };
+ + __release_region(parent, start, n); + WARN_ON(devres_destroy(dev, devm_region_release, devm_region_match, + &match_data)); +} +EXPORT_SYMBOL(__devm_release_region); + /* * Called from init/main.c to reserve IO ports. */ -- cgit v1.2.2 From d23ad42324cc4378132e51f2fc5c9ba6cbe75182 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 10 Feb 2007 01:43:02 -0800 Subject: [PATCH] Use ZVC for free_pages This again simplifies some of the VM counter calculations through the use of the ZVC consolidated counters. [michal.k.k.piotrowski@gmail.com: build fix] Signed-off-by: Christoph Lameter Signed-off-by: Michal Piotrowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/snapshot.c | 4 ++-- kernel/power/swsusp.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index c024606221c4..fc53ad068128 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -591,7 +591,7 @@ static unsigned int count_free_highmem_pages(void) for_each_zone(zone) if (populated_zone(zone) && is_highmem(zone)) - cnt += zone->free_pages; + cnt += zone_page_state(zone, NR_FREE_PAGES); return cnt; } @@ -869,7 +869,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) for_each_zone(zone) { meta += snapshot_additional_pages(zone); if (!is_highmem(zone)) - free += zone->free_pages; + free += zone_page_state(zone, NR_FREE_PAGES); } nr_pages += count_pages_for_highmem(nr_highmem); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 31aa0390c777..7fb834397a0d 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -230,9 +230,10 @@ int swsusp_shrink_memory(void) for_each_zone (zone) if (populated_zone(zone)) { if (is_highmem(zone)) { - highmem_size -= zone->free_pages; + highmem_size -= + zone_page_state(zone, NR_FREE_PAGES); } else { - tmp -= zone->free_pages; + tmp -= zone_page_state(zone, NR_FREE_PAGES); tmp +=
zone->lowmem_reserve[ZONE_NORMAL]; tmp += snapshot_additional_pages(zone); } -- cgit v1.2.2 From 96177299416dbccb73b54e6b344260154a445375 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Sat, 10 Feb 2007 01:43:03 -0800 Subject: [PATCH] Drop free_pages() nr_free_pages is now a simple access to a global variable. Make it a macro instead of a function. The nr_free_pages now requires vmstat.h to be included. There is one occurrence in power management where we need to add the include. Directly refer to global_page_state() there to clarify why the #include was added. [akpm@osdl.org: arm build fix] [akpm@osdl.org: sparc64 build fix] Signed-off-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index ff3a6182f5f0..47ca5a2b653b 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "power.h" @@ -72,7 +73,8 @@ static int suspend_prepare(suspend_state_t state) goto Thaw; } - if ((free_pages = nr_free_pages()) < FREE_PAGE_NUMBER) { + if ((free_pages = global_page_state(NR_FREE_PAGES)) + < FREE_PAGE_NUMBER) { pr_debug("PM: free some memory\n"); shrink_all_memory(FREE_PAGE_NUMBER - free_pages); if (nr_free_pages() < FREE_PAGE_NUMBER) { -- cgit v1.2.2 From 6ff1b4426e3afc61dcb67299709fde9041d59265 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sat, 10 Feb 2007 01:43:19 -0800 Subject: [PATCH] make reading /proc/sys/kernel/cap-bound not require CAP_SYS_MODULE Reading /proc/sys/kernel/cap-bound requires CAP_SYS_MODULE. (see proc_dointvec_bset in kernel/sysctl.c) sysctl appears to drive all over proc reading everything it can get its hands on and is complaining when it is being denied access to read cap-bound.
Clearly writing to cap-bound should be a sensitive operation but requiring CAP_SYS_MODULE to read cap-bound seems a bit too strong. I believe the information could with reasonable certainty be obtained by looking at a bunch of the output of /proc/pid/status which has very low security protection, so at best we are just getting a little obfuscation of information. Currently SELinux policy has to 'dontaudit' capability checks for CAP_SYS_MODULE for things like sysctl which just want to read cap-bound. In doing so we also as a byproduct have to hide warnings of potential exploits such as if at some time that sysctl actually tried to load a module. I wondered if anyone would have a problem opening cap-bound up to read from anyone? Acked-by: Chris Wright Cc: Stephen Smalley Cc: James Morris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b33358ded..41bbba1a15da 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1961,7 +1961,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, { int op; - if (!capable(CAP_SYS_MODULE)) { + if (write && !capable(CAP_SYS_MODULE)) { return -EPERM; } -- cgit v1.2.2 From e3c7db621bed4afb8e231cb005057f2feb5db557 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 10 Feb 2007 01:43:31 -0800 Subject: [PATCH] PM: Change code ordering in main.c As indicated in a recent thread on Linux-PM, it's necessary to call pm_ops->finish() before device_resume(), but enable_nonboot_cpus() has to be called before pm_ops->finish() (cf. http://lists.osdl.org/pipermail/linux-pm/2006-November/004164.html). For consistency, it seems reasonable to call disable_nonboot_cpus() after device_suspend().
This way the suspend code will remain symmetrical with respect to the resume code and it may allow us to speed up things in the future by suspending and resuming devices and/or saving the suspend image in many threads. The following series of patches reorders the suspend and resume code so that nonboot CPUs are disabled after devices have been suspended and enabled before the devices are resumed. It also causes pm_ops->finish() to be called after enable_nonboot_cpus() wherever necessary. This patch: Change the ordering of code in kernel/power/main.c so that device_suspend() is called before disable_nonboot_cpus() and pm_ops->finish() is called after enable_nonboot_cpus() and before device_resume(), as indicated by recent discussion on Linux-PM (cf. http://lists.osdl.org/pipermail/linux-pm/2006-November/004164.html). Signed-off-by: Rafael J. Wysocki Acked-by: Pavel Machek Cc: Greg KH Cc: Nigel Cunningham Cc: Patrick Mochel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/main.c | 34 +++++++++++++++++++--------------- 1 file changed, 19 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/power/main.c b/kernel/power/main.c index 47ca5a2b653b..e1c413120469 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -44,6 +44,11 @@ void pm_set_ops(struct pm_ops * ops) mutex_unlock(&pm_mutex); } +static inline void pm_finish(suspend_state_t state) +{ + if (pm_ops->finish) + pm_ops->finish(state); +} /** * suspend_prepare - Do prep work before entering low-power state. 
@@ -64,10 +69,6 @@ static int suspend_prepare(suspend_state_t state) pm_prepare_console(); - error = disable_nonboot_cpus(); - if (error) - goto Enable_cpu; - if (freeze_processes()) { error = -EAGAIN; goto Thaw; @@ -90,18 +91,22 @@ static int suspend_prepare(suspend_state_t state) } suspend_console(); - if ((error = device_suspend(PMSG_SUSPEND))) { + error = device_suspend(PMSG_SUSPEND); + if (error) { printk(KERN_ERR "Some devices failed to suspend\n"); - goto Finish; + goto Resume_devices; } - return 0; - Finish: - if (pm_ops->finish) - pm_ops->finish(state); + error = disable_nonboot_cpus(); + if (!error) + return 0; + + enable_nonboot_cpus(); + Resume_devices: + pm_finish(state); + device_resume(); + resume_console(); Thaw: thaw_processes(); - Enable_cpu: - enable_nonboot_cpus(); pm_restore_console(); return error; } @@ -136,12 +141,11 @@ int suspend_enter(suspend_state_t state) static void suspend_finish(suspend_state_t state) { + enable_nonboot_cpus(); + pm_finish(state); device_resume(); resume_console(); thaw_processes(); - enable_nonboot_cpus(); - if (pm_ops && pm_ops->finish) - pm_ops->finish(state); pm_restore_console(); } -- cgit v1.2.2 From ed746e3b18f4df18afa3763155972c5835f284c5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 10 Feb 2007 01:43:32 -0800 Subject: [PATCH] swsusp: Change code ordering in disk.c Change the ordering of code in kernel/power/disk.c so that device_suspend() is called before disable_nonboot_cpus() and platform_finish() is called after enable_nonboot_cpus() and before device_resume(), as indicated by the recent discussion on Linux-PM (cf. http://lists.osdl.org/pipermail/linux-pm/2006-November/004164.html). The changes here only affect the built-in swsusp. [alexey.y.starikovskiy@linux.intel.com: fix LED blinking during image load] Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Greg KH Cc: Nigel Cunningham Cc: Patrick Mochel Cc: Alexey Starikovskiy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpu.c | 2 + kernel/power/disk.c | 115 ++++++++++++++++++++++++++-------------------------- 2 files changed, 60 insertions(+), 57 deletions(-) (limited to 'kernel') diff --git a/kernel/cpu.c b/kernel/cpu.c index 7406fe6966f9..3d4206ada5c9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -309,6 +309,8 @@ void enable_nonboot_cpus(void) mutex_lock(&cpu_add_remove_lock); cpu_hotplug_disabled = 0; mutex_unlock(&cpu_add_remove_lock); + if (cpus_empty(frozen_cpus)) + return; printk("Enabling non-boot CPUs ...\n"); for_each_cpu_mask(cpu, frozen_cpus) { diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 88fc5d7ac737..406b20adb27a 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -87,52 +87,24 @@ static inline void platform_finish(void) } } +static void unprepare_processes(void) +{ + thaw_processes(); + pm_restore_console(); +} + static int prepare_processes(void) { int error = 0; pm_prepare_console(); - - error = disable_nonboot_cpus(); - if (error) - goto enable_cpus; - if (freeze_processes()) { error = -EBUSY; - goto thaw; + unprepare_processes(); } - - if (pm_disk_mode == PM_DISK_TESTPROC) { - printk("swsusp debug: Waiting for 5 seconds.\n"); - mdelay(5000); - goto thaw; - } - - error = platform_prepare(); - if (error) - goto thaw; - - /* Free memory before shutting down devices. */ - if (!(error = swsusp_shrink_memory())) - return 0; - - platform_finish(); - thaw: - thaw_processes(); - enable_cpus: - enable_nonboot_cpus(); - pm_restore_console(); return error; } -static void unprepare_processes(void) -{ - platform_finish(); - thaw_processes(); - enable_nonboot_cpus(); - pm_restore_console(); -} - /** * pm_suspend_disk - The granpappy of hibernation power management. 
* @@ -150,29 +122,45 @@ int pm_suspend_disk(void) if (error) return error; - if (pm_disk_mode == PM_DISK_TESTPROC) - return 0; + if (pm_disk_mode == PM_DISK_TESTPROC) { + printk("swsusp debug: Waiting for 5 seconds.\n"); + mdelay(5000); + goto Thaw; + } + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Thaw; + + error = platform_prepare(); + if (error) + goto Thaw; suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) { - resume_console(); - printk("Some devices failed to suspend\n"); - goto Thaw; + printk(KERN_ERR "PM: Some devices failed to suspend\n"); + goto Resume_devices; } + error = disable_nonboot_cpus(); + if (error) + goto Enable_cpus; if (pm_disk_mode == PM_DISK_TEST) { printk("swsusp debug: Waiting for 5 seconds.\n"); mdelay(5000); - goto Done; + goto Enable_cpus; } pr_debug("PM: snapshotting memory.\n"); in_suspend = 1; - if ((error = swsusp_suspend())) - goto Done; + error = swsusp_suspend(); + if (error) + goto Enable_cpus; if (in_suspend) { + enable_nonboot_cpus(); + platform_finish(); device_resume(); resume_console(); pr_debug("PM: writing image.\n"); @@ -188,7 +176,10 @@ int pm_suspend_disk(void) } swsusp_free(); - Done: + Enable_cpus: + enable_nonboot_cpus(); + Resume_devices: + platform_finish(); device_resume(); resume_console(); Thaw: @@ -237,19 +228,28 @@ static int software_resume(void) pr_debug("PM: Checking swsusp image.\n"); - if ((error = swsusp_check())) + error = swsusp_check(); + if (error) goto Done; pr_debug("PM: Preparing processes for restore.\n"); - if ((error = prepare_processes())) { + error = prepare_processes(); + if (error) { swsusp_close(); goto Done; } + error = platform_prepare(); + if (error) { + swsusp_free(); + goto Thaw; + } + pr_debug("PM: Reading swsusp image.\n"); - if ((error = swsusp_read())) { + error = swsusp_read(); + if (error) { swsusp_free(); goto Thaw; } @@ -257,21 +257,22 @@ static int software_resume(void) pr_debug("PM: Preparing devices 
for restore.\n"); suspend_console(); - if ((error = device_suspend(PMSG_PRETHAW))) { - resume_console(); - printk("Some devices failed to suspend\n"); - swsusp_free(); - goto Thaw; - } + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Free; - mb(); + error = disable_nonboot_cpus(); + if (!error) + swsusp_resume(); - pr_debug("PM: Restoring saved image.\n"); - swsusp_resume(); - pr_debug("PM: Restore failed, recovering.n"); + enable_nonboot_cpus(); + Free: + swsusp_free(); + platform_finish(); device_resume(); resume_console(); Thaw: + printk(KERN_ERR "PM: Restore failed, recovering.\n"); unprepare_processes(); Done: /* For success case, the suspend path will release the lock */ -- cgit v1.2.2 From 259130526c267550bc365d3015917d90667732f1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 10 Feb 2007 01:43:33 -0800 Subject: [PATCH] swsusp: Change code ordering in user.c Change the ordering of code in kernel/power/user.c so that device_suspend() is called before disable_nonboot_cpus() and device_resume() is called after enable_nonboot_cpus(). This is needed to make the userland suspend call pm_ops->finish() after enable_nonboot_cpus() and before device_resume(), as indicated by the recent discussion on Linux-PM (cf. http://lists.osdl.org/pipermail/linux-pm/2006-November/004164.html). The changes here only affect the userland interface of swsusp. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Greg KH Cc: Nigel Cunningham Cc: Patrick Mochel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 92 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 58 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index f7b7a785a5c6..4f217683455f 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -122,6 +122,59 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } +static inline int snapshot_suspend(void) +{ + int error; + + mutex_lock(&pm_mutex); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); + if (error) + goto Finish; + + suspend_console(); + error = device_suspend(PMSG_FREEZE); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) { + in_suspend = 1; + error = swsusp_suspend(); + } + enable_nonboot_cpus(); + Resume_devices: + device_resume(); + resume_console(); + Finish: + mutex_unlock(&pm_mutex); + return error; +} + +static inline int snapshot_restore(void) +{ + int error; + + mutex_lock(&pm_mutex); + pm_prepare_console(); + suspend_console(); + error = device_suspend(PMSG_PRETHAW); + if (error) + goto Resume_devices; + + error = disable_nonboot_cpus(); + if (!error) + error = swsusp_resume(); + + enable_nonboot_cpus(); + Resume_devices: + device_resume(); + resume_console(); + pm_restore_console(); + mutex_unlock(&pm_mutex); + return error; +} + static int snapshot_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { @@ -145,14 +198,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, if (data->frozen) break; mutex_lock(&pm_mutex); - error = disable_nonboot_cpus(); - if (!error) { - error = freeze_processes(); - if (error) { - thaw_processes(); - enable_nonboot_cpus(); - error = -EBUSY; - } + if (freeze_processes()) { + thaw_processes(); + error = -EBUSY; } 
mutex_unlock(&pm_mutex); if (!error) @@ -164,7 +212,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; mutex_lock(&pm_mutex); thaw_processes(); - enable_nonboot_cpus(); mutex_unlock(&pm_mutex); data->frozen = 0; break; @@ -174,20 +221,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - mutex_lock(&pm_mutex); - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); - if (!error) { - suspend_console(); - error = device_suspend(PMSG_FREEZE); - if (!error) { - in_suspend = 1; - error = swsusp_suspend(); - device_resume(); - } - resume_console(); - } - mutex_unlock(&pm_mutex); + error = snapshot_suspend(); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -201,17 +235,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - mutex_lock(&pm_mutex); - pm_prepare_console(); - suspend_console(); - error = device_suspend(PMSG_PRETHAW); - if (!error) { - error = swsusp_resume(); - device_resume(); - } - resume_console(); - pm_restore_console(); - mutex_unlock(&pm_mutex); + error = snapshot_restore(); break; case SNAPSHOT_FREE: -- cgit v1.2.2 From d12c610e08022a1b84d6bd4412c189214d32e713 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 10 Feb 2007 01:43:34 -0800 Subject: [PATCH] swsusp-change-code-ordering-in-userc-sanity The compiler will do that. And if it doesn't, we don't want to either ;) Cc: Rafael J. 
Wysocki Cc: Pavel Machek Cc: Greg KH Cc: Nigel Cunningham Cc: Patrick Mochel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 4f217683455f..b70d83d6b16e 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -122,7 +122,7 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static inline int snapshot_suspend(void) +static int snapshot_suspend(void) { int error; @@ -151,7 +151,7 @@ static inline int snapshot_suspend(void) return error; } -static inline int snapshot_restore(void) +static int snapshot_restore(void) { int error; -- cgit v1.2.2 From 2b5b09b3b576d7323d8b4244429a83f16dc5446a Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sat, 10 Feb 2007 01:43:35 -0800 Subject: [PATCH] swsusp: Change pm_ops handling by userland interface Make the userland interface of swsusp call pm_ops->finish() after enable_nonboot_cpus() and before resume_device(), as indicated by the recent discussion on Linux-PM (cf. http://lists.osdl.org/pipermail/linux-pm/2006-November/004164.html). This patch changes the SNAPSHOT_PMOPS ioctl so that its first function, PMOPS_PREPARE, only sets a switch turning the platform suspend mode on, and its last function, PMOPS_FINISH, only checks if the platform mode is enabled. This should allow the older userland tools to work with new kernels without any modifications. The changes here only affect the userland interface of swsusp. Signed-off-by: Rafael J. 
Wysocki Acked-by: Pavel Machek Cc: Greg KH Cc: Nigel Cunningham Cc: Patrick Mochel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/power/user.c | 71 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 59 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index b70d83d6b16e..dd09efe7df54 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -37,6 +37,7 @@ static struct snapshot_data { int mode; char frozen; char ready; + char platform_suspend; } snapshot_state; static atomic_t device_available = ATOMIC_INIT(1); @@ -66,6 +67,7 @@ static int snapshot_open(struct inode *inode, struct file *filp) data->bitmap = NULL; data->frozen = 0; data->ready = 0; + data->platform_suspend = 0; return 0; } @@ -122,7 +124,23 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } -static int snapshot_suspend(void) +static inline int platform_prepare(void) +{ + int error = 0; + + if (pm_ops && pm_ops->prepare) + error = pm_ops->prepare(PM_SUSPEND_DISK); + + return error; +} + +static inline void platform_finish(void) +{ + if (pm_ops && pm_ops->finish) + pm_ops->finish(PM_SUSPEND_DISK); +} + +static inline int snapshot_suspend(int platform_suspend) { int error; @@ -132,6 +150,11 @@ static int snapshot_suspend(void) if (error) goto Finish; + if (platform_suspend) { + error = platform_prepare(); + if (error) + goto Finish; + } suspend_console(); error = device_suspend(PMSG_FREEZE); if (error) @@ -144,6 +167,9 @@ static int snapshot_suspend(void) } enable_nonboot_cpus(); Resume_devices: + if (platform_suspend) + platform_finish(); + device_resume(); resume_console(); Finish: @@ -151,12 +177,17 @@ static int snapshot_suspend(void) return error; } -static int snapshot_restore(void) +static inline int snapshot_restore(int platform_suspend) { int error; mutex_lock(&pm_mutex); pm_prepare_console(); + if (platform_suspend) { + error = platform_prepare(); + if 
(error) + goto Finish; + } suspend_console(); error = device_suspend(PMSG_PRETHAW); if (error) @@ -168,8 +199,12 @@ static int snapshot_restore(void) enable_nonboot_cpus(); Resume_devices: + if (platform_suspend) + platform_finish(); + device_resume(); resume_console(); + Finish: pm_restore_console(); mutex_unlock(&pm_mutex); return error; @@ -221,7 +256,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_suspend(); + error = snapshot_suspend(data->platform_suspend); if (!error) error = put_user(in_suspend, (unsigned int __user *)arg); if (!error) @@ -235,7 +270,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, error = -EPERM; break; } - error = snapshot_restore(); + error = snapshot_restore(data->platform_suspend); break; case SNAPSHOT_FREE: @@ -306,6 +341,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_S2RAM: + if (!pm_ops) { + error = -ENOSYS; + break; + } + if (!data->frozen) { error = -EPERM; break; @@ -343,28 +383,35 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, break; case SNAPSHOT_PMOPS: + error = -EINVAL; + switch (arg) { case PMOPS_PREPARE: - if (pm_ops->prepare) { - error = pm_ops->prepare(PM_SUSPEND_DISK); + if (pm_ops && pm_ops->enter) { + data->platform_suspend = 1; + error = 0; + } else { + error = -ENOSYS; } break; case PMOPS_ENTER: - kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); - error = pm_ops->enter(PM_SUSPEND_DISK); + if (data->platform_suspend) { + kernel_shutdown_prepare(SYSTEM_SUSPEND_DISK); + error = pm_ops->enter(PM_SUSPEND_DISK); + error = 0; + } break; case PMOPS_FINISH: - if (pm_ops && pm_ops->finish) { - pm_ops->finish(PM_SUSPEND_DISK); - } + if (data->platform_suspend) + error = 0; + break; default: printk(KERN_ERR "SNAPSHOT_PMOPS: invalid argument %ld\n", arg); - error = -EINVAL; } break; -- cgit v1.2.2 From dc29a3657b52ac687970d81d7194cf4238702124 Mon Sep 17 00:00:00 2001 From: 
Mathieu Desnoyers Date: Sat, 10 Feb 2007 01:43:43 -0800 Subject: [PATCH] kernel/time/clocksource.c needs struct task_struct on m68k kernel/time/clocksource.c needs struct task_struct on m68k. Because it uses spin_unlock_irq(), which, on m68k, uses hardirq_count(), which uses preempt_count(), which needs to dereference struct task_struct, we have to include sched.h. Because it would cause a loop inclusion, we cannot include sched.h in any other of asm-m68k/system.h, linux/thread_info.h, linux/hardirq.h, which leaves this ugly include in a C file as the only simple solution. Signed-off-by: Mathieu Desnoyers Cc: Ingo Molnar Cc: Roman Zippel Cc: Thomas Gleixner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/time/clocksource.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 22504afc0d34..d9ef176c4e09 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -28,6 +28,7 @@ #include #include #include +#include /* for spin_unlock_irq() using preempt_count() m68k */ /* XXX - Would like a better way for initializing curr_clocksource */ extern struct clocksource clocksource_jiffies; -- cgit v1.2.2 From 3ee75ac3c0f4904633322b7d9b111566fbc4a7d3 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Feb 2007 01:44:39 -0800 Subject: [PATCH] sysctl_{,ms_}jiffies: fix oldlen semantics currently it's 1) if *oldlenp == 0, don't writeback anything 2) if *oldlenp >= table->maxlen, don't writeback more than table->maxlen bytes and rewrite *oldlenp don't look at underlying type granularity 3) if 0 < *oldlenp < table->maxlen, *cough* string sysctls don't writeback more than *oldlenp bytes. OK, that's because sizeof(char) == 1 int sysctls writeback anything in (0, table->maxlen] range Though accept integers divisible by sizeof(int) for writing. sysctl_jiffies and sysctl_ms_jiffies don't writeback anything but sizeof(int), which violates 1) and 2). 
So, make sysctl_jiffies and sysctl_ms_jiffies accept a) *oldlenp == 0, not doing writeback b) *oldlenp >= sizeof(int), writing one integer. -EINVAL still returned for *oldlenp == 1, 2, 3. Signed-off-by: Alexey Dobriyan Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 41bbba1a15da..16ef870fa75a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2553,17 +2553,23 @@ int sysctl_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - if (oldval) { + if (oldval && oldlenp) { size_t olen; - if (oldlenp) { - if (get_user(olen, oldlenp)) + + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen) { + int val; + + if (olen < sizeof(int)) + return -EINVAL; + + val = *(int *)(table->data) / HZ; + if (put_user(val, (int __user *)oldval)) + return -EFAULT; + if (put_user(sizeof(int), oldlenp)) return -EFAULT; - if (olen!=sizeof(int)) - return -EINVAL; } - if (put_user(*(int *)(table->data)/HZ, (int __user *)oldval) || - (oldlenp && put_user(sizeof(int),oldlenp))) - return -EFAULT; } if (newval && newlen) { int new; @@ -2581,17 +2587,23 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { - if (oldval) { + if (oldval && oldlenp) { size_t olen; - if (oldlenp) { - if (get_user(olen, oldlenp)) + + if (get_user(olen, oldlenp)) + return -EFAULT; + if (olen) { + int val; + + if (olen < sizeof(int)) + return -EINVAL; + + val = jiffies_to_msecs(*(int *)(table->data)); + if (put_user(val, (int __user *)oldval)) + return -EFAULT; + if (put_user(sizeof(int), oldlenp)) return -EFAULT; - if (olen!=sizeof(int)) - return -EINVAL; } - if (put_user(jiffies_to_msecs(*(int 
*)(table->data)), (int __user *)oldval) || - (oldlenp && put_user(sizeof(int),oldlenp))) - return -EFAULT; } if (newval && newlen) { int new; -- cgit v1.2.2 From 0c12b51712ced2c0d89a8ec3d546ed810f86d33e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 10 Feb 2007 01:44:56 -0800 Subject: [PATCH] kill_pid_info: kill acquired_tasklist_lock Kill acquired_tasklist_lock, sig_needs_tasklist() is very cheap nowadays. Signed-off-by: Oleg Nesterov Cc: "Eric W. Biederman" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 5630255d2e2a..ea4632bd40a0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1119,19 +1119,18 @@ kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; - int acquired_tasklist_lock = 0; struct task_struct *p; rcu_read_lock(); - if (unlikely(sig_needs_tasklist(sig))) { + if (unlikely(sig_needs_tasklist(sig))) read_lock(&tasklist_lock); - acquired_tasklist_lock = 1; - } + p = pid_task(pid, PIDTYPE_PID); error = -ESRCH; if (p) error = group_send_sig_info(sig, info, p); - if (unlikely(acquired_tasklist_lock)) + + if (unlikely(sig_needs_tasklist(sig))) read_unlock(&tasklist_lock); rcu_read_unlock(); return error; -- cgit v1.2.2 From 381a229209aa6f7f72375797b7bcfcfe2ae6fcbb Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sat, 10 Feb 2007 01:44:58 -0800 Subject: [PATCH] lockdep: more unlock-on-error fixes - returns after DEBUG_LOCKS_WARN_ON added in 3 places - debug_locks checking after lookup_chain_cache() added in __lock_acquire() - locking for testing and changing global variable max_lockdep_depth added in __lock_acquire() From: Ingo Molnar My __acquire_lock() cleanup introduced a locking bug: on SMP systems we'd release a non-owned graph lock. 
Fix this by moving the graph unlock back, and by leaving the max_lockdep_depth variable update possibly racy. (we dont care, it's just statistics) Also add some minimal debugging code to graph_unlock()/graph_lock(), which caught this locking bug. Signed-off-by: Jarek Poplawski Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 509efd49540f..2d616f4d853c 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -70,6 +70,9 @@ static int graph_lock(void) static inline int graph_unlock(void) { + if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) + return DEBUG_LOCKS_WARN_ON(1); + __raw_spin_unlock(&lockdep_lock); return 0; } @@ -712,6 +715,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth) struct lock_list *entry; int ret; + if (!__raw_spin_is_locked(&lockdep_lock)) + return DEBUG_LOCKS_WARN_ON(1); + if (depth > max_recursion_depth) max_recursion_depth = depth; if (depth >= RECURSION_LIMIT) @@ -1293,7 +1299,8 @@ out_unlock_set: if (!subclass || force) lock->class_cache = class; - DEBUG_LOCKS_WARN_ON(class->subclass != subclass); + if (DEBUG_LOCKS_WARN_ON(class->subclass != subclass)) + return NULL; return class; } @@ -1308,7 +1315,8 @@ static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - DEBUG_LOCKS_WARN_ON(!irqs_disabled()); + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; /* * We can walk it lock-free, because entries only get added * to the hash: @@ -1394,7 +1402,9 @@ static void check_chain_key(struct task_struct *curr) return; } id = hlock->class - lock_classes; - DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS); + if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + return; + if (prev_hlock && (prev_hlock->irq_context != 
hlock->irq_context)) chain_key = 0; @@ -2205,7 +2215,11 @@ out_calc_hash: if (!check_prevs_add(curr, hlock)) return 0; graph_unlock(); - } + } else + /* after lookup_chain_cache(): */ + if (unlikely(!debug_locks)) + return 0; + curr->lockdep_depth++; check_chain_key(curr); if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { @@ -2214,6 +2228,7 @@ out_calc_hash: printk("turning off the locking correctness validator.\n"); return 0; } + if (unlikely(curr->lockdep_depth > max_lockdep_depth)) max_lockdep_depth = curr->lockdep_depth; -- cgit v1.2.2 From 068135e63518314d4efd711142f674ad0841599e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Sat, 10 Feb 2007 01:44:59 -0800 Subject: [PATCH] lockdep: add graph depth information to /proc/lockdep Generate locking graph information into /proc/lockdep, for lock hierarchy documentation and visualization purposes. sample output: c089fd5c OPS: 138 FD: 14 BD: 1 --..: &tty->termios_mutex -> [c07a3430] tty_ldisc_lock -> [c07a37f0] &port_lock_key -> [c07afdc0] &rq->rq_lock_key#2 The lock classes listed are all the first-hop lock dependencies that lockdep has seen so far. 
Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep.c | 19 ++++++++++++------- kernel/lockdep_proc.c | 41 +++++++++++++++++++++++++++++------------ 2 files changed, 41 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 2d616f4d853c..592c576d77a7 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -490,7 +490,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth) * Add a new dependency to the head of the list: */ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip) + struct list_head *head, unsigned long ip, int distance) { struct lock_list *entry; /* @@ -502,6 +502,7 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this, return 0; entry->class = this; + entry->distance = distance; if (!save_trace(&entry->trace)) return 0; @@ -906,7 +907,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next) + struct held_lock *next, int distance) { struct lock_list *entry; int ret; @@ -984,8 +985,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * L2 added to its dependency list, due to the first chain.) 
*/ list_for_each_entry(entry, &prev->class->locks_after, entry) { - if (entry->class == next->class) + if (entry->class == next->class) { + if (distance == 1) + entry->distance = 1; return 2; + } } /* @@ -993,12 +997,13 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * to the previous lock's dependency list: */ ret = add_lock_to_list(prev->class, next->class, - &prev->class->locks_after, next->acquire_ip); + &prev->class->locks_after, next->acquire_ip, distance); + if (!ret) return 0; ret = add_lock_to_list(next->class, prev->class, - &next->class->locks_before, next->acquire_ip); + &next->class->locks_before, next->acquire_ip, distance); if (!ret) return 0; @@ -1046,13 +1051,14 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) goto out_bug; for (;;) { + int distance = curr->lockdep_depth - depth + 1; hlock = curr->held_locks + depth-1; /* * Only non-recursive-read entries get new dependencies * added: */ if (hlock->read != 2) { - if (!check_prev_add(curr, hlock, next)) + if (!check_prev_add(curr, hlock, next, distance)) return 0; /* * Stop after the first non-trylock entry, @@ -2779,4 +2785,3 @@ void debug_show_held_locks(struct task_struct *task) } EXPORT_SYMBOL_GPL(debug_show_held_locks); - diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index b554b40a4aa6..57a547a2da3f 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -77,12 +77,29 @@ static unsigned long count_backward_deps(struct lock_class *class) return ret; } +static void print_name(struct seq_file *m, struct lock_class *class) +{ + char str[128]; + const char *name = class->name; + + if (!name) { + name = __get_key_name(class->key, str); + seq_printf(m, "%s", name); + } else{ + seq_printf(m, "%s", name); + if (class->name_version > 1) + seq_printf(m, "#%d", class->name_version); + if (class->subclass) + seq_printf(m, "/%d", class->subclass); + } +} + static int l_show(struct seq_file *m, void *v) { unsigned long nr_forward_deps, 
nr_backward_deps; struct lock_class *class = m->private; - char str[128], c1, c2, c3, c4; - const char *name; + struct lock_list *entry; + char c1, c2, c3, c4; seq_printf(m, "%p", class->key); #ifdef CONFIG_DEBUG_LOCKDEP @@ -97,16 +114,16 @@ static int l_show(struct seq_file *m, void *v) get_usage_chars(class, &c1, &c2, &c3, &c4); seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); - name = class->name; - if (!name) { - name = __get_key_name(class->key, str); - seq_printf(m, ": %s", name); - } else{ - seq_printf(m, ": %s", name); - if (class->name_version > 1) - seq_printf(m, "#%d", class->name_version); - if (class->subclass) - seq_printf(m, "/%d", class->subclass); + seq_printf(m, ": "); + print_name(m, class); + seq_puts(m, "\n"); + + list_for_each_entry(entry, &class->locks_after, entry) { + if (entry->distance == 1) { + seq_printf(m, " -> [%p] ", entry->class); + print_name(m, entry->class); + seq_puts(m, "\n"); + } } seq_puts(m, "\n"); -- cgit v1.2.2 From c376222960ae91d5ffb9197ee36771aaed1d9f90 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 10 Feb 2007 01:45:03 -0800 Subject: [PATCH] Transform kmem_cache_alloc()+memset(0) -> kmem_cache_zalloc(). Replace appropriate pairs of "kmem_cache_alloc()" + "memset(0)" with the corresponding "kmem_cache_zalloc()" call. Signed-off-by: Robert P. J. Day Cc: "Luck, Tony" Cc: Andi Kleen Cc: Roland McGrath Cc: James Bottomley Cc: Greg KH Acked-by: Joel Becker Cc: Steven Whitehouse Cc: Jan Kara Cc: Michael Halcrow Cc: "David S. 
Miller" Cc: Stephen Smalley Cc: James Morris Cc: Chris Wright Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/posix-timers.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 5fe87de10ff0..a1bf61617839 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -399,10 +399,9 @@ EXPORT_SYMBOL_GPL(register_posix_clock); static struct k_itimer * alloc_posix_timer(void) { struct k_itimer *tmr; - tmr = kmem_cache_alloc(posix_timers_cache, GFP_KERNEL); + tmr = kmem_cache_zalloc(posix_timers_cache, GFP_KERNEL); if (!tmr) return tmr; - memset(tmr, 0, sizeof (struct k_itimer)); if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { kmem_cache_free(posix_timers_cache, tmr); tmr = NULL; -- cgit v1.2.2 From 23c887522e912ca494950796a95df8dd210f4b01 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 10 Feb 2007 01:45:05 -0800 Subject: [PATCH] Relay: add CPU hotplug support Mathieu originally needed to add this for tracing Xen, but it's something that's needed for any application that can be tracing while cpus are added. unplug isn't supported by this patch. The thought was that at minimum a new buffer needs to be added when a cpu comes up, but it wasn't worth the effort to remove buffers on cpu down since they'd be freed soon anyway when the channel was closed. [zanussi@us.ibm.com: avoid lock_cpu_hotplug deadlock] Signed-off-by: Mathieu Desnoyers Cc: Tom Zanussi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/relay.c | 180 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 127 insertions(+), 53 deletions(-) (limited to 'kernel') diff --git a/kernel/relay.c b/kernel/relay.c index 284e2e8b4eed..ef923f6de2e7 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -7,6 +7,8 @@ * Copyright (C) 1999-2005 - Karim Yaghmour (karim@opersys.com) * * Moved to kernel/relay.c by Paul Mundt, 2006. 
+ * November 2006 - CPU hotplug support by Mathieu Desnoyers + * (mathieu.desnoyers@polymtl.ca) * * This file is released under the GPL. */ @@ -18,6 +20,11 @@ #include #include #include +#include + +/* list of open channels, for cpu hotplug */ +static DEFINE_MUTEX(relay_channels_mutex); +static LIST_HEAD(relay_channels); /* * close() vm_op implementation for relay file mapping. @@ -187,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf) __free_page(buf->page_array[i]); kfree(buf->page_array); } + chan->buf[buf->cpu] = NULL; kfree(buf->padding); kfree(buf); kref_put(&chan->kref, relay_destroy_channel); @@ -362,51 +370,69 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) void relay_reset(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - __relay_reset(chan->buf[i], 0); - prev = chan->buf[i]; + if (chan->is_global && chan->buf[0]) { + __relay_reset(chan->buf[0], 0); + return; } + + mutex_lock(&relay_channels_mutex); + for_each_online_cpu(i) + if (chan->buf[i]) + __relay_reset(chan->buf[i], 0); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_reset); /* * relay_open_buf - create a new relay channel buffer * - * Internal - used by relay_open(). + * used by relay_open() and CPU hotplug. 
*/ -static struct rchan_buf *relay_open_buf(struct rchan *chan, - const char *filename, - struct dentry *parent, - int *is_global) +static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) { - struct rchan_buf *buf; + struct rchan_buf *buf = NULL; struct dentry *dentry; + char *tmpname; - if (*is_global) + if (chan->is_global) return chan->buf[0]; + tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL); + if (!tmpname) + goto end; + snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu); + buf = relay_create_buf(chan); if (!buf) - return NULL; + goto free_name; + + buf->cpu = cpu; + __relay_reset(buf, 1); /* Create file in fs */ - dentry = chan->cb->create_buf_file(filename, parent, S_IRUSR, - buf, is_global); - if (!dentry) { - relay_destroy_buf(buf); - return NULL; - } + dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, + buf, &chan->is_global); + if (!dentry) + goto free_buf; buf->dentry = dentry; - __relay_reset(buf, 1); + if(chan->is_global) { + chan->buf[0] = buf; + buf->cpu = 0; + } + + goto free_name; + +free_buf: + relay_destroy_buf(buf); +free_name: + kfree(tmpname); +end: return buf; } @@ -447,6 +473,47 @@ static void setup_callbacks(struct rchan *chan, chan->cb = cb; } +/** + * + * relay_hotcpu_callback - CPU hotplug callback + * @nb: notifier block + * @action: hotplug action to take + * @hcpu: CPU number + * + * Returns the success/failure of the operation. 
(NOTIFY_OK, NOTIFY_BAD) + */ +static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + unsigned int hotcpu = (unsigned long)hcpu; + struct rchan *chan; + + switch(action) { + case CPU_UP_PREPARE: + mutex_lock(&relay_channels_mutex); + list_for_each_entry(chan, &relay_channels, list) { + if (chan->buf[hotcpu]) + continue; + chan->buf[hotcpu] = relay_open_buf(chan, hotcpu); + if(!chan->buf[hotcpu]) { + printk(KERN_ERR + "relay_hotcpu_callback: cpu %d buffer " + "creation failed\n", hotcpu); + mutex_unlock(&relay_channels_mutex); + return NOTIFY_BAD; + } + } + mutex_unlock(&relay_channels_mutex); + break; + case CPU_DEAD: + /* No need to flush the cpu : will be flushed upon + * final relay_flush() call. */ + break; + } + return NOTIFY_OK; +} + /** * relay_open - create a new relay channel * @base_filename: base name of files to create @@ -454,6 +521,7 @@ static void setup_callbacks(struct rchan *chan, * @subbuf_size: size of sub-buffers * @n_subbufs: number of sub-buffers * @cb: client callback functions + * @private_data: user-defined data * * Returns channel pointer if successful, %NULL otherwise. 
* @@ -466,13 +534,11 @@ struct rchan *relay_open(const char *base_filename, struct dentry *parent, size_t subbuf_size, size_t n_subbufs, - struct rchan_callbacks *cb) + struct rchan_callbacks *cb, + void *private_data) { unsigned int i; struct rchan *chan; - char *tmpname; - int is_global = 0; - if (!base_filename) return NULL; @@ -487,38 +553,32 @@ struct rchan *relay_open(const char *base_filename, chan->n_subbufs = n_subbufs; chan->subbuf_size = subbuf_size; chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); + chan->parent = parent; + chan->private_data = private_data; + strlcpy(chan->base_filename, base_filename, NAME_MAX); setup_callbacks(chan, cb); kref_init(&chan->kref); - tmpname = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!tmpname) - goto free_chan; - + mutex_lock(&relay_channels_mutex); for_each_online_cpu(i) { - sprintf(tmpname, "%s%d", base_filename, i); - chan->buf[i] = relay_open_buf(chan, tmpname, parent, - &is_global); + chan->buf[i] = relay_open_buf(chan, i); if (!chan->buf[i]) goto free_bufs; - - chan->buf[i]->cpu = i; } + list_add(&chan->list, &relay_channels); + mutex_unlock(&relay_channels_mutex); - kfree(tmpname); return chan; free_bufs: - for (i = 0; i < NR_CPUS; i++) { + for_each_online_cpu(i) { if (!chan->buf[i]) break; relay_close_buf(chan->buf[i]); - if (is_global) - break; } - kfree(tmpname); -free_chan: kref_put(&chan->kref, relay_destroy_channel); + mutex_unlock(&relay_channels_mutex); return NULL; } EXPORT_SYMBOL_GPL(relay_open); @@ -619,24 +679,26 @@ EXPORT_SYMBOL_GPL(relay_subbufs_consumed); void relay_close(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - relay_close_buf(chan->buf[i]); - prev = chan->buf[i]; - } + mutex_lock(&relay_channels_mutex); + if (chan->is_global && chan->buf[0]) + relay_close_buf(chan->buf[0]); + else + for_each_possible_cpu(i) + if (chan->buf[i]) + 
relay_close_buf(chan->buf[i]); if (chan->last_toobig) printk(KERN_WARNING "relay: one or more items not logged " "[item size (%Zd) > sub-buffer size (%Zd)]\n", chan->last_toobig, chan->subbuf_size); + list_del(&chan->list); kref_put(&chan->kref, relay_destroy_channel); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_close); @@ -649,17 +711,20 @@ EXPORT_SYMBOL_GPL(relay_close); void relay_flush(struct rchan *chan) { unsigned int i; - struct rchan_buf *prev = NULL; if (!chan) return; - for (i = 0; i < NR_CPUS; i++) { - if (!chan->buf[i] || chan->buf[i] == prev) - break; - relay_switch_subbuf(chan->buf[i], 0); - prev = chan->buf[i]; + if (chan->is_global && chan->buf[0]) { + relay_switch_subbuf(chan->buf[0], 0); + return; } + + mutex_lock(&relay_channels_mutex); + for_each_possible_cpu(i) + if (chan->buf[i]) + relay_switch_subbuf(chan->buf[i], 0); + mutex_unlock(&relay_channels_mutex); } EXPORT_SYMBOL_GPL(relay_flush); @@ -1022,3 +1087,12 @@ const struct file_operations relay_file_operations = { .sendfile = relay_file_sendfile, }; EXPORT_SYMBOL_GPL(relay_file_operations); + +static __init int relay_init(void) +{ + + hotcpu_notifier(relay_hotcpu_callback, 0); + return 0; +} + +module_init(relay_init); -- cgit v1.2.2 From b035b6de24932ffd4a2b1c6619a2f5711da6920f Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Feb 2007 01:45:10 -0800 Subject: [PATCH] Consolidate default sched_clock() Use attribute(weak). Signed-off-by: Alexey Dobriyan Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index cca93cc0dd7d..1cd4ee769e20 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -56,6 +56,16 @@ #include +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. 
+ */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (1000000000 / HZ); +} + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], -- cgit v1.2.2 From 34f5a39899f3f3e815da64f48ddb72942d86c366 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 10 Feb 2007 01:45:24 -0800 Subject: [PATCH] Add TAINT_USER and ability to set taint flags from userspace Allow taint flags to be set from userspace by writing to /proc/sys/kernel/tainted, and add a new taint flag, TAINT_USER, to be used when userspace has potentially done something dangerous that might compromise the kernel. This will allow support personnel to ask further questions about what may have caused the user taint flag to have been set. For example, they might examine the logs of the realtime JVM to see if the Java program has used the really silly, stupid, dangerous, and completely-non-portable direct access to physical memory feature which MUST be implemented according to the Real-Time Specification for Java (RTSJ). Sigh. What were those silly people at Sun thinking? [akpm@osdl.org: build fix] [bunk@stusta.de: cleanup] Signed-off-by: "Theodore Ts'o" Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/panic.c | 6 ++++-- kernel/sysctl.c | 27 +++++++++++++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/panic.c b/kernel/panic.c index 525e365f7239..623d1828259a 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -150,6 +150,7 @@ EXPORT_SYMBOL(panic); * 'R' - User forced a module unload. * 'M' - Machine had a machine check experience. * 'B' - System has hit bad_page. + * 'U' - Userspace-defined naughtiness. * * The string is overwritten by the next call to print_taint(). 
*/ @@ -158,13 +159,14 @@ const char *print_tainted(void) { static char buf[20]; if (tainted) { - snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c", + snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c", tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', tainted & TAINT_FORCED_MODULE ? 'F' : ' ', tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', - tainted & TAINT_BAD_PAGE ? 'B' : ' '); + tainted & TAINT_BAD_PAGE ? 'B' : ' ', + tainted & TAINT_USER ? 'U' : ' '); } else snprintf(buf, sizeof(buf), "Not tainted"); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 16ef870fa75a..7733ef58aaca 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -151,6 +151,8 @@ static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, #ifdef CONFIG_PROC_SYSCTL static int proc_do_cad_pid(ctl_table *table, int write, struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos); +static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos); #endif static ctl_table root_table[]; @@ -174,6 +176,7 @@ extern ctl_table inotify_table[]; int sysctl_legacy_va_layout; #endif + static void *get_uts(ctl_table *table, int write) { char *which = table->data; @@ -344,14 +347,16 @@ static ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, +#ifdef CONFIG_PROC_SYSCTL { .ctl_name = KERN_TAINTED, .procname = "tainted", .data = &tainted, .maxlen = sizeof(int), - .mode = 0444, - .proc_handler = &proc_dointvec, + .mode = 0644, + .proc_handler = &proc_dointvec_taint, }, +#endif { .ctl_name = KERN_CAP_BSET, .procname = "cap-bound", @@ -1927,6 +1932,7 @@ int proc_dointvec(ctl_table *table, int write, struct file *filp, #define OP_SET 0 #define OP_AND 1 +#define OP_OR 2 static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp, int *valp, @@ -1938,6 +1944,7 @@ static int do_proc_dointvec_bset_conv(int 
*negp, unsigned long *lvalp, switch(op) { case OP_SET: *valp = val; break; case OP_AND: *valp &= val; break; + case OP_OR: *valp |= val; break; } } else { int val = *valp; @@ -1970,6 +1977,22 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp, do_proc_dointvec_bset_conv,&op); } +/* + * Taint values can only be increased + */ +static int proc_dointvec_taint(ctl_table *table, int write, struct file *filp, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int op; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + op = OP_OR; + return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, + do_proc_dointvec_bset_conv,&op); +} + struct do_proc_dointvec_minmax_conv_param { int *min; int *max; -- cgit v1.2.2 From 3db5db4fcdafc85b99d171336a7d2f25765ccd13 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 10 Feb 2007 01:45:40 -0800 Subject: [PATCH] use cycle_t instead of u64 in struct time_interpolator The 32bit and 64bit PARISC Linux kernels suffers from the problem, that the gettimeofday() call sometimes returns non-monotonic times. The easiest way to fix this, is to drop the PARISC-specific implementation and switch over to the generic TIME_INTERPOLATION framework. But in order to make it even compile on 32bit PARISC, the patch below which touches the generic Linux code, is mandatory. More information and the full patch with the parisc-specific changes is included in this thread: http://lists.parisc-linux.org/pipermail/parisc-linux/2006-December/031003.html As far as I could see, this patch does not change anything for the existing architectures which use this framework (IA64 and SPARC64), since "cycles_t" is defined there as unsigned 64bit-integer anyway (which then makes this patch a no-change for them). 
Signed-off-by: Helge Deller Cc: Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/timer.c b/kernel/timer.c index c2a8ccfc2882..d38801a95866 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1624,7 +1624,7 @@ struct time_interpolator *time_interpolator __read_mostly; static struct time_interpolator *time_interpolator_list __read_mostly; static DEFINE_SPINLOCK(time_interpolator_lock); -static inline u64 time_interpolator_get_cycles(unsigned int src) +static inline cycles_t time_interpolator_get_cycles(unsigned int src) { unsigned long (*x)(void); @@ -1650,8 +1650,8 @@ static inline u64 time_interpolator_get_counter(int writelock) if (time_interpolator->jitter) { - u64 lcycle; - u64 now; + cycles_t lcycle; + cycles_t now; do { lcycle = time_interpolator->last_cycle; -- cgit v1.2.2 From cb799b8988e40a7871ae8e976248c33c562e3555 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Sat, 10 Feb 2007 01:45:51 -0800 Subject: [PATCH] sysctl warning fix kernel/sysctl.c:2816: warning: 'sysctl_ipc_data' defined but not used Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7733ef58aaca..84cab0ce44d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -2767,12 +2767,14 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, { return -ENOSYS; } +#ifdef CONFIG_SYSVIPC static int sysctl_ipc_data(ctl_table *table, int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { return -ENOSYS; } +#endif #endif /* CONFIG_SYSCTL_SYSCALL */ /* -- cgit v1.2.2 From b653d081c17e26101980c858a9808740533b78b4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Feb 2007 01:45:54 -0800 Subject: [PATCH] proc: 
remove useless (and buggy) ->nlink settings Bug: pnx8550 code creates directory but resets ->nlink to 1. create_proc_entry() et al will correctly set ->nlink for you. Signed-off-by: Alexey Dobriyan Cc: Ralf Baechle Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jeff Dike Cc: Corey Minyard Cc: Alan Cox Cc: Kyle McMartin Cc: Martin Schwidefsky Cc: Greg KH Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/proc.c | 1 - kernel/profile.c | 1 - 2 files changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 61f5c717a8f5..6d3be06e8ce6 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -136,7 +136,6 @@ void register_irq_proc(unsigned int irq) entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); if (entry) { - entry->nlink = 1; entry->data = (void *)(long)irq; entry->read_proc = irq_affinity_read_proc; entry->write_proc = irq_affinity_write_proc; diff --git a/kernel/profile.c b/kernel/profile.c index d6579d511069..9bfadb248dd8 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -449,7 +449,6 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) /* create /proc/irq/prof_cpu_mask */ if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) return; - entry->nlink = 1; entry->data = (void *)&prof_cpu_mask; entry->read_proc = prof_cpu_mask_read_proc; entry->write_proc = prof_cpu_mask_write_proc; -- cgit v1.2.2 From 72fd4a35a824331d7a0f4168d7576502d95d34b3 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 10 Feb 2007 01:45:59 -0800 Subject: [PATCH] Numerous fixes to kernel-doc info in source files. 
A variety of (mostly) innocuous fixes to the embedded kernel-doc content in source files, including: * make multi-line initial descriptions single line * denote some function names, constants and structs as such * change erroneous opening '/*' to '/**' in a few places * reword some text for clarity Signed-off-by: Robert P. J. Day Cc: "Randy.Dunlap" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 3 +-- kernel/hrtimer.c | 6 +++--- kernel/kfifo.c | 10 +++++----- kernel/kthread.c | 6 +++--- kernel/printk.c | 2 +- kernel/relay.c | 12 ++++++------ kernel/sched.c | 9 ++++----- kernel/signal.c | 2 +- kernel/sys.c | 10 +++++----- kernel/timer.c | 20 ++++++++++---------- kernel/workqueue.c | 6 ++---- 11 files changed, 41 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index fec12eb12471..bc71fdfcd8a7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -257,8 +257,7 @@ static int has_stopped_jobs(int pgrp) } /** - * reparent_to_init - Reparent the calling kernel thread to the init task - * of the pid space that the thread belongs to. + * reparent_to_init - Reparent the calling kernel thread to the init task of the pid space that the thread belongs to. * * If a kernel thread is launched as a result of a system call, or if * it ever exits, it should generally reparent itself to init so that diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index d0ba190dfeb6..f44e499e8fca 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -102,7 +102,7 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) = * * The function calculates the monotonic clock from the realtime * clock and the wall_to_monotonic offset and stores the result - * in normalized timespec format in the variable pointed to by ts. + * in normalized timespec format in the variable pointed to by @ts. 
*/ void ktime_get_ts(struct timespec *ts) { @@ -583,8 +583,8 @@ EXPORT_SYMBOL_GPL(hrtimer_init); * @which_clock: which clock to query * @tp: pointer to timespec variable to store the resolution * - * Store the resolution of the clock selected by which_clock in the - * variable pointed to by tp. + * Store the resolution of the clock selected by @which_clock in the + * variable pointed to by @tp. */ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) { diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 5d1d907378a2..cee419143fd4 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -32,8 +32,8 @@ * @gfp_mask: get_free_pages mask, passed to kmalloc() * @lock: the lock to be used to protect the fifo buffer * - * Do NOT pass the kfifo to kfifo_free() after use ! Simply free the - * struct kfifo with kfree(). + * Do NOT pass the kfifo to kfifo_free() after use! Simply free the + * &struct kfifo with kfree(). */ struct kfifo *kfifo_init(unsigned char *buffer, unsigned int size, gfp_t gfp_mask, spinlock_t *lock) @@ -108,7 +108,7 @@ EXPORT_SYMBOL(kfifo_free); * @buffer: the data to be added. * @len: the length of the data to be added. * - * This function copies at most 'len' bytes from the 'buffer' into + * This function copies at most @len bytes from the @buffer into * the FIFO depending on the free space, and returns the number of * bytes copied. * @@ -155,8 +155,8 @@ EXPORT_SYMBOL(__kfifo_put); * @buffer: where the data must be copied. * @len: the size of the destination buffer. * - * This function copies at most 'len' bytes from the FIFO into the - * 'buffer' and returns the number of copied bytes. + * This function copies at most @len bytes from the FIFO into the + * @buffer and returns the number of copied bytes. * * Note that with only one concurrent reader and one concurrent * writer, you don't need extra locking to use these functions. 
diff --git a/kernel/kthread.c b/kernel/kthread.c index 1db8c72d0d38..87c50ccd1d4e 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -50,7 +50,7 @@ static struct kthread_stop_info kthread_stop_info; /** * kthread_should_stop - should this kthread return now? * - * When someone calls kthread_stop on your kthread, it will be woken + * When someone calls kthread_stop() on your kthread, it will be woken * and this will return true. You should then return, and your return * value will be passed through to kthread_stop(). */ @@ -143,7 +143,7 @@ static void keventd_create_kthread(struct work_struct *work) * it. See also kthread_run(), kthread_create_on_cpu(). * * When woken, the thread will run @threadfn() with @data as its - * argument. @threadfn can either call do_exit() directly if it is a + * argument. @threadfn() can either call do_exit() directly if it is a * standalone thread for which noone will call kthread_stop(), or * return when 'kthread_should_stop()' is true (which means * kthread_stop() has been called). The return value should be zero @@ -192,7 +192,7 @@ EXPORT_SYMBOL(kthread_create); * * Description: This function is equivalent to set_cpus_allowed(), * except that @cpu doesn't need to be online, and the thread must be - * stopped (i.e., just returned from kthread_create(). + * stopped (i.e., just returned from kthread_create()). */ void kthread_bind(struct task_struct *k, unsigned int cpu) { diff --git a/kernel/printk.c b/kernel/printk.c index c770e1a4e882..3e79e18dce33 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -483,7 +483,7 @@ static int have_callable_console(void) * printk - print a kernel message * @fmt: format string * - * This is printk. It can be called from any context. We want it to work. + * This is printk(). It can be called from any context. We want it to work. * * We try to grab the console_sem. If we succeed, it's easy - we log the output and * call the console drivers. 
If we fail to get the semaphore we place the output diff --git a/kernel/relay.c b/kernel/relay.c index ef923f6de2e7..ef8a935710a2 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -328,7 +328,7 @@ static void wakeup_readers(struct work_struct *work) * @buf: the channel buffer * @init: 1 if this is a first-time initialization * - * See relay_reset for description of effect. + * See relay_reset() for description of effect. */ static void __relay_reset(struct rchan_buf *buf, unsigned int init) { @@ -364,7 +364,7 @@ static void __relay_reset(struct rchan_buf *buf, unsigned int init) * and restarting the channel in its initial state. The buffers * are not freed, so any mappings are still in effect. * - * NOTE: Care should be taken that the channel isn't actually + * NOTE. Care should be taken that the channel isn't actually * being used by anything when this call is made. */ void relay_reset(struct rchan *chan) @@ -528,7 +528,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, * Creates a channel buffer for each cpu using the sizes and * attributes specified. The created channel buffer files * will be named base_filename0...base_filenameN-1. File - * permissions will be S_IRUSR. + * permissions will be %S_IRUSR. */ struct rchan *relay_open(const char *base_filename, struct dentry *parent, @@ -648,7 +648,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf); * subbufs_consumed should be the number of sub-buffers newly consumed, * not the total consumed. * - * NOTE: Kernel clients don't need to call this function if the channel + * NOTE. Kernel clients don't need to call this function if the channel * mode is 'overwrite'. */ void relay_subbufs_consumed(struct rchan *chan, @@ -749,7 +749,7 @@ static int relay_file_open(struct inode *inode, struct file *filp) * @filp: the file * @vma: the vma describing what to map * - * Calls upon relay_mmap_buf to map the file into user space. + * Calls upon relay_mmap_buf() to map the file into user space. 
*/ static int relay_file_mmap(struct file *filp, struct vm_area_struct *vma) { @@ -891,7 +891,7 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos, * @read_pos: file read position * @buf: relay channel buffer * - * If the read_pos is in the middle of padding, return the + * If the @read_pos is in the middle of padding, return the * position of the first actually available byte, otherwise * return the original value. */ diff --git a/kernel/sched.c b/kernel/sched.c index 1cd4ee769e20..1fd67e16cd31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4203,13 +4203,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) } /** - * sched_setscheduler - change the scheduling policy and/or RT priority of - * a thread. + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. * - * NOTE: the task may be already dead + * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4577,7 +4576,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread + * This function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. */ @@ -4704,7 +4703,7 @@ EXPORT_SYMBOL(cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * this is a shortcut for kernel-space yielding - it marks the + * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). 
*/ void __sched yield(void) diff --git a/kernel/signal.c b/kernel/signal.c index ea4632bd40a0..228fdb5c01d1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2282,7 +2282,7 @@ static int do_tkill(int tgid, int pid, int sig) * @pid: the PID of the thread * @sig: signal to be sent * - * This syscall also checks the tgid and returns -ESRCH even if the PID + * This syscall also checks the @tgid and returns -ESRCH even if the PID * exists but it's not belonging to the target process anymore. This * method solves the problem of threads exiting and PIDs getting reused. */ diff --git a/kernel/sys.c b/kernel/sys.c index 6e2101dec0fc..e1024383314d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -215,7 +215,7 @@ EXPORT_SYMBOL_GPL(atomic_notifier_chain_unregister); * This routine uses RCU to synchronize with changes to the chain. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain + * with %NOTIFY_STOP_MASK then atomic_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -313,7 +313,7 @@ EXPORT_SYMBOL_GPL(blocking_notifier_chain_unregister); * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain + * with %NOTIFY_STOP_MASK then blocking_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -393,7 +393,7 @@ EXPORT_SYMBOL_GPL(raw_notifier_chain_unregister); * All locking must be provided by the caller. 
* * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then raw_notifier_call_chain + * with %NOTIFY_STOP_MASK then raw_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -487,7 +487,7 @@ EXPORT_SYMBOL_GPL(srcu_notifier_chain_unregister); * run in a process context, so they are allowed to block. * * If the return value of the notifier can be and'ed - * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain + * with %NOTIFY_STOP_MASK then srcu_notifier_call_chain() * will return immediately, with the return value of * the notifier function which halted execution. * Otherwise the return value is the return value @@ -538,7 +538,7 @@ EXPORT_SYMBOL_GPL(srcu_init_notifier_head); * Registers a function with the list of functions * to be called at reboot time. * - * Currently always returns zero, as blocking_notifier_chain_register + * Currently always returns zero, as blocking_notifier_chain_register() * always returns zero. */ diff --git a/kernel/timer.c b/kernel/timer.c index d38801a95866..31ab627df8a0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -85,7 +85,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * - * __round_jiffies rounds an absolute time in the future (in jiffies) + * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -98,7 +98,7 @@ static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. 
* - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { @@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * - * __round_jiffies_relative rounds a time delta in the future (in jiffies) + * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies); * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { @@ -173,7 +173,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * - * round_jiffies rounds an absolute time in the future (in jiffies) + * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -182,7 +182,7 @@ EXPORT_SYMBOL_GPL(__round_jiffies_relative); * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. 
*/ unsigned long round_jiffies(unsigned long j) { @@ -194,7 +194,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * - * round_jiffies_relative rounds a time delta in the future (in jiffies) + * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. @@ -203,7 +203,7 @@ EXPORT_SYMBOL_GPL(round_jiffies); * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * - * The return value is the rounded version of the "j" parameter. + * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { @@ -387,7 +387,7 @@ void add_timer_on(struct timer_list *timer, int cpu) * @timer: the timer to be modified * @expires: new timeout in jiffies * - * mod_timer is a more efficient way to update the expire field of an + * mod_timer() is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) * * mod_timer(timer, expires) is equivalent to: @@ -490,7 +490,7 @@ out: * the timer it also makes sure the handler has finished executing on other * CPUs. * - * Synchronization rules: callers must prevent restarting of the timer, + * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts. The caller must not hold locks which would prevent * completion of the timer's handler. 
The timer's handler must not call diff --git a/kernel/workqueue.c b/kernel/workqueue.c index a3da07c5af28..020d1fff57dc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -656,8 +656,7 @@ void flush_scheduled_work(void) EXPORT_SYMBOL(flush_scheduled_work); /** - * cancel_rearming_delayed_workqueue - reliably kill off a delayed - * work whose handler rearms the delayed work. + * cancel_rearming_delayed_workqueue - reliably kill off a delayed work whose handler rearms the delayed work. * @wq: the controlling workqueue structure * @dwork: the delayed work struct */ @@ -670,8 +669,7 @@ void cancel_rearming_delayed_workqueue(struct workqueue_struct *wq, EXPORT_SYMBOL(cancel_rearming_delayed_workqueue); /** - * cancel_rearming_delayed_work - reliably kill off a delayed keventd - * work whose handler rearms the delayed work. + * cancel_rearming_delayed_work - reliably kill off a delayed keventd work whose handler rearms the delayed work. * @dwork: the delayed work struct */ void cancel_rearming_delayed_work(struct delayed_work *dwork) -- cgit v1.2.2 From d4d23add3abcd18d8021b99f230df608ccb2f007 Mon Sep 17 00:00:00 2001 From: Kyle McMartin Date: Sat, 10 Feb 2007 01:46:00 -0800 Subject: [PATCH] Common compat_sys_sysinfo I noticed that almost all architectures implemented exactly the same sys32_sysinfo... except parisc, where a bug was to be found in handling of the uptime. So let's remove a whole whack of code for fun and profit. Cribbed compat_sys_sysinfo from x86_64's implementation, since I figured it would be the best tested. This patch incorporates Arnd's suggestion of not using set_fs/get_fs, but instead extracting out the common code from sys_sysinfo. 
Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/compat.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/timer.c | 58 ++++++++++++++++++++++++++++---------------------- 2 files changed, 99 insertions(+), 25 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 6952dd057300..cebb4c28c039 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1016,3 +1016,69 @@ asmlinkage long compat_sys_migrate_pages(compat_pid_t pid, return sys_migrate_pages(pid, nr_bits + 1, old, new); } #endif + +struct compat_sysinfo { + s32 uptime; + u32 loads[3]; + u32 totalram; + u32 freeram; + u32 sharedram; + u32 bufferram; + u32 totalswap; + u32 freeswap; + u16 procs; + u16 pad; + u32 totalhigh; + u32 freehigh; + u32 mem_unit; + char _f[20-2*sizeof(u32)-sizeof(int)]; +}; + +asmlinkage long +compat_sys_sysinfo(struct compat_sysinfo __user *info) +{ + struct sysinfo s; + + do_sysinfo(&s); + + /* Check to see if any memory value is too large for 32-bit and scale + * down if needed + */ + if ((s.totalram >> 32) || (s.totalswap >> 32)) { + int bitcount = 0; + + while (s.mem_unit < PAGE_SIZE) { + s.mem_unit <<= 1; + bitcount++; + } + + s.totalram >>= bitcount; + s.freeram >>= bitcount; + s.sharedram >>= bitcount; + s.bufferram >>= bitcount; + s.totalswap >>= bitcount; + s.freeswap >>= bitcount; + s.totalhigh >>= bitcount; + s.freehigh >>= bitcount; + } + + if (!access_ok(VERIFY_WRITE, info, sizeof(struct compat_sysinfo)) || + __put_user (s.uptime, &info->uptime) || + __put_user (s.loads[0], &info->loads[0]) || + __put_user (s.loads[1], &info->loads[1]) || + __put_user (s.loads[2], &info->loads[2]) || + __put_user (s.totalram, &info->totalram) || + __put_user (s.freeram, &info->freeram) || + __put_user (s.sharedram, &info->sharedram) || + __put_user (s.bufferram, &info->bufferram) || + __put_user (s.totalswap, &info->totalswap) || + __put_user (s.freeswap, &info->freeswap) || + __put_user 
(s.procs, &info->procs) || + __put_user (s.totalhigh, &info->totalhigh) || + __put_user (s.freehigh, &info->freehigh) || + __put_user (s.mem_unit, &info->mem_unit)) + return -EFAULT; + + return 0; +} + diff --git a/kernel/timer.c b/kernel/timer.c index 31ab627df8a0..8533c3796082 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1392,17 +1392,16 @@ asmlinkage long sys_gettid(void) } /** - * sys_sysinfo - fill in sysinfo struct + * do_sysinfo - fill in sysinfo struct * @info: pointer to buffer to fill */ -asmlinkage long sys_sysinfo(struct sysinfo __user *info) +int do_sysinfo(struct sysinfo *info) { - struct sysinfo val; unsigned long mem_total, sav_total; unsigned int mem_unit, bitcount; unsigned long seq; - memset((char *)&val, 0, sizeof(struct sysinfo)); + memset(info, 0, sizeof(struct sysinfo)); do { struct timespec tp; @@ -1422,17 +1421,17 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; tp.tv_sec++; } - val.uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); + info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - val.loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - val.loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); + info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - val.procs = nr_threads; + info->procs = nr_threads; } while (read_seqretry(&xtime_lock, seq)); - si_meminfo(&val); - si_swapinfo(&val); + si_meminfo(info); + si_swapinfo(info); /* * If the sum of all the available memory (i.e. 
ram + swap) @@ -1443,11 +1442,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) * -Erik Andersen */ - mem_total = val.totalram + val.totalswap; - if (mem_total < val.totalram || mem_total < val.totalswap) + mem_total = info->totalram + info->totalswap; + if (mem_total < info->totalram || mem_total < info->totalswap) goto out; bitcount = 0; - mem_unit = val.mem_unit; + mem_unit = info->mem_unit; while (mem_unit > 1) { bitcount++; mem_unit >>= 1; @@ -1459,22 +1458,31 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) /* * If mem_total did not overflow, multiply all memory values by - * val.mem_unit and set it to 1. This leaves things compatible + * info->mem_unit and set it to 1. This leaves things compatible * with 2.2.x, and also retains compatibility with earlier 2.4.x * kernels... */ - val.mem_unit = 1; - val.totalram <<= bitcount; - val.freeram <<= bitcount; - val.sharedram <<= bitcount; - val.bufferram <<= bitcount; - val.totalswap <<= bitcount; - val.freeswap <<= bitcount; - val.totalhigh <<= bitcount; - val.freehigh <<= bitcount; + info->mem_unit = 1; + info->totalram <<= bitcount; + info->freeram <<= bitcount; + info->sharedram <<= bitcount; + info->bufferram <<= bitcount; + info->totalswap <<= bitcount; + info->freeswap <<= bitcount; + info->totalhigh <<= bitcount; + info->freehigh <<= bitcount; + +out: + return 0; +} + +asmlinkage long sys_sysinfo(struct sysinfo __user *info) +{ + struct sysinfo val; + + do_sysinfo(&val); - out: if (copy_to_user(info, &val, sizeof(struct sysinfo))) return -EFAULT; -- cgit v1.2.2 From 11f57cedcf382574a1e41d6cec2349f287fcea67 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 10 Feb 2007 01:46:09 -0800 Subject: [PATCH] audit: fix audit_filter_user_rules() initialization bug gcc emits this warning: kernel/auditfilter.c: In function 'audit_filter_user': kernel/auditfilter.c:1611: warning: 'state' is used uninitialized in this function I tend to agree with gcc - there are a couple of plausible exit 
paths from audit_filter_user_rules() where it does not set 'state', keeping the variable uninitialized. For example if a filter rule has an AUDIT_POSSIBLE action. Initialize to 'wont audit'. Fix whitespace damage too. Signed-off-by: Ingo Molnar Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/auditfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index 9c8c23227c7f..87865f8b4ce3 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -1601,8 +1601,8 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb, int audit_filter_user(struct netlink_skb_parms *cb, int type) { + enum audit_state state = AUDIT_DISABLED; struct audit_entry *e; - enum audit_state state; int ret = 1; rcu_read_lock(); -- cgit v1.2.2 From e3e8a75d2acfc61ebf25524666a0a2c6abb0620c Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Sat, 10 Feb 2007 01:46:19 -0800 Subject: [PATCH] Extract and use wake_up_klogd() Remove hack with printing space to wake up klogd. Use explicit wake_up_klogd(). 
See earlier discussion http://groups.google.com/group/fa.linux.kernel/browse_frm/thread/75f496668409f58d/1a8f28983a51e1ff?lnk=st&q=wake_up_klogd+group%3Afa.linux.kernel&rnum=2#1a8f28983a51e1ff Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 3e79e18dce33..4da26b067976 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -783,6 +783,12 @@ int is_console_locked(void) return console_locked; } +void wake_up_klogd(void) +{ + if (!oops_in_progress && waitqueue_active(&log_wait)) + wake_up_interruptible(&log_wait); +} + /** * release_console_sem - unlock the console system * @@ -825,8 +831,8 @@ void release_console_sem(void) console_locked = 0; up(&console_sem); spin_unlock_irqrestore(&logbuf_lock, flags); - if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) - wake_up_interruptible(&log_wait); + if (wake_klogd) + wake_up_klogd(); } EXPORT_SYMBOL(release_console_sem); -- cgit v1.2.2 From 1efc5da3cf567d2f6b795f9d2112ed97fec4ee7c Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Sat, 10 Feb 2007 01:46:29 -0800 Subject: [PATCH] order of lockdep off/on in vprintk() should be changed The order of locking between lockdep_off/on() and local_irq_save/restore() in vprintk() should be changed. * In kernel/printk.c : vprintk() does : preempt_disable() local_irq_save() lockdep_off() spin_lock(&logbuf_lock) spin_unlock(&logbuf_lock) if(!down_trylock(&console_sem)) up(&console_sem) lockdep_on() local_irq_restore() preempt_enable() The goal here is to make sure we do not call printk() recursively from kernel/lockdep.c:__lock_acquire() (called from spin_* and down/up) nor from kernel/lockdep.c:trace_hardirqs_on/off() (called from local_irq_restore/save). It can then potentially call printk() through mark_held_locks/mark_lock.
It correctly protects against the spin_lock call and the up/down call, but it does not protect against local_irq_restore. It could cause infinite recursive printk/trace_hardirqs_on() calls when printk() is called from the mark_lock() error handling path. We should change the locking so it becomes correct : preempt_disable() lockdep_off() local_irq_save() spin_lock(&logbuf_lock) spin_unlock(&logbuf_lock) if(!down_trylock(&console_sem)) up(&console_sem) local_irq_restore() lockdep_on() preempt_enable() Signed-off-by: Mathieu Desnoyers Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/printk.c b/kernel/printk.c index 4da26b067976..0c151877ff71 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -529,7 +529,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) zap_locks(); /* This stops the holder of console_sem just where we want him */ - local_irq_save(flags); + raw_local_irq_save(flags); lockdep_off(); spin_lock(&logbuf_lock); printk_cpu = smp_processor_id(); @@ -618,7 +618,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) up(&console_sem); } lockdep_on(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } else { /* * Someone else owns the drivers. We drop the spinlock, which @@ -628,7 +628,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) printk_cpu = UINT_MAX; spin_unlock(&logbuf_lock); lockdep_on(); - local_irq_restore(flags); + raw_local_irq_restore(flags); } preempt_enable(); -- cgit v1.2.2 From 501b9ebf43f9973c3e246c8fbd17144d81a989ef Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Sat, 10 Feb 2007 01:46:34 -0800 Subject: [PATCH] Fix apparent typo CONFIG_LOCKDEP_DEBUG Replace the apparent typo CONFIG_LOCKDEP_DEBUG with the correct CONFIG_DEBUG_LOCKDEP. Signed-off-by: Robert P. J.
Day Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/lockdep_proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c index 57a547a2da3f..88fc611b3ae9 100644 --- a/kernel/lockdep_proc.c +++ b/kernel/lockdep_proc.c @@ -244,7 +244,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) sum_forward_deps += count_forward_deps(class); } -#ifdef CONFIG_LOCKDEP_DEBUG +#ifdef CONFIG_DEBUG_LOCKDEP DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); #endif seq_printf(m, " lock-classes: %11lu [max: %lu]\n", -- cgit v1.2.2 From 8d06087714b78e8921bd30b5c64202fe80c47339 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sat, 10 Feb 2007 01:46:38 -0800 Subject: [PATCH] _proc_do_string(): fix short reads If you try to read things like /proc/sys/kernel/osrelease with single-byte reads, you get just one byte and then EOF. This is because _proc_do_string() assumes that the caller is read()ing into a buffer which is large enough to fit the whole string in a single hit. Fix. Cc: "Eric W. 
Biederman" Cc: Michael Tokarev Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sysctl.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 84cab0ce44d9..e0ac6cd79fcf 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1686,13 +1686,12 @@ static int _proc_do_string(void* data, int maxlen, int write, size_t len; char __user *p; char c; - - if (!data || !maxlen || !*lenp || - (*ppos && !write)) { + + if (!data || !maxlen || !*lenp) { *lenp = 0; return 0; } - + if (write) { len = 0; p = buffer; @@ -1713,6 +1712,15 @@ static int _proc_do_string(void* data, int maxlen, int write, len = strlen(data); if (len > maxlen) len = maxlen; + + if (*ppos > len) { + *lenp = 0; + return 0; + } + + data += *ppos; + len -= *ppos; + if (len > *lenp) len = *lenp; if (len) -- cgit v1.2.2 From 4b98d11b40f03382918796f3c5c936d5495d20a4 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Feb 2007 01:46:45 -0800 Subject: [PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct They are fat: 4x8 bytes in task_struct. They are unconditionally updated in every fork, read, write and sendfile. They are used only if you have some "extended acct fields feature". And please, please, please, read(2) knows about bytes, not characters, why is it called "rchar"?
Signed-off-by: Alexey Dobriyan Cc: Jay Lan Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index d57118da73ff..80284eb488ce 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1038,10 +1038,12 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->utime = cputime_zero; p->stime = cputime_zero; p->sched_time = 0; +#ifdef CONFIG_TASK_XACCT p->rchar = 0; /* I/O counter: bytes read */ p->wchar = 0; /* I/O counter: bytes written */ p->syscr = 0; /* I/O counter: read syscalls */ p->syscw = 0; /* I/O counter: write syscalls */ +#endif task_io_accounting_init(p); acct_clear_integrals(p); -- cgit v1.2.2 From 5ea8176994003483a18c8fed580901e2125f8a83 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 11 Feb 2007 15:41:31 +0000 Subject: [PATCH] sort the devres mess out * Split the implementation-agnostic stuff in separate files. * Make sure that targets using non-default request_irq() pull kernel/irq/devres.o * Introduce new symbols (HAS_IOPORT and HAS_IOMEM) defaulting to positive; allow architectures to turn them off (we needed these symbols anyway for dependencies of quite a few drivers). * protect the ioport-related parts of lib/devres.o with CONFIG_HAS_IOPORT. 
Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- kernel/irq/Makefile | 2 +- kernel/irq/devres.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/irq/manage.c | 86 --------------------------------------------------- 3 files changed, 89 insertions(+), 87 deletions(-) create mode 100644 kernel/irq/devres.c (limited to 'kernel') diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 1dab0ac3f797..681c52dbfe22 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -1,5 +1,5 @@ -obj-y := handle.o manage.o spurious.o resend.o chip.o +obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c new file mode 100644 index 000000000000..85a430da0fb6 --- /dev/null +++ b/kernel/irq/devres.c @@ -0,0 +1,88 @@ +#include +#include + +/* + * Device resource management aware IRQ request/free implementation. + */ +struct irq_devres { + unsigned int irq; + void *dev_id; +}; + +static void devm_irq_release(struct device *dev, void *res) +{ + struct irq_devres *this = res; + + free_irq(this->irq, this->dev_id); +} + +static int devm_irq_match(struct device *dev, void *res, void *data) +{ + struct irq_devres *this = res, *match = data; + + return this->irq == match->irq && this->dev_id == match->dev_id; +} + +/** + * devm_request_irq - allocate an interrupt line for a managed device + * @dev: device to request interrupt for + * @irq: Interrupt line to allocate + * @handler: Function to be called when the IRQ occurs + * @irqflags: Interrupt type flags + * @devname: An ascii name for the claiming device + * @dev_id: A cookie passed back to the handler function + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as + * request_irq(). 
IRQs requested with this function will be + * automatically freed on driver detach. + * + * If an IRQ allocated with this function needs to be freed + * separately, dev_free_irq() must be used. + */ +int devm_request_irq(struct device *dev, unsigned int irq, + irq_handler_t handler, unsigned long irqflags, + const char *devname, void *dev_id) +{ + struct irq_devres *dr; + int rc; + + dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), + GFP_KERNEL); + if (!dr) + return -ENOMEM; + + rc = request_irq(irq, handler, irqflags, devname, dev_id); + if (rc) { + kfree(dr); + return rc; + } + + dr->irq = irq; + dr->dev_id = dev_id; + devres_add(dev, dr); + + return 0; +} +EXPORT_SYMBOL(devm_request_irq); + +/** + * devm_free_irq - free an interrupt + * @dev: device to free interrupt for + * @irq: Interrupt line to free + * @dev_id: Device identity to free + * + * Except for the extra @dev argument, this function takes the + * same arguments and performs the same function as free_irq(). + * This function instead of free_irq() should be used to manually + * free IRQs allocated with dev_request_irq(). + */ +void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) +{ + struct irq_devres match_data = { irq, dev_id }; + + free_irq(irq, dev_id); + WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, + &match_data)); +} +EXPORT_SYMBOL(devm_free_irq); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index c4b7ed1cebf7..8b961adc3bd2 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -482,89 +482,3 @@ int request_irq(unsigned int irq, irq_handler_t handler, return retval; } EXPORT_SYMBOL(request_irq); - -/* - * Device resource management aware IRQ request/free implementation. 
- */ -struct irq_devres { - unsigned int irq; - void *dev_id; -}; - -static void devm_irq_release(struct device *dev, void *res) -{ - struct irq_devres *this = res; - - free_irq(this->irq, this->dev_id); -} - -static int devm_irq_match(struct device *dev, void *res, void *data) -{ - struct irq_devres *this = res, *match = data; - - return this->irq == match->irq && this->dev_id == match->dev_id; -} - -/** - * devm_request_irq - allocate an interrupt line for a managed device - * @dev: device to request interrupt for - * @irq: Interrupt line to allocate - * @handler: Function to be called when the IRQ occurs - * @irqflags: Interrupt type flags - * @devname: An ascii name for the claiming device - * @dev_id: A cookie passed back to the handler function - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as - * request_irq(). IRQs requested with this function will be - * automatically freed on driver detach. - * - * If an IRQ allocated with this function needs to be freed - * separately, dev_free_irq() must be used. - */ -int devm_request_irq(struct device *dev, unsigned int irq, - irq_handler_t handler, unsigned long irqflags, - const char *devname, void *dev_id) -{ - struct irq_devres *dr; - int rc; - - dr = devres_alloc(devm_irq_release, sizeof(struct irq_devres), - GFP_KERNEL); - if (!dr) - return -ENOMEM; - - rc = request_irq(irq, handler, irqflags, devname, dev_id); - if (rc) { - kfree(dr); - return rc; - } - - dr->irq = irq; - dr->dev_id = dev_id; - devres_add(dev, dr); - - return 0; -} -EXPORT_SYMBOL(devm_request_irq); - -/** - * devm_free_irq - free an interrupt - * @dev: device to free interrupt for - * @irq: Interrupt line to free - * @dev_id: Device identity to free - * - * Except for the extra @dev argument, this function takes the - * same arguments and performs the same function as free_irq(). 
- * This function instead of free_irq() should be used to manually - * free IRQs allocated with dev_request_irq(). - */ -void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id) -{ - struct irq_devres match_data = { irq, dev_id }; - - free_irq(irq, dev_id); - WARN_ON(devres_destroy(dev, devm_irq_release, devm_irq_match, - &match_data)); -} -EXPORT_SYMBOL(devm_free_irq); -- cgit v1.2.2 From a304e1b82808904c561b7b149b467e338c53fcce Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Mon, 12 Feb 2007 00:52:00 -0800 Subject: [PATCH] Debug shared irqs Drivers registering IRQ handlers with SA_SHIRQ really ought to be able to handle an interrupt happening before request_irq() returns. They also ought to be able to handle an interrupt happening during the start of their call to free_irq(). Let's test that hypothesis.... [bunk@stusta.de: Kconfig fixes] Signed-off-by: David Woodhouse Cc: Arjan van de Ven Signed-off-by: Jesper Juhl Signed-off-by: Ingo Molnar Signed-off-by: Adrian Bunk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 8b961adc3bd2..400b12a63649 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -357,6 +357,7 @@ void free_irq(unsigned int irq, void *dev_id) struct irq_desc *desc; struct irqaction **p; unsigned long flags; + irqreturn_t (*handler)(int, void *) = NULL; WARN_ON(in_interrupt()); if (irq >= NR_IRQS) @@ -396,6 +397,8 @@ void free_irq(unsigned int irq, void *dev_id) /* Make sure it's not being used on another CPU */ synchronize_irq(irq); + if (action->flags & IRQF_SHARED) + handler = action->handler; kfree(action); return; } @@ -403,6 +406,17 @@ void free_irq(unsigned int irq, void *dev_id) spin_unlock_irqrestore(&desc->lock, flags); return; } +#ifdef CONFIG_DEBUG_SHIRQ + if (handler) { + /* + * It's a shared IRQ -- the driver ought to be 
prepared for it + * to happen even now it's being freed, so let's make sure.... + * We do this after actually deregistering it, to make sure that + * a 'real' IRQ doesn't run in parallel with our fake + */ + handler(irq, dev_id); + } +#endif } EXPORT_SYMBOL(free_irq); @@ -475,6 +489,25 @@ int request_irq(unsigned int irq, irq_handler_t handler, select_smp_affinity(irq); +#ifdef CONFIG_DEBUG_SHIRQ + if (irqflags & IRQF_SHARED) { + /* + * It's a shared IRQ -- the driver ought to be prepared for it + * to happen immediately, so let's make sure.... + * We do this before actually registering it, to make sure that + * a 'real' IRQ doesn't run in parallel with our fake + */ + if (irqflags & IRQF_DISABLED) { + unsigned long flags; + + local_irq_save(flags); + handler(irq, dev_id); + local_irq_restore(flags); + } else + handler(irq, dev_id); + } +#endif + retval = setup_irq(irq, action); if (retval) kfree(action); -- cgit v1.2.2 From 3f0504471536a2b6978b9a99ed1c222950fff07a Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Mon, 12 Feb 2007 00:52:04 -0800 Subject: [PATCH] kernel: shut up the IRQ mismatch messages The problem is various drivers legally validly and sensibly try to claim IRQs but the kernel insists on vomiting forth a giant irrelevant debugging spew when the types clash. Edit kernel/irq/manage.c go down to mismatch: in setup_irq() and ifdef out the if clause that checks for mismatches. It'll then just do the right thing and work sanely. For the current -mm kernel this will do the trick (and moves it into shared irq debugging as in debug mode the info spew is useful). I've had a variant of this in my private tree for some time as I got fed up on the mess on boxes where old legacy IRQs get reused. 
Signed-off-by: Alan Cox Cc: Arjan van de Ven Cc: Ingo Molnar Cc: David Woodhouse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/irq/manage.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 400b12a63649..7c85d69188ef 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -328,12 +328,14 @@ int setup_irq(unsigned int irq, struct irqaction *new) return 0; mismatch: +#ifdef CONFIG_DEBUG_SHIRQ if (!(new->flags & IRQF_PROBE_SHARED)) { printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq); if (old_name) printk(KERN_ERR "current handler: %s\n", old_name); dump_stack(); } +#endif spin_unlock_irqrestore(&desc->lock, flags); return -EBUSY; } -- cgit v1.2.2 From 944be0b224724fcbf63c3a3fe3a5478c325a6547 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 12 Feb 2007 00:52:26 -0800 Subject: [PATCH] close_files(): add scheduling point close_files() can sometimes take long enough to trigger the soft lockup detector. Cc: Eric Dumazet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index bc71fdfcd8a7..14f17033f563 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -430,8 +430,10 @@ static void close_files(struct files_struct * files) while (set) { if (set & 1) { struct file * file = xchg(&fdt->fd[i], NULL); - if (file) + if (file) { filp_close(file, files); + cond_resched(); + } } i++; set >>= 1; -- cgit v1.2.2 From 8d42db189ca99703f0f4f91c477cb54808c8eaaa Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:52:55 -0800 Subject: [PATCH] signal: rewrite kill_something_info so it uses newer helpers The goal is to remove users of the old signal helper functions so they can be removed. Signed-off-by: Eric W. 
Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 228fdb5c01d1..de66def71644 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1191,8 +1191,10 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid); static int kill_something_info(int sig, struct siginfo *info, int pid) { + int ret; + rcu_read_lock(); if (!pid) { - return kill_pg_info(sig, info, process_group(current)); + ret = kill_pgrp_info(sig, info, task_pgrp(current)); } else if (pid == -1) { int retval = 0, count = 0; struct task_struct * p; @@ -1207,12 +1209,14 @@ static int kill_something_info(int sig, struct siginfo *info, int pid) } } read_unlock(&tasklist_lock); - return count ? retval : -ESRCH; + ret = count ? retval : -ESRCH; } else if (pid < 0) { - return kill_pg_info(sig, info, -pid); + ret = kill_pgrp_info(sig, info, find_pid(-pid)); } else { - return kill_proc_info(sig, info, pid); + ret = kill_pid_info(sig, info, find_pid(pid)); } + rcu_read_unlock(); + return ret; } /* -- cgit v1.2.2 From 04a2e6a5cbf84e85fe86de0a18f6509b147e1d89 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:52:56 -0800 Subject: [PATCH] pid: make session_of_pgrp use struct pid instead of pid_t To properly implement a pid namespace I need to deal exclusively in terms of struct pid, because pid_t values become ambiguous. To this end session_of_pgrp is transformed to take and return a struct pid pointer. To avoid the need to worry about reference counting I now require my caller to hold the appropriate locks. Leaving callers responsible for increasing the reference count if they need access to the result outside of the locks.
Since session_of_pgrp currently only has one caller and that caller only tests the result for equality with another process group, the locking change means I don't actually have to acquire the tasklist_lock at all. tiocspgrp is also modified to take and release the lock. The logic there is a little more complicated but nothing I won't need when I convert pgrp of a tty to a struct pid pointer. Signed-off-by: Eric W. Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 14f17033f563..3ac6a7a6f857 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -185,21 +185,19 @@ repeat: * This checks not only the pgrp, but falls back on the pid if no * satisfactory pgrp is found. I dunno - gdb doesn't work correctly * without this... + * + * The caller must hold rcu lock or the tasklist lock. */ -int session_of_pgrp(int pgrp) +struct pid *session_of_pgrp(struct pid *pgrp) { struct task_struct *p; - int sid = 0; - - read_lock(&tasklist_lock); + struct pid *sid = NULL; - p = find_task_by_pid_type(PIDTYPE_PGID, pgrp); + p = pid_task(pgrp, PIDTYPE_PGID); if (p == NULL) - p = find_task_by_pid(pgrp); + p = pid_task(pgrp, PIDTYPE_PID); if (p != NULL) - sid = process_session(p); - - read_unlock(&tasklist_lock); + sid = task_session(p); return sid; } -- cgit v1.2.2 From 0475ac0845f9295bc5f69af45f58dff2c104c8d1 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:52:57 -0800 Subject: [PATCH] pid: use struct pid for talking about process groups in exit.c Modify has_stopped_jobs and will_become_orphaned_pgrp to use struct pid based process groups. This reduces the number of hash table lookups and paves the way for multiple pid spaces. Signed-off-by: Eric W.
Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 42 ++++++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 3ac6a7a6f857..407b80aaefda 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -210,22 +210,22 @@ struct pid *session_of_pgrp(struct pid *pgrp) * * "I ask you, have you ever known what it is to be an orphan?" */ -static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task) +static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignored_task) { struct task_struct *p; int ret = 1; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p == ignored_task || p->exit_state || is_init(p->real_parent)) continue; - if (process_group(p->real_parent) != pgrp && - process_session(p->real_parent) == process_session(p)) { + if (task_pgrp(p->real_parent) != pgrp && + task_session(p->real_parent) == task_session(p)) { ret = 0; break; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return ret; /* (sighing) "Often!" 
*/ } @@ -234,23 +234,23 @@ int is_orphaned_pgrp(int pgrp) int retval; read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(pgrp, NULL); + retval = will_become_orphaned_pgrp(find_pid(pgrp), NULL); read_unlock(&tasklist_lock); return retval; } -static int has_stopped_jobs(int pgrp) +static int has_stopped_jobs(struct pid *pgrp) { int retval = 0; struct task_struct *p; - do_each_task_pid(pgrp, PIDTYPE_PGID, p) { + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { if (p->state != TASK_STOPPED) continue; retval = 1; break; - } while_each_task_pid(pgrp, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); return retval; } @@ -648,14 +648,14 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) * than we are, and it was the only connection * outside, so the child pgrp is now orphaned. */ - if ((process_group(p) != process_group(father)) && - (process_session(p) == process_session(father))) { - int pgrp = process_group(p); + if ((task_pgrp(p) != task_pgrp(father)) && + (task_session(p) == task_session(father))) { + struct pid *pgrp = task_pgrp(p); if (will_become_orphaned_pgrp(pgrp, NULL) && has_stopped_jobs(pgrp)) { - __kill_pg_info(SIGHUP, SEND_SIG_PRIV, pgrp); - __kill_pg_info(SIGCONT, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } } } @@ -735,6 +735,7 @@ static void exit_notify(struct task_struct *tsk) int state; struct task_struct *t; struct list_head ptrace_dead, *_p, *_n; + struct pid *pgrp; if (signal_pending(tsk) && !(tsk->signal->flags & SIGNAL_GROUP_EXIT) && !thread_group_empty(tsk)) { @@ -787,12 +788,13 @@ static void exit_notify(struct task_struct *tsk) t = tsk->real_parent; - if ((process_group(t) != process_group(tsk)) && - (process_session(t) == process_session(tsk)) && - will_become_orphaned_pgrp(process_group(tsk), tsk) && - has_stopped_jobs(process_group(tsk))) { - __kill_pg_info(SIGHUP, SEND_SIG_PRIV, process_group(tsk)); - 
__kill_pg_info(SIGCONT, SEND_SIG_PRIV, process_group(tsk)); + pgrp = task_pgrp(tsk); + if ((task_pgrp(t) != pgrp) && + (task_session(t) != task_session(tsk)) && + will_become_orphaned_pgrp(pgrp, tsk) && + has_stopped_jobs(pgrp)) { + __kill_pgrp_info(SIGHUP, SEND_SIG_PRIV, pgrp); + __kill_pgrp_info(SIGCONT, SEND_SIG_PRIV, pgrp); } /* Let father know we died -- cgit v1.2.2 From 3e7cd6c413c9e6fbb5e1ee2acdadb4ababd2d474 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:52:58 -0800 Subject: [PATCH] pid: replace is_orphaned_pgrp with is_current_pgrp_orphaned Every call to is_orphaned_pgrp passed in process_group(current) which is racy with respect to another thread changing our process group. It didn't bite us because we were dealing with integers and the worst we would get would be a stale answer. In switching the checks to use struct pid to be a little more efficient and prepare the way for pid namespaces this race became apparent. So I simplified the calls to the more specialized is_current_pgrp_orphaned so I didn't have to worry about making logic changes to avoid the race. Signed-off-by: Eric W. Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/exit.c | 4 ++-- kernel/signal.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 407b80aaefda..f132349c0325 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -229,12 +229,12 @@ static int will_become_orphaned_pgrp(struct pid *pgrp, struct task_struct *ignor return ret; /* (sighing) "Often!"
*/ } -int is_orphaned_pgrp(int pgrp) +int is_current_pgrp_orphaned(void) { int retval; read_lock(&tasklist_lock); - retval = will_become_orphaned_pgrp(find_pid(pgrp), NULL); + retval = will_become_orphaned_pgrp(task_pgrp(current), NULL); read_unlock(&tasklist_lock); return retval; diff --git a/kernel/signal.c b/kernel/signal.c index de66def71644..a9b679ed795c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1909,7 +1909,7 @@ relock: /* signals can be posted during this window */ - if (is_orphaned_pgrp(process_group(current))) + if (is_current_pgrp_orphaned()) goto relock; spin_lock_irq(&current->sighand->siglock); -- cgit v1.2.2 From ab521dc0f8e117fd808d3e425216864d60390500 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:53:00 -0800 Subject: [PATCH] tty: update the tty layer to work with struct pid Of kernel subsystems that work with pids the tty layer is probably the largest consumer. But it has the nice virtue that the association with a session only lasts until the session leader exits. Which means that no reference counting is required. So using struct pid winds up being a simple optimization to avoid hash table lookups. In the long term the use of pid_nr also ensures that when we have multiple pid spaces mixed everything will work correctly. Signed-off-by: Eric W.
Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 2 +- kernel/sys.c | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 80284eb488ce..0b6293d94d96 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -869,7 +869,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts sig->it_prof_incr = cputime_zero; sig->leader = 0; /* session leadership doesn't inherit */ - sig->tty_old_pgrp = 0; + sig->tty_old_pgrp = NULL; sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; diff --git a/kernel/sys.c b/kernel/sys.c index e1024383314d..efcf76e0dada 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1510,7 +1510,6 @@ asmlinkage long sys_setsid(void) spin_lock(&group_leader->sighand->siglock); group_leader->signal->tty = NULL; - group_leader->signal->tty_old_pgrp = 0; spin_unlock(&group_leader->sighand->siglock); err = process_group(group_leader); -- cgit v1.2.2 From 41487c65bfcce9c8e4d123da1719fcfd8df6d4d0 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:53:01 -0800 Subject: [PATCH] pid: replace do/while_each_task_pid with do/while_each_pid_task There isn't any real advantage to this change except that it allows the old functions to be removed. Which is easier on maintenance and puts the code in a more uniform style. Signed-off-by: Eric W. 
Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/capability.c | 8 +++++--- kernel/sys.c | 40 ++++++++++++++++++++++++---------------- 2 files changed, 29 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index edb845a6e84a..c8d3c7762034 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -92,15 +92,17 @@ out: * cap_set_pg - set capabilities for all processes in a given process * group. We call this holding task_capability_lock and tasklist_lock. */ -static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, +static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { struct task_struct *g, *target; int ret = -EPERM; int found = 0; + struct pid *pgrp; - do_each_task_pid(pgrp, PIDTYPE_PGID, g) { + pgrp = find_pid(pgrp_nr); + do_each_pid_task(pgrp, PIDTYPE_PGID, g) { target = g; while_each_thread(g, target) { if (!security_capset_check(target, effective, @@ -113,7 +115,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective, } found = 1; } - } while_each_task_pid(pgrp, PIDTYPE_PGID, g); + } while_each_pid_task(pgrp, PIDTYPE_PGID, g); if (!found) ret = 0; diff --git a/kernel/sys.c b/kernel/sys.c index efcf76e0dada..123b165080e6 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -596,6 +596,7 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) struct task_struct *g, *p; struct user_struct *user; int error = -EINVAL; + struct pid *pgrp; if (which > 2 || which < 0) goto out; @@ -610,18 +611,21 @@ asmlinkage long sys_setpriority(int which, int who, int niceval) read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: - if (!who) - who = current->pid; - p = find_task_by_pid(who); + if (who) + p = find_task_by_pid(who); + else + p = current; if (p) error = set_one_prio(p, niceval, error); break; case PRIO_PGRP: - if (!who) - who = 
process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + if (who) + pgrp = find_pid(who); + else + pgrp = task_pgrp(current); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { error = set_one_prio(p, niceval, error); - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -656,6 +660,7 @@ asmlinkage long sys_getpriority(int which, int who) struct task_struct *g, *p; struct user_struct *user; long niceval, retval = -ESRCH; + struct pid *pgrp; if (which > 2 || which < 0) return -EINVAL; @@ -663,9 +668,10 @@ asmlinkage long sys_getpriority(int which, int who) read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: - if (!who) - who = current->pid; - p = find_task_by_pid(who); + if (who) + p = find_task_by_pid(who); + else + p = current; if (p) { niceval = 20 - task_nice(p); if (niceval > retval) @@ -673,13 +679,15 @@ asmlinkage long sys_getpriority(int which, int who) } break; case PRIO_PGRP: - if (!who) - who = process_group(current); - do_each_task_pid(who, PIDTYPE_PGID, p) { + if (who) + pgrp = find_pid(who); + else + pgrp = task_pgrp(current); + do_each_pid_task(pgrp, PIDTYPE_PGID, p) { niceval = 20 - task_nice(p); if (niceval > retval) retval = niceval; - } while_each_task_pid(who, PIDTYPE_PGID, p); + } while_each_pid_task(pgrp, PIDTYPE_PGID, p); break; case PRIO_USER: user = current->user; @@ -1388,7 +1396,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) if (p->real_parent == group_leader) { err = -EPERM; - if (process_session(p) != process_session(group_leader)) + if (task_session(p) != task_session(group_leader)) goto out; err = -EACCES; if (p->did_exec) @@ -1407,7 +1415,7 @@ asmlinkage long sys_setpgid(pid_t pid, pid_t pgid) struct task_struct *g = find_task_by_pid_type(PIDTYPE_PGID, pgid); - if (!g || process_session(g) != process_session(group_leader)) + if (!g || task_session(g) != task_session(group_leader)) goto out; } -- cgit v1.2.2 From 
27b0b2f44adffe0193a695bb528a83b550b8e54b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 12 Feb 2007 00:53:02 -0800 Subject: [PATCH] pid: remove the now unused kill_pg kill_pg_info and __kill_pg_info Now that I have changed all of the in-tree users remove the old version of these functions. This should make it clear to any out of tree users that they should be using kill_pgrp kill_pgrp_info or __kill_pgrp_info instead. Signed-off-by: Eric W. Biederman Cc: Alan Cox Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/signal.c | 27 --------------------------- 1 file changed, 27 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index a9b679ed795c..8072e568bbe0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1096,26 +1096,6 @@ int kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp) return retval; } -int __kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) -{ - if (pgrp <= 0) - return -EINVAL; - - return __kill_pgrp_info(sig, info, find_pid(pgrp)); -} - -int -kill_pg_info(int sig, struct siginfo *info, pid_t pgrp) -{ - int retval; - - read_lock(&tasklist_lock); - retval = __kill_pg_info(sig, info, pgrp); - read_unlock(&tasklist_lock); - - return retval; -} - int kill_pid_info(int sig, struct siginfo *info, struct pid *pid) { int error; @@ -1314,12 +1294,6 @@ int kill_pid(struct pid *pid, int sig, int priv) } EXPORT_SYMBOL(kill_pid); -int -kill_pg(pid_t pgrp, int sig, int priv) -{ - return kill_pg_info(sig, __si_special(priv), pgrp); -} - int kill_proc(pid_t pid, int sig, int priv) { @@ -1959,7 +1933,6 @@ EXPORT_SYMBOL(recalc_sigpending); EXPORT_SYMBOL_GPL(dequeue_signal); EXPORT_SYMBOL(flush_signals); EXPORT_SYMBOL(force_sig); -EXPORT_SYMBOL(kill_pg); EXPORT_SYMBOL(kill_proc); EXPORT_SYMBOL(ptrace_notify); EXPORT_SYMBOL(send_sig); -- cgit v1.2.2 From ff91691bccdb741efb2df0489058a4961fa79598 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Mon, 12 Feb 2007 00:53:51 
-0800 Subject: [PATCH] sched: avoid div in rebalance_tick Avoid expensive integer divide 3 times per CPU per tick. A userspace test of this loop went from 26ns, down to 19ns on a G5; and from 123ns down to 28ns on a P3. (Also avoid a variable bit shift, as suggested by Alan. The effect of this wasn't noticeable on the CPUs I tested with). Signed-off-by: Nick Piggin Cc: Ingo Molnar Cc: Alan Cox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sched.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 1fd67e16cd31..08f86178aa34 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2897,14 +2897,16 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static void update_load(struct rq *this_rq) { unsigned long this_load; - int i, scale; + unsigned int i, scale; this_load = this_rq->raw_weighted_load; /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { + for (i = 0, scale = 1; i < 3; i++, scale += scale) { unsigned long old_load, new_load; + /* scale is effectively 1 << i now, and >> i divides by scale */ + old_load = this_rq->cpu_load[i]; new_load = this_load; /* @@ -2914,7 +2916,7 @@ static void update_load(struct rq *this_rq) */ if (new_load > old_load) new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } } -- cgit v1.2.2 From 9a32144e9d7b4e21341174b1a83b82a82353be86 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 12 Feb 2007 00:55:35 -0800 Subject: [PATCH] mark struct file_operations const 7 Many struct file_operations in the kernel can be "const". Marking them const moves these to the .rodata section, which avoids false sharing with potential dirty data. In addition it'll catch accidental writes at compile time to these shared resources.
Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6b05dc69c959..232aed2b10f9 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2656,7 +2656,7 @@ static int cpuset_open(struct inode *inode, struct file *file) return single_open(file, proc_cpuset_show, pid); } -struct file_operations proc_cpuset_operations = { +const struct file_operations proc_cpuset_operations = { .open = cpuset_open, .read = seq_read, .llseek = seq_lseek, -- cgit v1.2.2 From 92e1d5be91a0e3ffa5c4697eeb09b2aa22792122 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Mon, 12 Feb 2007 00:55:39 -0800 Subject: [PATCH] mark struct inode_operations const 2 Many struct inode_operations in the kernel can be "const". Marking them const moves these to the .rodata section, which avoids false sharing with potential dirty data. In addition it'll catch accidental writes at compile time to these shared resources. Signed-off-by: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 232aed2b10f9..f382b0f775e1 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1540,7 +1540,7 @@ static const struct file_operations cpuset_file_operations = { .release = cpuset_file_release, }; -static struct inode_operations cpuset_dir_inode_operations = { +static const struct inode_operations cpuset_dir_inode_operations = { .lookup = simple_lookup, .mkdir = cpuset_mkdir, .rmdir = cpuset_rmdir, -- cgit v1.2.2