From a7aea373b4ca428f1be2c1fedd2f26c8e3f2864d Mon Sep 17 00:00:00 2001 From: "Ira W. Snyder" Date: Thu, 23 Apr 2009 16:17:54 -0700 Subject: fsldma: use PCI Read Multiple command By default, the Freescale 83xx DMA controller uses the PCI Read Line command when reading data over the PCI bus. Setting the controller to use the PCI Read Multiple command instead allows the controller to read much larger bursts of data, which provides a drastic speed increase. The slowdown due to using PCI Read Line was only observed when a PCI-to-PCI bridge was between the devices trying to communicate. A simple test driver showed an increase from 4MB/sec to 116MB/sec when performing DMA over the PCI bus. Using DMA to transfer between blocks of local SDRAM showed no change in performance with this patch. The dmatest driver was also used to verify the correctness of the transfers, and showed no errors. Signed-off-by: Ira W. Snyder Acked-by: Timur Tabi Acked-by: Kumar Gala Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 10 ++++++++-- drivers/dma/fsldma.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index f18d1bde0439..a1cb25e277b5 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -12,6 +12,11 @@ * also fit for MPC8560, MPC8555, MPC8548, MPC8641, and etc. * The support for MPC8349 DMA contorller is also added. * + * This driver instructs the DMA controller to issue the PCI Read Multiple + * command for PCI read operations, instead of using the default PCI Read Line + * command. Please be aware that this setting may result in read pre-fetching + * on some platforms. 
+ * * This is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -49,9 +54,10 @@ static void dma_init(struct fsl_dma_chan *fsl_chan) case FSL_DMA_IP_83XX: /* Set the channel to below modes: * EOTIE - End-of-transfer interrupt enable + * PRC_RM - PCI read multiple */ - DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE, - 32); + DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE + | FSL_DMA_MR_PRC_RM, 32); break; } diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index 4f21a512d848..dc7f26865797 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -38,6 +38,7 @@ /* Special MR definition for MPC8349 */ #define FSL_DMA_MR_EOTIE 0x00000080 +#define FSL_DMA_MR_PRC_RM 0x00000800 #define FSL_DMA_SR_CH 0x00000020 #define FSL_DMA_SR_PE 0x00000010 -- cgit v1.2.2 From be30b226f2ae618cd719e40267d9923db1db9001 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Thu, 28 May 2009 09:20:42 +0000 Subject: fsldma: enable external start for the 83xx controller The 83xx controller has external start capability, but lacks external pause capability. Hook up the external start function pointer for the 83xx controller. Signed-off-by: Ira W. 
Snyder Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index a1cb25e277b5..10bcf0cb0efc 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -877,9 +877,9 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, switch (new_fsl_chan->feature & FSL_DMA_IP_MASK) { case FSL_DMA_IP_85XX: - new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; case FSL_DMA_IP_83XX: + new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; } -- cgit v1.2.2 From 43a1a3ed6bf5a1b9ae197b4f5f20033baf19db61 Mon Sep 17 00:00:00 2001 From: Ira Snyder Date: Thu, 28 May 2009 09:26:40 +0000 Subject: fsldma: do not clear bandwidth control bits on the 83xx controller The 83xx controller does not support the external pause feature. The bit in the mode register that controls external pause on the 85xx controller happens to be part of the bandwidth control settings for the 83xx controller. This patch fixes the driver so that it only clears the external pause bit if the hardware is the 85xx controller. When driving the 83xx controller, the bit is left untouched. This follows the existing convention that mode registers settings are not touched unless necessary. Signed-off-by: Ira W. 
Snyder Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 10bcf0cb0efc..6e60c77a145c 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -147,10 +147,11 @@ static void dma_start(struct fsl_dma_chan *fsl_chan) if (fsl_chan->feature & FSL_DMA_CHAN_PAUSE_EXT) { DMA_OUT(fsl_chan, &fsl_chan->reg_base->bcr, 0, 32); mr_set |= FSL_DMA_MR_EMP_EN; - } else + } else if ((fsl_chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) { DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) & ~FSL_DMA_MR_EMP_EN, 32); + } if (fsl_chan->feature & FSL_DMA_CHAN_START_EXT) mr_set |= FSL_DMA_MR_EMS_EN; -- cgit v1.2.2 From 04a820ead0838c76e9c1242feb5e71048bf3e9dc Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 30 Jun 2009 02:14:00 -0400 Subject: olpc_battery: Fix up eeprom read function The eeprom read function was placing values into the wrong place in 'buf'; we were starting from buf[off], rather than buf[0]. Also, the for loop that we were using was much uglier than it needed to be. This cleans it up a bit. Signed-off-by: Andres Salomon Signed-off-by: Anton Vorontsov --- drivers/power/olpc_battery.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 5fbca2681baa..9c216dd41550 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. 
*/ +#include #include #include #include @@ -334,21 +335,21 @@ static ssize_t olpc_bat_eeprom_read(struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { uint8_t ec_byte; - int ret, end; + int ret; + int i; if (off >= EEPROM_SIZE) return 0; if (off + count > EEPROM_SIZE) count = EEPROM_SIZE - off; - end = EEPROM_START + off + count; - for (ec_byte = EEPROM_START + off; ec_byte < end; ec_byte++) { - ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, - &buf[ec_byte - EEPROM_START], 1); + for (i = 0; i < count; i++) { + ec_byte = EEPROM_START + off + i; + ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, &buf[i], 1); if (ret) { - printk(KERN_ERR "olpc-battery: EC command " - "EC_BAT_EEPROM @ 0x%x failed -" - " %d!\n", ec_byte, ret); + pr_err("olpc-battery: " + "EC_BAT_EEPROM cmd @ 0x%x failed - %d!\n", + ec_byte, ret); return -EIO; } } -- cgit v1.2.2 From 8f7e57985fa794ab6afdcd3642581d9e1fe6de31 Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Tue, 30 Jun 2009 02:16:17 -0400 Subject: olpc_battery: Ensure that the TRICKLE bit is checked There are times when the battery is present but trickle charging, and the EC sets only the TRICKLE bit. So we must check for the bit when we're checking the charging/present status. 
Signed-off-by: Andres Salomon Signed-off-by: Anton Vorontsov --- drivers/power/olpc_battery.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 9c216dd41550..58e419299cd6 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -36,6 +36,7 @@ #define BAT_STAT_AC 0x10 #define BAT_STAT_CHARGING 0x20 #define BAT_STAT_DISCHARGING 0x40 +#define BAT_STAT_TRICKLE 0x80 #define BAT_ERR_INFOFAIL 0x02 #define BAT_ERR_OVERVOLTAGE 0x04 @@ -90,7 +91,7 @@ static char bat_serial[17]; /* Ick */ static int olpc_bat_get_status(union power_supply_propval *val, uint8_t ec_byte) { if (olpc_platform_info.ecver > 0x44) { - if (ec_byte & BAT_STAT_CHARGING) + if (ec_byte & (BAT_STAT_CHARGING | BAT_STAT_TRICKLE)) val->intval = POWER_SUPPLY_STATUS_CHARGING; else if (ec_byte & BAT_STAT_DISCHARGING) val->intval = POWER_SUPPLY_STATUS_DISCHARGING; @@ -220,7 +221,8 @@ static int olpc_bat_get_property(struct power_supply *psy, It doesn't matter though -- the EC will return the last-known information, and it's as if we just ran that _little_ bit faster and managed to read it out before the battery went away. */ - if (!(ec_byte & BAT_STAT_PRESENT) && psp != POWER_SUPPLY_PROP_PRESENT) + if (!(ec_byte & (BAT_STAT_PRESENT | BAT_STAT_TRICKLE)) && + psp != POWER_SUPPLY_PROP_PRESENT) return -ENODEV; switch (psp) { @@ -230,7 +232,8 @@ static int olpc_bat_get_property(struct power_supply *psy, return ret; break; case POWER_SUPPLY_PROP_PRESENT: - val->intval = !!(ec_byte & BAT_STAT_PRESENT); + val->intval = !!(ec_byte & (BAT_STAT_PRESENT | + BAT_STAT_TRICKLE)); break; case POWER_SUPPLY_PROP_HEALTH: -- cgit v1.2.2 From bfdb46ce8494eae30dbaae65c81e684e6db6228b Mon Sep 17 00:00:00 2001 From: Ryan Mallon Date: Thu, 18 Jun 2009 11:26:26 +1200 Subject: Add ds2782 battery gas gauge driver This patch adds a driver for ds2782 battery devices. 
Signed-off-by: Ryan Mallon Signed-off-by: Anton Vorontsov --- drivers/power/Kconfig | 7 + drivers/power/Makefile | 1 + drivers/power/ds2782_battery.c | 330 +++++++++++++++++++++++++++++++++++++++++ 3 files changed, 338 insertions(+) create mode 100644 drivers/power/ds2782_battery.c (limited to 'drivers') diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 7eda34838bfe..bdbc4f73fcdc 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -43,6 +43,13 @@ config BATTERY_DS2760 help Say Y here to enable support for batteries with ds2760 chip. +config BATTERY_DS2782 + tristate "DS2782 standalone gas-gauge" + depends on I2C + help + Say Y here to enable support for the DS2782 standalone battery + gas-gauge. + config BATTERY_PMU tristate "Apple PMU battery" depends on PPC32 && ADB_PMU diff --git a/drivers/power/Makefile b/drivers/power/Makefile index daf3179689aa..380d17c9ae29 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_APM_POWER) += apm_power.o obj-$(CONFIG_WM8350_POWER) += wm8350_power.o obj-$(CONFIG_BATTERY_DS2760) += ds2760_battery.o +obj-$(CONFIG_BATTERY_DS2782) += ds2782_battery.o obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o diff --git a/drivers/power/ds2782_battery.c b/drivers/power/ds2782_battery.c new file mode 100644 index 000000000000..da14f374cb60 --- /dev/null +++ b/drivers/power/ds2782_battery.c @@ -0,0 +1,330 @@ +/* + * I2C client/driver for the Maxim/Dallas DS2782 Stand-Alone Fuel Gauge IC + * + * Copyright (C) 2009 Bluewater Systems Ltd + * + * Author: Ryan Mallon + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DS2782_REG_RARC 0x06 /* Remaining active relative capacity */ + +#define DS2782_REG_VOLT_MSB 0x0c +#define DS2782_REG_TEMP_MSB 0x0a +#define DS2782_REG_CURRENT_MSB 0x0e + +/* EEPROM Block */ +#define DS2782_REG_RSNSP 0x69 /* Sense resistor value */ + +/* Current unit measurement in uA for a 1 milli-ohm sense resistor */ +#define DS2782_CURRENT_UNITS 1563 + +#define to_ds2782_info(x) container_of(x, struct ds2782_info, battery) + +struct ds2782_info { + struct i2c_client *client; + struct power_supply battery; + int id; +}; + +static DEFINE_IDR(battery_id); +static DEFINE_MUTEX(battery_lock); + +static inline int ds2782_read_reg(struct ds2782_info *info, int reg, u8 *val) +{ + int ret; + + ret = i2c_smbus_read_byte_data(info->client, reg); + if (ret < 0) { + dev_err(&info->client->dev, "register read failed\n"); + return ret; + } + + *val = ret; + return 0; +} + +static inline int ds2782_read_reg16(struct ds2782_info *info, int reg_msb, + s16 *val) +{ + int ret; + + ret = swab16(i2c_smbus_read_word_data(info->client, reg_msb)); + if (ret < 0) { + dev_err(&info->client->dev, "register read failed\n"); + return ret; + } + + *val = ret; + return 0; +} + +static int ds2782_get_temp(struct ds2782_info *info, int *temp) +{ + s16 raw; + int err; + + /* + * Temperature is measured in units of 0.125 degrees celcius, the + * power_supply class measures temperature in tenths of degrees + * celsius. The temperature value is stored as a 10 bit number, plus + * sign in the upper bits of a 16 bit register. + */ + err = ds2782_read_reg16(info, DS2782_REG_TEMP_MSB, &raw); + if (err) + return err; + *temp = ((raw / 32) * 125) / 100; + return 0; +} + +static int ds2782_get_current(struct ds2782_info *info, int *current_uA) +{ + int sense_res; + int err; + u8 sense_res_raw; + s16 raw; + + /* + * The units of measurement for current are dependent on the value of + * the sense resistor. 
+ */ + err = ds2782_read_reg(info, DS2782_REG_RSNSP, &sense_res_raw); + if (err) + return err; + if (sense_res_raw == 0) { + dev_err(&info->client->dev, "sense resistor value is 0\n"); + return -ENXIO; + } + sense_res = 1000 / sense_res_raw; + + dev_dbg(&info->client->dev, "sense resistor = %d milli-ohms\n", + sense_res); + err = ds2782_read_reg16(info, DS2782_REG_CURRENT_MSB, &raw); + if (err) + return err; + *current_uA = raw * (DS2782_CURRENT_UNITS / sense_res); + return 0; +} + +static int ds2782_get_voltage(struct ds2782_info *info, int *voltage_uA) +{ + s16 raw; + int err; + + /* + * Voltage is measured in units of 4.88mV. The voltage is stored as + * a 10-bit number plus sign, in the upper bits of a 16-bit register + */ + err = ds2782_read_reg16(info, DS2782_REG_VOLT_MSB, &raw); + if (err) + return err; + *voltage_uA = (raw / 32) * 4800; + return 0; +} + +static int ds2782_get_capacity(struct ds2782_info *info, int *capacity) +{ + int err; + u8 raw; + + err = ds2782_read_reg(info, DS2782_REG_RARC, &raw); + if (err) + return err; + *capacity = raw; + return raw; +} + +static int ds2782_get_status(struct ds2782_info *info, int *status) +{ + int err; + int current_uA; + int capacity; + + err = ds2782_get_current(info, &current_uA); + if (err) + return err; + + err = ds2782_get_capacity(info, &capacity); + if (err) + return err; + + if (capacity == 100) + *status = POWER_SUPPLY_STATUS_FULL; + else if (current_uA == 0) + *status = POWER_SUPPLY_STATUS_NOT_CHARGING; + else if (current_uA < 0) + *status = POWER_SUPPLY_STATUS_DISCHARGING; + else + *status = POWER_SUPPLY_STATUS_CHARGING; + + return 0; +} + +static int ds2782_battery_get_property(struct power_supply *psy, + enum power_supply_property prop, + union power_supply_propval *val) +{ + struct ds2782_info *info = to_ds2782_info(psy); + int ret; + + switch (prop) { + case POWER_SUPPLY_PROP_STATUS: + ret = ds2782_get_status(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_CAPACITY: + ret = 
ds2782_get_capacity(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = ds2782_get_voltage(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_CURRENT_NOW: + ret = ds2782_get_current(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_TEMP: + ret = ds2782_get_temp(info, &val->intval); + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static enum power_supply_property ds2782_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_TEMP, +}; + +static void ds2782_power_supply_init(struct power_supply *battery) +{ + battery->type = POWER_SUPPLY_TYPE_BATTERY; + battery->properties = ds2782_battery_props; + battery->num_properties = ARRAY_SIZE(ds2782_battery_props); + battery->get_property = ds2782_battery_get_property; + battery->external_power_changed = NULL; +} + +static int ds2782_battery_remove(struct i2c_client *client) +{ + struct ds2782_info *info = i2c_get_clientdata(client); + + power_supply_unregister(&info->battery); + kfree(info->battery.name); + + mutex_lock(&battery_lock); + idr_remove(&battery_id, info->id); + mutex_unlock(&battery_lock); + + i2c_set_clientdata(client, info); + + kfree(info); + return 0; +} + +static int ds2782_battery_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ds2782_info *info; + int ret; + int num; + + /* Get an ID for this battery */ + ret = idr_pre_get(&battery_id, GFP_KERNEL); + if (ret == 0) { + ret = -ENOMEM; + goto fail_id; + } + + mutex_lock(&battery_lock); + ret = idr_get_new(&battery_id, client, &num); + mutex_unlock(&battery_lock); + if (ret < 0) + goto fail_id; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto fail_info; + } + + info->battery.name = kasprintf(GFP_KERNEL, "ds2782-%d", num); + if (!info->battery.name) { + ret = -ENOMEM; + goto fail_name; + } + + i2c_set_clientdata(client, info); + 
info->client = client; + ds2782_power_supply_init(&info->battery); + + ret = power_supply_register(&client->dev, &info->battery); + if (ret) { + dev_err(&client->dev, "failed to register battery\n"); + goto fail_register; + } + + return 0; + +fail_register: + kfree(info->battery.name); +fail_name: + i2c_set_clientdata(client, info); + kfree(info); +fail_info: + mutex_lock(&battery_lock); + idr_remove(&battery_id, num); + mutex_unlock(&battery_lock); +fail_id: + return ret; +} + +static const struct i2c_device_id ds2782_id[] = { + {"ds2782", 0}, + {}, +}; + +static struct i2c_driver ds2782_battery_driver = { + .driver = { + .name = "ds2782-battery", + }, + .probe = ds2782_battery_probe, + .remove = ds2782_battery_remove, + .id_table = ds2782_id, +}; + +static int __init ds2782_init(void) +{ + return i2c_add_driver(&ds2782_battery_driver); +} +module_init(ds2782_init); + +static void __exit ds2782_exit(void) +{ + i2c_del_driver(&ds2782_battery_driver); +} +module_exit(ds2782_exit); + +MODULE_AUTHOR("Ryan Mallon "); +MODULE_DESCRIPTION("Maxim/Dallas DS2782 Stand-Alone Fuel Gauage IC driver"); +MODULE_LICENSE("GPL"); -- cgit v1.2.2 From daf4219dbcbb2efcd638fcd3c29a622e1c18cc38 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 1 Jul 2009 16:12:53 -0700 Subject: dmaengine: move HIGHMEM64G restriction to ASYNC_TX_DMA On HIGHMEM64G systems dma_addr_t is known to be larger than (void *) which precludes async_xor from performing dma address conversions by reusing the input parameter address list. However, other parts of the dmaengine infrastructure do not suffer this constraint, so the HIGHMEM64G restriction can be down-levelled. 
Signed-off-by: Dan Williams --- drivers/dma/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 3b3c01b6f1ee..babf214a509b 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -4,7 +4,7 @@ menuconfig DMADEVICES bool "DMA Engine support" - depends on !HIGHMEM64G && HAS_DMA + depends on HAS_DMA help DMA engines can do asynchronous data transfers without involving the host CPU. Currently, this framework can be @@ -100,7 +100,7 @@ config NET_DMA config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" - depends on DMA_ENGINE + depends on DMA_ENGINE && !HIGHMEM64G help This allows the async_tx api to take advantage of offload engines for memcpy, memset, xor, and raid6 p+q operations. If your platform has -- cgit v1.2.2 From 5381837f125cc62ad703fbcdfcd7566fc81fd404 Mon Sep 17 00:00:00 2001 From: Tom Peng Date: Wed, 1 Jul 2009 20:37:26 +0800 Subject: [SCSI] libsas: reuse the original port when hotplugging phys in wide ports There's a hotplug problem in the way libsas allocates ports: it loops over the available ports first trying to add to an existing for a wide port and otherwise allocating the next free port. This scheme only works if the port array is packed from zero, which fails if a port gets hot unplugged and the array becomes sparse. In that case, a new port is formed even if there's a wide port it should be part of. Fix this by creating two loops over all the ports: the first to see if the phy should be part of a wide port and the second to form a new port in an empty port slot. 
Signed-off-by: Tom Peng Signed-off-by: Jack Wang Signed-off-by: Lindar Liu Cc: Stable Tree Signed-off-by: James Bottomley --- drivers/scsi/libsas/sas_port.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c index e6ac59c023f1..fe8b74c706d2 100644 --- a/drivers/scsi/libsas/sas_port.c +++ b/drivers/scsi/libsas/sas_port.c @@ -56,7 +56,7 @@ static void sas_form_port(struct asd_sas_phy *phy) } } - /* find a port */ + /* see if the phy should be part of a wide port */ spin_lock_irqsave(&sas_ha->phy_port_lock, flags); for (i = 0; i < sas_ha->num_phys; i++) { port = sas_ha->sas_port[i]; @@ -69,12 +69,23 @@ static void sas_form_port(struct asd_sas_phy *phy) SAS_DPRINTK("phy%d matched wide port%d\n", phy->id, port->id); break; - } else if (*(u64 *) port->sas_addr == 0 && port->num_phys==0) { - memcpy(port->sas_addr, phy->sas_addr, SAS_ADDR_SIZE); - break; } spin_unlock(&port->phy_list_lock); } + /* The phy does not match any existing port, create a new one */ + if (i == sas_ha->num_phys) { + for (i = 0; i < sas_ha->num_phys; i++) { + port = sas_ha->sas_port[i]; + spin_lock(&port->phy_list_lock); + if (*(u64 *)port->sas_addr == 0 + && port->num_phys == 0) { + memcpy(port->sas_addr, phy->sas_addr, + SAS_ADDR_SIZE); + break; + } + spin_unlock(&port->phy_list_lock); + } + } if (i >= sas_ha->num_phys) { printk(KERN_NOTICE "%s: couldn't find a free port, bug?\n", -- cgit v1.2.2 From e3d433040ee6077e33d4ad22e2f60a38b085786d Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 28 Jun 2009 09:26:20 -0700 Subject: drivers/dma/fsldma.c: Remove unnecessary semicolons Signed-off-by: Joe Perches Signed-off-by: Dan Williams --- drivers/dma/fsldma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 6e60c77a145c..ef87a8984145 100644 --- a/drivers/dma/fsldma.c +++ 
b/drivers/dma/fsldma.c @@ -142,7 +142,7 @@ static int dma_is_idle(struct fsl_dma_chan *fsl_chan) static void dma_start(struct fsl_dma_chan *fsl_chan) { - u32 mr_set = 0;; + u32 mr_set = 0; if (fsl_chan->feature & FSL_DMA_CHAN_PAUSE_EXT) { DMA_OUT(fsl_chan, &fsl_chan->reg_base->bcr, 0, 32); -- cgit v1.2.2 From c019894efc9c9ba5939948caa78c133b1ec8ae63 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 28 Jun 2009 09:26:21 -0700 Subject: drivers/dma: Remove unnecessary semicolons Signed-off-by: Joe Perches Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 2 +- drivers/dma/mv_xor.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index fb7da5141e96..cec1ec0b7d00 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -114,7 +114,7 @@ static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len) buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); for ( ; i < start + len; i++) buf[i] = PATTERN_SRC | PATTERN_COPY - | (~i & PATTERN_COUNT_MASK);; + | (~i & PATTERN_COUNT_MASK); for ( ; i < test_buf_size; i++) buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); buf++; diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index ddab94f51224..3f23eabe09f2 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1176,7 +1176,7 @@ static int __devinit mv_xor_probe(struct platform_device *pdev) if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) dma_dev->device_prep_dma_memset = mv_xor_prep_dma_memset; if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { - dma_dev->max_xor = 8; ; + dma_dev->max_xor = 8; dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor; } -- cgit v1.2.2 From 0a2ff57d6fba92842272889b4bca447344cd9d36 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 3 Jul 2009 19:26:51 +0200 Subject: dmaengine: dmatest: add a maximum number of test iterations The dmatest usually waits for the killing of its kthreads to stop running tests. 
This patch adds a parameter that sets a maximum number of test iterations. This feature is quite interesting for debugging when you set a lot of traces in your dmaengine controller driver. Signed-off-by: Nicolas Ferre Cc: Haavard Skinnemoen Acked-by: Maciej Sosnowski Signed-off-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index cec1ec0b7d00..2d973d60e7b9 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO); MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); +static unsigned int iterations; +module_param(iterations, uint, S_IRUGO); +MODULE_PARM_DESC(iterations, + "Iterations before stopping test (default: infinite)"); + static unsigned int xor_sources = 3; module_param(xor_sources, uint, S_IRUGO); MODULE_PARM_DESC(xor_sources, @@ -270,7 +275,8 @@ static int dmatest_func(void *data) flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT; - while (!kthread_should_stop()) { + while (!kthread_should_stop() + && !(iterations && total_tests >= iterations)) { struct dma_device *dev = chan->device; struct dma_async_tx_descriptor *tx = NULL; dma_addr_t dma_srcs[src_cnt]; @@ -416,6 +422,13 @@ err_srcbuf: err_srcs: pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", thread_name, total_tests, failed_tests, ret); + + if (iterations > 0) + while (!kthread_should_stop()) { + DECLARE_WAIT_QUEUE_HEAD(wait_dmatest_exit); + interruptible_sleep_on(&wait_dmatest_exit); + } + return ret; } -- cgit v1.2.2 From f1aef8b6e6abf32a3a269542f95a19e2cb319f6c Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Mon, 6 Jul 2009 18:19:44 +0200 Subject: dmaengine: dmatest: correct thread_count while using multiple thread per channel It seems that thread_count is not properly calculated in 
dmatest. In fact the thread count number that is returned from dmatest_add_threads() is not correctly added to the thread_count and thus not properly printed. Signed-off-by: Nicolas Ferre Acked-by: Haavard Skinnemoen Acked-by: Maciej Sosnowski Signed-off-by: Andrew Morton Signed-off-by: Dan Williams --- drivers/dma/dmatest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 2d973d60e7b9..d93017fc7872 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -508,11 +508,11 @@ static int dmatest_add_channel(struct dma_chan *chan) if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { cnt = dmatest_add_threads(dtc, DMA_MEMCPY); - thread_count += cnt > 0 ?: 0; + thread_count += cnt > 0 ? cnt : 0; } if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { cnt = dmatest_add_threads(dtc, DMA_XOR); - thread_count += cnt > 0 ?: 0; + thread_count += cnt > 0 ? cnt : 0; } pr_info("dmatest: Started %u threads using %s\n", -- cgit v1.2.2 From dc78baa2b90b289590911b40b6800f77d0dc935a Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Fri, 3 Jul 2009 19:24:33 +0200 Subject: dmaengine: at_hdmac: new driver for the Atmel AHB DMA Controller This AHB DMA Controller (aka HDMA or DMAC on AT91 systems) is available on at91sam9rl chip. It will be used on other products in the future. This first release covers only the memory-to-memory transfer type. This is the only transfer type supported by this chip. On other products, it will be used also for peripheral DMA transfer (slave API support to come). I used dmatest client without problem in different configurations to test it. 
Full documentation for this controller can be found in the SAM9RL datasheet: http://www.atmel.com/dyn/products/product_card.asp?part_id=4243 Signed-off-by: Nicolas Ferre Acked-by: Maciej Sosnowski Signed-off-by: Dan Williams --- drivers/dma/Kconfig | 8 + drivers/dma/Makefile | 1 + drivers/dma/at_hdmac.c | 1009 +++++++++++++++++++++++++++++++++++++++++++ drivers/dma/at_hdmac_regs.h | 386 +++++++++++++++++ 4 files changed, 1404 insertions(+) create mode 100644 drivers/dma/at_hdmac.c create mode 100644 drivers/dma/at_hdmac_regs.h (limited to 'drivers') diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index babf214a509b..bc8fb41cd623 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -46,6 +46,14 @@ config DW_DMAC Support the Synopsys DesignWare AHB DMA controller. This can be integrated in chips such as the Atmel AT32ap7000. +config AT_HDMAC + tristate "Atmel AHB DMA support" + depends on ARCH_AT91SAM9RL + select DMA_ENGINE + help + Support the Atmel AHB DMA controller. This can be integrated in + chips such as the Atmel AT91SAM9RL. 
+ config FSL_DMA tristate "Freescale Elo and Elo Plus DMA support" depends on FSL_SOC diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index 2e5dc96700d2..d7bc5fd17d84 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_FSL_DMA) += fsldma.o obj-$(CONFIG_MV_XOR) += mv_xor.o obj-$(CONFIG_DW_DMAC) += dw_dmac.o +obj-$(CONFIG_AT_HDMAC) += at_hdmac.o obj-$(CONFIG_MX3_IPU) += ipu/ diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c new file mode 100644 index 000000000000..64dbf0ce128e --- /dev/null +++ b/drivers/dma/at_hdmac.c @@ -0,0 +1,1009 @@ +/* + * Driver for the Atmel AHB DMA Controller (aka HDMA or DMAC on AT91 systems) + * + * Copyright (C) 2008 Atmel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * + * This supports the Atmel AHB DMA Controller, + * + * The driver has currently been tested with the Atmel AT91SAM9RL + * and AT91SAM9G45 series. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "at_hdmac_regs.h" + +/* + * Glossary + * -------- + * + * at_hdmac : Name of the ATmel AHB DMA Controller + * at_dma_ / atdma : ATmel DMA controller entity related + * atc_ / atchan : ATmel DMA Channel entity related + */ + +#define ATC_DEFAULT_CFG (ATC_FIFOCFG_HALFFIFO) +#define ATC_DEFAULT_CTRLA (0) +#define ATC_DEFAULT_CTRLB (ATC_SIF(0) \ + |ATC_DIF(1)) + +/* + * Initial number of descriptors to allocate for each channel. This could + * be increased during dma usage. 
+ */ +static unsigned int init_nr_desc_per_channel = 64; +module_param(init_nr_desc_per_channel, uint, 0644); +MODULE_PARM_DESC(init_nr_desc_per_channel, + "initial descriptors per channel (default: 64)"); + + +/* prototypes */ +static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx); + + +/*----------------------------------------------------------------------*/ + +static struct at_desc *atc_first_active(struct at_dma_chan *atchan) +{ + return list_first_entry(&atchan->active_list, + struct at_desc, desc_node); +} + +static struct at_desc *atc_first_queued(struct at_dma_chan *atchan) +{ + return list_first_entry(&atchan->queue, + struct at_desc, desc_node); +} + +/** + * atc_alloc_descriptor - allocate and return an initilized descriptor + * @chan: the channel to allocate descriptors for + * @gfp_flags: GFP allocation flags + * + * Note: The ack-bit is positioned in the descriptor flag at creation time + * to make initial allocation more convenient. This bit will be cleared + * and control will be given to client at usage time (during + * preparation functions). 
+ */ +static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan, + gfp_t gfp_flags) +{ + struct at_desc *desc = NULL; + struct at_dma *atdma = to_at_dma(chan->device); + dma_addr_t phys; + + desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys); + if (desc) { + memset(desc, 0, sizeof(struct at_desc)); + dma_async_tx_descriptor_init(&desc->txd, chan); + /* txd.flags will be overwritten in prep functions */ + desc->txd.flags = DMA_CTRL_ACK; + desc->txd.tx_submit = atc_tx_submit; + desc->txd.phys = phys; + } + + return desc; +} + +/** + * atc_desc_get - get a unsused descriptor from free_list + * @atchan: channel we want a new descriptor for + */ +static struct at_desc *atc_desc_get(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + struct at_desc *ret = NULL; + unsigned int i = 0; + LIST_HEAD(tmp_list); + + spin_lock_bh(&atchan->lock); + list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) { + i++; + if (async_tx_test_ack(&desc->txd)) { + list_del(&desc->desc_node); + ret = desc; + break; + } + dev_dbg(chan2dev(&atchan->chan_common), + "desc %p not ACKed\n", desc); + } + spin_unlock_bh(&atchan->lock); + dev_vdbg(chan2dev(&atchan->chan_common), + "scanned %u descriptors on freelist\n", i); + + /* no more descriptor available in initial pool: create one more */ + if (!ret) { + ret = atc_alloc_descriptor(&atchan->chan_common, GFP_ATOMIC); + if (ret) { + spin_lock_bh(&atchan->lock); + atchan->descs_allocated++; + spin_unlock_bh(&atchan->lock); + } else { + dev_err(chan2dev(&atchan->chan_common), + "not enough descriptors available\n"); + } + } + + return ret; +} + +/** + * atc_desc_put - move a descriptor, including any children, to the free list + * @atchan: channel we work on + * @desc: descriptor, at the head of a chain, to move to free list + */ +static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc) +{ + if (desc) { + struct at_desc *child; + + spin_lock_bh(&atchan->lock); + list_for_each_entry(child, 
&desc->txd.tx_list, desc_node) + dev_vdbg(chan2dev(&atchan->chan_common), + "moving child desc %p to freelist\n", + child); + list_splice_init(&desc->txd.tx_list, &atchan->free_list); + dev_vdbg(chan2dev(&atchan->chan_common), + "moving desc %p to freelist\n", desc); + list_add(&desc->desc_node, &atchan->free_list); + spin_unlock_bh(&atchan->lock); + } +} + +/** + * atc_assign_cookie - compute and assign new cookie + * @atchan: channel we work on + * @desc: descriptor to asign cookie for + * + * Called with atchan->lock held and bh disabled + */ +static dma_cookie_t +atc_assign_cookie(struct at_dma_chan *atchan, struct at_desc *desc) +{ + dma_cookie_t cookie = atchan->chan_common.cookie; + + if (++cookie < 0) + cookie = 1; + + atchan->chan_common.cookie = cookie; + desc->txd.cookie = cookie; + + return cookie; +} + +/** + * atc_dostart - starts the DMA engine for real + * @atchan: the channel we want to start + * @first: first descriptor in the list we want to begin with + * + * Called with atchan->lock held and bh disabled + */ +static void atc_dostart(struct at_dma_chan *atchan, struct at_desc *first) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + /* ASSERT: channel is idle */ + if (atc_chan_is_enabled(atchan)) { + dev_err(chan2dev(&atchan->chan_common), + "BUG: Attempted to start non-idle channel\n"); + dev_err(chan2dev(&atchan->chan_common), + " channel: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + channel_readl(atchan, SADDR), + channel_readl(atchan, DADDR), + channel_readl(atchan, CTRLA), + channel_readl(atchan, CTRLB), + channel_readl(atchan, DSCR)); + + /* The tasklet will hopefully advance the queue... 
*/ + return; + } + + vdbg_dump_regs(atchan); + + /* clear any pending interrupt */ + while (dma_readl(atdma, EBCISR)) + cpu_relax(); + + channel_writel(atchan, SADDR, 0); + channel_writel(atchan, DADDR, 0); + channel_writel(atchan, CTRLA, 0); + channel_writel(atchan, CTRLB, 0); + channel_writel(atchan, DSCR, first->txd.phys); + dma_writel(atdma, CHER, atchan->mask); + + vdbg_dump_regs(atchan); +} + +/** + * atc_chain_complete - finish work for one transaction chain + * @atchan: channel we work on + * @desc: descriptor at the head of the chain we want do complete + * + * Called with atchan->lock held and bh disabled */ +static void +atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc) +{ + dma_async_tx_callback callback; + void *param; + struct dma_async_tx_descriptor *txd = &desc->txd; + + dev_vdbg(chan2dev(&atchan->chan_common), + "descriptor %u complete\n", txd->cookie); + + atchan->completed_cookie = txd->cookie; + callback = txd->callback; + param = txd->callback_param; + + /* move children to free_list */ + list_splice_init(&txd->tx_list, &atchan->free_list); + /* move myself to free_list */ + list_move(&desc->desc_node, &atchan->free_list); + + /* unmap dma addresses */ + if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + } + if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); + } + + /* + * The API requires that no submissions are done from a + * callback, so we don't need to drop the lock here + */ + if (callback) + callback(param); + + 
dma_run_dependencies(txd); +} + +/** + * atc_complete_all - finish work for all transactions + * @atchan: channel to complete transactions for + * + * Eventually submit queued descriptors if any + * + * Assume channel is idle while calling this function + * Called with atchan->lock held and bh disabled + */ +static void atc_complete_all(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + dev_vdbg(chan2dev(&atchan->chan_common), "complete all\n"); + + BUG_ON(atc_chan_is_enabled(atchan)); + + /* + * Submit queued descriptors ASAP, i.e. before we go through + * the completed ones. + */ + if (!list_empty(&atchan->queue)) + atc_dostart(atchan, atc_first_queued(atchan)); + /* empty active_list now it is completed */ + list_splice_init(&atchan->active_list, &list); + /* empty queue list by moving descriptors (if any) to active_list */ + list_splice_init(&atchan->queue, &atchan->active_list); + + list_for_each_entry_safe(desc, _desc, &list, desc_node) + atc_chain_complete(atchan, desc); +} + +/** + * atc_cleanup_descriptors - cleanup up finished descriptors in active_list + * @atchan: channel to be cleaned up + * + * Called with atchan->lock held and bh disabled + */ +static void atc_cleanup_descriptors(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + struct at_desc *child; + + dev_vdbg(chan2dev(&atchan->chan_common), "cleanup descriptors\n"); + + list_for_each_entry_safe(desc, _desc, &atchan->active_list, desc_node) { + if (!(desc->lli.ctrla & ATC_DONE)) + /* This one is currently in progress */ + return; + + list_for_each_entry(child, &desc->txd.tx_list, desc_node) + if (!(child->lli.ctrla & ATC_DONE)) + /* Currently in progress */ + return; + + /* + * No descriptors so far seem to be in progress, i.e. + * this chain must be done. 
+ */ + atc_chain_complete(atchan, desc); + } +} + +/** + * atc_advance_work - at the end of a transaction, move forward + * @atchan: channel where the transaction ended + * + * Called with atchan->lock held and bh disabled + */ +static void atc_advance_work(struct at_dma_chan *atchan) +{ + dev_vdbg(chan2dev(&atchan->chan_common), "advance_work\n"); + + if (list_empty(&atchan->active_list) || + list_is_singular(&atchan->active_list)) { + atc_complete_all(atchan); + } else { + atc_chain_complete(atchan, atc_first_active(atchan)); + /* advance work */ + atc_dostart(atchan, atc_first_active(atchan)); + } +} + + +/** + * atc_handle_error - handle errors reported by DMA controller + * @atchan: channel where error occurs + * + * Called with atchan->lock held and bh disabled + */ +static void atc_handle_error(struct at_dma_chan *atchan) +{ + struct at_desc *bad_desc; + struct at_desc *child; + + /* + * The descriptor currently at the head of the active list is + * broked. Since we don't have any way to report errors, we'll + * just have to scream loudly and try to carry on. + */ + bad_desc = atc_first_active(atchan); + list_del_init(&bad_desc->desc_node); + + /* As we are stopped, take advantage to push queued descriptors + * in active_list */ + list_splice_init(&atchan->queue, atchan->active_list.prev); + + /* Try to restart the controller */ + if (!list_empty(&atchan->active_list)) + atc_dostart(atchan, atc_first_active(atchan)); + + /* + * KERN_CRITICAL may seem harsh, but since this only happens + * when someone submits a bad physical address in a + * descriptor, we should consider ourselves lucky that the + * controller flagged an error instead of scribbling over + * random memory locations. 
+ */ + dev_crit(chan2dev(&atchan->chan_common), + "Bad descriptor submitted for DMA!\n"); + dev_crit(chan2dev(&atchan->chan_common), + " cookie: %d\n", bad_desc->txd.cookie); + atc_dump_lli(atchan, &bad_desc->lli); + list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) + atc_dump_lli(atchan, &child->lli); + + /* Pretend the descriptor completed successfully */ + atc_chain_complete(atchan, bad_desc); +} + + +/*-- IRQ & Tasklet ---------------------------------------------------*/ + +static void atc_tasklet(unsigned long data) +{ + struct at_dma_chan *atchan = (struct at_dma_chan *)data; + + /* Channel cannot be enabled here */ + if (atc_chan_is_enabled(atchan)) { + dev_err(chan2dev(&atchan->chan_common), + "BUG: channel enabled in tasklet\n"); + return; + } + + spin_lock(&atchan->lock); + if (test_and_clear_bit(0, &atchan->error_status)) + atc_handle_error(atchan); + else + atc_advance_work(atchan); + + spin_unlock(&atchan->lock); +} + +static irqreturn_t at_dma_interrupt(int irq, void *dev_id) +{ + struct at_dma *atdma = (struct at_dma *)dev_id; + struct at_dma_chan *atchan; + int i; + u32 status, pending, imr; + int ret = IRQ_NONE; + + do { + imr = dma_readl(atdma, EBCIMR); + status = dma_readl(atdma, EBCISR); + pending = status & imr; + + if (!pending) + break; + + dev_vdbg(atdma->dma_common.dev, + "interrupt: status = 0x%08x, 0x%08x, 0x%08x\n", + status, imr, pending); + + for (i = 0; i < atdma->dma_common.chancnt; i++) { + atchan = &atdma->chan[i]; + if (pending & (AT_DMA_CBTC(i) | AT_DMA_ERR(i))) { + if (pending & AT_DMA_ERR(i)) { + /* Disable channel on AHB error */ + dma_writel(atdma, CHDR, atchan->mask); + /* Give information to tasklet */ + set_bit(0, &atchan->error_status); + } + tasklet_schedule(&atchan->tasklet); + ret = IRQ_HANDLED; + } + } + + } while (pending); + + return ret; +} + + +/*-- DMA Engine API --------------------------------------------------*/ + +/** + * atc_tx_submit - set the prepared descriptor(s) to be executed by the engine 
+ * @desc: descriptor at the head of the transaction chain + * + * Queue chain if DMA engine is working already + * + * Cookie increment and adding to active_list or queue must be atomic + */ +static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct at_desc *desc = txd_to_at_desc(tx); + struct at_dma_chan *atchan = to_at_dma_chan(tx->chan); + dma_cookie_t cookie; + + spin_lock_bh(&atchan->lock); + cookie = atc_assign_cookie(atchan, desc); + + if (list_empty(&atchan->active_list)) { + dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n", + desc->txd.cookie); + atc_dostart(atchan, desc); + list_add_tail(&desc->desc_node, &atchan->active_list); + } else { + dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n", + desc->txd.cookie); + list_add_tail(&desc->desc_node, &atchan->queue); + } + + spin_unlock_bh(&atchan->lock); + + return cookie; +} + +/** + * atc_prep_dma_memcpy - prepare a memcpy operation + * @chan: the channel to prepare operation on + * @dest: operation virtual destination address + * @src: operation virtual source address + * @len: operation length + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + size_t len, unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_desc *desc = NULL; + struct at_desc *first = NULL; + struct at_desc *prev = NULL; + size_t xfer_count; + size_t offset; + unsigned int src_width; + unsigned int dst_width; + u32 ctrla; + u32 ctrlb; + + dev_vdbg(chan2dev(chan), "prep_dma_memcpy: d0x%x s0x%x l0x%zx f0x%lx\n", + dest, src, len, flags); + + if (unlikely(!len)) { + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); + return NULL; + } + + ctrla = ATC_DEFAULT_CTRLA; + ctrlb = ATC_DEFAULT_CTRLB + | ATC_SRC_ADDR_MODE_INCR + | ATC_DST_ADDR_MODE_INCR + | ATC_FC_MEM2MEM; + + /* + * We can be a lot more clever here, but this should take care + * of the most 
common optimization. + */ + if (!((src | dest | len) & 3)) { + ctrla |= ATC_SRC_WIDTH_WORD | ATC_DST_WIDTH_WORD; + src_width = dst_width = 2; + } else if (!((src | dest | len) & 1)) { + ctrla |= ATC_SRC_WIDTH_HALFWORD | ATC_DST_WIDTH_HALFWORD; + src_width = dst_width = 1; + } else { + ctrla |= ATC_SRC_WIDTH_BYTE | ATC_DST_WIDTH_BYTE; + src_width = dst_width = 0; + } + + for (offset = 0; offset < len; offset += xfer_count << src_width) { + xfer_count = min_t(size_t, (len - offset) >> src_width, + ATC_BTSIZE_MAX); + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + desc->lli.saddr = src + offset; + desc->lli.daddr = dest + offset; + desc->lli.ctrla = ctrla | xfer_count; + desc->lli.ctrlb = ctrlb; + + desc->txd.cookie = 0; + async_tx_ack(&desc->txd); + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + } + + /* First descriptor of the chain embeds additional information */ + first->txd.cookie = -EBUSY; + first->len = len; + + /* set end-of-link to the last link descriptor of list*/ + set_desc_eol(desc); + + first->txd.flags = flags; /* client is in control of this ack */ + + return &first->txd; + +err_desc_get: + atc_desc_put(atchan, first); + return NULL; +} + +/** + * atc_is_tx_complete - poll for transaction completion + * @chan: DMA channel + * @cookie: transaction identifier to check status of + * @done: if not %NULL, updated with last completed transaction + * @used: if not %NULL, updated with last used transaction + * + * If @done and @used are passed in, upon return they reflect the driver + * internal state and can be used with dma_async_is_complete() to check + * the status of multiple cookies without re-checking hardware state.
+ */ +static enum dma_status +atc_is_tx_complete(struct dma_chan *chan, + dma_cookie_t cookie, + dma_cookie_t *done, dma_cookie_t *used) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + enum dma_status ret; + + dev_vdbg(chan2dev(chan), "is_tx_complete: %d (d%d, u%d)\n", + cookie, done ? *done : 0, used ? *used : 0); + + spin_lock_bh(&atchan->lock); + + last_complete = atchan->completed_cookie; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret != DMA_SUCCESS) { + atc_cleanup_descriptors(atchan); + + last_complete = atchan->completed_cookie; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + } + + spin_unlock_bh(&atchan->lock); + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + return ret; +} + +/** + * atc_issue_pending - try to finish work + * @chan: target DMA channel + */ +static void atc_issue_pending(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + + dev_vdbg(chan2dev(chan), "issue_pending\n"); + + if (!atc_chan_is_enabled(atchan)) { + spin_lock_bh(&atchan->lock); + atc_advance_work(atchan); + spin_unlock_bh(&atchan->lock); + } +} + +/** + * atc_alloc_chan_resources - allocate resources for DMA channel + * @chan: allocate descriptor resources for this channel + * @client: current client requesting the channel be ready for requests + * + * return - the number of allocated descriptors + */ +static int atc_alloc_chan_resources(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc; + int i; + LIST_HEAD(tmp_list); + + dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); + + /* ASSERT: channel is idle */ + if (atc_chan_is_enabled(atchan)) { + dev_dbg(chan2dev(chan), "DMA channel not idle ?\n"); + return -EIO; + } + + /* have we already been set up?
*/ + if (!list_empty(&atchan->free_list)) + return atchan->descs_allocated; + + /* Allocate initial pool of descriptors */ + for (i = 0; i < init_nr_desc_per_channel; i++) { + desc = atc_alloc_descriptor(chan, GFP_KERNEL); + if (!desc) { + dev_err(atdma->dma_common.dev, + "Only %d initial descriptors\n", i); + break; + } + list_add_tail(&desc->desc_node, &tmp_list); + } + + spin_lock_bh(&atchan->lock); + atchan->descs_allocated = i; + list_splice(&tmp_list, &atchan->free_list); + atchan->completed_cookie = chan->cookie = 1; + spin_unlock_bh(&atchan->lock); + + /* channel parameters */ + channel_writel(atchan, CFG, ATC_DEFAULT_CFG); + + dev_dbg(chan2dev(chan), + "alloc_chan_resources: allocated %d descriptors\n", + atchan->descs_allocated); + + return atchan->descs_allocated; +} + +/** + * atc_free_chan_resources - free all channel resources + * @chan: DMA channel + */ +static void atc_free_chan_resources(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + dev_dbg(chan2dev(chan), "free_chan_resources: (descs allocated=%u)\n", + atchan->descs_allocated); + + /* ASSERT: channel is idle */ + BUG_ON(!list_empty(&atchan->active_list)); + BUG_ON(!list_empty(&atchan->queue)); + BUG_ON(atc_chan_is_enabled(atchan)); + + list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) { + dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc); + list_del(&desc->desc_node); + /* free link descriptor */ + dma_pool_free(atdma->dma_desc_pool, desc, desc->txd.phys); + } + list_splice_init(&atchan->free_list, &list); + atchan->descs_allocated = 0; + + dev_vdbg(chan2dev(chan), "free_chan_resources: done\n"); +} + + +/*-- Module Management -----------------------------------------------*/ + +/** + * at_dma_off - disable DMA controller + * @atdma: the Atmel HDAMC device + */ +static void at_dma_off(struct at_dma *atdma) +{ + dma_writel(atdma, EN, 
0); + + /* disable all interrupts */ + dma_writel(atdma, EBCIDR, -1L); + + /* confirm that all channels are disabled */ + while (dma_readl(atdma, CHSR) & atdma->all_chan_mask) + cpu_relax(); +} + +static int __init at_dma_probe(struct platform_device *pdev) +{ + struct at_dma_platform_data *pdata; + struct resource *io; + struct at_dma *atdma; + size_t size; + int irq; + int err; + int i; + + /* get DMA Controller parameters from platform */ + pdata = pdev->dev.platform_data; + if (!pdata || pdata->nr_channels > AT_DMA_MAX_NR_CHANNELS) + return -EINVAL; + + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!io) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + size = sizeof(struct at_dma); + size += pdata->nr_channels * sizeof(struct at_dma_chan); + atdma = kzalloc(size, GFP_KERNEL); + if (!atdma) + return -ENOMEM; + + /* discover transaction capabilites from the platform data */ + atdma->dma_common.cap_mask = pdata->cap_mask; + atdma->all_chan_mask = (1 << pdata->nr_channels) - 1; + + size = io->end - io->start + 1; + if (!request_mem_region(io->start, size, pdev->dev.driver->name)) { + err = -EBUSY; + goto err_kfree; + } + + atdma->regs = ioremap(io->start, size); + if (!atdma->regs) { + err = -ENOMEM; + goto err_release_r; + } + + atdma->clk = clk_get(&pdev->dev, "dma_clk"); + if (IS_ERR(atdma->clk)) { + err = PTR_ERR(atdma->clk); + goto err_clk; + } + clk_enable(atdma->clk); + + /* force dma off, just in case */ + at_dma_off(atdma); + + err = request_irq(irq, at_dma_interrupt, 0, "at_hdmac", atdma); + if (err) + goto err_irq; + + platform_set_drvdata(pdev, atdma); + + /* create a pool of consistent memory blocks for hardware descriptors */ + atdma->dma_desc_pool = dma_pool_create("at_hdmac_desc_pool", + &pdev->dev, sizeof(struct at_desc), + 4 /* word alignment */, 0); + if (!atdma->dma_desc_pool) { + dev_err(&pdev->dev, "No memory for descriptors dma pool\n"); + err = -ENOMEM; + goto err_pool_create; + } + + /* 
clear any pending interrupt */ + while (dma_readl(atdma, EBCISR)) + cpu_relax(); + + /* initialize channels related values */ + INIT_LIST_HEAD(&atdma->dma_common.channels); + for (i = 0; i < pdata->nr_channels; i++, atdma->dma_common.chancnt++) { + struct at_dma_chan *atchan = &atdma->chan[i]; + + atchan->chan_common.device = &atdma->dma_common; + atchan->chan_common.cookie = atchan->completed_cookie = 1; + atchan->chan_common.chan_id = i; + list_add_tail(&atchan->chan_common.device_node, + &atdma->dma_common.channels); + + atchan->ch_regs = atdma->regs + ch_regs(i); + spin_lock_init(&atchan->lock); + atchan->mask = 1 << i; + + INIT_LIST_HEAD(&atchan->active_list); + INIT_LIST_HEAD(&atchan->queue); + INIT_LIST_HEAD(&atchan->free_list); + + tasklet_init(&atchan->tasklet, atc_tasklet, + (unsigned long)atchan); + atc_enable_irq(atchan); + } + + /* set base routines */ + atdma->dma_common.device_alloc_chan_resources = atc_alloc_chan_resources; + atdma->dma_common.device_free_chan_resources = atc_free_chan_resources; + atdma->dma_common.device_is_tx_complete = atc_is_tx_complete; + atdma->dma_common.device_issue_pending = atc_issue_pending; + atdma->dma_common.dev = &pdev->dev; + + /* set prep routines based on capability */ + if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask)) + atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy; + + dma_writel(atdma, EN, AT_DMA_ENABLE); + + dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n", + dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "", + dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? 
"slave " : "", + atdma->dma_common.chancnt); + + dma_async_device_register(&atdma->dma_common); + + return 0; + +err_pool_create: + platform_set_drvdata(pdev, NULL); + free_irq(platform_get_irq(pdev, 0), atdma); +err_irq: + clk_disable(atdma->clk); + clk_put(atdma->clk); +err_clk: + iounmap(atdma->regs); + atdma->regs = NULL; +err_release_r: + release_mem_region(io->start, size); +err_kfree: + kfree(atdma); + return err; +} + +static int __exit at_dma_remove(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + struct dma_chan *chan, *_chan; + struct resource *io; + + at_dma_off(atdma); + dma_async_device_unregister(&atdma->dma_common); + + dma_pool_destroy(atdma->dma_desc_pool); + platform_set_drvdata(pdev, NULL); + free_irq(platform_get_irq(pdev, 0), atdma); + + list_for_each_entry_safe(chan, _chan, &atdma->dma_common.channels, + device_node) { + struct at_dma_chan *atchan = to_at_dma_chan(chan); + + /* Disable interrupts */ + atc_disable_irq(atchan); + tasklet_disable(&atchan->tasklet); + + tasklet_kill(&atchan->tasklet); + list_del(&chan->device_node); + } + + clk_disable(atdma->clk); + clk_put(atdma->clk); + + iounmap(atdma->regs); + atdma->regs = NULL; + + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + release_mem_region(io->start, io->end - io->start + 1); + + kfree(atdma); + + return 0; +} + +static void at_dma_shutdown(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + at_dma_off(platform_get_drvdata(pdev)); + clk_disable(atdma->clk); +} + +static int at_dma_suspend_late(struct platform_device *pdev, pm_message_t mesg) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + at_dma_off(platform_get_drvdata(pdev)); + clk_disable(atdma->clk); + return 0; +} + +static int at_dma_resume_early(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + clk_enable(atdma->clk); + dma_writel(atdma, EN, AT_DMA_ENABLE); + return 0; + +} + +static struct 
platform_driver at_dma_driver = { + .remove = __exit_p(at_dma_remove), + .shutdown = at_dma_shutdown, + .suspend_late = at_dma_suspend_late, + .resume_early = at_dma_resume_early, + .driver = { + .name = "at_hdmac", + }, +}; + +static int __init at_dma_init(void) +{ + return platform_driver_probe(&at_dma_driver, at_dma_probe); +} +module_init(at_dma_init); + +static void __exit at_dma_exit(void) +{ + platform_driver_unregister(&at_dma_driver); +} +module_exit(at_dma_exit); + +MODULE_DESCRIPTION("Atmel AHB DMA Controller driver"); +MODULE_AUTHOR("Nicolas Ferre "); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:at_hdmac"); diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h new file mode 100644 index 000000000000..ad2d4f402bf7 --- /dev/null +++ b/drivers/dma/at_hdmac_regs.h @@ -0,0 +1,386 @@ +/* + * Header file for the Atmel AHB DMA Controller driver + * + * Copyright (C) 2008 Atmel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef AT_HDMAC_REGS_H +#define AT_HDMAC_REGS_H + +#include + +#define AT_DMA_MAX_NR_CHANNELS 8 + + +#define AT_DMA_GCFG 0x00 /* Global Configuration Register */ +#define AT_DMA_IF_BIGEND(i) (0x1 << (i)) /* AHB-Lite Interface i in Big-endian mode */ +#define AT_DMA_ARB_CFG (0x1 << 4) /* Arbiter mode. 
*/ +#define AT_DMA_ARB_CFG_FIXED (0x0 << 4) +#define AT_DMA_ARB_CFG_ROUND_ROBIN (0x1 << 4) + +#define AT_DMA_EN 0x04 /* Controller Enable Register */ +#define AT_DMA_ENABLE (0x1 << 0) + +#define AT_DMA_SREQ 0x08 /* Software Single Request Register */ +#define AT_DMA_SSREQ(x) (0x1 << ((x) << 1)) /* Request a source single transfer on channel x */ +#define AT_DMA_DSREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination single transfer on channel x */ + +#define AT_DMA_CREQ 0x0C /* Software Chunk Transfer Request Register */ +#define AT_DMA_SCREQ(x) (0x1 << ((x) << 1)) /* Request a source chunk transfer on channel x */ +#define AT_DMA_DCREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination chunk transfer on channel x */ + +#define AT_DMA_LAST 0x10 /* Software Last Transfer Flag Register */ +#define AT_DMA_SLAST(x) (0x1 << ((x) << 1)) /* This src rq is last tx of buffer on channel x */ +#define AT_DMA_DLAST(x) (0x1 << (1 + ((x) << 1))) /* This dst rq is last tx of buffer on channel x */ + +#define AT_DMA_SYNC 0x14 /* Request Synchronization Register */ +#define AT_DMA_SYR(h) (0x1 << (h)) /* Synchronize handshake line h */ + +/* Error, Chained Buffer transfer completed and Buffer transfer completed Interrupt registers */ +#define AT_DMA_EBCIER 0x18 /* Enable register */ +#define AT_DMA_EBCIDR 0x1C /* Disable register */ +#define AT_DMA_EBCIMR 0x20 /* Mask Register */ +#define AT_DMA_EBCISR 0x24 /* Status Register */ +#define AT_DMA_CBTC_OFFSET 8 +#define AT_DMA_ERR_OFFSET 16 +#define AT_DMA_BTC(x) (0x1 << (x)) +#define AT_DMA_CBTC(x) (0x1 << (AT_DMA_CBTC_OFFSET + (x))) +#define AT_DMA_ERR(x) (0x1 << (AT_DMA_ERR_OFFSET + (x))) + +#define AT_DMA_CHER 0x28 /* Channel Handler Enable Register */ +#define AT_DMA_ENA(x) (0x1 << (x)) +#define AT_DMA_SUSP(x) (0x1 << ( 8 + (x))) +#define AT_DMA_KEEP(x) (0x1 << (24 + (x))) + +#define AT_DMA_CHDR 0x2C /* Channel Handler Disable Register */ +#define AT_DMA_DIS(x) (0x1 << (x)) +#define AT_DMA_RES(x) (0x1 << ( 8 + (x))) + 
+#define AT_DMA_CHSR 0x30 /* Channel Handler Status Register */ +#define AT_DMA_EMPT(x) (0x1 << (16 + (x))) +#define AT_DMA_STAL(x) (0x1 << (24 + (x))) + + +#define AT_DMA_CH_REGS_BASE 0x3C /* Channel registers base address */ +#define ch_regs(x) (AT_DMA_CH_REGS_BASE + (x) * 0x28) /* Channel x base addr */ + +/* Hardware register offset for each channel */ +#define ATC_SADDR_OFFSET 0x00 /* Source Address Register */ +#define ATC_DADDR_OFFSET 0x04 /* Destination Address Register */ +#define ATC_DSCR_OFFSET 0x08 /* Descriptor Address Register */ +#define ATC_CTRLA_OFFSET 0x0C /* Control A Register */ +#define ATC_CTRLB_OFFSET 0x10 /* Control B Register */ +#define ATC_CFG_OFFSET 0x14 /* Configuration Register */ +#define ATC_SPIP_OFFSET 0x18 /* Src PIP Configuration Register */ +#define ATC_DPIP_OFFSET 0x1C /* Dst PIP Configuration Register */ + + +/* Bitfield definitions */ + +/* Bitfields in DSCR */ +#define ATC_DSCR_IF(i) (0x3 & (i)) /* Dsc feched via AHB-Lite Interface i */ + +/* Bitfields in CTRLA */ +#define ATC_BTSIZE_MAX 0xFFFFUL /* Maximum Buffer Transfer Size */ +#define ATC_BTSIZE(x) (ATC_BTSIZE_MAX & (x)) /* Buffer Transfer Size */ +#define ATC_SCSIZE_MASK (0x7 << 16) /* Source Chunk Transfer Size */ +#define ATC_SCSIZE_1 (0x0 << 16) +#define ATC_SCSIZE_4 (0x1 << 16) +#define ATC_SCSIZE_8 (0x2 << 16) +#define ATC_SCSIZE_16 (0x3 << 16) +#define ATC_SCSIZE_32 (0x4 << 16) +#define ATC_SCSIZE_64 (0x5 << 16) +#define ATC_SCSIZE_128 (0x6 << 16) +#define ATC_SCSIZE_256 (0x7 << 16) +#define ATC_DCSIZE_MASK (0x7 << 20) /* Destination Chunk Transfer Size */ +#define ATC_DCSIZE_1 (0x0 << 20) +#define ATC_DCSIZE_4 (0x1 << 20) +#define ATC_DCSIZE_8 (0x2 << 20) +#define ATC_DCSIZE_16 (0x3 << 20) +#define ATC_DCSIZE_32 (0x4 << 20) +#define ATC_DCSIZE_64 (0x5 << 20) +#define ATC_DCSIZE_128 (0x6 << 20) +#define ATC_DCSIZE_256 (0x7 << 20) +#define ATC_SRC_WIDTH_MASK (0x3 << 24) /* Source Single Transfer Size */ +#define ATC_SRC_WIDTH_BYTE (0x0 << 24) +#define 
ATC_SRC_WIDTH_HALFWORD (0x1 << 24) +#define ATC_SRC_WIDTH_WORD (0x2 << 24) +#define ATC_DST_WIDTH_MASK (0x3 << 28) /* Destination Single Transfer Size */ +#define ATC_DST_WIDTH_BYTE (0x0 << 28) +#define ATC_DST_WIDTH_HALFWORD (0x1 << 28) +#define ATC_DST_WIDTH_WORD (0x2 << 28) +#define ATC_DONE (0x1U << 31) /* Tx Done (only written back in descriptor) */ + +/* Bitfields in CTRLB */ +#define ATC_SIF(i) (0x3 & (i)) /* Src tx done via AHB-Lite Interface i */ +#define ATC_DIF(i) ((0x3 & (i)) << 4) /* Dst tx done via AHB-Lite Interface i */ +#define ATC_SRC_PIP (0x1 << 8) /* Source Picture-in-Picture enabled */ +#define ATC_DST_PIP (0x1 << 12) /* Destination Picture-in-Picture enabled */ +#define ATC_SRC_DSCR_DIS (0x1 << 16) /* Src Descriptor fetch disable */ +#define ATC_DST_DSCR_DIS (0x1 << 20) /* Dst Descriptor fetch disable */ +#define ATC_FC_MASK (0x7 << 21) /* Choose Flow Controller */ +#define ATC_FC_MEM2MEM (0x0 << 21) /* Mem-to-Mem (DMA) */ +#define ATC_FC_MEM2PER (0x1 << 21) /* Mem-to-Periph (DMA) */ +#define ATC_FC_PER2MEM (0x2 << 21) /* Periph-to-Mem (DMA) */ +#define ATC_FC_PER2PER (0x3 << 21) /* Periph-to-Periph (DMA) */ +#define ATC_FC_PER2MEM_PER (0x4 << 21) /* Periph-to-Mem (Peripheral) */ +#define ATC_FC_MEM2PER_PER (0x5 << 21) /* Mem-to-Periph (Peripheral) */ +#define ATC_FC_PER2PER_PER (0x6 << 21) /* Periph-to-Periph (Src Peripheral) */ +#define ATC_SRC_ADDR_MODE_MASK (0x3 << 24) +#define ATC_SRC_ADDR_MODE_INCR (0x0 << 24) /* Incrementing Mode */ +#define ATC_SRC_ADDR_MODE_DECR (0x1 << 24) /* Decrementing Mode */ +#define ATC_SRC_ADDR_MODE_FIXED (0x2 << 24) /* Fixed Mode */ +#define ATC_DST_ADDR_MODE_MASK (0x3 << 28) +#define ATC_DST_ADDR_MODE_INCR (0x0 << 28) /* Incrementing Mode */ +#define ATC_DST_ADDR_MODE_DECR (0x1 << 28) /* Decrementing Mode */ +#define ATC_DST_ADDR_MODE_FIXED (0x2 << 28) /* Fixed Mode */ +#define ATC_IEN (0x1 << 30) /* BTC interrupt enable (active low) */ +#define ATC_AUTO (0x1U << 31) /* Auto multiple buffer tx enable */ + +/*
Bitfields in CFG */ +#define ATC_SRC_PER(h) (0xFU & (h)) /* Channel src rq associated with periph handshaking ifc h */ +#define ATC_DST_PER(h) ((0xFU & (h)) << 4) /* Channel dst rq associated with periph handshaking ifc h */ +#define ATC_SRC_REP (0x1 << 8) /* Source Replay Mod */ +#define ATC_SRC_H2SEL (0x1 << 9) /* Source Handshaking Mod */ +#define ATC_SRC_H2SEL_SW (0x0 << 9) +#define ATC_SRC_H2SEL_HW (0x1 << 9) +#define ATC_DST_REP (0x1 << 12) /* Destination Replay Mod */ +#define ATC_DST_H2SEL (0x1 << 13) /* Destination Handshaking Mod */ +#define ATC_DST_H2SEL_SW (0x0 << 13) +#define ATC_DST_H2SEL_HW (0x1 << 13) +#define ATC_SOD (0x1 << 16) /* Stop On Done */ +#define ATC_LOCK_IF (0x1 << 20) /* Interface Lock */ +#define ATC_LOCK_B (0x1 << 21) /* AHB Bus Lock */ +#define ATC_LOCK_IF_L (0x1 << 22) /* Master Interface Arbiter Lock */ +#define ATC_LOCK_IF_L_CHUNK (0x0 << 22) +#define ATC_LOCK_IF_L_BUFFER (0x1 << 22) +#define ATC_AHB_PROT_MASK (0x7 << 24) /* AHB Protection */ +#define ATC_FIFOCFG_MASK (0x3 << 28) /* FIFO Request Configuration */ +#define ATC_FIFOCFG_LARGESTBURST (0x0 << 28) +#define ATC_FIFOCFG_HALFFIFO (0x1 << 28) +#define ATC_FIFOCFG_ENOUGHSPACE (0x2 << 28) + +/* Bitfields in SPIP */ +#define ATC_SPIP_HOLE(x) (0xFFFFU & (x)) +#define ATC_SPIP_BOUNDARY(x) ((0x3FF & (x)) << 16) + +/* Bitfields in DPIP */ +#define ATC_DPIP_HOLE(x) (0xFFFFU & (x)) +#define ATC_DPIP_BOUNDARY(x) ((0x3FF & (x)) << 16) + + +/*-- descriptors -----------------------------------------------------*/ + +/* LLI == Linked List Item; aka DMA buffer descriptor */ +struct at_lli { + /* values that are not changed by hardware */ + dma_addr_t saddr; + dma_addr_t daddr; + /* value that may get written back: */ + u32 ctrla; + /* more values that are not changed by hardware */ + u32 ctrlb; + dma_addr_t dscr; /* chain to next lli */ +}; + +/** + * struct at_desc - software descriptor + * @at_lli: hardware lli structure + * @txd: support for the async_tx api + * @desc_node: node on the 
channed descriptors list + * @len: total transaction bytecount + */ +struct at_desc { + /* FIRST values the hardware uses */ + struct at_lli lli; + + /* THEN values for driver housekeeping */ + struct dma_async_tx_descriptor txd; + struct list_head desc_node; + size_t len; +}; + +static inline struct at_desc * +txd_to_at_desc(struct dma_async_tx_descriptor *txd) +{ + return container_of(txd, struct at_desc, txd); +} + + +/*-- Channels --------------------------------------------------------*/ + +/** + * struct at_dma_chan - internal representation of an Atmel HDMAC channel + * @chan_common: common dmaengine channel object members + * @device: parent device + * @ch_regs: memory mapped register base + * @mask: channel index in a mask + * @error_status: transmit error status information from irq handler + * to tasklet (use atomic operations) + * @tasklet: bottom half to finish transaction work + * @lock: serializes enqueue/dequeue operations to descriptors lists + * @completed_cookie: identifier for the most recently completed operation + * @active_list: list of descriptors dmaengine is being running on + * @queue: list of descriptors ready to be submitted to engine + * @free_list: list of descriptors usable by the channel + * @descs_allocated: records the actual size of the descriptor pool + */ +struct at_dma_chan { + struct dma_chan chan_common; + struct at_dma *device; + void __iomem *ch_regs; + u8 mask; + unsigned long error_status; + struct tasklet_struct tasklet; + + spinlock_t lock; + + /* these other elements are all protected by lock */ + dma_cookie_t completed_cookie; + struct list_head active_list; + struct list_head queue; + struct list_head free_list; + unsigned int descs_allocated; +}; + +#define channel_readl(atchan, name) \ + __raw_readl((atchan)->ch_regs + ATC_##name##_OFFSET) + +#define channel_writel(atchan, name, val) \ + __raw_writel((val), (atchan)->ch_regs + ATC_##name##_OFFSET) + +static inline struct at_dma_chan *to_at_dma_chan(struct dma_chan 
*dchan) +{ + return container_of(dchan, struct at_dma_chan, chan_common); +} + + +/*-- Controller ------------------------------------------------------*/ + +/** + * struct at_dma - internal representation of an Atmel HDMA Controller + * @chan_common: common dmaengine dma_device object members + * @ch_regs: memory mapped register base + * @clk: dma controller clock + * @all_chan_mask: all channels availlable in a mask + * @dma_desc_pool: base of DMA descriptor region (DMA address) + * @chan: channels table to store at_dma_chan structures + */ +struct at_dma { + struct dma_device dma_common; + void __iomem *regs; + struct clk *clk; + + u8 all_chan_mask; + + struct dma_pool *dma_desc_pool; + /* AT THE END channels table */ + struct at_dma_chan chan[0]; +}; + +#define dma_readl(atdma, name) \ + __raw_readl((atdma)->regs + AT_DMA_##name) +#define dma_writel(atdma, name, val) \ + __raw_writel((val), (atdma)->regs + AT_DMA_##name) + +static inline struct at_dma *to_at_dma(struct dma_device *ddev) +{ + return container_of(ddev, struct at_dma, dma_common); +} + + +/*-- Helper functions ------------------------------------------------*/ + +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} +static struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + +#if defined(VERBOSE_DEBUG) +static void vdbg_dump_regs(struct at_dma_chan *atchan) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + dev_err(chan2dev(&atchan->chan_common), + " channel %d : imr = 0x%x, chsr = 0x%x\n", + atchan->chan_common.chan_id, + dma_readl(atdma, EBCIMR), + dma_readl(atdma, CHSR)); + + dev_err(chan2dev(&atchan->chan_common), + " channel: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + channel_readl(atchan, SADDR), + channel_readl(atchan, DADDR), + channel_readl(atchan, CTRLA), + channel_readl(atchan, CTRLB), + channel_readl(atchan, DSCR)); +} +#else +static void vdbg_dump_regs(struct at_dma_chan *atchan) {} +#endif + 
+static void atc_dump_lli(struct at_dma_chan *atchan, struct at_lli *lli) +{ + dev_printk(KERN_CRIT, chan2dev(&atchan->chan_common), + " desc: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + lli->saddr, lli->daddr, + lli->ctrla, lli->ctrlb, lli->dscr); +} + + +static void atc_setup_irq(struct at_dma_chan *atchan, int on) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + u32 ebci; + + /* enable interrupts on buffer chain completion & error */ + ebci = AT_DMA_CBTC(atchan->chan_common.chan_id) + | AT_DMA_ERR(atchan->chan_common.chan_id); + if (on) + dma_writel(atdma, EBCIER, ebci); + else + dma_writel(atdma, EBCIDR, ebci); +} + +static inline void atc_enable_irq(struct at_dma_chan *atchan) +{ + atc_setup_irq(atchan, 1); +} + +static inline void atc_disable_irq(struct at_dma_chan *atchan) +{ + atc_setup_irq(atchan, 0); +} + + +/** + * atc_chan_is_enabled - test if given channel is enabled + * @atchan: channel we want to test status + */ +static inline int atc_chan_is_enabled(struct at_dma_chan *atchan) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + return !!(dma_readl(atdma, CHSR) & atchan->mask); +} + + +/** + * set_desc_eol - set end-of-link to descriptor so it will end transfer + * @desc: descriptor, signle or at the end of a chain, to end chain on + */ +static void set_desc_eol(struct at_desc *desc) +{ + desc->lli.ctrlb |= ATC_SRC_DSCR_DIS | ATC_DST_DSCR_DIS; + desc->lli.dscr = 0; +} + +#endif /* AT_HDMAC_REGS_H */ -- cgit v1.2.2 From 808347f6a31792079e345ec865e9cfcb6e8ae6b2 Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Wed, 22 Jul 2009 20:04:45 +0200 Subject: dmaengine: at_hdmac: add DMA slave transfers This patch for at_hdmac adds the slave transfers capability to the Atmel DMA controller available on some AT91 SOCs. This allow peripheral to memory and memory to peripheral transfers with hardware handshaking. Slave structure for controller specific information is passed through channel private data. 
This at_dma_slave structure is defined in at_hdmac.h header file and relative hardware definition are moved to this file from at_hdmac_regs.h. Doing this we allow the channel configuration from platform definition code. This work is intensively based on dw_dmac and several slave implementations. Signed-off-by: Nicolas Ferre Signed-off-by: Dan Williams --- drivers/dma/at_hdmac.c | 208 +++++++++++++++++++++++++++++++++++++++++++- drivers/dma/at_hdmac_regs.h | 49 ++--------- 2 files changed, 214 insertions(+), 43 deletions(-) (limited to 'drivers') diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c index 64dbf0ce128e..9a1e5fb412ed 100644 --- a/drivers/dma/at_hdmac.c +++ b/drivers/dma/at_hdmac.c @@ -608,6 +608,187 @@ err_desc_get: return NULL; } + +/** + * atc_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction + * @chan: DMA channel + * @sgl: scatterlist to transfer to/from + * @sg_len: number of entries in @scatterlist + * @direction: DMA direction + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_data_direction direction, + unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma_slave *atslave = chan->private; + struct at_desc *first = NULL; + struct at_desc *prev = NULL; + u32 ctrla; + u32 ctrlb; + dma_addr_t reg; + unsigned int reg_width; + unsigned int mem_width; + unsigned int i; + struct scatterlist *sg; + size_t total_len = 0; + + dev_vdbg(chan2dev(chan), "prep_slave_sg: %s f0x%lx\n", + direction == DMA_TO_DEVICE ? 
"TO DEVICE" : "FROM DEVICE", + flags); + + if (unlikely(!atslave || !sg_len)) { + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); + return NULL; + } + + reg_width = atslave->reg_width; + + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); + + ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla; + ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN; + + switch (direction) { + case DMA_TO_DEVICE: + ctrla |= ATC_DST_WIDTH(reg_width); + ctrlb |= ATC_DST_ADDR_MODE_FIXED + | ATC_SRC_ADDR_MODE_INCR + | ATC_FC_MEM2PER; + reg = atslave->tx_reg; + for_each_sg(sgl, sg, sg_len, i) { + struct at_desc *desc; + u32 len; + u32 mem; + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + mem = sg_phys(sg); + len = sg_dma_len(sg); + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; + + desc->lli.saddr = mem; + desc->lli.daddr = reg; + desc->lli.ctrla = ctrla + | ATC_SRC_WIDTH(mem_width) + | len >> mem_width; + desc->lli.ctrlb = ctrlb; + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + total_len += len; + } + break; + case DMA_FROM_DEVICE: + ctrla |= ATC_SRC_WIDTH(reg_width); + ctrlb |= ATC_DST_ADDR_MODE_INCR + | ATC_SRC_ADDR_MODE_FIXED + | ATC_FC_PER2MEM; + + reg = atslave->rx_reg; + for_each_sg(sgl, sg, sg_len, i) { + struct at_desc *desc; + u32 len; + u32 mem; + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + mem = sg_phys(sg); + len = sg_dma_len(sg); + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; + + desc->lli.saddr = reg; + desc->lli.daddr = mem; + desc->lli.ctrla = ctrla + | ATC_DST_WIDTH(mem_width) + | len >> mem_width; + desc->lli.ctrlb = ctrlb; + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to 
the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + total_len += len; + } + break; + default: + return NULL; + } + + /* set end-of-link to the last link descriptor of list*/ + set_desc_eol(prev); + + /* First descriptor of the chain embedds additional information */ + first->txd.cookie = -EBUSY; + first->len = total_len; + + /* last link descriptor of list is responsible of flags */ + prev->txd.flags = flags; /* client is in control of this ack */ + + return &first->txd; + +err_desc_get: + dev_err(chan2dev(chan), "not enough descriptors available\n"); + atc_desc_put(atchan, first); + return NULL; +} + +static void atc_terminate_all(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + /* + * This is only called when something went wrong elsewhere, so + * we don't really care about the data. Just disable the + * channel. We still have to poll the channel enable bit due + * to AHB/HSB limitations. 
+ */ + spin_lock_bh(&atchan->lock); + + dma_writel(atdma, CHDR, atchan->mask); + + /* confirm that this channel is disabled */ + while (dma_readl(atdma, CHSR) & atchan->mask) + cpu_relax(); + + /* active_list entries will end up before queued entries */ + list_splice_init(&atchan->queue, &list); + list_splice_init(&atchan->active_list, &list); + + spin_unlock_bh(&atchan->lock); + + /* Flush all pending and queued descriptors */ + list_for_each_entry_safe(desc, _desc, &list, desc_node) + atc_chain_complete(atchan, desc); +} + /** * atc_is_tx_complete - poll for transaction completion * @chan: DMA channel @@ -686,7 +867,9 @@ static int atc_alloc_chan_resources(struct dma_chan *chan) struct at_dma_chan *atchan = to_at_dma_chan(chan); struct at_dma *atdma = to_at_dma(chan->device); struct at_desc *desc; + struct at_dma_slave *atslave; int i; + u32 cfg; LIST_HEAD(tmp_list); dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); @@ -697,7 +880,23 @@ static int atc_alloc_chan_resources(struct dma_chan *chan) return -EIO; } - /* have we already been set up? */ + cfg = ATC_DEFAULT_CFG; + + atslave = chan->private; + if (atslave) { + /* + * We need controller-specific data to set up slave + * transfers. + */ + BUG_ON(!atslave->dma_dev || atslave->dma_dev != atdma->dma_common.dev); + + /* if cfg configuration specified take it instad of default */ + if (atslave->cfg) + cfg = atslave->cfg; + } + + /* have we already been set up? 
+ * reconfigure channel but no need to reallocate descriptors */ if (!list_empty(&atchan->free_list)) return atchan->descs_allocated; @@ -719,7 +918,7 @@ static int atc_alloc_chan_resources(struct dma_chan *chan) spin_unlock_bh(&atchan->lock); /* channel parameters */ - channel_writel(atchan, CFG, ATC_DEFAULT_CFG); + channel_writel(atchan, CFG, cfg); dev_dbg(chan2dev(chan), "alloc_chan_resources: allocated %d descriptors\n", @@ -888,6 +1087,11 @@ static int __init at_dma_probe(struct platform_device *pdev) if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask)) atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy; + if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask)) { + atdma->dma_common.device_prep_slave_sg = atc_prep_slave_sg; + atdma->dma_common.device_terminate_all = atc_terminate_all; + } + dma_writel(atdma, EN, AT_DMA_ENABLE); dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n", diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h index ad2d4f402bf7..4c972afc49ec 100644 --- a/drivers/dma/at_hdmac_regs.h +++ b/drivers/dma/at_hdmac_regs.h @@ -87,29 +87,14 @@ /* Bitfields in CTRLA */ #define ATC_BTSIZE_MAX 0xFFFFUL /* Maximum Buffer Transfer Size */ #define ATC_BTSIZE(x) (ATC_BTSIZE_MAX & (x)) /* Buffer Transfer Size */ -#define ATC_SCSIZE_MASK (0x7 << 16) /* Source Chunk Transfer Size */ -#define ATC_SCSIZE_1 (0x0 << 16) -#define ATC_SCSIZE_4 (0x1 << 16) -#define ATC_SCSIZE_8 (0x2 << 16) -#define ATC_SCSIZE_16 (0x3 << 16) -#define ATC_SCSIZE_32 (0x4 << 16) -#define ATC_SCSIZE_64 (0x5 << 16) -#define ATC_SCSIZE_128 (0x6 << 16) -#define ATC_SCSIZE_256 (0x7 << 16) -#define ATC_DCSIZE_MASK (0x7 << 20) /* Destination Chunk Transfer Size */ -#define ATC_DCSIZE_1 (0x0 << 20) -#define ATC_DCSIZE_4 (0x1 << 20) -#define ATC_DCSIZE_8 (0x2 << 20) -#define ATC_DCSIZE_16 (0x3 << 20) -#define ATC_DCSIZE_32 (0x4 << 20) -#define ATC_DCSIZE_64 (0x5 << 20) -#define ATC_DCSIZE_128 (0x6 << 20) -#define ATC_DCSIZE_256 (0x7 << 20) 
+/* Chunck Tranfer size definitions are in at_hdmac.h */ #define ATC_SRC_WIDTH_MASK (0x3 << 24) /* Source Single Transfer Size */ +#define ATC_SRC_WIDTH(x) ((x) << 24) #define ATC_SRC_WIDTH_BYTE (0x0 << 24) #define ATC_SRC_WIDTH_HALFWORD (0x1 << 24) #define ATC_SRC_WIDTH_WORD (0x2 << 24) #define ATC_DST_WIDTH_MASK (0x3 << 28) /* Destination Single Transfer Size */ +#define ATC_DST_WIDTH(x) ((x) << 28) #define ATC_DST_WIDTH_BYTE (0x0 << 28) #define ATC_DST_WIDTH_HALFWORD (0x1 << 28) #define ATC_DST_WIDTH_WORD (0x2 << 28) @@ -129,7 +114,8 @@ #define ATC_FC_PER2PER (0x3 << 21) /* Periph-to-Periph (DMA) */ #define ATC_FC_PER2MEM_PER (0x4 << 21) /* Periph-to-Mem (Peripheral) */ #define ATC_FC_MEM2PER_PER (0x5 << 21) /* Mem-to-Periph (Peripheral) */ -#define ATC_FC_PER2PER_PER (0x6 << 21) /* Periph-to-Periph (Src Peripheral) */ +#define ATC_FC_PER2PER_SRCPER (0x6 << 21) /* Periph-to-Periph (Src Peripheral) */ +#define ATC_FC_PER2PER_DSTPER (0x7 << 21) /* Periph-to-Periph (Dst Peripheral) */ #define ATC_SRC_ADDR_MODE_MASK (0x3 << 24) #define ATC_SRC_ADDR_MODE_INCR (0x0 << 24) /* Incrementing Mode */ #define ATC_SRC_ADDR_MODE_DECR (0x1 << 24) /* Decrementing Mode */ @@ -142,27 +128,7 @@ #define ATC_AUTO (0x1 << 31) /* Auto multiple buffer tx enable */ /* Bitfields in CFG */ -#define ATC_SRC_PER(h) (0xFU & (h)) /* Channel src rq associated with periph handshaking ifc h */ -#define ATC_DST_PER(h) ((0xFU & (h)) << 4) /* Channel dst rq associated with periph handshaking ifc h */ -#define ATC_SRC_REP (0x1 << 8) /* Source Replay Mod */ -#define ATC_SRC_H2SEL (0x1 << 9) /* Source Handshaking Mod */ -#define ATC_SRC_H2SEL_SW (0x0 << 9) -#define ATC_SRC_H2SEL_HW (0x1 << 9) -#define ATC_DST_REP (0x1 << 12) /* Destination Replay Mod */ -#define ATC_DST_H2SEL (0x1 << 13) /* Destination Handshaking Mod */ -#define ATC_DST_H2SEL_SW (0x0 << 13) -#define ATC_DST_H2SEL_HW (0x1 << 13) -#define ATC_SOD (0x1 << 16) /* Stop On Done */ -#define ATC_LOCK_IF (0x1 << 20) /* Interface Lock */ 
-#define ATC_LOCK_B (0x1 << 21) /* AHB Bus Lock */ -#define ATC_LOCK_IF_L (0x1 << 22) /* Master Interface Arbiter Lock */ -#define ATC_LOCK_IF_L_CHUNK (0x0 << 22) -#define ATC_LOCK_IF_L_BUFFER (0x1 << 22) -#define ATC_AHB_PROT_MASK (0x7 << 24) /* AHB Protection */ -#define ATC_FIFOCFG_MASK (0x3 << 28) /* FIFO Request Configuration */ -#define ATC_FIFOCFG_LARGESTBURST (0x0 << 28) -#define ATC_FIFOCFG_HALFFIFO (0x1 << 28) -#define ATC_FIFOCFG_ENOUGHSPACE (0x2 << 28) +/* are in at_hdmac.h */ /* Bitfields in SPIP */ #define ATC_SPIP_HOLE(x) (0xFFFFU & (x)) @@ -316,11 +282,12 @@ static void vdbg_dump_regs(struct at_dma_chan *atchan) dma_readl(atdma, CHSR)); dev_err(chan2dev(&atchan->chan_common), - " channel: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + " channel: s0x%x d0x%x ctrl0x%x:0x%x cfg0x%x l0x%x\n", channel_readl(atchan, SADDR), channel_readl(atchan, DADDR), channel_readl(atchan, CTRLA), channel_readl(atchan, CTRLB), + channel_readl(atchan, CFG), channel_readl(atchan, DSCR)); } #else -- cgit v1.2.2 From 3995bd9332a51b626237d6671cfeb7235e6c1305 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Jul 2009 11:13:14 -0700 Subject: iwlwifi: fix TX queue race I had a problem on 4965 hardware (well, probably other hardware too, but others don't survive my stress testing right now, unfortunately) where the driver was sending invalid commands to the device, but no such thing could be seen from the driver's point of view. I could reproduce this fairly easily by sending multiple TCP streams with iperf on different TIDs, though sometimes a single iperf stream was sufficient. It even happened with a single core, but I have forced preemption turned on. The culprit was a queue overrun, where we advanced the queue's write pointer over the read pointer. After careful analysis I've come to the conclusion that the cause is a race condition between iwlwifi and mac80211. mac80211, of course, checks whether the queue is stopped, before transmitting a frame. 
This effectively looks like this: lock(queues) if (stopped(queue)) { unlock(queues) return busy; } unlock(queues) ... <-- this place will be important there is some more code here drv_tx(frame) The driver, on the other hand, can stop and start queues, which does lock(queues) mark_running/stopped(queue) unlock(queues) [if marked running: wake up tasklet to send pending frames] Now, however, once the driver starts the queue, mac80211 can see that and end up at the marked place above, at which point for some reason the driver seems to stop the queue again (I don't understand that) and then we end up transmitting while the queue is actually full. Now, this shouldn't actually matter much, but for some reason I've seen it happen multiple times in a row and the queue actually overflows, at which point the queue bites itself in the tail and things go completely wrong. This patch fixes this by just dropping the packet should this have happened, and making the lock in iwlwifi cover everything so iwlwifi can't race against itself (dropping the lock there might make it more likely, but it did seem to happen without that too). Since we can't hold the lock across drv_tx() above, I see no way to fix this in mac80211, but I also don't understand why I haven't seen this before -- maybe I just never stress tested it this badly. With this patch, the device has survived many minutes of simultaneously sending two iperf streams on different TIDs with combined throughput of about 60 Mbps. Signed-off-by: Johannes Berg Signed-off-by: Reinette Chatre Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwlwifi/iwl-tx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index 9bbeec9427f0..5febb3186365 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -720,8 +720,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) goto drop_unlock; } - spin_unlock_irqrestore(&priv->lock, flags); - hdr_len = ieee80211_hdrlen(fc); /* Find (or create) index into station table for destination station */ @@ -729,7 +727,7 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) if (sta_id == IWL_INVALID_STATION) { IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", hdr->addr1); - goto drop; + goto drop_unlock; } IWL_DEBUG_TX(priv, "station Id %d\n", sta_id); @@ -750,14 +748,17 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) txq_id = priv->stations[sta_id].tid[tid].agg.txq_id; swq_id = iwl_virtual_agg_queue_num(swq_id, txq_id); } - priv->stations[sta_id].tid[tid].tfds_in_queue++; } txq = &priv->txq[txq_id]; q = &txq->q; txq->swq_id = swq_id; - spin_lock_irqsave(&priv->lock, flags); + if (unlikely(iwl_queue_space(q) < q->high_mark)) + goto drop_unlock; + + if (ieee80211_is_data_qos(fc)) + priv->stations[sta_id].tid[tid].tfds_in_queue++; /* Set up driver data for this TFD */ memset(&(txq->txb[q->write_ptr]), 0, sizeof(struct iwl_tx_info)); @@ -902,7 +903,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) drop_unlock: spin_unlock_irqrestore(&priv->lock, flags); -drop: return -1; } EXPORT_SYMBOL(iwl_tx_skb); -- cgit v1.2.2 From 45f5fa32b130b2a59f9b726be45ce7fa73fb834c Mon Sep 17 00:00:00 2001 From: reinette chatre Date: Tue, 21 Jul 2009 09:29:07 -0700 Subject: iwlagn: fix minimum number of queues setting We need to provide a reasonable minimum that will result in a working setup if used. 
Set minimum to be 10 to provide for 4 standard TX queues + 1 command queue + 2 (unused) HCCA queues + 4 HT queues (one per AC). We allow the user to change the number of queues used via a module parameter and use this minimum value to check if it is valid. Without this patch a user can select a value for the number of queues that will result in a failing setup. Signed-off-by: Reinette Chatre Reviewed-by: Tomas Winkler Acked-by: Tomas Winkler Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-3945.h | 2 +- drivers/net/wireless/iwlwifi/iwl-dev.h | 6 ++++-- drivers/net/wireless/iwlwifi/iwl3945-base.c | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.h b/drivers/net/wireless/iwlwifi/iwl-3945.h index fbb3a573463e..2de6471d4be9 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945.h +++ b/drivers/net/wireless/iwlwifi/iwl-3945.h @@ -112,7 +112,7 @@ enum iwl3945_antenna { #define IWL_TX_FIFO_NONE 7 /* Minimum number of queues. MAX_NUM is defined in hw specific files */ -#define IWL_MIN_NUM_QUEUES 4 +#define IWL39_MIN_NUM_QUEUES 4 #define IEEE80211_DATA_LEN 2304 #define IEEE80211_4ADDR_LEN 30 diff --git a/drivers/net/wireless/iwlwifi/iwl-dev.h b/drivers/net/wireless/iwlwifi/iwl-dev.h index e2d620f0b6e8..650e20af20fa 100644 --- a/drivers/net/wireless/iwlwifi/iwl-dev.h +++ b/drivers/net/wireless/iwlwifi/iwl-dev.h @@ -258,8 +258,10 @@ struct iwl_channel_info { #define IWL_TX_FIFO_HCCA_2 6 #define IWL_TX_FIFO_NONE 7 -/* Minimum number of queues. MAX_NUM is defined in hw specific files */ -#define IWL_MIN_NUM_QUEUES 4 +/* Minimum number of queues. MAX_NUM is defined in hw specific files. 
+ * Set the minimum to accommodate the 4 standard TX queues, 1 command + * queue, 2 (unused) HCCA queues, and 4 HT queues (one for each AC) */ +#define IWL_MIN_NUM_QUEUES 10 /* Power management (not Tx power) structures */ diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 956798f2c80c..2f50ab60bfdf 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -4018,10 +4018,10 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e SET_IEEE80211_DEV(hw, &pdev->dev); if ((iwl3945_mod_params.num_of_queues > IWL39_MAX_NUM_QUEUES) || - (iwl3945_mod_params.num_of_queues < IWL_MIN_NUM_QUEUES)) { + (iwl3945_mod_params.num_of_queues < IWL39_MIN_NUM_QUEUES)) { IWL_ERR(priv, "invalid queues_num, should be between %d and %d\n", - IWL_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES); + IWL39_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES); err = -EINVAL; goto out_ieee80211_free_hw; } -- cgit v1.2.2 From 2a21f86917f7a9fe13b180e895a816871a234dee Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 25 Jul 2009 15:22:59 +0300 Subject: wireless: ERR_PTR vs null iwm_wdev_alloc() returns an ERR_PTR on failure and not null. It also prints its own dev_err() message so I removed that as well. Compile tested only. Sorry. Found by smatch (http://repo.or.cz/w/smatch.git). Signed-off-by: Dan Carpenter Acked-by: Zhu Yi Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwmc3200wifi/netdev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/iwmc3200wifi/netdev.c b/drivers/net/wireless/iwmc3200wifi/netdev.c index aea5ccf24ccf..bf294e41753b 100644 --- a/drivers/net/wireless/iwmc3200wifi/netdev.c +++ b/drivers/net/wireless/iwmc3200wifi/netdev.c @@ -106,10 +106,8 @@ void *iwm_if_alloc(int sizeof_bus, struct device *dev, int ret = 0; wdev = iwm_wdev_alloc(sizeof_bus, dev); - if (!wdev) { - dev_err(dev, "no memory for wireless device instance\n"); - return ERR_PTR(-ENOMEM); - } + if (IS_ERR(wdev)) + return wdev; iwm = wdev_to_iwm(wdev); iwm->bus_ops = if_ops; -- cgit v1.2.2 From 3d0ccd021b23c18ea2d399fe4a43c955485c765c Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sat, 25 Jul 2009 23:02:32 +0200 Subject: airo: Buffer overflow SSID_rid has space for only 3 ssids. txPowerLevels[i] is read before the bounds check for i Signed-off-by: Roel Kluin Acked-by: Dan Williams Signed-off-by: John W. 
Linville --- drivers/net/wireless/airo.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index c70604f0329e..8ce5e4cee168 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -5918,20 +5918,19 @@ static int airo_set_essid(struct net_device *dev, readSsidRid(local, &SSID_rid); /* Check if we asked for `any' */ - if(dwrq->flags == 0) { + if (dwrq->flags == 0) { /* Just send an empty SSID list */ memset(&SSID_rid, 0, sizeof(SSID_rid)); } else { - int index = (dwrq->flags & IW_ENCODE_INDEX) - 1; + unsigned index = (dwrq->flags & IW_ENCODE_INDEX) - 1; /* Check the size of the string */ - if(dwrq->length > IW_ESSID_MAX_SIZE) { + if (dwrq->length > IW_ESSID_MAX_SIZE) return -E2BIG ; - } + /* Check if index is valid */ - if((index < 0) || (index >= 4)) { + if (index >= ARRAY_SIZE(SSID_rid.ssids)) return -EINVAL; - } /* Set the SSID */ memset(SSID_rid.ssids[index].ssid, 0, @@ -6819,7 +6818,7 @@ static int airo_set_txpow(struct net_device *dev, return -EINVAL; } clear_bit (FLAG_RADIO_OFF, &local->flags); - for (i = 0; cap_rid.txPowerLevels[i] && (i < 8); i++) + for (i = 0; i < 8 && cap_rid.txPowerLevels[i]; i++) if (v == cap_rid.txPowerLevels[i]) { readConfigRid(local, 1); local->config.txPower = v; -- cgit v1.2.2 From 008749fc9917b799c469478141ddd1a4c81d06ca Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sat, 25 Jul 2009 23:21:22 +0200 Subject: ath9k: Read outside array bounds Incorrect limits leads to reads outside array bounds. Signed-off-by: Roel Kluin Acked-by: Luis R. Rodriguez Signed-off-by: John W. 
Linville --- drivers/net/wireless/ath/ath9k/eeprom.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/ath/ath9k/eeprom.c b/drivers/net/wireless/ath/ath9k/eeprom.c index a2fda702b620..ce0e86c36a82 100644 --- a/drivers/net/wireless/ath/ath9k/eeprom.c +++ b/drivers/net/wireless/ath/ath9k/eeprom.c @@ -460,7 +460,7 @@ static int ath9k_hw_4k_check_eeprom(struct ath_hw *ah) integer = swab32(eep->modalHeader.antCtrlCommon); eep->modalHeader.antCtrlCommon = integer; - for (i = 0; i < AR5416_MAX_CHAINS; i++) { + for (i = 0; i < AR5416_EEP4K_MAX_CHAINS; i++) { integer = swab32(eep->modalHeader.antCtrlChain[i]); eep->modalHeader.antCtrlChain[i] = integer; } @@ -914,7 +914,7 @@ static void ath9k_hw_set_4k_power_per_rate_table(struct ath_hw *ah, ctlMode, numCtlModes, isHt40CtlMode, (pCtlMode[ctlMode] & EXT_ADDITIVE)); - for (i = 0; (i < AR5416_NUM_CTLS) && + for (i = 0; (i < AR5416_EEP4K_NUM_CTLS) && pEepData->ctlIndex[i]; i++) { DPRINTF(ah->ah_sc, ATH_DBG_EEPROM, " LOOP-Ctlidx %d: cfgCtl 0x%2.2x " -- cgit v1.2.2 From 082e708acc50a5b625b9bde0bb1af90dfdbd1942 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sat, 25 Jul 2009 23:34:31 +0200 Subject: iwlwifi: Read outside array bounds tid is bounded (above) by the size of default_tid_to_tx_fifo (17 elements), but the size of priv->stations[].tid[] is MAX_TID_COUNT (9) elements. Signed-off-by: Roel Kluin Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwlwifi/iwl-tx.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index 5febb3186365..2e89040e63be 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -1171,6 +1171,8 @@ int iwl_tx_agg_start(struct iwl_priv *priv, const u8 *ra, u16 tid, u16 *ssn) IWL_ERR(priv, "Start AGG on invalid station\n"); return -ENXIO; } + if (unlikely(tid >= MAX_TID_COUNT)) + return -EINVAL; if (priv->stations[sta_id].tid[tid].agg.state != IWL_AGG_OFF) { IWL_ERR(priv, "Start AGG when state is not IWL_AGG_OFF !\n"); -- cgit v1.2.2 From 7cb7f45c7feef43c8f71f5cfedfc0b19be2142f7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 27 Jul 2009 18:42:38 -0400 Subject: Revert "ACPICA: Remove obsolete acpi_os_validate_address interface" This reverts commit f9ca058430333c9a24c5ca926aa445125f88df18. which caused a regression: http://bugzilla.kernel.org/show_bug.cgi?id=13620 Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acobject.h | 1 + drivers/acpi/acpica/dsopcode.c | 24 ++++++++++++++++++++++++ drivers/acpi/acpica/exfldio.c | 6 ++++++ 3 files changed, 31 insertions(+) (limited to 'drivers') diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h index 544dcf834922..eb6f038b03d9 100644 --- a/drivers/acpi/acpica/acobject.h +++ b/drivers/acpi/acpica/acobject.h @@ -97,6 +97,7 @@ #define AOPOBJ_OBJECT_INITIALIZED 0x08 #define AOPOBJ_SETUP_COMPLETE 0x10 #define AOPOBJ_SINGLE_DATUM 0x20 +#define AOPOBJ_INVALID 0x40 /* Used if host OS won't allow an op_region address */ /****************************************************************************** * diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 584d766e6f12..b79978f7bc71 100644 --- a/drivers/acpi/acpica/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -397,6 +397,30 @@ acpi_status 
acpi_ds_get_region_arguments(union acpi_operand_object *obj_desc) status = acpi_ds_execute_arguments(node, acpi_ns_get_parent_node(node), extra_desc->extra.aml_length, extra_desc->extra.aml_start); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Validate the region address/length via the host OS */ + + status = acpi_os_validate_address(obj_desc->region.space_id, + obj_desc->region.address, + (acpi_size) obj_desc->region.length, + acpi_ut_get_node_name(node)); + + if (ACPI_FAILURE(status)) { + /* + * Invalid address/length. We will emit an error message and mark + * the region as invalid, so that it will cause an additional error if + * it is ever used. Then return AE_OK. + */ + ACPI_EXCEPTION((AE_INFO, status, + "During address validation of OpRegion [%4.4s]", + node->name.ascii)); + obj_desc->common.flags |= AOPOBJ_INVALID; + status = AE_OK; + } + return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index d4075b821021..6687be167f5f 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -113,6 +113,12 @@ acpi_ex_setup_region(union acpi_operand_object *obj_desc, } } + /* Exit if Address/Length have been disallowed by the host OS */ + + if (rgn_desc->common.flags & AOPOBJ_INVALID) { + return_ACPI_STATUS(AE_AML_ILLEGAL_ADDRESS); + } + /* * Exit now for SMBus address space, it has a non-linear address space * and the request cannot be directly validated -- cgit v1.2.2 From 48f5690d45b79ffeedc5ab24243b576056f1d2ff Mon Sep 17 00:00:00 2001 From: unsik Kim Date: Tue, 28 Jul 2009 08:52:06 +0200 Subject: mg_disk: remove prohibited sleep operation mflash's polling driver operates in the standard request_fn_proc context, where sleeping isn't permitted.
Signed-off-by: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index f703f5478246..5b120eab2baa 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -245,8 +245,6 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) mg_dump_status("not ready", status, host); return MG_ERR_INV_STAT; } - if (prv_data->use_polling) - msleep(1); status = inb((unsigned long)host->dev_base + MG_REG_STATUS); } while (time_before(cur_jiffies, expire)); -- cgit v1.2.2 From eb32baec15c38ae6f06cb898a9f791578c5f8c79 Mon Sep 17 00:00:00 2001 From: unsik Kim Date: Tue, 28 Jul 2009 08:52:07 +0200 Subject: mg_disk: fix reading invalid status when use polling driver When using the polling driver, a short delay is required before accessing the status register. Without it, the host might read an invalid status. Signed-off-by: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'drivers') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 5b120eab2baa..6440d5945414 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -219,6 +219,16 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) host->error = MG_ERR_NONE; expire = jiffies + msecs_to_jiffies(msec); + /* These 2 times dummy status read prevents reading invalid + * status. A very little time (3 times of mflash operating clk) + * is required for busy bit is set. Use dummy read instead of + * busy wait, because mflash's PLL is machine dependent.
+ */ + if (prv_data->use_polling) { + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + } + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); do { -- cgit v1.2.2 From 394c6cc63c1d6900ad7498a3221a1d48fc00c4fa Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Jul 2009 08:56:34 +0200 Subject: mg_disk: fix issue with data integrity on error in mg_write() We cannot acknowledge the sector write before checking its status (which is done on the next loop iteration) and we also need to do the final status register check after writing the last sector. Fix mg_write() to match mg_write_intr() in this regard. While at it: - add mg_read_one() and mg_write_one() helpers - always use MG_SECTOR_SIZE and remove MG_STORAGE_BUFFER_SIZE [bart: thanks to Tejun for porting the patch over recent block changes] Cc: unsik Kim Cc: Tejun Heo Signed-off-by: Bartlomiej Zolnierkiewicz =================================================================== Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 89 ++++++++++++++++++++++++++----------------------- 1 file changed, 47 insertions(+), 42 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 6440d5945414..19917d5481bd 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -36,7 +36,6 @@ /* Register offsets */ #define MG_BUFF_OFFSET 0x8000 -#define MG_STORAGE_BUFFER_SIZE 0x200 #define MG_REG_OFFSET 0xC000 #define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ #define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ @@ -477,9 +476,18 @@ static unsigned int mg_out(struct mg_host *host, return MG_ERR_NONE; } +static void mg_read_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)req->buffer; + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + static void 
mg_read(struct request *req) { - u32 j; struct mg_host *host = req->rq_disk->private_data; if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), @@ -490,26 +498,33 @@ static void mg_read(struct request *req) blk_rq_sectors(req), blk_rq_pos(req), req->buffer); do { - u16 *buff = (u16 *)req->buffer; - if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) - *buff++ = inw((unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); + + mg_read_one(host, req); outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } +static void mg_write_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)req->buffer; + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + static void mg_write(struct request *req) { - u32 j; struct mg_host *host = req->rq_disk->private_data; + bool rem; if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), MG_CMD_WR, NULL) != MG_ERR_NONE) { @@ -520,27 +535,37 @@ static void mg_write(struct request *req) MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", blk_rq_sectors(req), blk_rq_pos(req), req->buffer); - do { - u16 *buff = (u16 *)req->buffer; + if (mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } + + mg_write_one(host, req); + + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + do { + if (blk_rq_sectors(req) > 1 && + mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) - outw(*buff++, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); - } while (mg_end_request(host, 0, 
MG_SECTOR_SIZE)); + rem = mg_end_request(host, 0, MG_SECTOR_SIZE); + if (rem) + mg_write_one(host, req); + + outb(MG_CMD_WR_CONF, + (unsigned long)host->dev_base + MG_REG_COMMAND); + } while (rem); } static void mg_read_intr(struct mg_host *host) { struct request *req = host->req; u32 i; - u16 *buff; /* check status */ do { @@ -558,13 +583,7 @@ static void mg_read_intr(struct mg_host *host) return; ok_to_read: - /* get current segment of request */ - buff = (u16 *)req->buffer; - - /* read 1 sector */ - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); + mg_read_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); @@ -583,8 +602,7 @@ ok_to_read: static void mg_write_intr(struct mg_host *host) { struct request *req = host->req; - u32 i, j; - u16 *buff; + u32 i; bool rem; /* check status */ @@ -605,12 +623,7 @@ static void mg_write_intr(struct mg_host *host) ok_to_write: if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { /* write 1 sector and set handler if remains */ - buff = (u16 *)req->buffer; - for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) { - outw(*buff, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - buff++; - } + mg_write_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", blk_rq_pos(req), blk_rq_sectors(req), req->buffer); host->mg_do_intr = mg_write_intr; @@ -675,9 +688,6 @@ static unsigned int mg_issue_req(struct request *req, unsigned int sect_num, unsigned int sect_cnt) { - u16 *buff; - u32 i; - switch (rq_data_dir(req)) { case READ: if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) @@ -701,12 +711,7 @@ static unsigned int mg_issue_req(struct request *req, mg_bad_rw_intr(host); return host->error; } - buff = (u16 *)req->buffer; - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) { - outw(*buff, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (i << 1)); - buff++; - } + 
mg_write_one(host, req); mod_timer(&host->timer, jiffies + 3 * HZ); outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); -- cgit v1.2.2 From a85a00a699740f6f9863f88aef22060fe1534681 Mon Sep 17 00:00:00 2001 From: unsik Kim Date: Tue, 28 Jul 2009 08:57:33 +0200 Subject: mg_disk: Add missing ready status check on mg_write() After the last sector is written, the ready bit of the status register should be checked. Signed-off-by: unsik Kim Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'drivers') diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 19917d5481bd..6d7fbaa92248 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -524,16 +524,16 @@ static void mg_write_one(struct mg_host *host, struct request *req) static void mg_write(struct request *req) { struct mg_host *host = req->rq_disk->private_data; - bool rem; + unsigned int rem = blk_rq_sectors(req); - if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), + if (mg_out(host, blk_rq_pos(req), rem, MG_CMD_WR, NULL) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), req->buffer); + rem, blk_rq_pos(req), req->buffer); if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { @@ -541,25 +541,23 @@ static void mg_write(struct request *req) return; } - mg_write_one(host, req); + do { + mg_write_one(host, req); - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); - do { - if (blk_rq_sectors(req) > 1 && - mg_wait(host, ATA_DRQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + rem--; + if (rem > 1 && mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } else if (mg_wait(host, MG_STAT_READY, + MG_TMAX_WAIT_WR_DRQ)
!= MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - - rem = mg_end_request(host, 0, MG_SECTOR_SIZE); - if (rem) - mg_write_one(host, req); - - outb(MG_CMD_WR_CONF, - (unsigned long)host->dev_base + MG_REG_COMMAND); - } while (rem); + } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } static void mg_read_intr(struct mg_host *host) -- cgit v1.2.2 From 430453fc2a5f3f2c1d98ebc3c3d4c54f3060e3c3 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 28 Jul 2009 09:59:47 +0200 Subject: libertas: Read outside array bounds reads bss->rates[j] before checking bounds of index, and should use ARRAY_SIZE to determine the size of the array. Signed-off-by: Roel Kluin Acked-by: Holger Schurig Acked-by: Dan Williams Signed-off-by: John W. Linville --- drivers/net/wireless/libertas/scan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c index 601b54249677..6c95af3023cc 100644 --- a/drivers/net/wireless/libertas/scan.c +++ b/drivers/net/wireless/libertas/scan.c @@ -5,6 +5,7 @@ * for sending scan commands to the firmware. */ #include +#include #include #include #include @@ -876,7 +877,7 @@ static inline char *lbs_translate_scan(struct lbs_private *priv, iwe.u.bitrate.disabled = 0; iwe.u.bitrate.value = 0; - for (j = 0; bss->rates[j] && (j < sizeof(bss->rates)); j++) { + for (j = 0; j < ARRAY_SIZE(bss->rates) && bss->rates[j]; j++) { /* Bit rate given in 500 kb/s units */ iwe.u.bitrate.value = bss->rates[j] * 500000; current_val = iwe_stream_add_value(info, start, current_val, -- cgit v1.2.2 From 57921c312e8cef72ba35a4cfe870b376da0b1b87 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Tue, 28 Jul 2009 12:05:00 +0200 Subject: libertas: Read buffer overflow Several arrays were read before checking whether the index was within bounds. ARRAY_SIZE() should be used to determine the size of arrays. 
rates->rates has an arraysize of 1, so calling get_common_rates() with a rates_size of MAX_RATES (14) was causing reads out of bounds. tmp_size can increment at most to (ARRAY_SIZE(lbs_bg_rates) - 1) * (*rates_size - 1), so that should be the number of elements of tmp[]. A goto can be eliminated: ret was already set upon its declaration. Signed-off-by: Roel Kluin Signed-off-by: John W. Linville --- drivers/net/wireless/libertas/assoc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/libertas/assoc.c b/drivers/net/wireless/libertas/assoc.c index b9b374119033..d6997371c27e 100644 --- a/drivers/net/wireless/libertas/assoc.c +++ b/drivers/net/wireless/libertas/assoc.c @@ -1,6 +1,7 @@ /* Copyright (C) 2006, Red Hat, Inc. */ #include +#include #include #include #include @@ -43,21 +44,21 @@ static int get_common_rates(struct lbs_private *priv, u16 *rates_size) { u8 *card_rates = lbs_bg_rates; - size_t num_card_rates = sizeof(lbs_bg_rates); int ret = 0, i, j; - u8 tmp[30]; + u8 tmp[(ARRAY_SIZE(lbs_bg_rates) - 1) * (*rates_size - 1)]; size_t tmp_size = 0; /* For each rate in card_rates that exists in rate1, copy to tmp */ - for (i = 0; card_rates[i] && (i < num_card_rates); i++) { - for (j = 0; rates[j] && (j < *rates_size); j++) { + for (i = 0; i < ARRAY_SIZE(lbs_bg_rates) && card_rates[i]; i++) { + for (j = 0; j < *rates_size && rates[j]; j++) { if (rates[j] == card_rates[i]) tmp[tmp_size++] = card_rates[i]; } } lbs_deb_hex(LBS_DEB_JOIN, "AP rates ", rates, *rates_size); - lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates, num_card_rates); + lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates, + ARRAY_SIZE(lbs_bg_rates)); lbs_deb_hex(LBS_DEB_JOIN, "common rates", tmp, tmp_size); lbs_deb_join("TX data rate 0x%02x\n", priv->cur_rate); @@ -69,10 +70,7 @@ static int get_common_rates(struct lbs_private *priv, lbs_pr_alert("Previously set fixed data rate %#x isn't " "compatible with the 
network.\n", priv->cur_rate); ret = -1; - goto done; } - ret = 0; - done: memset(rates, 0, *rates_size); *rates_size = min_t(int, tmp_size, *rates_size); @@ -322,7 +320,7 @@ static int lbs_associate(struct lbs_private *priv, rates = (struct mrvl_ie_rates_param_set *) pos; rates->header.type = cpu_to_le16(TLV_TYPE_RATES); memcpy(&rates->rates, &bss->rates, MAX_RATES); - tmplen = MAX_RATES; + tmplen = min_t(u16, ARRAY_SIZE(rates->rates), MAX_RATES); if (get_common_rates(priv, rates->rates, &tmplen)) { ret = -1; goto done; @@ -598,7 +596,7 @@ static int lbs_adhoc_join(struct lbs_private *priv, /* Copy Data rates from the rates recorded in scan response */ memset(cmd.bss.rates, 0, sizeof(cmd.bss.rates)); - ratesize = min_t(u16, sizeof(cmd.bss.rates), MAX_RATES); + ratesize = min_t(u16, ARRAY_SIZE(cmd.bss.rates), MAX_RATES); memcpy(cmd.bss.rates, bss->rates, ratesize); if (get_common_rates(priv, cmd.bss.rates, &ratesize)) { lbs_deb_join("ADHOC_JOIN: get_common_rates returned error.\n"); -- cgit v1.2.2 From 6e900de3fff01e84c96632409359a84825c54b28 Mon Sep 17 00:00:00 2001 From: Mark Ware Date: Mon, 20 Jul 2009 21:51:03 +1000 Subject: cpm_uart: Don't use alloc_bootmem in cpm_uart_cpm2.c This is another alloc_bootmem() -> kzalloc() change, this time to fix the non-fatal badness caused when booting with a cpm2_uart console. 
Signed-off-by: Mark Ware Signed-off-by: Kumar Gala --- drivers/serial/cpm_uart/cpm_uart_cpm2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c index 141c0a3333ad..a9802e76b5fa 100644 --- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c +++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c @@ -132,7 +132,7 @@ int cpm_uart_allocbuf(struct uart_cpm_port *pinfo, unsigned int is_con) memsz = L1_CACHE_ALIGN(pinfo->rx_nrfifos * pinfo->rx_fifosize) + L1_CACHE_ALIGN(pinfo->tx_nrfifos * pinfo->tx_fifosize); if (is_con) { - mem_addr = alloc_bootmem(memsz); + mem_addr = kzalloc(memsz, GFP_NOWAIT); dma_addr = virt_to_bus(mem_addr); } else -- cgit v1.2.2 From f294526279cda8934b0313ebd02184a16ba888c9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 19 Jul 2009 14:46:09 +0300 Subject: lguest: dereferencing freed mem in add_eventfd() "new" was freed and then dereferenced. Also the return value wasn't being used so I modified the caller as well. Compile tested only. Found by smatch (http://repo.or.cz/w/smatch.git). 
regards, dan carpenter Signed-off-by: Dan Carpenter Signed-off-by: Rusty Russell --- drivers/lguest/lguest_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 9f9a2953b383..407722a8e0c4 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -52,8 +52,9 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) new->map[new->num].addr = addr; new->map[new->num].event = eventfd_ctx_fdget(fd); if (IS_ERR(new->map[new->num].event)) { + int err = PTR_ERR(new->map[new->num].event); kfree(new); - return PTR_ERR(new->map[new->num].event); + return err; } new->num++; @@ -83,7 +84,7 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); - return 0; + return err; } /*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt -- cgit v1.2.2 From ff52c3fc7188855ede75d87b022271f0da309e5b Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 23 Jul 2009 14:57:37 +0300 Subject: virtio: fix memory leak on device removal Make vp_free_vectors do the reverse of vq_request_vectors. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) (limited to 'drivers') diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index bcec78ffc765..ca40517ef9c2 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -258,7 +258,6 @@ static void vp_free_vectors(struct virtio_device *vdev) for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(vp_dev->msix_entries[i].vector, vp_dev); - vp_dev->msix_used_vectors = 0; if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ @@ -267,9 +266,16 @@ static void vp_free_vectors(struct virtio_device *vdev) /* Flush the write out to device */ ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - vp_dev->msix_enabled = 0; pci_disable_msix(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + vp_dev->msix_vectors = 0; } + + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_entries); + vp_dev->msix_entries = NULL; } static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, @@ -297,11 +303,11 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); if (!vp_dev->msix_entries) - goto error_entries; + goto error; vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, GFP_KERNEL); if (!vp_dev->msix_names) - goto error_names; + goto error; for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; @@ -314,7 +320,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, IRQF_SHARED, name, vp_dev); if (err) - goto error_irq; + goto error; vp_dev->intx_enabled = 1; } else { vp_dev->msix_vectors = err; @@ -328,7 +334,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) 
vp_config_changed, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); @@ -336,7 +342,7 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); if (v == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; - goto error_irq; + goto error; } } @@ -349,16 +355,12 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; } return 0; -error_irq: +error: vp_free_vectors(vdev); - kfree(vp_dev->msix_names); -error_names: - kfree(vp_dev->msix_entries); -error_entries: return err; } -- cgit v1.2.2 From f6c82507030d61e15928d5cad946d3eac1c4a384 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 15:48:01 +0300 Subject: virtio: delete vq from list This makes delete vq the reverse of find vq. This is required to make it possible to retry find_vqs after a failure, otherwise the list gets corrupted. Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index ca40517ef9c2..a1cb1a1c6522 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -464,7 +464,11 @@ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long size; + unsigned long flags, size; + + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); -- cgit v1.2.2 From e969fed542cae08cb11d666efac4f7c5d624d09f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 15:48:08 +0300 Subject: virtio: refactor find_vqs This refactors find_vqs, making it more readable and robust, and fixing two regressions from 2.6.30: - double free_irq causing BUG_ON on device removal - probe failure when vq can't be assigned to msi-x vector (reported on old host kernels) Tested-by: Amit Shah Signed-off-by: Michael S. 
Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_pci.c | 212 +++++++++++++++++++++++++------------------- 1 file changed, 119 insertions(+), 93 deletions(-) (limited to 'drivers') diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index a1cb1a1c6522..248e00ec4dc1 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,8 +52,10 @@ struct virtio_pci_device char (*msix_names)[256]; /* Number of available vectors */ unsigned msix_vectors; - /* Vectors allocated */ + /* Vectors allocated, excluding per-vq vectors if any */ unsigned msix_used_vectors; + /* Whether we have vector per vq */ + bool per_vq_vectors; }; /* Constants for MSI-X */ @@ -278,27 +280,24 @@ static void vp_free_vectors(struct virtio_device *vdev) vp_dev->msix_entries = NULL; } -static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int *options, int noptions) -{ - int i; - for (i = 0; i < noptions; ++i) - if (!pci_enable_msix(dev, entries, options[i])) - return options[i]; - return -EBUSY; -} - -static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) +static int vp_request_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned i, v; int err = -ENOMEM; - /* We want at most one vector per queue and one for config changes. - * Fallback to separate vectors for config and a shared for queues. - * Finally fall back to regular interrupts. 
*/ - int options[] = { max_vqs + 1, 2 }; - int nvectors = max(options[0], options[1]); + + if (!nvectors) { + /* Can't allocate MSI-X vectors, use regular interrupt */ + vp_dev->msix_vectors = 0; + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, name, vp_dev); + if (err) + return err; + vp_dev->intx_enabled = 1; + return 0; + } vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); @@ -312,41 +311,34 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, - options, ARRAY_SIZE(options)); - if (err < 0) { - /* Can't allocate enough MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - goto error; - vp_dev->intx_enabled = 1; - } else { - vp_dev->msix_vectors = err; - vp_dev->msix_enabled = 1; - - /* Set the vector used for configuration */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-config", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_config_changed, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error; - ++vp_dev->msix_used_vectors; + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); + if (err > 0) + err = -ENOSPC; + if (err) + goto error; + vp_dev->msix_vectors = nvectors; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = 
ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto error; - } + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; } - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { + if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, @@ -366,13 +358,14 @@ error: static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, + u16 vector) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num, vector; + u16 num; int err; /* Select the queue we're interested in */ @@ -391,7 +384,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = VIRTIO_MSI_NO_VECTOR; + info->vector = vector; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -415,22 +408,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - /* allocate per-vq vector if available and necessary */ - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { - vector = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), name); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vq); - if (err) - goto out_request_irq; - info->vector = vector; - ++vp_dev->msix_used_vectors; - } else - vector = VP_MSIX_VQ_VECTOR; - - 
if (callback && vp_dev->msix_enabled) { + if (vector != VIRTIO_MSI_NO_VECTOR) { iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (vector == VIRTIO_MSI_NO_VECTOR) { @@ -446,11 +424,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, return vq; out_assign: - if (info->vector != VIRTIO_MSI_NO_VECTOR) { - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - --vp_dev->msix_used_vectors; - } -out_request_irq: vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); @@ -472,9 +445,6 @@ static void vp_del_vq(struct virtqueue *vq) iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); - if (info->vector != VIRTIO_MSI_NO_VECTOR) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); @@ -495,36 +465,62 @@ static void vp_del_vq(struct virtqueue *vq) /* the config->del_vqs() implementation */ static void vp_del_vqs(struct virtio_device *vdev) { + struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq, *n; + struct virtio_pci_vq_info *info; - list_for_each_entry_safe(vq, n, &vdev->vqs, list) + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + info = vq->priv; + if (vp_dev->per_vq_vectors) + free_irq(vp_dev->msix_entries[info->vector].vector, vq); vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; vp_free_vectors(vdev); } -/* the config->find_vqs() implementation */ -static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) +static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[], + int nvectors, + bool per_vq_vectors) { - int vectors = 0; - int i, err; - - /* How many vectors would we like? 
*/ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 vector; + int i, err, allocated_vectors; - err = vp_request_vectors(vdev, vectors); + err = vp_request_vectors(vdev, nvectors, per_vq_vectors); if (err) goto error_request; + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) + if (!callbacks[i] || !vp_dev->msix_enabled) + vector = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + vector = allocated_vectors++; + else + vector = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); goto error_find; + } + /* allocate per-vq irq if available and necessary */ + if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, + "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(vp_dev->msix_entries[vector].vector, + vring_interrupt, 0, + vp_dev->msix_names[vector], vqs[i]); + if (err) { + vp_del_vq(vqs[i]); + goto error_find; + } + } } return 0; @@ -532,7 +528,37 @@ error_find: vp_del_vqs(vdev); error_request: - return PTR_ERR(vqs[i]); + return err; +} + +/* the config->find_vqs() implementation */ +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int vectors = 0; + int i, uninitialized_var(err); + + /* How many vectors would we like? */ + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++vectors; + + /* We want at most one vector per queue and one for config changes. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + vectors + 1, true); + if (!err) + return 0; + /* Fallback to separate vectors for config and a shared for queues. 
*/ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 2, false); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 0, false); + return err; } static struct virtio_config_ops virtio_pci_config_ops = { -- cgit v1.2.2 From 2e04ef76916d1e29a077ea9d0f2003c8fd86724d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: fix comment style I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- drivers/lguest/core.c | 114 +++++---- drivers/lguest/hypercalls.c | 141 +++++++---- drivers/lguest/interrupts_and_traps.c | 288 +++++++++++++++-------- drivers/lguest/lg.h | 23 +- drivers/lguest/lguest_device.c | 150 +++++++----- drivers/lguest/lguest_user.c | 137 +++++++---- drivers/lguest/page_tables.c | 427 ++++++++++++++++++++++------------ drivers/lguest/segments.c | 106 ++++++--- drivers/lguest/x86/core.c | 372 +++++++++++++++++++---------- drivers/lguest/x86/switcher_32.S | 18 +- 10 files changed, 1168 insertions(+), 608 deletions(-) (limited to 'drivers') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9b8ebf..cd058bc903ff 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include #include #include @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. 
*/ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. + */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. 
+ */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from _switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. 
*/ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. */ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,8 +215,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher, in which case we return from the read() now. 
+ */ if (cpu->pending_notify) { if (!send_notify_to_eventfd(cpu)) { if (put_user(cpu->pending_notify, user)) @@ -209,29 +231,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +271,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. 
First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +361,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa19cb74..787ab4bc09f0 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. 
+ */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in four + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. 
+ */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. 
+ */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,40 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". + */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. 
+ */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. +:*/ /*H:100 * Hypercalls @@ -229,17 +264,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. 
* Normally it doesn't matter: the Guest will run again and @@ -248,13 +288,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d507..18648180db02 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. * There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include #include #include @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. 
+ */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. 
+ */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. + */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. 
+ */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". 
+ */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). + */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. 
+ */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. 
+ */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? */ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). 
@@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. + */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. 
The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. 
+ */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. + */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. 
+ */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. 
+ */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. 
+ */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c591923793..74c0db691b53 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -54,13 +54,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. 
*/ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -96,8 +96,11 @@ struct lguest unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +125,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +145,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. 
+ */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cdac88b4..cc000e79c3d1 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include #include #include @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. 
+ */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. + */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. 
*/ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -200,13 +216,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. 
+ */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +235,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the first virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -225,7 +246,8 @@ extern void lguest_setup_irq(unsigned int irq); * simpler for the Host to simply tell us where the pages are. * * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + * function. + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +266,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +285,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. 
+ */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +299,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +389,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. They were removed * early on because they were never used. 
@@ -371,14 +405,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -390,15 +424,19 @@ static void add_lguest_device(struct lguest_device_desc *d, ldev->vdev.dev.parent = lguest_root; /* We have a unique device index thanks to the dev_index counter. */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". 
*/ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. * * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". 
+ */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 407722a8e0c4..7e92017103dc 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 + * This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will * tell us the Guest's memory layout, pagetable, entry point and kernel address * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * or the Guest doing a NOTIFY out to the Launcher. +:*/ #include #include #include @@ -37,8 +39,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -61,8 +65,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) /* Now put new one in place. */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then delete it. + */ synchronize_rcu(); kfree(old); @@ -87,8 +93,10 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. 
+ */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -102,8 +110,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -139,8 +149,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -148,8 +160,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. + */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -164,8 +178,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. 
+ */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -173,29 +189,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. 
* @@ -207,14 +232,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -249,8 +275,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. */ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -275,7 +303,8 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use * writes of other values to send interrupts. @@ -283,12 +312,15 @@ unlock: * Note that we overload the "offset" in the /dev/lguest file to indicate what * CPU number we're dealing with. Currently this is always 0, since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. 
+ */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -323,13 +355,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -339,8 +373,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -351,8 +387,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. 
+ */ mmput(lg->cpus[i].mm); } @@ -361,8 +399,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -386,7 +426,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -394,8 +435,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1abda240..3da902e4b4cb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. 
* The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,10 +19,12 @@ #include #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code @@ -45,16 +49,19 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is +/* + * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ + * page. + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,13 +71,16 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. 
Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. + */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it +/*H:320 + * The page table code is curly enough to need helper functions to keep it * clear and clean. * * There are two functions which return pointers to the shadow (aka "real") @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. 
*/ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These two functions just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -175,17 +192,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. + */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +219,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. 
The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +263,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. 
*/ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +304,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -298,22 +330,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +360,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. 
*/ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,17 +372,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif gpte = lgread(cpu, gpte_ptr, pte_t); @@ -353,8 +397,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). 
+ */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +408,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +421,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. + * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. 
*/ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +467,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,16 +488,20 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) @@ -485,9 +549,11 @@ static void release_pgd(pgd_t *spgd) /* If the entry's not present, there's nothing to release. 
*/ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +565,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +579,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +622,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. 
*/ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +636,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +650,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +664,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +677,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! 
+ */ *blank_pgdir = 1; } #else @@ -615,19 +696,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +722,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +743,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. 
+ */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +756,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +769,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +810,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. 
*/ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +848,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. 
+ */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -802,12 +905,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/* + * Once we know how much memory we have we can construct simple identity (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +930,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. + */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -839,8 +946,10 @@ static unsigned long setup_pagetables(struct lguest *lg, #ifdef CONFIG_X86_PAE pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,8 +957,10 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ + /* + * The top level points to the linear page table pages above. 
+ * We setup the identity and linear mappings here. + */ #ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { @@ -880,15 +991,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: remember where + * this is. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,14 +1013,18 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; @@ -931,17 +1050,21 @@ void page_table_guest_data_init(struct lg_cpu *cpu) /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. 
*/ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. + */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1087,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -990,20 +1115,24 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. 
When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1148,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1174,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over.
* * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1192,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a18750..951c57b0a7e0 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. 
The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. + */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. 
+ */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. 
*/ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thead-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. */ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. 
+ */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309a..96f7d88ec7f8 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). 
* * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include #include #include @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". 
*/ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. 
+ */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore @@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * * We could also try using this hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. 
It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. 
Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. 
+ */ switch (insn & 0xFE) { case 0xE4: /* in ,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. 
* The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. 
We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? 
cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. 
*/ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. 
+ */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. 
@@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. 
+ */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. 
We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc15318a80f..6dec09793836 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 astride both the * Host and Guest to do the low-level Guest<->Host switch. It is as simple as * it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? 
- * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! -- cgit v1.2.2 From a91d74a3c4de8115295ee87350c13a329164aaaf Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: update commentry Every so often, after code shuffles, I need to go through and unbitrot the Lguest Journey (see drivers/lguest/README). Since we now use RCU in a simple form in one place I took the opportunity to expand that explanation. Signed-off-by: Rusty Russell Cc: Ingo Molnar Cc: Paul McKenney --- drivers/lguest/core.c | 7 ++- drivers/lguest/hypercalls.c | 6 ++- drivers/lguest/lguest_device.c | 11 +++-- drivers/lguest/lguest_user.c | 100 +++++++++++++++++++++++++++++++++++---- drivers/lguest/page_tables.c | 84 ++++++++++++++++++++++++-------- drivers/lguest/x86/core.c | 2 +- drivers/lguest/x86/switcher_32.S | 6 +-- 7 files changed, 176 insertions(+), 40 deletions(-) (limited to 'drivers') diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index cd058bc903ff..1e2cb846b3c9 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -217,10 +217,15 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) /* * It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. + * Launcher. */ if (cpu->pending_notify) { + /* + * Does it just need to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Launcher.
*/ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index 787ab4bc09f0..83511eb0923d 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -59,7 +59,7 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) case LHCALL_SHUTDOWN: { char msg[128]; /* - * Shutdown is such a trivial hypercall that we do it in four + * Shutdown is such a trivial hypercall that we do it in five * lines right here. * * If the lgread fails, it will call kill_guest() itself; the @@ -245,6 +245,10 @@ static void initialize(struct lg_cpu *cpu) * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about copy-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. :*/ /*H:100 diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index cc000e79c3d1..1401c1ace1ec 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -236,7 +236,7 @@ static void lg_notify(struct virtqueue *vq) extern void lguest_setup_irq(unsigned int irq); /* - * This routine finds the first virtqueue described in the configuration of + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -244,9 +244,6 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are.
- * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, @@ -422,7 +419,11 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. + */ ldev->vdev.id.device = d->type; /* * We have a simple set of routines for querying the device's diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 7e92017103dc..b4d3f7ca554f 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,9 +1,8 @@ -/*P:200 - * This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. :*/ #include #include @@ -13,14 +12,41 @@ #include #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. 
+ */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -28,14 +54,43 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. 
+ * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. + * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; @@ -62,12 +117,20 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); /* * We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. + * version, then free it. */ synchronize_rcu(); kfree(old); @@ -75,6 +138,14 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. 
The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -86,6 +157,11 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); @@ -106,6 +182,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } @@ -307,10 +387,10 @@ unlock: * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support * here. 
*/ diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index 3da902e4b4cb..a8d0aee3bc0e 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -29,10 +29,10 @@ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -52,9 +52,8 @@ :*/ /* - * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) @@ -81,7 +80,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); /*H:320 * The page table code is curly enough to need helper functions to keep it - * clear and clean. + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -155,7 +155,7 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } /* - * These two functions just like the above two, except they access the Guest + * These functions are just like the above two, except they access the Guest * page tables. Hence they return a Guest address. 
*/ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) @@ -165,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -172,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -181,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -314,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -391,6 +395,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ @@ -507,6 +513,7 @@ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -543,7 +550,11 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. 
+ */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ @@ -898,17 +909,21 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* - * Once we know how much memory we have we can construct simple identity (which +/*H:505 + * To get through boot, we construct simple identity page mappings (which * set virtual == physical) and linear mappings which will get the Guest far - * enough into the boot to create its own. + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to * know its size here). @@ -944,6 +959,10 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif /* @@ -957,13 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } +#ifdef CONFIG_X86_PAE /* - * The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). */ -#ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. 
*/ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -971,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. + */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -992,8 +1030,8 @@ static unsigned long setup_pagetables(struct lguest *lg, #endif /* - * We return the top level (guest-physical) address: remember where - * this is. + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. */ return (unsigned long)pgdir - mem_base; } @@ -1031,7 +1069,9 @@ int init_guest_pagetable(struct lguest *lg) lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. 
*/ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -1040,11 +1080,13 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ @@ -1105,12 +1147,16 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index 96f7d88ec7f8..6ae388849a3b 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -187,7 +187,7 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * * The hooks were designed for KVM, but we can also put them to good use. 
:*/ diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 6dec09793836..40634b0db9f7 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,7 +1,7 @@ /*P:900 - * This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage -- cgit v1.2.2 From 1842f23c05b6a866be831aa60bc8a8731c58ddd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:46 -0600 Subject: lguest and virtio: cleanup struct definitions to Linux style. I've been doing this for years, and akpm picked me up on it about 12 months ago. lguest partly serves as example code, so let's do it Right. Also, remove two unused fields in struct vblk_info in the example launcher. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- drivers/lguest/lg.h | 9 +++------ drivers/lguest/lguest_device.c | 3 +-- 2 files changed, 4 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 74c0db691b53..bc28745d05af 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,15 +16,13 @@ void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -struct pgdir -{ +struct pgdir { unsigned long gpgdir; pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. 
*/ -struct lguest_pages -{ +struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; @@ -89,8 +87,7 @@ struct lg_eventfd_map { }; /* The private info the thread maintains about the guest. */ -struct lguest -{ +struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1401c1ace1ec..b6200bc39b58 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -207,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev) */ /*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info -{ +struct lguest_vq_info { /* A copy of the information contained in the device config. */ struct lguest_vqconfig config; -- cgit v1.2.2 From dfb3cf00e402686f671db697adbd8b9f4c219268 Mon Sep 17 00:00:00 2001 From: Swen Schillig Date: Mon, 13 Jul 2009 15:06:02 +0200 Subject: [SCSI] zfcp: Fix invalid command order We should not modify the port status after triggering an ERP action for the port. It is not guaranteed which status is finally active when the ERP action is performed. This can lead to situations which are unwanted and hard to debug in case of a failure. 
Signed-off-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fsf.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index c57658f3d34f..bbb7ef0b052d 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -1731,15 +1731,16 @@ static void zfcp_fsf_close_physical_port_handler(struct zfcp_fsf_req *req) zfcp_fsf_access_denied_port(req, port); break; case FSF_PORT_BOXED: - zfcp_erp_port_boxed(port, "fscpph2", req); - req->status |= ZFCP_STATUS_FSFREQ_ERROR | - ZFCP_STATUS_FSFREQ_RETRY; /* can't use generic zfcp_erp_modify_port_status because * ZFCP_STATUS_COMMON_OPEN must not be reset for the port */ atomic_clear_mask(ZFCP_STATUS_PORT_PHYS_OPEN, &port->status); list_for_each_entry(unit, &port->unit_list_head, list) atomic_clear_mask(ZFCP_STATUS_COMMON_OPEN, &unit->status); + zfcp_erp_port_boxed(port, "fscpph2", req); + req->status |= ZFCP_STATUS_FSFREQ_ERROR | + ZFCP_STATUS_FSFREQ_RETRY; + break; case FSF_ADAPTER_STATUS_AVAILABLE: switch (header->fsf_status_qual.word[0]) { -- cgit v1.2.2 From acf7b86150701de105aa8307b4b3f9dc533c45bb Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:03 +0200 Subject: [SCSI] zfcp: Acquire qdio_stat_lock when reading the queue utilization req_q_util is not atomic, so the qdio_stat_lock must be held when reading this variable. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_sysfs.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c index 3e51e64d1108..0fe5cce818cb 100644 --- a/drivers/s390/scsi/zfcp_sysfs.c +++ b/drivers/s390/scsi/zfcp_sysfs.c @@ -494,9 +494,14 @@ static ssize_t zfcp_sysfs_adapter_q_full_show(struct device *dev, struct Scsi_Host *scsi_host = class_to_shost(dev); struct zfcp_adapter *adapter = (struct zfcp_adapter *) scsi_host->hostdata[0]; + u64 util; + + spin_lock_bh(&adapter->qdio_stat_lock); + util = adapter->req_q_util; + spin_unlock_bh(&adapter->qdio_stat_lock); return sprintf(buf, "%d %llu\n", atomic_read(&adapter->qdio_outb_full), - (unsigned long long)adapter->req_q_util); + (unsigned long long)util); } static DEVICE_ATTR(queue_full, S_IRUGO, zfcp_sysfs_adapter_q_full_show, NULL); -- cgit v1.2.2 From 1e9b16430ff4fd09408a74342d6b8338228e2f70 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:04 +0200 Subject: [SCSI] zfcp: Return -ENOMEM for allocation failures in zfcp_fsf When a fsf_req or a qtcb cannot be allocated return -ENOMEM instead of -EIO. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fsf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index bbb7ef0b052d..c4eb62b32a32 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -722,7 +722,7 @@ static struct zfcp_fsf_req *zfcp_fsf_req_create(struct zfcp_adapter *adapter, req = zfcp_fsf_alloc_qtcb(pool); if (unlikely(!req)) - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); if (adapter->req_no == 0) adapter->req_no++; -- cgit v1.2.2 From 688a1820bde27749f22b18b94ef1c9bc179b1b29 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:05 +0200 Subject: [SCSI] zfcp: Use correct flags for zfcp_erp_notify zfcp_erp_notify uses the ZFCP_ERP_STATUS_* flags, so it is ZFCP_STATUS_ERP_LOWMEM instead of ZFCP_ERP_NOMEM. Signalling ZFCP_ERP_FAILED is not necessary, the missing d_id will show that the nameserver did not return the d_id. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_erp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 8030e25152fb..e7d7ef55e37d 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -854,10 +854,10 @@ void zfcp_erp_port_strategy_open_lookup(struct work_struct *work) retval = zfcp_fc_ns_gid_pn(&port->erp_action); if (retval == -ENOMEM) - zfcp_erp_notify(&port->erp_action, ZFCP_ERP_NOMEM); + zfcp_erp_notify(&port->erp_action, ZFCP_STATUS_ERP_LOWMEM); port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; if (retval) - zfcp_erp_notify(&port->erp_action, ZFCP_ERP_FAILED); + zfcp_erp_notify(&port->erp_action, 0); zfcp_port_put(port); } -- cgit v1.2.2 From 426f6059b0eb66cec139f4b9066168ab72b85774 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:06 +0200 Subject: [SCSI] zfcp: Use unchained mode for small ct and els requests The ELS ADISC and the GID_PN requests sent from zfcp fit into unchained FSF requests. Change the FSF allocation logic to use unchained requests whenever possible where everything fits in one SBAL. This avoids acquiring more SBALs than necessary, especially during zfcp recovery when things might be stalled. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fsf.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index c4eb62b32a32..bec912547fb8 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -1010,6 +1010,23 @@ skip_fsfstatus: send_ct->handler(send_ct->handler_data); } +static void zfcp_fsf_setup_ct_els_unchained(struct qdio_buffer_element *sbale, + struct scatterlist *sg_req, + struct scatterlist *sg_resp) +{ + sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ; + sbale[2].addr = sg_virt(sg_req); + sbale[2].length = sg_req->length; + sbale[3].addr = sg_virt(sg_resp); + sbale[3].length = sg_resp->length; + sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY; +} + +static int zfcp_fsf_one_sbal(struct scatterlist *sg) +{ + return sg_is_last(sg) && sg->length <= PAGE_SIZE; +} + static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, struct scatterlist *sg_req, struct scatterlist *sg_resp, @@ -1020,16 +1037,16 @@ static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, int bytes; if (!(feat & FSF_FEATURE_ELS_CT_CHAINED_SBALS)) { - if (sg_req->length > PAGE_SIZE || sg_resp->length > PAGE_SIZE || - !sg_is_last(sg_req) || !sg_is_last(sg_resp)) + if (!zfcp_fsf_one_sbal(sg_req) || !zfcp_fsf_one_sbal(sg_resp)) return -EOPNOTSUPP; - sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ; - sbale[2].addr = sg_virt(sg_req); - sbale[2].length = sg_req->length; - sbale[3].addr = sg_virt(sg_resp); - sbale[3].length = sg_resp->length; - sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY; + zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp); + return 0; + } + + /* use single, unchained SBAL if it can hold the request */ + if (zfcp_fsf_one_sbal(sg_req) && zfcp_fsf_one_sbal(sg_resp)) { + zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp); return 0; } -- cgit v1.2.2 From 
9072df4dc6e8fd569d583815edb0198af4b688b8 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:07 +0200 Subject: [SCSI] zfcp: Use -EIO for SBAL allocation failures -ENOMEM is for memory allocation problems, -EIO for queue/SBAL allocation problems. Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fsf.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index bec912547fb8..0c24695a53cb 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -1053,14 +1053,14 @@ static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, sg_req, max_sbals); if (bytes <= 0) - return -ENOMEM; + return -EIO; req->qtcb->bottom.support.req_buf_length = bytes; req->sbale_curr = ZFCP_LAST_SBALE_PER_SBAL; bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, sg_resp, max_sbals); if (bytes <= 0) - return -ENOMEM; + return -EIO; req->qtcb->bottom.support.resp_buf_length = bytes; return 0; @@ -2559,7 +2559,6 @@ struct zfcp_fsf_req *zfcp_fsf_control_file(struct zfcp_adapter *adapter, bytes = zfcp_qdio_sbals_from_sg(req, direction, fsf_cfdc->sg, FSF_MAX_SBALS_PER_REQ); if (bytes != ZFCP_CFDC_MAX_SIZE) { - retval = -ENOMEM; zfcp_fsf_req_free(req); goto out; } -- cgit v1.2.2 From ddb3e0c111fed0a8bf74884dc918274acec2b618 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:08 +0200 Subject: [SCSI] zfcp: Fix logic for physical port close After closing the port, we want it to be "not open" to consider the action to be successful. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_erp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index e7d7ef55e37d..0a7c6aef532a 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -801,7 +801,7 @@ static int zfcp_erp_port_forced_strategy(struct zfcp_erp_action *erp_action) return ZFCP_ERP_FAILED; case ZFCP_ERP_STEP_PHYS_PORT_CLOSING: - if (status & ZFCP_STATUS_PORT_PHYS_OPEN) + if (!(status & ZFCP_STATUS_PORT_PHYS_OPEN)) return ZFCP_ERP_SUCCEEDED; } return ZFCP_ERP_FAILED; -- cgit v1.2.2 From 85600f7f8370fe5b4be0debd8b401de7986b52ae Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:09 +0200 Subject: [SCSI] zfcp: Fix erp escalation procedure If an action fails, retry it until the erp count exceeds the threshold. If there is something fundamentally wrong, the FSF layer will trigger a more appropriate action depending on the FSF status codes. The followup for successful actions is a different followup than retrying failed actions, so split the code two functions to make this clear. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_erp.c | 50 +++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 26 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 0a7c6aef532a..b5562f952654 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -553,40 +553,35 @@ static void _zfcp_erp_unit_reopen_all(struct zfcp_port *port, int clear, _zfcp_erp_unit_reopen(unit, clear, id, ref); } -static void zfcp_erp_strategy_followup_actions(struct zfcp_erp_action *act) +static void zfcp_erp_strategy_followup_failed(struct zfcp_erp_action *act) { - struct zfcp_adapter *adapter = act->adapter; - struct zfcp_port *port = act->port; - struct zfcp_unit *unit = act->unit; - u32 status = act->status; - - /* initiate follow-up actions depending on success of finished action */ switch (act->action) { - case ZFCP_ERP_ACTION_REOPEN_ADAPTER: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen_all(adapter, 0, "ersfa_1", NULL); - else - _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_2", NULL); + _zfcp_erp_adapter_reopen(act->adapter, 0, "ersff_1", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen(port, 0, "ersfa_3", NULL); - else - _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_4", NULL); + _zfcp_erp_port_forced_reopen(act->port, 0, "ersff_2", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_PORT: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_unit_reopen_all(port, 0, "ersfa_5", NULL); - else - _zfcp_erp_port_forced_reopen(port, 0, "ersfa_6", NULL); + _zfcp_erp_port_reopen(act->port, 0, "ersff_3", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_UNIT: - if (status != ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen(unit->port, 0, "ersfa_7", NULL); + _zfcp_erp_unit_reopen(act->unit, 0, "ersff_4", NULL); + break; + } +} + +static void 
zfcp_erp_strategy_followup_success(struct zfcp_erp_action *act) +{ + switch (act->action) { + case ZFCP_ERP_ACTION_REOPEN_ADAPTER: + _zfcp_erp_port_reopen_all(act->adapter, 0, "ersfs_1", NULL); + break; + case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: + _zfcp_erp_port_reopen(act->port, 0, "ersfs_2", NULL); + break; + case ZFCP_ERP_ACTION_REOPEN_PORT: + _zfcp_erp_unit_reopen_all(act->port, 0, "ersfs_3", NULL); break; } } @@ -1289,7 +1284,10 @@ static int zfcp_erp_strategy(struct zfcp_erp_action *erp_action) retval = zfcp_erp_strategy_statechange(erp_action, retval); if (retval == ZFCP_ERP_EXIT) goto unlock; - zfcp_erp_strategy_followup_actions(erp_action); + if (retval == ZFCP_ERP_SUCCEEDED) + zfcp_erp_strategy_followup_success(erp_action); + if (retval == ZFCP_ERP_FAILED) + zfcp_erp_strategy_followup_failed(erp_action); unlock: write_unlock(&adapter->erp_lock); -- cgit v1.2.2 From cbf1ed0264da104573458aedc220ebfcd02567f6 Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:10 +0200 Subject: [SCSI] zfcp: Recover from stalled outbound queue Depending on interruptions on some storage systems, the complete channel can stall which looks like an outbound queue stall to Linux. When trying to acquire a free SBAL for a non-SCSI command, zfcp waits for 5 seconds for a free slot to appear. This is the right place to detect a queue stall: If the wait times out, we assume a stalled queue and try to recover this. The overall strategy should be to trigger the erp from specific events, and not try an overall escalation from one failed port to a full-blown queue recovery. If we manage to send a command, the status codes for this command or a timeout will trigger the right follow-on actions. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fsf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 0c24695a53cb..b7e48844056a 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -670,8 +670,11 @@ static int zfcp_fsf_req_sbal_get(struct zfcp_adapter *adapter) zfcp_fsf_sbal_check(adapter), 5 * HZ); if (ret > 0) return 0; - if (!ret) + if (!ret) { atomic_inc(&adapter->qdio_outb_full); + /* assume hanging outbound queue, try queue recovery */ + zfcp_erp_adapter_reopen(adapter, 0, "fsrsg_1", NULL); + } spin_lock_bh(&adapter->req_q_lock); return -EIO; -- cgit v1.2.2 From 379d6bf6573ee6541a38bbe9140c1f0b94e3feae Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:11 +0200 Subject: [SCSI] zfcp: Add port only once to FC transport class When calling fc_remote_port_add make sure to not call it again before fc_remote_port_delete has been called. In other words, ensure to create a new fc_rport, then delete it, then create a new one again. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_scsi.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 967ede73f4c5..ba32709921a4 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -534,6 +534,9 @@ static void zfcp_scsi_rport_register(struct zfcp_port *port) struct fc_rport_identifiers ids; struct fc_rport *rport; + if (port->rport) + return; + ids.node_name = port->wwnn; ids.port_name = port->wwpn; ids.port_id = port->d_id; @@ -557,8 +560,10 @@ static void zfcp_scsi_rport_block(struct zfcp_port *port) { struct fc_rport *rport = port->rport; - if (rport) + if (rport) { fc_remote_port_delete(rport); + port->rport = NULL; + } } void zfcp_scsi_schedule_rport_register(struct zfcp_port *port) -- cgit v1.2.2 From 17a093ef018481ee1760da19568bad3c11da395d Mon Sep 17 00:00:00 2001 From: Swen Schillig Date: Mon, 13 Jul 2009 15:06:12 +0200 Subject: [SCSI] zfcp: avoid double notify in lowmem scenario In a LOWMEM condition an ERP notification would have been sent twice causing an unpredictable behaviour of the ERP. 
Signed-off-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_erp.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index b5562f952654..c75d6f35cb5f 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -848,11 +848,17 @@ void zfcp_erp_port_strategy_open_lookup(struct work_struct *work) gid_pn_work); retval = zfcp_fc_ns_gid_pn(&port->erp_action); - if (retval == -ENOMEM) + if (!retval) { + port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; + goto out; + } + if (retval == -ENOMEM) { zfcp_erp_notify(&port->erp_action, ZFCP_STATUS_ERP_LOWMEM); - port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; - if (retval) - zfcp_erp_notify(&port->erp_action, 0); + goto out; + } + /* all other error condtions */ + zfcp_erp_notify(&port->erp_action, 0); +out: zfcp_port_put(port); } -- cgit v1.2.2 From 27f492ccec94b6acd8440c83bfe0515ce4db0af0 Mon Sep 17 00:00:00 2001 From: Swen Schillig Date: Mon, 13 Jul 2009 15:06:13 +0200 Subject: [SCSI] zfcp: Fix wka port processing Under certain conditions it is possible that a WKA port is not opened within the expected timeframe of half a second. In this situation the WKA port remains in the state OPENING preventing any succeeding request to open the port. This led to unrecoverable remote ports. Fixing this by always setting an appropriate WKA port status before leaving the function and removing the timeout value here since it's not needed here because the general timeout processing would deal with it if required.
Signed-off-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_fc.c | 8 +++----- drivers/s390/scsi/zfcp_fsf.c | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index 2f0705d76b72..47daebfa7e59 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -79,11 +79,9 @@ static int zfcp_wka_port_get(struct zfcp_wka_port *wka_port) mutex_unlock(&wka_port->mutex); - wait_event_timeout( - wka_port->completion_wq, - wka_port->status == ZFCP_WKA_PORT_ONLINE || - wka_port->status == ZFCP_WKA_PORT_OFFLINE, - HZ >> 1); + wait_event(wka_port->completion_wq, + wka_port->status == ZFCP_WKA_PORT_ONLINE || + wka_port->status == ZFCP_WKA_PORT_OFFLINE); if (wka_port->status == ZFCP_WKA_PORT_ONLINE) { atomic_inc(&wka_port->refcount); diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index b7e48844056a..47795fbf081f 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -1627,10 +1627,10 @@ static void zfcp_fsf_open_wka_port_handler(struct zfcp_fsf_req *req) case FSF_ACCESS_DENIED: wka_port->status = ZFCP_WKA_PORT_OFFLINE; break; - case FSF_PORT_ALREADY_OPEN: - break; case FSF_GOOD: wka_port->handle = header->port_handle; + /* fall through */ + case FSF_PORT_ALREADY_OPEN: wka_port->status = ZFCP_WKA_PORT_ONLINE; } out: -- cgit v1.2.2 From a11a52be115889a5d1f738ed2e154807bceed4ee Mon Sep 17 00:00:00 2001 From: Christof Schmitt Date: Mon, 13 Jul 2009 15:06:14 +0200 Subject: [SCSI] zfcp: Fix tracing of request id for abort requests The trace record for SCSI abort requests has a field for the request id of the request to be aborted. Put the real request id instead of zero. 
Reviewed-by: Swen Schillig Signed-off-by: Christof Schmitt Signed-off-by: James Bottomley --- drivers/s390/scsi/zfcp_scsi.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index ba32709921a4..6925a1784682 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -167,20 +167,21 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) struct zfcp_unit *unit = scpnt->device->hostdata; struct zfcp_fsf_req *old_req, *abrt_req; unsigned long flags; - unsigned long old_req_id = (unsigned long) scpnt->host_scribble; + unsigned long old_reqid = (unsigned long) scpnt->host_scribble; int retval = SUCCESS; int retry = 3; + char *dbf_tag; /* avoid race condition between late normal completion and abort */ write_lock_irqsave(&adapter->abort_lock, flags); spin_lock(&adapter->req_list_lock); - old_req = zfcp_reqlist_find(adapter, old_req_id); + old_req = zfcp_reqlist_find(adapter, old_reqid); spin_unlock(&adapter->req_list_lock); if (!old_req) { write_unlock_irqrestore(&adapter->abort_lock, flags); zfcp_scsi_dbf_event_abort("lte1", adapter, scpnt, NULL, - old_req_id); + old_reqid); return FAILED; /* completion could be in progress */ } old_req->data = NULL; @@ -189,7 +190,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) write_unlock_irqrestore(&adapter->abort_lock, flags); while (retry--) { - abrt_req = zfcp_fsf_abort_fcp_command(old_req_id, unit); + abrt_req = zfcp_fsf_abort_fcp_command(old_reqid, unit); if (abrt_req) break; @@ -197,7 +198,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { zfcp_scsi_dbf_event_abort("nres", adapter, scpnt, NULL, - old_req_id); + old_reqid); return SUCCESS; } } @@ -208,13 +209,14 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) abrt_req->status & 
ZFCP_STATUS_FSFREQ_COMPLETED); if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED) - zfcp_scsi_dbf_event_abort("okay", adapter, scpnt, abrt_req, 0); + dbf_tag = "okay"; else if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED) - zfcp_scsi_dbf_event_abort("lte2", adapter, scpnt, abrt_req, 0); + dbf_tag = "lte2"; else { - zfcp_scsi_dbf_event_abort("fail", adapter, scpnt, abrt_req, 0); + dbf_tag = "fail"; retval = FAILED; } + zfcp_scsi_dbf_event_abort(dbf_tag, adapter, scpnt, abrt_req, old_reqid); zfcp_fsf_req_free(abrt_req); return retval; } -- cgit v1.2.2 From 6187c242089d334102be76427a5a020240e6c19a Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 15 Jul 2009 15:02:57 -0500 Subject: [SCSI] libiscsi: disable bh in and abort handler. The session lock can be held in the scsi eh thread or the completion paths run from the net softirq. This disables bhs in iscsi_eh_abort when taking the session lock. Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/libiscsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 716cc344c5df..a751f6230c22 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1974,10 +1974,10 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) * good and have never sent us a successful tmf response * then sent more data for the cmd. */ - spin_lock(&session->lock); + spin_lock_bh(&session->lock); fail_scsi_task(task, DID_ABORT); conn->tmf_state = TMF_INITIAL; - spin_unlock(&session->lock); + spin_unlock_bh(&session->lock); iscsi_start_tx(conn); goto success_unlocked; case TMF_TIMEDOUT: -- cgit v1.2.2 From 94bced3c1b371014cbd187f2df5539b13a0e3b90 Mon Sep 17 00:00:00 2001 From: Karen Higgins Date: Wed, 15 Jul 2009 15:02:58 -0500 Subject: [SCSI] qla4xxx: Correct Extended Sense Data Errors Fixed sense data errors occurring above the first 32 bytes, as required by some third party applications. 
Sense data in the first 32 bytes has always been correct. Patch updated to use srb data variables instead of scsi command scratchpad data area, as scratchpad area is already used. Also, corrected debug print alignment bug in dump_buffer routine. Changed KERN_DEBUG to KERN_INFO in printk statements in this routine. Changed version number to 5.01.00-k9 Signed-off-by: Karen Higgins [michaelc: fixed checkpath.pl errors] Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/qla4xxx/ql4_dbg.c | 15 ++-- drivers/scsi/qla4xxx/ql4_def.h | 7 ++ drivers/scsi/qla4xxx/ql4_fw.h | 7 ++ drivers/scsi/qla4xxx/ql4_isr.c | 145 +++++++++++++++++++++++++------------ drivers/scsi/qla4xxx/ql4_version.h | 2 +- 5 files changed, 122 insertions(+), 54 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/qla4xxx/ql4_dbg.c b/drivers/scsi/qla4xxx/ql4_dbg.c index fcc184cd066d..cbceb0ebabf7 100644 --- a/drivers/scsi/qla4xxx/ql4_dbg.c +++ b/drivers/scsi/qla4xxx/ql4_dbg.c @@ -15,19 +15,18 @@ void qla4xxx_dump_buffer(void *b, uint32_t size) uint32_t cnt; uint8_t *c = b; - printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh " + printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh " "Fh\n"); printk("------------------------------------------------------------" "--\n"); - for (cnt = 0; cnt < size; cnt++, c++) { - printk(KERN_DEBUG "%02x", *c); - if (!(cnt % 16)) - printk(KERN_DEBUG "\n"); + for (cnt = 0; cnt < size; c++) { + printk(KERN_INFO "%02x", *c); + if (!(++cnt % 16)) + printk(KERN_INFO "\n"); else - printk(KERN_DEBUG " "); + printk(KERN_INFO " "); } - if (cnt % 16) - printk(KERN_DEBUG "\n"); + printk(KERN_INFO "\n"); } diff --git a/drivers/scsi/qla4xxx/ql4_def.h b/drivers/scsi/qla4xxx/ql4_def.h index b586f27c3bd4..963e8553d210 100644 --- a/drivers/scsi/qla4xxx/ql4_def.h +++ b/drivers/scsi/qla4xxx/ql4_def.h @@ -184,6 +184,11 @@ struct srb { uint16_t cc_stat; u_long r_start; /* Time we recieve a cmd from OS */ u_long u_start; /* Time when we handed the cmd to F/W */ + + /* Used for 
extended sense / status continuation */ + uint8_t *req_sense_ptr; + uint16_t req_sense_len; + uint16_t reserved2; }; /* @@ -436,6 +441,8 @@ struct scsi_qla_host { /* Map ddb_list entry by FW ddb index */ struct ddb_entry *fw_ddb_index_map[MAX_DDB_ENTRIES]; + /* Saved srb for status continuation entry processing */ + struct srb *status_srb; }; static inline int is_qla4010(struct scsi_qla_host *ha) diff --git a/drivers/scsi/qla4xxx/ql4_fw.h b/drivers/scsi/qla4xxx/ql4_fw.h index 1b667a70cffa..9cd7a608df38 100644 --- a/drivers/scsi/qla4xxx/ql4_fw.h +++ b/drivers/scsi/qla4xxx/ql4_fw.h @@ -572,6 +572,7 @@ struct conn_event_log_entry { *************************************************************************/ #define IOCB_MAX_CDB_LEN 16 /* Bytes in a CBD */ #define IOCB_MAX_SENSEDATA_LEN 32 /* Bytes of sense data */ +#define IOCB_MAX_EXT_SENSEDATA_LEN 60 /* Bytes of extended sense data */ /* IOCB header structure */ struct qla4_header { @@ -733,6 +734,12 @@ struct status_entry { }; +/* Status Continuation entry */ +struct status_cont_entry { + struct qla4_header hdr; /* 00-03 */ + uint8_t ext_sense_data[IOCB_MAX_EXT_SENSEDATA_LEN]; /* 04-63 */ +}; + struct passthru0 { struct qla4_header hdr; /* 00-03 */ uint32_t handle; /* 04-07 */ diff --git a/drivers/scsi/qla4xxx/ql4_isr.c b/drivers/scsi/qla4xxx/ql4_isr.c index 799120fcb9be..8025ee16588e 100644 --- a/drivers/scsi/qla4xxx/ql4_isr.c +++ b/drivers/scsi/qla4xxx/ql4_isr.c @@ -10,6 +10,98 @@ #include "ql4_dbg.h" #include "ql4_inline.h" +/** + * qla4xxx_copy_sense - copy sense data into cmd sense buffer + * @ha: Pointer to host adapter structure. + * @sts_entry: Pointer to status entry structure. + * @srb: Pointer to srb structure. 
+ **/ +static void qla4xxx_copy_sense(struct scsi_qla_host *ha, + struct status_entry *sts_entry, + struct srb *srb) +{ + struct scsi_cmnd *cmd = srb->cmd; + uint16_t sense_len; + + memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + sense_len = le16_to_cpu(sts_entry->senseDataByteCnt); + if (sense_len == 0) + return; + + /* Save total available sense length, + * not to exceed cmd's sense buffer size */ + sense_len = min_t(uint16_t, sense_len, SCSI_SENSE_BUFFERSIZE); + srb->req_sense_ptr = cmd->sense_buffer; + srb->req_sense_len = sense_len; + + /* Copy sense from sts_entry pkt */ + sense_len = min_t(uint16_t, sense_len, IOCB_MAX_SENSEDATA_LEN); + memcpy(cmd->sense_buffer, sts_entry->senseData, sense_len); + + DEBUG2(printk(KERN_INFO "scsi%ld:%d:%d:%d: %s: sense key = %x, " + "ASL= %02x, ASC/ASCQ = %02x/%02x\n", ha->host_no, + cmd->device->channel, cmd->device->id, + cmd->device->lun, __func__, + sts_entry->senseData[2] & 0x0f, + sts_entry->senseData[7], + sts_entry->senseData[12], + sts_entry->senseData[13])); + + DEBUG5(qla4xxx_dump_buffer(cmd->sense_buffer, sense_len)); + srb->flags |= SRB_GOT_SENSE; + + /* Update srb, in case a sts_cont pkt follows */ + srb->req_sense_ptr += sense_len; + srb->req_sense_len -= sense_len; + if (srb->req_sense_len != 0) + ha->status_srb = srb; + else + ha->status_srb = NULL; +} + +/** + * qla4xxx_status_cont_entry - Process a Status Continuations entry. + * @ha: SCSI driver HA context + * @sts_cont: Entry pointer + * + * Extended sense data. + */ +static void +qla4xxx_status_cont_entry(struct scsi_qla_host *ha, + struct status_cont_entry *sts_cont) +{ + struct srb *srb = ha->status_srb; + struct scsi_cmnd *cmd; + uint8_t sense_len; + + if (srb == NULL) + return; + + cmd = srb->cmd; + if (cmd == NULL) { + DEBUG2(printk(KERN_INFO "scsi%ld: %s: Cmd already returned " + "back to OS srb=%p srb->state:%d\n", ha->host_no, + __func__, srb, srb->state)); + ha->status_srb = NULL; + return; + } + + /* Copy sense data. 
*/ + sense_len = min_t(uint16_t, srb->req_sense_len, + IOCB_MAX_EXT_SENSEDATA_LEN); + memcpy(srb->req_sense_ptr, sts_cont->ext_sense_data, sense_len); + DEBUG5(qla4xxx_dump_buffer(srb->req_sense_ptr, sense_len)); + + srb->req_sense_ptr += sense_len; + srb->req_sense_len -= sense_len; + + /* Place command on done queue. */ + if (srb->req_sense_len == 0) { + qla4xxx_srb_compl(ha, srb); + ha->status_srb = NULL; + } +} + /** * qla4xxx_status_entry - processes status IOCBs * @ha: Pointer to host adapter structure. @@ -23,7 +115,6 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, struct srb *srb; struct ddb_entry *ddb_entry; uint32_t residual; - uint16_t sensebytecnt; srb = qla4xxx_del_from_active_array(ha, le32_to_cpu(sts_entry->handle)); if (!srb) { @@ -92,24 +183,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, break; /* Copy Sense Data into sense buffer. */ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - - sensebytecnt = le16_to_cpu(sts_entry->senseDataByteCnt); - if (sensebytecnt == 0) - break; - - memcpy(cmd->sense_buffer, sts_entry->senseData, - min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE)); - - DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, " - "ASC/ASCQ = %02x/%02x\n", ha->host_no, - cmd->device->channel, cmd->device->id, - cmd->device->lun, __func__, - sts_entry->senseData[2] & 0x0f, - sts_entry->senseData[12], - sts_entry->senseData[13])); - - srb->flags |= SRB_GOT_SENSE; + qla4xxx_copy_sense(ha, sts_entry, srb); break; case SCS_INCOMPLETE: @@ -176,23 +250,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, break; /* Copy Sense Data into sense buffer. 
*/ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - - sensebytecnt = - le16_to_cpu(sts_entry->senseDataByteCnt); - if (sensebytecnt == 0) - break; - - memcpy(cmd->sense_buffer, sts_entry->senseData, - min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE)); - - DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, " - "ASC/ASCQ = %02x/%02x\n", ha->host_no, - cmd->device->channel, cmd->device->id, - cmd->device->lun, __func__, - sts_entry->senseData[2] & 0x0f, - sts_entry->senseData[12], - sts_entry->senseData[13])); + qla4xxx_copy_sense(ha, sts_entry, srb); } else { /* * If RISC reports underrun and target does not @@ -268,9 +326,10 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, status_entry_exit: - /* complete the request */ + /* complete the request, if not waiting for status_continuation pkt */ srb->cc_stat = sts_entry->completionStatus; - qla4xxx_srb_compl(ha, srb); + if (ha->status_srb == NULL) + qla4xxx_srb_compl(ha, srb); } /** @@ -305,10 +364,7 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha) /* process entry */ switch (sts_entry->hdr.entryType) { case ET_STATUS: - /* - * Common status - Single completion posted in single - * IOSB. - */ + /* Common status */ qla4xxx_status_entry(ha, sts_entry); break; @@ -316,9 +372,8 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha) break; case ET_STATUS_CONTINUATION: - /* Just throw away the status continuation entries */ - DEBUG2(printk("scsi%ld: %s: Status Continuation entry " - "- ignoring\n", ha->host_no, __func__)); + qla4xxx_status_cont_entry(ha, + (struct status_cont_entry *) sts_entry); break; case ET_COMMAND: diff --git a/drivers/scsi/qla4xxx/ql4_version.h b/drivers/scsi/qla4xxx/ql4_version.h index ab984cb89cea..6980cb279c81 100644 --- a/drivers/scsi/qla4xxx/ql4_version.h +++ b/drivers/scsi/qla4xxx/ql4_version.h @@ -5,5 +5,5 @@ * See LICENSE.qla4xxx for copyright and licensing details. 
*/ -#define QLA4XXX_DRIVER_VERSION "5.01.00-k8" +#define QLA4XXX_DRIVER_VERSION "5.01.00-k9" -- cgit v1.2.2 From 5c656af7e4edfe44c85034d6fa7002909f9c3c59 Mon Sep 17 00:00:00 2001 From: Mike Christie Date: Wed, 15 Jul 2009 15:02:59 -0500 Subject: [SCSI] qla4xxx: add timeout handler Recently dm-multipath began calling blk_abort_queue. This causes all the commands/request running on the path to have the timeout function called. If a path does go down, and the LLD returns DID_*, dm-multipath will eventually get this error and begin to call the cmd timeout handler. qla4xxx currently does not set a timed out handler and so the default one could return BLK_EH_NOT_HANDLED and end up firing the scsi eh and stopping IO to all paths on the host when only one path is affected. For software and offload iscsi we have a timed out handler already. This patch adds a driver specific one to qla4xxx because there are some ddb->state and session->state and command completion races that are better handled in the LLD. This also handles the problem where if the session is down, we do not need the scsi eh to run until the transport code has tried to reconnect us.
Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/qla4xxx/ql4_os.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'drivers') diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index ec9da6ce8489..6841883b3611 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -66,6 +66,7 @@ static int qla4xxx_sess_get_param(struct iscsi_cls_session *sess, static int qla4xxx_host_get_param(struct Scsi_Host *shost, enum iscsi_host_param param, char *buf); static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session); +static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc); /* * SCSI host template entry points @@ -89,6 +90,7 @@ static struct scsi_host_template qla4xxx_driver_template = { .eh_device_reset_handler = qla4xxx_eh_device_reset, .eh_target_reset_handler = qla4xxx_eh_target_reset, .eh_host_reset_handler = qla4xxx_eh_host_reset, + .eh_timed_out = qla4xxx_eh_cmd_timed_out, .slave_configure = qla4xxx_slave_configure, .slave_alloc = qla4xxx_slave_alloc, @@ -124,6 +126,21 @@ static struct iscsi_transport qla4xxx_iscsi_transport = { static struct scsi_transport_template *qla4xxx_scsi_transport; +static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc) +{ + struct iscsi_cls_session *session; + struct ddb_entry *ddb_entry; + + session = starget_to_session(scsi_target(sc->device)); + ddb_entry = session->dd_data; + + /* if we are not logged in then the LLD is going to clean up the cmd */ + if (atomic_read(&ddb_entry->state) != DDB_STATE_ONLINE) + return BLK_EH_RESET_TIMER; + else + return BLK_EH_NOT_HANDLED; +} + static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session) { struct ddb_entry *ddb_entry = session->dd_data; -- cgit v1.2.2 From dca05c4c07c48da0509708d9e562578d269e90e5 Mon Sep 17 00:00:00 2001 From: Karen Higgins Date: Wed, 15 Jul 2009 15:03:00 -0500 Subject: [SCSI] qla4xxx: Fix Driver Fault Recovery 
Completion Fixed driver bug where adapter recovery did not complete if there were outstanding commands detected on that host adapter. Signed-off-by: Karen Higgins Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/qla4xxx/ql4_os.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index 6841883b3611..e1cc0d21d890 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -921,18 +921,17 @@ static int qla4xxx_recover_adapter(struct scsi_qla_host *ha, /* Flush any pending ddb changed AENs */ qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); + qla4xxx_flush_active_srbs(ha); + /* Reset the firmware. If successful, function * returns with ISP interrupts enabled. */ - if (status == QLA_SUCCESS) { - DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n", - ha->host_no, __func__)); - qla4xxx_flush_active_srbs(ha); - if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS) - status = qla4xxx_soft_reset(ha); - else - status = QLA_ERROR; - } + DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n", + ha->host_no, __func__)); + if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS) + status = qla4xxx_soft_reset(ha); + else + status = QLA_ERROR; /* Flush any pending ddb changed AENs */ qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); @@ -1661,7 +1660,7 @@ static int qla4xxx_eh_host_reset(struct scsi_cmnd *cmd) ha = (struct scsi_qla_host *) cmd->device->host->hostdata; dev_info(&ha->pdev->dev, - "scsi(%ld:%d:%d:%d): ADAPTER RESET ISSUED.\n", ha->host_no, + "scsi(%ld:%d:%d:%d): HOST RESET ISSUED.\n", ha->host_no, cmd->device->channel, cmd->device->id, cmd->device->lun); if (qla4xxx_wait_for_hba_online(ha) != QLA_SUCCESS) { -- cgit v1.2.2 From 612f73488785829d4f34aad00bfe30b904c94c9e Mon Sep 17 00:00:00 2001 From: Karen Higgins Date: Wed, 15 Jul 2009 15:03:01 -0500 Subject: [SCSI] qla4xxx: Fix srb lookup in qla4xxx_eh_device_reset 
eh_device_reset may be called from scsi error handler or sg_reset, etc. When called from sg_reset, there will not be an associated srb. The driver should lookup the corresponding device handle given information from the supplied cmd structure and should not assume that there exists an srb. Signed-off-by: Karen Higgins Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/qla4xxx/ql4_os.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index e1cc0d21d890..40e3cafb3a9c 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -1543,11 +1543,9 @@ static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd) { struct scsi_qla_host *ha = to_qla_host(cmd->device->host); struct ddb_entry *ddb_entry = cmd->device->hostdata; - struct srb *sp; int ret = FAILED, stat; - sp = (struct srb *) cmd->SCp.ptr; - if (!sp || !ddb_entry) + if (!ddb_entry) return ret; dev_info(&ha->pdev->dev, -- cgit v1.2.2 From 16ed55f9de6743ceece9bf528362cadff10f1c5c Mon Sep 17 00:00:00 2001 From: Karen Higgins Date: Wed, 15 Jul 2009 15:03:02 -0500 Subject: [SCSI] qla4xxx: Remove hiwat code so scsi eh does not get escalated when we can make progress Removed unnecessary hiwat code to free up the number of available IOCBs. Eliminates unnecessary eh_ escalations due to inability to obtain IOCB pkt for marker. v2. - Remove define not used anymore and fix req_q_count accounting. 
Signed-off-by: Karen Higgins [michaelc: ported patch from qlogic.com driver to upstream] Signed-off-by: Mike Christie Signed-off-by: James Bottomley --- drivers/scsi/qla4xxx/ql4_def.h | 2 - drivers/scsi/qla4xxx/ql4_iocb.c | 133 ++++++++++++++++++---------------------- drivers/scsi/qla4xxx/ql4_mbx.c | 10 --- 3 files changed, 60 insertions(+), 85 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/qla4xxx/ql4_def.h b/drivers/scsi/qla4xxx/ql4_def.h index 963e8553d210..81b5f29254e2 100644 --- a/drivers/scsi/qla4xxx/ql4_def.h +++ b/drivers/scsi/qla4xxx/ql4_def.h @@ -100,7 +100,6 @@ #define MAX_SRBS MAX_CMDS_TO_RISC #define MBOX_AEN_REG_COUNT 5 #define MAX_INIT_RETRIES 5 -#define IOCB_HIWAT_CUSHION 16 /* * Buffer sizes @@ -307,7 +306,6 @@ struct scsi_qla_host { uint32_t tot_ddbs; uint16_t iocb_cnt; - uint16_t iocb_hiwat; /* SRB cache. */ #define SRB_MIN_REQ 128 diff --git a/drivers/scsi/qla4xxx/ql4_iocb.c b/drivers/scsi/qla4xxx/ql4_iocb.c index 912a67494adf..e0c32159749c 100644 --- a/drivers/scsi/qla4xxx/ql4_iocb.c +++ b/drivers/scsi/qla4xxx/ql4_iocb.c @@ -10,9 +10,42 @@ #include "ql4_dbg.h" #include "ql4_inline.h" - #include +static int +qla4xxx_space_in_req_ring(struct scsi_qla_host *ha, uint16_t req_cnt) +{ + uint16_t cnt; + + /* Calculate number of free request entries. */ + if ((req_cnt + 2) >= ha->req_q_count) { + cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); + if (ha->request_in < cnt) + ha->req_q_count = cnt - ha->request_in; + else + ha->req_q_count = REQUEST_QUEUE_DEPTH - + (ha->request_in - cnt); + } + + /* Check if room for request in request ring. 
*/ + if ((req_cnt + 2) < ha->req_q_count) + return 1; + else + return 0; +} + +static void qla4xxx_advance_req_ring_ptr(struct scsi_qla_host *ha) +{ + /* Advance request queue pointer */ + if (ha->request_in == (REQUEST_QUEUE_DEPTH - 1)) { + ha->request_in = 0; + ha->request_ptr = ha->request_ring; + } else { + ha->request_in++; + ha->request_ptr++; + } +} + /** * qla4xxx_get_req_pkt - returns a valid entry in request queue. * @ha: Pointer to host adapter structure. @@ -26,35 +59,18 @@ static int qla4xxx_get_req_pkt(struct scsi_qla_host *ha, struct queue_entry **queue_entry) { - uint16_t request_in; - uint8_t status = QLA_SUCCESS; - - *queue_entry = ha->request_ptr; + uint16_t req_cnt = 1; - /* get the latest request_in and request_out index */ - request_in = ha->request_in; - ha->request_out = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); - - /* Advance request queue pointer and check for queue full */ - if (request_in == (REQUEST_QUEUE_DEPTH - 1)) { - request_in = 0; - ha->request_ptr = ha->request_ring; - } else { - request_in++; - ha->request_ptr++; - } - - /* request queue is full, try again later */ - if ((ha->iocb_cnt + 1) >= ha->iocb_hiwat) { - /* restore request pointer */ - ha->request_ptr = *queue_entry; - status = QLA_ERROR; - } else { - ha->request_in = request_in; + if (qla4xxx_space_in_req_ring(ha, req_cnt)) { + *queue_entry = ha->request_ptr; memset(*queue_entry, 0, sizeof(**queue_entry)); + + qla4xxx_advance_req_ring_ptr(ha); + ha->req_q_count -= req_cnt; + return QLA_SUCCESS; } - return status; + return QLA_ERROR; } /** @@ -100,21 +116,14 @@ exit_send_marker: return status; } -static struct continuation_t1_entry* qla4xxx_alloc_cont_entry( - struct scsi_qla_host *ha) +static struct continuation_t1_entry * +qla4xxx_alloc_cont_entry(struct scsi_qla_host *ha) { struct continuation_t1_entry *cont_entry; cont_entry = (struct continuation_t1_entry *)ha->request_ptr; - /* Advance request queue pointer */ - if (ha->request_in == (REQUEST_QUEUE_DEPTH 
- 1)) { - ha->request_in = 0; - ha->request_ptr = ha->request_ring; - } else { - ha->request_in++; - ha->request_ptr++; - } + qla4xxx_advance_req_ring_ptr(ha); /* Load packet defaults */ cont_entry->hdr.entryType = ET_CONTINUE; @@ -197,13 +206,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) struct scsi_cmnd *cmd = srb->cmd; struct ddb_entry *ddb_entry; struct command_t3_entry *cmd_entry; - int nseg; uint16_t tot_dsds; uint16_t req_cnt; - unsigned long flags; - uint16_t cnt; uint32_t index; char tag[2]; @@ -217,6 +223,19 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) index = (uint32_t)cmd->request->tag; + /* + * Check to see if adapter is online before placing request on + * request queue. If a reset occurs and a request is in the queue, + * the firmware will still attempt to process the request, retrieving + * garbage for pointers. + */ + if (!test_bit(AF_ONLINE, &ha->flags)) { + DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! " + "Do not issue command.\n", + ha->host_no, __func__)); + goto queuing_error; + } + /* Calculate the number of request entries needed. 
*/ nseg = scsi_dma_map(cmd); if (nseg < 0) @@ -224,17 +243,7 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) tot_dsds = nseg; req_cnt = qla4xxx_calc_request_entries(tot_dsds); - - if (ha->req_q_count < (req_cnt + 2)) { - cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); - if (ha->request_in < cnt) - ha->req_q_count = cnt - ha->request_in; - else - ha->req_q_count = REQUEST_QUEUE_DEPTH - - (ha->request_in - cnt); - } - - if (ha->req_q_count < (req_cnt + 2)) + if (!qla4xxx_space_in_req_ring(ha, req_cnt)) goto queuing_error; /* total iocbs active */ @@ -286,32 +295,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) break; } - - /* Advance request queue pointer */ - ha->request_in++; - if (ha->request_in == REQUEST_QUEUE_DEPTH) { - ha->request_in = 0; - ha->request_ptr = ha->request_ring; - } else - ha->request_ptr++; - - + qla4xxx_advance_req_ring_ptr(ha); qla4xxx_build_scsi_iocbs(srb, cmd_entry, tot_dsds); wmb(); - /* - * Check to see if adapter is online before placing request on - * request queue. If a reset occurs and a request is in the queue, - * the firmware will still attempt to process the request, retrieving - * garbage for pointers. - */ - if (!test_bit(AF_ONLINE, &ha->flags)) { - DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! " - "Do not issue command.\n", - ha->host_no, __func__)); - goto queuing_error; - } - srb->cmd->host_scribble = (unsigned char *)srb; /* update counters */ diff --git a/drivers/scsi/qla4xxx/ql4_mbx.c b/drivers/scsi/qla4xxx/ql4_mbx.c index 051b0f5e8c8e..09d6d4b76f39 100644 --- a/drivers/scsi/qla4xxx/ql4_mbx.c +++ b/drivers/scsi/qla4xxx/ql4_mbx.c @@ -385,16 +385,6 @@ int qla4xxx_get_firmware_status(struct scsi_qla_host * ha) mbox_sts[0])); return QLA_ERROR; } - - /* High-water mark of IOCBs */ - ha->iocb_hiwat = mbox_sts[2]; - if (ha->iocb_hiwat > IOCB_HIWAT_CUSHION) - ha->iocb_hiwat -= IOCB_HIWAT_CUSHION; - else - dev_info(&ha->pdev->dev, "WARNING!!! 
You have less than %d " - "firmware IOCBs available (%d).\n", - IOCB_HIWAT_CUSHION, ha->iocb_hiwat); - return QLA_SUCCESS; } -- cgit v1.2.2 From a0cc1ecc098e31d03b3265712a3e280a7fabf438 Mon Sep 17 00:00:00 2001 From: Vasu Dev Date: Tue, 28 Jul 2009 17:33:37 -0700 Subject: [SCSI] libfc: fix a circular locking warning during sending RRQ Currently the fc_exch_rrq is called with fc_exch's ex_lock held. The fc_exch_rrq allocates new exch and that requires taking ex_lock again after EM lock. This locking order causes warning, see more details on this warning at :- http://www.open-fcoe.org/pipermail/devel/2009-July/003251.html This patch fixes this by dropping the ex_lock before calling fc_exch_rrq(). The fc_exch_rrq needs to grab ex_lock lock again to schedule RRQ retry and in the meanwhile fc_exch_reset could occur before ex_lock is grabbed inside fc_exch_rrq. So to handle this case, this patch adds additional check to detect fc_exch_reset after ex_lock acquired and in case the fc_exch_reset occurred then abandons the RRQ retry and releases the exch. 
Signed-off-by: Vasu Dev Signed-off-by: Robert Love Signed-off-by: James Bottomley --- drivers/scsi/libfc/fc_exch.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index 2bc22be5f849..145ab9ba55ea 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -415,9 +415,9 @@ static void fc_exch_timeout(struct work_struct *work) e_stat = ep->esb_stat; if (e_stat & ESB_ST_COMPLETE) { ep->esb_stat = e_stat & ~ESB_ST_REC_QUAL; + spin_unlock_bh(&ep->ex_lock); if (e_stat & ESB_ST_REC_QUAL) fc_exch_rrq(ep); - spin_unlock_bh(&ep->ex_lock); goto done; } else { resp = ep->resp; @@ -1624,14 +1624,14 @@ static void fc_exch_rrq(struct fc_exch *ep) struct fc_lport *lp; struct fc_els_rrq *rrq; struct fc_frame *fp; - struct fc_seq *rrq_sp; u32 did; lp = ep->lp; fp = fc_frame_alloc(lp, sizeof(*rrq)); if (!fp) - return; + goto retry; + rrq = fc_frame_payload_get(fp, sizeof(*rrq)); memset(rrq, 0, sizeof(*rrq)); rrq->rrq_cmd = ELS_RRQ; @@ -1647,13 +1647,20 @@ static void fc_exch_rrq(struct fc_exch *ep) fc_host_port_id(lp->host), FC_TYPE_ELS, FC_FC_FIRST_SEQ | FC_FC_END_SEQ | FC_FC_SEQ_INIT, 0); - rrq_sp = fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, - lp->e_d_tov); - if (!rrq_sp) { - ep->esb_stat |= ESB_ST_REC_QUAL; - fc_exch_timer_set_locked(ep, ep->r_a_tov); + if (fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, lp->e_d_tov)) + return; + +retry: + spin_lock_bh(&ep->ex_lock); + if (ep->state & (FC_EX_RST_CLEANUP | FC_EX_DONE)) { + spin_unlock_bh(&ep->ex_lock); + /* drop hold for rec qual */ + fc_exch_release(ep); return; } + ep->esb_stat |= ESB_ST_REC_QUAL; + fc_exch_timer_set_locked(ep, ep->r_a_tov); + spin_unlock_bh(&ep->ex_lock); } -- cgit v1.2.2 From 19252de6818ced0def0551d64a0a2975f52a523d Mon Sep 17 00:00:00 2001 From: Tom Peng Date: Fri, 17 Jul 2009 16:02:04 +0800 Subject: [SCSI] libsas: fix wide port hotplug issues Hotplug 
of phys which form wide ports simply does not work at the moment. Fix this by adding checks at the hotplug points to see if the attached sas address of the phy already exists (in which case it's part of a wide port) and act accordingly. Signed-off-by: Tom Peng Signed-off-by: Jack Wang Signed-off-by: Lindar Liu Signed-off-by: Kevin Ao [jejb: tidied up coding, fixed an error case and made TRUE/FALSE lower case to fix a ppc64 compile error in linux-next] Signed-off-by: James Bottomley --- drivers/scsi/libsas/sas_expander.c | 147 +++++++++++++++++++++++++++---------- 1 file changed, 107 insertions(+), 40 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 54fa1e42dc4d..b3381959acce 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -766,6 +766,7 @@ static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) if (!memcmp(phy->attached_sas_addr, ephy->attached_sas_addr, SAS_ADDR_SIZE) && ephy->port) { sas_port_add_phy(ephy->port, phy->phy); + phy->port = ephy->port; phy->phy_state = PHY_DEVICE_DISCOVERED; return 0; } @@ -945,11 +946,21 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) if (ex->ex_phy[i].phy_state == PHY_VACANT || ex->ex_phy[i].phy_state == PHY_NOT_PRESENT) continue; - + /* + * Due to races, the phy might not get added to the + * wide port, so we add the phy to the wide port here. 
+ */ if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) == - SAS_ADDR(child->sas_addr)) + SAS_ADDR(child->sas_addr)) { ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED; + res = sas_ex_join_wide_port(dev, i); + if (!res) + SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", + i, SAS_ADDR(ex->ex_phy[i].attached_sas_addr)); + + } } + res = 0; } return res; @@ -1598,7 +1609,7 @@ static int sas_get_phy_attached_sas_addr(struct domain_device *dev, } static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id, - int from_phy) + int from_phy, bool update) { struct expander_device *ex = &dev->ex_dev; int res = 0; @@ -1611,7 +1622,9 @@ static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id, if (res) goto out; else if (phy_change_count != ex->ex_phy[i].phy_change_count) { - ex->ex_phy[i].phy_change_count = phy_change_count; + if (update) + ex->ex_phy[i].phy_change_count = + phy_change_count; *phy_id = i; return 0; } @@ -1653,31 +1666,52 @@ out: kfree(rg_req); return res; } +/** + * sas_find_bcast_dev - find the device issue BROADCAST(CHANGE). + * @dev:domain device to be detect. + * @src_dev: the device which originated BROADCAST(CHANGE). + * + * Add self-configuration expander suport. Suppose two expander cascading, + * when the first level expander is self-configuring, hotplug the disks in + * second level expander, BROADCAST(CHANGE) will not only be originated + * in the second level expander, but also be originated in the first level + * expander (see SAS protocol SAS 2r-14, 7.11 for detail), it is to say, + * expander changed count in two level expanders will all increment at least + * once, but the phy which chang count has changed is the source device which + * we concerned. 
+ */ static int sas_find_bcast_dev(struct domain_device *dev, struct domain_device **src_dev) { struct expander_device *ex = &dev->ex_dev; int ex_change_count = -1; + int phy_id = -1; int res; + struct domain_device *ch; res = sas_get_ex_change_count(dev, &ex_change_count); if (res) goto out; - if (ex_change_count != -1 && - ex_change_count != ex->ex_change_count) { - *src_dev = dev; - ex->ex_change_count = ex_change_count; - } else { - struct domain_device *ch; - - list_for_each_entry(ch, &ex->children, siblings) { - if (ch->dev_type == EDGE_DEV || - ch->dev_type == FANOUT_DEV) { - res = sas_find_bcast_dev(ch, src_dev); - if (src_dev) - return res; - } + if (ex_change_count != -1 && ex_change_count != ex->ex_change_count) { + /* Just detect if this expander phys phy change count changed, + * in order to determine if this expander originate BROADCAST, + * and do not update phy change count field in our structure. + */ + res = sas_find_bcast_phy(dev, &phy_id, 0, false); + if (phy_id != -1) { + *src_dev = dev; + ex->ex_change_count = ex_change_count; + SAS_DPRINTK("Expander phy change count has changed\n"); + return res; + } else + SAS_DPRINTK("Expander phys DID NOT change\n"); + } + list_for_each_entry(ch, &ex->children, siblings) { + if (ch->dev_type == EDGE_DEV || ch->dev_type == FANOUT_DEV) { + res = sas_find_bcast_dev(ch, src_dev); + if (src_dev) + return res; } } out: @@ -1700,24 +1734,26 @@ static void sas_unregister_ex_tree(struct domain_device *dev) } static void sas_unregister_devs_sas_addr(struct domain_device *parent, - int phy_id) + int phy_id, bool last) { struct expander_device *ex_dev = &parent->ex_dev; struct ex_phy *phy = &ex_dev->ex_phy[phy_id]; struct domain_device *child, *n; - - list_for_each_entry_safe(child, n, &ex_dev->children, siblings) { - if (SAS_ADDR(child->sas_addr) == - SAS_ADDR(phy->attached_sas_addr)) { - if (child->dev_type == EDGE_DEV || - child->dev_type == FANOUT_DEV) - sas_unregister_ex_tree(child); - else - 
sas_unregister_dev(child); - break; + if (last) { + list_for_each_entry_safe(child, n, + &ex_dev->children, siblings) { + if (SAS_ADDR(child->sas_addr) == + SAS_ADDR(phy->attached_sas_addr)) { + if (child->dev_type == EDGE_DEV || + child->dev_type == FANOUT_DEV) + sas_unregister_ex_tree(child); + else + sas_unregister_dev(child); + break; + } } + sas_disable_routing(parent, phy->attached_sas_addr); } - sas_disable_routing(parent, phy->attached_sas_addr); memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE); sas_port_delete_phy(phy->port, phy->phy); if (phy->port->num_phys == 0) @@ -1770,15 +1806,31 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) { struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id]; struct domain_device *child; - int res; + bool found = false; + int res, i; SAS_DPRINTK("ex %016llx phy%d new device attached\n", SAS_ADDR(dev->sas_addr), phy_id); res = sas_ex_phy_discover(dev, phy_id); if (res) goto out; + /* to support the wide port inserted */ + for (i = 0; i < dev->ex_dev.num_phys; i++) { + struct ex_phy *ex_phy_temp = &dev->ex_dev.ex_phy[i]; + if (i == phy_id) + continue; + if (SAS_ADDR(ex_phy_temp->attached_sas_addr) == + SAS_ADDR(ex_phy->attached_sas_addr)) { + found = true; + break; + } + } + if (found) { + sas_ex_join_wide_port(dev, phy_id); + return 0; + } res = sas_ex_discover_devices(dev, phy_id); - if (res) + if (!res) goto out; list_for_each_entry(child, &dev->ex_dev.children, siblings) { if (SAS_ADDR(child->sas_addr) == @@ -1793,7 +1845,7 @@ out: return res; } -static int sas_rediscover_dev(struct domain_device *dev, int phy_id) +static int sas_rediscover_dev(struct domain_device *dev, int phy_id, bool last) { struct expander_device *ex = &dev->ex_dev; struct ex_phy *phy = &ex->ex_phy[phy_id]; @@ -1804,11 +1856,11 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id) switch (res) { case SMP_RESP_NO_PHY: phy->phy_state = PHY_NOT_PRESENT; - sas_unregister_devs_sas_addr(dev, phy_id); + 
sas_unregister_devs_sas_addr(dev, phy_id, last); goto out; break; case SMP_RESP_PHY_VACANT: phy->phy_state = PHY_VACANT; - sas_unregister_devs_sas_addr(dev, phy_id); + sas_unregister_devs_sas_addr(dev, phy_id, last); goto out; break; case SMP_RESP_FUNC_ACC: break; @@ -1816,7 +1868,7 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id) if (SAS_ADDR(attached_sas_addr) == 0) { phy->phy_state = PHY_EMPTY; - sas_unregister_devs_sas_addr(dev, phy_id); + sas_unregister_devs_sas_addr(dev, phy_id, last); } else if (SAS_ADDR(attached_sas_addr) == SAS_ADDR(phy->attached_sas_addr)) { SAS_DPRINTK("ex %016llx phy 0x%x broadcast flutter\n", @@ -1828,12 +1880,27 @@ out: return res; } +/** + * sas_rediscover - revalidate the domain. + * @dev:domain device to be detect. + * @phy_id: the phy id will be detected. + * + * NOTE: this process _must_ quit (return) as soon as any connection + * errors are encountered. Connection recovery is done elsewhere. + * Discover process only interrogates devices in order to discover the + * domain.For plugging out, we un-register the device only when it is + * the last phy in the port, for other phys in this port, we just delete it + * from the port.For inserting, we do discovery when it is the + * first phy,for other phys in this port, we add it to the port to + * forming the wide-port. 
+ */ static int sas_rediscover(struct domain_device *dev, const int phy_id) { struct expander_device *ex = &dev->ex_dev; struct ex_phy *changed_phy = &ex->ex_phy[phy_id]; int res = 0; int i; + bool last = true; /* is this the last phy of the port */ SAS_DPRINTK("ex %016llx phy%d originated BROADCAST(CHANGE)\n", SAS_ADDR(dev->sas_addr), phy_id); @@ -1848,13 +1915,13 @@ static int sas_rediscover(struct domain_device *dev, const int phy_id) SAS_ADDR(changed_phy->attached_sas_addr)) { SAS_DPRINTK("phy%d part of wide port with " "phy%d\n", phy_id, i); - goto out; + last = false; + break; } } - res = sas_rediscover_dev(dev, phy_id); + res = sas_rediscover_dev(dev, phy_id, last); } else res = sas_discover_new(dev, phy_id); -out: return res; } @@ -1881,7 +1948,7 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) do { phy_id = -1; - res = sas_find_bcast_phy(dev, &phy_id, i); + res = sas_find_bcast_phy(dev, &phy_id, i, true); if (phy_id == -1) break; res = sas_rediscover(dev, phy_id); -- cgit v1.2.2 From ffd4bc2a984fab40ed969163efdff321490e8032 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 29 Jul 2009 14:06:53 -0400 Subject: [SCSI] sd: Avoid sending extended inquiry to legacy devices Some USB devices crash when we send them an inquiry with the EVPD bit set, regardless of page requested (i.e. including page 0). We only need the extended inquiry to gain access to VPD pages 0xB0 and 0xB1. These appeared in SBC2 and SBC3 respectively, so we can restrict sending the extended inquiry to devices reporting SPC3 or higher. This fixes bugzilla.kernel.org #13657. Signed-off-by: Martin K. 
Petersen [jejb: added comment] Signed-off-by: James Bottomley --- drivers/scsi/sd.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5616cd780ff3..b7b9fec67a98 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1840,6 +1840,18 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) kfree(buffer); } +static int sd_try_extended_inquiry(struct scsi_device *sdp) +{ + /* + * Although VPD inquiries can go to SCSI-2 type devices, + * some USB ones crash on receiving them, and the pages + * we currently ask for are for SPC-3 and beyond + */ + if (sdp->scsi_level > SCSI_SPC_2) + return 1; + return 0; +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -1877,8 +1889,12 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->media_present) { sd_read_capacity(sdkp, buffer); - sd_read_block_limits(sdkp); - sd_read_block_characteristics(sdkp); + + if (sd_try_extended_inquiry(sdp)) { + sd_read_block_limits(sdkp); + sd_read_block_characteristics(sdkp); + } + sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); -- cgit v1.2.2 From a541f8401d8e9113a89ee902cb8d8e412d6d3569 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Wed, 29 Jul 2009 08:49:52 +0000 Subject: iscsi: Use GFP_ATOMIC in iscsi_offload_mesg(). Changing to GFP_ATOMIC because the only caller in cnic/bnx2i may be calling this function while holding spin_lock. This problem was discovered by Mike Christie. Signed-off-by: Michael Chan Acked-by: Mike Christie Signed-off-by: David S. 
Miller --- drivers/scsi/scsi_transport_iscsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 783e33c65eb7..b47240ca4b19 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -990,7 +990,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, struct iscsi_uevent *ev; int len = NLMSG_SPACE(sizeof(*ev) + data_size); - skb = alloc_skb(len, GFP_NOIO); + skb = alloc_skb(len, GFP_ATOMIC); if (!skb) { printk(KERN_ERR "can not deliver iscsi offload message:OOM\n"); return -ENOMEM; @@ -1012,7 +1012,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, memcpy((char *)ev + sizeof(*ev), data, data_size); - return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_NOIO); + return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iscsi_offload_mesg); -- cgit v1.2.2 From 3d54015b750e5d5e950a1dcee2735387fd4b6e1a Mon Sep 17 00:00:00 2001 From: roel kluin Date: Thu, 30 Jul 2009 00:26:32 +0000 Subject: 3c515: Write outside array bounds if dev_alloc_skb() fails on the first iteration, a write to cp->rx_ring[-1] occurs. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/3c515.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/3c515.c b/drivers/net/3c515.c index 3e00fa8ea65f..4a7c32895be5 100644 --- a/drivers/net/3c515.c +++ b/drivers/net/3c515.c @@ -832,7 +832,9 @@ static int corkscrew_open(struct net_device *dev) skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */ vp->rx_ring[i].addr = isa_virt_to_bus(skb->data); } - vp->rx_ring[i - 1].next = isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */ + if (i != 0) + vp->rx_ring[i - 1].next = + isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */ outl(isa_virt_to_bus(&vp->rx_ring[0]), ioaddr + UpListPtr); } if (vp->full_bus_master_tx) { /* Boomerang bus master Tx. 
*/ -- cgit v1.2.2 From f0c5b35c6c93c89a9d8ccab19b0b4842f5dfddc5 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Wed, 29 Jul 2009 03:18:56 +0000 Subject: eexpress: Read buffer overflow start_code is 69 words, but the code always writes a multiple of 16 words, so the last 11 words written are outside the array. Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/eexpress.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 1686dca28748..1f016d66684a 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -1474,13 +1474,13 @@ static void eexp_hw_init586(struct net_device *dev) outw(0x0000, ioaddr + 0x800c); outw(0x0000, ioaddr + 0x800e); - for (i = 0; i < (sizeof(start_code)); i+=32) { + for (i = 0; i < ARRAY_SIZE(start_code) * 2; i+=32) { int j; outw(i, ioaddr + SM_PTR); - for (j = 0; j < 16; j+=2) + for (j = 0; j < 16 && (i+j)/2 < ARRAY_SIZE(start_code); j+=2) outw(start_code[(i+j)/2], ioaddr+0x4000+j); - for (j = 0; j < 16; j+=2) + for (j = 0; j < 16 && (i+j+16)/2 < ARRAY_SIZE(start_code); j+=2) outw(start_code[(i+j+16)/2], ioaddr+0x8000+j); } -- cgit v1.2.2 From daed953721850381673687c59f3a0df553eb6626 Mon Sep 17 00:00:00 2001 From: Frans Pop Date: Thu, 30 Jul 2009 17:16:05 -0400 Subject: hp-wmi: check that an input device exists in resume handler MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some systems may not support input events, or registering the input handler may have failed. So check that an input device exists before trying to set the docking and tablet mode state during resume. 
Fixes: http://bugzilla.kernel.org/show_bug.cgi?id=13865 Reported-and-tested-by: Cédric Godin Signed-off-by: Frans Pop Acked-by: Matthew Garrett Signed-off-by: Len Brown --- drivers/platform/x86/hp-wmi.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index ca508564a181..a2ad53e15874 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -520,11 +520,13 @@ static int hp_wmi_resume_handler(struct platform_device *device) * the input layer will only actually pass it on if the state * changed. */ - - input_report_switch(hp_wmi_input_dev, SW_DOCK, hp_wmi_dock_state()); - input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE, - hp_wmi_tablet_state()); - input_sync(hp_wmi_input_dev); + if (hp_wmi_input_dev) { + input_report_switch(hp_wmi_input_dev, SW_DOCK, + hp_wmi_dock_state()); + input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE, + hp_wmi_tablet_state()); + input_sync(hp_wmi_input_dev); + } return 0; } -- cgit v1.2.2 From 72fc939789dbe7ca091b50b686d45ac0df15417a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Jul 2009 23:43:08 +0000 Subject: pppoe: fix /proc/net/pppoe If a socket is hashed in last slot of pppoe hash table (PPPOE_HASH_SIZE-1) we report it many times (up to filling seq buffer) (Only the last socket of last slot) Signed-off-by: Eric Dumazet Signed-off-by: David S. 
Miller --- drivers/net/pppoe.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index f0031f1f97e5..5f2090233d7b 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -1063,6 +1063,7 @@ static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); + po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) -- cgit v1.2.2 From accff95c2500c7bce671c1f722de6f8810fe550d Mon Sep 17 00:00:00 2001 From: Jiajun Wu Date: Thu, 30 Jul 2009 14:20:42 -0700 Subject: gianfar: fix coalescing setup in ethtool support Parameter order for using mk_ic_value(count, time) was reversed, the patch fixes this. Signed-off-by: Jiajun Wu Signed-off-by: Li Yang Signed-off-by: David S. Miller --- drivers/net/gianfar_ethtool.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c index dbf06e9313cc..2234118eedbb 100644 --- a/drivers/net/gianfar_ethtool.c +++ b/drivers/net/gianfar_ethtool.c @@ -366,9 +366,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals return -EINVAL; } - priv->rxic = mk_ic_value( - gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs), - cvals->rx_max_coalesced_frames); + priv->rxic = mk_ic_value(cvals->rx_max_coalesced_frames, + gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs)); /* Set up tx coalescing */ if ((cvals->tx_coalesce_usecs == 0) || @@ -390,9 +389,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals return -EINVAL; } - priv->txic = mk_ic_value( - gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs), - cvals->tx_max_coalesced_frames); + priv->txic = mk_ic_value(cvals->tx_max_coalesced_frames, + gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs)); gfar_write(&priv->regs->rxic, 0); if (priv->rxcoalescing) -- cgit v1.2.2 From 
8f9a71673d9f397a365f4d18c307e91141b8fe92 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Thu, 30 Jul 2009 12:25:09 +0000 Subject: ixgbe: Fix netpoll to be properly multiqueue aware Our ndo_poll_controller callback is broken for anything but non-multiqueue setups. This fixes that issue. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. Miller --- drivers/net/ixgbe/ixgbe_main.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 200454f30f6a..60c4a8bf7d38 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -5360,12 +5360,19 @@ static int ixgbe_del_sanmac_netdev(struct net_device *dev) static void ixgbe_netpoll(struct net_device *netdev) { struct ixgbe_adapter *adapter = netdev_priv(netdev); + int i; - disable_irq(adapter->pdev->irq); adapter->flags |= IXGBE_FLAG_IN_NETPOLL; - ixgbe_intr(adapter->pdev->irq, netdev); + if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED) { + int num_q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS; + for (i = 0; i < num_q_vectors; i++) { + struct ixgbe_q_vector *q_vector = adapter->q_vector[i]; + ixgbe_msix_clean_many(0, q_vector); + } + } else { + ixgbe_intr(adapter->pdev->irq, netdev); + } adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL; - enable_irq(adapter->pdev->irq); } #endif -- cgit v1.2.2 From 0c19d6af9253f19b41821c29b9c49c2214f19425 Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Thu, 30 Jul 2009 12:25:28 +0000 Subject: ixgbe: Fix usage of second flags bitmap when using LRO/RSC A second set of feature flag bits was added, and the hardware RSC engine flags were moved there. However, the code itself didn't make the move completely to use the new bitmap. Signed-off-by: Peter P Waskiewicz Jr Acked-by: Mallikarjuna R Chilakala Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ixgbe/ixgbe_ethtool.c | 6 +++--- drivers/net/ixgbe/ixgbe_main.c | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c index 2a978008fd6e..7ddb50c03f0d 100644 --- a/drivers/net/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ixgbe/ixgbe_ethtool.c @@ -1999,13 +1999,13 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data) ethtool_op_set_flags(netdev, data); - if (!(adapter->flags & IXGBE_FLAG2_RSC_CAPABLE)) + if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) return 0; /* if state changes we need to update adapter->flags and reset */ if ((!!(data & ETH_FLAG_LRO)) != - (!!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED))) { - adapter->flags ^= IXGBE_FLAG2_RSC_ENABLED; + (!!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED))) { + adapter->flags2 ^= IXGBE_FLAG2_RSC_ENABLED; if (netif_running(netdev)) ixgbe_reinit_locked(adapter); else diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 60c4a8bf7d38..110c65ab5cb5 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -780,7 +780,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, prefetch(next_rxd); cleaned_count++; - if (adapter->flags & IXGBE_FLAG2_RSC_CAPABLE) + if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE) rsc_count = ixgbe_get_rsc_count(rx_desc); if (rsc_count) { @@ -2036,7 +2036,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); } } else { - if (!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED) && + if (!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) && (netdev->mtu <= ETH_DATA_LEN)) rx_buf_len = MAXIMUM_ETHERNET_VLAN_SIZE; else @@ -2165,7 +2165,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); } - if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) { + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) 
{ /* Enable 82599 HW-RSC */ for (i = 0; i < adapter->num_rx_queues; i++) { j = adapter->rx_ring[i].reg_idx; @@ -3812,8 +3812,8 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter) adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82598; } else if (hw->mac.type == ixgbe_mac_82599EB) { adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599; - adapter->flags |= IXGBE_FLAG2_RSC_CAPABLE; - adapter->flags |= IXGBE_FLAG2_RSC_ENABLED; + adapter->flags2 |= IXGBE_FLAG2_RSC_CAPABLE; + adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED; adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE; adapter->ring_feature[RING_F_FDIR].indices = IXGBE_MAX_FDIR_INDICES; @@ -5618,7 +5618,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, if (pci_using_dac) netdev->features |= NETIF_F_HIGHDMA; - if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) netdev->features |= NETIF_F_LRO; /* make sure the EEPROM is good */ -- cgit v1.2.2 From 0a924578bc4a2823a95c151f56975c71f5c156bb Mon Sep 17 00:00:00 2001 From: Peter P Waskiewicz Jr Date: Thu, 30 Jul 2009 12:26:00 +0000 Subject: ixgbe: Fix RSC completion delay causing Rx interrupts to stop When a user disables interrupt throttling with ethtool on 82599 devices, the interrupt timer may not be re-enabled if hardware RSC is running. The RSC completions in hardware don't complete before the next ITR event tries to fire, so the ITR timer never gets re-armed. This patch increases the amount of time between interrupts when throttling is disabled (rx-usecs = 0) when the hardware RSC feature is enabled. Signed-off-by: Peter P Waskiewicz Jr Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ixgbe/ixgbe.h | 2 ++ drivers/net/ixgbe/ixgbe_ethtool.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h index 1b12c7ba275f..e11d83d5852b 100644 --- a/drivers/net/ixgbe/ixgbe.h +++ b/drivers/net/ixgbe/ixgbe.h @@ -96,6 +96,8 @@ #define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000 #define IXGBE_TX_FLAGS_VLAN_SHIFT 16 +#define IXGBE_MAX_RSC_INT_RATE 162760 + /* wrapper around a pointer to a socket buffer, * so a DMA handle can be stored along with the buffer */ struct ixgbe_tx_buffer { diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c index 7ddb50c03f0d..79144e950a34 100644 --- a/drivers/net/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ixgbe/ixgbe_ethtool.c @@ -1975,7 +1975,10 @@ static int ixgbe_set_coalesce(struct net_device *netdev, * any other value means disable eitr, which is best * served by setting the interrupt rate very high */ - adapter->eitr_param = IXGBE_MAX_INT_RATE; + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) + adapter->eitr_param = IXGBE_MAX_RSC_INT_RATE; + else + adapter->eitr_param = IXGBE_MAX_INT_RATE; adapter->itr_setting = 0; } -- cgit v1.2.2 From 95fc17aac45300f45968aacd97a536ddd8db8101 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 31 Jul 2009 12:39:15 +1000 Subject: md/raid6: release spare page at ->stop() Add missing call to safe_put_page from stop() by unifying open coded raid5_conf_t de-allocation under free_conf(). 
Cc: Signed-off-by: Dan Williams Signed-off-by: NeilBrown --- drivers/md/raid5.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 37835538b58e..39374230a463 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4316,6 +4316,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void free_conf(raid5_conf_t *conf) +{ + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4447,11 +4456,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) abort: if (conf) { - shrink_stripes(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); return ERR_PTR(-EIO); } else return ERR_PTR(-ENOMEM); @@ -4629,12 +4634,8 @@ abort: md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { - shrink_stripes(conf); print_raid5_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); } mddev->private = NULL; printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); @@ -4649,13 +4650,10 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); - kfree(conf->disks); - kfree(conf); + free_conf(conf); mddev->private = NULL; return 0; } -- cgit v1.2.2 From da60a91d012bcb10bc5bcd86d585c4281742832c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Jun 2009 09:33:32 +0200 Subject: sdhci: use 
SG_MITER_TO_SG/SG_MITER_FROM_SG so the page will be flushed on unmap on ARCH which need it. Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Pierre Ossman --- drivers/mmc/host/sdhci.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 62041c7e9246..fc96f8cb9c0b 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -773,8 +773,14 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) } if (!(host->flags & SDHCI_REQ_USE_DMA)) { - sg_miter_start(&host->sg_miter, - data->sg, data->sg_len, SG_MITER_ATOMIC); + int flags; + + flags = SG_MITER_ATOMIC; + if (host->data->flags & MMC_DATA_READ) + flags |= SG_MITER_TO_SG; + else + flags |= SG_MITER_FROM_SG; + sg_miter_start(&host->sg_miter, data->sg, data->sg_len, flags); host->blocks = data->blocks; } -- cgit v1.2.2 From 4b2a108cd0d34880fe9d932258ca5b2ccebcd05e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 22 Jun 2009 09:18:05 +0200 Subject: cb710: use SG_MITER_TO_SG/SG_MITER_FROM_SG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the code already uses flush_kernel_dcache_page(). This patch updates the driver to the recent sg API changes which require that either SG_MITER_TO_SG or SG_MITER_FROM_SG is set. 
SG_MITER_TO_SG calls flush_kernel_dcache_page() in sg_miter_stop() Signed-off-by: Sebastian Andrzej Siewior Acked-by: Michał Mirosław Signed-off-by: Pierre Ossman --- drivers/misc/cb710/sgbuf2.c | 4 ---- drivers/mmc/host/cb710-mmc.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'drivers') diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index d38a7acdb6ec..d019746551f3 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -114,7 +114,6 @@ static void sg_dwiter_write_slow(struct sg_mapping_iter *miter, uint32_t data) if (!left) return; addr += len; - flush_kernel_dcache_page(miter->page); } while (sg_dwiter_next(miter)); } @@ -142,9 +141,6 @@ void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t da return; } else sg_dwiter_write_slow(miter, data); - - if (miter->length == miter->consumed) - flush_kernel_dcache_page(miter->page); } EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block); diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c index 11efefb1af51..4e72964a7b43 100644 --- a/drivers/mmc/host/cb710-mmc.c +++ b/drivers/mmc/host/cb710-mmc.c @@ -278,7 +278,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8))) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_TO_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 15, CB710_MMC_C2_READ_PIO_SIZE_MASK); @@ -307,7 +307,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) goto out; } out: - cb710_sg_miter_stop_writing(&miter); + sg_miter_stop(&miter); return err; } @@ -322,7 +322,7 @@ static int cb710_mmc_send(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blocks > 1 && data->blksz & 15)) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + 
sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_FROM_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 0, CB710_MMC_C2_READ_PIO_SIZE_MASK); -- cgit v1.2.2 From a9239d750d9991f2feee78fc5669a4613abc1adb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 28 Jun 2009 09:26:31 -0700 Subject: imxmmc: Remove unnecessary semicolons Signed-off-by: Joe Perches Acked-by: Pavel Pisa Signed-off-by: Pierre Ossman --- drivers/mmc/host/imxmmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/mmc/host/imxmmc.c b/drivers/mmc/host/imxmmc.c index e0be21a4a696..bf98d7cc928a 100644 --- a/drivers/mmc/host/imxmmc.c +++ b/drivers/mmc/host/imxmmc.c @@ -652,7 +652,7 @@ static irqreturn_t imxmci_irq(int irq, void *devid) set_bit(IMXMCI_PEND_STARTED_b, &host->pending_events); tasklet_schedule(&host->tasklet); - return IRQ_RETVAL(handled);; + return IRQ_RETVAL(handled); } static void imxmci_tasklet_fnc(unsigned long data) -- cgit v1.2.2 From 550e7fd8afb7664ae7cedb398c407694e2bf7d3c Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Sat, 1 Aug 2009 12:04:17 -0300 Subject: thinkpad-acpi: disable broken bay and dock subdrivers Currently, the ThinkPad-ACPI bay and dock drivers are completely broken, and cause a NULL pointer dereference in kernel mode (and, therefore, an OOPS) when they try to issue events (i.e. on dock, undock, bay ejection, etc). OTOH, the standard ACPI dock driver can handle the hotplug bays and docks of the ThinkPads just fine (including batteries) as of 2.6.27. In fact, it does a much better job of it than thinkpad-acpi ever did. It is just not worth the hassle to find a way to fix this crap without breaking the (deprecated) thinkpad-acpi dock/bay ABI. This is old, deprecated code that sees little testing or use. As a quick fix suitable for -stable backports, mark the thinkpad-acpi bay and dock subdrivers as BROKEN in Kconfig. The dead code will be removed by a later patch. 
This fixes bugzilla #13669, and should be applied to 2.6.27 and later. Signed-off-by: Henrique de Moraes Holschuh Reported-by: Joerg Platte Cc: stable@kernel.org Signed-off-by: Len Brown --- drivers/platform/x86/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 46dad12f952f..6335f63892dc 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -281,6 +281,7 @@ config THINKPAD_ACPI_DOCK bool "Legacy Docking Station Support" depends on THINKPAD_ACPI depends on ACPI_DOCK=n + depends on BROKEN default n ---help--- Allows the thinkpad_acpi driver to handle docking station events. @@ -294,7 +295,8 @@ config THINKPAD_ACPI_DOCK config THINKPAD_ACPI_BAY bool "Legacy Removable Bay Support" depends on THINKPAD_ACPI - default y + depends on BROKEN + default n ---help--- Allows the thinkpad_acpi driver to handle removable bays. It will electrically disable the device in the bay, and also generate -- cgit v1.2.2 From 1f6fc2de9525e34ee93bd392fa046369a8cfbf1e Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Sat, 1 Aug 2009 12:04:18 -0300 Subject: thinkpad-acpi: remove dock and bay subdrivers The standard ACPI dock driver can handle the hotplug bays and docks of the ThinkPads just fine (including batteries) as of 2.6.27, and the code in thinkpad-acpi for the dock and bay subdrivers is currently broken anyway... Userspace needs some love to support the two-stage ejection nicely, but it is simple enough to do through udev rules (you don't even need HAL) so this wouldn't justify fixing the dock and bay subdrivers, either. That leaves warm-swap bays (_EJ3) support for thinkpad-acpi, as well as support for the weird dock of the model 570, but since such support has never left the "experimental" stage, it is also not a strong enough reason to find a way to fix this code. 
Users of ThinkPads with warm-swap bays are urged to request that _EJ3 support be added to the regular ACPI dock driver, if such feature is indeed useful for them. Signed-off-by: Henrique de Moraes Holschuh Signed-off-by: Len Brown --- drivers/platform/x86/Kconfig | 27 --- drivers/platform/x86/thinkpad_acpi.c | 327 ----------------------------------- 2 files changed, 354 deletions(-) (limited to 'drivers') diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 6335f63892dc..77c6097ced80 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -277,33 +277,6 @@ config THINKPAD_ACPI_UNSAFE_LEDS Say N here, unless you are building a kernel for your own use, and need to control the important firmware LEDs. -config THINKPAD_ACPI_DOCK - bool "Legacy Docking Station Support" - depends on THINKPAD_ACPI - depends on ACPI_DOCK=n - depends on BROKEN - default n - ---help--- - Allows the thinkpad_acpi driver to handle docking station events. - This support was made obsolete by the generic ACPI docking station - support (CONFIG_ACPI_DOCK). It will allow locking and removing the - laptop from the docking station, but will not properly connect PCI - devices. - - If you are not sure, say N here. - -config THINKPAD_ACPI_BAY - bool "Legacy Removable Bay Support" - depends on THINKPAD_ACPI - depends on BROKEN - default n - ---help--- - Allows the thinkpad_acpi driver to handle removable bays. It will - electrically disable the device in the bay, and also generate - notifications when the bay lever is ejected or inserted. - - If you are not sure, say Y here. 
- config THINKPAD_ACPI_VIDEO bool "Video output control support" depends on THINKPAD_ACPI diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index a463fd72c495..27d68e719e90 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -239,12 +239,6 @@ struct ibm_init_struct { }; static struct { -#ifdef CONFIG_THINKPAD_ACPI_BAY - u32 bay_status:1; - u32 bay_eject:1; - u32 bay_status2:1; - u32 bay_eject2:1; -#endif u32 bluetooth:1; u32 hotkey:1; u32 hotkey_mask:1; @@ -589,18 +583,6 @@ static int acpi_ec_write(int i, u8 v) return 1; } -#if defined(CONFIG_THINKPAD_ACPI_DOCK) || defined(CONFIG_THINKPAD_ACPI_BAY) -static int _sta(acpi_handle handle) -{ - int status; - - if (!handle || !acpi_evalf(handle, &status, "_STA", "d")) - status = 0; - - return status; -} -#endif - static int issue_thinkpad_cmos_command(int cmos_cmd) { if (!cmos_handle) @@ -4441,293 +4423,6 @@ static struct ibm_struct light_driver_data = { .exit = light_exit, }; -/************************************************************************* - * Dock subdriver - */ - -#ifdef CONFIG_THINKPAD_ACPI_DOCK - -static void dock_notify(struct ibm_struct *ibm, u32 event); -static int dock_read(char *p); -static int dock_write(char *buf); - -TPACPI_HANDLE(dock, root, "\\_SB.GDCK", /* X30, X31, X40 */ - "\\_SB.PCI0.DOCK", /* 600e/x,770e,770x,A2xm/p,T20-22,X20-21 */ - "\\_SB.PCI0.PCI1.DOCK", /* all others */ - "\\_SB.PCI.ISA.SLCE", /* 570 */ - ); /* A21e,G4x,R30,R31,R32,R40,R40e,R50e */ - -/* don't list other alternatives as we install a notify handler on the 570 */ -TPACPI_HANDLE(pci, root, "\\_SB.PCI"); /* 570 */ - -static const struct acpi_device_id ibm_pci_device_ids[] = { - {PCI_ROOT_HID_STRING, 0}, - {"", 0}, -}; - -static struct tp_acpi_drv_struct ibm_dock_acpidriver[2] = { - { - .notify = dock_notify, - .handle = &dock_handle, - .type = ACPI_SYSTEM_NOTIFY, - }, - { - /* THIS ONE MUST NEVER BE USED FOR DRIVER AUTOLOADING. 
- * We just use it to get notifications of dock hotplug - * in very old thinkpads */ - .hid = ibm_pci_device_ids, - .notify = dock_notify, - .handle = &pci_handle, - .type = ACPI_SYSTEM_NOTIFY, - }, -}; - -static struct ibm_struct dock_driver_data[2] = { - { - .name = "dock", - .read = dock_read, - .write = dock_write, - .acpi = &ibm_dock_acpidriver[0], - }, - { - .name = "dock", - .acpi = &ibm_dock_acpidriver[1], - }, -}; - -#define dock_docked() (_sta(dock_handle) & 1) - -static int __init dock_init(struct ibm_init_struct *iibm) -{ - vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver\n"); - - TPACPI_ACPIHANDLE_INIT(dock); - - vdbg_printk(TPACPI_DBG_INIT, "dock is %s\n", - str_supported(dock_handle != NULL)); - - return (dock_handle)? 0 : 1; -} - -static int __init dock_init2(struct ibm_init_struct *iibm) -{ - int dock2_needed; - - vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver part 2\n"); - - if (dock_driver_data[0].flags.acpi_driver_registered && - dock_driver_data[0].flags.acpi_notify_installed) { - TPACPI_ACPIHANDLE_INIT(pci); - dock2_needed = (pci_handle != NULL); - vdbg_printk(TPACPI_DBG_INIT, - "dock PCI handler for the TP 570 is %s\n", - str_supported(dock2_needed)); - } else { - vdbg_printk(TPACPI_DBG_INIT, - "dock subdriver part 2 not required\n"); - dock2_needed = 0; - } - - return (dock2_needed)? 
0 : 1; -} - -static void dock_notify(struct ibm_struct *ibm, u32 event) -{ - int docked = dock_docked(); - int pci = ibm->acpi->hid && ibm->acpi->device && - acpi_match_device_ids(ibm->acpi->device, ibm_pci_device_ids); - int data; - - if (event == 1 && !pci) /* 570 */ - data = 1; /* button */ - else if (event == 1 && pci) /* 570 */ - data = 3; /* dock */ - else if (event == 3 && docked) - data = 1; /* button */ - else if (event == 3 && !docked) - data = 2; /* undock */ - else if (event == 0 && docked) - data = 3; /* dock */ - else { - printk(TPACPI_ERR "unknown dock event %d, status %d\n", - event, _sta(dock_handle)); - data = 0; /* unknown */ - } - acpi_bus_generate_proc_event(ibm->acpi->device, event, data); - acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class, - dev_name(&ibm->acpi->device->dev), - event, data); -} - -static int dock_read(char *p) -{ - int len = 0; - int docked = dock_docked(); - - if (!dock_handle) - len += sprintf(p + len, "status:\t\tnot supported\n"); - else if (!docked) - len += sprintf(p + len, "status:\t\tundocked\n"); - else { - len += sprintf(p + len, "status:\t\tdocked\n"); - len += sprintf(p + len, "commands:\tdock, undock\n"); - } - - return len; -} - -static int dock_write(char *buf) -{ - char *cmd; - - if (!dock_docked()) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (strlencmp(cmd, "undock") == 0) { - if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 0) || - !acpi_evalf(dock_handle, NULL, "_EJ0", "vd", 1)) - return -EIO; - } else if (strlencmp(cmd, "dock") == 0) { - if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 1)) - return -EIO; - } else - return -EINVAL; - } - - return 0; -} - -#endif /* CONFIG_THINKPAD_ACPI_DOCK */ - -/************************************************************************* - * Bay subdriver - */ - -#ifdef CONFIG_THINKPAD_ACPI_BAY - -TPACPI_HANDLE(bay, root, "\\_SB.PCI.IDE.SECN.MAST", /* 570 */ - "\\_SB.PCI0.IDE0.IDES.IDSM", /* 600e/x, 770e, 770x */ - 
"\\_SB.PCI0.SATA.SCND.MSTR", /* T60, X60, Z60 */ - "\\_SB.PCI0.IDE0.SCND.MSTR", /* all others */ - ); /* A21e, R30, R31 */ -TPACPI_HANDLE(bay_ej, bay, "_EJ3", /* 600e/x, A2xm/p, A3x */ - "_EJ0", /* all others */ - ); /* 570,A21e,G4x,R30,R31,R32,R40e,R50e */ -TPACPI_HANDLE(bay2, root, "\\_SB.PCI0.IDE0.PRIM.SLAV", /* A3x, R32 */ - "\\_SB.PCI0.IDE0.IDEP.IDPS", /* 600e/x, 770e, 770x */ - ); /* all others */ -TPACPI_HANDLE(bay2_ej, bay2, "_EJ3", /* 600e/x, 770e, A3x */ - "_EJ0", /* 770x */ - ); /* all others */ - -static int __init bay_init(struct ibm_init_struct *iibm) -{ - vdbg_printk(TPACPI_DBG_INIT, "initializing bay subdriver\n"); - - TPACPI_ACPIHANDLE_INIT(bay); - if (bay_handle) - TPACPI_ACPIHANDLE_INIT(bay_ej); - TPACPI_ACPIHANDLE_INIT(bay2); - if (bay2_handle) - TPACPI_ACPIHANDLE_INIT(bay2_ej); - - tp_features.bay_status = bay_handle && - acpi_evalf(bay_handle, NULL, "_STA", "qv"); - tp_features.bay_status2 = bay2_handle && - acpi_evalf(bay2_handle, NULL, "_STA", "qv"); - - tp_features.bay_eject = bay_handle && bay_ej_handle && - (strlencmp(bay_ej_path, "_EJ0") == 0 || experimental); - tp_features.bay_eject2 = bay2_handle && bay2_ej_handle && - (strlencmp(bay2_ej_path, "_EJ0") == 0 || experimental); - - vdbg_printk(TPACPI_DBG_INIT, - "bay 1: status %s, eject %s; bay 2: status %s, eject %s\n", - str_supported(tp_features.bay_status), - str_supported(tp_features.bay_eject), - str_supported(tp_features.bay_status2), - str_supported(tp_features.bay_eject2)); - - return (tp_features.bay_status || tp_features.bay_eject || - tp_features.bay_status2 || tp_features.bay_eject2)? 
0 : 1; -} - -static void bay_notify(struct ibm_struct *ibm, u32 event) -{ - acpi_bus_generate_proc_event(ibm->acpi->device, event, 0); - acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class, - dev_name(&ibm->acpi->device->dev), - event, 0); -} - -#define bay_occupied(b) (_sta(b##_handle) & 1) - -static int bay_read(char *p) -{ - int len = 0; - int occupied = bay_occupied(bay); - int occupied2 = bay_occupied(bay2); - int eject, eject2; - - len += sprintf(p + len, "status:\t\t%s\n", - tp_features.bay_status ? - (occupied ? "occupied" : "unoccupied") : - "not supported"); - if (tp_features.bay_status2) - len += sprintf(p + len, "status2:\t%s\n", occupied2 ? - "occupied" : "unoccupied"); - - eject = tp_features.bay_eject && occupied; - eject2 = tp_features.bay_eject2 && occupied2; - - if (eject && eject2) - len += sprintf(p + len, "commands:\teject, eject2\n"); - else if (eject) - len += sprintf(p + len, "commands:\teject\n"); - else if (eject2) - len += sprintf(p + len, "commands:\teject2\n"); - - return len; -} - -static int bay_write(char *buf) -{ - char *cmd; - - if (!tp_features.bay_eject && !tp_features.bay_eject2) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (tp_features.bay_eject && strlencmp(cmd, "eject") == 0) { - if (!acpi_evalf(bay_ej_handle, NULL, NULL, "vd", 1)) - return -EIO; - } else if (tp_features.bay_eject2 && - strlencmp(cmd, "eject2") == 0) { - if (!acpi_evalf(bay2_ej_handle, NULL, NULL, "vd", 1)) - return -EIO; - } else - return -EINVAL; - } - - return 0; -} - -static struct tp_acpi_drv_struct ibm_bay_acpidriver = { - .notify = bay_notify, - .handle = &bay_handle, - .type = ACPI_SYSTEM_NOTIFY, -}; - -static struct ibm_struct bay_driver_data = { - .name = "bay", - .read = bay_read, - .write = bay_write, - .acpi = &ibm_bay_acpidriver, -}; - -#endif /* CONFIG_THINKPAD_ACPI_BAY */ - /************************************************************************* * CMOS subdriver */ @@ -7854,22 +7549,6 @@ static struct 
ibm_init_struct ibms_init[] __initdata = { .init = light_init, .data = &light_driver_data, }, -#ifdef CONFIG_THINKPAD_ACPI_DOCK - { - .init = dock_init, - .data = &dock_driver_data[0], - }, - { - .init = dock_init2, - .data = &dock_driver_data[1], - }, -#endif -#ifdef CONFIG_THINKPAD_ACPI_BAY - { - .init = bay_init, - .data = &bay_driver_data, - }, -#endif { .init = cmos_init, .data = &cmos_driver_data, @@ -7968,12 +7647,6 @@ TPACPI_PARAM(hotkey); TPACPI_PARAM(bluetooth); TPACPI_PARAM(video); TPACPI_PARAM(light); -#ifdef CONFIG_THINKPAD_ACPI_DOCK -TPACPI_PARAM(dock); -#endif -#ifdef CONFIG_THINKPAD_ACPI_BAY -TPACPI_PARAM(bay); -#endif /* CONFIG_THINKPAD_ACPI_BAY */ TPACPI_PARAM(cmos); TPACPI_PARAM(led); TPACPI_PARAM(beep); -- cgit v1.2.2 From 5b05d4696d38c3172e79e855cc1e2ed044589508 Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Sat, 1 Aug 2009 12:04:19 -0300 Subject: thinkpad-acpi: restrict procfs count value to sane upper limit Signed-off-by: Michael Buesch Acked-by: Henrique de Moraes Holschuh Signed-off-by: Len Brown --- drivers/platform/x86/thinkpad_acpi.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 27d68e719e90..18f9ee63c50a 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -766,6 +766,8 @@ static int dispatch_procfs_write(struct file *file, if (!ibm || !ibm->write) return -EINVAL; + if (count > PAGE_SIZE - 2) + return -EINVAL; kernbuf = kmalloc(count + 2, GFP_KERNEL); if (!kernbuf) -- cgit v1.2.2 From 59fe4fe34d7afdf63208124f313be9056feaa2f4 Mon Sep 17 00:00:00 2001 From: Henrique de Moraes Holschuh Date: Sat, 1 Aug 2009 12:04:20 -0300 Subject: thinkpad-acpi: fix incorrect use of TPACPI_BRGHT_MODE_ECNVRAM HBRV-based default selection of backlight control strategy didn't work well, at least the X41 defines it but doesn't use it and I don't think it will stop there. Switch to a white/blacklist. 
All models that have HBRV defined have been included in the list, and initially all ATI GPUs will get ECNVRAM, and the Intel GPUs will get UCMS_STEP. Symptoms of incorrect backlight mode selection are: 1. Non-working backlight control through sysfs; 2. Backlight gets reset to the lowest level at every shutdown, reboot and when thinkpad-acpi gets unloaded; This fixes a regression in 2.6.30, bugzilla #13826 Signed-off-by: Henrique de Moraes Holschuh Reported-by: Tobias Diedrich Cc: stable@kernel.org Signed-off-by: Len Brown --- drivers/platform/x86/thinkpad_acpi.c | 61 +++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 14 deletions(-) (limited to 'drivers') diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 18f9ee63c50a..e85600852502 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -5642,14 +5642,48 @@ static struct backlight_ops ibm_backlight_data = { /* --------------------------------------------------------------------- */ +/* + * These are only useful for models that have only one possibility + * of GPU. If the BIOS model handles both ATI and Intel, don't use + * these quirks. 
+ */ +#define TPACPI_BRGHT_Q_NOEC 0x0001 /* Must NOT use EC HBRV */ +#define TPACPI_BRGHT_Q_EC 0x0002 /* Should or must use EC HBRV */ +#define TPACPI_BRGHT_Q_ASK 0x8000 /* Ask for user report */ + +static const struct tpacpi_quirk brightness_quirk_table[] __initconst = { + /* Models with ATI GPUs known to require ECNVRAM mode */ + TPACPI_Q_IBM('1', 'Y', TPACPI_BRGHT_Q_EC), /* T43/p ATI */ + + /* Models with ATI GPUs (waiting confirmation) */ + TPACPI_Q_IBM('1', 'R', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('1', 'Q', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('7', '6', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('7', '8', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + + /* Models with Intel Extreme Graphics 2 (waiting confirmation) */ + TPACPI_Q_IBM('1', 'V', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + TPACPI_Q_IBM('1', 'W', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + TPACPI_Q_IBM('1', 'U', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + + /* Models with Intel GMA900 */ + TPACPI_Q_IBM('7', '0', TPACPI_BRGHT_Q_NOEC), /* T43, R52 */ + TPACPI_Q_IBM('7', '4', TPACPI_BRGHT_Q_NOEC), /* X41 */ + TPACPI_Q_IBM('7', '5', TPACPI_BRGHT_Q_NOEC), /* X41 Tablet */ +}; + static int __init brightness_init(struct ibm_init_struct *iibm) { int b; + unsigned long quirks; vdbg_printk(TPACPI_DBG_INIT, "initializing brightness subdriver\n"); mutex_init(&brightness_mutex); + quirks = tpacpi_check_quirks(brightness_quirk_table, + ARRAY_SIZE(brightness_quirk_table)); + /* * We always attempt to detect acpi support, so as to switch * Lenovo Vista BIOS to ACPI brightness mode even if we are not @@ -5706,23 +5740,13 @@ static int __init brightness_init(struct ibm_init_struct *iibm) /* TPACPI_BRGHT_MODE_AUTO not implemented yet, just use default */ if (brightness_mode == TPACPI_BRGHT_MODE_AUTO || brightness_mode == TPACPI_BRGHT_MODE_MAX) { - if (thinkpad_id.vendor == PCI_VENDOR_ID_IBM) { - /* - * IBM models that define HBRV probably have - * EC-based backlight level control 
- */ - if (acpi_evalf(ec_handle, NULL, "HBRV", "qd")) - /* T40-T43, R50-R52, R50e, R51e, X31-X41 */ - brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM; - else - /* all other IBM ThinkPads */ - brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP; - } else - /* All Lenovo ThinkPads */ + if (quirks & TPACPI_BRGHT_Q_EC) + brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM; + else brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP; dbg_printk(TPACPI_DBG_BRGHT, - "selected brightness_mode=%d\n", + "driver auto-selected brightness_mode=%d\n", brightness_mode); } @@ -5749,6 +5773,15 @@ static int __init brightness_init(struct ibm_init_struct *iibm) vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_BRGHT, "brightness is supported\n"); + if (quirks & TPACPI_BRGHT_Q_ASK) { + printk(TPACPI_NOTICE + "brightness: will use unverified default: " + "brightness_mode=%d\n", brightness_mode); + printk(TPACPI_NOTICE + "brightness: please report to %s whether it works well " + "or not on your ThinkPad\n", TPACPI_MAIL); + } + ibm_backlight_device->props.max_brightness = (tp_features.bright_16levels)? 15 : 7; ibm_backlight_device->props.brightness = b & TP_EC_BACKLIGHT_LVLMSK; -- cgit v1.2.2 From 447c233da4d109c6194fefd69e5185cbc93cc062 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 2 Aug 2009 08:02:28 +0000 Subject: parisc: Fix read buffer overflow in pdc_stable driver Check whether index is within bounds before testing the element. 
Signed-off-by: Roel Kluin Signed-off-by: Helge Deller --- drivers/parisc/pdc_stable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index f9f9a5f1bbd0..13a64bc081b6 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -370,7 +370,7 @@ pdcspath_layer_read(struct pdcspath_entry *entry, char *buf) if (!i) /* entry is not ready */ return -ENODATA; - for (i = 0; devpath->layers[i] && (likely(i < 6)); i++) + for (i = 0; i < 6 && devpath->layers[i]; i++) out += sprintf(out, "%u ", devpath->layers[i]); out += sprintf(out, "\n"); -- cgit v1.2.2 From 450d6e306b4717bfae11218a02648509baf04ce1 Mon Sep 17 00:00:00 2001 From: Stoyan Gaydarov Date: Thu, 30 Jul 2009 10:25:19 +0000 Subject: parisc: fixed faulty check in lba_pci This patch fixes a spelling error that resulted from copying and pasting. The location of the error was found using a semantic patch but the semantic patch was not trying to find these errors. After looking things over it seemed logical that this change was needed. Signed-off-by: Stoyan Gaydarov Signed-off-by: Helge Deller --- drivers/parisc/lba_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index ede614616f8e..3aeb3279c92a 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -992,7 +992,7 @@ lba_pat_resources(struct parisc_device *pa_dev, struct lba_device *lba_dev) return; io_pdc_cell = kzalloc(sizeof(pdc_pat_cell_mod_maddr_block_t), GFP_KERNEL); - if (!pa_pdc_cell) { + if (!io_pdc_cell) { kfree(pa_pdc_cell); return; } -- cgit v1.2.2 From 6b4dbcd86a9d464057fcc7abe4d0574093071fcc Mon Sep 17 00:00:00 2001 From: Michael Buesch Date: Mon, 20 Jul 2009 22:58:44 +0000 Subject: parisc: isa-eeprom - Fix loff_t usage loff_t is a signed type. If userspace passes a negative ppos, the "count" range check is weakened.
"count"s bigger than HPEE_MAX_LENGTH will pass the check. Also, if ppos is negative, the readb(eisa_eeprom_addr + *ppos) will poke in random memory. Signed-off-by: Michael Buesch Cc: stable@kernel.org Signed-off-by: Helge Deller --- drivers/parisc/eisa_eeprom.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 685d94e69d44..8c0b26e9b98a 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -55,7 +55,7 @@ static ssize_t eisa_eeprom_read(struct file * file, ssize_t ret; int i; - if (*ppos >= HPEE_MAX_LENGTH) + if (*ppos < 0 || *ppos >= HPEE_MAX_LENGTH) return 0; count = *ppos + count < HPEE_MAX_LENGTH ? count : HPEE_MAX_LENGTH - *ppos; -- cgit v1.2.2 From b10ff54f9f58adfb708b53e6e56ed3d7804ade74 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 8 Jul 2009 15:27:20 +0000 Subject: parisc: includecheck fix for ccio-dma.c fix the following 'make includecheck' warning: drivers/parisc/ccio-dma.c: linux/proc_fs.h is included more than once. 
Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Helge Deller --- drivers/parisc/ccio-dma.c | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 0f0e0b919ef4..a45b0c0d574e 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -70,7 +70,6 @@ #undef CCIO_COLLECT_STATS #endif -#include #include /* for proc_runway_root */ #ifdef DEBUG_CCIO_INIT -- cgit v1.2.2 From c6fe6b0783a8fd923d11dd0388cbd561ff15bdf1 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 2 Aug 2009 15:13:29 +0200 Subject: parisc: hp_sdc_mlc.c - check return value of down_trylock() Signed-off-by: Helge Deller --- drivers/input/serio/hp_sdc_mlc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c index b587e2d576ac..820e51673b26 100644 --- a/drivers/input/serio/hp_sdc_mlc.c +++ b/drivers/input/serio/hp_sdc_mlc.c @@ -296,7 +296,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc) priv->tseq[3] = 0; if (mlc->opacket & HIL_CTRL_APE) { priv->tseq[3] |= HP_SDC_LPC_APE_IPF; - down_trylock(&mlc->csem); + BUG_ON(down_trylock(&mlc->csem)); } enqueue: hp_sdc_enqueue_transaction(&priv->trans); -- cgit v1.2.2 From 1e0deabd35f210f22c03cc734a0335c07ae71ff3 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 2 Aug 2009 15:17:37 +0200 Subject: parisc: dino.c - check return value of pci_assign_resource() Signed-off-by: Helge Deller --- drivers/parisc/dino.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index c590974e9815..d69bde6a2343 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -614,7 +614,7 @@ dino_fixup_bus(struct pci_bus *bus) dev_name(&bus->self->dev), i, bus->self->resource[i].start, bus->self->resource[i].end); - pci_assign_resource(bus->self, i); + WARN_ON(pci_assign_resource(bus->self, i)); 
DBG("DEBUG %s after assign %d [0x%lx,0x%lx]\n", dev_name(&bus->self->dev), i, bus->self->resource[i].start, -- cgit v1.2.2 From 1a1dba32412c15c51d5fc0b9efadd2ea310356d7 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 2 Aug 2009 15:26:51 +0200 Subject: parisc: sticore.c - check return values Signed-off-by: Helge Deller --- drivers/video/console/sticore.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c index ef7870f5ea08..857b3668b3ba 100644 --- a/drivers/video/console/sticore.c +++ b/drivers/video/console/sticore.c @@ -957,9 +957,14 @@ static int __devinit sticore_pci_init(struct pci_dev *pd, #ifdef CONFIG_PCI unsigned long fb_base, rom_base; unsigned int fb_len, rom_len; + int err; struct sti_struct *sti; - pci_enable_device(pd); + err = pci_enable_device(pd); + if (err < 0) { + dev_err(&pd->dev, "Cannot enable PCI device\n"); + return err; + } fb_base = pci_resource_start(pd, 0); fb_len = pci_resource_len(pd, 0); @@ -1048,7 +1053,7 @@ static void __devinit sti_init_roms(void) /* Register drivers for native & PCI cards */ register_parisc_driver(&pa_sti_driver); - pci_register_driver(&pci_sti_driver); + WARN_ON(pci_register_driver(&pci_sti_driver)); /* if we didn't find the given default sti, take the first one */ if (!default_sti) -- cgit v1.2.2 From c43962321e8af5309dd3ffcd78743c89581265e5 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 2 Aug 2009 15:35:43 +0200 Subject: parisc: parisc-agp.c - use correct page_mask function Fix those compiler warnings, which indeed point to a bug: drivers/char/agp/parisc-agp.c:228: warning: initialization from incompatible pointer type drivers/char/agp/parisc-agp.c:201: warning: 'parisc_agp_page_mask_memory' defined but not used Signed-off-by: Helge Deller --- drivers/char/agp/parisc-agp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git 
a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index f4bb43fb8016..e077701ae3d9 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -225,7 +225,7 @@ static const struct agp_bridge_driver parisc_agp_driver = { .configure = parisc_agp_configure, .fetch_size = parisc_agp_fetch_size, .tlb_flush = parisc_agp_tlbflush, - .mask_memory = parisc_agp_mask_memory, + .mask_memory = parisc_agp_page_mask_memory, .masks = parisc_agp_masks, .agp_enable = parisc_agp_enable, .cache_flush = global_cache_flush, -- cgit v1.2.2 From cae5a39f34d52c46ca49edfc3f297656a0fd60b7 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 2 Aug 2009 15:42:39 +0200 Subject: parisc: hppb.c - fix printk format strings Fix those warnings: drivers/parisc/hppb.c: In function 'hppb_probe': drivers/parisc/hppb.c:65: warning: format '%x' expects type 'unsigned int', but argument 2 has type 'resource_size_t' drivers/parisc/hppb.c:77: warning: format '%08x' expects type 'unsigned int', but argument 3 has type 'resource_size_t' drivers/parisc/hppb.c:77: warning: format '%08x' expects type 'unsigned int', but argument 4 has type 'resource_size_t' Signed-off-by: Helge Deller --- drivers/parisc/hppb.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/parisc/hppb.c b/drivers/parisc/hppb.c index 13856415b432..815db175d427 100644 --- a/drivers/parisc/hppb.c +++ b/drivers/parisc/hppb.c @@ -62,7 +62,8 @@ static int hppb_probe(struct parisc_device *dev) } card = card->next; } - printk(KERN_INFO "Found GeckoBoa at 0x%x\n", dev->hpa.start); + printk(KERN_INFO "Found GeckoBoa at 0x%llx\n", + (unsigned long long) dev->hpa.start); card->hpa = dev->hpa.start; card->mmio_region.name = "HP-PB Bus"; @@ -73,8 +74,10 @@ static int hppb_probe(struct parisc_device *dev) status = ccio_request_resource(dev, &card->mmio_region); if(status < 0) { - printk(KERN_ERR "%s: failed to claim HP-PB bus space (%08x, %08x)\n", - __FILE__, 
card->mmio_region.start, card->mmio_region.end); + printk(KERN_ERR "%s: failed to claim HP-PB " + "bus space (0x%08llx, 0x%08llx)\n", + __FILE__, (unsigned long long) card->mmio_region.start, + (unsigned long long) card->mmio_region.end); } return 0; -- cgit v1.2.2 From d0006f3281c920fbfead0f5035c62ec8053f980a Mon Sep 17 00:00:00 2001 From: Len Brown Date: Thu, 30 Jul 2009 16:00:53 -0400 Subject: ACPI: root-only read protection on /sys/firmware/acpi/tables/* they were world readable. Signed-off-by: Len Brown --- drivers/acpi/system.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 0944daec064f..9c61ab2177cf 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -121,7 +121,7 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr, table_attr->attr.size = 0; table_attr->attr.read = acpi_table_show; table_attr->attr.attr.name = table_attr->name; - table_attr->attr.attr.mode = 0444; + table_attr->attr.attr.mode = 0400; return; } -- cgit v1.2.2 From 74b5820808215f65b70b05a099d6d3c969b82689 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Wed, 29 Jul 2009 15:54:25 -0600 Subject: ACPI: bind workqueues to CPU 0 to avoid SMI corruption On some machines, a software-initiated SMI causes corruption unless the SMI runs on CPU 0. An SMI can be initiated by any AML, but typically it's done in GPE-related methods that are run via workqueues, so we can avoid the known corruption cases by binding the workqueues to CPU 0. 
References: http://bugzilla.kernel.org/show_bug.cgi?id=13751 https://bugs.launchpad.net/bugs/157171 https://bugs.launchpad.net/bugs/157691 Signed-off-by: Bjorn Helgaas Signed-off-by: Len Brown --- drivers/acpi/osl.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'drivers') diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 71670719d61a..5691f165a952 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -189,11 +189,36 @@ acpi_status __init acpi_os_initialize(void) return AE_OK; } +static void bind_to_cpu0(struct work_struct *work) +{ + set_cpus_allowed(current, cpumask_of_cpu(0)); + kfree(work); +} + +static void bind_workqueue(struct workqueue_struct *wq) +{ + struct work_struct *work; + + work = kzalloc(sizeof(struct work_struct), GFP_KERNEL); + INIT_WORK(work, bind_to_cpu0); + queue_work(wq, work); +} + acpi_status acpi_os_initialize1(void) { + /* + * On some machines, a software-initiated SMI causes corruption unless + * the SMI runs on CPU 0. An SMI can be initiated by any AML, but + * typically it's done in GPE-related methods that are run via + * workqueues, so we can avoid the known corruption cases by binding + * the workqueues to CPU 0. + */ kacpid_wq = create_singlethread_workqueue("kacpid"); + bind_workqueue(kacpid_wq); kacpi_notify_wq = create_singlethread_workqueue("kacpi_notify"); + bind_workqueue(kacpi_notify_wq); kacpi_hotplug_wq = create_singlethread_workqueue("kacpi_hotplug"); + bind_workqueue(kacpi_hotplug_wq); BUG_ON(!kacpid_wq); BUG_ON(!kacpi_notify_wq); BUG_ON(!kacpi_hotplug_wq); -- cgit v1.2.2 From aa7b2b2e973874df99a45b31adbed5978b46be1f Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Fri, 3 Jul 2009 10:49:03 +0800 Subject: ACPI: Don't treat generic error as ACPI error code in acpi memory hotplug driver Don't treat the generic error as ACPI error code. 
Otherwise when the generic code is returned, it will emit the following warning message: >ACPI Exception (acpi_memhotplug-0171): UNKNOWN_STATUS_CODE, Cannot get acpi bus device [20080609] >ACPI: Cannot find driver data > ACPI Error (utglobal-0127): Unknown exception code: 0xFFFFFFED [20080609] > Pid: 85, comm: kacpi_notify Not tainted 2.6.27.19-5-default #1 Call Trace: [] show_trace_log_lvl+0x41/0x58 [] dump_stack+0x69/0x6f ..... At the same time, when the generic error code is returned, the ACPI_EXCEPTION is replaced by a printk. Signed-off-by: Zhao Yakui Signed-off-by: Len Brown --- drivers/acpi/acpi_memhotplug.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 7a0f4aa4fa1e..37cbe72d17eb 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -38,6 +38,9 @@ #define _COMPONENT ACPI_MEMORY_DEVICE_COMPONENT +#undef PREFIX +#define PREFIX "ACPI:memory_hp:" + ACPI_MODULE_NAME("acpi_memhotplug"); MODULE_AUTHOR("Naveen B S "); MODULE_DESCRIPTION("Hotplug Mem Driver"); @@ -153,6 +156,7 @@ acpi_memory_get_device(acpi_handle handle, acpi_handle phandle; struct acpi_device *device = NULL; struct acpi_device *pdevice = NULL; + int result; if (!acpi_bus_get_device(handle, &device) && device) @@ -165,9 +169,9 @@ acpi_memory_get_device(acpi_handle handle, } /* Get the parent device */ - status = acpi_bus_get_device(phandle, &pdevice); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Cannot get acpi bus device")); + result = acpi_bus_get_device(phandle, &pdevice); + if (result) { + printk(KERN_WARNING PREFIX "Cannot get acpi bus device"); return -EINVAL; } @@ -175,9 +179,9 @@ acpi_memory_get_device(acpi_handle handle, * Now add the notified device.
This creates the acpi_device * and invokes .add function */ - status = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Cannot add acpi bus")); + result = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE); + if (result) { + printk(KERN_WARNING PREFIX "Cannot add acpi bus"); return -EINVAL; } -- cgit v1.2.2 From 5d2619fca753d270e63e76c9e18437b0d9bc8d75 Mon Sep 17 00:00:00 2001 From: Zhao Yakui Date: Tue, 7 Jul 2009 10:56:11 +0800 Subject: ACPI: Ingore the memory block with zero block size in course of memory hotplug If the memory block size is zero, ignore it and don't do the memory hotplug flowchart. Otherwise it will complain the following warning message: >System RAM resource 0 - ffffffffffffffff cannot be added Signed-off-by: Zhao Yakui Signed-off-by: Len Brown --- drivers/acpi/acpi_memhotplug.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 37cbe72d17eb..9a62224cc278 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -242,7 +242,12 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) num_enabled++; continue; } - + /* + * If the memory block size is zero, please ignore it. + * Don't try to do the following memory hotplug flowchart. + */ + if (!info->length) + continue; if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); @@ -257,8 +262,15 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) mem_device->state = MEMORY_INVALID_STATE; return -EINVAL; } - - return result; + /* + * Sometimes the memory device will contain several memory blocks. + * When one memory block is hot-added to the system memory, it will + * be regarded as a success. 
+ * Otherwise if the last memory block can't be hot-added to the system + * memory, it will be failure and the memory device can't be bound with + * driver. + */ + return 0; } static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device) -- cgit v1.2.2 From 7334546a52c6764df120459509b1f803a073eacc Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Mon, 29 Jun 2009 09:40:07 +0100 Subject: eeepc-laptop: fix hot-unplug on resume OOPS on resume when the wireless adaptor is disabled during suspend was introduced by "eeepc-laptop: read rfkill soft-blocked state on resume". Unable to handle kernel NULL pointer dereference Process s2disk Tainted: G W IP: klist_put Call trace: ? klist_del ? device_del ? device_unregister ? pci_stop_dev ? pci_stop_bus ? pci_remove_device ? eeepc_rfkill_hotplug [eeepc_laptop] ? eeepc_hotk_resume [eeepc_laptop] ? acpi_device_resume ? device_resume ? hibernation_snapshot It appears the PCI device is removed twice. The eeepc_rfkill_hotplug() call from the resume handler is racing against the call from the ACPI notifier callback. The ACPI notification is triggered by the resume handler when it refreshes the value of CM_ASL_WLAN. The fix is to serialize hotplug calls using a workqueue. 
http://bugzilla.kernel.org/show_bug.cgi?id=13825 Signed-off-by: Alan Jenkins Acked-by: Corentin Chary Signed-off-by: Len Brown --- drivers/platform/x86/eeepc-laptop.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index ec560f16d720..222ffb892f22 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -143,6 +143,7 @@ struct eeepc_hotk { struct rfkill *bluetooth_rfkill; struct rfkill *wwan3g_rfkill; struct hotplug_slot *hotplug_slot; + struct work_struct hotplug_work; }; /* The actual device the driver binds to */ @@ -660,7 +661,7 @@ static int eeepc_get_adapter_status(struct hotplug_slot *hotplug_slot, return 0; } -static void eeepc_rfkill_hotplug(void) +static void eeepc_hotplug_work(struct work_struct *work) { struct pci_dev *dev; struct pci_bus *bus = pci_find_bus(0, 1); @@ -701,7 +702,7 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) if (event != ACPI_NOTIFY_BUS_CHECK) return; - eeepc_rfkill_hotplug(); + schedule_work(&ehotk->hotplug_work); } static void eeepc_hotk_notify(struct acpi_device *device, u32 event) @@ -892,7 +893,7 @@ static int eeepc_hotk_resume(struct acpi_device *device) rfkill_set_sw_state(ehotk->wlan_rfkill, wlan != 1); - eeepc_rfkill_hotplug(); + schedule_work(&ehotk->hotplug_work); } if (ehotk->bluetooth_rfkill) @@ -1093,6 +1094,8 @@ static int eeepc_rfkill_init(struct device *dev) { int result = 0; + INIT_WORK(&ehotk->hotplug_work, eeepc_hotplug_work); + eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P6"); eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P7"); -- cgit v1.2.2 From a53a8b56827cc429c6d9f861ad558beeb5f6103f Mon Sep 17 00:00:00 2001 From: Ben McKeegan Date: Tue, 28 Jul 2009 07:43:57 +0000 Subject: ppp: fix lost fragments in ppp_mp_explode() (resubmit) This patch fixes the corner cases where the sum of MTU of the free channels (adjusted for 
fragmentation overheads) is less than the MTU of PPP link. There are at least 3 situations where this case might arise: - some of the channels are busy - the multilink session is running in a degraded state (i.e. with less than its full complement of active channels) - by design, where multilink protocol is being used to artificially increase the effective link MTU of a single link. Without this patch, at most 1 fragment is ever sent per free channel for a given PPP frame and any remaining part of the PPP frame that does not fit into those fragments is silently discarded. This patch restores the original behaviour which was broken by commit 9c705260feea6ae329bc6b6d5f6d2ef0227eda0a 'ppp:ppp_mp_explode() redesign'. Once all 'free' channels have been given a fragment, an additional fragment is queued to each available channel in turn, as many times as necessary, until the entire PPP frame has been consumed. Signed-off-by: Ben McKeegan Signed-off-by: David S. Miller --- drivers/net/ppp_generic.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) (limited to 'drivers') diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 639d11bc444e..cd37d739ac74 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -1384,7 +1384,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) /* create a fragment for each channel */ bits = B; - while (nfree > 0 && len > 0) { + while (len > 0) { list = list->next; if (list == &ppp->channels) { i = 0; @@ -1431,29 +1431,31 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) *otherwise divide it according to the speed *of the channel we are going to transmit on */ - if (pch->speed == 0) { - flen = totlen/nfree ; - if (nbigger > 0) { - flen++; - nbigger--; - } - } else { - flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / - ((totspeed*totfree)/pch->speed)) - hdrlen; - if (nbigger > 0) { - flen += ((totfree - nzero)*pch->speed)/totspeed; 
- nbigger -= ((totfree - nzero)*pch->speed)/ + if (nfree > 0) { + if (pch->speed == 0) { + flen = totlen/nfree ; + if (nbigger > 0) { + flen++; + nbigger--; + } + } else { + flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / + ((totspeed*totfree)/pch->speed)) - hdrlen; + if (nbigger > 0) { + flen += ((totfree - nzero)*pch->speed)/totspeed; + nbigger -= ((totfree - nzero)*pch->speed)/ totspeed; + } } + nfree--; } - nfree--; /* *check if we are on the last channel or *we exceded the lenght of the data to *fragment */ - if ((nfree == 0) || (flen > len)) + if ((nfree <= 0) || (flen > len)) flen = len; /* *it is not worth to tx on slow channels: @@ -1467,7 +1469,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) continue; } - mtu = pch->chan->mtu + 2 - hdrlen; + mtu = pch->chan->mtu - hdrlen; if (mtu < 4) mtu = 4; if (flen > mtu) -- cgit v1.2.2 From 446e72f30eca76d6f9a1a54adf84d2c6ba2831f8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Jul 2009 03:47:39 +0000 Subject: pppol2tp: calls unregister_pernet_gen_device() at unload time Failure to call unregister_pernet_gen_device() can exhaust memory if module is loaded/unloaded many times. Signed-off-by: Eric Dumazet Acked-by: Cyrill Gorcunov Signed-off-by: David S. Miller --- drivers/net/pppol2tp.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c index e7935d09c896..e0f9219a0aea 100644 --- a/drivers/net/pppol2tp.c +++ b/drivers/net/pppol2tp.c @@ -2680,6 +2680,7 @@ out_unregister_pppol2tp_proto: static void __exit pppol2tp_exit(void) { unregister_pppox_proto(PX_PROTO_OL2TP); + unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops); proto_unregister(&pppol2tp_sk_proto); } -- cgit v1.2.2 From 1b994b5a1b3cb5395598a08ef3bb0ac118d75c1b Mon Sep 17 00:00:00 2001 From: roel kluin Date: Sat, 1 Aug 2009 20:26:52 +0000 Subject: tulip: Read buffer overflow Check whether index is within bounds before testing the element. 
Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/tulip/de4x5.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c index eb72d2e9ab3d..acfdccd44567 100644 --- a/drivers/net/tulip/de4x5.c +++ b/drivers/net/tulip/de4x5.c @@ -5059,7 +5059,7 @@ mii_get_phy(struct net_device *dev) if ((id == 0) || (id == 65535)) continue; /* Valid ID? */ for (j=0; jphy[k].id && (k < DE4X5_MAX_PHY); k++); + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); if (k < DE4X5_MAX_PHY) { memcpy((char *)&lp->phy[k], (char *)&phy_info[j], sizeof(struct phy_table)); @@ -5072,7 +5072,7 @@ mii_get_phy(struct net_device *dev) break; } if ((j == limit) && (i < DE4X5_MAX_MII)) { - for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++); + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); lp->phy[k].addr = i; lp->phy[k].id = id; lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ @@ -5091,7 +5091,7 @@ mii_get_phy(struct net_device *dev) purgatory: lp->active = 0; if (lp->phy[0].id) { /* Reset the PHY devices */ - for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++) { /*For each PHY*/ + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++) { /*For each PHY*/ mii_wr(MII_CR_RST, MII_CR, lp->phy[k].addr, DE4X5_MII); while (mii_rd(MII_CR, lp->phy[k].addr, DE4X5_MII) & MII_CR_RST); -- cgit v1.2.2 From 54706d99051582993037be5a076aa543fd7f1c38 Mon Sep 17 00:00:00 2001 From: roel kluin Date: Sat, 1 Aug 2009 20:20:13 +0000 Subject: s6gmac: Read buffer overflow Check whether index is within bounds before testing the element. In the last iteration i is PHY_MAX_ADDR. the condition `!(p = pd->mii.bus->phy_map[PHY_MAX_ADDR])' is undefined and may evaluate to false, which leads to a dereference of this invalid phy_map in the phy_connect() below. Signed-off-by: Roel Kluin Signed-off-by: David S. 
Miller --- drivers/net/s6gmac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/s6gmac.c b/drivers/net/s6gmac.c index 5345e47b35ac..4525cbe8dd69 100644 --- a/drivers/net/s6gmac.c +++ b/drivers/net/s6gmac.c @@ -793,7 +793,7 @@ static inline int s6gmac_phy_start(struct net_device *dev) struct s6gmac *pd = netdev_priv(dev); int i = 0; struct phy_device *p = NULL; - while ((!(p = pd->mii.bus->phy_map[i])) && (i < PHY_MAX_ADDR)) + while ((i < PHY_MAX_ADDR) && (!(p = pd->mii.bus->phy_map[i]))) i++; p = phy_connect(dev, dev_name(&p->dev), &s6gmac_adjust_link, 0, PHY_INTERFACE_MODE_RGMII); -- cgit v1.2.2 From 9bfdac94c78faf68ce038d5c45a385927f2667ce Mon Sep 17 00:00:00 2001 From: roel kluin Date: Fri, 31 Jul 2009 03:43:59 +0000 Subject: mISDN: Read buffer overflow Check whether index is within bounds before testing the element. Signed-off-by: Roel Kluin Acked-by: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/mISDN/l1oip_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index 990e6a7e6674..0ebce046ade4 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -1480,7 +1480,7 @@ l1oip_init(void) return -ENOMEM; l1oip_cnt = 0; - while (type[l1oip_cnt] && l1oip_cnt < MAX_CARDS) { + while (l1oip_cnt < MAX_CARDS && type[l1oip_cnt]) { switch (type[l1oip_cnt] & 0xff) { case 1: pri = 0; -- cgit v1.2.2 From 50c643e7652458e649955408685a16e88ea6dbae Mon Sep 17 00:00:00 2001 From: Dhananjay Phadke Date: Sat, 1 Aug 2009 21:36:16 +0000 Subject: netxen: fix coherent dma mask setting Change default dma mask for NX3031 to 39 bit with ability to update it to 64-bit (if firmware indicates support). Old code was restricting it under 4GB (32-bit), sometimes causing failure to allocate descriptor rings on heavily populated system. NX2031 based NICs will still get 32-bit coherent mask. 
Signed-off-by: Dhananjay Phadke Signed-off-by: David S. Miller --- drivers/net/netxen/netxen_nic_main.c | 37 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index 637ac8b89bac..3cd8cfcf627b 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -221,7 +221,7 @@ netxen_napi_disable(struct netxen_adapter *adapter) } } -static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id) +static int nx_set_dma_mask(struct netxen_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; uint64_t mask, cmask; @@ -229,19 +229,17 @@ static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id) adapter->pci_using_dac = 0; mask = DMA_BIT_MASK(32); - /* - * Consistent DMA mask is set to 32 bit because it cannot be set to - * 35 bits. For P3 also leave it at 32 bits for now. Only the rings - * come off this pool. 
- */ cmask = DMA_BIT_MASK(32); + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { #ifndef CONFIG_IA64 - if (revision_id >= NX_P3_B0) - mask = DMA_BIT_MASK(39); - else if (revision_id == NX_P2_C1) mask = DMA_BIT_MASK(35); #endif + } else { + mask = DMA_BIT_MASK(39); + cmask = mask; + } + if (pci_set_dma_mask(pdev, mask) == 0 && pci_set_consistent_dma_mask(pdev, cmask) == 0) { adapter->pci_using_dac = 1; @@ -256,7 +254,7 @@ static int nx_update_dma_mask(struct netxen_adapter *adapter) { int change, shift, err; - uint64_t mask, old_mask; + uint64_t mask, old_mask, old_cmask; struct pci_dev *pdev = adapter->pdev; change = 0; @@ -272,14 +270,29 @@ nx_update_dma_mask(struct netxen_adapter *adapter) if (change) { old_mask = pdev->dma_mask; + old_cmask = pdev->dev.coherent_dma_mask; + mask = (1ULL<<(32+shift)) - 1; err = pci_set_dma_mask(pdev, mask); if (err) - return pci_set_dma_mask(pdev, old_mask); + goto err_out; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + + err = pci_set_consistent_dma_mask(pdev, mask); + if (err) + goto err_out; + } + dev_info(&pdev->dev, "using %d-bit dma mask\n", 32+shift); } return 0; + +err_out: + pci_set_dma_mask(pdev, old_mask); + pci_set_consistent_dma_mask(pdev, old_cmask); + return err; } static void netxen_check_options(struct netxen_adapter *adapter) @@ -1006,7 +1019,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) revision_id = pdev->revision; adapter->ahw.revision_id = revision_id; - err = nx_set_dma_mask(adapter, revision_id); + err = nx_set_dma_mask(adapter); if (err) goto err_out_free_netdev; -- cgit v1.2.2 From df4e7f72f5156ef16a918da8a575ba90ec27ab77 Mon Sep 17 00:00:00 2001 From: Don Fry Date: Fri, 31 Jul 2009 08:40:06 +0000 Subject: pcnet32: remove superfluous NULL pointer check in pcnet32_probe1() Move the debug printk() into the proper place and remove superfluous NULL pointer check in pcnet32_probe1(). 
This takes care of the following entry from Dan's list: drivers/net/pcnet32.c +1889 pcnet32_probe1(298) warning: variable derefenced before check 'pdev' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Acked-by: Don Fry Signed-off-by: David S. Miller --- drivers/net/pcnet32.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'drivers') diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 1c35e1d637a0..b61c97254b3f 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -1611,8 +1611,11 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { a = &pcnet32_dwio; - } else + } else { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "No access methods\n"); goto err_release_region; + } } chip_version = @@ -1852,12 +1855,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) lp->options |= PCNET32_PORT_FD; - if (!a) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "No access methods\n"); - ret = -ENODEV; - goto err_free_consistent; - } lp->a = *a; /* prior to register_netdev, dev->name is not yet correct */ @@ -1973,14 +1970,13 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) return 0; - err_free_ring: +err_free_ring: pcnet32_free_ring(dev); - err_free_consistent: pci_free_consistent(lp->pci_dev, sizeof(*lp->init_block), lp->init_block, lp->init_dma_addr); - err_free_netdev: +err_free_netdev: free_netdev(dev); - err_release_region: +err_release_region: release_region(ioaddr, PCNET32_TOTAL_SIZE); return ret; } -- cgit v1.2.2 From 63097b3ad85788a64c75091bff351ecc850761b2 Mon Sep 17 00:00:00 2001 From: Don Fry Date: Fri, 31 Jul 2009 08:45:29 +0000 Subject: pcnet32: VLB support fixes VLB support has been broken since at least 2004-2005 period as some changes introduced back then 
assumed that ->pci_dev is always valid, lets try to fix it: - remove duplicated SET_NETDEV_DEV() call - call SET_NETDEV_DEV() only for PCI devices - check for ->pci_dev validity in pcnet32_open() [ Alternatively we may consider removing VLB support but there would not be much gain in it since an extra driver code needed for VLB support is minimal and quite simple. ] This takes care of the following entry from Dan's list: drivers/net/pcnet32.c +1889 pcnet32_probe1(298) warning: variable derefenced before check 'pdev' Reported-by: Dan Carpenter Signed-off-by: Bartlomiej Zolnierkiewicz Acked-by: Don Fry Signed-off-by: David S. Miller --- drivers/net/pcnet32.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index b61c97254b3f..16964ec73e67 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -1722,7 +1722,9 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) ret = -ENOMEM; goto err_release_region; } - SET_NETDEV_DEV(dev, &pdev->dev); + + if (pdev) + SET_NETDEV_DEV(dev, &pdev->dev); if (pcnet32_debug & NETIF_MSG_PROBE) printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); @@ -1821,7 +1823,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) spin_lock_init(&lp->lock); - SET_NETDEV_DEV(dev, &pdev->dev); lp->name = chipname; lp->shared_irq = shared; lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ @@ -2085,6 +2086,7 @@ static void pcnet32_free_ring(struct net_device *dev) static int pcnet32_open(struct net_device *dev) { struct pcnet32_private *lp = netdev_priv(dev); + struct pci_dev *pdev = lp->pci_dev; unsigned long ioaddr = dev->base_addr; u16 val; int i; @@ -2145,9 +2147,9 @@ static int pcnet32_open(struct net_device *dev) lp->a.write_csr(ioaddr, 124, val); /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ - if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && - 
(lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || - lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { + if (pdev && pdev->subsystem_vendor == PCI_VENDOR_ID_AT && + (pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || + pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { if (lp->options & PCNET32_PORT_ASEL) { lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; if (netif_msg_link(lp)) -- cgit v1.2.2 From 5973bee46fe66db94fab198979dec87f263fc2a8 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Tue, 21 Jul 2009 00:40:46 +0200 Subject: [WATCHDOG] Fix COH 901 327 watchdog enablement Since the COH 901 327 found in U300 is clocked at 32 kHz we need to wait for the interrupt clearing flag to propagate through hardware in order not to accidentally fire off any interrupts when we enable them. Signed-off-by: Linus Walleij Signed-off-by: Wim Van Sebroeck --- drivers/watchdog/coh901327_wdt.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'drivers') diff --git a/drivers/watchdog/coh901327_wdt.c b/drivers/watchdog/coh901327_wdt.c index fecb307d28e9..aec7cefdef21 100644 --- a/drivers/watchdog/coh901327_wdt.c +++ b/drivers/watchdog/coh901327_wdt.c @@ -18,6 +18,7 @@ #include #include #include +#include #define DRV_NAME "WDOG COH 901 327" @@ -92,6 +93,8 @@ static struct clk *clk; static void coh901327_enable(u16 timeout) { u16 val; + unsigned long freq; + unsigned long delay_ns; clk_enable(clk); /* Restart timer if it is disabled */ @@ -102,6 +105,14 @@ static void coh901327_enable(u16 timeout) /* Acknowledge any pending interrupt so it doesn't just fire off */ writew(U300_WDOG_IER_WILL_BARK_IRQ_ACK_ENABLE, virtbase + U300_WDOG_IER); + /* + * The interrupt is cleared in the 32 kHz clock domain. 
+ * Wait 3 32 kHz cycles for it to take effect + */ + freq = clk_get_rate(clk); + delay_ns = (1000000000 + freq - 1) / freq; /* Freq to ns and round up */ + delay_ns = 3 * delay_ns; /* Wait 3 cycles */ + ndelay(delay_ns); /* Enable the watchdog interrupt */ writew(U300_WDOG_IMR_WILL_BARK_IRQ_ENABLE, virtbase + U300_WDOG_IMR); /* Activate the watchdog timer */ -- cgit v1.2.2 From b564afcfb82fe3e63a7ce05a944eb5e11244d7cb Mon Sep 17 00:00:00 2001 From: Andreas Eversberg Date: Mon, 27 Jul 2009 07:24:04 +0000 Subject: mISDN: Fix handling of receive buffer size in L1oIP The size of receive buffer pointer was used to get size of receive buffer instead of recvbuf_size itself, so only 4/8 bytes could be transferred. This is a regression to 2.6.30 introduced by commit 8c90e11e3543d7de612194a042a148caeaab5f1d mISDN: Use kernel_{send,recv}msg instead of open coding Signed-off-by: Andreas Eversberg Signed-off-by: Karsten Keil Signed-off-by: David S. Miller --- drivers/isdn/mISDN/l1oip_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index 0ebce046ade4..7e5f30dbc0a0 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -731,10 +731,10 @@ l1oip_socket_thread(void *data) while (!signal_pending(current)) { struct kvec iov = { .iov_base = recvbuf, - .iov_len = sizeof(recvbuf), + .iov_len = recvbuf_size, }; recvlen = kernel_recvmsg(socket, &msg, &iov, 1, - sizeof(recvbuf), 0); + recvbuf_size, 0); if (recvlen > 0) { l1oip_socket_parse(hc, &sin_rx, recvbuf, recvlen); } else { -- cgit v1.2.2 From 79896cf42f6a96d7e14f2dc3473443d68d74031d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 2 Aug 2009 14:04:19 -0700 Subject: Make pci_claim_resource() use request_resource() rather than insert_resource() This function has traditionally used "insert_resource()", because before commit cebd78a8c5 ("Fix pci_claim_resource") it used to just 
insert the resource into whatever root resource tree that was indicated by "pcibios_select_root()". So there Matthew fixed it to actually look up the proper parent resource, which means that now it's actively wrong to then traverse the resource tree any more: we already know exactly where the new resource should go. And when we then did commit a76117dfd6 ("x86: Use pci_claim_resource"), which changed the x86 PCI code from the open-coded pr = pci_find_parent_resource(dev, r); if (!pr || request_resource(pr, r) < 0) { to using if (pci_claim_resource(dev, idx) < 0) { that "insert_resource()" now suddenly became a problem, and causes a regression covered by http://bugzilla.kernel.org/show_bug.cgi?id=13891 which this fixes. Reported-and-tested-by: Rafael J. Wysocki Cc: Matthew Wilcox Cc: Andrew Patterson Cc: Linux PCI Signed-off-by: Linus Torvalds --- drivers/pci/setup-res.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index b711fb7181e2..1898c7b47907 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -100,16 +100,16 @@ int pci_claim_resource(struct pci_dev *dev, int resource) { struct resource *res = &dev->resource[resource]; struct resource *root; - char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge"; int err; root = pci_find_parent_resource(dev, res); err = -EINVAL; if (root != NULL) - err = insert_resource(root, res); + err = request_resource(root, res); if (err) { + const char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge"; dev_err(&dev->dev, "BAR %d: %s of %s %pR\n", resource, root ? "address space collision on" : -- cgit v1.2.2 From ac5e7113e74872928844d00085bd47c988f12728 Mon Sep 17 00:00:00 2001 From: Andre Noll Date: Mon, 3 Aug 2009 10:59:47 +1000 Subject: md: Push down data integrity code to personalities. 
This patch replaces md_integrity_check() by two new public functions: md_integrity_register() and md_integrity_add_rdev() which are both personality-independent. md_integrity_register() is called from the ->run and ->hot_remove methods of all personalities that support data integrity. The function iterates over the component devices of the array and determines if all active devices are integrity capable and if their profiles match. If this is the case, the common profile is registered for the mddev via blk_integrity_register(). The second new function, md_integrity_add_rdev() is called from the ->hot_add_disk methods, i.e. whenever a new device is being added to a raid array. If the new device does not support data integrity, or has a profile different from the one already registered, data integrity for the mddev is disabled. For raid0 and linear, only the call to md_integrity_register() from the ->run method is necessary. Signed-off-by: Andre Noll Signed-off-by: NeilBrown --- drivers/md/linear.c | 1 + drivers/md/md.c | 94 ++++++++++++++++++++++++++++++++++---------------- drivers/md/md.h | 2 ++ drivers/md/multipath.c | 5 ++- drivers/md/raid0.c | 1 + drivers/md/raid1.c | 6 ++-- drivers/md/raid10.c | 4 +++ 7 files changed, 80 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5810fa906af0..54c8677f1e59 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev) mddev->queue->unplug_fn = linear_unplug; mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; + md_integrity_register(mddev); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index d4351ff0849f..180949e94a7b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1487,37 +1487,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static void 
md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(mddev_t *mddev) +{ + mdk_rdev_t *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (blk_get_integrity(mddev->gendisk)) + return 0; /* already registered */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + /* + * If at least one rdev is not integrity capable, we can not + * enable data integrity for the md device. + */ + if (!bdev_get_integrity(rdev->bdev)) + return -EINVAL; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. 
+ */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity on %s enabled\n", + mdname(mddev)); + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - struct mdk_personality *pers = mddev->pers; - struct gendisk *disk = mddev->gendisk; struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(disk); + struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); - /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ - if (pers && pers->level >= 4 && pers->level <= 6) + if (!bi_mddev) /* nothing to do */ return; - - /* If rdev is integrity capable, register profile for mddev */ - if (!bi_mddev && bi_rdev) { - if (blk_integrity_register(disk, bi_rdev)) - printk(KERN_ERR "%s: %s Could not register integrity!\n", - __func__, disk->disk_name); - else - printk(KERN_NOTICE "Enabling data integrity on %s\n", - disk->disk_name); + if (rdev->raid_disk < 0) /* skip spares */ return; - } - - /* Check that mddev and rdev have matching profiles */ - if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { - printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, - disk->disk_name, rdev->bdev->bd_disk->disk_name); - printk(KERN_NOTICE "Disabling data integrity on %s\n", - disk->disk_name); - blk_integrity_unregister(disk); - } + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); } +EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { @@ -1591,7 +1630,6 @@ static int 
bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; - md_integrity_check(rdev, mddev); return 0; fail: @@ -4048,10 +4086,6 @@ static int do_md_run(mddev_t * mddev) } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - if (pers->level >= 4 && pers->level <= 6) - /* Cannot support integrity (yet) */ - blk_integrity_unregister(mddev->gendisk); - if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ diff --git a/drivers/md/md.h b/drivers/md/md.h index 9430a110db93..78f03168baf9 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -431,5 +431,7 @@ extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); extern int md_check_no_bitmap(mddev_t *mddev); +extern int md_integrity_register(mddev_t *mddev); +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 237fe3fd235c..7140909f6662 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); err = 0; + md_integrity_add_rdev(rdev, mddev); break; } @@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev) mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; - + md_integrity_register(mddev); return 0; out_free_conf: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 
335f490dcad6..898e2bdfee47 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); + md_integrity_register(mddev); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0569efba0c02..67e794d0097f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } @@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; - + md_integrity_register(mddev); return 0; out_no_mem: diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7298a5e5a183..3d9020cf6f6e 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) break; } + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } @@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev) if (conf->near_copies < mddev->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + md_integrity_register(mddev); return 0; out_free_conf: -- cgit v1.2.2 From 3a981b03f38dc3b8a69b77cbc679e66c1318a44a Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:55 +1000 Subject: md: when a level change reduces the number of devices, remove the 
excess. When an array is changed from RAID6 to RAID5, fewer drives are needed. So any device that is made superfluous by the level conversion must be marked as not-active. For the RAID6->RAID5 conversion, this will be a drive which only has 'Q' blocks on it. Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 180949e94a7b..c194955aecae 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2695,6 +2695,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ssize_t rv = len; struct mdk_personality *pers; void *priv; + mdk_rdev_t *rdev; if (mddev->pers == NULL) { if (len == 0) @@ -2774,6 +2775,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev_suspend(mddev); mddev->pers->stop(mddev); module_put(mddev->pers->owner); + /* Invalidate devices that are now superfluous */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= mddev->raid_disks) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); -- cgit v1.2.2 From 3673f305faf1bc66ead751344f8262ace851ff44 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:56 +1000 Subject: md: avoid array overflow with bad v1.x metadata We trust the 'desc_nr' field in v1.x metadata enough to use it as an index in an array. This isn't really safe. So range-check the value first. 
Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index c194955aecae..249b2896d4ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; -- cgit v1.2.2 From 70471dafe3390243c598a3165dfb86b8b8b3f4fe Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:57 +1000 Subject: md: Handle growth of v1.x metadata correctly. The v1.x metadata does not have a fixed size and can grow when devices are added. If it grows enough to require an extra sector of storage, we need to update the 'sb_size' to match. Without this, md can write out an incomplete superblock with a bad checksum, which will be rejected when trying to re-assemble the array. 
Cc: stable@kernel.org Signed-off-by: NeilBrown --- drivers/md/md.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 249b2896d4ea..52c988b072d0 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1399,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } for (i=0; idev_roles[i] = cpu_to_le16(0xfffe); -- cgit v1.2.2 From e516402c0d4fc02be4af9fa8c18954d4f9deb44e Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:57 +1000 Subject: md/raid5: set reshape_position correctly when reshape starts. As the internal reshape_progress counter is the main driver for reshape, the fact that reshape_position sometimes starts with the wrong value has minimal effect. It is visible in sysfs and that is all. 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 39374230a463..659151e5eda4 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5000,7 +5000,7 @@ static int raid5_start_reshape(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; - mddev->reshape_position = 0; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); -- cgit v1.2.2 From 64bd660b51b2da92e99a5e97349f6558349f11c5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:58 +1000 Subject: md: allow raid5_quiesce to work properly when reshape is happening. The ->quiesce method is not supposed to stop resync/recovery/reshape, just normal IO. But in raid5 we don't have a way to know which stripes are being used for normal IO and which for resync etc, so we need to wait for all stripes to be idle to be sure that all writes have completed. However reshape keeps at least some stripe busy for an extended period of time, so a call to raid5_quiesce can block for several seconds needlessly. So arrange for reshape etc to pause briefly while raid5_quiesce is trying to quiesce the array so that the active_stripes count can drop to zero. 
Signed-off-by: NeilBrown --- drivers/md/raid5.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 659151e5eda4..2dc35b4c20ac 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return 0; } + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -5104,12 +5107,18 @@ static void raid5_quiesce(mddev_t *mddev, int state) case 1: /* stop all writes */ spin_lock_irq(&conf->device_lock); - conf->quiesce = 1; + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + conf->quiesce = 2; wait_event_lock_irq(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); + conf->quiesce = 1; spin_unlock_irq(&conf->device_lock); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ -- cgit v1.2.2 From 449aad3e25358812c43afc60918c5ad3819488e7 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 3 Aug 2009 10:59:58 +1000 Subject: md: Use revalidate_disk to effect changes in size of device. As revalidate_disk calls check_disk_size_change, it will cause any capacity change of a gendisk to be propagated to the blockdev inode. So use that instead of mucking about with locks and i_size_write. Also add a call to revalidate_disk in do_md_run and a few other places where the gendisk capacity is changed. 
Signed-off-by: NeilBrown --- drivers/md/linear.c | 1 + drivers/md/md.c | 28 +++++----------------------- drivers/md/raid1.c | 1 + drivers/md/raid5.c | 12 ++---------- 4 files changed, 9 insertions(+), 33 deletions(-) (limited to 'drivers') diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 54c8677f1e59..5fe39c2a3d2b 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -257,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); call_rcu(&oldconf->rcu, free_conf); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index 52c988b072d0..5b98bea4ff9b 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3741,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) mddev->array_sectors = sectors; set_capacity(mddev->gendisk, mddev->array_sectors); - if (mddev->pers) { - struct block_device *bdev = bdget_disk(mddev->gendisk, 0); - - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (mddev->pers) + revalidate_disk(mddev->gendisk); return len; } @@ -4241,6 +4232,7 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + revalidate_disk(mddev->gendisk); mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); @@ -5139,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - 
mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 67e794d0097f..8726fd7ebce5 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2134,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; + revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2dc35b4c20ac..2b521ee67dfa 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4858,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; + revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5058,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf) */ static void raid5_finish_reshape(mddev_t *mddev) { - struct block_device *bdev; raid5_conf_t *conf = mddev->private; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5067,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + revalidate_disk(mddev->gendisk); } else { int d; mddev->degraded = conf->raid_disks; -- cgit v1.2.2 From eb4ad826419ab5b1260bc1625249114767d36bea Mon Sep 17 00:00:00 2001 From: Yevgeny Petrilin Date: Sun, 2 Aug 2009 20:22:18 -0700 Subject: mlx4_en: 
Fix double pci unmapping. In cases of fragmented skb, with the data pointers being wrapped around the TX buffer, the completion handling code would not forward the data pointer and the first fragment was unmapped several times, while others were not unmapped at all. Signed-off-by: Yevgeny Petrilin Signed-off-by: David S. Miller --- drivers/net/mlx4/en_tx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c index 08c43f2ae72b..5a88b3f57693 100644 --- a/drivers/net/mlx4/en_tx.c +++ b/drivers/net/mlx4/en_tx.c @@ -249,6 +249,7 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, pci_unmap_page(mdev->pdev, (dma_addr_t) be64_to_cpu(data->addr), frag->size, PCI_DMA_TODEVICE); + ++data; } } /* Stamp the freed descriptor */ -- cgit v1.2.2 From 126b67b8d26f6623d199aa59279f2e3243f2144c Mon Sep 17 00:00:00 2001 From: Doug Thompson Date: Mon, 3 Aug 2009 12:37:06 +0200 Subject: amd64_edac: fix ECC checking On the good path of BIOS enabled ECC and no override, the value returned is 1 by omission and thus is deemed failing by the probe-function. Allow proper module initialization by clearing the retval explicitly. 
Signed-off-by: Doug Thompson Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 24964c1d0af9..5fa924d61b10 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -2977,6 +2977,9 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) "ECC is enabled by BIOS, Proceeding " "with EDAC module initialization\n"); + /* Signal good ECC status */ + ret = 0; + /* CLEAR the override, since BIOS controlled it */ ecc_enable_override = 0; } -- cgit v1.2.2 From 202ff1ec8e53d5dd36e1a5bd4b0a7ed7dbd45087 Mon Sep 17 00:00:00 2001 From: Mallikarjuna R Chilakala Date: Mon, 3 Aug 2009 07:20:38 +0000 Subject: ixgbe: Patch to modify 82598 PCIe completion timeout values The default completion timeout values for 82598 should be in the range of 50us to 50ms, however the hardware default for these parts is 500us to 1ms which is less than the 10ms recommended by the pcie spec. To address this we need to increase the value to either 10ms to 250ms for capability version 1 configuration, or 16ms to 55ms for version 2. Signed-off-by: Mallikarjuna R Chilakala Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/ixgbe/ixgbe_82598.c | 67 ++++++++++++++++++++++++++++++++++++++++- drivers/net/ixgbe/ixgbe_type.h | 8 +++++ 2 files changed, 74 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c index b9923047ce11..522c03bc1dad 100644 --- a/drivers/net/ixgbe/ixgbe_82598.c +++ b/drivers/net/ixgbe/ixgbe_82598.c @@ -49,6 +49,51 @@ static s32 ixgbe_setup_copper_link_speed_82598(struct ixgbe_hw *hw, static s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset, u8 *eeprom_data); +/** + * ixgbe_set_pcie_completion_timeout - set pci-e completion timeout + * @hw: pointer to the HW structure + * + * The defaults for 82598 should be in the range of 50us to 50ms, + * however the hardware default for these parts is 500us to 1ms which is less + * than the 10ms recommended by the pci-e spec. To address this we need to + * increase the value to either 10ms to 250ms for capability version 1 config, + * or 16ms to 55ms for version 2. 
+ **/ +void ixgbe_set_pcie_completion_timeout(struct ixgbe_hw *hw) +{ + struct ixgbe_adapter *adapter = hw->back; + u32 gcr = IXGBE_READ_REG(hw, IXGBE_GCR); + u16 pcie_devctl2; + + /* only take action if timeout value is defaulted to 0 */ + if (gcr & IXGBE_GCR_CMPL_TMOUT_MASK) + goto out; + + /* + * if capababilities version is type 1 we can write the + * timeout of 10ms to 250ms through the GCR register + */ + if (!(gcr & IXGBE_GCR_CAP_VER2)) { + gcr |= IXGBE_GCR_CMPL_TMOUT_10ms; + goto out; + } + + /* + * for version 2 capabilities we need to write the config space + * directly in order to set the completion timeout value for + * 16ms to 55ms + */ + pci_read_config_word(adapter->pdev, + IXGBE_PCI_DEVICE_CONTROL2, &pcie_devctl2); + pcie_devctl2 |= IXGBE_PCI_DEVICE_CONTROL2_16ms; + pci_write_config_word(adapter->pdev, + IXGBE_PCI_DEVICE_CONTROL2, pcie_devctl2); +out: + /* disable completion timeout resend */ + gcr &= ~IXGBE_GCR_CMPL_TMOUT_RESEND; + IXGBE_WRITE_REG(hw, IXGBE_GCR, gcr); +} + /** * ixgbe_get_pcie_msix_count_82598 - Gets MSI-X vector count * @hw: pointer to hardware structure @@ -152,6 +197,26 @@ out: return ret_val; } +/** + * ixgbe_start_hw_82598 - Prepare hardware for Tx/Rx + * @hw: pointer to hardware structure + * + * Starts the hardware using the generic start_hw function. 
+ * Then set pcie completion timeout + **/ +s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) +{ + s32 ret_val = 0; + + ret_val = ixgbe_start_hw_generic(hw); + + /* set the completion timeout for interface */ + if (ret_val == 0) + ixgbe_set_pcie_completion_timeout(hw); + + return ret_val; +} + /** * ixgbe_get_link_capabilities_82598 - Determines link capabilities * @hw: pointer to hardware structure @@ -1085,7 +1150,7 @@ out: static struct ixgbe_mac_operations mac_ops_82598 = { .init_hw = &ixgbe_init_hw_generic, .reset_hw = &ixgbe_reset_hw_82598, - .start_hw = &ixgbe_start_hw_generic, + .start_hw = &ixgbe_start_hw_82598, .clear_hw_cntrs = &ixgbe_clear_hw_cntrs_generic, .get_media_type = &ixgbe_get_media_type_82598, .get_supported_physical_layer = &ixgbe_get_supported_physical_layer_82598, diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h index fa87309dc087..be90eb4575f6 100644 --- a/drivers/net/ixgbe/ixgbe_type.h +++ b/drivers/net/ixgbe/ixgbe_type.h @@ -718,6 +718,12 @@ #define IXGBE_ECC_STATUS_82599 0x110E0 #define IXGBE_BAR_CTRL_82599 0x110F4 +/* PCI Express Control */ +#define IXGBE_GCR_CMPL_TMOUT_MASK 0x0000F000 +#define IXGBE_GCR_CMPL_TMOUT_10ms 0x00001000 +#define IXGBE_GCR_CMPL_TMOUT_RESEND 0x00010000 +#define IXGBE_GCR_CAP_VER2 0x00040000 + /* Time Sync Registers */ #define IXGBE_TSYNCRXCTL 0x05188 /* Rx Time Sync Control register - RW */ #define IXGBE_TSYNCTXCTL 0x08C00 /* Tx Time Sync Control register - RW */ @@ -1521,6 +1527,7 @@ /* PCI Bus Info */ #define IXGBE_PCI_LINK_STATUS 0xB2 +#define IXGBE_PCI_DEVICE_CONTROL2 0xC8 #define IXGBE_PCI_LINK_WIDTH 0x3F0 #define IXGBE_PCI_LINK_WIDTH_1 0x10 #define IXGBE_PCI_LINK_WIDTH_2 0x20 @@ -1531,6 +1538,7 @@ #define IXGBE_PCI_LINK_SPEED_5000 0x2 #define IXGBE_PCI_HEADER_TYPE_REGISTER 0x0E #define IXGBE_PCI_HEADER_TYPE_MULTIFUNC 0x80 +#define IXGBE_PCI_DEVICE_CONTROL2_16ms 0x0005 /* Number of 100 microseconds we wait for PCI Express master disable */ #define IXGBE_PCI_MASTER_DISABLE_TIMEOUT 
800 -- cgit v1.2.2 From 371842448c05b42d11a4be1c8e4e81d62ecc7534 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 30 Jul 2009 17:43:48 -0700 Subject: cfg80211: fix regression on beacon world roaming feature A regression was added through patch a4ed90d6: "cfg80211: respect API on orig_flags on channel for beacon hint" We did indeed respect _orig flags but the intention was not clearly stated in the commit log. This patch fixes firmware issues picked up by iwlwifi when we lift passive scan of beaconing restrictions on channels its EEPROM has been configured to always enable. By doing so though we also disallowed beacon hints on devices registering their wiphy with custom world regulatory domains enabled, this happens to be currently ath5k, ath9k and ar9170. The passive scan and beacon restrictions on those devices would never be lifted even if we did find a beacon and the hardware did support such enhancements when world roaming. Since Johannes indicates iwlwifi firmware cannot be changed to allow beacon hinting we set up a flag now to specifically allow drivers to disable beacon hints for devices which cannot use them. We enable the flag on iwlwifi to disable beacon hints and by default enable it for all other drivers. It should be noted beacon hints lift passive scan flags and beacon restrictions when we receive a beacon from an AP on any 5 GHz non-DFS channels, and channels 12-14 on the 2.4 GHz band. We don't bother with channels 1-11 as those channels are allowed world wide. This should fix world roaming for ath5k, ath9k and ar9170, thereby improving scan time when we receive the first beacon from any AP, and also enabling beaconing operation (AP/IBSS/Mesh) on cards which would otherwise not be allowed to do so. Drivers not using custom regulatory stuff (wiphy_apply_custom_regulatory()) were not affected by this as the orig_flags for the channels would have been cleared upon wiphy registration. I tested this with a world roaming ath5k card. 
Cc: Jouni Malinen Signed-off-by: Luis R. Rodriguez Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-core.c | 3 +++ drivers/net/wireless/iwlwifi/iwl3945-base.c | 3 +++ 2 files changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 6ab07165ea28..18b135f510e5 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -1332,6 +1332,9 @@ int iwl_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 2f50ab60bfdf..523843369ca2 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -3968,6 +3968,9 @@ static int iwl3945_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX_3945; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; -- cgit v1.2.2 From dbc1eec485625228895ded6baf6bd01ce2475410 Mon Sep 17 00:00:00 2001 From: Patrick Simmons Date: Sun, 2 Aug 2009 02:46:28 -0600 Subject: zd1211rw: fix unaligned access in zd_mac_rx Fix an unaligned memory access in the zd_mac_rx function of zd1211rw that causes problems on SPARC64. Signed-off-by: Patrick Simmons Signed-off-by: John W. 
Linville --- drivers/net/wireless/zd1211rw/zd_mac.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c index 40b07b988224..3bd3c779fff3 100644 --- a/drivers/net/wireless/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zd1211rw/zd_mac.c @@ -698,7 +698,7 @@ int zd_mac_rx(struct ieee80211_hw *hw, const u8 *buffer, unsigned int length) && !mac->pass_ctrl) return 0; - fc = *(__le16 *)buffer; + fc = get_unaligned((__le16*)buffer); need_padding = ieee80211_is_data_qos(fc) ^ ieee80211_has_a4(fc); skb = dev_alloc_skb(length + (need_padding ? 2 : 0)); -- cgit v1.2.2 From c37457e69ffd7d3c94cbfcc1c39be9a45dd7ad21 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 3 Aug 2009 11:11:45 +0200 Subject: drivers/net/wireless/iwlwifi: introduce missing kfree Move orthogonal error handling code up before a kzalloc, so that it doesn't have to free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Acked-by: Zhu Yi Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwlwifi/iwl-debugfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c index 11e08c068917..ca00cc8ad4c7 100644 --- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c +++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c @@ -308,18 +308,18 @@ static ssize_t iwl_dbgfs_nvm_read(struct file *file, return -ENODATA; } + ptr = priv->eeprom; + if (!ptr) { + IWL_ERR(priv, "Invalid EEPROM/OTP memory\n"); + return -ENOMEM; + } + /* 4 characters for byte 0xYY */ buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) { IWL_ERR(priv, "Can not allocate Buffer\n"); return -ENOMEM; } - - ptr = priv->eeprom; - if (!ptr) { - IWL_ERR(priv, "Invalid EEPROM/OTP memory\n"); - return -ENOMEM; - } pos += scnprintf(buf + pos, buf_size - pos, "NVM Type: %s\n", (priv->nvm_device_type == NVM_DEVICE_TYPE_OTP) ? "OTP" : "EEPROM"); -- cgit v1.2.2 From 9f9857bb5e147b977b9878c46e3dd87c9e8caf50 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 1 Aug 2009 10:55:53 +0200 Subject: drivers/net/wireless: introduce missing kfree Error handling code following a kzalloc should free the allocated data. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,f1,l; position p1,p2; expression *ptr != NULL; @@ x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S <... when != x when != if (...) { <+...x...+> } ( x->f1 = E | (x->f1 == NULL || ...) | f(...,x->f1,...) ) ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: John W. 
Linville --- drivers/net/wireless/iwmc3200wifi/commands.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c index 834a7f544e5d..e2334d123599 100644 --- a/drivers/net/wireless/iwmc3200wifi/commands.c +++ b/drivers/net/wireless/iwmc3200wifi/commands.c @@ -220,6 +220,7 @@ int iwm_store_rxiq_calib_result(struct iwm_priv *iwm) eeprom_rxiq = iwm_eeprom_access(iwm, IWM_EEPROM_CALIB_RXIQ); if (IS_ERR(eeprom_rxiq)) { IWM_ERR(iwm, "Couldn't access EEPROM RX IQ entry\n"); + kfree(rxiq); return PTR_ERR(eeprom_rxiq); } -- cgit v1.2.2 From b929c633b4067be18a335d278a66fd5deef3cabe Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 2 Aug 2009 09:44:12 +0200 Subject: libertas: Read buffer overflow Check whether index is within bounds before testing the element. Signed-off-by: Roel Kluin Signed-off-by: John W. Linville --- drivers/net/wireless/libertas/11d.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/wireless/libertas/11d.c b/drivers/net/wireless/libertas/11d.c index 9a5408e7d94a..5c6968101f0d 100644 --- a/drivers/net/wireless/libertas/11d.c +++ b/drivers/net/wireless/libertas/11d.c @@ -47,7 +47,7 @@ static u8 lbs_region_2_code(u8 *region) { u8 i; - for (i = 0; region[i] && i < COUNTRY_CODE_LEN; i++) + for (i = 0; i < COUNTRY_CODE_LEN && region[i]; i++) region[i] = toupper(region[i]); for (i = 0; i < ARRAY_SIZE(region_code_mapping); i++) { -- cgit v1.2.2 From 99f1b01562b7dcae75b043114f76163fbf84fcab Mon Sep 17 00:00:00 2001 From: Reinette Chatre Date: Mon, 3 Aug 2009 12:10:16 -0700 Subject: iwlagn: do not send key clear commands when rfkill enabled Do all key clearing except sending commands to device when rfkill enabled. When rfkill is enabled, the interface is brought down and will be brought back up correctly after rfkill is enabled again.
Same change is not needed for iwl3945 as it ignores return code when sending key clearing command to device. This fixes http://bugzilla.kernel.org/show_bug.cgi?id=13742 Signed-off-by: Reinette Chatre Tested-by: Frans Pop Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-sta.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'drivers') diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c index 2addf735b193..ffd5c61a7553 100644 --- a/drivers/net/wireless/iwlwifi/iwl-sta.c +++ b/drivers/net/wireless/iwlwifi/iwl-sta.c @@ -566,6 +566,8 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv, unsigned long flags; spin_lock_irqsave(&priv->sta_lock, flags); + IWL_DEBUG_WEP(priv, "Removing default WEP key: idx=%d\n", + keyconf->keyidx); if (!test_and_clear_bit(keyconf->keyidx, &priv->ucode_key_table)) IWL_ERR(priv, "index %d not used in uCode key table.\n", @@ -573,6 +575,11 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv, priv->default_wep_key--; memset(&priv->wep_keys[keyconf->keyidx], 0, sizeof(priv->wep_keys[0])); + if (iwl_is_rfkill(priv)) { + IWL_DEBUG_WEP(priv, "Not sending REPLY_WEPKEY command due to RFKILL.\n"); + spin_unlock_irqrestore(&priv->sta_lock, flags); + return 0; + } ret = iwl_send_static_wepkey_cmd(priv, 1); IWL_DEBUG_WEP(priv, "Remove default WEP key: idx=%d ret=%d\n", keyconf->keyidx, ret); @@ -853,6 +860,11 @@ int iwl_remove_dynamic_key(struct iwl_priv *priv, priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK; priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK; + if (iwl_is_rfkill(priv)) { + IWL_DEBUG_WEP(priv, "Not sending REPLY_ADD_STA command because RFKILL enabled. 
\n"); + spin_unlock_irqrestore(&priv->sta_lock, flags); + return 0; + } ret = iwl_send_add_sta(priv, &priv->stations[sta_id].sta, CMD_ASYNC); spin_unlock_irqrestore(&priv->sta_lock, flags); return ret; -- cgit v1.2.2 From f6caa14aa0b126d4a2933907d1519611b2a8524a Mon Sep 17 00:00:00 2001 From: Mike McCormack Date: Fri, 31 Jul 2009 01:57:42 +0000 Subject: sky2: Avoid transmits during sky2_down() This patch supersedes my previous patch "sky2: Avoid transmitting during sky2_restart". I have reworked the patch to avoid crashes during both sky2_restart() and sky2_set_ringparam(). Without this patch, the sky2 driver can be crashed by doing: # pktgen eth1 & (transmit many packets on eth1) # ethtool -G eth1 tx 510 I am aware you object to storing extra state, but I can't see a way around this. Without remembering that we're restarting, netif_wake_queue() is called in the ISR from sky2_tx_complete(), and netif_tx_lock() is used in sky2_tx_done(). If anybody can see a way around this, please let me know. Signed-off-by: Mike McCormack Signed-off-by: David S. 
Miller --- drivers/net/sky2.c | 14 +++++++++++++- drivers/net/sky2.h | 1 + 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 3550c5dcd93c..0a551d8f5d95 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1488,6 +1488,8 @@ static int sky2_up(struct net_device *dev) sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL); #endif + sky2->restarting = 0; + err = sky2_rx_start(sky2); if (err) goto err_out; @@ -1500,6 +1502,9 @@ static int sky2_up(struct net_device *dev) sky2_set_multicast(dev); + /* wake queue incase we are restarting */ + netif_wake_queue(dev); + if (netif_msg_ifup(sky2)) printk(KERN_INFO PFX "%s: enabling interface\n", dev->name); return 0; @@ -1533,6 +1538,8 @@ static inline int tx_dist(unsigned tail, unsigned head) /* Number of list elements available for next tx */ static inline int tx_avail(const struct sky2_port *sky2) { + if (unlikely(sky2->restarting)) + return 0; return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod); } @@ -1818,6 +1825,10 @@ static int sky2_down(struct net_device *dev) if (netif_msg_ifdown(sky2)) printk(KERN_INFO PFX "%s: disabling interface\n", dev->name); + /* explicitly shut off tx incase we're restarting */ + sky2->restarting = 1; + netif_tx_disable(dev); + /* Force flow control off */ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF); @@ -2359,7 +2370,7 @@ static inline void sky2_tx_done(struct net_device *dev, u16 last) { struct sky2_port *sky2 = netdev_priv(dev); - if (netif_running(dev)) { + if (likely(netif_running(dev) && !sky2->restarting)) { netif_tx_lock(dev); sky2_tx_complete(sky2, last); netif_tx_unlock(dev); @@ -4283,6 +4294,7 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw, spin_lock_init(&sky2->phy_lock); sky2->tx_pending = TX_DEF_PENDING; sky2->rx_pending = RX_DEF_PENDING; + sky2->restarting = 0; hw->dev[port] = dev; diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h index 
b5549c9e5107..4486b066b43f 100644 --- a/drivers/net/sky2.h +++ b/drivers/net/sky2.h @@ -2051,6 +2051,7 @@ struct sky2_port { u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */ u8 rx_csum; u8 wol; + u8 restarting; enum flow_control flow_mode; enum flow_control flow_status; -- cgit v1.2.2 From 7781de74568bddfefbd2d32a934a8c791a2420cd Mon Sep 17 00:00:00 2001 From: Jakob Bornecrantz Date: Mon, 3 Aug 2009 13:43:58 +0100 Subject: drm: Small logic fix in drm_mode_setcrtc Match the logic to the comments in the debug message Signed-off-by: Jakob Bornecrantz Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 8fab7890a363..33be210d6723 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -1461,7 +1461,7 @@ int drm_mode_setcrtc(struct drm_device *dev, void *data, goto out; } - if (crtc_req->count_connectors > 0 && !mode && !fb) { + if (crtc_req->count_connectors > 0 && (!mode || !fb)) { DRM_DEBUG("Count connectors is %d but no mode or fb set\n", crtc_req->count_connectors); ret = -EINVAL; -- cgit v1.2.2 From 4cb72b1727140f131b2df5f37c2e54f5965f98c2 Mon Sep 17 00:00:00 2001 From: Jakob Bornecrantz Date: Mon, 3 Aug 2009 13:43:59 +0100 Subject: drm: Catch stop possible NULL pointer reference This was caught by Weiss. Also added some comments to the fb_changed and mode_changed variables to explain what they do. 
Signed-off-by: Jakob Bornecrantz Tested-by: Thomas White Signed-off-by: Dave Airlie --- drivers/gpu/drm/drm_crtc_helper.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c index 3da9cfa31848..6aaa2cb23365 100644 --- a/drivers/gpu/drm/drm_crtc_helper.c +++ b/drivers/gpu/drm/drm_crtc_helper.c @@ -706,8 +706,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set) struct drm_encoder **save_encoders, *new_encoder; struct drm_framebuffer *old_fb = NULL; bool save_enabled; - bool mode_changed = false; - bool fb_changed = false; + bool mode_changed = false; /* if true do a full mode set */ + bool fb_changed = false; /* if true and !mode_changed just do a flip */ struct drm_connector *connector; int count = 0, ro, fail = 0; struct drm_crtc_helper_funcs *crtc_funcs; @@ -758,6 +758,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set) if (set->crtc->fb == NULL) { DRM_DEBUG("crtc has no fb, full mode set\n"); mode_changed = true; + } else if (set->fb == NULL) { + mode_changed = true; } else if ((set->fb->bits_per_pixel != set->crtc->fb->bits_per_pixel) || set->fb->depth != set->crtc->fb->depth) -- cgit v1.2.2 From 0924d942256ac470c5f7b4ebaf7fe0415fc6fa59 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 3 Aug 2009 12:03:03 +1000 Subject: drm/radeon/kms: fix rv515 VRAM initialisation. This got missed in the VRAM init re-workings. 
Signed-of-by: Dave Airlie --- drivers/gpu/drm/radeon/rv515.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c index 551e608702e4..fd8f3ca716ea 100644 --- a/drivers/gpu/drm/radeon/rv515.c +++ b/drivers/gpu/drm/radeon/rv515.c @@ -370,6 +370,7 @@ void rv515_vram_info(struct radeon_device *rdev) rv515_vram_get_type(rdev); + r100_vram_init_sizes(rdev); /* FIXME: we should enforce default clock in case GPU is not in * default setup */ -- cgit v1.2.2 From 6d0897ba58139523d37e97855ee0fe2d78629da6 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Fri, 31 Jul 2009 10:47:51 +0200 Subject: drm/ttm: Fix a potential comparison of structs. On some architectures the comparison may cause a compilation failure. Original partial fix Signed-off-by: Thomas Hellstrom Signed-off-by: Pekka Paalanen Signed-off-by: Dave Airlie --- drivers/gpu/drm/ttm/ttm_bo_util.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index ce2e6f38ea01..ad4ada07c6cf 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -150,7 +150,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, #ifdef CONFIG_X86 dst = kmap_atomic_prot(d, KM_USER0, prot); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) dst = vmap(&d, 1, 0, prot); else dst = kmap(d); @@ -163,7 +163,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, #ifdef CONFIG_X86 kunmap_atomic(dst, KM_USER0); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(dst); else kunmap(d); @@ -186,7 +186,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, #ifdef CONFIG_X86 src = kmap_atomic_prot(s, KM_USER0, prot); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) src = vmap(&s, 1, 0, 
prot); else src = kmap(s); @@ -199,7 +199,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, #ifdef CONFIG_X86 kunmap_atomic(src, KM_USER0); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(src); else kunmap(s); -- cgit v1.2.2 From de05065ff5d6878523317ff4a0b48a1239f80f73 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Mon, 3 Aug 2009 12:05:34 +1000 Subject: drm/radeon/kms: fix nomodeset. The ordering was wrong to get the nomodeset parameter to work. Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_drv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 3cfcee17dc56..0bd5879a4957 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -318,6 +318,14 @@ static int __init radeon_init(void) driver = &driver_old; driver->num_ioctls = radeon_max_ioctl; #if defined(CONFIG_DRM_RADEON_KMS) +#ifdef CONFIG_VGA_CONSOLE + if (vgacon_text_force() && radeon_modeset == -1) { + DRM_INFO("VGACON disable radeon kernel modesetting.\n"); + driver = &driver_old; + driver->driver_features &= ~DRIVER_MODESET; + radeon_modeset = 0; + } +#endif /* if enabled by default */ if (radeon_modeset == -1) { DRM_INFO("radeon default to kernel modesetting.\n"); @@ -329,17 +337,8 @@ static int __init radeon_init(void) driver->driver_features |= DRIVER_MODESET; driver->num_ioctls = radeon_max_kms_ioctl; } - /* if the vga console setting is enabled still * let modprobe override it */ -#ifdef CONFIG_VGA_CONSOLE - if (vgacon_text_force() && radeon_modeset == -1) { - DRM_INFO("VGACON disable radeon kernel modesetting.\n"); - driver = &driver_old; - driver->driver_features &= ~DRIVER_MODESET; - radeon_modeset = 0; - } -#endif #endif return drm_init(driver); } -- cgit v1.2.2 From c9b7fb54f0a51e587fa09be3a85666b43d36a850 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng 
Date: Wed, 29 Jul 2009 21:28:24 +0800 Subject: drm/radeon/kms: fix memory leak in radeon_driver_load_kms This patch fixes following kmemleak report: unreferenced object 0xffff88022cb53000 (size 4096): comm "work_for_cpu", pid 97, jiffies 4294672345 backtrace: [] create_object+0x19f/0x2a0 [] kmemleak_alloc+0x26/0x4c [] __kmalloc+0x187/0x1b0 [] kzalloc.clone.0+0x13/0x15 [radeon] [] radeon_driver_load_kms+0x26/0xe1 [radeon] [] drm_get_dev+0x37f/0x480 [drm] [] radeon_pci_probe+0x15/0x269 [radeon] [] local_pci_probe+0x17/0x1b [] do_work_for_cpu+0x18/0x2a [] kthread+0x8a/0x92 [] child_rip+0xa/0x20 [] 0xffffffffffffffff Signed-off-by: Xiaotian Feng Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon_kms.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 937a2f1cdb46..3357110e30ce 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -58,6 +58,8 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) if (r) { DRM_ERROR("Failed to initialize radeon, disabling IOCTL\n"); radeon_device_fini(rdev); + kfree(rdev); + dev->dev_private = NULL; return r; } return 0; -- cgit v1.2.2 From fee280d3fd9bc5247bef9f4ab35a4693bfffdcfd Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Mon, 3 Aug 2009 12:39:06 +0200 Subject: drm/ttm: Fix a sync object leak. If there are multiple simultaneous waiters for the same buffer object, a temporary reference to its sync object may be leaked. 
Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- drivers/gpu/drm/ttm/ttm_bo.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 6538d4236989..aa82d5370c38 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1575,6 +1575,10 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, driver->sync_obj_unref(&sync_obj); driver->sync_obj_unref(&tmp_obj); spin_lock(&bo->lock); + } else { + spin_unlock(&bo->lock); + driver->sync_obj_unref(&sync_obj); + spin_lock(&bo->lock); } } return 0; -- cgit v1.2.2 From fa99239cb73dbf419bea9f334b85ba94ac88a532 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 3 Aug 2009 14:20:32 +0200 Subject: drm/radeon: Read buffer overflow Check whether index is within bounds before grabbing the element. Signed-off-by: Roel Kluin Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/r100.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 05a44896dffb..f1ba8ff41130 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -722,13 +722,14 @@ int r100_cs_packet_parse(struct radeon_cs_parser *p, unsigned idx) { struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; - uint32_t header = ib_chunk->kdata[idx]; + uint32_t header; if (idx >= ib_chunk->length_dw) { DRM_ERROR("Can not parse packet at %d after CS end %d !\n", idx, ib_chunk->length_dw); return -EINVAL; } + header = ib_chunk->kdata[idx]; pkt->idx = idx; pkt->type = CP_PACKET_GET_TYPE(header); pkt->count = CP_PACKET_GET_COUNT(header); -- cgit v1.2.2 From c96e7c7a3a79931446ecf9494a8415e4d164ebd8 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Mon, 3 Aug 2009 14:22:53 +0200 Subject: drm/ttm: Read buffer overflow Check whether index is within bounds before grabbing the element. 
Signed-off-by: Roel Kluin Signed-off-by: Dave Airlie --- drivers/gpu/drm/ttm/ttm_bo.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index aa82d5370c38..c2b0d710d10f 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1182,13 +1182,14 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev, int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type) { - struct ttm_mem_type_manager *man = &bdev->man[mem_type]; + struct ttm_mem_type_manager *man; int ret = -EINVAL; if (mem_type >= TTM_NUM_MEM_TYPES) { printk(KERN_ERR TTM_PFX "Illegal memory type %d\n", mem_type); return ret; } + man = &bdev->man[mem_type]; if (!man->has_type) { printk(KERN_ERR TTM_PFX "Trying to take down uninitialized " -- cgit v1.2.2 From 0cb13536c3382004bd9b833565e2af33f26ed1fb Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Mon, 3 Aug 2009 21:10:01 -0700 Subject: 3c59x: Fix build failure with gcc 3.2 Fix the following build failure with gcc 3.2: CC [M] drivers/net/3c59x.o drivers/net/3c59x.c:2726:1: directives may not be used inside a macro argument drivers/net/3c59x.c:2725:59: unterminated argument list invoking macro "pr_err" drivers/net/3c59x.c: In function `dump_tx_ring': drivers/net/3c59x.c:2727: implicit declaration of function `pr_err' drivers/net/3c59x.c:2731: syntax error before ')' token Apparently gcc 3.2 doesn't like #if interleaved with a macro call. Signed-off-by: Jean Delvare Signed-off-by: David S. 
Miller --- drivers/net/3c59x.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'drivers') diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index c34aee91250b..c20416850948 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -2721,13 +2721,15 @@ dump_tx_ring(struct net_device *dev) &vp->tx_ring[vp->dirty_tx % TX_RING_SIZE]); issue_and_wait(dev, DownStall); for (i = 0; i < TX_RING_SIZE; i++) { - pr_err(" %d: @%p length %8.8x status %8.8x\n", i, - &vp->tx_ring[i], + unsigned int length; + #if DO_ZEROCOPY - le32_to_cpu(vp->tx_ring[i].frag[0].length), + length = le32_to_cpu(vp->tx_ring[i].frag[0].length); #else - le32_to_cpu(vp->tx_ring[i].length), + length = le32_to_cpu(vp->tx_ring[i].length); #endif + pr_err(" %d: @%p length %8.8x status %8.8x\n", + i, &vp->tx_ring[i], length, le32_to_cpu(vp->tx_ring[i].status)); } if (!stalled) -- cgit v1.2.2 From c2718348b41a8e7646516d9af8bb0231c6a44374 Mon Sep 17 00:00:00 2001 From: Doug Thompson Date: Tue, 4 Aug 2009 12:02:20 +0200 Subject: amd64_edac: print debug statements only on error Add forgotten return calls for the successful cases. 
Signed-off-by: Doug Thompson Signed-off-by: Borislav Petkov --- drivers/edac/amd64_edac.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 5fa924d61b10..e2a10bcba7a1 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -868,6 +868,8 @@ static void amd64_read_dbam_reg(struct amd64_pvt *pvt) goto err_reg; } + return; + err_reg: debugf0("Error reading F2x%03x.\n", reg); } @@ -2634,6 +2636,8 @@ static void amd64_read_mc_registers(struct amd64_pvt *pvt) amd64_dump_misc_regs(pvt); + return; + err_reg: debugf0("Reading an MC register failed\n"); -- cgit v1.2.2 From 1cef8e41073efe47e809f49670eb461307e52ccc Mon Sep 17 00:00:00 2001 From: Russell King Date: Mon, 27 Jul 2009 11:30:48 +0530 Subject: mfd: twl4030 irq fixes The TWL4030 IRQ handler has a bug which leads to spinlock lock-up. It is calling the 'unmask' function in a process context. The mask/unmask/ack functions are only designed to be called from the IRQ handler code, or the proper API interfaces found in linux/interrupt.h. Also there is no need to have an IRQ chaining mechanism. The right way to handle this is to claim the parent interrupt as a standard interrupt and arrange for handle_twl4030_pih to take care of the rest of the devices.
Mail thread on this issue can be found at: http://marc.info/?l=linux-arm-kernel&m=124629940123396&w=2 Signed-off-by: Russell King Tested-by: Santosh Shilimkar Acked-by: Tony Lindgren Signed-off-by: Samuel Ortiz --- drivers/mfd/twl4030-irq.c | 55 +++++++++++++++++++++-------------------------- 1 file changed, 24 insertions(+), 31 deletions(-) (limited to 'drivers') diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c index bae61b22501c..7d430835655f 100644 --- a/drivers/mfd/twl4030-irq.c +++ b/drivers/mfd/twl4030-irq.c @@ -180,14 +180,9 @@ static struct completion irq_event; static int twl4030_irq_thread(void *data) { long irq = (long)data; - struct irq_desc *desc = irq_to_desc(irq); static unsigned i2c_errors; static const unsigned max_i2c_errors = 100; - if (!desc) { - pr_err("twl4030: Invalid IRQ: %ld\n", irq); - return -EINVAL; - } current->flags |= PF_NOFREEZE; @@ -240,7 +235,7 @@ static int twl4030_irq_thread(void *data) } local_irq_enable(); - desc->chip->unmask(irq); + enable_irq(irq); } return 0; @@ -255,25 +250,13 @@ static int twl4030_irq_thread(void *data) * thread. All we do here is acknowledge and mask the interrupt and wakeup * the kernel thread. */ -static void handle_twl4030_pih(unsigned int irq, struct irq_desc *desc) +static irqreturn_t handle_twl4030_pih(int irq, void *devid) { /* Acknowledge, clear *AND* mask the interrupt... 
*/ - desc->chip->ack(irq); - complete(&irq_event); -} - -static struct task_struct *start_twl4030_irq_thread(long irq) -{ - struct task_struct *thread; - - init_completion(&irq_event); - thread = kthread_run(twl4030_irq_thread, (void *)irq, "twl4030-irq"); - if (!thread) - pr_err("twl4030: could not create irq %ld thread!\n", irq); - - return thread; + disable_irq_nosync(irq); + complete(devid); + return IRQ_HANDLED; } - /*----------------------------------------------------------------------*/ /* @@ -734,18 +717,28 @@ int twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end) } /* install an irq handler to demultiplex the TWL4030 interrupt */ - task = start_twl4030_irq_thread(irq_num); - if (!task) { - pr_err("twl4030: irq thread FAIL\n"); - status = -ESRCH; - goto fail; - } - set_irq_data(irq_num, task); - set_irq_chained_handler(irq_num, handle_twl4030_pih); - return status; + init_completion(&irq_event); + status = request_irq(irq_num, handle_twl4030_pih, IRQF_DISABLED, + "TWL4030-PIH", &irq_event); + if (status < 0) { + pr_err("twl4030: could not claim irq%d: %d\n", irq_num, status); + goto fail_rqirq; + } + + task = kthread_run(twl4030_irq_thread, (void *)irq_num, "twl4030-irq"); + if (IS_ERR(task)) { + pr_err("twl4030: could not create irq %d thread!\n", irq_num); + status = PTR_ERR(task); + goto fail_kthread; + } + return status; +fail_kthread: + free_irq(irq_num, &irq_event); +fail_rqirq: + /* clean up twl4030_sih_setup */ fail: for (i = irq_base; i < irq_end; i++) set_irq_chip_and_handler(i, NULL, NULL); -- cgit v1.2.2 From 26d204afa18f7df177f21bdb3759e0098ca8f7d5 Mon Sep 17 00:00:00 2001 From: "Pallipadi, Venkatesh" Date: Wed, 29 Jul 2009 13:36:10 -0700 Subject: [CPUFREQ] Fix NULL pointer dereference regression in conservative governor Commit ee88415caf736b89500f16e0a545614541a45005 introduced this regression when it removed enable bit in cpu_dbs_info_s. 
That added a possibility of dbs_cpufreq_notifier getting called for a CPU that is not yet managed by conservative governor. That will happen as the transition notifier is set as soon as one CPU switches to conservative governor and other CPUs can get a NULL pointer dereference without the enable bit check. Add the enable bit back again. Reported-by: Lermytte Christophe Signed-off-by: Venkatesh Pallipadi Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq_conservative.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 57490502b21c..bdea7e2f94ba 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -63,6 +63,7 @@ struct cpu_dbs_info_s { unsigned int down_skip; unsigned int requested_freq; int cpu; + unsigned int enable:1; /* * percpu mutex that serializes governor limit change with * do_dbs_timer invocation. We do not want do_dbs_timer to run @@ -141,6 +142,9 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_policy *policy; + if (!this_dbs_info->enable) + return 0; + policy = this_dbs_info->cur_policy; /* @@ -497,6 +501,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; + dbs_info->enable = 1; INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work, delay); @@ -504,6 +509,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { + dbs_info->enable = 0; cancel_delayed_work_sync(&dbs_info->work); } -- cgit v1.2.2 From 42c74b84c64633dd3badbfc2abd2ef1728b64b30 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Mon, 3 Aug 2009 10:58:11 -0400 Subject: [CPUFREQ] Do not set policy for offline cpus Suspend/Resume 
fails on multi socket, multi core systems because the cpufreq code erroneously sets the per_cpu policy_cpu value when a logical cpu is offline. This most notably results in missing sysfs files that are used to set the cpu frequencies of the various cpus. Signed-off-by: Prarit Bhargava Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b90eda8b3440..120d236c0ffb 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -924,6 +924,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) spin_lock_irqsave(&cpufreq_driver_lock, flags); for_each_cpu(j, policy->cpus) { + if (!cpu_online(j)) + continue; per_cpu(cpufreq_cpu_data, j) = policy; per_cpu(policy_cpu, j) = policy->cpu; } -- cgit v1.2.2 From d5194decd0a6f792b2789eebd4ddf022a248f655 Mon Sep 17 00:00:00 2001 From: Thomas Renninger Date: Wed, 29 Jul 2009 11:26:20 +0200 Subject: [CPUFREQ] Fix a kobject reference bug related to managed CPUs The first offline/online cycle is successful, the second not. Doing: echo 0 >cpu1/online echo 1 >cpu1/online echo 0 >cpu1/online The last command will trigger: Jul 22 14:39:50 linux kernel: [ 593.210125] ------------[ cut here ]------------ Jul 22 14:39:50 linux kernel: [ 593.210139] WARNING: at lib/kref.c:43 kref_get+0x23/0x2b() Jul 22 14:39:50 linux kernel: [ 593.210144] Hardware name: To Be Filled By O.E.M. Jul 22 14:39:50 linux kernel: [ 593.210148] Modules linked in: powernow_k8 Jul 22 14:39:50 linux kernel: [ 593.210158] Pid: 378, comm: kondemand/2 Tainted: G W 2.6.31-rc2 #38 Jul 22 14:39:50 linux kernel: [ 593.210163] Call Trace: Jul 22 14:39:50 linux kernel: [ 593.210171] [] ? 
kref_get+0x23/0x2b Jul 22 14:39:50 linux kernel: [ 593.210181] [] warn_slowpath_common+0x77/0xa4 Jul 22 14:39:50 linux kernel: [ 593.210190] [] warn_slowpath_null+0xf/0x11 Jul 22 14:39:50 linux kernel: [ 593.210198] [] kref_get+0x23/0x2b Jul 22 14:39:50 linux kernel: [ 593.210206] [] kobject_get+0x1a/0x22 Jul 22 14:39:50 linux kernel: [ 593.210214] [] cpufreq_cpu_get+0x8a/0xcb Jul 22 14:39:50 linux kernel: [ 593.210222] [] __cpufreq_driver_getavg+0x1d/0x67 Jul 22 14:39:50 linux kernel: [ 593.210231] [] do_dbs_timer+0x158/0x27f Jul 22 14:39:50 linux kernel: [ 593.210240] [] worker_thread+0x200/0x313 ... The output continues on every do_dbs_timer ondemand freq checking poll. This regression was introduced by git commit: 3f4a782b5ce2698b1870b5a7b573cd721d4fce33 The policy is released when the cpufreq device is removed in: __cpufreq_remove_dev(): /* if this isn't the CPU which is the parent of the kobj, we * only need to unlink, put and exit */ Not creating the symlink is not severe at all. As long as: sysfs_remove_link(&sys_dev->kobj, "cpufreq"); handles it gracefully that the symlink did not exist. Possibly no error should be returned at all, because ondemand governor would still provide the same functionality. Userspace in userspace gov case might be confused if the link is missing. Resolves http://bugzilla.kernel.org/show_bug.cgi?id=13903 CC: Mathieu Desnoyers CC: Venkatesh Pallipadi Signed-off-by: Thomas Renninger Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 120d236c0ffb..bd74a0b12176 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -858,6 +858,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) /* Check for existing affected CPUs. * They may not be aware of it due to CPU Hotplug.
+ * cpufreq_cpu_put is called when the device is removed + * in __cpufreq_remove_dev() */ managed_policy = cpufreq_cpu_get(j); if (unlikely(managed_policy)) { @@ -884,7 +886,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = sysfs_create_link(&sys_dev->kobj, &managed_policy->kobj, "cpufreq"); - if (!ret) + if (ret) cpufreq_cpu_put(managed_policy); /* * Success. We only needed to be added to the mask. -- cgit v1.2.2 From 4bc5d34135039566b8d6efa2de7515b2be505da8 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Tue, 4 Aug 2009 14:03:25 -0400 Subject: [CPUFREQ] Make cpufreq suspend code conditional on powerpc. The suspend code runs with interrupts disabled, and the powerpc workaround we do in the cpufreq suspend hook calls the driver's ->get method. powernow-k8's ->get does an smp_call_function_single which needs interrupts enabled. cpufreq's suspend/resume code was added in 42d4dc3f4e1e to work around a hardware problem on ppc powerbooks. If we make all this code conditional on powerpc, we avoid the issue above. Signed-off-by: Dave Jones --- drivers/cpufreq/cpufreq.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index bd74a0b12176..fd69086d08d5 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1248,13 +1248,22 @@ EXPORT_SYMBOL(cpufreq_get); static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) { - int cpu = sysdev->id; int ret = 0; + +#ifdef __powerpc__ + int cpu = sysdev->id; unsigned int cur_freq = 0; struct cpufreq_policy *cpu_policy; dprintk("suspending cpu %u\n", cpu); + /* + * This whole bogosity is here because Powerbooks are made of fail. + * No sane platform should need any of the code below to be run. + * (it's entirely the wrong thing to do, as driver->get may + * reenable interrupts on some architectures).
+ */ + if (!cpu_online(cpu)) return 0; @@ -1313,6 +1322,7 @@ static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) out: cpufreq_cpu_put(cpu_policy); +#endif /* __powerpc__ */ return ret; } @@ -1326,12 +1336,18 @@ out: */ static int cpufreq_resume(struct sys_device *sysdev) { - int cpu = sysdev->id; int ret = 0; + +#ifdef __powerpc__ + int cpu = sysdev->id; struct cpufreq_policy *cpu_policy; dprintk("resuming cpu %u\n", cpu); + /* As with the ->suspend method, all the code below is + * only necessary because Powerbooks suck. + * See commit 42d4dc3f4e1e for jokes. */ + if (!cpu_online(cpu)) return 0; @@ -1395,6 +1411,7 @@ out: schedule_work(&cpu_policy->update); fail: cpufreq_cpu_put(cpu_policy); +#endif /* __powerpc__ */ return ret; } -- cgit v1.2.2 From e0cff5ed27acd355264b210d9622da801a431e19 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 4 Aug 2009 11:46:41 -0700 Subject: igbvf: Allow VF driver to correctly recognize failure to set mac The VF driver was not correctly recognizing that it did not correctly set its mac address. As a result the VF driver was unable to receive network traffic until being unloaded and reloaded. The issue was root caused to the fact that the CTS bit was not taken into account when checking for the request being NAKed. Signed-off-by: Alexander Duyck Signed-off-by: Jeff Kirsher Signed-off-by: David S. 
Miller --- drivers/net/igbvf/vf.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/net/igbvf/vf.c b/drivers/net/igbvf/vf.c index 2a4faf9ade69..a9a61efa964c 100644 --- a/drivers/net/igbvf/vf.c +++ b/drivers/net/igbvf/vf.c @@ -274,6 +274,8 @@ static s32 e1000_set_vfta_vf(struct e1000_hw *hw, u16 vid, bool set) err = mbx->ops.read_posted(hw, msgbuf, 2); + msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS; + /* if nacked the vlan was rejected */ if (!err && (msgbuf[0] == (E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK))) err = -E1000_ERR_MAC_INIT; @@ -317,6 +319,8 @@ static void e1000_rar_set_vf(struct e1000_hw *hw, u8 * addr, u32 index) if (!ret_val) ret_val = mbx->ops.read_posted(hw, msgbuf, 3); + msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS; + /* if nacked the address was rejected, use "perm_addr" */ if (!ret_val && (msgbuf[0] == (E1000_VF_SET_MAC_ADDR | E1000_VT_MSGTYPE_NACK))) -- cgit v1.2.2 From 357eb46d8f275b4e8484541234ea3ba06065e258 Mon Sep 17 00:00:00 2001 From: Hannes Hering Date: Tue, 4 Aug 2009 11:48:39 -0700 Subject: ehea: Fix napi list corruption on ifconfig down This patch fixes the napi list handling when an ehea interface is shut down to avoid corruption of the napi list. Signed-off-by: Hannes Hering Signed-off-by: David S. 
Miller --- drivers/net/ehea/ehea.h | 2 +- drivers/net/ehea/ehea_main.c | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h index 78952f8324e2..fa311a950996 100644 --- a/drivers/net/ehea/ehea.h +++ b/drivers/net/ehea/ehea.h @@ -40,7 +40,7 @@ #include #define DRV_NAME "ehea" -#define DRV_VERSION "EHEA_0101" +#define DRV_VERSION "EHEA_0102" /* eHEA capability flags */ #define DLPAR_PORT_ADD_REM 1 diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index e8d46cc1bec2..977c3d358279 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -1545,6 +1545,9 @@ static int ehea_clean_portres(struct ehea_port *port, struct ehea_port_res *pr) { int ret, i; + if (pr->qp) + netif_napi_del(&pr->napi); + ret = ehea_destroy_qp(pr->qp); if (!ret) { -- cgit v1.2.2 From 18eac1cc100fa2afd5f39085aae6b694e417734b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 10:58:29 -0700 Subject: tty-ldisc: make refcount be atomic_t 'users' count This is pure preparation of changing the ldisc reference counting to be a true refcount that defines the lifetime of the ldisc. But this is a purely syntactic change for now to make the next steps easier. This patch should make no semantic changes at all. But I wanted to make the ldisc refcount be an atomic (I will be touching it without locks soon enough), and I wanted to rename it so that there isn't quite as much confusion between 'ldo->refcount' (ldisk operations refcount) and 'ld->refcount' (ldisc refcount itself) in the same file. So it's now an atomic 'ld->users' count. It still starts at zero, despite having a reference from 'tty->ldisc', but that will change once we turn it into a _real_ refcount. 
Signed-off-by: Linus Torvalds Tested-by: OGAWA Hirofumi Tested-by: Sergey Senozhatsky Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/char/tty_ldisc.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'drivers') diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index acd76b767d4c..fd175e60aad5 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -142,7 +142,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc) /* lock it */ ldops->refcount++; ld->ops = ldops; - ld->refcount = 0; + atomic_set(&ld->users, 0); err = 0; } } @@ -206,7 +206,7 @@ static void tty_ldisc_put(struct tty_ldisc *ld) ldo->refcount--; module_put(ldo->owner); spin_unlock_irqrestore(&tty_ldisc_lock, flags); - WARN_ON(ld->refcount); + WARN_ON(atomic_read(&ld->users)); kfree(ld); } @@ -297,7 +297,7 @@ static int tty_ldisc_try(struct tty_struct *tty) spin_lock_irqsave(&tty_ldisc_lock, flags); ld = tty->ldisc; if (test_bit(TTY_LDISC, &tty->flags)) { - ld->refcount++; + atomic_inc(&ld->users); ret = 1; } spin_unlock_irqrestore(&tty_ldisc_lock, flags); @@ -324,7 +324,7 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { /* wait_event is a macro */ wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - WARN_ON(tty->ldisc->refcount == 0); + WARN_ON(atomic_read(&tty->ldisc->users) == 0); return tty->ldisc; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); @@ -365,11 +365,9 @@ void tty_ldisc_deref(struct tty_ldisc *ld) BUG_ON(ld == NULL); spin_lock_irqsave(&tty_ldisc_lock, flags); - if (ld->refcount == 0) + if (atomic_read(&ld->users) == 0) printk(KERN_ERR "tty_ldisc_deref: no references.\n"); - else - ld->refcount--; - if (ld->refcount == 0) + else if (atomic_dec_and_test(&ld->users)) wake_up(&tty_ldisc_wait); spin_unlock_irqrestore(&tty_ldisc_lock, flags); } @@ -536,10 +534,10 @@ static int tty_ldisc_wait_idle(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty_ldisc_lock, flags); - while 
(tty->ldisc->refcount) { + while (atomic_read(&tty->ldisc->users)) { spin_unlock_irqrestore(&tty_ldisc_lock, flags); if (wait_event_timeout(tty_ldisc_wait, - tty->ldisc->refcount == 0, 5 * HZ) == 0) + atomic_read(&tty->ldisc->users) == 0, 5 * HZ) == 0) return -EBUSY; spin_lock_irqsave(&tty_ldisc_lock, flags); } -- cgit v1.2.2 From 65b770468e98941e45e19780dff9283e663e6b8b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 11:11:19 -0700 Subject: tty-ldisc: turn ldisc user count into a proper refcount By using the user count for the actual lifetime rules, we can get rid of the silly "wait_for_idle" logic, because any busy ldisc will automatically stay around until the last user releases it. This avoids a host of odd issues, and simplifies the code. So now, when the last ldisc reference is dropped, we just release the ldisc operations struct reference, and free the ldisc. It looks obvious enough, and it does work for me, but the counting _could_ be off. It probably isn't (bad counting in the new version would generally imply that the old code did something really bad, like free an ldisc with a non-zero count), but it does need some testing, and preferably somebody looking at it. With this change, both 'tty_ldisc_put()' and 'tty_ldisc_deref()' are just aliases for the new ref-counting 'put_ldisc()'. Both of them decrement the ldisc user count and free it if it goes down to zero. They're identical functions, in other words. But the reason they still exist as separate functions is that one of them was exported (tty_ldisc_deref) and had a stupid name (so I don't want to use it as the main name), and the other one was used in multiple places (and I didn't want to make the patch larger just to rename the users). In addition to the refcounting, I did do some minimal cleanup. 
For example, now "tty_ldisc_try()" actually returns the ldisc it got under the lock, rather than returning true/false and then the caller would look up the ldisc again (now without the protection of the lock). That said, there's tons of dubious use of 'tty->ldisc' without obviously proper locking or refcounting left. I expressly did _not_ want to try to fix it all, keeping the patch minimal. There may or may not be bugs in that kind of code, but they wouldn't be _new_ bugs. That said, even if the bugs aren't new, the timing and lifetime will change. For example, some silly code may depend on the 'tty->ldisc' pointer not changing because they hold a refcount on the 'ldisc'. And that's no longer true - if you hold a ref on the ldisc, the 'ldisc' itself is safe, but tty->ldisc may change. So the proper locking (remains) to hold tty->ldisc_mutex if you expect tty->ldisc to be stable. That's not really a _new_ rule, but it's an example of something that the old code might have unintentionally depended on and hidden bugs. Whatever. The patch _looks_ sensible to me. The only users of ldisc->users are: - get_ldisc() - atomically increment the count - put_ldisc() - atomically decrements the count and releases if zero - tty_ldisc_try_get() - creates the ldisc, and sets the count to 1. The ldisc should then either be released, or be attached to a tty. 
Signed-off-by: Linus Torvalds Tested-by: OGAWA Hirofumi Tested-by: Sergey Senozhatsky Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/char/tty_ldisc.c | 143 +++++++++++++++-------------------------------- 1 file changed, 46 insertions(+), 97 deletions(-) (limited to 'drivers') diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index fd175e60aad5..be55dfcf59ac 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -48,6 +48,34 @@ static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); /* Line disc dispatch table */ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; +static inline struct tty_ldisc *get_ldisc(struct tty_ldisc *ld) +{ + if (ld) + atomic_inc(&ld->users); + return ld; +} + +static inline void put_ldisc(struct tty_ldisc *ld) +{ + if (WARN_ON_ONCE(!ld)) + return; + + /* + * If this is the last user, free the ldisc, and + * release the ldisc ops. + */ + if (atomic_dec_and_test(&ld->users)) { + unsigned long flags; + struct tty_ldisc_ops *ldo = ld->ops; + + kfree(ld); + spin_lock_irqsave(&tty_ldisc_lock, flags); + ldo->refcount--; + module_put(ldo->owner); + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + } +} + /** * tty_register_ldisc - install a line discipline * @disc: ldisc number @@ -142,7 +170,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc) /* lock it */ ldops->refcount++; ld->ops = ldops; - atomic_set(&ld->users, 0); + atomic_set(&ld->users, 1); err = 0; } } @@ -181,35 +209,6 @@ static struct tty_ldisc *tty_ldisc_get(int disc) return ld; } -/** - * tty_ldisc_put - drop ldisc reference - * @ld: ldisc - * - * Drop a reference to a line discipline. Manage refcounts and - * module usage counts. Free the ldisc once the recount hits zero. 
- * - * Locking: - * takes tty_ldisc_lock to guard against ldisc races - */ - -static void tty_ldisc_put(struct tty_ldisc *ld) -{ - unsigned long flags; - int disc = ld->ops->num; - struct tty_ldisc_ops *ldo; - - BUG_ON(disc < N_TTY || disc >= NR_LDISCS); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ldo = tty_ldiscs[disc]; - BUG_ON(ldo->refcount == 0); - ldo->refcount--; - module_put(ldo->owner); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - WARN_ON(atomic_read(&ld->users)); - kfree(ld); -} - static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { return (*pos < NR_LDISCS) ? pos : NULL; @@ -234,7 +233,7 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v) if (IS_ERR(ld)) return 0; seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i); - tty_ldisc_put(ld); + put_ldisc(ld); return 0; } @@ -288,20 +287,17 @@ static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) * Locking: takes tty_ldisc_lock */ -static int tty_ldisc_try(struct tty_struct *tty) +static struct tty_ldisc *tty_ldisc_try(struct tty_struct *tty) { unsigned long flags; struct tty_ldisc *ld; - int ret = 0; spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = tty->ldisc; - if (test_bit(TTY_LDISC, &tty->flags)) { - atomic_inc(&ld->users); - ret = 1; - } + ld = NULL; + if (test_bit(TTY_LDISC, &tty->flags)) + ld = get_ldisc(tty->ldisc); spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ret; + return ld; } /** @@ -322,10 +318,11 @@ static int tty_ldisc_try(struct tty_struct *tty) struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { + struct tty_ldisc *ld; + /* wait_event is a macro */ - wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - WARN_ON(atomic_read(&tty->ldisc->users) == 0); - return tty->ldisc; + wait_event(tty_ldisc_wait, (ld = tty_ldisc_try(tty)) != NULL); + return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); @@ -342,9 +339,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); struct tty_ldisc *tty_ldisc_ref(struct tty_struct 
*tty) { - if (tty_ldisc_try(tty)) - return tty->ldisc; - return NULL; + return tty_ldisc_try(tty); } EXPORT_SYMBOL_GPL(tty_ldisc_ref); @@ -360,19 +355,15 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref); void tty_ldisc_deref(struct tty_ldisc *ld) { - unsigned long flags; - - BUG_ON(ld == NULL); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if (atomic_read(&ld->users) == 0) - printk(KERN_ERR "tty_ldisc_deref: no references.\n"); - else if (atomic_dec_and_test(&ld->users)) - wake_up(&tty_ldisc_wait); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); + put_ldisc(ld); } EXPORT_SYMBOL_GPL(tty_ldisc_deref); +static inline void tty_ldisc_put(struct tty_ldisc *ld) +{ + put_ldisc(ld); +} + /** * tty_ldisc_enable - allow ldisc use * @tty: terminal to activate ldisc on @@ -520,31 +511,6 @@ static int tty_ldisc_halt(struct tty_struct *tty) return cancel_delayed_work(&tty->buf.work); } -/** - * tty_ldisc_wait_idle - wait for the ldisc to become idle - * @tty: tty to wait for - * - * Wait for the line discipline to become idle. The discipline must - * have been halted for this to guarantee it remains idle. - * - * tty_ldisc_lock protects the ref counts currently. 
- */ - -static int tty_ldisc_wait_idle(struct tty_struct *tty) -{ - unsigned long flags; - spin_lock_irqsave(&tty_ldisc_lock, flags); - while (atomic_read(&tty->ldisc->users)) { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - if (wait_event_timeout(tty_ldisc_wait, - atomic_read(&tty->ldisc->users) == 0, 5 * HZ) == 0) - return -EBUSY; - spin_lock_irqsave(&tty_ldisc_lock, flags); - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return 0; -} - /** * tty_set_ldisc - set line discipline * @tty: the terminal to set @@ -640,14 +606,6 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) flush_scheduled_work(); - /* Let any existing reference holders finish */ - retval = tty_ldisc_wait_idle(tty); - if (retval < 0) { - clear_bit(TTY_LDISC_CHANGING, &tty->flags); - tty_ldisc_put(new_ldisc); - return retval; - } - mutex_lock(&tty->ldisc_mutex); if (test_bit(TTY_HUPPED, &tty->flags)) { /* We were raced by the hangup method. It will have stomped @@ -793,7 +751,6 @@ void tty_ldisc_hangup(struct tty_struct *tty) if (tty->ldisc) { /* Not yet closed */ /* Switch back to N_TTY */ tty_ldisc_halt(tty); - tty_ldisc_wait_idle(tty); tty_ldisc_reinit(tty); /* At this point we have a closed ldisc and we want to reopen it. We could defer this to the next open but @@ -858,14 +815,6 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) tty_ldisc_halt(tty); flush_scheduled_work(); - /* - * Wait for any short term users (we know they are just driver - * side waiters as the file is closing so user count on the file - * side is zero. - */ - - tty_ldisc_wait_idle(tty); - mutex_lock(&tty->ldisc_mutex); /* * Now kill off the ldisc -- cgit v1.2.2 From cbe9352fa08f90aa03b4dbf1bbabfc95d196e562 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 14:54:56 -0700 Subject: tty-ldisc: be more careful in 'put_ldisc' locking Use 'atomic_dec_and_lock()' to make sure that we always hold the tty_ldisc_lock when the ldisc count goes to zero. 
That way we can never race against 'tty_ldisc_try()' increasing the count again. Reported-by: OGAWA Hirofumi Signed-off-by: Linus Torvalds Tested-by: Sergey Senozhatsky Signed-off-by: Greg Kroah-Hartman --- drivers/char/tty_ldisc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers') diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index be55dfcf59ac..1733d3439ad2 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -55,25 +55,32 @@ static inline struct tty_ldisc *get_ldisc(struct tty_ldisc *ld) return ld; } -static inline void put_ldisc(struct tty_ldisc *ld) +static void put_ldisc(struct tty_ldisc *ld) { + unsigned long flags; + if (WARN_ON_ONCE(!ld)) return; /* * If this is the last user, free the ldisc, and * release the ldisc ops. + * + * We really want an "atomic_dec_and_lock_irqsave()", + * but we don't have it, so this does it by hand. */ - if (atomic_dec_and_test(&ld->users)) { - unsigned long flags; + local_irq_save(flags); + if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { struct tty_ldisc_ops *ldo = ld->ops; - kfree(ld); - spin_lock_irqsave(&tty_ldisc_lock, flags); ldo->refcount--; module_put(ldo->owner); spin_unlock_irqrestore(&tty_ldisc_lock, flags); + + kfree(ld); + return; } + local_irq_restore(flags); } /** -- cgit v1.2.2