aboutsummaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorRafael J. Wysocki <rjw@sisk.pl>2009-08-05 17:56:54 -0400
committerRafael J. Wysocki <rjw@sisk.pl>2009-08-05 17:56:54 -0400
commitc00aafcd4977769e8728292302ddbbb8b1082fab (patch)
tree5766bcfbfd7b24816b54298b8ef34054f8cf0fae /drivers
parent2e6713c7662cc5ebc7346b033c404cb2f708fd51 (diff)
parent90bc1a658a53f8832ee799685703977a450e5af9 (diff)
Merge branch 'master' into for-linus
Diffstat (limited to 'drivers')
-rw-r--r--drivers/acpi/acpi_memhotplug.c34
-rw-r--r--drivers/acpi/acpica/acobject.h1
-rw-r--r--drivers/acpi/acpica/dsopcode.c24
-rw-r--r--drivers/acpi/acpica/exfldio.c6
-rw-r--r--drivers/acpi/osl.c25
-rw-r--r--drivers/acpi/system.c2
-rw-r--r--drivers/block/mg_disk.c101
-rw-r--r--drivers/char/agp/parisc-agp.c2
-rw-r--r--drivers/char/tty_ldisc.c152
-rw-r--r--drivers/cpufreq/cpufreq.c27
-rw-r--r--drivers/cpufreq/cpufreq_conservative.c6
-rw-r--r--drivers/dma/Kconfig12
-rw-r--r--drivers/dma/Makefile1
-rw-r--r--drivers/dma/at_hdmac.c1213
-rw-r--r--drivers/dma/at_hdmac_regs.h353
-rw-r--r--drivers/dma/dmatest.c21
-rw-r--r--drivers/dma/fsldma.c17
-rw-r--r--drivers/dma/fsldma.h1
-rw-r--r--drivers/dma/mv_xor.c2
-rw-r--r--drivers/edac/amd64_edac.c7
-rw-r--r--drivers/gpu/drm/drm_crtc.c2
-rw-r--r--drivers/gpu/drm/drm_crtc_helper.c6
-rw-r--r--drivers/gpu/drm/radeon/r100.c3
-rw-r--r--drivers/gpu/drm/radeon/radeon_drv.c17
-rw-r--r--drivers/gpu/drm/radeon/radeon_kms.c2
-rw-r--r--drivers/gpu/drm/radeon/rv515.c1
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo.c7
-rw-r--r--drivers/gpu/drm/ttm/ttm_bo_util.c8
-rw-r--r--drivers/input/serio/hp_sdc_mlc.c2
-rw-r--r--drivers/isdn/mISDN/l1oip_core.c2
-rw-r--r--drivers/lguest/core.c119
-rw-r--r--drivers/lguest/hypercalls.c145
-rw-r--r--drivers/lguest/interrupts_and_traps.c288
-rw-r--r--drivers/lguest/lg.h32
-rw-r--r--drivers/lguest/lguest_device.c160
-rw-r--r--drivers/lguest/lguest_user.c232
-rw-r--r--drivers/lguest/page_tables.c489
-rw-r--r--drivers/lguest/segments.c106
-rw-r--r--drivers/lguest/x86/core.c374
-rw-r--r--drivers/lguest/x86/switcher_32.S22
-rw-r--r--drivers/md/linear.c2
-rw-r--r--drivers/md/md.c144
-rw-r--r--drivers/md/md.h2
-rw-r--r--drivers/md/multipath.c5
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c7
-rw-r--r--drivers/md/raid10.c4
-rw-r--r--drivers/md/raid5.c51
-rw-r--r--drivers/mfd/twl4030-irq.c55
-rw-r--r--drivers/misc/cb710/sgbuf2.c4
-rw-r--r--drivers/mmc/host/cb710-mmc.c6
-rw-r--r--drivers/mmc/host/imxmmc.c2
-rw-r--r--drivers/mmc/host/sdhci.c10
-rw-r--r--drivers/net/3c515.c4
-rw-r--r--drivers/net/3c59x.c10
-rw-r--r--drivers/net/eexpress.c6
-rw-r--r--drivers/net/ehea/ehea.h2
-rw-r--r--drivers/net/ehea/ehea_main.c3
-rw-r--r--drivers/net/gianfar_ethtool.c10
-rw-r--r--drivers/net/igbvf/vf.c4
-rw-r--r--drivers/net/ixgbe/ixgbe.h2
-rw-r--r--drivers/net/ixgbe/ixgbe_82598.c67
-rw-r--r--drivers/net/ixgbe/ixgbe_ethtool.c11
-rw-r--r--drivers/net/ixgbe/ixgbe_main.c25
-rw-r--r--drivers/net/ixgbe/ixgbe_type.h8
-rw-r--r--drivers/net/mlx4/en_tx.c1
-rw-r--r--drivers/net/netxen/netxen_nic_main.c37
-rw-r--r--drivers/net/pcnet32.c30
-rw-r--r--drivers/net/ppp_generic.c34
-rw-r--r--drivers/net/pppoe.c1
-rw-r--r--drivers/net/pppol2tp.c1
-rw-r--r--drivers/net/s6gmac.c2
-rw-r--r--drivers/net/sky2.c14
-rw-r--r--drivers/net/sky2.h1
-rw-r--r--drivers/net/tulip/de4x5.c6
-rw-r--r--drivers/net/wireless/airo.c13
-rw-r--r--drivers/net/wireless/ath/ath9k/eeprom.c4
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-3945.h2
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-core.c3
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-debugfs.c12
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-dev.h6
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-sta.c12
-rw-r--r--drivers/net/wireless/iwlwifi/iwl-tx.c14
-rw-r--r--drivers/net/wireless/iwlwifi/iwl3945-base.c7
-rw-r--r--drivers/net/wireless/iwmc3200wifi/commands.c1
-rw-r--r--drivers/net/wireless/iwmc3200wifi/netdev.c6
-rw-r--r--drivers/net/wireless/libertas/11d.c2
-rw-r--r--drivers/net/wireless/libertas/assoc.c18
-rw-r--r--drivers/net/wireless/libertas/scan.c3
-rw-r--r--drivers/net/wireless/zd1211rw/zd_mac.c2
-rw-r--r--drivers/parisc/ccio-dma.c1
-rw-r--r--drivers/parisc/dino.c2
-rw-r--r--drivers/parisc/eisa_eeprom.c2
-rw-r--r--drivers/parisc/hppb.c9
-rw-r--r--drivers/parisc/lba_pci.c2
-rw-r--r--drivers/parisc/pdc_stable.c2
-rw-r--r--drivers/pci/setup-res.c4
-rw-r--r--drivers/platform/x86/Kconfig25
-rw-r--r--drivers/platform/x86/eeepc-laptop.c9
-rw-r--r--drivers/platform/x86/hp-wmi.c12
-rw-r--r--drivers/platform/x86/thinkpad_acpi.c390
-rw-r--r--drivers/power/Kconfig7
-rw-r--r--drivers/power/Makefile1
-rw-r--r--drivers/power/ds2782_battery.c330
-rw-r--r--drivers/power/olpc_battery.c26
-rw-r--r--drivers/s390/scsi/zfcp_erp.c68
-rw-r--r--drivers/s390/scsi/zfcp_fc.c8
-rw-r--r--drivers/s390/scsi/zfcp_fsf.c56
-rw-r--r--drivers/s390/scsi/zfcp_scsi.c25
-rw-r--r--drivers/s390/scsi/zfcp_sysfs.c7
-rw-r--r--drivers/scsi/libfc/fc_exch.c23
-rw-r--r--drivers/scsi/libiscsi.c4
-rw-r--r--drivers/scsi/libsas/sas_expander.c147
-rw-r--r--drivers/scsi/libsas/sas_port.c19
-rw-r--r--drivers/scsi/qla4xxx/ql4_dbg.c15
-rw-r--r--drivers/scsi/qla4xxx/ql4_def.h9
-rw-r--r--drivers/scsi/qla4xxx/ql4_fw.h7
-rw-r--r--drivers/scsi/qla4xxx/ql4_iocb.c133
-rw-r--r--drivers/scsi/qla4xxx/ql4_isr.c145
-rw-r--r--drivers/scsi/qla4xxx/ql4_mbx.c10
-rw-r--r--drivers/scsi/qla4xxx/ql4_os.c40
-rw-r--r--drivers/scsi/qla4xxx/ql4_version.h2
-rw-r--r--drivers/scsi/scsi_transport_iscsi.c4
-rw-r--r--drivers/scsi/sd.c20
-rw-r--r--drivers/serial/cpm_uart/cpm_uart_cpm2.c2
-rw-r--r--drivers/video/console/sticore.c9
-rw-r--r--drivers/virtio/virtio_pci.c240
-rw-r--r--drivers/watchdog/coh901327_wdt.c11
128 files changed, 4677 insertions, 1850 deletions
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c
index 7a0f4aa4fa1e..9a62224cc278 100644
--- a/drivers/acpi/acpi_memhotplug.c
+++ b/drivers/acpi/acpi_memhotplug.c
@@ -38,6 +38,9 @@
38 38
39#define _COMPONENT ACPI_MEMORY_DEVICE_COMPONENT 39#define _COMPONENT ACPI_MEMORY_DEVICE_COMPONENT
40 40
41#undef PREFIX
42#define PREFIX "ACPI:memory_hp:"
43
41ACPI_MODULE_NAME("acpi_memhotplug"); 44ACPI_MODULE_NAME("acpi_memhotplug");
42MODULE_AUTHOR("Naveen B S <naveen.b.s@intel.com>"); 45MODULE_AUTHOR("Naveen B S <naveen.b.s@intel.com>");
43MODULE_DESCRIPTION("Hotplug Mem Driver"); 46MODULE_DESCRIPTION("Hotplug Mem Driver");
@@ -153,6 +156,7 @@ acpi_memory_get_device(acpi_handle handle,
153 acpi_handle phandle; 156 acpi_handle phandle;
154 struct acpi_device *device = NULL; 157 struct acpi_device *device = NULL;
155 struct acpi_device *pdevice = NULL; 158 struct acpi_device *pdevice = NULL;
159 int result;
156 160
157 161
158 if (!acpi_bus_get_device(handle, &device) && device) 162 if (!acpi_bus_get_device(handle, &device) && device)
@@ -165,9 +169,9 @@ acpi_memory_get_device(acpi_handle handle,
165 } 169 }
166 170
167 /* Get the parent device */ 171 /* Get the parent device */
168 status = acpi_bus_get_device(phandle, &pdevice); 172 result = acpi_bus_get_device(phandle, &pdevice);
169 if (ACPI_FAILURE(status)) { 173 if (result) {
170 ACPI_EXCEPTION((AE_INFO, status, "Cannot get acpi bus device")); 174 printk(KERN_WARNING PREFIX "Cannot get acpi bus device");
171 return -EINVAL; 175 return -EINVAL;
172 } 176 }
173 177
@@ -175,9 +179,9 @@ acpi_memory_get_device(acpi_handle handle,
175 * Now add the notified device. This creates the acpi_device 179 * Now add the notified device. This creates the acpi_device
176 * and invokes .add function 180 * and invokes .add function
177 */ 181 */
178 status = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE); 182 result = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE);
179 if (ACPI_FAILURE(status)) { 183 if (result) {
180 ACPI_EXCEPTION((AE_INFO, status, "Cannot add acpi bus")); 184 printk(KERN_WARNING PREFIX "Cannot add acpi bus");
181 return -EINVAL; 185 return -EINVAL;
182 } 186 }
183 187
@@ -238,7 +242,12 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
238 num_enabled++; 242 num_enabled++;
239 continue; 243 continue;
240 } 244 }
241 245 /*
246 * If the memory block size is zero, please ignore it.
247 * Don't try to do the following memory hotplug flowchart.
248 */
249 if (!info->length)
250 continue;
242 if (node < 0) 251 if (node < 0)
243 node = memory_add_physaddr_to_nid(info->start_addr); 252 node = memory_add_physaddr_to_nid(info->start_addr);
244 253
@@ -253,8 +262,15 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device)
253 mem_device->state = MEMORY_INVALID_STATE; 262 mem_device->state = MEMORY_INVALID_STATE;
254 return -EINVAL; 263 return -EINVAL;
255 } 264 }
256 265 /*
257 return result; 266 * Sometimes the memory device will contain several memory blocks.
267 * When one memory block is hot-added to the system memory, it will
268 * be regarded as a success.
269 * Otherwise if the last memory block can't be hot-added to the system
270 * memory, it will be failure and the memory device can't be bound with
271 * driver.
272 */
273 return 0;
258} 274}
259 275
260static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device) 276static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device)
diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h
index 544dcf834922..eb6f038b03d9 100644
--- a/drivers/acpi/acpica/acobject.h
+++ b/drivers/acpi/acpica/acobject.h
@@ -97,6 +97,7 @@
97#define AOPOBJ_OBJECT_INITIALIZED 0x08 97#define AOPOBJ_OBJECT_INITIALIZED 0x08
98#define AOPOBJ_SETUP_COMPLETE 0x10 98#define AOPOBJ_SETUP_COMPLETE 0x10
99#define AOPOBJ_SINGLE_DATUM 0x20 99#define AOPOBJ_SINGLE_DATUM 0x20
100#define AOPOBJ_INVALID 0x40 /* Used if host OS won't allow an op_region address */
100 101
101/****************************************************************************** 102/******************************************************************************
102 * 103 *
diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c
index 584d766e6f12..b79978f7bc71 100644
--- a/drivers/acpi/acpica/dsopcode.c
+++ b/drivers/acpi/acpica/dsopcode.c
@@ -397,6 +397,30 @@ acpi_status acpi_ds_get_region_arguments(union acpi_operand_object *obj_desc)
397 status = acpi_ds_execute_arguments(node, acpi_ns_get_parent_node(node), 397 status = acpi_ds_execute_arguments(node, acpi_ns_get_parent_node(node),
398 extra_desc->extra.aml_length, 398 extra_desc->extra.aml_length,
399 extra_desc->extra.aml_start); 399 extra_desc->extra.aml_start);
400 if (ACPI_FAILURE(status)) {
401 return_ACPI_STATUS(status);
402 }
403
404 /* Validate the region address/length via the host OS */
405
406 status = acpi_os_validate_address(obj_desc->region.space_id,
407 obj_desc->region.address,
408 (acpi_size) obj_desc->region.length,
409 acpi_ut_get_node_name(node));
410
411 if (ACPI_FAILURE(status)) {
412 /*
413 * Invalid address/length. We will emit an error message and mark
414 * the region as invalid, so that it will cause an additional error if
415 * it is ever used. Then return AE_OK.
416 */
417 ACPI_EXCEPTION((AE_INFO, status,
418 "During address validation of OpRegion [%4.4s]",
419 node->name.ascii));
420 obj_desc->common.flags |= AOPOBJ_INVALID;
421 status = AE_OK;
422 }
423
400 return_ACPI_STATUS(status); 424 return_ACPI_STATUS(status);
401} 425}
402 426
diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c
index d4075b821021..6687be167f5f 100644
--- a/drivers/acpi/acpica/exfldio.c
+++ b/drivers/acpi/acpica/exfldio.c
@@ -113,6 +113,12 @@ acpi_ex_setup_region(union acpi_operand_object *obj_desc,
113 } 113 }
114 } 114 }
115 115
116 /* Exit if Address/Length have been disallowed by the host OS */
117
118 if (rgn_desc->common.flags & AOPOBJ_INVALID) {
119 return_ACPI_STATUS(AE_AML_ILLEGAL_ADDRESS);
120 }
121
116 /* 122 /*
117 * Exit now for SMBus address space, it has a non-linear address space 123 * Exit now for SMBus address space, it has a non-linear address space
118 * and the request cannot be directly validated 124 * and the request cannot be directly validated
diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c
index 71670719d61a..5691f165a952 100644
--- a/drivers/acpi/osl.c
+++ b/drivers/acpi/osl.c
@@ -189,11 +189,36 @@ acpi_status __init acpi_os_initialize(void)
189 return AE_OK; 189 return AE_OK;
190} 190}
191 191
192static void bind_to_cpu0(struct work_struct *work)
193{
194 set_cpus_allowed(current, cpumask_of_cpu(0));
195 kfree(work);
196}
197
198static void bind_workqueue(struct workqueue_struct *wq)
199{
200 struct work_struct *work;
201
202 work = kzalloc(sizeof(struct work_struct), GFP_KERNEL);
203 INIT_WORK(work, bind_to_cpu0);
204 queue_work(wq, work);
205}
206
192acpi_status acpi_os_initialize1(void) 207acpi_status acpi_os_initialize1(void)
193{ 208{
209 /*
210 * On some machines, a software-initiated SMI causes corruption unless
211 * the SMI runs on CPU 0. An SMI can be initiated by any AML, but
212 * typically it's done in GPE-related methods that are run via
213 * workqueues, so we can avoid the known corruption cases by binding
214 * the workqueues to CPU 0.
215 */
194 kacpid_wq = create_singlethread_workqueue("kacpid"); 216 kacpid_wq = create_singlethread_workqueue("kacpid");
217 bind_workqueue(kacpid_wq);
195 kacpi_notify_wq = create_singlethread_workqueue("kacpi_notify"); 218 kacpi_notify_wq = create_singlethread_workqueue("kacpi_notify");
219 bind_workqueue(kacpi_notify_wq);
196 kacpi_hotplug_wq = create_singlethread_workqueue("kacpi_hotplug"); 220 kacpi_hotplug_wq = create_singlethread_workqueue("kacpi_hotplug");
221 bind_workqueue(kacpi_hotplug_wq);
197 BUG_ON(!kacpid_wq); 222 BUG_ON(!kacpid_wq);
198 BUG_ON(!kacpi_notify_wq); 223 BUG_ON(!kacpi_notify_wq);
199 BUG_ON(!kacpi_hotplug_wq); 224 BUG_ON(!kacpi_hotplug_wq);
diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c
index 0944daec064f..9c61ab2177cf 100644
--- a/drivers/acpi/system.c
+++ b/drivers/acpi/system.c
@@ -121,7 +121,7 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr,
121 table_attr->attr.size = 0; 121 table_attr->attr.size = 0;
122 table_attr->attr.read = acpi_table_show; 122 table_attr->attr.read = acpi_table_show;
123 table_attr->attr.attr.name = table_attr->name; 123 table_attr->attr.attr.name = table_attr->name;
124 table_attr->attr.attr.mode = 0444; 124 table_attr->attr.attr.mode = 0400;
125 125
126 return; 126 return;
127} 127}
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index f703f5478246..6d7fbaa92248 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -36,7 +36,6 @@
36 36
37/* Register offsets */ 37/* Register offsets */
38#define MG_BUFF_OFFSET 0x8000 38#define MG_BUFF_OFFSET 0x8000
39#define MG_STORAGE_BUFFER_SIZE 0x200
40#define MG_REG_OFFSET 0xC000 39#define MG_REG_OFFSET 0xC000
41#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ 40#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */
42#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ 41#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */
@@ -219,6 +218,16 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
219 host->error = MG_ERR_NONE; 218 host->error = MG_ERR_NONE;
220 expire = jiffies + msecs_to_jiffies(msec); 219 expire = jiffies + msecs_to_jiffies(msec);
221 220
221 /* These 2 times dummy status read prevents reading invalid
222 * status. A very little time (3 times of mflash operating clk)
223 * is required for busy bit is set. Use dummy read instead of
224 * busy wait, because mflash's PLL is machine dependent.
225 */
226 if (prv_data->use_polling) {
227 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
228 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
229 }
230
222 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); 231 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
223 232
224 do { 233 do {
@@ -245,8 +254,6 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec)
245 mg_dump_status("not ready", status, host); 254 mg_dump_status("not ready", status, host);
246 return MG_ERR_INV_STAT; 255 return MG_ERR_INV_STAT;
247 } 256 }
248 if (prv_data->use_polling)
249 msleep(1);
250 257
251 status = inb((unsigned long)host->dev_base + MG_REG_STATUS); 258 status = inb((unsigned long)host->dev_base + MG_REG_STATUS);
252 } while (time_before(cur_jiffies, expire)); 259 } while (time_before(cur_jiffies, expire));
@@ -469,9 +476,18 @@ static unsigned int mg_out(struct mg_host *host,
469 return MG_ERR_NONE; 476 return MG_ERR_NONE;
470} 477}
471 478
479static void mg_read_one(struct mg_host *host, struct request *req)
480{
481 u16 *buff = (u16 *)req->buffer;
482 u32 i;
483
484 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
485 *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
486 (i << 1));
487}
488
472static void mg_read(struct request *req) 489static void mg_read(struct request *req)
473{ 490{
474 u32 j;
475 struct mg_host *host = req->rq_disk->private_data; 491 struct mg_host *host = req->rq_disk->private_data;
476 492
477 if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), 493 if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req),
@@ -482,49 +498,65 @@ static void mg_read(struct request *req)
482 blk_rq_sectors(req), blk_rq_pos(req), req->buffer); 498 blk_rq_sectors(req), blk_rq_pos(req), req->buffer);
483 499
484 do { 500 do {
485 u16 *buff = (u16 *)req->buffer;
486
487 if (mg_wait(host, ATA_DRQ, 501 if (mg_wait(host, ATA_DRQ,
488 MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { 502 MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) {
489 mg_bad_rw_intr(host); 503 mg_bad_rw_intr(host);
490 return; 504 return;
491 } 505 }
492 for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) 506
493 *buff++ = inw((unsigned long)host->dev_base + 507 mg_read_one(host, req);
494 MG_BUFF_OFFSET + (j << 1));
495 508
496 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + 509 outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base +
497 MG_REG_COMMAND); 510 MG_REG_COMMAND);
498 } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); 511 } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
499} 512}
500 513
514static void mg_write_one(struct mg_host *host, struct request *req)
515{
516 u16 *buff = (u16 *)req->buffer;
517 u32 i;
518
519 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
520 outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET +
521 (i << 1));
522}
523
501static void mg_write(struct request *req) 524static void mg_write(struct request *req)
502{ 525{
503 u32 j;
504 struct mg_host *host = req->rq_disk->private_data; 526 struct mg_host *host = req->rq_disk->private_data;
527 unsigned int rem = blk_rq_sectors(req);
505 528
506 if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), 529 if (mg_out(host, blk_rq_pos(req), rem,
507 MG_CMD_WR, NULL) != MG_ERR_NONE) { 530 MG_CMD_WR, NULL) != MG_ERR_NONE) {
508 mg_bad_rw_intr(host); 531 mg_bad_rw_intr(host);
509 return; 532 return;
510 } 533 }
511 534
512 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", 535 MG_DBG("requested %d sects (from %ld), buffer=0x%p\n",
513 blk_rq_sectors(req), blk_rq_pos(req), req->buffer); 536 rem, blk_rq_pos(req), req->buffer);
537
538 if (mg_wait(host, ATA_DRQ,
539 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
540 mg_bad_rw_intr(host);
541 return;
542 }
514 543
515 do { 544 do {
516 u16 *buff = (u16 *)req->buffer; 545 mg_write_one(host, req);
517 546
518 if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { 547 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
548 MG_REG_COMMAND);
549
550 rem--;
551 if (rem > 1 && mg_wait(host, ATA_DRQ,
552 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
553 mg_bad_rw_intr(host);
554 return;
555 } else if (mg_wait(host, MG_STAT_READY,
556 MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) {
519 mg_bad_rw_intr(host); 557 mg_bad_rw_intr(host);
520 return; 558 return;
521 } 559 }
522 for (j = 0; j < MG_SECTOR_SIZE >> 1; j++)
523 outw(*buff++, (unsigned long)host->dev_base +
524 MG_BUFF_OFFSET + (j << 1));
525
526 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
527 MG_REG_COMMAND);
528 } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); 560 } while (mg_end_request(host, 0, MG_SECTOR_SIZE));
529} 561}
530 562
@@ -532,7 +564,6 @@ static void mg_read_intr(struct mg_host *host)
532{ 564{
533 struct request *req = host->req; 565 struct request *req = host->req;
534 u32 i; 566 u32 i;
535 u16 *buff;
536 567
537 /* check status */ 568 /* check status */
538 do { 569 do {
@@ -550,13 +581,7 @@ static void mg_read_intr(struct mg_host *host)
550 return; 581 return;
551 582
552ok_to_read: 583ok_to_read:
553 /* get current segment of request */ 584 mg_read_one(host, req);
554 buff = (u16 *)req->buffer;
555
556 /* read 1 sector */
557 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++)
558 *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET +
559 (i << 1));
560 585
561 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 586 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
562 blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); 587 blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer);
@@ -575,8 +600,7 @@ ok_to_read:
575static void mg_write_intr(struct mg_host *host) 600static void mg_write_intr(struct mg_host *host)
576{ 601{
577 struct request *req = host->req; 602 struct request *req = host->req;
578 u32 i, j; 603 u32 i;
579 u16 *buff;
580 bool rem; 604 bool rem;
581 605
582 /* check status */ 606 /* check status */
@@ -597,12 +621,7 @@ static void mg_write_intr(struct mg_host *host)
597ok_to_write: 621ok_to_write:
598 if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { 622 if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) {
599 /* write 1 sector and set handler if remains */ 623 /* write 1 sector and set handler if remains */
600 buff = (u16 *)req->buffer; 624 mg_write_one(host, req);
601 for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) {
602 outw(*buff, (unsigned long)host->dev_base +
603 MG_BUFF_OFFSET + (j << 1));
604 buff++;
605 }
606 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", 625 MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n",
607 blk_rq_pos(req), blk_rq_sectors(req), req->buffer); 626 blk_rq_pos(req), blk_rq_sectors(req), req->buffer);
608 host->mg_do_intr = mg_write_intr; 627 host->mg_do_intr = mg_write_intr;
@@ -667,9 +686,6 @@ static unsigned int mg_issue_req(struct request *req,
667 unsigned int sect_num, 686 unsigned int sect_num,
668 unsigned int sect_cnt) 687 unsigned int sect_cnt)
669{ 688{
670 u16 *buff;
671 u32 i;
672
673 switch (rq_data_dir(req)) { 689 switch (rq_data_dir(req)) {
674 case READ: 690 case READ:
675 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) 691 if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr)
@@ -693,12 +709,7 @@ static unsigned int mg_issue_req(struct request *req,
693 mg_bad_rw_intr(host); 709 mg_bad_rw_intr(host);
694 return host->error; 710 return host->error;
695 } 711 }
696 buff = (u16 *)req->buffer; 712 mg_write_one(host, req);
697 for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) {
698 outw(*buff, (unsigned long)host->dev_base +
699 MG_BUFF_OFFSET + (i << 1));
700 buff++;
701 }
702 mod_timer(&host->timer, jiffies + 3 * HZ); 713 mod_timer(&host->timer, jiffies + 3 * HZ);
703 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + 714 outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base +
704 MG_REG_COMMAND); 715 MG_REG_COMMAND);
diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c
index f4bb43fb8016..e077701ae3d9 100644
--- a/drivers/char/agp/parisc-agp.c
+++ b/drivers/char/agp/parisc-agp.c
@@ -225,7 +225,7 @@ static const struct agp_bridge_driver parisc_agp_driver = {
225 .configure = parisc_agp_configure, 225 .configure = parisc_agp_configure,
226 .fetch_size = parisc_agp_fetch_size, 226 .fetch_size = parisc_agp_fetch_size,
227 .tlb_flush = parisc_agp_tlbflush, 227 .tlb_flush = parisc_agp_tlbflush,
228 .mask_memory = parisc_agp_mask_memory, 228 .mask_memory = parisc_agp_page_mask_memory,
229 .masks = parisc_agp_masks, 229 .masks = parisc_agp_masks,
230 .agp_enable = parisc_agp_enable, 230 .agp_enable = parisc_agp_enable,
231 .cache_flush = global_cache_flush, 231 .cache_flush = global_cache_flush,
diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c
index acd76b767d4c..1733d3439ad2 100644
--- a/drivers/char/tty_ldisc.c
+++ b/drivers/char/tty_ldisc.c
@@ -48,6 +48,41 @@ static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait);
48/* Line disc dispatch table */ 48/* Line disc dispatch table */
49static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; 49static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS];
50 50
51static inline struct tty_ldisc *get_ldisc(struct tty_ldisc *ld)
52{
53 if (ld)
54 atomic_inc(&ld->users);
55 return ld;
56}
57
58static void put_ldisc(struct tty_ldisc *ld)
59{
60 unsigned long flags;
61
62 if (WARN_ON_ONCE(!ld))
63 return;
64
65 /*
66 * If this is the last user, free the ldisc, and
67 * release the ldisc ops.
68 *
69 * We really want an "atomic_dec_and_lock_irqsave()",
70 * but we don't have it, so this does it by hand.
71 */
72 local_irq_save(flags);
73 if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) {
74 struct tty_ldisc_ops *ldo = ld->ops;
75
76 ldo->refcount--;
77 module_put(ldo->owner);
78 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
79
80 kfree(ld);
81 return;
82 }
83 local_irq_restore(flags);
84}
85
51/** 86/**
52 * tty_register_ldisc - install a line discipline 87 * tty_register_ldisc - install a line discipline
53 * @disc: ldisc number 88 * @disc: ldisc number
@@ -142,7 +177,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc)
142 /* lock it */ 177 /* lock it */
143 ldops->refcount++; 178 ldops->refcount++;
144 ld->ops = ldops; 179 ld->ops = ldops;
145 ld->refcount = 0; 180 atomic_set(&ld->users, 1);
146 err = 0; 181 err = 0;
147 } 182 }
148 } 183 }
@@ -181,35 +216,6 @@ static struct tty_ldisc *tty_ldisc_get(int disc)
181 return ld; 216 return ld;
182} 217}
183 218
184/**
185 * tty_ldisc_put - drop ldisc reference
186 * @ld: ldisc
187 *
188 * Drop a reference to a line discipline. Manage refcounts and
189 * module usage counts. Free the ldisc once the recount hits zero.
190 *
191 * Locking:
192 * takes tty_ldisc_lock to guard against ldisc races
193 */
194
195static void tty_ldisc_put(struct tty_ldisc *ld)
196{
197 unsigned long flags;
198 int disc = ld->ops->num;
199 struct tty_ldisc_ops *ldo;
200
201 BUG_ON(disc < N_TTY || disc >= NR_LDISCS);
202
203 spin_lock_irqsave(&tty_ldisc_lock, flags);
204 ldo = tty_ldiscs[disc];
205 BUG_ON(ldo->refcount == 0);
206 ldo->refcount--;
207 module_put(ldo->owner);
208 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
209 WARN_ON(ld->refcount);
210 kfree(ld);
211}
212
213static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) 219static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
214{ 220{
215 return (*pos < NR_LDISCS) ? pos : NULL; 221 return (*pos < NR_LDISCS) ? pos : NULL;
@@ -234,7 +240,7 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
234 if (IS_ERR(ld)) 240 if (IS_ERR(ld))
235 return 0; 241 return 0;
236 seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i); 242 seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i);
237 tty_ldisc_put(ld); 243 put_ldisc(ld);
238 return 0; 244 return 0;
239} 245}
240 246
@@ -288,20 +294,17 @@ static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
288 * Locking: takes tty_ldisc_lock 294 * Locking: takes tty_ldisc_lock
289 */ 295 */
290 296
291static int tty_ldisc_try(struct tty_struct *tty) 297static struct tty_ldisc *tty_ldisc_try(struct tty_struct *tty)
292{ 298{
293 unsigned long flags; 299 unsigned long flags;
294 struct tty_ldisc *ld; 300 struct tty_ldisc *ld;
295 int ret = 0;
296 301
297 spin_lock_irqsave(&tty_ldisc_lock, flags); 302 spin_lock_irqsave(&tty_ldisc_lock, flags);
298 ld = tty->ldisc; 303 ld = NULL;
299 if (test_bit(TTY_LDISC, &tty->flags)) { 304 if (test_bit(TTY_LDISC, &tty->flags))
300 ld->refcount++; 305 ld = get_ldisc(tty->ldisc);
301 ret = 1;
302 }
303 spin_unlock_irqrestore(&tty_ldisc_lock, flags); 306 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
304 return ret; 307 return ld;
305} 308}
306 309
307/** 310/**
@@ -322,10 +325,11 @@ static int tty_ldisc_try(struct tty_struct *tty)
322 325
323struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) 326struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
324{ 327{
328 struct tty_ldisc *ld;
329
325 /* wait_event is a macro */ 330 /* wait_event is a macro */
326 wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); 331 wait_event(tty_ldisc_wait, (ld = tty_ldisc_try(tty)) != NULL);
327 WARN_ON(tty->ldisc->refcount == 0); 332 return ld;
328 return tty->ldisc;
329} 333}
330EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); 334EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
331 335
@@ -342,9 +346,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
342 346
343struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) 347struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
344{ 348{
345 if (tty_ldisc_try(tty)) 349 return tty_ldisc_try(tty);
346 return tty->ldisc;
347 return NULL;
348} 350}
349EXPORT_SYMBOL_GPL(tty_ldisc_ref); 351EXPORT_SYMBOL_GPL(tty_ldisc_ref);
350 352
@@ -360,21 +362,15 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref);
360 362
361void tty_ldisc_deref(struct tty_ldisc *ld) 363void tty_ldisc_deref(struct tty_ldisc *ld)
362{ 364{
363 unsigned long flags; 365 put_ldisc(ld);
364
365 BUG_ON(ld == NULL);
366
367 spin_lock_irqsave(&tty_ldisc_lock, flags);
368 if (ld->refcount == 0)
369 printk(KERN_ERR "tty_ldisc_deref: no references.\n");
370 else
371 ld->refcount--;
372 if (ld->refcount == 0)
373 wake_up(&tty_ldisc_wait);
374 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
375} 366}
376EXPORT_SYMBOL_GPL(tty_ldisc_deref); 367EXPORT_SYMBOL_GPL(tty_ldisc_deref);
377 368
369static inline void tty_ldisc_put(struct tty_ldisc *ld)
370{
371 put_ldisc(ld);
372}
373
378/** 374/**
379 * tty_ldisc_enable - allow ldisc use 375 * tty_ldisc_enable - allow ldisc use
380 * @tty: terminal to activate ldisc on 376 * @tty: terminal to activate ldisc on
@@ -523,31 +519,6 @@ static int tty_ldisc_halt(struct tty_struct *tty)
523} 519}
524 520
525/** 521/**
526 * tty_ldisc_wait_idle - wait for the ldisc to become idle
527 * @tty: tty to wait for
528 *
529 * Wait for the line discipline to become idle. The discipline must
530 * have been halted for this to guarantee it remains idle.
531 *
532 * tty_ldisc_lock protects the ref counts currently.
533 */
534
535static int tty_ldisc_wait_idle(struct tty_struct *tty)
536{
537 unsigned long flags;
538 spin_lock_irqsave(&tty_ldisc_lock, flags);
539 while (tty->ldisc->refcount) {
540 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
541 if (wait_event_timeout(tty_ldisc_wait,
542 tty->ldisc->refcount == 0, 5 * HZ) == 0)
543 return -EBUSY;
544 spin_lock_irqsave(&tty_ldisc_lock, flags);
545 }
546 spin_unlock_irqrestore(&tty_ldisc_lock, flags);
547 return 0;
548}
549
550/**
551 * tty_set_ldisc - set line discipline 522 * tty_set_ldisc - set line discipline
552 * @tty: the terminal to set 523 * @tty: the terminal to set
553 * @ldisc: the line discipline 524 * @ldisc: the line discipline
@@ -642,14 +613,6 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc)
642 613
643 flush_scheduled_work(); 614 flush_scheduled_work();
644 615
645 /* Let any existing reference holders finish */
646 retval = tty_ldisc_wait_idle(tty);
647 if (retval < 0) {
648 clear_bit(TTY_LDISC_CHANGING, &tty->flags);
649 tty_ldisc_put(new_ldisc);
650 return retval;
651 }
652
653 mutex_lock(&tty->ldisc_mutex); 616 mutex_lock(&tty->ldisc_mutex);
654 if (test_bit(TTY_HUPPED, &tty->flags)) { 617 if (test_bit(TTY_HUPPED, &tty->flags)) {
655 /* We were raced by the hangup method. It will have stomped 618 /* We were raced by the hangup method. It will have stomped
@@ -795,7 +758,6 @@ void tty_ldisc_hangup(struct tty_struct *tty)
795 if (tty->ldisc) { /* Not yet closed */ 758 if (tty->ldisc) { /* Not yet closed */
796 /* Switch back to N_TTY */ 759 /* Switch back to N_TTY */
797 tty_ldisc_halt(tty); 760 tty_ldisc_halt(tty);
798 tty_ldisc_wait_idle(tty);
799 tty_ldisc_reinit(tty); 761 tty_ldisc_reinit(tty);
800 /* At this point we have a closed ldisc and we want to 762 /* At this point we have a closed ldisc and we want to
801 reopen it. We could defer this to the next open but 763 reopen it. We could defer this to the next open but
@@ -860,14 +822,6 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
860 tty_ldisc_halt(tty); 822 tty_ldisc_halt(tty);
861 flush_scheduled_work(); 823 flush_scheduled_work();
862 824
863 /*
864 * Wait for any short term users (we know they are just driver
865 * side waiters as the file is closing so user count on the file
866 * side is zero.
867 */
868
869 tty_ldisc_wait_idle(tty);
870
871 mutex_lock(&tty->ldisc_mutex); 825 mutex_lock(&tty->ldisc_mutex);
872 /* 826 /*
873 * Now kill off the ldisc 827 * Now kill off the ldisc
diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c
index b90eda8b3440..fd69086d08d5 100644
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -858,6 +858,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
858 858
859 /* Check for existing affected CPUs. 859 /* Check for existing affected CPUs.
860 * They may not be aware of it due to CPU Hotplug. 860 * They may not be aware of it due to CPU Hotplug.
861 * cpufreq_cpu_put is called when the device is removed
862 * in __cpufreq_remove_dev()
861 */ 863 */
862 managed_policy = cpufreq_cpu_get(j); 864 managed_policy = cpufreq_cpu_get(j);
863 if (unlikely(managed_policy)) { 865 if (unlikely(managed_policy)) {
@@ -884,7 +886,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
884 ret = sysfs_create_link(&sys_dev->kobj, 886 ret = sysfs_create_link(&sys_dev->kobj,
885 &managed_policy->kobj, 887 &managed_policy->kobj,
886 "cpufreq"); 888 "cpufreq");
887 if (!ret) 889 if (ret)
888 cpufreq_cpu_put(managed_policy); 890 cpufreq_cpu_put(managed_policy);
889 /* 891 /*
890 * Success. We only needed to be added to the mask. 892 * Success. We only needed to be added to the mask.
@@ -924,6 +926,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev)
924 926
925 spin_lock_irqsave(&cpufreq_driver_lock, flags); 927 spin_lock_irqsave(&cpufreq_driver_lock, flags);
926 for_each_cpu(j, policy->cpus) { 928 for_each_cpu(j, policy->cpus) {
929 if (!cpu_online(j))
930 continue;
927 per_cpu(cpufreq_cpu_data, j) = policy; 931 per_cpu(cpufreq_cpu_data, j) = policy;
928 per_cpu(policy_cpu, j) = policy->cpu; 932 per_cpu(policy_cpu, j) = policy->cpu;
929 } 933 }
@@ -1244,13 +1248,22 @@ EXPORT_SYMBOL(cpufreq_get);
1244 1248
1245static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) 1249static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
1246{ 1250{
1247 int cpu = sysdev->id;
1248 int ret = 0; 1251 int ret = 0;
1252
1253#ifdef __powerpc__
1254 int cpu = sysdev->id;
1249 unsigned int cur_freq = 0; 1255 unsigned int cur_freq = 0;
1250 struct cpufreq_policy *cpu_policy; 1256 struct cpufreq_policy *cpu_policy;
1251 1257
1252 dprintk("suspending cpu %u\n", cpu); 1258 dprintk("suspending cpu %u\n", cpu);
1253 1259
1260 /*
1261 * This whole bogosity is here because Powerbooks are made of fail.
1262 * No sane platform should need any of the code below to be run.
1263 * (it's entirely the wrong thing to do, as driver->get may
1264 * reenable interrupts on some architectures).
1265 */
1266
1254 if (!cpu_online(cpu)) 1267 if (!cpu_online(cpu))
1255 return 0; 1268 return 0;
1256 1269
@@ -1309,6 +1322,7 @@ static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg)
1309 1322
1310out: 1323out:
1311 cpufreq_cpu_put(cpu_policy); 1324 cpufreq_cpu_put(cpu_policy);
1325#endif /* __powerpc__ */
1312 return ret; 1326 return ret;
1313} 1327}
1314 1328
@@ -1322,12 +1336,18 @@ out:
1322 */ 1336 */
1323static int cpufreq_resume(struct sys_device *sysdev) 1337static int cpufreq_resume(struct sys_device *sysdev)
1324{ 1338{
1325 int cpu = sysdev->id;
1326 int ret = 0; 1339 int ret = 0;
1340
1341#ifdef __powerpc__
1342 int cpu = sysdev->id;
1327 struct cpufreq_policy *cpu_policy; 1343 struct cpufreq_policy *cpu_policy;
1328 1344
1329 dprintk("resuming cpu %u\n", cpu); 1345 dprintk("resuming cpu %u\n", cpu);
1330 1346
1347 /* As with the ->suspend method, all the code below is
1348 * only necessary because Powerbooks suck.
1349 * See commit 42d4dc3f4e1e for jokes. */
1350
1331 if (!cpu_online(cpu)) 1351 if (!cpu_online(cpu))
1332 return 0; 1352 return 0;
1333 1353
@@ -1391,6 +1411,7 @@ out:
1391 schedule_work(&cpu_policy->update); 1411 schedule_work(&cpu_policy->update);
1392fail: 1412fail:
1393 cpufreq_cpu_put(cpu_policy); 1413 cpufreq_cpu_put(cpu_policy);
1414#endif /* __powerpc__ */
1394 return ret; 1415 return ret;
1395} 1416}
1396 1417
diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c
index 57490502b21c..bdea7e2f94ba 100644
--- a/drivers/cpufreq/cpufreq_conservative.c
+++ b/drivers/cpufreq/cpufreq_conservative.c
@@ -63,6 +63,7 @@ struct cpu_dbs_info_s {
63 unsigned int down_skip; 63 unsigned int down_skip;
64 unsigned int requested_freq; 64 unsigned int requested_freq;
65 int cpu; 65 int cpu;
66 unsigned int enable:1;
66 /* 67 /*
67 * percpu mutex that serializes governor limit change with 68 * percpu mutex that serializes governor limit change with
68 * do_dbs_timer invocation. We do not want do_dbs_timer to run 69 * do_dbs_timer invocation. We do not want do_dbs_timer to run
@@ -141,6 +142,9 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
141 142
142 struct cpufreq_policy *policy; 143 struct cpufreq_policy *policy;
143 144
145 if (!this_dbs_info->enable)
146 return 0;
147
144 policy = this_dbs_info->cur_policy; 148 policy = this_dbs_info->cur_policy;
145 149
146 /* 150 /*
@@ -497,6 +501,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
497 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); 501 int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
498 delay -= jiffies % delay; 502 delay -= jiffies % delay;
499 503
504 dbs_info->enable = 1;
500 INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); 505 INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
501 queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work, 506 queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work,
502 delay); 507 delay);
@@ -504,6 +509,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
504 509
505static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) 510static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
506{ 511{
512 dbs_info->enable = 0;
507 cancel_delayed_work_sync(&dbs_info->work); 513 cancel_delayed_work_sync(&dbs_info->work);
508} 514}
509 515
diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig
index 070357aaedbc..81e1020fb514 100644
--- a/drivers/dma/Kconfig
+++ b/drivers/dma/Kconfig
@@ -4,7 +4,7 @@
4 4
5menuconfig DMADEVICES 5menuconfig DMADEVICES
6 bool "DMA Engine support" 6 bool "DMA Engine support"
7 depends on !HIGHMEM64G && HAS_DMA 7 depends on HAS_DMA
8 help 8 help
9 DMA engines can do asynchronous data transfers without 9 DMA engines can do asynchronous data transfers without
10 involving the host CPU. Currently, this framework can be 10 involving the host CPU. Currently, this framework can be
@@ -46,6 +46,14 @@ config DW_DMAC
46 Support the Synopsys DesignWare AHB DMA controller. This 46 Support the Synopsys DesignWare AHB DMA controller. This
47 can be integrated in chips such as the Atmel AT32ap7000. 47 can be integrated in chips such as the Atmel AT32ap7000.
48 48
49config AT_HDMAC
50 tristate "Atmel AHB DMA support"
51 depends on ARCH_AT91SAM9RL
52 select DMA_ENGINE
53 help
54 Support the Atmel AHB DMA controller. This can be integrated in
55 chips such as the Atmel AT91SAM9RL.
56
49config FSL_DMA 57config FSL_DMA
50 tristate "Freescale Elo and Elo Plus DMA support" 58 tristate "Freescale Elo and Elo Plus DMA support"
51 depends on FSL_SOC 59 depends on FSL_SOC
@@ -108,7 +116,7 @@ config NET_DMA
108 116
109config ASYNC_TX_DMA 117config ASYNC_TX_DMA
110 bool "Async_tx: Offload support for the async_tx api" 118 bool "Async_tx: Offload support for the async_tx api"
111 depends on DMA_ENGINE 119 depends on DMA_ENGINE && !HIGHMEM64G
112 help 120 help
113 This allows the async_tx api to take advantage of offload engines for 121 This allows the async_tx api to take advantage of offload engines for
114 memcpy, memset, xor, and raid6 p+q operations. If your platform has 122 memcpy, memset, xor, and raid6 p+q operations. If your platform has
diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile
index a0b6564800c4..40e1e0083571 100644
--- a/drivers/dma/Makefile
+++ b/drivers/dma/Makefile
@@ -7,5 +7,6 @@ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o
7obj-$(CONFIG_FSL_DMA) += fsldma.o 7obj-$(CONFIG_FSL_DMA) += fsldma.o
8obj-$(CONFIG_MV_XOR) += mv_xor.o 8obj-$(CONFIG_MV_XOR) += mv_xor.o
9obj-$(CONFIG_DW_DMAC) += dw_dmac.o 9obj-$(CONFIG_DW_DMAC) += dw_dmac.o
10obj-$(CONFIG_AT_HDMAC) += at_hdmac.o
10obj-$(CONFIG_MX3_IPU) += ipu/ 11obj-$(CONFIG_MX3_IPU) += ipu/
11obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o 12obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o
diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c
new file mode 100644
index 000000000000..9a1e5fb412ed
--- /dev/null
+++ b/drivers/dma/at_hdmac.c
@@ -0,0 +1,1213 @@
1/*
2 * Driver for the Atmel AHB DMA Controller (aka HDMA or DMAC on AT91 systems)
3 *
4 * Copyright (C) 2008 Atmel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 *
12 * This supports the Atmel AHB DMA Controller,
13 *
14 * The driver has currently been tested with the Atmel AT91SAM9RL
15 * and AT91SAM9G45 series.
16 */
17
18#include <linux/clk.h>
19#include <linux/dmaengine.h>
20#include <linux/dma-mapping.h>
21#include <linux/dmapool.h>
22#include <linux/interrupt.h>
23#include <linux/module.h>
24#include <linux/platform_device.h>
25
26#include "at_hdmac_regs.h"
27
28/*
29 * Glossary
30 * --------
31 *
32 * at_hdmac : Name of the ATmel AHB DMA Controller
33 * at_dma_ / atdma : ATmel DMA controller entity related
34 * atc_ / atchan : ATmel DMA Channel entity related
35 */
36
37#define ATC_DEFAULT_CFG (ATC_FIFOCFG_HALFFIFO)
38#define ATC_DEFAULT_CTRLA (0)
39#define ATC_DEFAULT_CTRLB (ATC_SIF(0) \
40 |ATC_DIF(1))
41
42/*
43 * Initial number of descriptors to allocate for each channel. This could
44 * be increased during dma usage.
45 */
46static unsigned int init_nr_desc_per_channel = 64;
47module_param(init_nr_desc_per_channel, uint, 0644);
48MODULE_PARM_DESC(init_nr_desc_per_channel,
49 "initial descriptors per channel (default: 64)");
50
51
52/* prototypes */
53static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx);
54
55
56/*----------------------------------------------------------------------*/
57
58static struct at_desc *atc_first_active(struct at_dma_chan *atchan)
59{
60 return list_first_entry(&atchan->active_list,
61 struct at_desc, desc_node);
62}
63
64static struct at_desc *atc_first_queued(struct at_dma_chan *atchan)
65{
66 return list_first_entry(&atchan->queue,
67 struct at_desc, desc_node);
68}
69
70/**
71 * atc_alloc_descriptor - allocate and return an initilized descriptor
72 * @chan: the channel to allocate descriptors for
73 * @gfp_flags: GFP allocation flags
74 *
75 * Note: The ack-bit is positioned in the descriptor flag at creation time
76 * to make initial allocation more convenient. This bit will be cleared
77 * and control will be given to client at usage time (during
78 * preparation functions).
79 */
80static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan,
81 gfp_t gfp_flags)
82{
83 struct at_desc *desc = NULL;
84 struct at_dma *atdma = to_at_dma(chan->device);
85 dma_addr_t phys;
86
87 desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys);
88 if (desc) {
89 memset(desc, 0, sizeof(struct at_desc));
90 dma_async_tx_descriptor_init(&desc->txd, chan);
91 /* txd.flags will be overwritten in prep functions */
92 desc->txd.flags = DMA_CTRL_ACK;
93 desc->txd.tx_submit = atc_tx_submit;
94 desc->txd.phys = phys;
95 }
96
97 return desc;
98}
99
100/**
101 * atc_desc_get - get a unsused descriptor from free_list
102 * @atchan: channel we want a new descriptor for
103 */
104static struct at_desc *atc_desc_get(struct at_dma_chan *atchan)
105{
106 struct at_desc *desc, *_desc;
107 struct at_desc *ret = NULL;
108 unsigned int i = 0;
109 LIST_HEAD(tmp_list);
110
111 spin_lock_bh(&atchan->lock);
112 list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) {
113 i++;
114 if (async_tx_test_ack(&desc->txd)) {
115 list_del(&desc->desc_node);
116 ret = desc;
117 break;
118 }
119 dev_dbg(chan2dev(&atchan->chan_common),
120 "desc %p not ACKed\n", desc);
121 }
122 spin_unlock_bh(&atchan->lock);
123 dev_vdbg(chan2dev(&atchan->chan_common),
124 "scanned %u descriptors on freelist\n", i);
125
126 /* no more descriptor available in initial pool: create one more */
127 if (!ret) {
128 ret = atc_alloc_descriptor(&atchan->chan_common, GFP_ATOMIC);
129 if (ret) {
130 spin_lock_bh(&atchan->lock);
131 atchan->descs_allocated++;
132 spin_unlock_bh(&atchan->lock);
133 } else {
134 dev_err(chan2dev(&atchan->chan_common),
135 "not enough descriptors available\n");
136 }
137 }
138
139 return ret;
140}
141
142/**
143 * atc_desc_put - move a descriptor, including any children, to the free list
144 * @atchan: channel we work on
145 * @desc: descriptor, at the head of a chain, to move to free list
146 */
147static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc)
148{
149 if (desc) {
150 struct at_desc *child;
151
152 spin_lock_bh(&atchan->lock);
153 list_for_each_entry(child, &desc->txd.tx_list, desc_node)
154 dev_vdbg(chan2dev(&atchan->chan_common),
155 "moving child desc %p to freelist\n",
156 child);
157 list_splice_init(&desc->txd.tx_list, &atchan->free_list);
158 dev_vdbg(chan2dev(&atchan->chan_common),
159 "moving desc %p to freelist\n", desc);
160 list_add(&desc->desc_node, &atchan->free_list);
161 spin_unlock_bh(&atchan->lock);
162 }
163}
164
165/**
166 * atc_assign_cookie - compute and assign new cookie
167 * @atchan: channel we work on
168 * @desc: descriptor to asign cookie for
169 *
170 * Called with atchan->lock held and bh disabled
171 */
172static dma_cookie_t
173atc_assign_cookie(struct at_dma_chan *atchan, struct at_desc *desc)
174{
175 dma_cookie_t cookie = atchan->chan_common.cookie;
176
177 if (++cookie < 0)
178 cookie = 1;
179
180 atchan->chan_common.cookie = cookie;
181 desc->txd.cookie = cookie;
182
183 return cookie;
184}
185
186/**
187 * atc_dostart - starts the DMA engine for real
188 * @atchan: the channel we want to start
189 * @first: first descriptor in the list we want to begin with
190 *
191 * Called with atchan->lock held and bh disabled
192 */
193static void atc_dostart(struct at_dma_chan *atchan, struct at_desc *first)
194{
195 struct at_dma *atdma = to_at_dma(atchan->chan_common.device);
196
197 /* ASSERT: channel is idle */
198 if (atc_chan_is_enabled(atchan)) {
199 dev_err(chan2dev(&atchan->chan_common),
200 "BUG: Attempted to start non-idle channel\n");
201 dev_err(chan2dev(&atchan->chan_common),
202 " channel: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n",
203 channel_readl(atchan, SADDR),
204 channel_readl(atchan, DADDR),
205 channel_readl(atchan, CTRLA),
206 channel_readl(atchan, CTRLB),
207 channel_readl(atchan, DSCR));
208
209 /* The tasklet will hopefully advance the queue... */
210 return;
211 }
212
213 vdbg_dump_regs(atchan);
214
215 /* clear any pending interrupt */
216 while (dma_readl(atdma, EBCISR))
217 cpu_relax();
218
219 channel_writel(atchan, SADDR, 0);
220 channel_writel(atchan, DADDR, 0);
221 channel_writel(atchan, CTRLA, 0);
222 channel_writel(atchan, CTRLB, 0);
223 channel_writel(atchan, DSCR, first->txd.phys);
224 dma_writel(atdma, CHER, atchan->mask);
225
226 vdbg_dump_regs(atchan);
227}
228
229/**
230 * atc_chain_complete - finish work for one transaction chain
231 * @atchan: channel we work on
232 * @desc: descriptor at the head of the chain we want do complete
233 *
234 * Called with atchan->lock held and bh disabled */
235static void
236atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc)
237{
238 dma_async_tx_callback callback;
239 void *param;
240 struct dma_async_tx_descriptor *txd = &desc->txd;
241
242 dev_vdbg(chan2dev(&atchan->chan_common),
243 "descriptor %u complete\n", txd->cookie);
244
245 atchan->completed_cookie = txd->cookie;
246 callback = txd->callback;
247 param = txd->callback_param;
248
249 /* move children to free_list */
250 list_splice_init(&txd->tx_list, &atchan->free_list);
251 /* move myself to free_list */
252 list_move(&desc->desc_node, &atchan->free_list);
253
254 /* unmap dma addresses */
255 if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) {
256 if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE)
257 dma_unmap_single(chan2parent(&atchan->chan_common),
258 desc->lli.daddr,
259 desc->len, DMA_FROM_DEVICE);
260 else
261 dma_unmap_page(chan2parent(&atchan->chan_common),
262 desc->lli.daddr,
263 desc->len, DMA_FROM_DEVICE);
264 }
265 if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) {
266 if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE)
267 dma_unmap_single(chan2parent(&atchan->chan_common),
268 desc->lli.saddr,
269 desc->len, DMA_TO_DEVICE);
270 else
271 dma_unmap_page(chan2parent(&atchan->chan_common),
272 desc->lli.saddr,
273 desc->len, DMA_TO_DEVICE);
274 }
275
276 /*
277 * The API requires that no submissions are done from a
278 * callback, so we don't need to drop the lock here
279 */
280 if (callback)
281 callback(param);
282
283 dma_run_dependencies(txd);
284}
285
286/**
287 * atc_complete_all - finish work for all transactions
288 * @atchan: channel to complete transactions for
289 *
290 * Eventually submit queued descriptors if any
291 *
292 * Assume channel is idle while calling this function
293 * Called with atchan->lock held and bh disabled
294 */
295static void atc_complete_all(struct at_dma_chan *atchan)
296{
297 struct at_desc *desc, *_desc;
298 LIST_HEAD(list);
299
300 dev_vdbg(chan2dev(&atchan->chan_common), "complete all\n");
301
302 BUG_ON(atc_chan_is_enabled(atchan));
303
304 /*
305 * Submit queued descriptors ASAP, i.e. before we go through
306 * the completed ones.
307 */
308 if (!list_empty(&atchan->queue))
309 atc_dostart(atchan, atc_first_queued(atchan));
310 /* empty active_list now it is completed */
311 list_splice_init(&atchan->active_list, &list);
312 /* empty queue list by moving descriptors (if any) to active_list */
313 list_splice_init(&atchan->queue, &atchan->active_list);
314
315 list_for_each_entry_safe(desc, _desc, &list, desc_node)
316 atc_chain_complete(atchan, desc);
317}
318
319/**
320 * atc_cleanup_descriptors - cleanup up finished descriptors in active_list
321 * @atchan: channel to be cleaned up
322 *
323 * Called with atchan->lock held and bh disabled
324 */
325static void atc_cleanup_descriptors(struct at_dma_chan *atchan)
326{
327 struct at_desc *desc, *_desc;
328 struct at_desc *child;
329
330 dev_vdbg(chan2dev(&atchan->chan_common), "cleanup descriptors\n");
331
332 list_for_each_entry_safe(desc, _desc, &atchan->active_list, desc_node) {
333 if (!(desc->lli.ctrla & ATC_DONE))
334 /* This one is currently in progress */
335 return;
336
337 list_for_each_entry(child, &desc->txd.tx_list, desc_node)
338 if (!(child->lli.ctrla & ATC_DONE))
339 /* Currently in progress */
340 return;
341
342 /*
343 * No descriptors so far seem to be in progress, i.e.
344 * this chain must be done.
345 */
346 atc_chain_complete(atchan, desc);
347 }
348}
349
350/**
351 * atc_advance_work - at the end of a transaction, move forward
352 * @atchan: channel where the transaction ended
353 *
354 * Called with atchan->lock held and bh disabled
355 */
356static void atc_advance_work(struct at_dma_chan *atchan)
357{
358 dev_vdbg(chan2dev(&atchan->chan_common), "advance_work\n");
359
360 if (list_empty(&atchan->active_list) ||
361 list_is_singular(&atchan->active_list)) {
362 atc_complete_all(atchan);
363 } else {
364 atc_chain_complete(atchan, atc_first_active(atchan));
365 /* advance work */
366 atc_dostart(atchan, atc_first_active(atchan));
367 }
368}
369
370
371/**
372 * atc_handle_error - handle errors reported by DMA controller
373 * @atchan: channel where error occurs
374 *
375 * Called with atchan->lock held and bh disabled
376 */
377static void atc_handle_error(struct at_dma_chan *atchan)
378{
379 struct at_desc *bad_desc;
380 struct at_desc *child;
381
382 /*
383 * The descriptor currently at the head of the active list is
384 * broked. Since we don't have any way to report errors, we'll
385 * just have to scream loudly and try to carry on.
386 */
387 bad_desc = atc_first_active(atchan);
388 list_del_init(&bad_desc->desc_node);
389
390 /* As we are stopped, take advantage to push queued descriptors
391 * in active_list */
392 list_splice_init(&atchan->queue, atchan->active_list.prev);
393
394 /* Try to restart the controller */
395 if (!list_empty(&atchan->active_list))
396 atc_dostart(atchan, atc_first_active(atchan));
397
398 /*
399 * KERN_CRITICAL may seem harsh, but since this only happens
400 * when someone submits a bad physical address in a
401 * descriptor, we should consider ourselves lucky that the
402 * controller flagged an error instead of scribbling over
403 * random memory locations.
404 */
405 dev_crit(chan2dev(&atchan->chan_common),
406 "Bad descriptor submitted for DMA!\n");
407 dev_crit(chan2dev(&atchan->chan_common),
408 " cookie: %d\n", bad_desc->txd.cookie);
409 atc_dump_lli(atchan, &bad_desc->lli);
410 list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node)
411 atc_dump_lli(atchan, &child->lli);
412
413 /* Pretend the descriptor completed successfully */
414 atc_chain_complete(atchan, bad_desc);
415}
416
417
418/*-- IRQ & Tasklet ---------------------------------------------------*/
419
420static void atc_tasklet(unsigned long data)
421{
422 struct at_dma_chan *atchan = (struct at_dma_chan *)data;
423
424 /* Channel cannot be enabled here */
425 if (atc_chan_is_enabled(atchan)) {
426 dev_err(chan2dev(&atchan->chan_common),
427 "BUG: channel enabled in tasklet\n");
428 return;
429 }
430
431 spin_lock(&atchan->lock);
432 if (test_and_clear_bit(0, &atchan->error_status))
433 atc_handle_error(atchan);
434 else
435 atc_advance_work(atchan);
436
437 spin_unlock(&atchan->lock);
438}
439
440static irqreturn_t at_dma_interrupt(int irq, void *dev_id)
441{
442 struct at_dma *atdma = (struct at_dma *)dev_id;
443 struct at_dma_chan *atchan;
444 int i;
445 u32 status, pending, imr;
446 int ret = IRQ_NONE;
447
448 do {
449 imr = dma_readl(atdma, EBCIMR);
450 status = dma_readl(atdma, EBCISR);
451 pending = status & imr;
452
453 if (!pending)
454 break;
455
456 dev_vdbg(atdma->dma_common.dev,
457 "interrupt: status = 0x%08x, 0x%08x, 0x%08x\n",
458 status, imr, pending);
459
460 for (i = 0; i < atdma->dma_common.chancnt; i++) {
461 atchan = &atdma->chan[i];
462 if (pending & (AT_DMA_CBTC(i) | AT_DMA_ERR(i))) {
463 if (pending & AT_DMA_ERR(i)) {
464 /* Disable channel on AHB error */
465 dma_writel(atdma, CHDR, atchan->mask);
466 /* Give information to tasklet */
467 set_bit(0, &atchan->error_status);
468 }
469 tasklet_schedule(&atchan->tasklet);
470 ret = IRQ_HANDLED;
471 }
472 }
473
474 } while (pending);
475
476 return ret;
477}
478
479
480/*-- DMA Engine API --------------------------------------------------*/
481
482/**
483 * atc_tx_submit - set the prepared descriptor(s) to be executed by the engine
484 * @desc: descriptor at the head of the transaction chain
485 *
486 * Queue chain if DMA engine is working already
487 *
488 * Cookie increment and adding to active_list or queue must be atomic
489 */
490static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx)
491{
492 struct at_desc *desc = txd_to_at_desc(tx);
493 struct at_dma_chan *atchan = to_at_dma_chan(tx->chan);
494 dma_cookie_t cookie;
495
496 spin_lock_bh(&atchan->lock);
497 cookie = atc_assign_cookie(atchan, desc);
498
499 if (list_empty(&atchan->active_list)) {
500 dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n",
501 desc->txd.cookie);
502 atc_dostart(atchan, desc);
503 list_add_tail(&desc->desc_node, &atchan->active_list);
504 } else {
505 dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n",
506 desc->txd.cookie);
507 list_add_tail(&desc->desc_node, &atchan->queue);
508 }
509
510 spin_unlock_bh(&atchan->lock);
511
512 return cookie;
513}
514
515/**
516 * atc_prep_dma_memcpy - prepare a memcpy operation
517 * @chan: the channel to prepare operation on
518 * @dest: operation virtual destination address
519 * @src: operation virtual source address
520 * @len: operation length
521 * @flags: tx descriptor status flags
522 */
523static struct dma_async_tx_descriptor *
524atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src,
525 size_t len, unsigned long flags)
526{
527 struct at_dma_chan *atchan = to_at_dma_chan(chan);
528 struct at_desc *desc = NULL;
529 struct at_desc *first = NULL;
530 struct at_desc *prev = NULL;
531 size_t xfer_count;
532 size_t offset;
533 unsigned int src_width;
534 unsigned int dst_width;
535 u32 ctrla;
536 u32 ctrlb;
537
538 dev_vdbg(chan2dev(chan), "prep_dma_memcpy: d0x%x s0x%x l0x%zx f0x%lx\n",
539 dest, src, len, flags);
540
541 if (unlikely(!len)) {
542 dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n");
543 return NULL;
544 }
545
546 ctrla = ATC_DEFAULT_CTRLA;
547 ctrlb = ATC_DEFAULT_CTRLB
548 | ATC_SRC_ADDR_MODE_INCR
549 | ATC_DST_ADDR_MODE_INCR
550 | ATC_FC_MEM2MEM;
551
552 /*
553 * We can be a lot more clever here, but this should take care
554 * of the most common optimization.
555 */
556 if (!((src | dest | len) & 3)) {
557 ctrla |= ATC_SRC_WIDTH_WORD | ATC_DST_WIDTH_WORD;
558 src_width = dst_width = 2;
559 } else if (!((src | dest | len) & 1)) {
560 ctrla |= ATC_SRC_WIDTH_HALFWORD | ATC_DST_WIDTH_HALFWORD;
561 src_width = dst_width = 1;
562 } else {
563 ctrla |= ATC_SRC_WIDTH_BYTE | ATC_DST_WIDTH_BYTE;
564 src_width = dst_width = 0;
565 }
566
567 for (offset = 0; offset < len; offset += xfer_count << src_width) {
568 xfer_count = min_t(size_t, (len - offset) >> src_width,
569 ATC_BTSIZE_MAX);
570
571 desc = atc_desc_get(atchan);
572 if (!desc)
573 goto err_desc_get;
574
575 desc->lli.saddr = src + offset;
576 desc->lli.daddr = dest + offset;
577 desc->lli.ctrla = ctrla | xfer_count;
578 desc->lli.ctrlb = ctrlb;
579
580 desc->txd.cookie = 0;
581 async_tx_ack(&desc->txd);
582
583 if (!first) {
584 first = desc;
585 } else {
586 /* inform the HW lli about chaining */
587 prev->lli.dscr = desc->txd.phys;
588 /* insert the link descriptor to the LD ring */
589 list_add_tail(&desc->desc_node,
590 &first->txd.tx_list);
591 }
592 prev = desc;
593 }
594
595 /* First descriptor of the chain embedds additional information */
596 first->txd.cookie = -EBUSY;
597 first->len = len;
598
599 /* set end-of-link to the last link descriptor of list*/
600 set_desc_eol(desc);
601
602 desc->txd.flags = flags; /* client is in control of this ack */
603
604 return &first->txd;
605
606err_desc_get:
607 atc_desc_put(atchan, first);
608 return NULL;
609}
610
611
612/**
613 * atc_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction
614 * @chan: DMA channel
615 * @sgl: scatterlist to transfer to/from
616 * @sg_len: number of entries in @scatterlist
617 * @direction: DMA direction
618 * @flags: tx descriptor status flags
619 */
620static struct dma_async_tx_descriptor *
621atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl,
622 unsigned int sg_len, enum dma_data_direction direction,
623 unsigned long flags)
624{
625 struct at_dma_chan *atchan = to_at_dma_chan(chan);
626 struct at_dma_slave *atslave = chan->private;
627 struct at_desc *first = NULL;
628 struct at_desc *prev = NULL;
629 u32 ctrla;
630 u32 ctrlb;
631 dma_addr_t reg;
632 unsigned int reg_width;
633 unsigned int mem_width;
634 unsigned int i;
635 struct scatterlist *sg;
636 size_t total_len = 0;
637
638 dev_vdbg(chan2dev(chan), "prep_slave_sg: %s f0x%lx\n",
639 direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE",
640 flags);
641
642 if (unlikely(!atslave || !sg_len)) {
643 dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n");
644 return NULL;
645 }
646
647 reg_width = atslave->reg_width;
648
649 sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction);
650
651 ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla;
652 ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN;
653
654 switch (direction) {
655 case DMA_TO_DEVICE:
656 ctrla |= ATC_DST_WIDTH(reg_width);
657 ctrlb |= ATC_DST_ADDR_MODE_FIXED
658 | ATC_SRC_ADDR_MODE_INCR
659 | ATC_FC_MEM2PER;
660 reg = atslave->tx_reg;
661 for_each_sg(sgl, sg, sg_len, i) {
662 struct at_desc *desc;
663 u32 len;
664 u32 mem;
665
666 desc = atc_desc_get(atchan);
667 if (!desc)
668 goto err_desc_get;
669
670 mem = sg_phys(sg);
671 len = sg_dma_len(sg);
672 mem_width = 2;
673 if (unlikely(mem & 3 || len & 3))
674 mem_width = 0;
675
676 desc->lli.saddr = mem;
677 desc->lli.daddr = reg;
678 desc->lli.ctrla = ctrla
679 | ATC_SRC_WIDTH(mem_width)
680 | len >> mem_width;
681 desc->lli.ctrlb = ctrlb;
682
683 if (!first) {
684 first = desc;
685 } else {
686 /* inform the HW lli about chaining */
687 prev->lli.dscr = desc->txd.phys;
688 /* insert the link descriptor to the LD ring */
689 list_add_tail(&desc->desc_node,
690 &first->txd.tx_list);
691 }
692 prev = desc;
693 total_len += len;
694 }
695 break;
696 case DMA_FROM_DEVICE:
697 ctrla |= ATC_SRC_WIDTH(reg_width);
698 ctrlb |= ATC_DST_ADDR_MODE_INCR
699 | ATC_SRC_ADDR_MODE_FIXED
700 | ATC_FC_PER2MEM;
701
702 reg = atslave->rx_reg;
703 for_each_sg(sgl, sg, sg_len, i) {
704 struct at_desc *desc;
705 u32 len;
706 u32 mem;
707
708 desc = atc_desc_get(atchan);
709 if (!desc)
710 goto err_desc_get;
711
712 mem = sg_phys(sg);
713 len = sg_dma_len(sg);
714 mem_width = 2;
715 if (unlikely(mem & 3 || len & 3))
716 mem_width = 0;
717
718 desc->lli.saddr = reg;
719 desc->lli.daddr = mem;
720 desc->lli.ctrla = ctrla
721 | ATC_DST_WIDTH(mem_width)
722 | len >> mem_width;
723 desc->lli.ctrlb = ctrlb;
724
725 if (!first) {
726 first = desc;
727 } else {
728 /* inform the HW lli about chaining */
729 prev->lli.dscr = desc->txd.phys;
730 /* insert the link descriptor to the LD ring */
731 list_add_tail(&desc->desc_node,
732 &first->txd.tx_list);
733 }
734 prev = desc;
735 total_len += len;
736 }
737 break;
738 default:
739 return NULL;
740 }
741
742 /* set end-of-link to the last link descriptor of list*/
743 set_desc_eol(prev);
744
745 /* First descriptor of the chain embedds additional information */
746 first->txd.cookie = -EBUSY;
747 first->len = total_len;
748
749 /* last link descriptor of list is responsible of flags */
750 prev->txd.flags = flags; /* client is in control of this ack */
751
752 return &first->txd;
753
754err_desc_get:
755 dev_err(chan2dev(chan), "not enough descriptors available\n");
756 atc_desc_put(atchan, first);
757 return NULL;
758}
759
760static void atc_terminate_all(struct dma_chan *chan)
761{
762 struct at_dma_chan *atchan = to_at_dma_chan(chan);
763 struct at_dma *atdma = to_at_dma(chan->device);
764 struct at_desc *desc, *_desc;
765 LIST_HEAD(list);
766
767 /*
768 * This is only called when something went wrong elsewhere, so
769 * we don't really care about the data. Just disable the
770 * channel. We still have to poll the channel enable bit due
771 * to AHB/HSB limitations.
772 */
773 spin_lock_bh(&atchan->lock);
774
775 dma_writel(atdma, CHDR, atchan->mask);
776
777 /* confirm that this channel is disabled */
778 while (dma_readl(atdma, CHSR) & atchan->mask)
779 cpu_relax();
780
781 /* active_list entries will end up before queued entries */
782 list_splice_init(&atchan->queue, &list);
783 list_splice_init(&atchan->active_list, &list);
784
785 spin_unlock_bh(&atchan->lock);
786
787 /* Flush all pending and queued descriptors */
788 list_for_each_entry_safe(desc, _desc, &list, desc_node)
789 atc_chain_complete(atchan, desc);
790}
791
792/**
793 * atc_is_tx_complete - poll for transaction completion
794 * @chan: DMA channel
795 * @cookie: transaction identifier to check status of
796 * @done: if not %NULL, updated with last completed transaction
797 * @used: if not %NULL, updated with last used transaction
798 *
799 * If @done and @used are passed in, upon return they reflect the driver
800 * internal state and can be used with dma_async_is_complete() to check
801 * the status of multiple cookies without re-checking hardware state.
802 */
803static enum dma_status
804atc_is_tx_complete(struct dma_chan *chan,
805 dma_cookie_t cookie,
806 dma_cookie_t *done, dma_cookie_t *used)
807{
808 struct at_dma_chan *atchan = to_at_dma_chan(chan);
809 dma_cookie_t last_used;
810 dma_cookie_t last_complete;
811 enum dma_status ret;
812
813 dev_vdbg(chan2dev(chan), "is_tx_complete: %d (d%d, u%d)\n",
814 cookie, done ? *done : 0, used ? *used : 0);
815
816 spin_lock_bh(atchan->lock);
817
818 last_complete = atchan->completed_cookie;
819 last_used = chan->cookie;
820
821 ret = dma_async_is_complete(cookie, last_complete, last_used);
822 if (ret != DMA_SUCCESS) {
823 atc_cleanup_descriptors(atchan);
824
825 last_complete = atchan->completed_cookie;
826 last_used = chan->cookie;
827
828 ret = dma_async_is_complete(cookie, last_complete, last_used);
829 }
830
831 spin_unlock_bh(atchan->lock);
832
833 if (done)
834 *done = last_complete;
835 if (used)
836 *used = last_used;
837
838 return ret;
839}
840
841/**
842 * atc_issue_pending - try to finish work
843 * @chan: target DMA channel
844 */
845static void atc_issue_pending(struct dma_chan *chan)
846{
847 struct at_dma_chan *atchan = to_at_dma_chan(chan);
848
849 dev_vdbg(chan2dev(chan), "issue_pending\n");
850
851 if (!atc_chan_is_enabled(atchan)) {
852 spin_lock_bh(&atchan->lock);
853 atc_advance_work(atchan);
854 spin_unlock_bh(&atchan->lock);
855 }
856}
857
858/**
859 * atc_alloc_chan_resources - allocate resources for DMA channel
860 * @chan: allocate descriptor resources for this channel
861 * @client: current client requesting the channel be ready for requests
862 *
863 * return - the number of allocated descriptors
864 */
865static int atc_alloc_chan_resources(struct dma_chan *chan)
866{
867 struct at_dma_chan *atchan = to_at_dma_chan(chan);
868 struct at_dma *atdma = to_at_dma(chan->device);
869 struct at_desc *desc;
870 struct at_dma_slave *atslave;
871 int i;
872 u32 cfg;
873 LIST_HEAD(tmp_list);
874
875 dev_vdbg(chan2dev(chan), "alloc_chan_resources\n");
876
877 /* ASSERT: channel is idle */
878 if (atc_chan_is_enabled(atchan)) {
879 dev_dbg(chan2dev(chan), "DMA channel not idle ?\n");
880 return -EIO;
881 }
882
883 cfg = ATC_DEFAULT_CFG;
884
885 atslave = chan->private;
886 if (atslave) {
887 /*
888 * We need controller-specific data to set up slave
889 * transfers.
890 */
891 BUG_ON(!atslave->dma_dev || atslave->dma_dev != atdma->dma_common.dev);
892
893 /* if cfg configuration specified take it instad of default */
894 if (atslave->cfg)
895 cfg = atslave->cfg;
896 }
897
898 /* have we already been set up?
899 * reconfigure channel but no need to reallocate descriptors */
900 if (!list_empty(&atchan->free_list))
901 return atchan->descs_allocated;
902
903 /* Allocate initial pool of descriptors */
904 for (i = 0; i < init_nr_desc_per_channel; i++) {
905 desc = atc_alloc_descriptor(chan, GFP_KERNEL);
906 if (!desc) {
907 dev_err(atdma->dma_common.dev,
908 "Only %d initial descriptors\n", i);
909 break;
910 }
911 list_add_tail(&desc->desc_node, &tmp_list);
912 }
913
914 spin_lock_bh(&atchan->lock);
915 atchan->descs_allocated = i;
916 list_splice(&tmp_list, &atchan->free_list);
917 atchan->completed_cookie = chan->cookie = 1;
918 spin_unlock_bh(&atchan->lock);
919
920 /* channel parameters */
921 channel_writel(atchan, CFG, cfg);
922
923 dev_dbg(chan2dev(chan),
924 "alloc_chan_resources: allocated %d descriptors\n",
925 atchan->descs_allocated);
926
927 return atchan->descs_allocated;
928}
929
930/**
931 * atc_free_chan_resources - free all channel resources
932 * @chan: DMA channel
933 */
934static void atc_free_chan_resources(struct dma_chan *chan)
935{
936 struct at_dma_chan *atchan = to_at_dma_chan(chan);
937 struct at_dma *atdma = to_at_dma(chan->device);
938 struct at_desc *desc, *_desc;
939 LIST_HEAD(list);
940
941 dev_dbg(chan2dev(chan), "free_chan_resources: (descs allocated=%u)\n",
942 atchan->descs_allocated);
943
944 /* ASSERT: channel is idle */
945 BUG_ON(!list_empty(&atchan->active_list));
946 BUG_ON(!list_empty(&atchan->queue));
947 BUG_ON(atc_chan_is_enabled(atchan));
948
949 list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) {
950 dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc);
951 list_del(&desc->desc_node);
952 /* free link descriptor */
953 dma_pool_free(atdma->dma_desc_pool, desc, desc->txd.phys);
954 }
955 list_splice_init(&atchan->free_list, &list);
956 atchan->descs_allocated = 0;
957
958 dev_vdbg(chan2dev(chan), "free_chan_resources: done\n");
959}
960
961
962/*-- Module Management -----------------------------------------------*/
963
964/**
965 * at_dma_off - disable DMA controller
966 * @atdma: the Atmel HDAMC device
967 */
968static void at_dma_off(struct at_dma *atdma)
969{
970 dma_writel(atdma, EN, 0);
971
972 /* disable all interrupts */
973 dma_writel(atdma, EBCIDR, -1L);
974
975 /* confirm that all channels are disabled */
976 while (dma_readl(atdma, CHSR) & atdma->all_chan_mask)
977 cpu_relax();
978}
979
980static int __init at_dma_probe(struct platform_device *pdev)
981{
982 struct at_dma_platform_data *pdata;
983 struct resource *io;
984 struct at_dma *atdma;
985 size_t size;
986 int irq;
987 int err;
988 int i;
989
990 /* get DMA Controller parameters from platform */
991 pdata = pdev->dev.platform_data;
992 if (!pdata || pdata->nr_channels > AT_DMA_MAX_NR_CHANNELS)
993 return -EINVAL;
994
995 io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
996 if (!io)
997 return -EINVAL;
998
999 irq = platform_get_irq(pdev, 0);
1000 if (irq < 0)
1001 return irq;
1002
1003 size = sizeof(struct at_dma);
1004 size += pdata->nr_channels * sizeof(struct at_dma_chan);
1005 atdma = kzalloc(size, GFP_KERNEL);
1006 if (!atdma)
1007 return -ENOMEM;
1008
1009 /* discover transaction capabilites from the platform data */
1010 atdma->dma_common.cap_mask = pdata->cap_mask;
1011 atdma->all_chan_mask = (1 << pdata->nr_channels) - 1;
1012
1013 size = io->end - io->start + 1;
1014 if (!request_mem_region(io->start, size, pdev->dev.driver->name)) {
1015 err = -EBUSY;
1016 goto err_kfree;
1017 }
1018
1019 atdma->regs = ioremap(io->start, size);
1020 if (!atdma->regs) {
1021 err = -ENOMEM;
1022 goto err_release_r;
1023 }
1024
1025 atdma->clk = clk_get(&pdev->dev, "dma_clk");
1026 if (IS_ERR(atdma->clk)) {
1027 err = PTR_ERR(atdma->clk);
1028 goto err_clk;
1029 }
1030 clk_enable(atdma->clk);
1031
1032 /* force dma off, just in case */
1033 at_dma_off(atdma);
1034
1035 err = request_irq(irq, at_dma_interrupt, 0, "at_hdmac", atdma);
1036 if (err)
1037 goto err_irq;
1038
1039 platform_set_drvdata(pdev, atdma);
1040
1041 /* create a pool of consistent memory blocks for hardware descriptors */
1042 atdma->dma_desc_pool = dma_pool_create("at_hdmac_desc_pool",
1043 &pdev->dev, sizeof(struct at_desc),
1044 4 /* word alignment */, 0);
1045 if (!atdma->dma_desc_pool) {
1046 dev_err(&pdev->dev, "No memory for descriptors dma pool\n");
1047 err = -ENOMEM;
1048 goto err_pool_create;
1049 }
1050
1051 /* clear any pending interrupt */
1052 while (dma_readl(atdma, EBCISR))
1053 cpu_relax();
1054
1055 /* initialize channels related values */
1056 INIT_LIST_HEAD(&atdma->dma_common.channels);
1057 for (i = 0; i < pdata->nr_channels; i++, atdma->dma_common.chancnt++) {
1058 struct at_dma_chan *atchan = &atdma->chan[i];
1059
1060 atchan->chan_common.device = &atdma->dma_common;
1061 atchan->chan_common.cookie = atchan->completed_cookie = 1;
1062 atchan->chan_common.chan_id = i;
1063 list_add_tail(&atchan->chan_common.device_node,
1064 &atdma->dma_common.channels);
1065
1066 atchan->ch_regs = atdma->regs + ch_regs(i);
1067 spin_lock_init(&atchan->lock);
1068 atchan->mask = 1 << i;
1069
1070 INIT_LIST_HEAD(&atchan->active_list);
1071 INIT_LIST_HEAD(&atchan->queue);
1072 INIT_LIST_HEAD(&atchan->free_list);
1073
1074 tasklet_init(&atchan->tasklet, atc_tasklet,
1075 (unsigned long)atchan);
1076 atc_enable_irq(atchan);
1077 }
1078
1079 /* set base routines */
1080 atdma->dma_common.device_alloc_chan_resources = atc_alloc_chan_resources;
1081 atdma->dma_common.device_free_chan_resources = atc_free_chan_resources;
1082 atdma->dma_common.device_is_tx_complete = atc_is_tx_complete;
1083 atdma->dma_common.device_issue_pending = atc_issue_pending;
1084 atdma->dma_common.dev = &pdev->dev;
1085
1086 /* set prep routines based on capability */
1087 if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask))
1088 atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy;
1089
1090 if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask)) {
1091 atdma->dma_common.device_prep_slave_sg = atc_prep_slave_sg;
1092 atdma->dma_common.device_terminate_all = atc_terminate_all;
1093 }
1094
1095 dma_writel(atdma, EN, AT_DMA_ENABLE);
1096
1097 dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n",
1098 dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "",
1099 dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? "slave " : "",
1100 atdma->dma_common.chancnt);
1101
1102 dma_async_device_register(&atdma->dma_common);
1103
1104 return 0;
1105
1106err_pool_create:
1107 platform_set_drvdata(pdev, NULL);
1108 free_irq(platform_get_irq(pdev, 0), atdma);
1109err_irq:
1110 clk_disable(atdma->clk);
1111 clk_put(atdma->clk);
1112err_clk:
1113 iounmap(atdma->regs);
1114 atdma->regs = NULL;
1115err_release_r:
1116 release_mem_region(io->start, size);
1117err_kfree:
1118 kfree(atdma);
1119 return err;
1120}
1121
1122static int __exit at_dma_remove(struct platform_device *pdev)
1123{
1124 struct at_dma *atdma = platform_get_drvdata(pdev);
1125 struct dma_chan *chan, *_chan;
1126 struct resource *io;
1127
1128 at_dma_off(atdma);
1129 dma_async_device_unregister(&atdma->dma_common);
1130
1131 dma_pool_destroy(atdma->dma_desc_pool);
1132 platform_set_drvdata(pdev, NULL);
1133 free_irq(platform_get_irq(pdev, 0), atdma);
1134
1135 list_for_each_entry_safe(chan, _chan, &atdma->dma_common.channels,
1136 device_node) {
1137 struct at_dma_chan *atchan = to_at_dma_chan(chan);
1138
1139 /* Disable interrupts */
1140 atc_disable_irq(atchan);
1141 tasklet_disable(&atchan->tasklet);
1142
1143 tasklet_kill(&atchan->tasklet);
1144 list_del(&chan->device_node);
1145 }
1146
1147 clk_disable(atdma->clk);
1148 clk_put(atdma->clk);
1149
1150 iounmap(atdma->regs);
1151 atdma->regs = NULL;
1152
1153 io = platform_get_resource(pdev, IORESOURCE_MEM, 0);
1154 release_mem_region(io->start, io->end - io->start + 1);
1155
1156 kfree(atdma);
1157
1158 return 0;
1159}
1160
1161static void at_dma_shutdown(struct platform_device *pdev)
1162{
1163 struct at_dma *atdma = platform_get_drvdata(pdev);
1164
1165 at_dma_off(platform_get_drvdata(pdev));
1166 clk_disable(atdma->clk);
1167}
1168
1169static int at_dma_suspend_late(struct platform_device *pdev, pm_message_t mesg)
1170{
1171 struct at_dma *atdma = platform_get_drvdata(pdev);
1172
1173 at_dma_off(platform_get_drvdata(pdev));
1174 clk_disable(atdma->clk);
1175 return 0;
1176}
1177
1178static int at_dma_resume_early(struct platform_device *pdev)
1179{
1180 struct at_dma *atdma = platform_get_drvdata(pdev);
1181
1182 clk_enable(atdma->clk);
1183 dma_writel(atdma, EN, AT_DMA_ENABLE);
1184 return 0;
1185
1186}
1187
1188static struct platform_driver at_dma_driver = {
1189 .remove = __exit_p(at_dma_remove),
1190 .shutdown = at_dma_shutdown,
1191 .suspend_late = at_dma_suspend_late,
1192 .resume_early = at_dma_resume_early,
1193 .driver = {
1194 .name = "at_hdmac",
1195 },
1196};
1197
1198static int __init at_dma_init(void)
1199{
1200 return platform_driver_probe(&at_dma_driver, at_dma_probe);
1201}
1202module_init(at_dma_init);
1203
1204static void __exit at_dma_exit(void)
1205{
1206 platform_driver_unregister(&at_dma_driver);
1207}
1208module_exit(at_dma_exit);
1209
1210MODULE_DESCRIPTION("Atmel AHB DMA Controller driver");
1211MODULE_AUTHOR("Nicolas Ferre <nicolas.ferre@atmel.com>");
1212MODULE_LICENSE("GPL");
1213MODULE_ALIAS("platform:at_hdmac");
diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h
new file mode 100644
index 000000000000..4c972afc49ec
--- /dev/null
+++ b/drivers/dma/at_hdmac_regs.h
@@ -0,0 +1,353 @@
1/*
2 * Header file for the Atmel AHB DMA Controller driver
3 *
4 * Copyright (C) 2008 Atmel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 */
11#ifndef AT_HDMAC_REGS_H
12#define AT_HDMAC_REGS_H
13
14#include <mach/at_hdmac.h>
15
16#define AT_DMA_MAX_NR_CHANNELS 8
17
18
19#define AT_DMA_GCFG 0x00 /* Global Configuration Register */
20#define AT_DMA_IF_BIGEND(i) (0x1 << (i)) /* AHB-Lite Interface i in Big-endian mode */
21#define AT_DMA_ARB_CFG (0x1 << 4) /* Arbiter mode. */
22#define AT_DMA_ARB_CFG_FIXED (0x0 << 4)
23#define AT_DMA_ARB_CFG_ROUND_ROBIN (0x1 << 4)
24
25#define AT_DMA_EN 0x04 /* Controller Enable Register */
26#define AT_DMA_ENABLE (0x1 << 0)
27
28#define AT_DMA_SREQ 0x08 /* Software Single Request Register */
29#define AT_DMA_SSREQ(x) (0x1 << ((x) << 1)) /* Request a source single transfer on channel x */
30#define AT_DMA_DSREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination single transfer on channel x */
31
32#define AT_DMA_CREQ 0x0C /* Software Chunk Transfer Request Register */
33#define AT_DMA_SCREQ(x) (0x1 << ((x) << 1)) /* Request a source chunk transfer on channel x */
34#define AT_DMA_DCREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination chunk transfer on channel x */
35
36#define AT_DMA_LAST 0x10 /* Software Last Transfer Flag Register */
37#define AT_DMA_SLAST(x) (0x1 << ((x) << 1)) /* This src rq is last tx of buffer on channel x */
38#define AT_DMA_DLAST(x) (0x1 << (1 + ((x) << 1))) /* This dst rq is last tx of buffer on channel x */
39
40#define AT_DMA_SYNC 0x14 /* Request Synchronization Register */
41#define AT_DMA_SYR(h) (0x1 << (h)) /* Synchronize handshake line h */
42
43/* Error, Chained Buffer transfer completed and Buffer transfer completed Interrupt registers */
44#define AT_DMA_EBCIER 0x18 /* Enable register */
45#define AT_DMA_EBCIDR 0x1C /* Disable register */
46#define AT_DMA_EBCIMR 0x20 /* Mask Register */
47#define AT_DMA_EBCISR 0x24 /* Status Register */
48#define AT_DMA_CBTC_OFFSET 8
49#define AT_DMA_ERR_OFFSET 16
50#define AT_DMA_BTC(x) (0x1 << (x))
51#define AT_DMA_CBTC(x) (0x1 << (AT_DMA_CBTC_OFFSET + (x)))
52#define AT_DMA_ERR(x) (0x1 << (AT_DMA_ERR_OFFSET + (x)))
53
54#define AT_DMA_CHER 0x28 /* Channel Handler Enable Register */
55#define AT_DMA_ENA(x) (0x1 << (x))
56#define AT_DMA_SUSP(x) (0x1 << ( 8 + (x)))
57#define AT_DMA_KEEP(x) (0x1 << (24 + (x)))
58
59#define AT_DMA_CHDR 0x2C /* Channel Handler Disable Register */
60#define AT_DMA_DIS(x) (0x1 << (x))
61#define AT_DMA_RES(x) (0x1 << ( 8 + (x)))
62
63#define AT_DMA_CHSR 0x30 /* Channel Handler Status Register */
64#define AT_DMA_EMPT(x) (0x1 << (16 + (x)))
65#define AT_DMA_STAL(x) (0x1 << (24 + (x)))
66
67
68#define AT_DMA_CH_REGS_BASE 0x3C /* Channel registers base address */
69#define ch_regs(x) (AT_DMA_CH_REGS_BASE + (x) * 0x28) /* Channel x base addr */
70
71/* Hardware register offset for each channel */
72#define ATC_SADDR_OFFSET 0x00 /* Source Address Register */
73#define ATC_DADDR_OFFSET 0x04 /* Destination Address Register */
74#define ATC_DSCR_OFFSET 0x08 /* Descriptor Address Register */
75#define ATC_CTRLA_OFFSET 0x0C /* Control A Register */
76#define ATC_CTRLB_OFFSET 0x10 /* Control B Register */
77#define ATC_CFG_OFFSET 0x14 /* Configuration Register */
78#define ATC_SPIP_OFFSET 0x18 /* Src PIP Configuration Register */
79#define ATC_DPIP_OFFSET 0x1C /* Dst PIP Configuration Register */
80
81
82/* Bitfield definitions */
83
84/* Bitfields in DSCR */
85#define ATC_DSCR_IF(i) (0x3 & (i)) /* Dsc feched via AHB-Lite Interface i */
86
87/* Bitfields in CTRLA */
88#define ATC_BTSIZE_MAX 0xFFFFUL /* Maximum Buffer Transfer Size */
89#define ATC_BTSIZE(x) (ATC_BTSIZE_MAX & (x)) /* Buffer Transfer Size */
90/* Chunck Tranfer size definitions are in at_hdmac.h */
91#define ATC_SRC_WIDTH_MASK (0x3 << 24) /* Source Single Transfer Size */
92#define ATC_SRC_WIDTH(x) ((x) << 24)
93#define ATC_SRC_WIDTH_BYTE (0x0 << 24)
94#define ATC_SRC_WIDTH_HALFWORD (0x1 << 24)
95#define ATC_SRC_WIDTH_WORD (0x2 << 24)
96#define ATC_DST_WIDTH_MASK (0x3 << 28) /* Destination Single Transfer Size */
97#define ATC_DST_WIDTH(x) ((x) << 28)
98#define ATC_DST_WIDTH_BYTE (0x0 << 28)
99#define ATC_DST_WIDTH_HALFWORD (0x1 << 28)
100#define ATC_DST_WIDTH_WORD (0x2 << 28)
101#define ATC_DONE (0x1 << 31) /* Tx Done (only written back in descriptor) */
102
103/* Bitfields in CTRLB */
104#define ATC_SIF(i) (0x3 & (i)) /* Src tx done via AHB-Lite Interface i */
105#define ATC_DIF(i) ((0x3 & (i)) << 4) /* Dst tx done via AHB-Lite Interface i */
106#define ATC_SRC_PIP (0x1 << 8) /* Source Picture-in-Picture enabled */
107#define ATC_DST_PIP (0x1 << 12) /* Destination Picture-in-Picture enabled */
108#define ATC_SRC_DSCR_DIS (0x1 << 16) /* Src Descriptor fetch disable */
109#define ATC_DST_DSCR_DIS (0x1 << 20) /* Dst Descriptor fetch disable */
110#define ATC_FC_MASK (0x7 << 21) /* Choose Flow Controller */
111#define ATC_FC_MEM2MEM (0x0 << 21) /* Mem-to-Mem (DMA) */
112#define ATC_FC_MEM2PER (0x1 << 21) /* Mem-to-Periph (DMA) */
113#define ATC_FC_PER2MEM (0x2 << 21) /* Periph-to-Mem (DMA) */
114#define ATC_FC_PER2PER (0x3 << 21) /* Periph-to-Periph (DMA) */
115#define ATC_FC_PER2MEM_PER (0x4 << 21) /* Periph-to-Mem (Peripheral) */
116#define ATC_FC_MEM2PER_PER (0x5 << 21) /* Mem-to-Periph (Peripheral) */
117#define ATC_FC_PER2PER_SRCPER (0x6 << 21) /* Periph-to-Periph (Src Peripheral) */
118#define ATC_FC_PER2PER_DSTPER (0x7 << 21) /* Periph-to-Periph (Dst Peripheral) */
119#define ATC_SRC_ADDR_MODE_MASK (0x3 << 24)
120#define ATC_SRC_ADDR_MODE_INCR (0x0 << 24) /* Incrementing Mode */
121#define ATC_SRC_ADDR_MODE_DECR (0x1 << 24) /* Decrementing Mode */
122#define ATC_SRC_ADDR_MODE_FIXED (0x2 << 24) /* Fixed Mode */
123#define ATC_DST_ADDR_MODE_MASK (0x3 << 28)
124#define ATC_DST_ADDR_MODE_INCR (0x0 << 28) /* Incrementing Mode */
125#define ATC_DST_ADDR_MODE_DECR (0x1 << 28) /* Decrementing Mode */
126#define ATC_DST_ADDR_MODE_FIXED (0x2 << 28) /* Fixed Mode */
127#define ATC_IEN (0x1 << 30) /* BTC interrupt enable (active low) */
128#define ATC_AUTO (0x1 << 31) /* Auto multiple buffer tx enable */
129
130/* Bitfields in CFG */
131/* are in at_hdmac.h */
132
133/* Bitfields in SPIP */
134#define ATC_SPIP_HOLE(x) (0xFFFFU & (x))
135#define ATC_SPIP_BOUNDARY(x) ((0x3FF & (x)) << 16)
136
137/* Bitfields in DPIP */
138#define ATC_DPIP_HOLE(x) (0xFFFFU & (x))
139#define ATC_DPIP_BOUNDARY(x) ((0x3FF & (x)) << 16)
140
141
142/*-- descriptors -----------------------------------------------------*/
143
144/* LLI == Linked List Item; aka DMA buffer descriptor */
145struct at_lli {
146 /* values that are not changed by hardware */
147 dma_addr_t saddr;
148 dma_addr_t daddr;
149 /* value that may get written back: */
150 u32 ctrla;
151 /* more values that are not changed by hardware */
152 u32 ctrlb;
153 dma_addr_t dscr; /* chain to next lli */
154};
155
156/**
157 * struct at_desc - software descriptor
158 * @at_lli: hardware lli structure
159 * @txd: support for the async_tx api
160 * @desc_node: node on the channed descriptors list
161 * @len: total transaction bytecount
162 */
163struct at_desc {
164 /* FIRST values the hardware uses */
165 struct at_lli lli;
166
167 /* THEN values for driver housekeeping */
168 struct dma_async_tx_descriptor txd;
169 struct list_head desc_node;
170 size_t len;
171};
172
173static inline struct at_desc *
174txd_to_at_desc(struct dma_async_tx_descriptor *txd)
175{
176 return container_of(txd, struct at_desc, txd);
177}
178
179
180/*-- Channels --------------------------------------------------------*/
181
182/**
183 * struct at_dma_chan - internal representation of an Atmel HDMAC channel
184 * @chan_common: common dmaengine channel object members
185 * @device: parent device
186 * @ch_regs: memory mapped register base
187 * @mask: channel index in a mask
188 * @error_status: transmit error status information from irq handler
189 * to tasklet (use atomic operations)
190 * @tasklet: bottom half to finish transaction work
191 * @lock: serializes enqueue/dequeue operations to descriptors lists
192 * @completed_cookie: identifier for the most recently completed operation
193 * @active_list: list of descriptors dmaengine is being running on
194 * @queue: list of descriptors ready to be submitted to engine
195 * @free_list: list of descriptors usable by the channel
196 * @descs_allocated: records the actual size of the descriptor pool
197 */
198struct at_dma_chan {
199 struct dma_chan chan_common;
200 struct at_dma *device;
201 void __iomem *ch_regs;
202 u8 mask;
203 unsigned long error_status;
204 struct tasklet_struct tasklet;
205
206 spinlock_t lock;
207
208 /* these other elements are all protected by lock */
209 dma_cookie_t completed_cookie;
210 struct list_head active_list;
211 struct list_head queue;
212 struct list_head free_list;
213 unsigned int descs_allocated;
214};
215
216#define channel_readl(atchan, name) \
217 __raw_readl((atchan)->ch_regs + ATC_##name##_OFFSET)
218
219#define channel_writel(atchan, name, val) \
220 __raw_writel((val), (atchan)->ch_regs + ATC_##name##_OFFSET)
221
222static inline struct at_dma_chan *to_at_dma_chan(struct dma_chan *dchan)
223{
224 return container_of(dchan, struct at_dma_chan, chan_common);
225}
226
227
228/*-- Controller ------------------------------------------------------*/
229
230/**
231 * struct at_dma - internal representation of an Atmel HDMA Controller
232 * @chan_common: common dmaengine dma_device object members
233 * @ch_regs: memory mapped register base
234 * @clk: dma controller clock
235 * @all_chan_mask: all channels availlable in a mask
236 * @dma_desc_pool: base of DMA descriptor region (DMA address)
237 * @chan: channels table to store at_dma_chan structures
238 */
239struct at_dma {
240 struct dma_device dma_common;
241 void __iomem *regs;
242 struct clk *clk;
243
244 u8 all_chan_mask;
245
246 struct dma_pool *dma_desc_pool;
247 /* AT THE END channels table */
248 struct at_dma_chan chan[0];
249};
250
251#define dma_readl(atdma, name) \
252 __raw_readl((atdma)->regs + AT_DMA_##name)
253#define dma_writel(atdma, name, val) \
254 __raw_writel((val), (atdma)->regs + AT_DMA_##name)
255
256static inline struct at_dma *to_at_dma(struct dma_device *ddev)
257{
258 return container_of(ddev, struct at_dma, dma_common);
259}
260
261
262/*-- Helper functions ------------------------------------------------*/
263
264static struct device *chan2dev(struct dma_chan *chan)
265{
266 return &chan->dev->device;
267}
268static struct device *chan2parent(struct dma_chan *chan)
269{
270 return chan->dev->device.parent;
271}
272
273#if defined(VERBOSE_DEBUG)
274static void vdbg_dump_regs(struct at_dma_chan *atchan)
275{
276 struct at_dma *atdma = to_at_dma(atchan->chan_common.device);
277
278 dev_err(chan2dev(&atchan->chan_common),
279 " channel %d : imr = 0x%x, chsr = 0x%x\n",
280 atchan->chan_common.chan_id,
281 dma_readl(atdma, EBCIMR),
282 dma_readl(atdma, CHSR));
283
284 dev_err(chan2dev(&atchan->chan_common),
285 " channel: s0x%x d0x%x ctrl0x%x:0x%x cfg0x%x l0x%x\n",
286 channel_readl(atchan, SADDR),
287 channel_readl(atchan, DADDR),
288 channel_readl(atchan, CTRLA),
289 channel_readl(atchan, CTRLB),
290 channel_readl(atchan, CFG),
291 channel_readl(atchan, DSCR));
292}
293#else
294static void vdbg_dump_regs(struct at_dma_chan *atchan) {}
295#endif
296
297static void atc_dump_lli(struct at_dma_chan *atchan, struct at_lli *lli)
298{
299 dev_printk(KERN_CRIT, chan2dev(&atchan->chan_common),
300 " desc: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n",
301 lli->saddr, lli->daddr,
302 lli->ctrla, lli->ctrlb, lli->dscr);
303}
304
305
306static void atc_setup_irq(struct at_dma_chan *atchan, int on)
307{
308 struct at_dma *atdma = to_at_dma(atchan->chan_common.device);
309 u32 ebci;
310
311 /* enable interrupts on buffer chain completion & error */
312 ebci = AT_DMA_CBTC(atchan->chan_common.chan_id)
313 | AT_DMA_ERR(atchan->chan_common.chan_id);
314 if (on)
315 dma_writel(atdma, EBCIER, ebci);
316 else
317 dma_writel(atdma, EBCIDR, ebci);
318}
319
320static inline void atc_enable_irq(struct at_dma_chan *atchan)
321{
322 atc_setup_irq(atchan, 1);
323}
324
325static inline void atc_disable_irq(struct at_dma_chan *atchan)
326{
327 atc_setup_irq(atchan, 0);
328}
329
330
331/**
332 * atc_chan_is_enabled - test if given channel is enabled
333 * @atchan: channel we want to test status
334 */
335static inline int atc_chan_is_enabled(struct at_dma_chan *atchan)
336{
337 struct at_dma *atdma = to_at_dma(atchan->chan_common.device);
338
339 return !!(dma_readl(atdma, CHSR) & atchan->mask);
340}
341
342
343/**
344 * set_desc_eol - set end-of-link to descriptor so it will end transfer
345 * @desc: descriptor, signle or at the end of a chain, to end chain on
346 */
347static void set_desc_eol(struct at_desc *desc)
348{
349 desc->lli.ctrlb |= ATC_SRC_DSCR_DIS | ATC_DST_DSCR_DIS;
350 desc->lli.dscr = 0;
351}
352
353#endif /* AT_HDMAC_REGS_H */
diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c
index fb7da5141e96..d93017fc7872 100644
--- a/drivers/dma/dmatest.c
+++ b/drivers/dma/dmatest.c
@@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO);
38MODULE_PARM_DESC(max_channels, 38MODULE_PARM_DESC(max_channels,
39 "Maximum number of channels to use (default: all)"); 39 "Maximum number of channels to use (default: all)");
40 40
41static unsigned int iterations;
42module_param(iterations, uint, S_IRUGO);
43MODULE_PARM_DESC(iterations,
44 "Iterations before stopping test (default: infinite)");
45
41static unsigned int xor_sources = 3; 46static unsigned int xor_sources = 3;
42module_param(xor_sources, uint, S_IRUGO); 47module_param(xor_sources, uint, S_IRUGO);
43MODULE_PARM_DESC(xor_sources, 48MODULE_PARM_DESC(xor_sources,
@@ -114,7 +119,7 @@ static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len)
114 buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); 119 buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
115 for ( ; i < start + len; i++) 120 for ( ; i < start + len; i++)
116 buf[i] = PATTERN_SRC | PATTERN_COPY 121 buf[i] = PATTERN_SRC | PATTERN_COPY
117 | (~i & PATTERN_COUNT_MASK);; 122 | (~i & PATTERN_COUNT_MASK);
118 for ( ; i < test_buf_size; i++) 123 for ( ; i < test_buf_size; i++)
119 buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); 124 buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK);
120 buf++; 125 buf++;
@@ -270,7 +275,8 @@ static int dmatest_func(void *data)
270 275
271 flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT; 276 flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT;
272 277
273 while (!kthread_should_stop()) { 278 while (!kthread_should_stop()
279 && !(iterations && total_tests >= iterations)) {
274 struct dma_device *dev = chan->device; 280 struct dma_device *dev = chan->device;
275 struct dma_async_tx_descriptor *tx = NULL; 281 struct dma_async_tx_descriptor *tx = NULL;
276 dma_addr_t dma_srcs[src_cnt]; 282 dma_addr_t dma_srcs[src_cnt];
@@ -416,6 +422,13 @@ err_srcbuf:
416err_srcs: 422err_srcs:
417 pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", 423 pr_notice("%s: terminating after %u tests, %u failures (status %d)\n",
418 thread_name, total_tests, failed_tests, ret); 424 thread_name, total_tests, failed_tests, ret);
425
426 if (iterations > 0)
427 while (!kthread_should_stop()) {
428 DECLARE_WAIT_QUEUE_HEAD(wait_dmatest_exit);
429 interruptible_sleep_on(&wait_dmatest_exit);
430 }
431
419 return ret; 432 return ret;
420} 433}
421 434
@@ -495,11 +508,11 @@ static int dmatest_add_channel(struct dma_chan *chan)
495 508
496 if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { 509 if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) {
497 cnt = dmatest_add_threads(dtc, DMA_MEMCPY); 510 cnt = dmatest_add_threads(dtc, DMA_MEMCPY);
498 thread_count += cnt > 0 ?: 0; 511 thread_count += cnt > 0 ? cnt : 0;
499 } 512 }
500 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { 513 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
501 cnt = dmatest_add_threads(dtc, DMA_XOR); 514 cnt = dmatest_add_threads(dtc, DMA_XOR);
502 thread_count += cnt > 0 ?: 0; 515 thread_count += cnt > 0 ? cnt : 0;
503 } 516 }
504 517
505 pr_info("dmatest: Started %u threads using %s\n", 518 pr_info("dmatest: Started %u threads using %s\n",
diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c
index f18d1bde0439..ef87a8984145 100644
--- a/drivers/dma/fsldma.c
+++ b/drivers/dma/fsldma.c
@@ -12,6 +12,11 @@
12 * also fit for MPC8560, MPC8555, MPC8548, MPC8641, and etc. 12 * also fit for MPC8560, MPC8555, MPC8548, MPC8641, and etc.
13 * The support for MPC8349 DMA contorller is also added. 13 * The support for MPC8349 DMA contorller is also added.
14 * 14 *
15 * This driver instructs the DMA controller to issue the PCI Read Multiple
16 * command for PCI read operations, instead of using the default PCI Read Line
17 * command. Please be aware that this setting may result in read pre-fetching
18 * on some platforms.
19 *
15 * This is free software; you can redistribute it and/or modify 20 * This is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by 21 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2 of the License, or 22 * the Free Software Foundation; either version 2 of the License, or
@@ -49,9 +54,10 @@ static void dma_init(struct fsl_dma_chan *fsl_chan)
49 case FSL_DMA_IP_83XX: 54 case FSL_DMA_IP_83XX:
50 /* Set the channel to below modes: 55 /* Set the channel to below modes:
51 * EOTIE - End-of-transfer interrupt enable 56 * EOTIE - End-of-transfer interrupt enable
57 * PRC_RM - PCI read multiple
52 */ 58 */
53 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE, 59 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE
54 32); 60 | FSL_DMA_MR_PRC_RM, 32);
55 break; 61 break;
56 } 62 }
57 63
@@ -136,15 +142,16 @@ static int dma_is_idle(struct fsl_dma_chan *fsl_chan)
136 142
137static void dma_start(struct fsl_dma_chan *fsl_chan) 143static void dma_start(struct fsl_dma_chan *fsl_chan)
138{ 144{
139 u32 mr_set = 0;; 145 u32 mr_set = 0;
140 146
141 if (fsl_chan->feature & FSL_DMA_CHAN_PAUSE_EXT) { 147 if (fsl_chan->feature & FSL_DMA_CHAN_PAUSE_EXT) {
142 DMA_OUT(fsl_chan, &fsl_chan->reg_base->bcr, 0, 32); 148 DMA_OUT(fsl_chan, &fsl_chan->reg_base->bcr, 0, 32);
143 mr_set |= FSL_DMA_MR_EMP_EN; 149 mr_set |= FSL_DMA_MR_EMP_EN;
144 } else 150 } else if ((fsl_chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) {
145 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, 151 DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr,
146 DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) 152 DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32)
147 & ~FSL_DMA_MR_EMP_EN, 32); 153 & ~FSL_DMA_MR_EMP_EN, 32);
154 }
148 155
149 if (fsl_chan->feature & FSL_DMA_CHAN_START_EXT) 156 if (fsl_chan->feature & FSL_DMA_CHAN_START_EXT)
150 mr_set |= FSL_DMA_MR_EMS_EN; 157 mr_set |= FSL_DMA_MR_EMS_EN;
@@ -871,9 +878,9 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev,
871 878
872 switch (new_fsl_chan->feature & FSL_DMA_IP_MASK) { 879 switch (new_fsl_chan->feature & FSL_DMA_IP_MASK) {
873 case FSL_DMA_IP_85XX: 880 case FSL_DMA_IP_85XX:
874 new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start;
875 new_fsl_chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; 881 new_fsl_chan->toggle_ext_pause = fsl_chan_toggle_ext_pause;
876 case FSL_DMA_IP_83XX: 882 case FSL_DMA_IP_83XX:
883 new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start;
877 new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; 884 new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size;
878 new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; 885 new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size;
879 } 886 }
diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h
index 4f21a512d848..dc7f26865797 100644
--- a/drivers/dma/fsldma.h
+++ b/drivers/dma/fsldma.h
@@ -38,6 +38,7 @@
38 38
39/* Special MR definition for MPC8349 */ 39/* Special MR definition for MPC8349 */
40#define FSL_DMA_MR_EOTIE 0x00000080 40#define FSL_DMA_MR_EOTIE 0x00000080
41#define FSL_DMA_MR_PRC_RM 0x00000800
41 42
42#define FSL_DMA_SR_CH 0x00000020 43#define FSL_DMA_SR_CH 0x00000020
43#define FSL_DMA_SR_PE 0x00000010 44#define FSL_DMA_SR_PE 0x00000010
diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c
index ddab94f51224..3f23eabe09f2 100644
--- a/drivers/dma/mv_xor.c
+++ b/drivers/dma/mv_xor.c
@@ -1176,7 +1176,7 @@ static int __devinit mv_xor_probe(struct platform_device *pdev)
1176 if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) 1176 if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask))
1177 dma_dev->device_prep_dma_memset = mv_xor_prep_dma_memset; 1177 dma_dev->device_prep_dma_memset = mv_xor_prep_dma_memset;
1178 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { 1178 if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) {
1179 dma_dev->max_xor = 8; ; 1179 dma_dev->max_xor = 8;
1180 dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor; 1180 dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor;
1181 } 1181 }
1182 1182
diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c
index 24964c1d0af9..e2a10bcba7a1 100644
--- a/drivers/edac/amd64_edac.c
+++ b/drivers/edac/amd64_edac.c
@@ -868,6 +868,8 @@ static void amd64_read_dbam_reg(struct amd64_pvt *pvt)
868 goto err_reg; 868 goto err_reg;
869 } 869 }
870 870
871 return;
872
871err_reg: 873err_reg:
872 debugf0("Error reading F2x%03x.\n", reg); 874 debugf0("Error reading F2x%03x.\n", reg);
873} 875}
@@ -2634,6 +2636,8 @@ static void amd64_read_mc_registers(struct amd64_pvt *pvt)
2634 2636
2635 amd64_dump_misc_regs(pvt); 2637 amd64_dump_misc_regs(pvt);
2636 2638
2639 return;
2640
2637err_reg: 2641err_reg:
2638 debugf0("Reading an MC register failed\n"); 2642 debugf0("Reading an MC register failed\n");
2639 2643
@@ -2977,6 +2981,9 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt)
2977 "ECC is enabled by BIOS, Proceeding " 2981 "ECC is enabled by BIOS, Proceeding "
2978 "with EDAC module initialization\n"); 2982 "with EDAC module initialization\n");
2979 2983
2984 /* Signal good ECC status */
2985 ret = 0;
2986
2980 /* CLEAR the override, since BIOS controlled it */ 2987 /* CLEAR the override, since BIOS controlled it */
2981 ecc_enable_override = 0; 2988 ecc_enable_override = 0;
2982 } 2989 }
diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c
index 8fab7890a363..33be210d6723 100644
--- a/drivers/gpu/drm/drm_crtc.c
+++ b/drivers/gpu/drm/drm_crtc.c
@@ -1461,7 +1461,7 @@ int drm_mode_setcrtc(struct drm_device *dev, void *data,
1461 goto out; 1461 goto out;
1462 } 1462 }
1463 1463
1464 if (crtc_req->count_connectors > 0 && !mode && !fb) { 1464 if (crtc_req->count_connectors > 0 && (!mode || !fb)) {
1465 DRM_DEBUG("Count connectors is %d but no mode or fb set\n", 1465 DRM_DEBUG("Count connectors is %d but no mode or fb set\n",
1466 crtc_req->count_connectors); 1466 crtc_req->count_connectors);
1467 ret = -EINVAL; 1467 ret = -EINVAL;
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
index 3da9cfa31848..6aaa2cb23365 100644
--- a/drivers/gpu/drm/drm_crtc_helper.c
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -706,8 +706,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
706 struct drm_encoder **save_encoders, *new_encoder; 706 struct drm_encoder **save_encoders, *new_encoder;
707 struct drm_framebuffer *old_fb = NULL; 707 struct drm_framebuffer *old_fb = NULL;
708 bool save_enabled; 708 bool save_enabled;
709 bool mode_changed = false; 709 bool mode_changed = false; /* if true do a full mode set */
710 bool fb_changed = false; 710 bool fb_changed = false; /* if true and !mode_changed just do a flip */
711 struct drm_connector *connector; 711 struct drm_connector *connector;
712 int count = 0, ro, fail = 0; 712 int count = 0, ro, fail = 0;
713 struct drm_crtc_helper_funcs *crtc_funcs; 713 struct drm_crtc_helper_funcs *crtc_funcs;
@@ -758,6 +758,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set)
758 if (set->crtc->fb == NULL) { 758 if (set->crtc->fb == NULL) {
759 DRM_DEBUG("crtc has no fb, full mode set\n"); 759 DRM_DEBUG("crtc has no fb, full mode set\n");
760 mode_changed = true; 760 mode_changed = true;
761 } else if (set->fb == NULL) {
762 mode_changed = true;
761 } else if ((set->fb->bits_per_pixel != 763 } else if ((set->fb->bits_per_pixel !=
762 set->crtc->fb->bits_per_pixel) || 764 set->crtc->fb->bits_per_pixel) ||
763 set->fb->depth != set->crtc->fb->depth) 765 set->fb->depth != set->crtc->fb->depth)
diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c
index 05a44896dffb..f1ba8ff41130 100644
--- a/drivers/gpu/drm/radeon/r100.c
+++ b/drivers/gpu/drm/radeon/r100.c
@@ -722,13 +722,14 @@ int r100_cs_packet_parse(struct radeon_cs_parser *p,
722 unsigned idx) 722 unsigned idx)
723{ 723{
724 struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; 724 struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx];
725 uint32_t header = ib_chunk->kdata[idx]; 725 uint32_t header;
726 726
727 if (idx >= ib_chunk->length_dw) { 727 if (idx >= ib_chunk->length_dw) {
728 DRM_ERROR("Can not parse packet at %d after CS end %d !\n", 728 DRM_ERROR("Can not parse packet at %d after CS end %d !\n",
729 idx, ib_chunk->length_dw); 729 idx, ib_chunk->length_dw);
730 return -EINVAL; 730 return -EINVAL;
731 } 731 }
732 header = ib_chunk->kdata[idx];
732 pkt->idx = idx; 733 pkt->idx = idx;
733 pkt->type = CP_PACKET_GET_TYPE(header); 734 pkt->type = CP_PACKET_GET_TYPE(header);
734 pkt->count = CP_PACKET_GET_COUNT(header); 735 pkt->count = CP_PACKET_GET_COUNT(header);
diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c
index 3cfcee17dc56..0bd5879a4957 100644
--- a/drivers/gpu/drm/radeon/radeon_drv.c
+++ b/drivers/gpu/drm/radeon/radeon_drv.c
@@ -318,6 +318,14 @@ static int __init radeon_init(void)
318 driver = &driver_old; 318 driver = &driver_old;
319 driver->num_ioctls = radeon_max_ioctl; 319 driver->num_ioctls = radeon_max_ioctl;
320#if defined(CONFIG_DRM_RADEON_KMS) 320#if defined(CONFIG_DRM_RADEON_KMS)
321#ifdef CONFIG_VGA_CONSOLE
322 if (vgacon_text_force() && radeon_modeset == -1) {
323 DRM_INFO("VGACON disable radeon kernel modesetting.\n");
324 driver = &driver_old;
325 driver->driver_features &= ~DRIVER_MODESET;
326 radeon_modeset = 0;
327 }
328#endif
321 /* if enabled by default */ 329 /* if enabled by default */
322 if (radeon_modeset == -1) { 330 if (radeon_modeset == -1) {
323 DRM_INFO("radeon default to kernel modesetting.\n"); 331 DRM_INFO("radeon default to kernel modesetting.\n");
@@ -329,17 +337,8 @@ static int __init radeon_init(void)
329 driver->driver_features |= DRIVER_MODESET; 337 driver->driver_features |= DRIVER_MODESET;
330 driver->num_ioctls = radeon_max_kms_ioctl; 338 driver->num_ioctls = radeon_max_kms_ioctl;
331 } 339 }
332
333 /* if the vga console setting is enabled still 340 /* if the vga console setting is enabled still
334 * let modprobe override it */ 341 * let modprobe override it */
335#ifdef CONFIG_VGA_CONSOLE
336 if (vgacon_text_force() && radeon_modeset == -1) {
337 DRM_INFO("VGACON disable radeon kernel modesetting.\n");
338 driver = &driver_old;
339 driver->driver_features &= ~DRIVER_MODESET;
340 radeon_modeset = 0;
341 }
342#endif
343#endif 342#endif
344 return drm_init(driver); 343 return drm_init(driver);
345} 344}
diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c
index 937a2f1cdb46..3357110e30ce 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -58,6 +58,8 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags)
58 if (r) { 58 if (r) {
59 DRM_ERROR("Failed to initialize radeon, disabling IOCTL\n"); 59 DRM_ERROR("Failed to initialize radeon, disabling IOCTL\n");
60 radeon_device_fini(rdev); 60 radeon_device_fini(rdev);
61 kfree(rdev);
62 dev->dev_private = NULL;
61 return r; 63 return r;
62 } 64 }
63 return 0; 65 return 0;
diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c
index 551e608702e4..fd8f3ca716ea 100644
--- a/drivers/gpu/drm/radeon/rv515.c
+++ b/drivers/gpu/drm/radeon/rv515.c
@@ -370,6 +370,7 @@ void rv515_vram_info(struct radeon_device *rdev)
370 370
371 rv515_vram_get_type(rdev); 371 rv515_vram_get_type(rdev);
372 372
373 r100_vram_init_sizes(rdev);
373 /* FIXME: we should enforce default clock in case GPU is not in 374 /* FIXME: we should enforce default clock in case GPU is not in
374 * default setup 375 * default setup
375 */ 376 */
diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c
index 6538d4236989..c2b0d710d10f 100644
--- a/drivers/gpu/drm/ttm/ttm_bo.c
+++ b/drivers/gpu/drm/ttm/ttm_bo.c
@@ -1182,13 +1182,14 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev,
1182 1182
1183int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type) 1183int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type)
1184{ 1184{
1185 struct ttm_mem_type_manager *man = &bdev->man[mem_type]; 1185 struct ttm_mem_type_manager *man;
1186 int ret = -EINVAL; 1186 int ret = -EINVAL;
1187 1187
1188 if (mem_type >= TTM_NUM_MEM_TYPES) { 1188 if (mem_type >= TTM_NUM_MEM_TYPES) {
1189 printk(KERN_ERR TTM_PFX "Illegal memory type %d\n", mem_type); 1189 printk(KERN_ERR TTM_PFX "Illegal memory type %d\n", mem_type);
1190 return ret; 1190 return ret;
1191 } 1191 }
1192 man = &bdev->man[mem_type];
1192 1193
1193 if (!man->has_type) { 1194 if (!man->has_type) {
1194 printk(KERN_ERR TTM_PFX "Trying to take down uninitialized " 1195 printk(KERN_ERR TTM_PFX "Trying to take down uninitialized "
@@ -1575,6 +1576,10 @@ int ttm_bo_wait(struct ttm_buffer_object *bo,
1575 driver->sync_obj_unref(&sync_obj); 1576 driver->sync_obj_unref(&sync_obj);
1576 driver->sync_obj_unref(&tmp_obj); 1577 driver->sync_obj_unref(&tmp_obj);
1577 spin_lock(&bo->lock); 1578 spin_lock(&bo->lock);
1579 } else {
1580 spin_unlock(&bo->lock);
1581 driver->sync_obj_unref(&sync_obj);
1582 spin_lock(&bo->lock);
1578 } 1583 }
1579 } 1584 }
1580 return 0; 1585 return 0;
diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c
index ce2e6f38ea01..ad4ada07c6cf 100644
--- a/drivers/gpu/drm/ttm/ttm_bo_util.c
+++ b/drivers/gpu/drm/ttm/ttm_bo_util.c
@@ -150,7 +150,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
150#ifdef CONFIG_X86 150#ifdef CONFIG_X86
151 dst = kmap_atomic_prot(d, KM_USER0, prot); 151 dst = kmap_atomic_prot(d, KM_USER0, prot);
152#else 152#else
153 if (prot != PAGE_KERNEL) 153 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
154 dst = vmap(&d, 1, 0, prot); 154 dst = vmap(&d, 1, 0, prot);
155 else 155 else
156 dst = kmap(d); 156 dst = kmap(d);
@@ -163,7 +163,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src,
163#ifdef CONFIG_X86 163#ifdef CONFIG_X86
164 kunmap_atomic(dst, KM_USER0); 164 kunmap_atomic(dst, KM_USER0);
165#else 165#else
166 if (prot != PAGE_KERNEL) 166 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
167 vunmap(dst); 167 vunmap(dst);
168 else 168 else
169 kunmap(d); 169 kunmap(d);
@@ -186,7 +186,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst,
186#ifdef CONFIG_X86 186#ifdef CONFIG_X86
187 src = kmap_atomic_prot(s, KM_USER0, prot); 187 src = kmap_atomic_prot(s, KM_USER0, prot);
188#else 188#else
189 if (prot != PAGE_KERNEL) 189 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
190 src = vmap(&s, 1, 0, prot); 190 src = vmap(&s, 1, 0, prot);
191 else 191 else
192 src = kmap(s); 192 src = kmap(s);
@@ -199,7 +199,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst,
199#ifdef CONFIG_X86 199#ifdef CONFIG_X86
200 kunmap_atomic(src, KM_USER0); 200 kunmap_atomic(src, KM_USER0);
201#else 201#else
202 if (prot != PAGE_KERNEL) 202 if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL))
203 vunmap(src); 203 vunmap(src);
204 else 204 else
205 kunmap(s); 205 kunmap(s);
diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c
index b587e2d576ac..820e51673b26 100644
--- a/drivers/input/serio/hp_sdc_mlc.c
+++ b/drivers/input/serio/hp_sdc_mlc.c
@@ -296,7 +296,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc)
296 priv->tseq[3] = 0; 296 priv->tseq[3] = 0;
297 if (mlc->opacket & HIL_CTRL_APE) { 297 if (mlc->opacket & HIL_CTRL_APE) {
298 priv->tseq[3] |= HP_SDC_LPC_APE_IPF; 298 priv->tseq[3] |= HP_SDC_LPC_APE_IPF;
299 down_trylock(&mlc->csem); 299 BUG_ON(down_trylock(&mlc->csem));
300 } 300 }
301 enqueue: 301 enqueue:
302 hp_sdc_enqueue_transaction(&priv->trans); 302 hp_sdc_enqueue_transaction(&priv->trans);
diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c
index c3b661a666cb..7e5f30dbc0a0 100644
--- a/drivers/isdn/mISDN/l1oip_core.c
+++ b/drivers/isdn/mISDN/l1oip_core.c
@@ -1480,7 +1480,7 @@ l1oip_init(void)
1480 return -ENOMEM; 1480 return -ENOMEM;
1481 1481
1482 l1oip_cnt = 0; 1482 l1oip_cnt = 0;
1483 while (type[l1oip_cnt] && l1oip_cnt < MAX_CARDS) { 1483 while (l1oip_cnt < MAX_CARDS && type[l1oip_cnt]) {
1484 switch (type[l1oip_cnt] & 0xff) { 1484 switch (type[l1oip_cnt] & 0xff) {
1485 case 1: 1485 case 1:
1486 pri = 0; 1486 pri = 0;
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index a6974e9b8ebf..1e2cb846b3c9 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -1,6 +1,8 @@
1/*P:400 This contains run_guest() which actually calls into the Host<->Guest 1/*P:400
2 * This contains run_guest() which actually calls into the Host<->Guest
2 * Switcher and analyzes the return, such as determining if the Guest wants the 3 * Switcher and analyzes the return, such as determining if the Guest wants the
3 * Host to do something. This file also contains useful helper routines. :*/ 4 * Host to do something. This file also contains useful helper routines.
5:*/
4#include <linux/module.h> 6#include <linux/module.h>
5#include <linux/stringify.h> 7#include <linux/stringify.h>
6#include <linux/stddef.h> 8#include <linux/stddef.h>
@@ -24,7 +26,8 @@ static struct page **switcher_page;
24/* This One Big lock protects all inter-guest data structures. */ 26/* This One Big lock protects all inter-guest data structures. */
25DEFINE_MUTEX(lguest_lock); 27DEFINE_MUTEX(lguest_lock);
26 28
27/*H:010 We need to set up the Switcher at a high virtual address. Remember the 29/*H:010
30 * We need to set up the Switcher at a high virtual address. Remember the
28 * Switcher is a few hundred bytes of assembler code which actually changes the 31 * Switcher is a few hundred bytes of assembler code which actually changes the
29 * CPU to run the Guest, and then changes back to the Host when a trap or 32 * CPU to run the Guest, and then changes back to the Host when a trap or
30 * interrupt happens. 33 * interrupt happens.
@@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock);
33 * Host since it will be running as the switchover occurs. 36 * Host since it will be running as the switchover occurs.
34 * 37 *
35 * Trying to map memory at a particular address is an unusual thing to do, so 38 * Trying to map memory at a particular address is an unusual thing to do, so
36 * it's not a simple one-liner. */ 39 * it's not a simple one-liner.
40 */
37static __init int map_switcher(void) 41static __init int map_switcher(void)
38{ 42{
39 int i, err; 43 int i, err;
@@ -47,8 +51,10 @@ static __init int map_switcher(void)
47 * easy. 51 * easy.
48 */ 52 */
49 53
50 /* We allocate an array of struct page pointers. map_vm_area() wants 54 /*
51 * this, rather than just an array of pages. */ 55 * We allocate an array of struct page pointers. map_vm_area() wants
56 * this, rather than just an array of pages.
57 */
52 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, 58 switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES,
53 GFP_KERNEL); 59 GFP_KERNEL);
54 if (!switcher_page) { 60 if (!switcher_page) {
@@ -56,8 +62,10 @@ static __init int map_switcher(void)
56 goto out; 62 goto out;
57 } 63 }
58 64
59 /* Now we actually allocate the pages. The Guest will see these pages, 65 /*
60 * so we make sure they're zeroed. */ 66 * Now we actually allocate the pages. The Guest will see these pages,
67 * so we make sure they're zeroed.
68 */
61 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { 69 for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) {
62 unsigned long addr = get_zeroed_page(GFP_KERNEL); 70 unsigned long addr = get_zeroed_page(GFP_KERNEL);
63 if (!addr) { 71 if (!addr) {
@@ -67,19 +75,23 @@ static __init int map_switcher(void)
67 switcher_page[i] = virt_to_page(addr); 75 switcher_page[i] = virt_to_page(addr);
68 } 76 }
69 77
70 /* First we check that the Switcher won't overlap the fixmap area at 78 /*
79 * First we check that the Switcher won't overlap the fixmap area at
71 * the top of memory. It's currently nowhere near, but it could have 80 * the top of memory. It's currently nowhere near, but it could have
72 * very strange effects if it ever happened. */ 81 * very strange effects if it ever happened.
82 */
73 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ 83 if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){
74 err = -ENOMEM; 84 err = -ENOMEM;
75 printk("lguest: mapping switcher would thwack fixmap\n"); 85 printk("lguest: mapping switcher would thwack fixmap\n");
76 goto free_pages; 86 goto free_pages;
77 } 87 }
78 88
79 /* Now we reserve the "virtual memory area" we want: 0xFFC00000 89 /*
90 * Now we reserve the "virtual memory area" we want: 0xFFC00000
80 * (SWITCHER_ADDR). We might not get it in theory, but in practice 91 * (SWITCHER_ADDR). We might not get it in theory, but in practice
81 * it's worked so far. The end address needs +1 because __get_vm_area 92 * it's worked so far. The end address needs +1 because __get_vm_area
82 * allocates an extra guard page, so we need space for that. */ 93 * allocates an extra guard page, so we need space for that.
94 */
83 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, 95 switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
84 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR 96 VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR
85 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); 97 + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
@@ -89,11 +101,13 @@ static __init int map_switcher(void)
89 goto free_pages; 101 goto free_pages;
90 } 102 }
91 103
92 /* This code actually sets up the pages we've allocated to appear at 104 /*
105 * This code actually sets up the pages we've allocated to appear at
93 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the 106 * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the
94 * kind of pages we're mapping (kernel pages), and a pointer to our 107 * kind of pages we're mapping (kernel pages), and a pointer to our
95 * array of struct pages. It increments that pointer, but we don't 108 * array of struct pages. It increments that pointer, but we don't
96 * care. */ 109 * care.
110 */
97 pagep = switcher_page; 111 pagep = switcher_page;
98 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); 112 err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep);
99 if (err) { 113 if (err) {
@@ -101,8 +115,10 @@ static __init int map_switcher(void)
101 goto free_vma; 115 goto free_vma;
102 } 116 }
103 117
104 /* Now the Switcher is mapped at the right address, we can't fail! 118 /*
105 * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ 119 * Now the Switcher is mapped at the right address, we can't fail!
120 * Copy in the compiled-in Switcher code (from <arch>_switcher.S).
121 */
106 memcpy(switcher_vma->addr, start_switcher_text, 122 memcpy(switcher_vma->addr, start_switcher_text,
107 end_switcher_text - start_switcher_text); 123 end_switcher_text - start_switcher_text);
108 124
@@ -124,8 +140,7 @@ out:
124} 140}
125/*:*/ 141/*:*/
126 142
127/* Cleaning up the mapping when the module is unloaded is almost... 143/* Cleaning up the mapping when the module is unloaded is almost... too easy. */
128 * too easy. */
129static void unmap_switcher(void) 144static void unmap_switcher(void)
130{ 145{
131 unsigned int i; 146 unsigned int i;
@@ -151,16 +166,19 @@ static void unmap_switcher(void)
151 * But we can't trust the Guest: it might be trying to access the Launcher 166 * But we can't trust the Guest: it might be trying to access the Launcher
152 * code. We have to check that the range is below the pfn_limit the Launcher 167 * code. We have to check that the range is below the pfn_limit the Launcher
153 * gave us. We have to make sure that addr + len doesn't give us a false 168 * gave us. We have to make sure that addr + len doesn't give us a false
154 * positive by overflowing, too. */ 169 * positive by overflowing, too.
170 */
155bool lguest_address_ok(const struct lguest *lg, 171bool lguest_address_ok(const struct lguest *lg,
156 unsigned long addr, unsigned long len) 172 unsigned long addr, unsigned long len)
157{ 173{
158 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); 174 return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr);
159} 175}
160 176
161/* This routine copies memory from the Guest. Here we can see how useful the 177/*
178 * This routine copies memory from the Guest. Here we can see how useful the
162 * kill_lguest() routine we met in the Launcher can be: we return a random 179 * kill_lguest() routine we met in the Launcher can be: we return a random
163 * value (all zeroes) instead of needing to return an error. */ 180 * value (all zeroes) instead of needing to return an error.
181 */
164void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) 182void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes)
165{ 183{
166 if (!lguest_address_ok(cpu->lg, addr, bytes) 184 if (!lguest_address_ok(cpu->lg, addr, bytes)
@@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
181} 199}
182/*:*/ 200/*:*/
183 201
184/*H:030 Let's jump straight to the the main loop which runs the Guest. 202/*H:030
203 * Let's jump straight to the the main loop which runs the Guest.
185 * Remember, this is called by the Launcher reading /dev/lguest, and we keep 204 * Remember, this is called by the Launcher reading /dev/lguest, and we keep
186 * going around and around until something interesting happens. */ 205 * going around and around until something interesting happens.
206 */
187int run_guest(struct lg_cpu *cpu, unsigned long __user *user) 207int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
188{ 208{
189 /* We stop running once the Guest is dead. */ 209 /* We stop running once the Guest is dead. */
@@ -195,10 +215,17 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
195 if (cpu->hcall) 215 if (cpu->hcall)
196 do_hypercalls(cpu); 216 do_hypercalls(cpu);
197 217
198 /* It's possible the Guest did a NOTIFY hypercall to the 218 /*
199 * Launcher, in which case we return from the read() now. */ 219 * It's possible the Guest did a NOTIFY hypercall to the
220 * Launcher.
221 */
200 if (cpu->pending_notify) { 222 if (cpu->pending_notify) {
223 /*
224 * Does it just needs to write to a registered
225 * eventfd (ie. the appropriate virtqueue thread)?
226 */
201 if (!send_notify_to_eventfd(cpu)) { 227 if (!send_notify_to_eventfd(cpu)) {
228 /* OK, we tell the main Laucher. */
202 if (put_user(cpu->pending_notify, user)) 229 if (put_user(cpu->pending_notify, user))
203 return -EFAULT; 230 return -EFAULT;
204 return sizeof(cpu->pending_notify); 231 return sizeof(cpu->pending_notify);
@@ -209,29 +236,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
209 if (signal_pending(current)) 236 if (signal_pending(current))
210 return -ERESTARTSYS; 237 return -ERESTARTSYS;
211 238
212 /* Check if there are any interrupts which can be delivered now: 239 /*
240 * Check if there are any interrupts which can be delivered now:
213 * if so, this sets up the hander to be executed when we next 241 * if so, this sets up the hander to be executed when we next
214 * run the Guest. */ 242 * run the Guest.
243 */
215 irq = interrupt_pending(cpu, &more); 244 irq = interrupt_pending(cpu, &more);
216 if (irq < LGUEST_IRQS) 245 if (irq < LGUEST_IRQS)
217 try_deliver_interrupt(cpu, irq, more); 246 try_deliver_interrupt(cpu, irq, more);
218 247
219 /* All long-lived kernel loops need to check with this horrible 248 /*
249 * All long-lived kernel loops need to check with this horrible
220 * thing called the freezer. If the Host is trying to suspend, 250 * thing called the freezer. If the Host is trying to suspend,
221 * it stops us. */ 251 * it stops us.
252 */
222 try_to_freeze(); 253 try_to_freeze();
223 254
224 /* Just make absolutely sure the Guest is still alive. One of 255 /*
225 * those hypercalls could have been fatal, for example. */ 256 * Just make absolutely sure the Guest is still alive. One of
257 * those hypercalls could have been fatal, for example.
258 */
226 if (cpu->lg->dead) 259 if (cpu->lg->dead)
227 break; 260 break;
228 261
229 /* If the Guest asked to be stopped, we sleep. The Guest's 262 /*
230 * clock timer will wake us. */ 263 * If the Guest asked to be stopped, we sleep. The Guest's
264 * clock timer will wake us.
265 */
231 if (cpu->halted) { 266 if (cpu->halted) {
232 set_current_state(TASK_INTERRUPTIBLE); 267 set_current_state(TASK_INTERRUPTIBLE);
233 /* Just before we sleep, make sure no interrupt snuck in 268 /*
234 * which we should be doing. */ 269 * Just before we sleep, make sure no interrupt snuck in
270 * which we should be doing.
271 */
235 if (interrupt_pending(cpu, &more) < LGUEST_IRQS) 272 if (interrupt_pending(cpu, &more) < LGUEST_IRQS)
236 set_current_state(TASK_RUNNING); 273 set_current_state(TASK_RUNNING);
237 else 274 else
@@ -239,8 +276,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
239 continue; 276 continue;
240 } 277 }
241 278
242 /* OK, now we're ready to jump into the Guest. First we put up 279 /*
243 * the "Do Not Disturb" sign: */ 280 * OK, now we're ready to jump into the Guest. First we put up
281 * the "Do Not Disturb" sign:
282 */
244 local_irq_disable(); 283 local_irq_disable();
245 284
246 /* Actually run the Guest until something happens. */ 285 /* Actually run the Guest until something happens. */
@@ -327,8 +366,10 @@ static void __exit fini(void)
327} 366}
328/*:*/ 367/*:*/
329 368
330/* The Host side of lguest can be a module. This is a nice way for people to 369/*
331 * play with it. */ 370 * The Host side of lguest can be a module. This is a nice way for people to
371 * play with it.
372 */
332module_init(init); 373module_init(init);
333module_exit(fini); 374module_exit(fini);
334MODULE_LICENSE("GPL"); 375MODULE_LICENSE("GPL");
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index c29ffa19cb74..83511eb0923d 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -1,8 +1,10 @@
1/*P:500 Just as userspace programs request kernel operations through a system 1/*P:500
2 * Just as userspace programs request kernel operations through a system
2 * call, the Guest requests Host operations through a "hypercall". You might 3 * call, the Guest requests Host operations through a "hypercall". You might
3 * notice this nomenclature doesn't really follow any logic, but the name has 4 * notice this nomenclature doesn't really follow any logic, but the name has
4 * been around for long enough that we're stuck with it. As you'd expect, this 5 * been around for long enough that we're stuck with it. As you'd expect, this
5 * code is basically a one big switch statement. :*/ 6 * code is basically a one big switch statement.
7:*/
6 8
7/* Copyright (C) 2006 Rusty Russell IBM Corporation 9/* Copyright (C) 2006 Rusty Russell IBM Corporation
8 10
@@ -28,30 +30,41 @@
28#include <asm/pgtable.h> 30#include <asm/pgtable.h>
29#include "lg.h" 31#include "lg.h"
30 32
31/*H:120 This is the core hypercall routine: where the Guest gets what it wants. 33/*H:120
32 * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ 34 * This is the core hypercall routine: where the Guest gets what it wants.
35 * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both.
36 */
33static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) 37static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
34{ 38{
35 switch (args->arg0) { 39 switch (args->arg0) {
36 case LHCALL_FLUSH_ASYNC: 40 case LHCALL_FLUSH_ASYNC:
37 /* This call does nothing, except by breaking out of the Guest 41 /*
38 * it makes us process all the asynchronous hypercalls. */ 42 * This call does nothing, except by breaking out of the Guest
43 * it makes us process all the asynchronous hypercalls.
44 */
39 break; 45 break;
40 case LHCALL_SEND_INTERRUPTS: 46 case LHCALL_SEND_INTERRUPTS:
41 /* This call does nothing too, but by breaking out of the Guest 47 /*
42 * it makes us process any pending interrupts. */ 48 * This call does nothing too, but by breaking out of the Guest
49 * it makes us process any pending interrupts.
50 */
43 break; 51 break;
44 case LHCALL_LGUEST_INIT: 52 case LHCALL_LGUEST_INIT:
45 /* You can't get here unless you're already initialized. Don't 53 /*
46 * do that. */ 54 * You can't get here unless you're already initialized. Don't
55 * do that.
56 */
47 kill_guest(cpu, "already have lguest_data"); 57 kill_guest(cpu, "already have lguest_data");
48 break; 58 break;
49 case LHCALL_SHUTDOWN: { 59 case LHCALL_SHUTDOWN: {
50 /* Shutdown is such a trivial hypercall that we do it in four
51 * lines right here. */
52 char msg[128]; 60 char msg[128];
53 /* If the lgread fails, it will call kill_guest() itself; the 61 /*
54 * kill_guest() with the message will be ignored. */ 62 * Shutdown is such a trivial hypercall that we do it in five
63 * lines right here.
64 *
65 * If the lgread fails, it will call kill_guest() itself; the
66 * kill_guest() with the message will be ignored.
67 */
55 __lgread(cpu, msg, args->arg1, sizeof(msg)); 68 __lgread(cpu, msg, args->arg1, sizeof(msg));
56 msg[sizeof(msg)-1] = '\0'; 69 msg[sizeof(msg)-1] = '\0';
57 kill_guest(cpu, "CRASH: %s", msg); 70 kill_guest(cpu, "CRASH: %s", msg);
@@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
60 break; 73 break;
61 } 74 }
62 case LHCALL_FLUSH_TLB: 75 case LHCALL_FLUSH_TLB:
63 /* FLUSH_TLB comes in two flavors, depending on the 76 /* FLUSH_TLB comes in two flavors, depending on the argument: */
64 * argument: */
65 if (args->arg1) 77 if (args->arg1)
66 guest_pagetable_clear_all(cpu); 78 guest_pagetable_clear_all(cpu);
67 else 79 else
68 guest_pagetable_flush_user(cpu); 80 guest_pagetable_flush_user(cpu);
69 break; 81 break;
70 82
71 /* All these calls simply pass the arguments through to the right 83 /*
72 * routines. */ 84 * All these calls simply pass the arguments through to the right
85 * routines.
86 */
73 case LHCALL_NEW_PGTABLE: 87 case LHCALL_NEW_PGTABLE:
74 guest_new_pagetable(cpu, args->arg1); 88 guest_new_pagetable(cpu, args->arg1);
75 break; 89 break;
@@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
112 kill_guest(cpu, "Bad hypercall %li\n", args->arg0); 126 kill_guest(cpu, "Bad hypercall %li\n", args->arg0);
113 } 127 }
114} 128}
115/*:*/
116 129
117/*H:124 Asynchronous hypercalls are easy: we just look in the array in the 130/*H:124
131 * Asynchronous hypercalls are easy: we just look in the array in the
118 * Guest's "struct lguest_data" to see if any new ones are marked "ready". 132 * Guest's "struct lguest_data" to see if any new ones are marked "ready".
119 * 133 *
120 * We are careful to do these in order: obviously we respect the order the 134 * We are careful to do these in order: obviously we respect the order the
121 * Guest put them in the ring, but we also promise the Guest that they will 135 * Guest put them in the ring, but we also promise the Guest that they will
122 * happen before any normal hypercall (which is why we check this before 136 * happen before any normal hypercall (which is why we check this before
123 * checking for a normal hcall). */ 137 * checking for a normal hcall).
138 */
124static void do_async_hcalls(struct lg_cpu *cpu) 139static void do_async_hcalls(struct lg_cpu *cpu)
125{ 140{
126 unsigned int i; 141 unsigned int i;
@@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu)
133 /* We process "struct lguest_data"s hcalls[] ring once. */ 148 /* We process "struct lguest_data"s hcalls[] ring once. */
134 for (i = 0; i < ARRAY_SIZE(st); i++) { 149 for (i = 0; i < ARRAY_SIZE(st); i++) {
135 struct hcall_args args; 150 struct hcall_args args;
136 /* We remember where we were up to from last time. This makes 151 /*
152 * We remember where we were up to from last time. This makes
137 * sure that the hypercalls are done in the order the Guest 153 * sure that the hypercalls are done in the order the Guest
138 * places them in the ring. */ 154 * places them in the ring.
155 */
139 unsigned int n = cpu->next_hcall; 156 unsigned int n = cpu->next_hcall;
140 157
141 /* 0xFF means there's no call here (yet). */ 158 /* 0xFF means there's no call here (yet). */
142 if (st[n] == 0xFF) 159 if (st[n] == 0xFF)
143 break; 160 break;
144 161
145 /* OK, we have hypercall. Increment the "next_hcall" cursor, 162 /*
146 * and wrap back to 0 if we reach the end. */ 163 * OK, we have hypercall. Increment the "next_hcall" cursor,
164 * and wrap back to 0 if we reach the end.
165 */
147 if (++cpu->next_hcall == LHCALL_RING_SIZE) 166 if (++cpu->next_hcall == LHCALL_RING_SIZE)
148 cpu->next_hcall = 0; 167 cpu->next_hcall = 0;
149 168
150 /* Copy the hypercall arguments into a local copy of 169 /*
151 * the hcall_args struct. */ 170 * Copy the hypercall arguments into a local copy of the
171 * hcall_args struct.
172 */
152 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], 173 if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n],
153 sizeof(struct hcall_args))) { 174 sizeof(struct hcall_args))) {
154 kill_guest(cpu, "Fetching async hypercalls"); 175 kill_guest(cpu, "Fetching async hypercalls");
@@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu)
164 break; 185 break;
165 } 186 }
166 187
167 /* Stop doing hypercalls if they want to notify the Launcher: 188 /*
168 * it needs to service this first. */ 189 * Stop doing hypercalls if they want to notify the Launcher:
190 * it needs to service this first.
191 */
169 if (cpu->pending_notify) 192 if (cpu->pending_notify)
170 break; 193 break;
171 } 194 }
172} 195}
173 196
174/* Last of all, we look at what happens first of all. The very first time the 197/*
175 * Guest makes a hypercall, we end up here to set things up: */ 198 * Last of all, we look at what happens first of all. The very first time the
199 * Guest makes a hypercall, we end up here to set things up:
200 */
176static void initialize(struct lg_cpu *cpu) 201static void initialize(struct lg_cpu *cpu)
177{ 202{
178 /* You can't do anything until you're initialized. The Guest knows the 203 /*
179 * rules, so we're unforgiving here. */ 204 * You can't do anything until you're initialized. The Guest knows the
205 * rules, so we're unforgiving here.
206 */
180 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { 207 if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) {
181 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); 208 kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0);
182 return; 209 return;
@@ -185,32 +212,44 @@ static void initialize(struct lg_cpu *cpu)
185 if (lguest_arch_init_hypercalls(cpu)) 212 if (lguest_arch_init_hypercalls(cpu))
186 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 213 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
187 214
188 /* The Guest tells us where we're not to deliver interrupts by putting 215 /*
189 * the range of addresses into "struct lguest_data". */ 216 * The Guest tells us where we're not to deliver interrupts by putting
217 * the range of addresses into "struct lguest_data".
218 */
190 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) 219 if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start)
191 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) 220 || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end))
192 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 221 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
193 222
194 /* We write the current time into the Guest's data page once so it can 223 /*
195 * set its clock. */ 224 * We write the current time into the Guest's data page once so it can
225 * set its clock.
226 */
196 write_timestamp(cpu); 227 write_timestamp(cpu);
197 228
198 /* page_tables.c will also do some setup. */ 229 /* page_tables.c will also do some setup. */
199 page_table_guest_data_init(cpu); 230 page_table_guest_data_init(cpu);
200 231
201 /* This is the one case where the above accesses might have been the 232 /*
233 * This is the one case where the above accesses might have been the
202 * first write to a Guest page. This may have caused a copy-on-write 234 * first write to a Guest page. This may have caused a copy-on-write
203 * fault, but the old page might be (read-only) in the Guest 235 * fault, but the old page might be (read-only) in the Guest
204 * pagetable. */ 236 * pagetable.
237 */
205 guest_pagetable_clear_all(cpu); 238 guest_pagetable_clear_all(cpu);
206} 239}
207/*:*/ 240/*:*/
208 241
209/*M:013 If a Guest reads from a page (so creates a mapping) that it has never 242/*M:013
243 * If a Guest reads from a page (so creates a mapping) that it has never
210 * written to, and then the Launcher writes to it (ie. the output of a virtual 244 * written to, and then the Launcher writes to it (ie. the output of a virtual
211 * device), the Guest will still see the old page. In practice, this never 245 * device), the Guest will still see the old page. In practice, this never
212 * happens: why would the Guest read a page which it has never written to? But 246 * happens: why would the Guest read a page which it has never written to? But
213 * a similar scenario might one day bite us, so it's worth mentioning. :*/ 247 * a similar scenario might one day bite us, so it's worth mentioning.
248 *
249 * Note that if we used a shared anonymous mapping in the Launcher instead of
250 * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we
251 * need that to switch the Launcher to processes (away from threads) anyway.
252:*/
214 253
215/*H:100 254/*H:100
216 * Hypercalls 255 * Hypercalls
@@ -229,17 +268,22 @@ void do_hypercalls(struct lg_cpu *cpu)
229 return; 268 return;
230 } 269 }
231 270
232 /* The Guest has initialized. 271 /*
272 * The Guest has initialized.
233 * 273 *
234 * Look in the hypercall ring for the async hypercalls: */ 274 * Look in the hypercall ring for the async hypercalls:
275 */
235 do_async_hcalls(cpu); 276 do_async_hcalls(cpu);
236 277
237 /* If we stopped reading the hypercall ring because the Guest did a 278 /*
279 * If we stopped reading the hypercall ring because the Guest did a
238 * NOTIFY to the Launcher, we want to return now. Otherwise we do 280 * NOTIFY to the Launcher, we want to return now. Otherwise we do
239 * the hypercall. */ 281 * the hypercall.
282 */
240 if (!cpu->pending_notify) { 283 if (!cpu->pending_notify) {
241 do_hcall(cpu, cpu->hcall); 284 do_hcall(cpu, cpu->hcall);
242 /* Tricky point: we reset the hcall pointer to mark the 285 /*
286 * Tricky point: we reset the hcall pointer to mark the
243 * hypercall as "done". We use the hcall pointer rather than 287 * hypercall as "done". We use the hcall pointer rather than
244 * the trap number to indicate a hypercall is pending. 288 * the trap number to indicate a hypercall is pending.
245 * Normally it doesn't matter: the Guest will run again and 289 * Normally it doesn't matter: the Guest will run again and
@@ -248,13 +292,16 @@ void do_hypercalls(struct lg_cpu *cpu)
248 * However, if we are signalled or the Guest sends I/O to the 292 * However, if we are signalled or the Guest sends I/O to the
249 * Launcher, the run_guest() loop will exit without running the 293 * Launcher, the run_guest() loop will exit without running the
250 * Guest. When it comes back it would try to re-run the 294 * Guest. When it comes back it would try to re-run the
251 * hypercall. Finding that bug sucked. */ 295 * hypercall. Finding that bug sucked.
296 */
252 cpu->hcall = NULL; 297 cpu->hcall = NULL;
253 } 298 }
254} 299}
255 300
256/* This routine supplies the Guest with time: it's used for wallclock time at 301/*
257 * initial boot and as a rough time source if the TSC isn't available. */ 302 * This routine supplies the Guest with time: it's used for wallclock time at
303 * initial boot and as a rough time source if the TSC isn't available.
304 */
258void write_timestamp(struct lg_cpu *cpu) 305void write_timestamp(struct lg_cpu *cpu)
259{ 306{
260 struct timespec now; 307 struct timespec now;
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 0e9067b0d507..18648180db02 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -1,4 +1,5 @@
1/*P:800 Interrupts (traps) are complicated enough to earn their own file. 1/*P:800
2 * Interrupts (traps) are complicated enough to earn their own file.
2 * There are three classes of interrupts: 3 * There are three classes of interrupts:
3 * 4 *
4 * 1) Real hardware interrupts which occur while we're running the Guest, 5 * 1) Real hardware interrupts which occur while we're running the Guest,
@@ -10,7 +11,8 @@
10 * just like real hardware would deliver them. Traps from the Guest can be set 11 * just like real hardware would deliver them. Traps from the Guest can be set
11 * up to go directly back into the Guest, but sometimes the Host wants to see 12 * up to go directly back into the Guest, but sometimes the Host wants to see
12 * them first, so we also have a way of "reflecting" them into the Guest as if 13 * them first, so we also have a way of "reflecting" them into the Guest as if
13 * they had been delivered to it directly. :*/ 14 * they had been delivered to it directly.
15:*/
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
15#include <linux/interrupt.h> 17#include <linux/interrupt.h>
16#include <linux/module.h> 18#include <linux/module.h>
@@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi)
26 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); 28 return (lo & 0x0000FFFF) | (hi & 0xFFFF0000);
27} 29}
28 30
29/* The "type" of the interrupt handler is a 4 bit field: we only support a 31/*
30 * couple of types. */ 32 * The "type" of the interrupt handler is a 4 bit field: we only support a
33 * couple of types.
34 */
31static int idt_type(u32 lo, u32 hi) 35static int idt_type(u32 lo, u32 hi)
32{ 36{
33 return (hi >> 8) & 0xF; 37 return (hi >> 8) & 0xF;
@@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi)
39 return (hi & 0x8000); 43 return (hi & 0x8000);
40} 44}
41 45
42/* We need a helper to "push" a value onto the Guest's stack, since that's a 46/*
43 * big part of what delivering an interrupt does. */ 47 * We need a helper to "push" a value onto the Guest's stack, since that's a
48 * big part of what delivering an interrupt does.
49 */
44static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) 50static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
45{ 51{
46 /* Stack grows upwards: move stack then write value. */ 52 /* Stack grows upwards: move stack then write value. */
@@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
48 lgwrite(cpu, *gstack, u32, val); 54 lgwrite(cpu, *gstack, u32, val);
49} 55}
50 56
51/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or 57/*H:210
58 * The set_guest_interrupt() routine actually delivers the interrupt or
52 * trap. The mechanics of delivering traps and interrupts to the Guest are the 59 * trap. The mechanics of delivering traps and interrupts to the Guest are the
53 * same, except some traps have an "error code" which gets pushed onto the 60 * same, except some traps have an "error code" which gets pushed onto the
54 * stack as well: the caller tells us if this is one. 61 * stack as well: the caller tells us if this is one.
@@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val)
59 * 66 *
60 * We set up the stack just like the CPU does for a real interrupt, so it's 67 * We set up the stack just like the CPU does for a real interrupt, so it's
61 * identical for the Guest (and the standard "iret" instruction will undo 68 * identical for the Guest (and the standard "iret" instruction will undo
62 * it). */ 69 * it).
70 */
63static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, 71static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
64 bool has_err) 72 bool has_err)
65{ 73{
@@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
67 u32 eflags, ss, irq_enable; 75 u32 eflags, ss, irq_enable;
68 unsigned long virtstack; 76 unsigned long virtstack;
69 77
70 /* There are two cases for interrupts: one where the Guest is already 78 /*
79 * There are two cases for interrupts: one where the Guest is already
71 * in the kernel, and a more complex one where the Guest is in 80 * in the kernel, and a more complex one where the Guest is in
72 * userspace. We check the privilege level to find out. */ 81 * userspace. We check the privilege level to find out.
82 */
73 if ((cpu->regs->ss&0x3) != GUEST_PL) { 83 if ((cpu->regs->ss&0x3) != GUEST_PL) {
74 /* The Guest told us their kernel stack with the SET_STACK 84 /*
75 * hypercall: both the virtual address and the segment */ 85 * The Guest told us their kernel stack with the SET_STACK
86 * hypercall: both the virtual address and the segment.
87 */
76 virtstack = cpu->esp1; 88 virtstack = cpu->esp1;
77 ss = cpu->ss1; 89 ss = cpu->ss1;
78 90
79 origstack = gstack = guest_pa(cpu, virtstack); 91 origstack = gstack = guest_pa(cpu, virtstack);
80 /* We push the old stack segment and pointer onto the new 92 /*
93 * We push the old stack segment and pointer onto the new
81 * stack: when the Guest does an "iret" back from the interrupt 94 * stack: when the Guest does an "iret" back from the interrupt
82 * handler the CPU will notice they're dropping privilege 95 * handler the CPU will notice they're dropping privilege
83 * levels and expect these here. */ 96 * levels and expect these here.
97 */
84 push_guest_stack(cpu, &gstack, cpu->regs->ss); 98 push_guest_stack(cpu, &gstack, cpu->regs->ss);
85 push_guest_stack(cpu, &gstack, cpu->regs->esp); 99 push_guest_stack(cpu, &gstack, cpu->regs->esp);
86 } else { 100 } else {
@@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
91 origstack = gstack = guest_pa(cpu, virtstack); 105 origstack = gstack = guest_pa(cpu, virtstack);
92 } 106 }
93 107
94 /* Remember that we never let the Guest actually disable interrupts, so 108 /*
109 * Remember that we never let the Guest actually disable interrupts, so
95 * the "Interrupt Flag" bit is always set. We copy that bit from the 110 * the "Interrupt Flag" bit is always set. We copy that bit from the
96 * Guest's "irq_enabled" field into the eflags word: we saw the Guest 111 * Guest's "irq_enabled" field into the eflags word: we saw the Guest
97 * copy it back in "lguest_iret". */ 112 * copy it back in "lguest_iret".
113 */
98 eflags = cpu->regs->eflags; 114 eflags = cpu->regs->eflags;
99 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 115 if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0
100 && !(irq_enable & X86_EFLAGS_IF)) 116 && !(irq_enable & X86_EFLAGS_IF))
101 eflags &= ~X86_EFLAGS_IF; 117 eflags &= ~X86_EFLAGS_IF;
102 118
103 /* An interrupt is expected to push three things on the stack: the old 119 /*
120 * An interrupt is expected to push three things on the stack: the old
104 * "eflags" word, the old code segment, and the old instruction 121 * "eflags" word, the old code segment, and the old instruction
105 * pointer. */ 122 * pointer.
123 */
106 push_guest_stack(cpu, &gstack, eflags); 124 push_guest_stack(cpu, &gstack, eflags);
107 push_guest_stack(cpu, &gstack, cpu->regs->cs); 125 push_guest_stack(cpu, &gstack, cpu->regs->cs);
108 push_guest_stack(cpu, &gstack, cpu->regs->eip); 126 push_guest_stack(cpu, &gstack, cpu->regs->eip);
@@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
111 if (has_err) 129 if (has_err)
112 push_guest_stack(cpu, &gstack, cpu->regs->errcode); 130 push_guest_stack(cpu, &gstack, cpu->regs->errcode);
113 131
114 /* Now we've pushed all the old state, we change the stack, the code 132 /*
115 * segment and the address to execute. */ 133 * Now we've pushed all the old state, we change the stack, the code
134 * segment and the address to execute.
135 */
116 cpu->regs->ss = ss; 136 cpu->regs->ss = ss;
117 cpu->regs->esp = virtstack + (gstack - origstack); 137 cpu->regs->esp = virtstack + (gstack - origstack);
118 cpu->regs->cs = (__KERNEL_CS|GUEST_PL); 138 cpu->regs->cs = (__KERNEL_CS|GUEST_PL);
119 cpu->regs->eip = idt_address(lo, hi); 139 cpu->regs->eip = idt_address(lo, hi);
120 140
121 /* There are two kinds of interrupt handlers: 0xE is an "interrupt 141 /*
122 * gate" which expects interrupts to be disabled on entry. */ 142 * There are two kinds of interrupt handlers: 0xE is an "interrupt
143 * gate" which expects interrupts to be disabled on entry.
144 */
123 if (idt_type(lo, hi) == 0xE) 145 if (idt_type(lo, hi) == 0xE)
124 if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) 146 if (put_user(0, &cpu->lg->lguest_data->irq_enabled))
125 kill_guest(cpu, "Disabling interrupts"); 147 kill_guest(cpu, "Disabling interrupts");
@@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi,
130 * 152 *
131 * interrupt_pending() returns the first pending interrupt which isn't blocked 153 * interrupt_pending() returns the first pending interrupt which isn't blocked
132 * by the Guest. It is called before every entry to the Guest, and just before 154 * by the Guest. It is called before every entry to the Guest, and just before
133 * we go to sleep when the Guest has halted itself. */ 155 * we go to sleep when the Guest has halted itself.
156 */
134unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) 157unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
135{ 158{
136 unsigned int irq; 159 unsigned int irq;
@@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
140 if (!cpu->lg->lguest_data) 163 if (!cpu->lg->lguest_data)
141 return LGUEST_IRQS; 164 return LGUEST_IRQS;
142 165
143 /* Take our "irqs_pending" array and remove any interrupts the Guest 166 /*
144 * wants blocked: the result ends up in "blk". */ 167 * Take our "irqs_pending" array and remove any interrupts the Guest
168 * wants blocked: the result ends up in "blk".
169 */
145 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, 170 if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts,
146 sizeof(blk))) 171 sizeof(blk)))
147 return LGUEST_IRQS; 172 return LGUEST_IRQS;
@@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more)
154 return irq; 179 return irq;
155} 180}
156 181
157/* This actually diverts the Guest to running an interrupt handler, once an 182/*
158 * interrupt has been identified by interrupt_pending(). */ 183 * This actually diverts the Guest to running an interrupt handler, once an
184 * interrupt has been identified by interrupt_pending().
185 */
159void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) 186void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
160{ 187{
161 struct desc_struct *idt; 188 struct desc_struct *idt;
162 189
163 BUG_ON(irq >= LGUEST_IRQS); 190 BUG_ON(irq >= LGUEST_IRQS);
164 191
165 /* They may be in the middle of an iret, where they asked us never to 192 /*
166 * deliver interrupts. */ 193 * They may be in the middle of an iret, where they asked us never to
194 * deliver interrupts.
195 */
167 if (cpu->regs->eip >= cpu->lg->noirq_start && 196 if (cpu->regs->eip >= cpu->lg->noirq_start &&
168 (cpu->regs->eip < cpu->lg->noirq_end)) 197 (cpu->regs->eip < cpu->lg->noirq_end))
169 return; 198 return;
@@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
187 } 216 }
188 } 217 }
189 218
190 /* Look at the IDT entry the Guest gave us for this interrupt. The 219 /*
220 * Look at the IDT entry the Guest gave us for this interrupt. The
191 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip 221 * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip
192 * over them. */ 222 * over them.
223 */
193 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; 224 idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq];
194 /* If they don't have a handler (yet?), we just ignore it */ 225 /* If they don't have a handler (yet?), we just ignore it */
195 if (idt_present(idt->a, idt->b)) { 226 if (idt_present(idt->a, idt->b)) {
196 /* OK, mark it no longer pending and deliver it. */ 227 /* OK, mark it no longer pending and deliver it. */
197 clear_bit(irq, cpu->irqs_pending); 228 clear_bit(irq, cpu->irqs_pending);
198 /* set_guest_interrupt() takes the interrupt descriptor and a 229 /*
230 * set_guest_interrupt() takes the interrupt descriptor and a
199 * flag to say whether this interrupt pushes an error code onto 231 * flag to say whether this interrupt pushes an error code onto
200 * the stack as well: virtual interrupts never do. */ 232 * the stack as well: virtual interrupts never do.
233 */
201 set_guest_interrupt(cpu, idt->a, idt->b, false); 234 set_guest_interrupt(cpu, idt->a, idt->b, false);
202 } 235 }
203 236
204 /* Every time we deliver an interrupt, we update the timestamp in the 237 /*
238 * Every time we deliver an interrupt, we update the timestamp in the
205 * Guest's lguest_data struct. It would be better for the Guest if we 239 * Guest's lguest_data struct. It would be better for the Guest if we
206 * did this more often, but it can actually be quite slow: doing it 240 * did this more often, but it can actually be quite slow: doing it
207 * here is a compromise which means at least it gets updated every 241 * here is a compromise which means at least it gets updated every
208 * timer interrupt. */ 242 * timer interrupt.
243 */
209 write_timestamp(cpu); 244 write_timestamp(cpu);
210 245
211 /* If there are no other interrupts we want to deliver, clear 246 /*
212 * the pending flag. */ 247 * If there are no other interrupts we want to deliver, clear
248 * the pending flag.
249 */
213 if (!more) 250 if (!more)
214 put_user(0, &cpu->lg->lguest_data->irq_pending); 251 put_user(0, &cpu->lg->lguest_data->irq_pending);
215} 252}
@@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more)
217/* And this is the routine when we want to set an interrupt for the Guest. */ 254/* And this is the routine when we want to set an interrupt for the Guest. */
218void set_interrupt(struct lg_cpu *cpu, unsigned int irq) 255void set_interrupt(struct lg_cpu *cpu, unsigned int irq)
219{ 256{
220 /* Next time the Guest runs, the core code will see if it can deliver 257 /*
221 * this interrupt. */ 258 * Next time the Guest runs, the core code will see if it can deliver
259 * this interrupt.
260 */
222 set_bit(irq, cpu->irqs_pending); 261 set_bit(irq, cpu->irqs_pending);
223 262
224 /* Make sure it sees it; it might be asleep (eg. halted), or 263 /*
225 * running the Guest right now, in which case kick_process() 264 * Make sure it sees it; it might be asleep (eg. halted), or running
226 * will knock it out. */ 265 * the Guest right now, in which case kick_process() will knock it out.
266 */
227 if (!wake_up_process(cpu->tsk)) 267 if (!wake_up_process(cpu->tsk))
228 kick_process(cpu->tsk); 268 kick_process(cpu->tsk);
229} 269}
230/*:*/ 270/*:*/
231 271
232/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent 272/*
273 * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent
233 * me a patch, so we support that too. It'd be a big step for lguest if half 274 * me a patch, so we support that too. It'd be a big step for lguest if half
234 * the Plan 9 user base were to start using it. 275 * the Plan 9 user base were to start using it.
235 * 276 *
236 * Actually now I think of it, it's possible that Ron *is* half the Plan 9 277 * Actually now I think of it, it's possible that Ron *is* half the Plan 9
237 * userbase. Oh well. */ 278 * userbase. Oh well.
279 */
238static bool could_be_syscall(unsigned int num) 280static bool could_be_syscall(unsigned int num)
239{ 281{
240 /* Normal Linux SYSCALL_VECTOR or reserved vector? */ 282 /* Normal Linux SYSCALL_VECTOR or reserved vector? */
@@ -274,9 +316,11 @@ void free_interrupts(void)
274 clear_bit(syscall_vector, used_vectors); 316 clear_bit(syscall_vector, used_vectors);
275} 317}
276 318
277/*H:220 Now we've got the routines to deliver interrupts, delivering traps like 319/*H:220
320 * Now we've got the routines to deliver interrupts, delivering traps like
278 * page fault is easy. The only trick is that Intel decided that some traps 321 * page fault is easy. The only trick is that Intel decided that some traps
279 * should have error codes: */ 322 * should have error codes:
323 */
280static bool has_err(unsigned int trap) 324static bool has_err(unsigned int trap)
281{ 325{
282 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); 326 return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17);
@@ -285,13 +329,17 @@ static bool has_err(unsigned int trap)
285/* deliver_trap() returns true if it could deliver the trap. */ 329/* deliver_trap() returns true if it could deliver the trap. */
286bool deliver_trap(struct lg_cpu *cpu, unsigned int num) 330bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
287{ 331{
288 /* Trap numbers are always 8 bit, but we set an impossible trap number 332 /*
289 * for traps inside the Switcher, so check that here. */ 333 * Trap numbers are always 8 bit, but we set an impossible trap number
334 * for traps inside the Switcher, so check that here.
335 */
290 if (num >= ARRAY_SIZE(cpu->arch.idt)) 336 if (num >= ARRAY_SIZE(cpu->arch.idt))
291 return false; 337 return false;
292 338
293 /* Early on the Guest hasn't set the IDT entries (or maybe it put a 339 /*
294 * bogus one in): if we fail here, the Guest will be killed. */ 340 * Early on the Guest hasn't set the IDT entries (or maybe it put a
341 * bogus one in): if we fail here, the Guest will be killed.
342 */
295 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) 343 if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b))
296 return false; 344 return false;
297 set_guest_interrupt(cpu, cpu->arch.idt[num].a, 345 set_guest_interrupt(cpu, cpu->arch.idt[num].a,
@@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
299 return true; 347 return true;
300} 348}
301 349
302/*H:250 Here's the hard part: returning to the Host every time a trap happens 350/*H:250
351 * Here's the hard part: returning to the Host every time a trap happens
303 * and then calling deliver_trap() and re-entering the Guest is slow. 352 * and then calling deliver_trap() and re-entering the Guest is slow.
304 * Particularly because Guest userspace system calls are traps (usually trap 353 * Particularly because Guest userspace system calls are traps (usually trap
305 * 128). 354 * 128).
@@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num)
311 * the other hypervisors would beat it up at lunchtime. 360 * the other hypervisors would beat it up at lunchtime.
312 * 361 *
313 * This routine indicates if a particular trap number could be delivered 362 * This routine indicates if a particular trap number could be delivered
314 * directly. */ 363 * directly.
364 */
315static bool direct_trap(unsigned int num) 365static bool direct_trap(unsigned int num)
316{ 366{
317 /* Hardware interrupts don't go to the Guest at all (except system 367 /*
318 * call). */ 368 * Hardware interrupts don't go to the Guest at all (except system
369 * call).
370 */
319 if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) 371 if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num))
320 return false; 372 return false;
321 373
322 /* The Host needs to see page faults (for shadow paging and to save the 374 /*
375 * The Host needs to see page faults (for shadow paging and to save the
323 * fault address), general protection faults (in/out emulation) and 376 * fault address), general protection faults (in/out emulation) and
324 * device not available (TS handling), invalid opcode fault (kvm hcall), 377 * device not available (TS handling), invalid opcode fault (kvm hcall),
325 * and of course, the hypercall trap. */ 378 * and of course, the hypercall trap.
379 */
326 return num != 14 && num != 13 && num != 7 && 380 return num != 14 && num != 13 && num != 7 &&
327 num != 6 && num != LGUEST_TRAP_ENTRY; 381 num != 6 && num != LGUEST_TRAP_ENTRY;
328} 382}
329/*:*/ 383/*:*/
330 384
331/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, 385/*M:005
386 * The Guest has the ability to turn its interrupt gates into trap gates,
332 * if it is careful. The Host will let trap gates can go directly to the 387 * if it is careful. The Host will let trap gates can go directly to the
333 * Guest, but the Guest needs the interrupts atomically disabled for an 388 * Guest, but the Guest needs the interrupts atomically disabled for an
334 * interrupt gate. It can do this by pointing the trap gate at instructions 389 * interrupt gate. It can do this by pointing the trap gate at instructions
335 * within noirq_start and noirq_end, where it can safely disable interrupts. */ 390 * within noirq_start and noirq_end, where it can safely disable interrupts.
391 */
336 392
337/*M:006 The Guests do not use the sysenter (fast system call) instruction, 393/*M:006
394 * The Guests do not use the sysenter (fast system call) instruction,
338 * because it's hardcoded to enter privilege level 0 and so can't go direct. 395 * because it's hardcoded to enter privilege level 0 and so can't go direct.
339 * It's about twice as fast as the older "int 0x80" system call, so it might 396 * It's about twice as fast as the older "int 0x80" system call, so it might
340 * still be worthwhile to handle it in the Switcher and lcall down to the 397 * still be worthwhile to handle it in the Switcher and lcall down to the
341 * Guest. The sysenter semantics are hairy tho: search for that keyword in 398 * Guest. The sysenter semantics are hairy tho: search for that keyword in
342 * entry.S :*/ 399 * entry.S
400:*/
343 401
344/*H:260 When we make traps go directly into the Guest, we need to make sure 402/*H:260
403 * When we make traps go directly into the Guest, we need to make sure
345 * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the 404 * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the
346 * CPU trying to deliver the trap will fault while trying to push the interrupt 405 * CPU trying to deliver the trap will fault while trying to push the interrupt
347 * words on the stack: this is called a double fault, and it forces us to kill 406 * words on the stack: this is called a double fault, and it forces us to kill
348 * the Guest. 407 * the Guest.
349 * 408 *
350 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ 409 * Which is deeply unfair, because (literally!) it wasn't the Guests' fault.
410 */
351void pin_stack_pages(struct lg_cpu *cpu) 411void pin_stack_pages(struct lg_cpu *cpu)
352{ 412{
353 unsigned int i; 413 unsigned int i;
354 414
355 /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or 415 /*
356 * two pages of stack space. */ 416 * Depending on the CONFIG_4KSTACKS option, the Guest can have one or
417 * two pages of stack space.
418 */
357 for (i = 0; i < cpu->lg->stack_pages; i++) 419 for (i = 0; i < cpu->lg->stack_pages; i++)
358 /* The stack grows *upwards*, so the address we're given is the 420 /*
421 * The stack grows *upwards*, so the address we're given is the
359 * start of the page after the kernel stack. Subtract one to 422 * start of the page after the kernel stack. Subtract one to
360 * get back onto the first stack page, and keep subtracting to 423 * get back onto the first stack page, and keep subtracting to
361 * get to the rest of the stack pages. */ 424 * get to the rest of the stack pages.
425 */
362 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); 426 pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE);
363} 427}
364 428
365/* Direct traps also mean that we need to know whenever the Guest wants to use 429/*
430 * Direct traps also mean that we need to know whenever the Guest wants to use
366 * a different kernel stack, so we can change the IDT entries to use that 431 * a different kernel stack, so we can change the IDT entries to use that
367 * stack. The IDT entries expect a virtual address, so unlike most addresses 432 * stack. The IDT entries expect a virtual address, so unlike most addresses
368 * the Guest gives us, the "esp" (stack pointer) value here is virtual, not 433 * the Guest gives us, the "esp" (stack pointer) value here is virtual, not
369 * physical. 434 * physical.
370 * 435 *
371 * In Linux each process has its own kernel stack, so this happens a lot: we 436 * In Linux each process has its own kernel stack, so this happens a lot: we
372 * change stacks on each context switch. */ 437 * change stacks on each context switch.
438 */
373void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) 439void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
374{ 440{
375 /* You are not allowed have a stack segment with privilege level 0: bad 441 /*
376 * Guest! */ 442 * You're not allowed a stack segment with privilege level 0: bad Guest!
443 */
377 if ((seg & 0x3) != GUEST_PL) 444 if ((seg & 0x3) != GUEST_PL)
378 kill_guest(cpu, "bad stack segment %i", seg); 445 kill_guest(cpu, "bad stack segment %i", seg);
379 /* We only expect one or two stack pages. */ 446 /* We only expect one or two stack pages. */
@@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages)
387 pin_stack_pages(cpu); 454 pin_stack_pages(cpu);
388} 455}
389 456
390/* All this reference to mapping stacks leads us neatly into the other complex 457/*
391 * part of the Host: page table handling. */ 458 * All this reference to mapping stacks leads us neatly into the other complex
459 * part of the Host: page table handling.
460 */
392 461
393/*H:235 This is the routine which actually checks the Guest's IDT entry and 462/*H:235
394 * transfers it into the entry in "struct lguest": */ 463 * This is the routine which actually checks the Guest's IDT entry and
464 * transfers it into the entry in "struct lguest":
465 */
395static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, 466static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
396 unsigned int num, u32 lo, u32 hi) 467 unsigned int num, u32 lo, u32 hi)
397{ 468{
@@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap,
407 if (type != 0xE && type != 0xF) 478 if (type != 0xE && type != 0xF)
408 kill_guest(cpu, "bad IDT type %i", type); 479 kill_guest(cpu, "bad IDT type %i", type);
409 480
410 /* We only copy the handler address, present bit, privilege level and 481 /*
482 * We only copy the handler address, present bit, privilege level and
411 * type. The privilege level controls where the trap can be triggered 483 * type. The privilege level controls where the trap can be triggered
412 * manually with an "int" instruction. This is usually GUEST_PL, 484 * manually with an "int" instruction. This is usually GUEST_PL,
413 * except for system calls which userspace can use. */ 485 * except for system calls which userspace can use.
486 */
414 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); 487 trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF);
415 trap->b = (hi&0xFFFFEF00); 488 trap->b = (hi&0xFFFFEF00);
416} 489}
417 490
418/*H:230 While we're here, dealing with delivering traps and interrupts to the 491/*H:230
492 * While we're here, dealing with delivering traps and interrupts to the
419 * Guest, we might as well complete the picture: how the Guest tells us where 493 * Guest, we might as well complete the picture: how the Guest tells us where
420 * it wants them to go. This would be simple, except making traps fast 494 * it wants them to go. This would be simple, except making traps fast
421 * requires some tricks. 495 * requires some tricks.
422 * 496 *
423 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the 497 * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the
424 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ 498 * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here.
499 */
425void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) 500void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
426{ 501{
427 /* Guest never handles: NMI, doublefault, spurious interrupt or 502 /*
428 * hypercall. We ignore when it tries to set them. */ 503 * Guest never handles: NMI, doublefault, spurious interrupt or
504 * hypercall. We ignore when it tries to set them.
505 */
429 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) 506 if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY)
430 return; 507 return;
431 508
432 /* Mark the IDT as changed: next time the Guest runs we'll know we have 509 /*
433 * to copy this again. */ 510 * Mark the IDT as changed: next time the Guest runs we'll know we have
511 * to copy this again.
512 */
434 cpu->changed |= CHANGED_IDT; 513 cpu->changed |= CHANGED_IDT;
435 514
436 /* Check that the Guest doesn't try to step outside the bounds. */ 515 /* Check that the Guest doesn't try to step outside the bounds. */
@@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi)
440 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); 519 set_trap(cpu, &cpu->arch.idt[num], num, lo, hi);
441} 520}
442 521
443/* The default entry for each interrupt points into the Switcher routines which 522/*
523 * The default entry for each interrupt points into the Switcher routines which
444 * simply return to the Host. The run_guest() loop will then call 524 * simply return to the Host. The run_guest() loop will then call
445 * deliver_trap() to bounce it back into the Guest. */ 525 * deliver_trap() to bounce it back into the Guest.
526 */
446static void default_idt_entry(struct desc_struct *idt, 527static void default_idt_entry(struct desc_struct *idt,
447 int trap, 528 int trap,
448 const unsigned long handler, 529 const unsigned long handler,
@@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt,
451 /* A present interrupt gate. */ 532 /* A present interrupt gate. */
452 u32 flags = 0x8e00; 533 u32 flags = 0x8e00;
453 534
454 /* Set the privilege level on the entry for the hypercall: this allows 535 /*
455 * the Guest to use the "int" instruction to trigger it. */ 536 * Set the privilege level on the entry for the hypercall: this allows
537 * the Guest to use the "int" instruction to trigger it.
538 */
456 if (trap == LGUEST_TRAP_ENTRY) 539 if (trap == LGUEST_TRAP_ENTRY)
457 flags |= (GUEST_PL << 13); 540 flags |= (GUEST_PL << 13);
458 else if (base) 541 else if (base)
459 /* Copy priv. level from what Guest asked for. This allows 542 /*
460 * debug (int 3) traps from Guest userspace, for example. */ 543 * Copy privilege level from what Guest asked for. This allows
544 * debug (int 3) traps from Guest userspace, for example.
545 */
461 flags |= (base->b & 0x6000); 546 flags |= (base->b & 0x6000);
462 547
463 /* Now pack it into the IDT entry in its weird format. */ 548 /* Now pack it into the IDT entry in its weird format. */
@@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state,
475 default_idt_entry(&state->guest_idt[i], i, def[i], NULL); 560 default_idt_entry(&state->guest_idt[i], i, def[i], NULL);
476} 561}
477 562
478/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead 563/*H:240
564 * We don't use the IDT entries in the "struct lguest" directly, instead
479 * we copy them into the IDT which we've set up for Guests on this CPU, just 565 * we copy them into the IDT which we've set up for Guests on this CPU, just
480 * before we run the Guest. This routine does that copy. */ 566 * before we run the Guest. This routine does that copy.
567 */
481void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, 568void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
482 const unsigned long *def) 569 const unsigned long *def)
483{ 570{
484 unsigned int i; 571 unsigned int i;
485 572
486 /* We can simply copy the direct traps, otherwise we use the default 573 /*
487 * ones in the Switcher: they will return to the Host. */ 574 * We can simply copy the direct traps, otherwise we use the default
575 * ones in the Switcher: they will return to the Host.
576 */
488 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { 577 for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) {
489 const struct desc_struct *gidt = &cpu->arch.idt[i]; 578 const struct desc_struct *gidt = &cpu->arch.idt[i];
490 579
@@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
492 if (!direct_trap(i)) 581 if (!direct_trap(i))
493 continue; 582 continue;
494 583
495 /* Only trap gates (type 15) can go direct to the Guest. 584 /*
585 * Only trap gates (type 15) can go direct to the Guest.
496 * Interrupt gates (type 14) disable interrupts as they are 586 * Interrupt gates (type 14) disable interrupts as they are
497 * entered, which we never let the Guest do. Not present 587 * entered, which we never let the Guest do. Not present
498 * entries (type 0x0) also can't go direct, of course. 588 * entries (type 0x0) also can't go direct, of course.
499 * 589 *
500 * If it can't go direct, we still need to copy the priv. level: 590 * If it can't go direct, we still need to copy the priv. level:
501 * they might want to give userspace access to a software 591 * they might want to give userspace access to a software
502 * interrupt. */ 592 * interrupt.
593 */
503 if (idt_type(gidt->a, gidt->b) == 0xF) 594 if (idt_type(gidt->a, gidt->b) == 0xF)
504 idt[i] = *gidt; 595 idt[i] = *gidt;
505 else 596 else
@@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt,
518 * the next timer interrupt (in nanoseconds). We use the high-resolution timer 609 * the next timer interrupt (in nanoseconds). We use the high-resolution timer
519 * infrastructure to set a callback at that time. 610 * infrastructure to set a callback at that time.
520 * 611 *
521 * 0 means "turn off the clock". */ 612 * 0 means "turn off the clock".
613 */
522void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) 614void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
523{ 615{
524 ktime_t expires; 616 ktime_t expires;
@@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta)
529 return; 621 return;
530 } 622 }
531 623
532 /* We use wallclock time here, so the Guest might not be running for 624 /*
625 * We use wallclock time here, so the Guest might not be running for
533 * all the time between now and the timer interrupt it asked for. This 626 * all the time between now and the timer interrupt it asked for. This
534 * is almost always the right thing to do. */ 627 * is almost always the right thing to do.
628 */
535 expires = ktime_add_ns(ktime_get_real(), delta); 629 expires = ktime_add_ns(ktime_get_real(), delta);
536 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); 630 hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS);
537} 631}
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 01c591923793..bc28745d05af 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -16,15 +16,13 @@
16void free_pagetables(void); 16void free_pagetables(void);
17int init_pagetables(struct page **switcher_page, unsigned int pages); 17int init_pagetables(struct page **switcher_page, unsigned int pages);
18 18
19struct pgdir 19struct pgdir {
20{
21 unsigned long gpgdir; 20 unsigned long gpgdir;
22 pgd_t *pgdir; 21 pgd_t *pgdir;
23}; 22};
24 23
25/* We have two pages shared with guests, per cpu. */ 24/* We have two pages shared with guests, per cpu. */
26struct lguest_pages 25struct lguest_pages {
27{
28 /* This is the stack page mapped rw in guest */ 26 /* This is the stack page mapped rw in guest */
29 char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; 27 char spare[PAGE_SIZE - sizeof(struct lguest_regs)];
30 struct lguest_regs regs; 28 struct lguest_regs regs;
@@ -54,13 +52,13 @@ struct lg_cpu {
54 52
55 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ 53 unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
56 54
57 /* At end of a page shared mapped over lguest_pages in guest. */ 55 /* At end of a page shared mapped over lguest_pages in guest. */
58 unsigned long regs_page; 56 unsigned long regs_page;
59 struct lguest_regs *regs; 57 struct lguest_regs *regs;
60 58
61 struct lguest_pages *last_pages; 59 struct lguest_pages *last_pages;
62 60
63 int cpu_pgd; /* which pgd this cpu is currently using */ 61 int cpu_pgd; /* Which pgd this cpu is currently using */
64 62
65 /* If a hypercall was asked for, this points to the arguments. */ 63 /* If a hypercall was asked for, this points to the arguments. */
66 struct hcall_args *hcall; 64 struct hcall_args *hcall;
@@ -89,15 +87,17 @@ struct lg_eventfd_map {
89}; 87};
90 88
91/* The private info the thread maintains about the guest. */ 89/* The private info the thread maintains about the guest. */
92struct lguest 90struct lguest {
93{
94 struct lguest_data __user *lguest_data; 91 struct lguest_data __user *lguest_data;
95 struct lg_cpu cpus[NR_CPUS]; 92 struct lg_cpu cpus[NR_CPUS];
96 unsigned int nr_cpus; 93 unsigned int nr_cpus;
97 94
98 u32 pfn_limit; 95 u32 pfn_limit;
99 /* This provides the offset to the base of guest-physical 96
100 * memory in the Launcher. */ 97 /*
98 * This provides the offset to the base of guest-physical memory in the
99 * Launcher.
100 */
101 void __user *mem_base; 101 void __user *mem_base;
102 unsigned long kernel_address; 102 unsigned long kernel_address;
103 103
@@ -122,11 +122,13 @@ bool lguest_address_ok(const struct lguest *lg,
122void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); 122void __lgread(struct lg_cpu *, void *, unsigned long, unsigned);
123void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); 123void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
124 124
125/*H:035 Using memory-copy operations like that is usually inconvient, so we 125/*H:035
126 * Using memory-copy operations like that is usually inconvient, so we
126 * have the following helper macros which read and write a specific type (often 127 * have the following helper macros which read and write a specific type (often
127 * an unsigned long). 128 * an unsigned long).
128 * 129 *
129 * This reads into a variable of the given type then returns that. */ 130 * This reads into a variable of the given type then returns that.
131 */
130#define lgread(cpu, addr, type) \ 132#define lgread(cpu, addr, type) \
131 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) 133 ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; })
132 134
@@ -140,9 +142,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned);
140 142
141int run_guest(struct lg_cpu *cpu, unsigned long __user *user); 143int run_guest(struct lg_cpu *cpu, unsigned long __user *user);
142 144
143/* Helper macros to obtain the first 12 or the last 20 bits, this is only the 145/*
146 * Helper macros to obtain the first 12 or the last 20 bits, this is only the
144 * first step in the migration to the kernel types. pte_pfn is already defined 147 * first step in the migration to the kernel types. pte_pfn is already defined
145 * in the kernel. */ 148 * in the kernel.
149 */
146#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) 150#define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK)
147#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) 151#define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT)
148#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) 152#define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK)
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
index e082cdac88b4..b6200bc39b58 100644
--- a/drivers/lguest/lguest_device.c
+++ b/drivers/lguest/lguest_device.c
@@ -1,10 +1,12 @@
1/*P:050 Lguest guests use a very simple method to describe devices. It's a 1/*P:050
2 * Lguest guests use a very simple method to describe devices. It's a
2 * series of device descriptors contained just above the top of normal Guest 3 * series of device descriptors contained just above the top of normal Guest
3 * memory. 4 * memory.
4 * 5 *
5 * We use the standard "virtio" device infrastructure, which provides us with a 6 * We use the standard "virtio" device infrastructure, which provides us with a
6 * console, a network and a block driver. Each one expects some configuration 7 * console, a network and a block driver. Each one expects some configuration
7 * information and a "virtqueue" or two to send and receive data. :*/ 8 * information and a "virtqueue" or two to send and receive data.
9:*/
8#include <linux/init.h> 10#include <linux/init.h>
9#include <linux/bootmem.h> 11#include <linux/bootmem.h>
10#include <linux/lguest_launcher.h> 12#include <linux/lguest_launcher.h>
@@ -20,8 +22,10 @@
20/* The pointer to our (page) of device descriptions. */ 22/* The pointer to our (page) of device descriptions. */
21static void *lguest_devices; 23static void *lguest_devices;
22 24
23/* For Guests, device memory can be used as normal memory, so we cast away the 25/*
24 * __iomem to quieten sparse. */ 26 * For Guests, device memory can be used as normal memory, so we cast away the
27 * __iomem to quieten sparse.
28 */
25static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) 29static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
26{ 30{
27 return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); 31 return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
@@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr)
32 iounmap((__force void __iomem *)addr); 36 iounmap((__force void __iomem *)addr);
33} 37}
34 38
35/*D:100 Each lguest device is just a virtio device plus a pointer to its entry 39/*D:100
36 * in the lguest_devices page. */ 40 * Each lguest device is just a virtio device plus a pointer to its entry
41 * in the lguest_devices page.
42 */
37struct lguest_device { 43struct lguest_device {
38 struct virtio_device vdev; 44 struct virtio_device vdev;
39 45
@@ -41,9 +47,11 @@ struct lguest_device {
41 struct lguest_device_desc *desc; 47 struct lguest_device_desc *desc;
42}; 48};
43 49
44/* Since the virtio infrastructure hands us a pointer to the virtio_device all 50/*
51 * Since the virtio infrastructure hands us a pointer to the virtio_device all
45 * the time, it helps to have a curt macro to get a pointer to the struct 52 * the time, it helps to have a curt macro to get a pointer to the struct
46 * lguest_device it's enclosed in. */ 53 * lguest_device it's enclosed in.
54 */
47#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) 55#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
48 56
49/*D:130 57/*D:130
@@ -55,7 +63,8 @@ struct lguest_device {
55 * the driver will look at them during setup. 63 * the driver will look at them during setup.
56 * 64 *
57 * A convenient routine to return the device's virtqueue config array: 65 * A convenient routine to return the device's virtqueue config array:
58 * immediately after the descriptor. */ 66 * immediately after the descriptor.
67 */
59static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) 68static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
60{ 69{
61 return (void *)(desc + 1); 70 return (void *)(desc + 1);
@@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev)
98 return features; 107 return features;
99} 108}
100 109
101/* The virtio core takes the features the Host offers, and copies the 110/*
102 * ones supported by the driver into the vdev->features array. Once 111 * The virtio core takes the features the Host offers, and copies the ones
103 * that's all sorted out, this routine is called so we can tell the 112 * supported by the driver into the vdev->features array. Once that's all
104 * Host which features we understand and accept. */ 113 * sorted out, this routine is called so we can tell the Host which features we
114 * understand and accept.
115 */
105static void lg_finalize_features(struct virtio_device *vdev) 116static void lg_finalize_features(struct virtio_device *vdev)
106{ 117{
107 unsigned int i, bits; 118 unsigned int i, bits;
@@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev)
112 /* Give virtio_ring a chance to accept features. */ 123 /* Give virtio_ring a chance to accept features. */
113 vring_transport_features(vdev); 124 vring_transport_features(vdev);
114 125
115 /* The vdev->feature array is a Linux bitmask: this isn't the 126 /*
116 * same as a the simple array of bits used by lguest devices 127 * The vdev->feature array is a Linux bitmask: this isn't the same as a
117 * for features. So we do this slow, manual conversion which is 128 * the simple array of bits used by lguest devices for features. So we
118 * completely general. */ 129 * do this slow, manual conversion which is completely general.
130 */
119 memset(out_features, 0, desc->feature_len); 131 memset(out_features, 0, desc->feature_len);
120 bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; 132 bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
121 for (i = 0; i < bits; i++) { 133 for (i = 0; i < bits; i++) {
@@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset,
146 memcpy(lg_config(desc) + offset, buf, len); 158 memcpy(lg_config(desc) + offset, buf, len);
147} 159}
148 160
149/* The operations to get and set the status word just access the status field 161/*
150 * of the device descriptor. */ 162 * The operations to get and set the status word just access the status field
163 * of the device descriptor.
164 */
151static u8 lg_get_status(struct virtio_device *vdev) 165static u8 lg_get_status(struct virtio_device *vdev)
152{ 166{
153 return to_lgdev(vdev)->desc->status; 167 return to_lgdev(vdev)->desc->status;
154} 168}
155 169
156/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the 170/*
157 * descriptor address of the device. A zero status means "reset". */ 171 * To notify on status updates, we (ab)use the NOTIFY hypercall, with the
172 * descriptor address of the device. A zero status means "reset".
173 */
158static void set_status(struct virtio_device *vdev, u8 status) 174static void set_status(struct virtio_device *vdev, u8 status)
159{ 175{
160 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; 176 unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
@@ -191,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev)
191 */ 207 */
192 208
193/*D:140 This is the information we remember about each virtqueue. */ 209/*D:140 This is the information we remember about each virtqueue. */
194struct lguest_vq_info 210struct lguest_vq_info {
195{
196 /* A copy of the information contained in the device config. */ 211 /* A copy of the information contained in the device config. */
197 struct lguest_vqconfig config; 212 struct lguest_vqconfig config;
198 213
@@ -200,13 +215,17 @@ struct lguest_vq_info
200 void *pages; 215 void *pages;
201}; 216};
202 217
203/* When the virtio_ring code wants to prod the Host, it calls us here and we 218/*
219 * When the virtio_ring code wants to prod the Host, it calls us here and we
204 * make a hypercall. We hand the physical address of the virtqueue so the Host 220 * make a hypercall. We hand the physical address of the virtqueue so the Host
205 * knows which virtqueue we're talking about. */ 221 * knows which virtqueue we're talking about.
222 */
206static void lg_notify(struct virtqueue *vq) 223static void lg_notify(struct virtqueue *vq)
207{ 224{
208 /* We store our virtqueue information in the "priv" pointer of the 225 /*
209 * virtqueue structure. */ 226 * We store our virtqueue information in the "priv" pointer of the
227 * virtqueue structure.
228 */
210 struct lguest_vq_info *lvq = vq->priv; 229 struct lguest_vq_info *lvq = vq->priv;
211 230
212 kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); 231 kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT);
@@ -215,7 +234,8 @@ static void lg_notify(struct virtqueue *vq)
215/* An extern declaration inside a C file is bad form. Don't do it. */ 234/* An extern declaration inside a C file is bad form. Don't do it. */
216extern void lguest_setup_irq(unsigned int irq); 235extern void lguest_setup_irq(unsigned int irq);
217 236
218/* This routine finds the first virtqueue described in the configuration of 237/*
238 * This routine finds the Nth virtqueue described in the configuration of
219 * this device and sets it up. 239 * this device and sets it up.
220 * 240 *
221 * This is kind of an ugly duckling. It'd be nicer to have a standard 241 * This is kind of an ugly duckling. It'd be nicer to have a standard
@@ -223,9 +243,7 @@ extern void lguest_setup_irq(unsigned int irq);
223 * everyone wants to do it differently. The KVM coders want the Guest to 243 * everyone wants to do it differently. The KVM coders want the Guest to
224 * allocate its own pages and tell the Host where they are, but for lguest it's 244 * allocate its own pages and tell the Host where they are, but for lguest it's
225 * simpler for the Host to simply tell us where the pages are. 245 * simpler for the Host to simply tell us where the pages are.
226 * 246 */
227 * So we provide drivers with a "find the Nth virtqueue and set it up"
228 * function. */
229static struct virtqueue *lg_find_vq(struct virtio_device *vdev, 247static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
230 unsigned index, 248 unsigned index,
231 void (*callback)(struct virtqueue *vq), 249 void (*callback)(struct virtqueue *vq),
@@ -244,9 +262,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
244 if (!lvq) 262 if (!lvq)
245 return ERR_PTR(-ENOMEM); 263 return ERR_PTR(-ENOMEM);
246 264
247 /* Make a copy of the "struct lguest_vqconfig" entry, which sits after 265 /*
266 * Make a copy of the "struct lguest_vqconfig" entry, which sits after
248 * the descriptor. We need a copy because the config space might not 267 * the descriptor. We need a copy because the config space might not
249 * be aligned correctly. */ 268 * be aligned correctly.
269 */
250 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); 270 memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
251 271
252 printk("Mapping virtqueue %i addr %lx\n", index, 272 printk("Mapping virtqueue %i addr %lx\n", index,
@@ -261,8 +281,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
261 goto free_lvq; 281 goto free_lvq;
262 } 282 }
263 283
264 /* OK, tell virtio_ring.c to set up a virtqueue now we know its size 284 /*
265 * and we've got a pointer to its pages. */ 285 * OK, tell virtio_ring.c to set up a virtqueue now we know its size
286 * and we've got a pointer to its pages.
287 */
266 vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, 288 vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN,
267 vdev, lvq->pages, lg_notify, callback, name); 289 vdev, lvq->pages, lg_notify, callback, name);
268 if (!vq) { 290 if (!vq) {
@@ -273,18 +295,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
273 /* Make sure the interrupt is allocated. */ 295 /* Make sure the interrupt is allocated. */
274 lguest_setup_irq(lvq->config.irq); 296 lguest_setup_irq(lvq->config.irq);
275 297
276 /* Tell the interrupt for this virtqueue to go to the virtio_ring 298 /*
277 * interrupt handler. */ 299 * Tell the interrupt for this virtqueue to go to the virtio_ring
278 /* FIXME: We used to have a flag for the Host to tell us we could use 300 * interrupt handler.
301 *
302 * FIXME: We used to have a flag for the Host to tell us we could use
279 * the interrupt as a source of randomness: it'd be nice to have that 303 * the interrupt as a source of randomness: it'd be nice to have that
280 * back.. */ 304 * back.
305 */
281 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, 306 err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
282 dev_name(&vdev->dev), vq); 307 dev_name(&vdev->dev), vq);
283 if (err) 308 if (err)
284 goto destroy_vring; 309 goto destroy_vring;
285 310
286 /* Last of all we hook up our 'struct lguest_vq_info" to the 311 /*
287 * virtqueue's priv pointer. */ 312 * Last of all we hook up our 'struct lguest_vq_info" to the
313 * virtqueue's priv pointer.
314 */
288 vq->priv = lvq; 315 vq->priv = lvq;
289 return vq; 316 return vq;
290 317
@@ -358,11 +385,14 @@ static struct virtio_config_ops lguest_config_ops = {
358 .del_vqs = lg_del_vqs, 385 .del_vqs = lg_del_vqs,
359}; 386};
360 387
361/* The root device for the lguest virtio devices. This makes them appear as 388/*
362 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ 389 * The root device for the lguest virtio devices. This makes them appear as
390 * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
391 */
363static struct device *lguest_root; 392static struct device *lguest_root;
364 393
365/*D:120 This is the core of the lguest bus: actually adding a new device. 394/*D:120
395 * This is the core of the lguest bus: actually adding a new device.
366 * It's a separate function because it's neater that way, and because an 396 * It's a separate function because it's neater that way, and because an
367 * earlier version of the code supported hotplug and unplug. They were removed 397 * earlier version of the code supported hotplug and unplug. They were removed
368 * early on because they were never used. 398 * early on because they were never used.
@@ -371,14 +401,14 @@ static struct device *lguest_root;
371 * 401 *
372 * It's worth reading this carefully: we start with a pointer to the new device 402 * It's worth reading this carefully: we start with a pointer to the new device
373 * descriptor in the "lguest_devices" page, and the offset into the device 403 * descriptor in the "lguest_devices" page, and the offset into the device
374 * descriptor page so we can uniquely identify it if things go badly wrong. */ 404 * descriptor page so we can uniquely identify it if things go badly wrong.
405 */
375static void add_lguest_device(struct lguest_device_desc *d, 406static void add_lguest_device(struct lguest_device_desc *d,
376 unsigned int offset) 407 unsigned int offset)
377{ 408{
378 struct lguest_device *ldev; 409 struct lguest_device *ldev;
379 410
380 /* Start with zeroed memory; Linux's device layer seems to count on 411 /* Start with zeroed memory; Linux's device layer counts on it. */
381 * it. */
382 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); 412 ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
383 if (!ldev) { 413 if (!ldev) {
384 printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", 414 printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
@@ -388,17 +418,25 @@ static void add_lguest_device(struct lguest_device_desc *d,
388 418
389 /* This devices' parent is the lguest/ dir. */ 419 /* This devices' parent is the lguest/ dir. */
390 ldev->vdev.dev.parent = lguest_root; 420 ldev->vdev.dev.parent = lguest_root;
391 /* We have a unique device index thanks to the dev_index counter. */ 421 /*
422 * The device type comes straight from the descriptor. There's also a
423 * device vendor field in the virtio_device struct, which we leave as
424 * 0.
425 */
392 ldev->vdev.id.device = d->type; 426 ldev->vdev.id.device = d->type;
393 /* We have a simple set of routines for querying the device's 427 /*
394 * configuration information and setting its status. */ 428 * We have a simple set of routines for querying the device's
429 * configuration information and setting its status.
430 */
395 ldev->vdev.config = &lguest_config_ops; 431 ldev->vdev.config = &lguest_config_ops;
396 /* And we remember the device's descriptor for lguest_config_ops. */ 432 /* And we remember the device's descriptor for lguest_config_ops. */
397 ldev->desc = d; 433 ldev->desc = d;
398 434
399 /* register_virtio_device() sets up the generic fields for the struct 435 /*
436 * register_virtio_device() sets up the generic fields for the struct
400 * virtio_device and calls device_register(). This makes the bus 437 * virtio_device and calls device_register(). This makes the bus
401 * infrastructure look for a matching driver. */ 438 * infrastructure look for a matching driver.
439 */
402 if (register_virtio_device(&ldev->vdev) != 0) { 440 if (register_virtio_device(&ldev->vdev) != 0) {
403 printk(KERN_ERR "Failed to register lguest dev %u type %u\n", 441 printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
404 offset, d->type); 442 offset, d->type);
@@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d,
406 } 444 }
407} 445}
408 446
409/*D:110 scan_devices() simply iterates through the device page. The type 0 is 447/*D:110
410 * reserved to mean "end of devices". */ 448 * scan_devices() simply iterates through the device page. The type 0 is
449 * reserved to mean "end of devices".
450 */
411static void scan_devices(void) 451static void scan_devices(void)
412{ 452{
413 unsigned int i; 453 unsigned int i;
@@ -426,7 +466,8 @@ static void scan_devices(void)
426 } 466 }
427} 467}
428 468
429/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the 469/*D:105
470 * Fairly early in boot, lguest_devices_init() is called to set up the
430 * lguest device infrastructure. We check that we are a Guest by checking 471 * lguest device infrastructure. We check that we are a Guest by checking
431 * pv_info.name: there are other ways of checking, but this seems most 472 * pv_info.name: there are other ways of checking, but this seems most
432 * obvious to me. 473 * obvious to me.
@@ -437,7 +478,8 @@ static void scan_devices(void)
437 * correct sysfs incantation). 478 * correct sysfs incantation).
438 * 479 *
439 * Finally we call scan_devices() which adds all the devices found in the 480 * Finally we call scan_devices() which adds all the devices found in the
440 * lguest_devices page. */ 481 * lguest_devices page.
482 */
441static int __init lguest_devices_init(void) 483static int __init lguest_devices_init(void)
442{ 484{
443 if (strcmp(pv_info.name, "lguest") != 0) 485 if (strcmp(pv_info.name, "lguest") != 0)
@@ -456,11 +498,13 @@ static int __init lguest_devices_init(void)
456/* We do this after core stuff, but before the drivers. */ 498/* We do this after core stuff, but before the drivers. */
457postcore_initcall(lguest_devices_init); 499postcore_initcall(lguest_devices_init);
458 500
459/*D:150 At this point in the journey we used to now wade through the lguest 501/*D:150
502 * At this point in the journey we used to now wade through the lguest
460 * devices themselves: net, block and console. Since they're all now virtio 503 * devices themselves: net, block and console. Since they're all now virtio
461 * devices rather than lguest-specific, I've decided to ignore them. Mostly, 504 * devices rather than lguest-specific, I've decided to ignore them. Mostly,
462 * they're kind of boring. But this does mean you'll never experience the 505 * they're kind of boring. But this does mean you'll never experience the
463 * thrill of reading the forbidden love scene buried deep in the block driver. 506 * thrill of reading the forbidden love scene buried deep in the block driver.
464 * 507 *
465 * "make Launcher" beckons, where we answer questions like "Where do Guests 508 * "make Launcher" beckons, where we answer questions like "Where do Guests
466 * come from?", and "What do you do when someone asks for optimization?". */ 509 * come from?", and "What do you do when someone asks for optimization?".
510 */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 9f9a2953b383..b4d3f7ca554f 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -1,8 +1,9 @@
1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher 1/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher
2 * controls and communicates with the Guest. For example, the first write will 2 * controls and communicates with the Guest. For example, the first write will
3 * tell us the Guest's memory layout, pagetable, entry point and kernel address 3 * tell us the Guest's memory layout and entry point. A read will run the
4 * offset. A read will run the Guest until something happens, such as a signal 4 * Guest until something happens, such as a signal or the Guest doing a NOTIFY
5 * or the Guest doing a NOTIFY out to the Launcher. :*/ 5 * out to the Launcher.
6:*/
6#include <linux/uaccess.h> 7#include <linux/uaccess.h>
7#include <linux/miscdevice.h> 8#include <linux/miscdevice.h>
8#include <linux/fs.h> 9#include <linux/fs.h>
@@ -11,14 +12,41 @@
11#include <linux/file.h> 12#include <linux/file.h>
12#include "lg.h" 13#include "lg.h"
13 14
15/*L:056
16 * Before we move on, let's jump ahead and look at what the kernel does when
17 * it needs to look up the eventfds. That will complete our picture of how we
18 * use RCU.
19 *
20 * The notification value is in cpu->pending_notify: we return true if it went
21 * to an eventfd.
22 */
14bool send_notify_to_eventfd(struct lg_cpu *cpu) 23bool send_notify_to_eventfd(struct lg_cpu *cpu)
15{ 24{
16 unsigned int i; 25 unsigned int i;
17 struct lg_eventfd_map *map; 26 struct lg_eventfd_map *map;
18 27
19 /* lg->eventfds is RCU-protected */ 28 /*
29 * This "rcu_read_lock()" helps track when someone is still looking at
30 * the (RCU-using) eventfds array. It's not actually a lock at all;
31 * indeed it's a noop in many configurations. (You didn't expect me to
32 * explain all the RCU secrets here, did you?)
33 */
20 rcu_read_lock(); 34 rcu_read_lock();
35 /*
36 * rcu_dereference is the counter-side of rcu_assign_pointer(); it
37 * makes sure we don't access the memory pointed to by
38 * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
39 * but Alpha allows this! Paul McKenney points out that a really
40 * aggressive compiler could have the same effect:
41 * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
42 *
43 * So play safe, use rcu_dereference to get the rcu-protected pointer:
44 */
21 map = rcu_dereference(cpu->lg->eventfds); 45 map = rcu_dereference(cpu->lg->eventfds);
46 /*
47 * Simple array search: even if they add an eventfd while we do this,
48 * we'll continue to use the old array and just won't see the new one.
49 */
22 for (i = 0; i < map->num; i++) { 50 for (i = 0; i < map->num; i++) {
23 if (map->map[i].addr == cpu->pending_notify) { 51 if (map->map[i].addr == cpu->pending_notify) {
24 eventfd_signal(map->map[i].event, 1); 52 eventfd_signal(map->map[i].event, 1);
@@ -26,19 +54,50 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu)
26 break; 54 break;
27 } 55 }
28 } 56 }
57 /* We're done with the rcu-protected variable cpu->lg->eventfds. */
29 rcu_read_unlock(); 58 rcu_read_unlock();
59
60 /* If we cleared the notification, it's because we found a match. */
30 return cpu->pending_notify == 0; 61 return cpu->pending_notify == 0;
31} 62}
32 63
64/*L:055
65 * One of the more tricksy tricks in the Linux Kernel is a technique called
66 * Read Copy Update. Since one point of lguest is to teach lguest journeyers
67 * about kernel coding, I use it here. (In case you're curious, other purposes
68 * include learning about virtualization and instilling a deep appreciation for
69 * simplicity and puppies).
70 *
71 * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
72 * add new eventfds without ever blocking readers from accessing the array.
73 * The current Launcher only does this during boot, so that never happens. But
74 * Read Copy Update is cool, and adding a lock risks damaging even more puppies
75 * than this code does.
76 *
77 * We allocate a brand new one-larger array, copy the old one and add our new
78 * element. Then we make the lg eventfd pointer point to the new array.
79 * That's the easy part: now we need to free the old one, but we need to make
80 * sure no slow CPU somewhere is still looking at it. That's what
81 * synchronize_rcu does for us: waits until every CPU has indicated that it has
82 * moved on to know it's no longer using the old one.
83 *
84 * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
85 */
33static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) 86static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
34{ 87{
35 struct lg_eventfd_map *new, *old = lg->eventfds; 88 struct lg_eventfd_map *new, *old = lg->eventfds;
36 89
90 /*
91 * We don't allow notifications on value 0 anyway (pending_notify of
92 * 0 means "nothing pending").
93 */
37 if (!addr) 94 if (!addr)
38 return -EINVAL; 95 return -EINVAL;
39 96
40 /* Replace the old array with the new one, carefully: others can 97 /*
41 * be accessing it at the same time */ 98 * Replace the old array with the new one, carefully: others can
99 * be accessing it at the same time.
100 */
42 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), 101 new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
43 GFP_KERNEL); 102 GFP_KERNEL);
44 if (!new) 103 if (!new)
@@ -52,22 +111,41 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
52 new->map[new->num].addr = addr; 111 new->map[new->num].addr = addr;
53 new->map[new->num].event = eventfd_ctx_fdget(fd); 112 new->map[new->num].event = eventfd_ctx_fdget(fd);
54 if (IS_ERR(new->map[new->num].event)) { 113 if (IS_ERR(new->map[new->num].event)) {
114 int err = PTR_ERR(new->map[new->num].event);
55 kfree(new); 115 kfree(new);
56 return PTR_ERR(new->map[new->num].event); 116 return err;
57 } 117 }
58 new->num++; 118 new->num++;
59 119
60 /* Now put new one in place. */ 120 /*
121 * Now put new one in place: rcu_assign_pointer() is a fancy way of
122 * doing "lg->eventfds = new", but it uses memory barriers to make
123 * absolutely sure that the contents of "new" written above is nailed
124 * down before we actually do the assignment.
125 *
126 * We have to think about these kinds of things when we're operating on
127 * live data without locks.
128 */
61 rcu_assign_pointer(lg->eventfds, new); 129 rcu_assign_pointer(lg->eventfds, new);
62 130
63 /* We're not in a big hurry. Wait until noone's looking at old 131 /*
64 * version, then delete it. */ 132 * We're not in a big hurry. Wait until noone's looking at old
133 * version, then free it.
134 */
65 synchronize_rcu(); 135 synchronize_rcu();
66 kfree(old); 136 kfree(old);
67 137
68 return 0; 138 return 0;
69} 139}
70 140
141/*L:052
142 * Receiving notifications from the Guest is usually done by attaching a
143 * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will
144 * become readable when the Guest does an LHCALL_NOTIFY with that value.
145 *
146 * This is really convenient for processing each virtqueue in a separate
147 * thread.
148 */
71static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) 149static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
72{ 150{
73 unsigned long addr, fd; 151 unsigned long addr, fd;
@@ -79,15 +157,22 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
79 if (get_user(fd, input) != 0) 157 if (get_user(fd, input) != 0)
80 return -EFAULT; 158 return -EFAULT;
81 159
160 /*
161 * Just make sure two callers don't add eventfds at once. We really
162 * only need to lock against callers adding to the same Guest, so using
163 * the Big Lguest Lock is overkill. But this is setup, not a fast path.
164 */
82 mutex_lock(&lguest_lock); 165 mutex_lock(&lguest_lock);
83 err = add_eventfd(lg, addr, fd); 166 err = add_eventfd(lg, addr, fd);
84 mutex_unlock(&lguest_lock); 167 mutex_unlock(&lguest_lock);
85 168
86 return 0; 169 return err;
87} 170}
88 171
89/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt 172/*L:050
90 * number to /dev/lguest. */ 173 * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt
174 * number to /dev/lguest.
175 */
91static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) 176static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
92{ 177{
93 unsigned long irq; 178 unsigned long irq;
@@ -97,12 +182,18 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
97 if (irq >= LGUEST_IRQS) 182 if (irq >= LGUEST_IRQS)
98 return -EINVAL; 183 return -EINVAL;
99 184
185 /*
186 * Next time the Guest runs, the core code will see if it can deliver
187 * this interrupt.
188 */
100 set_interrupt(cpu, irq); 189 set_interrupt(cpu, irq);
101 return 0; 190 return 0;
102} 191}
103 192
104/*L:040 Once our Guest is initialized, the Launcher makes it run by reading 193/*L:040
105 * from /dev/lguest. */ 194 * Once our Guest is initialized, the Launcher makes it run by reading
195 * from /dev/lguest.
196 */
106static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) 197static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
107{ 198{
108 struct lguest *lg = file->private_data; 199 struct lguest *lg = file->private_data;
@@ -138,8 +229,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
138 return len; 229 return len;
139 } 230 }
140 231
141 /* If we returned from read() last time because the Guest sent I/O, 232 /*
142 * clear the flag. */ 233 * If we returned from read() last time because the Guest sent I/O,
234 * clear the flag.
235 */
143 if (cpu->pending_notify) 236 if (cpu->pending_notify)
144 cpu->pending_notify = 0; 237 cpu->pending_notify = 0;
145 238
@@ -147,8 +240,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
147 return run_guest(cpu, (unsigned long __user *)user); 240 return run_guest(cpu, (unsigned long __user *)user);
148} 241}
149 242
150/*L:025 This actually initializes a CPU. For the moment, a Guest is only 243/*L:025
151 * uniprocessor, so "id" is always 0. */ 244 * This actually initializes a CPU. For the moment, a Guest is only
245 * uniprocessor, so "id" is always 0.
246 */
152static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) 247static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
153{ 248{
154 /* We have a limited number the number of CPUs in the lguest struct. */ 249 /* We have a limited number the number of CPUs in the lguest struct. */
@@ -163,8 +258,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
163 /* Each CPU has a timer it can set. */ 258 /* Each CPU has a timer it can set. */
164 init_clockdev(cpu); 259 init_clockdev(cpu);
165 260
166 /* We need a complete page for the Guest registers: they are accessible 261 /*
167 * to the Guest and we can only grant it access to whole pages. */ 262 * We need a complete page for the Guest registers: they are accessible
263 * to the Guest and we can only grant it access to whole pages.
264 */
168 cpu->regs_page = get_zeroed_page(GFP_KERNEL); 265 cpu->regs_page = get_zeroed_page(GFP_KERNEL);
169 if (!cpu->regs_page) 266 if (!cpu->regs_page)
170 return -ENOMEM; 267 return -ENOMEM;
@@ -172,29 +269,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
172 /* We actually put the registers at the bottom of the page. */ 269 /* We actually put the registers at the bottom of the page. */
173 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); 270 cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs);
174 271
175 /* Now we initialize the Guest's registers, handing it the start 272 /*
176 * address. */ 273 * Now we initialize the Guest's registers, handing it the start
274 * address.
275 */
177 lguest_arch_setup_regs(cpu, start_ip); 276 lguest_arch_setup_regs(cpu, start_ip);
178 277
179 /* We keep a pointer to the Launcher task (ie. current task) for when 278 /*
180 * other Guests want to wake this one (eg. console input). */ 279 * We keep a pointer to the Launcher task (ie. current task) for when
280 * other Guests want to wake this one (eg. console input).
281 */
181 cpu->tsk = current; 282 cpu->tsk = current;
182 283
183 /* We need to keep a pointer to the Launcher's memory map, because if 284 /*
285 * We need to keep a pointer to the Launcher's memory map, because if
184 * the Launcher dies we need to clean it up. If we don't keep a 286 * the Launcher dies we need to clean it up. If we don't keep a
185 * reference, it is destroyed before close() is called. */ 287 * reference, it is destroyed before close() is called.
288 */
186 cpu->mm = get_task_mm(cpu->tsk); 289 cpu->mm = get_task_mm(cpu->tsk);
187 290
188 /* We remember which CPU's pages this Guest used last, for optimization 291 /*
189 * when the same Guest runs on the same CPU twice. */ 292 * We remember which CPU's pages this Guest used last, for optimization
293 * when the same Guest runs on the same CPU twice.
294 */
190 cpu->last_pages = NULL; 295 cpu->last_pages = NULL;
191 296
192 /* No error == success. */ 297 /* No error == success. */
193 return 0; 298 return 0;
194} 299}
195 300
196/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) 301/*L:020
197 * values (in addition to the LHREQ_INITIALIZE value). These are: 302 * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in
303 * addition to the LHREQ_INITIALIZE value). These are:
198 * 304 *
199 * base: The start of the Guest-physical memory inside the Launcher memory. 305 * base: The start of the Guest-physical memory inside the Launcher memory.
200 * 306 *
@@ -206,14 +312,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip)
206 */ 312 */
207static int initialize(struct file *file, const unsigned long __user *input) 313static int initialize(struct file *file, const unsigned long __user *input)
208{ 314{
209 /* "struct lguest" contains everything we (the Host) know about a 315 /* "struct lguest" contains all we (the Host) know about a Guest. */
210 * Guest. */
211 struct lguest *lg; 316 struct lguest *lg;
212 int err; 317 int err;
213 unsigned long args[3]; 318 unsigned long args[3];
214 319
215 /* We grab the Big Lguest lock, which protects against multiple 320 /*
216 * simultaneous initializations. */ 321 * We grab the Big Lguest lock, which protects against multiple
322 * simultaneous initializations.
323 */
217 mutex_lock(&lguest_lock); 324 mutex_lock(&lguest_lock);
218 /* You can't initialize twice! Close the device and start again... */ 325 /* You can't initialize twice! Close the device and start again... */
219 if (file->private_data) { 326 if (file->private_data) {
@@ -248,8 +355,10 @@ static int initialize(struct file *file, const unsigned long __user *input)
248 if (err) 355 if (err)
249 goto free_eventfds; 356 goto free_eventfds;
250 357
251 /* Initialize the Guest's shadow page tables, using the toplevel 358 /*
252 * address the Launcher gave us. This allocates memory, so can fail. */ 359 * Initialize the Guest's shadow page tables, using the toplevel
360 * address the Launcher gave us. This allocates memory, so can fail.
361 */
253 err = init_guest_pagetable(lg); 362 err = init_guest_pagetable(lg);
254 if (err) 363 if (err)
255 goto free_regs; 364 goto free_regs;
@@ -274,20 +383,24 @@ unlock:
274 return err; 383 return err;
275} 384}
276 385
277/*L:010 The first operation the Launcher does must be a write. All writes 386/*L:010
387 * The first operation the Launcher does must be a write. All writes
278 * start with an unsigned long number: for the first write this must be 388 * start with an unsigned long number: for the first write this must be
279 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use 389 * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use
280 * writes of other values to send interrupts. 390 * writes of other values to send interrupts or set up receipt of notifications.
281 * 391 *
282 * Note that we overload the "offset" in the /dev/lguest file to indicate what 392 * Note that we overload the "offset" in the /dev/lguest file to indicate what
283 * CPU number we're dealing with. Currently this is always 0, since we only 393 * CPU number we're dealing with. Currently this is always 0 since we only
284 * support uniprocessor Guests, but you can see the beginnings of SMP support 394 * support uniprocessor Guests, but you can see the beginnings of SMP support
285 * here. */ 395 * here.
396 */
286static ssize_t write(struct file *file, const char __user *in, 397static ssize_t write(struct file *file, const char __user *in,
287 size_t size, loff_t *off) 398 size_t size, loff_t *off)
288{ 399{
289 /* Once the Guest is initialized, we hold the "struct lguest" in the 400 /*
290 * file private data. */ 401 * Once the Guest is initialized, we hold the "struct lguest" in the
402 * file private data.
403 */
291 struct lguest *lg = file->private_data; 404 struct lguest *lg = file->private_data;
292 const unsigned long __user *input = (const unsigned long __user *)in; 405 const unsigned long __user *input = (const unsigned long __user *)in;
293 unsigned long req; 406 unsigned long req;
@@ -322,13 +435,15 @@ static ssize_t write(struct file *file, const char __user *in,
322 } 435 }
323} 436}
324 437
325/*L:060 The final piece of interface code is the close() routine. It reverses 438/*L:060
439 * The final piece of interface code is the close() routine. It reverses
326 * everything done in initialize(). This is usually called because the 440 * everything done in initialize(). This is usually called because the
327 * Launcher exited. 441 * Launcher exited.
328 * 442 *
329 * Note that the close routine returns 0 or a negative error number: it can't 443 * Note that the close routine returns 0 or a negative error number: it can't
330 * really fail, but it can whine. I blame Sun for this wart, and K&R C for 444 * really fail, but it can whine. I blame Sun for this wart, and K&R C for
331 * letting them do it. :*/ 445 * letting them do it.
446:*/
332static int close(struct inode *inode, struct file *file) 447static int close(struct inode *inode, struct file *file)
333{ 448{
334 struct lguest *lg = file->private_data; 449 struct lguest *lg = file->private_data;
@@ -338,8 +453,10 @@ static int close(struct inode *inode, struct file *file)
338 if (!lg) 453 if (!lg)
339 return 0; 454 return 0;
340 455
341 /* We need the big lock, to protect from inter-guest I/O and other 456 /*
342 * Launchers initializing guests. */ 457 * We need the big lock, to protect from inter-guest I/O and other
458 * Launchers initializing guests.
459 */
343 mutex_lock(&lguest_lock); 460 mutex_lock(&lguest_lock);
344 461
345 /* Free up the shadow page tables for the Guest. */ 462 /* Free up the shadow page tables for the Guest. */
@@ -350,8 +467,10 @@ static int close(struct inode *inode, struct file *file)
350 hrtimer_cancel(&lg->cpus[i].hrt); 467 hrtimer_cancel(&lg->cpus[i].hrt);
351 /* We can free up the register page we allocated. */ 468 /* We can free up the register page we allocated. */
352 free_page(lg->cpus[i].regs_page); 469 free_page(lg->cpus[i].regs_page);
353 /* Now all the memory cleanups are done, it's safe to release 470 /*
354 * the Launcher's memory management structure. */ 471 * Now all the memory cleanups are done, it's safe to release
472 * the Launcher's memory management structure.
473 */
355 mmput(lg->cpus[i].mm); 474 mmput(lg->cpus[i].mm);
356 } 475 }
357 476
@@ -360,8 +479,10 @@ static int close(struct inode *inode, struct file *file)
360 eventfd_ctx_put(lg->eventfds->map[i].event); 479 eventfd_ctx_put(lg->eventfds->map[i].event);
361 kfree(lg->eventfds); 480 kfree(lg->eventfds);
362 481
363 /* If lg->dead doesn't contain an error code it will be NULL or a 482 /*
364 * kmalloc()ed string, either of which is ok to hand to kfree(). */ 483 * If lg->dead doesn't contain an error code it will be NULL or a
484 * kmalloc()ed string, either of which is ok to hand to kfree().
485 */
365 if (!IS_ERR(lg->dead)) 486 if (!IS_ERR(lg->dead))
366 kfree(lg->dead); 487 kfree(lg->dead);
367 /* Free the memory allocated to the lguest_struct */ 488 /* Free the memory allocated to the lguest_struct */
@@ -385,7 +506,8 @@ static int close(struct inode *inode, struct file *file)
385 * 506 *
386 * We begin our understanding with the Host kernel interface which the Launcher 507 * We begin our understanding with the Host kernel interface which the Launcher
387 * uses: reading and writing a character device called /dev/lguest. All the 508 * uses: reading and writing a character device called /dev/lguest. All the
388 * work happens in the read(), write() and close() routines: */ 509 * work happens in the read(), write() and close() routines:
510 */
389static struct file_operations lguest_fops = { 511static struct file_operations lguest_fops = {
390 .owner = THIS_MODULE, 512 .owner = THIS_MODULE,
391 .release = close, 513 .release = close,
@@ -393,8 +515,10 @@ static struct file_operations lguest_fops = {
393 .read = read, 515 .read = read,
394}; 516};
395 517
396/* This is a textbook example of a "misc" character device. Populate a "struct 518/*
397 * miscdevice" and register it with misc_register(). */ 519 * This is a textbook example of a "misc" character device. Populate a "struct
520 * miscdevice" and register it with misc_register().
521 */
398static struct miscdevice lguest_dev = { 522static struct miscdevice lguest_dev = {
399 .minor = MISC_DYNAMIC_MINOR, 523 .minor = MISC_DYNAMIC_MINOR,
400 .name = "lguest", 524 .name = "lguest",
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index a6fe1abda240..a8d0aee3bc0e 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -1,9 +1,11 @@
1/*P:700 The pagetable code, on the other hand, still shows the scars of 1/*P:700
2 * The pagetable code, on the other hand, still shows the scars of
2 * previous encounters. It's functional, and as neat as it can be in the 3 * previous encounters. It's functional, and as neat as it can be in the
3 * circumstances, but be wary, for these things are subtle and break easily. 4 * circumstances, but be wary, for these things are subtle and break easily.
4 * The Guest provides a virtual to physical mapping, but we can neither trust 5 * The Guest provides a virtual to physical mapping, but we can neither trust
5 * it nor use it: we verify and convert it here then point the CPU to the 6 * it nor use it: we verify and convert it here then point the CPU to the
6 * converted Guest pages when running the Guest. :*/ 7 * converted Guest pages when running the Guest.
8:*/
7 9
8/* Copyright (C) Rusty Russell IBM Corporation 2006. 10/* Copyright (C) Rusty Russell IBM Corporation 2006.
9 * GPL v2 and any later version */ 11 * GPL v2 and any later version */
@@ -17,18 +19,20 @@
17#include <asm/bootparam.h> 19#include <asm/bootparam.h>
18#include "lg.h" 20#include "lg.h"
19 21
20/*M:008 We hold reference to pages, which prevents them from being swapped. 22/*M:008
23 * We hold reference to pages, which prevents them from being swapped.
21 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants 24 * It'd be nice to have a callback in the "struct mm_struct" when Linux wants
22 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we 25 * to swap out. If we had this, and a shrinker callback to trim PTE pages, we
23 * could probably consider launching Guests as non-root. :*/ 26 * could probably consider launching Guests as non-root.
27:*/
24 28
25/*H:300 29/*H:300
26 * The Page Table Code 30 * The Page Table Code
27 * 31 *
28 * We use two-level page tables for the Guest. If you're not entirely 32 * We use two-level page tables for the Guest, or three-level with PAE. If
29 * comfortable with virtual addresses, physical addresses and page tables then 33 * you're not entirely comfortable with virtual addresses, physical addresses
30 * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with 34 * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page
31 * diagrams!). 35 * Table Handling" (with diagrams!).
32 * 36 *
33 * The Guest keeps page tables, but we maintain the actual ones here: these are 37 * The Guest keeps page tables, but we maintain the actual ones here: these are
34 * called "shadow" page tables. Which is a very Guest-centric name: these are 38 * called "shadow" page tables. Which is a very Guest-centric name: these are
@@ -45,16 +49,18 @@
45 * (v) Flushing (throwing away) page tables, 49 * (v) Flushing (throwing away) page tables,
46 * (vi) Mapping the Switcher when the Guest is about to run, 50 * (vi) Mapping the Switcher when the Guest is about to run,
47 * (vii) Setting up the page tables initially. 51 * (vii) Setting up the page tables initially.
48 :*/ 52:*/
49 53
50 54/*
51/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is 55 * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB)
52 * conveniently placed at the top 4MB, so it uses a separate, complete PTE 56 * or 512 PTE entries with PAE (2MB).
53 * page. */ 57 */
54#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) 58#define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1)
55 59
56/* For PAE we need the PMD index as well. We use the last 2MB, so we 60/*
57 * will need the last pmd entry of the last pmd page. */ 61 * For PAE we need the PMD index as well. We use the last 2MB, so we
62 * will need the last pmd entry of the last pmd page.
63 */
58#ifdef CONFIG_X86_PAE 64#ifdef CONFIG_X86_PAE
59#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) 65#define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1)
60#define RESERVE_MEM 2U 66#define RESERVE_MEM 2U
@@ -64,14 +70,18 @@
64#define CHECK_GPGD_MASK _PAGE_TABLE 70#define CHECK_GPGD_MASK _PAGE_TABLE
65#endif 71#endif
66 72
67/* We actually need a separate PTE page for each CPU. Remember that after the 73/*
74 * We actually need a separate PTE page for each CPU. Remember that after the
68 * Switcher code itself comes two pages for each CPU, and we don't want this 75 * Switcher code itself comes two pages for each CPU, and we don't want this
69 * CPU's guest to see the pages of any other CPU. */ 76 * CPU's guest to see the pages of any other CPU.
77 */
70static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); 78static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
71#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) 79#define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu)
72 80
73/*H:320 The page table code is curly enough to need helper functions to keep it 81/*H:320
74 * clear and clean. 82 * The page table code is curly enough to need helper functions to keep it
83 * clear and clean. The kernel itself provides many of them; one advantage
84 * of insisting that the Guest and Host use the same CONFIG_PAE setting.
75 * 85 *
76 * There are two functions which return pointers to the shadow (aka "real") 86 * There are two functions which return pointers to the shadow (aka "real")
77 * page tables. 87 * page tables.
@@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages);
79 * spgd_addr() takes the virtual address and returns a pointer to the top-level 89 * spgd_addr() takes the virtual address and returns a pointer to the top-level
80 * page directory entry (PGD) for that address. Since we keep track of several 90 * page directory entry (PGD) for that address. Since we keep track of several
81 * page tables, the "i" argument tells us which one we're interested in (it's 91 * page tables, the "i" argument tells us which one we're interested in (it's
82 * usually the current one). */ 92 * usually the current one).
93 */
83static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) 94static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
84{ 95{
85 unsigned int index = pgd_index(vaddr); 96 unsigned int index = pgd_index(vaddr);
@@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr)
96} 107}
97 108
98#ifdef CONFIG_X86_PAE 109#ifdef CONFIG_X86_PAE
99/* This routine then takes the PGD entry given above, which contains the 110/*
111 * This routine then takes the PGD entry given above, which contains the
100 * address of the PMD page. It then returns a pointer to the PMD entry for the 112 * address of the PMD page. It then returns a pointer to the PMD entry for the
101 * given address. */ 113 * given address.
114 */
102static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 115static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
103{ 116{
104 unsigned int index = pmd_index(vaddr); 117 unsigned int index = pmd_index(vaddr);
@@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
119} 132}
120#endif 133#endif
121 134
122/* This routine then takes the page directory entry returned above, which 135/*
136 * This routine then takes the page directory entry returned above, which
123 * contains the address of the page table entry (PTE) page. It then returns a 137 * contains the address of the page table entry (PTE) page. It then returns a
124 * pointer to the PTE entry for the given address. */ 138 * pointer to the PTE entry for the given address.
139 */
125static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) 140static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
126{ 141{
127#ifdef CONFIG_X86_PAE 142#ifdef CONFIG_X86_PAE
@@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr)
139 return &page[pte_index(vaddr)]; 154 return &page[pte_index(vaddr)];
140} 155}
141 156
142/* These two functions just like the above two, except they access the Guest 157/*
143 * page tables. Hence they return a Guest address. */ 158 * These functions are just like the above two, except they access the Guest
159 * page tables. Hence they return a Guest address.
160 */
144static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) 161static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
145{ 162{
146 unsigned int index = vaddr >> (PGDIR_SHIFT); 163 unsigned int index = vaddr >> (PGDIR_SHIFT);
@@ -148,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr)
148} 165}
149 166
150#ifdef CONFIG_X86_PAE 167#ifdef CONFIG_X86_PAE
168/* Follow the PGD to the PMD. */
151static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) 169static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
152{ 170{
153 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; 171 unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT;
@@ -155,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr)
155 return gpage + pmd_index(vaddr) * sizeof(pmd_t); 173 return gpage + pmd_index(vaddr) * sizeof(pmd_t);
156} 174}
157 175
176/* Follow the PMD to the PTE. */
158static unsigned long gpte_addr(struct lg_cpu *cpu, 177static unsigned long gpte_addr(struct lg_cpu *cpu,
159 pmd_t gpmd, unsigned long vaddr) 178 pmd_t gpmd, unsigned long vaddr)
160{ 179{
@@ -164,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
164 return gpage + pte_index(vaddr) * sizeof(pte_t); 183 return gpage + pte_index(vaddr) * sizeof(pte_t);
165} 184}
166#else 185#else
186/* Follow the PGD to the PTE (no mid-level for !PAE). */
167static unsigned long gpte_addr(struct lg_cpu *cpu, 187static unsigned long gpte_addr(struct lg_cpu *cpu,
168 pgd_t gpgd, unsigned long vaddr) 188 pgd_t gpgd, unsigned long vaddr)
169{ 189{
@@ -175,17 +195,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu,
175#endif 195#endif
176/*:*/ 196/*:*/
177 197
178/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as 198/*M:014
179 * an optimization (ie. pre-faulting). :*/ 199 * get_pfn is slow: we could probably try to grab batches of pages here as
200 * an optimization (ie. pre-faulting).
201:*/
180 202
181/*H:350 This routine takes a page number given by the Guest and converts it to 203/*H:350
204 * This routine takes a page number given by the Guest and converts it to
182 * an actual, physical page number. It can fail for several reasons: the 205 * an actual, physical page number. It can fail for several reasons: the
183 * virtual address might not be mapped by the Launcher, the write flag is set 206 * virtual address might not be mapped by the Launcher, the write flag is set
184 * and the page is read-only, or the write flag was set and the page was 207 * and the page is read-only, or the write flag was set and the page was
185 * shared so had to be copied, but we ran out of memory. 208 * shared so had to be copied, but we ran out of memory.
186 * 209 *
187 * This holds a reference to the page, so release_pte() is careful to put that 210 * This holds a reference to the page, so release_pte() is careful to put that
188 * back. */ 211 * back.
212 */
189static unsigned long get_pfn(unsigned long virtpfn, int write) 213static unsigned long get_pfn(unsigned long virtpfn, int write)
190{ 214{
191 struct page *page; 215 struct page *page;
@@ -198,33 +222,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write)
198 return -1UL; 222 return -1UL;
199} 223}
200 224
201/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table 225/*H:340
226 * Converting a Guest page table entry to a shadow (ie. real) page table
202 * entry can be a little tricky. The flags are (almost) the same, but the 227 * entry can be a little tricky. The flags are (almost) the same, but the
203 * Guest PTE contains a virtual page number: the CPU needs the real page 228 * Guest PTE contains a virtual page number: the CPU needs the real page
204 * number. */ 229 * number.
230 */
205static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) 231static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
206{ 232{
207 unsigned long pfn, base, flags; 233 unsigned long pfn, base, flags;
208 234
209 /* The Guest sets the global flag, because it thinks that it is using 235 /*
236 * The Guest sets the global flag, because it thinks that it is using
210 * PGE. We only told it to use PGE so it would tell us whether it was 237 * PGE. We only told it to use PGE so it would tell us whether it was
211 * flushing a kernel mapping or a userspace mapping. We don't actually 238 * flushing a kernel mapping or a userspace mapping. We don't actually
212 * use the global bit, so throw it away. */ 239 * use the global bit, so throw it away.
240 */
213 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); 241 flags = (pte_flags(gpte) & ~_PAGE_GLOBAL);
214 242
215 /* The Guest's pages are offset inside the Launcher. */ 243 /* The Guest's pages are offset inside the Launcher. */
216 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; 244 base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE;
217 245
218 /* We need a temporary "unsigned long" variable to hold the answer from 246 /*
247 * We need a temporary "unsigned long" variable to hold the answer from
219 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't 248 * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't
220 * fit in spte.pfn. get_pfn() finds the real physical number of the 249 * fit in spte.pfn. get_pfn() finds the real physical number of the
221 * page, given the virtual number. */ 250 * page, given the virtual number.
251 */
222 pfn = get_pfn(base + pte_pfn(gpte), write); 252 pfn = get_pfn(base + pte_pfn(gpte), write);
223 if (pfn == -1UL) { 253 if (pfn == -1UL) {
224 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); 254 kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte));
225 /* When we destroy the Guest, we'll go through the shadow page 255 /*
256 * When we destroy the Guest, we'll go through the shadow page
226 * tables and release_pte() them. Make sure we don't think 257 * tables and release_pte() them. Make sure we don't think
227 * this one is valid! */ 258 * this one is valid!
259 */
228 flags = 0; 260 flags = 0;
229 } 261 }
230 /* Now we assemble our shadow PTE from the page number and flags. */ 262 /* Now we assemble our shadow PTE from the page number and flags. */
@@ -234,8 +266,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write)
234/*H:460 And to complete the chain, release_pte() looks like this: */ 266/*H:460 And to complete the chain, release_pte() looks like this: */
235static void release_pte(pte_t pte) 267static void release_pte(pte_t pte)
236{ 268{
237 /* Remember that get_user_pages_fast() took a reference to the page, in 269 /*
238 * get_pfn()? We have to put it back now. */ 270 * Remember that get_user_pages_fast() took a reference to the page, in
271 * get_pfn()? We have to put it back now.
272 */
239 if (pte_flags(pte) & _PAGE_PRESENT) 273 if (pte_flags(pte) & _PAGE_PRESENT)
240 put_page(pte_page(pte)); 274 put_page(pte_page(pte));
241} 275}
@@ -273,7 +307,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd)
273 * and return to the Guest without it knowing. 307 * and return to the Guest without it knowing.
274 * 308 *
275 * If we fixed up the fault (ie. we mapped the address), this routine returns 309 * If we fixed up the fault (ie. we mapped the address), this routine returns
276 * true. Otherwise, it was a real fault and we need to tell the Guest. */ 310 * true. Otherwise, it was a real fault and we need to tell the Guest.
311 */
277bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) 312bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
278{ 313{
279 pgd_t gpgd; 314 pgd_t gpgd;
@@ -282,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
282 pte_t gpte; 317 pte_t gpte;
283 pte_t *spte; 318 pte_t *spte;
284 319
320 /* Mid level for PAE. */
285#ifdef CONFIG_X86_PAE 321#ifdef CONFIG_X86_PAE
286 pmd_t *spmd; 322 pmd_t *spmd;
287 pmd_t gpmd; 323 pmd_t gpmd;
@@ -298,22 +334,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
298 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { 334 if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) {
299 /* No shadow entry: allocate a new shadow PTE page. */ 335 /* No shadow entry: allocate a new shadow PTE page. */
300 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 336 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
301 /* This is not really the Guest's fault, but killing it is 337 /*
302 * simple for this corner case. */ 338 * This is not really the Guest's fault, but killing it is
339 * simple for this corner case.
340 */
303 if (!ptepage) { 341 if (!ptepage) {
304 kill_guest(cpu, "out of memory allocating pte page"); 342 kill_guest(cpu, "out of memory allocating pte page");
305 return false; 343 return false;
306 } 344 }
307 /* We check that the Guest pgd is OK. */ 345 /* We check that the Guest pgd is OK. */
308 check_gpgd(cpu, gpgd); 346 check_gpgd(cpu, gpgd);
309 /* And we copy the flags to the shadow PGD entry. The page 347 /*
310 * number in the shadow PGD is the page we just allocated. */ 348 * And we copy the flags to the shadow PGD entry. The page
349 * number in the shadow PGD is the page we just allocated.
350 */
311 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); 351 set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd)));
312 } 352 }
313 353
314#ifdef CONFIG_X86_PAE 354#ifdef CONFIG_X86_PAE
315 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); 355 gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
316 /* middle level not present? We can't map it in. */ 356 /* Middle level not present? We can't map it in. */
317 if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) 357 if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
318 return false; 358 return false;
319 359
@@ -324,8 +364,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
324 /* No shadow entry: allocate a new shadow PTE page. */ 364 /* No shadow entry: allocate a new shadow PTE page. */
325 unsigned long ptepage = get_zeroed_page(GFP_KERNEL); 365 unsigned long ptepage = get_zeroed_page(GFP_KERNEL);
326 366
327 /* This is not really the Guest's fault, but killing it is 367 /*
328 * simple for this corner case. */ 368 * This is not really the Guest's fault, but killing it is
369 * simple for this corner case.
370 */
329 if (!ptepage) { 371 if (!ptepage) {
330 kill_guest(cpu, "out of memory allocating pte page"); 372 kill_guest(cpu, "out of memory allocating pte page");
331 return false; 373 return false;
@@ -334,27 +376,37 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
334 /* We check that the Guest pmd is OK. */ 376 /* We check that the Guest pmd is OK. */
335 check_gpmd(cpu, gpmd); 377 check_gpmd(cpu, gpmd);
336 378
337 /* And we copy the flags to the shadow PMD entry. The page 379 /*
338 * number in the shadow PMD is the page we just allocated. */ 380 * And we copy the flags to the shadow PMD entry. The page
381 * number in the shadow PMD is the page we just allocated.
382 */
339 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); 383 native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd)));
340 } 384 }
341 385
342 /* OK, now we look at the lower level in the Guest page table: keep its 386 /*
343 * address, because we might update it later. */ 387 * OK, now we look at the lower level in the Guest page table: keep its
388 * address, because we might update it later.
389 */
344 gpte_ptr = gpte_addr(cpu, gpmd, vaddr); 390 gpte_ptr = gpte_addr(cpu, gpmd, vaddr);
345#else 391#else
346 /* OK, now we look at the lower level in the Guest page table: keep its 392 /*
347 * address, because we might update it later. */ 393 * OK, now we look at the lower level in the Guest page table: keep its
394 * address, because we might update it later.
395 */
348 gpte_ptr = gpte_addr(cpu, gpgd, vaddr); 396 gpte_ptr = gpte_addr(cpu, gpgd, vaddr);
349#endif 397#endif
398
399 /* Read the actual PTE value. */
350 gpte = lgread(cpu, gpte_ptr, pte_t); 400 gpte = lgread(cpu, gpte_ptr, pte_t);
351 401
352 /* If this page isn't in the Guest page tables, we can't page it in. */ 402 /* If this page isn't in the Guest page tables, we can't page it in. */
353 if (!(pte_flags(gpte) & _PAGE_PRESENT)) 403 if (!(pte_flags(gpte) & _PAGE_PRESENT))
354 return false; 404 return false;
355 405
356 /* Check they're not trying to write to a page the Guest wants 406 /*
357 * read-only (bit 2 of errcode == write). */ 407 * Check they're not trying to write to a page the Guest wants
408 * read-only (bit 2 of errcode == write).
409 */
358 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) 410 if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW))
359 return false; 411 return false;
360 412
@@ -362,8 +414,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
362 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) 414 if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
363 return false; 415 return false;
364 416
365 /* Check that the Guest PTE flags are OK, and the page number is below 417 /*
366 * the pfn_limit (ie. not mapping the Launcher binary). */ 418 * Check that the Guest PTE flags are OK, and the page number is below
419 * the pfn_limit (ie. not mapping the Launcher binary).
420 */
367 check_gpte(cpu, gpte); 421 check_gpte(cpu, gpte);
368 422
369 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ 423 /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */
@@ -373,29 +427,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
373 427
374 /* Get the pointer to the shadow PTE entry we're going to set. */ 428 /* Get the pointer to the shadow PTE entry we're going to set. */
375 spte = spte_addr(cpu, *spgd, vaddr); 429 spte = spte_addr(cpu, *spgd, vaddr);
376 /* If there was a valid shadow PTE entry here before, we release it. 430
377 * This can happen with a write to a previously read-only entry. */ 431 /*
432 * If there was a valid shadow PTE entry here before, we release it.
433 * This can happen with a write to a previously read-only entry.
434 */
378 release_pte(*spte); 435 release_pte(*spte);
379 436
380 /* If this is a write, we insist that the Guest page is writable (the 437 /*
381 * final arg to gpte_to_spte()). */ 438 * If this is a write, we insist that the Guest page is writable (the
439 * final arg to gpte_to_spte()).
440 */
382 if (pte_dirty(gpte)) 441 if (pte_dirty(gpte))
383 *spte = gpte_to_spte(cpu, gpte, 1); 442 *spte = gpte_to_spte(cpu, gpte, 1);
384 else 443 else
385 /* If this is a read, don't set the "writable" bit in the page 444 /*
445 * If this is a read, don't set the "writable" bit in the page
386 * table entry, even if the Guest says it's writable. That way 446 * table entry, even if the Guest says it's writable. That way
387 * we will come back here when a write does actually occur, so 447 * we will come back here when a write does actually occur, so
388 * we can update the Guest's _PAGE_DIRTY flag. */ 448 * we can update the Guest's _PAGE_DIRTY flag.
449 */
389 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); 450 native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0));
390 451
391 /* Finally, we write the Guest PTE entry back: we've set the 452 /*
392 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ 453 * Finally, we write the Guest PTE entry back: we've set the
454 * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags.
455 */
393 lgwrite(cpu, gpte_ptr, pte_t, gpte); 456 lgwrite(cpu, gpte_ptr, pte_t, gpte);
394 457
395 /* The fault is fixed, the page table is populated, the mapping 458 /*
459 * The fault is fixed, the page table is populated, the mapping
396 * manipulated, the result returned and the code complete. A small 460 * manipulated, the result returned and the code complete. A small
397 * delay and a trace of alliteration are the only indications the Guest 461 * delay and a trace of alliteration are the only indications the Guest
398 * has that a page fault occurred at all. */ 462 * has that a page fault occurred at all.
463 */
399 return true; 464 return true;
400} 465}
401 466
@@ -408,7 +473,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
408 * mapped, so it's overkill. 473 * mapped, so it's overkill.
409 * 474 *
410 * This is a quick version which answers the question: is this virtual address 475 * This is a quick version which answers the question: is this virtual address
411 * mapped by the shadow page tables, and is it writable? */ 476 * mapped by the shadow page tables, and is it writable?
477 */
412static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) 478static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
413{ 479{
414 pgd_t *spgd; 480 pgd_t *spgd;
@@ -428,21 +494,26 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
428 return false; 494 return false;
429#endif 495#endif
430 496
431 /* Check the flags on the pte entry itself: it must be present and 497 /*
432 * writable. */ 498 * Check the flags on the pte entry itself: it must be present and
499 * writable.
500 */
433 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); 501 flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr)));
434 502
435 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); 503 return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW);
436} 504}
437 505
438/* So, when pin_stack_pages() asks us to pin a page, we check if it's already 506/*
507 * So, when pin_stack_pages() asks us to pin a page, we check if it's already
439 * in the page tables, and if not, we call demand_page() with error code 2 508 * in the page tables, and if not, we call demand_page() with error code 2
440 * (meaning "write"). */ 509 * (meaning "write").
510 */
441void pin_page(struct lg_cpu *cpu, unsigned long vaddr) 511void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
442{ 512{
443 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) 513 if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
444 kill_guest(cpu, "bad stack page %#lx", vaddr); 514 kill_guest(cpu, "bad stack page %#lx", vaddr);
445} 515}
516/*:*/
446 517
447#ifdef CONFIG_X86_PAE 518#ifdef CONFIG_X86_PAE
448static void release_pmd(pmd_t *spmd) 519static void release_pmd(pmd_t *spmd)
@@ -479,15 +550,21 @@ static void release_pgd(pgd_t *spgd)
479} 550}
480 551
481#else /* !CONFIG_X86_PAE */ 552#else /* !CONFIG_X86_PAE */
482/*H:450 If we chase down the release_pgd() code, it looks like this: */ 553/*H:450
554 * If we chase down the release_pgd() code, the non-PAE version looks like
555 * this. The PAE version is almost identical, but instead of calling
556 * release_pte it calls release_pmd(), which looks much like this.
557 */
483static void release_pgd(pgd_t *spgd) 558static void release_pgd(pgd_t *spgd)
484{ 559{
485 /* If the entry's not present, there's nothing to release. */ 560 /* If the entry's not present, there's nothing to release. */
486 if (pgd_flags(*spgd) & _PAGE_PRESENT) { 561 if (pgd_flags(*spgd) & _PAGE_PRESENT) {
487 unsigned int i; 562 unsigned int i;
488 /* Converting the pfn to find the actual PTE page is easy: turn 563 /*
564 * Converting the pfn to find the actual PTE page is easy: turn
489 * the page number into a physical address, then convert to a 565 * the page number into a physical address, then convert to a
490 * virtual address (easy for kernel pages like this one). */ 566 * virtual address (easy for kernel pages like this one).
567 */
491 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 568 pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
492 /* For each entry in the page, we might need to release it. */ 569 /* For each entry in the page, we might need to release it. */
493 for (i = 0; i < PTRS_PER_PTE; i++) 570 for (i = 0; i < PTRS_PER_PTE; i++)
@@ -499,9 +576,12 @@ static void release_pgd(pgd_t *spgd)
499 } 576 }
500} 577}
501#endif 578#endif
502/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() 579
580/*H:445
581 * We saw flush_user_mappings() twice: once from the flush_user_mappings()
503 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. 582 * hypercall and once in new_pgdir() when we re-used a top-level pgdir page.
504 * It simply releases every PTE page from 0 up to the Guest's kernel address. */ 583 * It simply releases every PTE page from 0 up to the Guest's kernel address.
584 */
505static void flush_user_mappings(struct lguest *lg, int idx) 585static void flush_user_mappings(struct lguest *lg, int idx)
506{ 586{
507 unsigned int i; 587 unsigned int i;
@@ -510,10 +590,12 @@ static void flush_user_mappings(struct lguest *lg, int idx)
510 release_pgd(lg->pgdirs[idx].pgdir + i); 590 release_pgd(lg->pgdirs[idx].pgdir + i);
511} 591}
512 592
513/*H:440 (v) Flushing (throwing away) page tables, 593/*H:440
594 * (v) Flushing (throwing away) page tables,
514 * 595 *
515 * The Guest has a hypercall to throw away the page tables: it's used when a 596 * The Guest has a hypercall to throw away the page tables: it's used when a
516 * large number of mappings have been changed. */ 597 * large number of mappings have been changed.
598 */
517void guest_pagetable_flush_user(struct lg_cpu *cpu) 599void guest_pagetable_flush_user(struct lg_cpu *cpu)
518{ 600{
519 /* Drop the userspace part of the current page table. */ 601 /* Drop the userspace part of the current page table. */
@@ -551,9 +633,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
551 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); 633 return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
552} 634}
553 635
554/* We keep several page tables. This is a simple routine to find the page 636/*
637 * We keep several page tables. This is a simple routine to find the page
555 * table (if any) corresponding to this top-level address the Guest has given 638 * table (if any) corresponding to this top-level address the Guest has given
556 * us. */ 639 * us.
640 */
557static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) 641static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
558{ 642{
559 unsigned int i; 643 unsigned int i;
@@ -563,9 +647,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable)
563 return i; 647 return i;
564} 648}
565 649
566/*H:435 And this is us, creating the new page directory. If we really do 650/*H:435
651 * And this is us, creating the new page directory. If we really do
567 * allocate a new one (and so the kernel parts are not there), we set 652 * allocate a new one (and so the kernel parts are not there), we set
568 * blank_pgdir. */ 653 * blank_pgdir.
654 */
569static unsigned int new_pgdir(struct lg_cpu *cpu, 655static unsigned int new_pgdir(struct lg_cpu *cpu,
570 unsigned long gpgdir, 656 unsigned long gpgdir,
571 int *blank_pgdir) 657 int *blank_pgdir)
@@ -575,8 +661,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
575 pmd_t *pmd_table; 661 pmd_t *pmd_table;
576#endif 662#endif
577 663
578 /* We pick one entry at random to throw out. Choosing the Least 664 /*
579 * Recently Used might be better, but this is easy. */ 665 * We pick one entry at random to throw out. Choosing the Least
666 * Recently Used might be better, but this is easy.
667 */
580 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); 668 next = random32() % ARRAY_SIZE(cpu->lg->pgdirs);
581 /* If it's never been allocated at all before, try now. */ 669 /* If it's never been allocated at all before, try now. */
582 if (!cpu->lg->pgdirs[next].pgdir) { 670 if (!cpu->lg->pgdirs[next].pgdir) {
@@ -587,8 +675,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
587 next = cpu->cpu_pgd; 675 next = cpu->cpu_pgd;
588 else { 676 else {
589#ifdef CONFIG_X86_PAE 677#ifdef CONFIG_X86_PAE
590 /* In PAE mode, allocate a pmd page and populate the 678 /*
591 * last pgd entry. */ 679 * In PAE mode, allocate a pmd page and populate the
680 * last pgd entry.
681 */
592 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); 682 pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL);
593 if (!pmd_table) { 683 if (!pmd_table) {
594 free_page((long)cpu->lg->pgdirs[next].pgdir); 684 free_page((long)cpu->lg->pgdirs[next].pgdir);
@@ -598,8 +688,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
598 set_pgd(cpu->lg->pgdirs[next].pgdir + 688 set_pgd(cpu->lg->pgdirs[next].pgdir +
599 SWITCHER_PGD_INDEX, 689 SWITCHER_PGD_INDEX,
600 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 690 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
601 /* This is a blank page, so there are no kernel 691 /*
602 * mappings: caller must map the stack! */ 692 * This is a blank page, so there are no kernel
693 * mappings: caller must map the stack!
694 */
603 *blank_pgdir = 1; 695 *blank_pgdir = 1;
604 } 696 }
605#else 697#else
@@ -615,19 +707,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu,
615 return next; 707 return next;
616} 708}
617 709
618/*H:430 (iv) Switching page tables 710/*H:430
711 * (iv) Switching page tables
619 * 712 *
620 * Now we've seen all the page table setting and manipulation, let's see 713 * Now we've seen all the page table setting and manipulation, let's see
621 * what happens when the Guest changes page tables (ie. changes the top-level 714 * what happens when the Guest changes page tables (ie. changes the top-level
622 * pgdir). This occurs on almost every context switch. */ 715 * pgdir). This occurs on almost every context switch.
716 */
623void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) 717void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
624{ 718{
625 int newpgdir, repin = 0; 719 int newpgdir, repin = 0;
626 720
627 /* Look to see if we have this one already. */ 721 /* Look to see if we have this one already. */
628 newpgdir = find_pgdir(cpu->lg, pgtable); 722 newpgdir = find_pgdir(cpu->lg, pgtable);
629 /* If not, we allocate or mug an existing one: if it's a fresh one, 723 /*
630 * repin gets set to 1. */ 724 * If not, we allocate or mug an existing one: if it's a fresh one,
725 * repin gets set to 1.
726 */
631 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) 727 if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs))
632 newpgdir = new_pgdir(cpu, pgtable, &repin); 728 newpgdir = new_pgdir(cpu, pgtable, &repin);
633 /* Change the current pgd index to the new one. */ 729 /* Change the current pgd index to the new one. */
@@ -637,9 +733,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable)
637 pin_stack_pages(cpu); 733 pin_stack_pages(cpu);
638} 734}
639 735
640/*H:470 Finally, a routine which throws away everything: all PGD entries in all 736/*H:470
737 * Finally, a routine which throws away everything: all PGD entries in all
641 * the shadow page tables, including the Guest's kernel mappings. This is used 738 * the shadow page tables, including the Guest's kernel mappings. This is used
642 * when we destroy the Guest. */ 739 * when we destroy the Guest.
740 */
643static void release_all_pagetables(struct lguest *lg) 741static void release_all_pagetables(struct lguest *lg)
644{ 742{
645 unsigned int i, j; 743 unsigned int i, j;
@@ -656,8 +754,10 @@ static void release_all_pagetables(struct lguest *lg)
656 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; 754 spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX;
657 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); 755 pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT);
658 756
659 /* And release the pmd entries of that pmd page, 757 /*
660 * except for the switcher pmd. */ 758 * And release the pmd entries of that pmd page,
759 * except for the switcher pmd.
760 */
661 for (k = 0; k < SWITCHER_PMD_INDEX; k++) 761 for (k = 0; k < SWITCHER_PMD_INDEX; k++)
662 release_pmd(&pmdpage[k]); 762 release_pmd(&pmdpage[k]);
663#endif 763#endif
@@ -667,10 +767,12 @@ static void release_all_pagetables(struct lguest *lg)
667 } 767 }
668} 768}
669 769
670/* We also throw away everything when a Guest tells us it's changed a kernel 770/*
771 * We also throw away everything when a Guest tells us it's changed a kernel
671 * mapping. Since kernel mappings are in every page table, it's easiest to 772 * mapping. Since kernel mappings are in every page table, it's easiest to
672 * throw them all away. This traps the Guest in amber for a while as 773 * throw them all away. This traps the Guest in amber for a while as
673 * everything faults back in, but it's rare. */ 774 * everything faults back in, but it's rare.
775 */
674void guest_pagetable_clear_all(struct lg_cpu *cpu) 776void guest_pagetable_clear_all(struct lg_cpu *cpu)
675{ 777{
676 release_all_pagetables(cpu->lg); 778 release_all_pagetables(cpu->lg);
@@ -678,15 +780,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu)
678 pin_stack_pages(cpu); 780 pin_stack_pages(cpu);
679} 781}
680/*:*/ 782/*:*/
681/*M:009 Since we throw away all mappings when a kernel mapping changes, our 783
784/*M:009
785 * Since we throw away all mappings when a kernel mapping changes, our
682 * performance sucks for guests using highmem. In fact, a guest with 786 * performance sucks for guests using highmem. In fact, a guest with
683 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is 787 * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is
684 * usually slower than a Guest with less memory. 788 * usually slower than a Guest with less memory.
685 * 789 *
686 * This, of course, cannot be fixed. It would take some kind of... well, I 790 * This, of course, cannot be fixed. It would take some kind of... well, I
687 * don't know, but the term "puissant code-fu" comes to mind. :*/ 791 * don't know, but the term "puissant code-fu" comes to mind.
792:*/
688 793
689/*H:420 This is the routine which actually sets the page table entry for then 794/*H:420
795 * This is the routine which actually sets the page table entry for then
690 * "idx"'th shadow page table. 796 * "idx"'th shadow page table.
691 * 797 *
692 * Normally, we can just throw out the old entry and replace it with 0: if they 798 * Normally, we can just throw out the old entry and replace it with 0: if they
@@ -715,31 +821,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
715 spmd = spmd_addr(cpu, *spgd, vaddr); 821 spmd = spmd_addr(cpu, *spgd, vaddr);
716 if (pmd_flags(*spmd) & _PAGE_PRESENT) { 822 if (pmd_flags(*spmd) & _PAGE_PRESENT) {
717#endif 823#endif
718 /* Otherwise, we start by releasing 824 /* Otherwise, start by releasing the existing entry. */
719 * the existing entry. */
720 pte_t *spte = spte_addr(cpu, *spgd, vaddr); 825 pte_t *spte = spte_addr(cpu, *spgd, vaddr);
721 release_pte(*spte); 826 release_pte(*spte);
722 827
723 /* If they're setting this entry as dirty or accessed, 828 /*
724 * we might as well put that entry they've given us 829 * If they're setting this entry as dirty or accessed,
725 * in now. This shaves 10% off a 830 * we might as well put that entry they've given us in
726 * copy-on-write micro-benchmark. */ 831 * now. This shaves 10% off a copy-on-write
832 * micro-benchmark.
833 */
727 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { 834 if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
728 check_gpte(cpu, gpte); 835 check_gpte(cpu, gpte);
729 native_set_pte(spte, 836 native_set_pte(spte,
730 gpte_to_spte(cpu, gpte, 837 gpte_to_spte(cpu, gpte,
731 pte_flags(gpte) & _PAGE_DIRTY)); 838 pte_flags(gpte) & _PAGE_DIRTY));
732 } else 839 } else {
733 /* Otherwise kill it and we can demand_page() 840 /*
734 * it in later. */ 841 * Otherwise kill it and we can demand_page()
842 * it in later.
843 */
735 native_set_pte(spte, __pte(0)); 844 native_set_pte(spte, __pte(0));
845 }
736#ifdef CONFIG_X86_PAE 846#ifdef CONFIG_X86_PAE
737 } 847 }
738#endif 848#endif
739 } 849 }
740} 850}
741 851
742/*H:410 Updating a PTE entry is a little trickier. 852/*H:410
853 * Updating a PTE entry is a little trickier.
743 * 854 *
744 * We keep track of several different page tables (the Guest uses one for each 855 * We keep track of several different page tables (the Guest uses one for each
745 * process, so it makes sense to cache at least a few). Each of these have 856 * process, so it makes sense to cache at least a few). Each of these have
@@ -748,12 +859,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx,
748 * all the page tables, not just the current one. This is rare. 859 * all the page tables, not just the current one. This is rare.
749 * 860 *
750 * The benefit is that when we have to track a new page table, we can keep all 861 * The benefit is that when we have to track a new page table, we can keep all
751 * the kernel mappings. This speeds up context switch immensely. */ 862 * the kernel mappings. This speeds up context switch immensely.
863 */
752void guest_set_pte(struct lg_cpu *cpu, 864void guest_set_pte(struct lg_cpu *cpu,
753 unsigned long gpgdir, unsigned long vaddr, pte_t gpte) 865 unsigned long gpgdir, unsigned long vaddr, pte_t gpte)
754{ 866{
755 /* Kernel mappings must be changed on all top levels. Slow, but doesn't 867 /*
756 * happen often. */ 868 * Kernel mappings must be changed on all top levels. Slow, but doesn't
869 * happen often.
870 */
757 if (vaddr >= cpu->lg->kernel_address) { 871 if (vaddr >= cpu->lg->kernel_address) {
758 unsigned int i; 872 unsigned int i;
759 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) 873 for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++)
@@ -795,19 +909,25 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx)
795 /* ... throw it away. */ 909 /* ... throw it away. */
796 release_pgd(lg->pgdirs[pgdir].pgdir + idx); 910 release_pgd(lg->pgdirs[pgdir].pgdir + idx);
797} 911}
912
798#ifdef CONFIG_X86_PAE 913#ifdef CONFIG_X86_PAE
914/* For setting a mid-level, we just throw everything away. It's easy. */
799void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) 915void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx)
800{ 916{
801 guest_pagetable_clear_all(&lg->cpus[0]); 917 guest_pagetable_clear_all(&lg->cpus[0]);
802} 918}
803#endif 919#endif
804 920
805/* Once we know how much memory we have we can construct simple identity 921/*H:505
806 * (which set virtual == physical) and linear mappings 922 * To get through boot, we construct simple identity page mappings (which
807 * which will get the Guest far enough into the boot to create its own. 923 * set virtual == physical) and linear mappings which will get the Guest far
924 * enough into the boot to create its own. The linear mapping means we
925 * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET,
926 * as you'll see.
808 * 927 *
809 * We lay them out of the way, just below the initrd (which is why we need to 928 * We lay them out of the way, just below the initrd (which is why we need to
810 * know its size here). */ 929 * know its size here).
930 */
811static unsigned long setup_pagetables(struct lguest *lg, 931static unsigned long setup_pagetables(struct lguest *lg,
812 unsigned long mem, 932 unsigned long mem,
813 unsigned long initrd_size) 933 unsigned long initrd_size)
@@ -825,8 +945,10 @@ static unsigned long setup_pagetables(struct lguest *lg,
825 unsigned int phys_linear; 945 unsigned int phys_linear;
826#endif 946#endif
827 947
828 /* We have mapped_pages frames to map, so we need 948 /*
829 * linear_pages page tables to map them. */ 949 * We have mapped_pages frames to map, so we need linear_pages page
950 * tables to map them.
951 */
830 mapped_pages = mem / PAGE_SIZE; 952 mapped_pages = mem / PAGE_SIZE;
831 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; 953 linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE;
832 954
@@ -837,10 +959,16 @@ static unsigned long setup_pagetables(struct lguest *lg,
837 linear = (void *)pgdir - linear_pages * PAGE_SIZE; 959 linear = (void *)pgdir - linear_pages * PAGE_SIZE;
838 960
839#ifdef CONFIG_X86_PAE 961#ifdef CONFIG_X86_PAE
962 /*
963 * And the single mid page goes below that. We only use one, but
964 * that's enough to map 1G, which definitely gets us through boot.
965 */
840 pmds = (void *)linear - PAGE_SIZE; 966 pmds = (void *)linear - PAGE_SIZE;
841#endif 967#endif
842 /* Linear mapping is easy: put every page's address into the 968 /*
843 * mapping in order. */ 969 * Linear mapping is easy: put every page's address into the
970 * mapping in order.
971 */
844 for (i = 0; i < mapped_pages; i++) { 972 for (i = 0; i < mapped_pages; i++) {
845 pte_t pte; 973 pte_t pte;
846 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); 974 pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER));
@@ -848,11 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg,
848 return -EFAULT; 976 return -EFAULT;
849 } 977 }
850 978
851 /* The top level points to the linear page table pages above.
852 * We setup the identity and linear mappings here. */
853#ifdef CONFIG_X86_PAE 979#ifdef CONFIG_X86_PAE
980 /*
981 * Make the Guest PMD entries point to the corresponding place in the
982 * linear mapping (up to one page worth of PMD).
983 */
854 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; 984 for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD;
855 i += PTRS_PER_PTE, j++) { 985 i += PTRS_PER_PTE, j++) {
986 /* FIXME: native_set_pmd is overkill here. */
856 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) 987 native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i)
857 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 988 - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
858 989
@@ -860,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg,
860 return -EFAULT; 991 return -EFAULT;
861 } 992 }
862 993
994 /* One PGD entry, pointing to that PMD page. */
863 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); 995 set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT));
996 /* Copy it in as the first PGD entry (ie. addresses 0-1G). */
864 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) 997 if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0)
865 return -EFAULT; 998 return -EFAULT;
999 /*
1000 * And the third PGD entry (ie. addresses 3G-4G).
1001 *
1002 * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000.
1003 */
866 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) 1004 if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0)
867 return -EFAULT; 1005 return -EFAULT;
868#else 1006#else
1007 /*
1008 * The top level points to the linear page table pages above.
1009 * We setup the identity and linear mappings here.
1010 */
869 phys_linear = (unsigned long)linear - mem_base; 1011 phys_linear = (unsigned long)linear - mem_base;
870 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { 1012 for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) {
871 pgd_t pgd; 1013 pgd_t pgd;
1014 /*
1015 * Create a PGD entry which points to the right part of the
1016 * linear PTE pages.
1017 */
872 pgd = __pgd((phys_linear + i * sizeof(pte_t)) | 1018 pgd = __pgd((phys_linear + i * sizeof(pte_t)) |
873 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); 1019 (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER));
874 1020
1021 /*
1022 * Copy it into the PGD page at 0 and PAGE_OFFSET.
1023 */
875 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) 1024 if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd))
876 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) 1025 || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET)
877 + i / PTRS_PER_PTE], 1026 + i / PTRS_PER_PTE],
@@ -880,15 +1029,19 @@ static unsigned long setup_pagetables(struct lguest *lg,
880 } 1029 }
881#endif 1030#endif
882 1031
883 /* We return the top level (guest-physical) address: remember where 1032 /*
884 * this is. */ 1033 * We return the top level (guest-physical) address: we remember where
1034 * this is to write it into lguest_data when the Guest initializes.
1035 */
885 return (unsigned long)pgdir - mem_base; 1036 return (unsigned long)pgdir - mem_base;
886} 1037}
887 1038
888/*H:500 (vii) Setting up the page tables initially. 1039/*H:500
1040 * (vii) Setting up the page tables initially.
889 * 1041 *
890 * When a Guest is first created, the Launcher tells us where the toplevel of 1042 * When a Guest is first created, the Launcher tells us where the toplevel of
891 * its first page table is. We set some things up here: */ 1043 * its first page table is. We set some things up here:
1044 */
892int init_guest_pagetable(struct lguest *lg) 1045int init_guest_pagetable(struct lguest *lg)
893{ 1046{
894 u64 mem; 1047 u64 mem;
@@ -898,21 +1051,27 @@ int init_guest_pagetable(struct lguest *lg)
898 pgd_t *pgd; 1051 pgd_t *pgd;
899 pmd_t *pmd_table; 1052 pmd_t *pmd_table;
900#endif 1053#endif
901 /* Get the Guest memory size and the ramdisk size from the boot header 1054 /*
902 * located at lg->mem_base (Guest address 0). */ 1055 * Get the Guest memory size and the ramdisk size from the boot header
1056 * located at lg->mem_base (Guest address 0).
1057 */
903 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) 1058 if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem))
904 || get_user(initrd_size, &boot->hdr.ramdisk_size)) 1059 || get_user(initrd_size, &boot->hdr.ramdisk_size))
905 return -EFAULT; 1060 return -EFAULT;
906 1061
907 /* We start on the first shadow page table, and give it a blank PGD 1062 /*
908 * page. */ 1063 * We start on the first shadow page table, and give it a blank PGD
1064 * page.
1065 */
909 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); 1066 lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size);
910 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) 1067 if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir))
911 return lg->pgdirs[0].gpgdir; 1068 return lg->pgdirs[0].gpgdir;
912 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); 1069 lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL);
913 if (!lg->pgdirs[0].pgdir) 1070 if (!lg->pgdirs[0].pgdir)
914 return -ENOMEM; 1071 return -ENOMEM;
1072
915#ifdef CONFIG_X86_PAE 1073#ifdef CONFIG_X86_PAE
1074 /* For PAE, we also create the initial mid-level. */
916 pgd = lg->pgdirs[0].pgdir; 1075 pgd = lg->pgdirs[0].pgdir;
917 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); 1076 pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL);
918 if (!pmd_table) 1077 if (!pmd_table)
@@ -921,27 +1080,33 @@ int init_guest_pagetable(struct lguest *lg)
921 set_pgd(pgd + SWITCHER_PGD_INDEX, 1080 set_pgd(pgd + SWITCHER_PGD_INDEX,
922 __pgd(__pa(pmd_table) | _PAGE_PRESENT)); 1081 __pgd(__pa(pmd_table) | _PAGE_PRESENT));
923#endif 1082#endif
1083
1084 /* This is the current page table. */
924 lg->cpus[0].cpu_pgd = 0; 1085 lg->cpus[0].cpu_pgd = 0;
925 return 0; 1086 return 0;
926} 1087}
927 1088
928/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ 1089/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */
929void page_table_guest_data_init(struct lg_cpu *cpu) 1090void page_table_guest_data_init(struct lg_cpu *cpu)
930{ 1091{
931 /* We get the kernel address: above this is all kernel memory. */ 1092 /* We get the kernel address: above this is all kernel memory. */
932 if (get_user(cpu->lg->kernel_address, 1093 if (get_user(cpu->lg->kernel_address,
933 &cpu->lg->lguest_data->kernel_address) 1094 &cpu->lg->lguest_data->kernel_address)
934 /* We tell the Guest that it can't use the top 2 or 4 MB 1095 /*
935 * of virtual addresses used by the Switcher. */ 1096 * We tell the Guest that it can't use the top 2 or 4 MB
1097 * of virtual addresses used by the Switcher.
1098 */
936 || put_user(RESERVE_MEM * 1024 * 1024, 1099 || put_user(RESERVE_MEM * 1024 * 1024,
937 &cpu->lg->lguest_data->reserve_mem) 1100 &cpu->lg->lguest_data->reserve_mem)
938 || put_user(cpu->lg->pgdirs[0].gpgdir, 1101 || put_user(cpu->lg->pgdirs[0].gpgdir,
939 &cpu->lg->lguest_data->pgdir)) 1102 &cpu->lg->lguest_data->pgdir))
940 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); 1103 kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data);
941 1104
942 /* In flush_user_mappings() we loop from 0 to 1105 /*
1106 * In flush_user_mappings() we loop from 0 to
943 * "pgd_index(lg->kernel_address)". This assumes it won't hit the 1107 * "pgd_index(lg->kernel_address)". This assumes it won't hit the
944 * Switcher mappings, so check that now. */ 1108 * Switcher mappings, so check that now.
1109 */
945#ifdef CONFIG_X86_PAE 1110#ifdef CONFIG_X86_PAE
946 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && 1111 if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX &&
947 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) 1112 pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX)
@@ -964,12 +1129,14 @@ void free_guest_pagetable(struct lguest *lg)
964 free_page((long)lg->pgdirs[i].pgdir); 1129 free_page((long)lg->pgdirs[i].pgdir);
965} 1130}
966 1131
967/*H:480 (vi) Mapping the Switcher when the Guest is about to run. 1132/*H:480
1133 * (vi) Mapping the Switcher when the Guest is about to run.
968 * 1134 *
969 * The Switcher and the two pages for this CPU need to be visible in the 1135 * The Switcher and the two pages for this CPU need to be visible in the
970 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages 1136 * Guest (and not the pages for other CPUs). We have the appropriate PTE pages
971 * for each CPU already set up, we just need to hook them in now we know which 1137 * for each CPU already set up, we just need to hook them in now we know which
972 * Guest is about to run on this CPU. */ 1138 * Guest is about to run on this CPU.
1139 */
973void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) 1140void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
974{ 1141{
975 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); 1142 pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages);
@@ -980,30 +1147,38 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages)
980 pmd_t switcher_pmd; 1147 pmd_t switcher_pmd;
981 pmd_t *pmd_table; 1148 pmd_t *pmd_table;
982 1149
1150 /* FIXME: native_set_pmd is overkill here. */
983 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> 1151 native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >>
984 PAGE_SHIFT, PAGE_KERNEL_EXEC)); 1152 PAGE_SHIFT, PAGE_KERNEL_EXEC));
985 1153
1154 /* Figure out where the pmd page is, by reading the PGD, and converting
1155 * it to a virtual address. */
986 pmd_table = __va(pgd_pfn(cpu->lg-> 1156 pmd_table = __va(pgd_pfn(cpu->lg->
987 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) 1157 pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX])
988 << PAGE_SHIFT); 1158 << PAGE_SHIFT);
1159 /* Now write it into the shadow page table. */
989 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); 1160 native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd);
990#else 1161#else
991 pgd_t switcher_pgd; 1162 pgd_t switcher_pgd;
992 1163
993 /* Make the last PGD entry for this Guest point to the Switcher's PTE 1164 /*
994 * page for this CPU (with appropriate flags). */ 1165 * Make the last PGD entry for this Guest point to the Switcher's PTE
1166 * page for this CPU (with appropriate flags).
1167 */
995 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); 1168 switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC);
996 1169
997 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; 1170 cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd;
998 1171
999#endif 1172#endif
1000 /* We also change the Switcher PTE page. When we're running the Guest, 1173 /*
1174 * We also change the Switcher PTE page. When we're running the Guest,
1001 * we want the Guest's "regs" page to appear where the first Switcher 1175 * we want the Guest's "regs" page to appear where the first Switcher
1002 * page for this CPU is. This is an optimization: when the Switcher 1176 * page for this CPU is. This is an optimization: when the Switcher
1003 * saves the Guest registers, it saves them into the first page of this 1177 * saves the Guest registers, it saves them into the first page of this
1004 * CPU's "struct lguest_pages": if we make sure the Guest's register 1178 * CPU's "struct lguest_pages": if we make sure the Guest's register
1005 * page is already mapped there, we don't have to copy them out 1179 * page is already mapped there, we don't have to copy them out
1006 * again. */ 1180 * again.
1181 */
1007 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; 1182 pfn = __pa(cpu->regs_page) >> PAGE_SHIFT;
1008 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL)); 1183 native_set_pte(&regs_pte, pfn_pte(pfn, PAGE_KERNEL));
1009 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], 1184 native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)],
@@ -1019,10 +1194,12 @@ static void free_switcher_pte_pages(void)
1019 free_page((long)switcher_pte_page(i)); 1194 free_page((long)switcher_pte_page(i));
1020} 1195}
1021 1196
1022/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given 1197/*H:520
1198 * Setting up the Switcher PTE page for given CPU is fairly easy, given
1023 * the CPU number and the "struct page"s for the Switcher code itself. 1199 * the CPU number and the "struct page"s for the Switcher code itself.
1024 * 1200 *
1025 * Currently the Switcher is less than a page long, so "pages" is always 1. */ 1201 * Currently the Switcher is less than a page long, so "pages" is always 1.
1202 */
1026static __init void populate_switcher_pte_page(unsigned int cpu, 1203static __init void populate_switcher_pte_page(unsigned int cpu,
1027 struct page *switcher_page[], 1204 struct page *switcher_page[],
1028 unsigned int pages) 1205 unsigned int pages)
@@ -1043,13 +1220,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1043 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), 1220 native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]),
1044 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); 1221 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW)));
1045 1222
1046 /* The second page contains the "struct lguest_ro_state", and is 1223 /*
1047 * read-only. */ 1224 * The second page contains the "struct lguest_ro_state", and is
1225 * read-only.
1226 */
1048 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), 1227 native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]),
1049 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); 1228 __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED)));
1050} 1229}
1051 1230
1052/* We've made it through the page table code. Perhaps our tired brains are 1231/*
1232 * We've made it through the page table code. Perhaps our tired brains are
1053 * still processing the details, or perhaps we're simply glad it's over. 1233 * still processing the details, or perhaps we're simply glad it's over.
1054 * 1234 *
1055 * If nothing else, note that all this complexity in juggling shadow page tables 1235 * If nothing else, note that all this complexity in juggling shadow page tables
@@ -1058,10 +1238,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu,
1058 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD 1238 * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD
1059 * have implemented shadow page table support directly into hardware. 1239 * have implemented shadow page table support directly into hardware.
1060 * 1240 *
1061 * There is just one file remaining in the Host. */ 1241 * There is just one file remaining in the Host.
1242 */
1062 1243
1063/*H:510 At boot or module load time, init_pagetables() allocates and populates 1244/*H:510
1064 * the Switcher PTE page for each CPU. */ 1245 * At boot or module load time, init_pagetables() allocates and populates
1246 * the Switcher PTE page for each CPU.
1247 */
1065__init int init_pagetables(struct page **switcher_page, unsigned int pages) 1248__init int init_pagetables(struct page **switcher_page, unsigned int pages)
1066{ 1249{
1067 unsigned int i; 1250 unsigned int i;
diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c
index 482ed5a18750..951c57b0a7e0 100644
--- a/drivers/lguest/segments.c
+++ b/drivers/lguest/segments.c
@@ -1,4 +1,5 @@
1/*P:600 The x86 architecture has segments, which involve a table of descriptors 1/*P:600
2 * The x86 architecture has segments, which involve a table of descriptors
2 * which can be used to do funky things with virtual address interpretation. 3 * which can be used to do funky things with virtual address interpretation.
3 * We originally used to use segments so the Guest couldn't alter the 4 * We originally used to use segments so the Guest couldn't alter the
4 * Guest<->Host Switcher, and then we had to trim Guest segments, and restore 5 * Guest<->Host Switcher, and then we had to trim Guest segments, and restore
@@ -8,7 +9,8 @@
8 * 9 *
9 * In these modern times, the segment handling code consists of simple sanity 10 * In these modern times, the segment handling code consists of simple sanity
10 * checks, and the worst you'll experience reading this code is butterfly-rash 11 * checks, and the worst you'll experience reading this code is butterfly-rash
11 * from frolicking through its parklike serenity. :*/ 12 * from frolicking through its parklike serenity.
13:*/
12#include "lg.h" 14#include "lg.h"
13 15
14/*H:600 16/*H:600
@@ -41,10 +43,12 @@
41 * begin. 43 * begin.
42 */ 44 */
43 45
44/* There are several entries we don't let the Guest set. The TSS entry is the 46/*
47 * There are several entries we don't let the Guest set. The TSS entry is the
45 * "Task State Segment" which controls all kinds of delicate things. The 48 * "Task State Segment" which controls all kinds of delicate things. The
46 * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the 49 * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the
47 * the Guest can't be trusted to deal with double faults. */ 50 * the Guest can't be trusted to deal with double faults.
51 */
48static bool ignored_gdt(unsigned int num) 52static bool ignored_gdt(unsigned int num)
49{ 53{
50 return (num == GDT_ENTRY_TSS 54 return (num == GDT_ENTRY_TSS
@@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num)
53 || num == GDT_ENTRY_DOUBLEFAULT_TSS); 57 || num == GDT_ENTRY_DOUBLEFAULT_TSS);
54} 58}
55 59
56/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We 60/*H:630
61 * Once the Guest gave us new GDT entries, we fix them up a little. We
57 * don't care if they're invalid: the worst that can happen is a General 62 * don't care if they're invalid: the worst that can happen is a General
58 * Protection Fault in the Switcher when it restores a Guest segment register 63 * Protection Fault in the Switcher when it restores a Guest segment register
59 * which tries to use that entry. Then we kill the Guest for causing such a 64 * which tries to use that entry. Then we kill the Guest for causing such a
60 * mess: the message will be "unhandled trap 256". */ 65 * mess: the message will be "unhandled trap 256".
66 */
61static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) 67static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end)
62{ 68{
63 unsigned int i; 69 unsigned int i;
64 70
65 for (i = start; i < end; i++) { 71 for (i = start; i < end; i++) {
66 /* We never copy these ones to real GDT, so we don't care what 72 /*
67 * they say */ 73 * We never copy these ones to real GDT, so we don't care what
74 * they say
75 */
68 if (ignored_gdt(i)) 76 if (ignored_gdt(i))
69 continue; 77 continue;
70 78
71 /* Segment descriptors contain a privilege level: the Guest is 79 /*
80 * Segment descriptors contain a privilege level: the Guest is
72 * sometimes careless and leaves this as 0, even though it's 81 * sometimes careless and leaves this as 0, even though it's
73 * running at privilege level 1. If so, we fix it here. */ 82 * running at privilege level 1. If so, we fix it here.
83 */
74 if ((cpu->arch.gdt[i].b & 0x00006000) == 0) 84 if ((cpu->arch.gdt[i].b & 0x00006000) == 0)
75 cpu->arch.gdt[i].b |= (GUEST_PL << 13); 85 cpu->arch.gdt[i].b |= (GUEST_PL << 13);
76 86
77 /* Each descriptor has an "accessed" bit. If we don't set it 87 /*
88 * Each descriptor has an "accessed" bit. If we don't set it
78 * now, the CPU will try to set it when the Guest first loads 89 * now, the CPU will try to set it when the Guest first loads
79 * that entry into a segment register. But the GDT isn't 90 * that entry into a segment register. But the GDT isn't
80 * writable by the Guest, so bad things can happen. */ 91 * writable by the Guest, so bad things can happen.
92 */
81 cpu->arch.gdt[i].b |= 0x00000100; 93 cpu->arch.gdt[i].b |= 0x00000100;
82 } 94 }
83} 95}
84 96
85/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep 97/*H:610
98 * Like the IDT, we never simply use the GDT the Guest gives us. We keep
86 * a GDT for each CPU, and copy across the Guest's entries each time we want to 99 * a GDT for each CPU, and copy across the Guest's entries each time we want to
87 * run the Guest on that CPU. 100 * run the Guest on that CPU.
88 * 101 *
89 * This routine is called at boot or modprobe time for each CPU to set up the 102 * This routine is called at boot or modprobe time for each CPU to set up the
90 * constant GDT entries: the ones which are the same no matter what Guest we're 103 * constant GDT entries: the ones which are the same no matter what Guest we're
91 * running. */ 104 * running.
105 */
92void setup_default_gdt_entries(struct lguest_ro_state *state) 106void setup_default_gdt_entries(struct lguest_ro_state *state)
93{ 107{
94 struct desc_struct *gdt = state->guest_gdt; 108 struct desc_struct *gdt = state->guest_gdt;
@@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state)
98 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 112 gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
99 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 113 gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
100 114
101 /* The TSS segment refers to the TSS entry for this particular CPU. 115 /*
116 * The TSS segment refers to the TSS entry for this particular CPU.
102 * Forgive the magic flags: the 0x8900 means the entry is Present, it's 117 * Forgive the magic flags: the 0x8900 means the entry is Present, it's
103 * privilege level 0 Available 386 TSS system segment, and the 0x67 118 * privilege level 0 Available 386 TSS system segment, and the 0x67
104 * means Saturn is eclipsed by Mercury in the twelfth house. */ 119 * means Saturn is eclipsed by Mercury in the twelfth house.
120 */
105 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); 121 gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16);
106 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) 122 gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000)
107 | ((tss >> 16) & 0x000000FF); 123 | ((tss >> 16) & 0x000000FF);
108} 124}
109 125
110/* This routine sets up the initial Guest GDT for booting. All entries start 126/*
111 * as 0 (unusable). */ 127 * This routine sets up the initial Guest GDT for booting. All entries start
128 * as 0 (unusable).
129 */
112void setup_guest_gdt(struct lg_cpu *cpu) 130void setup_guest_gdt(struct lg_cpu *cpu)
113{ 131{
114 /* Start with full 0-4G segments... */ 132 /*
133 * Start with full 0-4G segments...except the Guest is allowed to use
134 * them, so set the privilege level appropriately in the flags.
135 */
115 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; 136 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT;
116 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; 137 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT;
117 /* ...except the Guest is allowed to use them, so set the privilege
118 * level appropriately in the flags. */
119 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); 138 cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13);
120 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); 139 cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13);
121} 140}
122 141
123/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" 142/*H:650
124 * entries. */ 143 * An optimization of copy_gdt(), for just the three "thead-local storage"
144 * entries.
145 */
125void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) 146void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
126{ 147{
127 unsigned int i; 148 unsigned int i;
@@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt)
130 gdt[i] = cpu->arch.gdt[i]; 151 gdt[i] = cpu->arch.gdt[i];
131} 152}
132 153
133/*H:640 When the Guest is run on a different CPU, or the GDT entries have 154/*H:640
134 * changed, copy_gdt() is called to copy the Guest's GDT entries across to this 155 * When the Guest is run on a different CPU, or the GDT entries have changed,
135 * CPU's GDT. */ 156 * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's
157 * GDT.
158 */
136void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) 159void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt)
137{ 160{
138 unsigned int i; 161 unsigned int i;
139 162
140 /* The default entries from setup_default_gdt_entries() are not 163 /*
141 * replaced. See ignored_gdt() above. */ 164 * The default entries from setup_default_gdt_entries() are not
165 * replaced. See ignored_gdt() above.
166 */
142 for (i = 0; i < GDT_ENTRIES; i++) 167 for (i = 0; i < GDT_ENTRIES; i++)
143 if (!ignored_gdt(i)) 168 if (!ignored_gdt(i))
144 gdt[i] = cpu->arch.gdt[i]; 169 gdt[i] = cpu->arch.gdt[i];
145} 170}
146 171
147/*H:620 This is where the Guest asks us to load a new GDT entry 172/*H:620
148 * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ 173 * This is where the Guest asks us to load a new GDT entry
174 * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in.
175 */
149void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) 176void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
150{ 177{
151 /* We assume the Guest has the same number of GDT entries as the 178 /*
152 * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ 179 * We assume the Guest has the same number of GDT entries as the
180 * Host, otherwise we'd have to dynamically allocate the Guest GDT.
181 */
153 if (num >= ARRAY_SIZE(cpu->arch.gdt)) 182 if (num >= ARRAY_SIZE(cpu->arch.gdt))
154 kill_guest(cpu, "too many gdt entries %i", num); 183 kill_guest(cpu, "too many gdt entries %i", num);
155 184
@@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi)
157 cpu->arch.gdt[num].a = lo; 186 cpu->arch.gdt[num].a = lo;
158 cpu->arch.gdt[num].b = hi; 187 cpu->arch.gdt[num].b = hi;
159 fixup_gdt_table(cpu, num, num+1); 188 fixup_gdt_table(cpu, num, num+1);
160 /* Mark that the GDT changed so the core knows it has to copy it again, 189 /*
161 * even if the Guest is run on the same CPU. */ 190 * Mark that the GDT changed so the core knows it has to copy it again,
191 * even if the Guest is run on the same CPU.
192 */
162 cpu->changed |= CHANGED_GDT; 193 cpu->changed |= CHANGED_GDT;
163} 194}
164 195
165/* This is the fast-track version for just changing the three TLS entries. 196/*
197 * This is the fast-track version for just changing the three TLS entries.
166 * Remember that this happens on every context switch, so it's worth 198 * Remember that this happens on every context switch, so it's worth
167 * optimizing. But wouldn't it be neater to have a single hypercall to cover 199 * optimizing. But wouldn't it be neater to have a single hypercall to cover
168 * both cases? */ 200 * both cases?
201 */
169void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) 202void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
170{ 203{
171 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; 204 struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN];
@@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls)
175 /* Note that just the TLS entries have changed. */ 208 /* Note that just the TLS entries have changed. */
176 cpu->changed |= CHANGED_GDT_TLS; 209 cpu->changed |= CHANGED_GDT_TLS;
177} 210}
178/*:*/
179 211
180/*H:660 212/*H:660
181 * With this, we have finished the Host. 213 * With this, we have finished the Host.
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index eaf722fe309a..6ae388849a3b 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -17,13 +17,15 @@
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20/*P:450 This file contains the x86-specific lguest code. It used to be all 20/*P:450
21 * This file contains the x86-specific lguest code. It used to be all
21 * mixed in with drivers/lguest/core.c but several foolhardy code slashers 22 * mixed in with drivers/lguest/core.c but several foolhardy code slashers
22 * wrestled most of the dependencies out to here in preparation for porting 23 * wrestled most of the dependencies out to here in preparation for porting
23 * lguest to other architectures (see what I mean by foolhardy?). 24 * lguest to other architectures (see what I mean by foolhardy?).
24 * 25 *
25 * This also contains a couple of non-obvious setup and teardown pieces which 26 * This also contains a couple of non-obvious setup and teardown pieces which
26 * were implemented after days of debugging pain. :*/ 27 * were implemented after days of debugging pain.
28:*/
27#include <linux/kernel.h> 29#include <linux/kernel.h>
28#include <linux/start_kernel.h> 30#include <linux/start_kernel.h>
29#include <linux/string.h> 31#include <linux/string.h>
@@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu);
82 */ 84 */
83static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) 85static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages)
84{ 86{
85 /* Copying all this data can be quite expensive. We usually run the 87 /*
88 * Copying all this data can be quite expensive. We usually run the
86 * same Guest we ran last time (and that Guest hasn't run anywhere else 89 * same Guest we ran last time (and that Guest hasn't run anywhere else
87 * meanwhile). If that's not the case, we pretend everything in the 90 * meanwhile). If that's not the case, we pretend everything in the
88 * Guest has changed. */ 91 * Guest has changed.
92 */
89 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { 93 if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) {
90 __get_cpu_var(last_cpu) = cpu; 94 __get_cpu_var(last_cpu) = cpu;
91 cpu->last_pages = pages; 95 cpu->last_pages = pages;
92 cpu->changed = CHANGED_ALL; 96 cpu->changed = CHANGED_ALL;
93 } 97 }
94 98
95 /* These copies are pretty cheap, so we do them unconditionally: */ 99 /*
96 /* Save the current Host top-level page directory. */ 100 * These copies are pretty cheap, so we do them unconditionally: */
101 /* Save the current Host top-level page directory.
102 */
97 pages->state.host_cr3 = __pa(current->mm->pgd); 103 pages->state.host_cr3 = __pa(current->mm->pgd);
98 /* Set up the Guest's page tables to see this CPU's pages (and no 104 /*
99 * other CPU's pages). */ 105 * Set up the Guest's page tables to see this CPU's pages (and no
106 * other CPU's pages).
107 */
100 map_switcher_in_guest(cpu, pages); 108 map_switcher_in_guest(cpu, pages);
101 /* Set up the two "TSS" members which tell the CPU what stack to use 109 /*
110 * Set up the two "TSS" members which tell the CPU what stack to use
102 * for traps which do directly into the Guest (ie. traps at privilege 111 * for traps which do directly into the Guest (ie. traps at privilege
103 * level 1). */ 112 * level 1).
113 */
104 pages->state.guest_tss.sp1 = cpu->esp1; 114 pages->state.guest_tss.sp1 = cpu->esp1;
105 pages->state.guest_tss.ss1 = cpu->ss1; 115 pages->state.guest_tss.ss1 = cpu->ss1;
106 116
@@ -125,97 +135,126 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
125 /* This is a dummy value we need for GCC's sake. */ 135 /* This is a dummy value we need for GCC's sake. */
126 unsigned int clobber; 136 unsigned int clobber;
127 137
128 /* Copy the guest-specific information into this CPU's "struct 138 /*
129 * lguest_pages". */ 139 * Copy the guest-specific information into this CPU's "struct
140 * lguest_pages".
141 */
130 copy_in_guest_info(cpu, pages); 142 copy_in_guest_info(cpu, pages);
131 143
132 /* Set the trap number to 256 (impossible value). If we fault while 144 /*
145 * Set the trap number to 256 (impossible value). If we fault while
133 * switching to the Guest (bad segment registers or bug), this will 146 * switching to the Guest (bad segment registers or bug), this will
134 * cause us to abort the Guest. */ 147 * cause us to abort the Guest.
148 */
135 cpu->regs->trapnum = 256; 149 cpu->regs->trapnum = 256;
136 150
137 /* Now: we push the "eflags" register on the stack, then do an "lcall". 151 /*
152 * Now: we push the "eflags" register on the stack, then do an "lcall".
138 * This is how we change from using the kernel code segment to using 153 * This is how we change from using the kernel code segment to using
139 * the dedicated lguest code segment, as well as jumping into the 154 * the dedicated lguest code segment, as well as jumping into the
140 * Switcher. 155 * Switcher.
141 * 156 *
142 * The lcall also pushes the old code segment (KERNEL_CS) onto the 157 * The lcall also pushes the old code segment (KERNEL_CS) onto the
143 * stack, then the address of this call. This stack layout happens to 158 * stack, then the address of this call. This stack layout happens to
144 * exactly match the stack layout created by an interrupt... */ 159 * exactly match the stack layout created by an interrupt...
160 */
145 asm volatile("pushf; lcall *lguest_entry" 161 asm volatile("pushf; lcall *lguest_entry"
146 /* This is how we tell GCC that %eax ("a") and %ebx ("b") 162 /*
147 * are changed by this routine. The "=" means output. */ 163 * This is how we tell GCC that %eax ("a") and %ebx ("b")
164 * are changed by this routine. The "=" means output.
165 */
148 : "=a"(clobber), "=b"(clobber) 166 : "=a"(clobber), "=b"(clobber)
149 /* %eax contains the pages pointer. ("0" refers to the 167 /*
168 * %eax contains the pages pointer. ("0" refers to the
150 * 0-th argument above, ie "a"). %ebx contains the 169 * 0-th argument above, ie "a"). %ebx contains the
151 * physical address of the Guest's top-level page 170 * physical address of the Guest's top-level page
152 * directory. */ 171 * directory.
172 */
153 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) 173 : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir))
154 /* We tell gcc that all these registers could change, 174 /*
175 * We tell gcc that all these registers could change,
155 * which means we don't have to save and restore them in 176 * which means we don't have to save and restore them in
156 * the Switcher. */ 177 * the Switcher.
178 */
157 : "memory", "%edx", "%ecx", "%edi", "%esi"); 179 : "memory", "%edx", "%ecx", "%edi", "%esi");
158} 180}
159/*:*/ 181/*:*/
160 182
161/*M:002 There are hooks in the scheduler which we can register to tell when we 183/*M:002
184 * There are hooks in the scheduler which we can register to tell when we
162 * get kicked off the CPU (preempt_notifier_register()). This would allow us 185 * get kicked off the CPU (preempt_notifier_register()). This would allow us
163 * to lazily disable SYSENTER which would regain some performance, and should 186 * to lazily disable SYSENTER which would regain some performance, and should
164 * also simplify copy_in_guest_info(). Note that we'd still need to restore 187 * also simplify copy_in_guest_info(). Note that we'd still need to restore
165 * things when we exit to Launcher userspace, but that's fairly easy. 188 * things when we exit to Launcher userspace, but that's fairly easy.
166 * 189 *
167 * We could also try using this hooks for PGE, but that might be too expensive. 190 * We could also try using these hooks for PGE, but that might be too expensive.
168 * 191 *
169 * The hooks were designed for KVM, but we can also put them to good use. :*/ 192 * The hooks were designed for KVM, but we can also put them to good use.
193:*/
170 194
171/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts 195/*H:040
172 * are disabled: we own the CPU. */ 196 * This is the i386-specific code to setup and run the Guest. Interrupts
197 * are disabled: we own the CPU.
198 */
173void lguest_arch_run_guest(struct lg_cpu *cpu) 199void lguest_arch_run_guest(struct lg_cpu *cpu)
174{ 200{
175 /* Remember the awfully-named TS bit? If the Guest has asked to set it 201 /*
202 * Remember the awfully-named TS bit? If the Guest has asked to set it
176 * we set it now, so we can trap and pass that trap to the Guest if it 203 * we set it now, so we can trap and pass that trap to the Guest if it
177 * uses the FPU. */ 204 * uses the FPU.
205 */
178 if (cpu->ts) 206 if (cpu->ts)
179 unlazy_fpu(current); 207 unlazy_fpu(current);
180 208
181 /* SYSENTER is an optimized way of doing system calls. We can't allow 209 /*
210 * SYSENTER is an optimized way of doing system calls. We can't allow
182 * it because it always jumps to privilege level 0. A normal Guest 211 * it because it always jumps to privilege level 0. A normal Guest
183 * won't try it because we don't advertise it in CPUID, but a malicious 212 * won't try it because we don't advertise it in CPUID, but a malicious
184 * Guest (or malicious Guest userspace program) could, so we tell the 213 * Guest (or malicious Guest userspace program) could, so we tell the
185 * CPU to disable it before running the Guest. */ 214 * CPU to disable it before running the Guest.
215 */
186 if (boot_cpu_has(X86_FEATURE_SEP)) 216 if (boot_cpu_has(X86_FEATURE_SEP))
187 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); 217 wrmsr(MSR_IA32_SYSENTER_CS, 0, 0);
188 218
189 /* Now we actually run the Guest. It will return when something 219 /*
220 * Now we actually run the Guest. It will return when something
190 * interesting happens, and we can examine its registers to see what it 221 * interesting happens, and we can examine its registers to see what it
191 * was doing. */ 222 * was doing.
223 */
192 run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); 224 run_guest_once(cpu, lguest_pages(raw_smp_processor_id()));
193 225
194 /* Note that the "regs" structure contains two extra entries which are 226 /*
227 * Note that the "regs" structure contains two extra entries which are
195 * not really registers: a trap number which says what interrupt or 228 * not really registers: a trap number which says what interrupt or
196 * trap made the switcher code come back, and an error code which some 229 * trap made the switcher code come back, and an error code which some
197 * traps set. */ 230 * traps set.
231 */
198 232
199 /* Restore SYSENTER if it's supposed to be on. */ 233 /* Restore SYSENTER if it's supposed to be on. */
200 if (boot_cpu_has(X86_FEATURE_SEP)) 234 if (boot_cpu_has(X86_FEATURE_SEP))
201 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); 235 wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
202 236
203 /* If the Guest page faulted, then the cr2 register will tell us the 237 /*
238 * If the Guest page faulted, then the cr2 register will tell us the
204 * bad virtual address. We have to grab this now, because once we 239 * bad virtual address. We have to grab this now, because once we
205 * re-enable interrupts an interrupt could fault and thus overwrite 240 * re-enable interrupts an interrupt could fault and thus overwrite
206 * cr2, or we could even move off to a different CPU. */ 241 * cr2, or we could even move off to a different CPU.
242 */
207 if (cpu->regs->trapnum == 14) 243 if (cpu->regs->trapnum == 14)
208 cpu->arch.last_pagefault = read_cr2(); 244 cpu->arch.last_pagefault = read_cr2();
209 /* Similarly, if we took a trap because the Guest used the FPU, 245 /*
246 * Similarly, if we took a trap because the Guest used the FPU,
210 * we have to restore the FPU it expects to see. 247 * we have to restore the FPU it expects to see.
211 * math_state_restore() may sleep and we may even move off to 248 * math_state_restore() may sleep and we may even move off to
212 * a different CPU. So all the critical stuff should be done 249 * a different CPU. So all the critical stuff should be done
213 * before this. */ 250 * before this.
251 */
214 else if (cpu->regs->trapnum == 7) 252 else if (cpu->regs->trapnum == 7)
215 math_state_restore(); 253 math_state_restore();
216} 254}
217 255
218/*H:130 Now we've examined the hypercall code; our Guest can make requests. 256/*H:130
257 * Now we've examined the hypercall code; our Guest can make requests.
219 * Our Guest is usually so well behaved; it never tries to do things it isn't 258 * Our Guest is usually so well behaved; it never tries to do things it isn't
220 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual 259 * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual
221 * infrastructure isn't quite complete, because it doesn't contain replacements 260 * infrastructure isn't quite complete, because it doesn't contain replacements
@@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
225 * 264 *
226 * When the Guest uses one of these instructions, we get a trap (General 265 * When the Guest uses one of these instructions, we get a trap (General
227 * Protection Fault) and come here. We see if it's one of those troublesome 266 * Protection Fault) and come here. We see if it's one of those troublesome
228 * instructions and skip over it. We return true if we did. */ 267 * instructions and skip over it. We return true if we did.
268 */
229static int emulate_insn(struct lg_cpu *cpu) 269static int emulate_insn(struct lg_cpu *cpu)
230{ 270{
231 u8 insn; 271 u8 insn;
232 unsigned int insnlen = 0, in = 0, shift = 0; 272 unsigned int insnlen = 0, in = 0, shift = 0;
233 /* The eip contains the *virtual* address of the Guest's instruction: 273 /*
234 * guest_pa just subtracts the Guest's page_offset. */ 274 * The eip contains the *virtual* address of the Guest's instruction:
275 * guest_pa just subtracts the Guest's page_offset.
276 */
235 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); 277 unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
236 278
237 /* This must be the Guest kernel trying to do something, not userspace! 279 /*
280 * This must be the Guest kernel trying to do something, not userspace!
238 * The bottom two bits of the CS segment register are the privilege 281 * The bottom two bits of the CS segment register are the privilege
239 * level. */ 282 * level.
283 */
240 if ((cpu->regs->cs & 3) != GUEST_PL) 284 if ((cpu->regs->cs & 3) != GUEST_PL)
241 return 0; 285 return 0;
242 286
243 /* Decoding x86 instructions is icky. */ 287 /* Decoding x86 instructions is icky. */
244 insn = lgread(cpu, physaddr, u8); 288 insn = lgread(cpu, physaddr, u8);
245 289
246 /* 0x66 is an "operand prefix". It means it's using the upper 16 bits 290 /*
247 of the eax register. */ 291 * 0x66 is an "operand prefix". It means it's using the upper 16 bits
292 * of the eax register.
293 */
248 if (insn == 0x66) { 294 if (insn == 0x66) {
249 shift = 16; 295 shift = 16;
250 /* The instruction is 1 byte so far, read the next byte. */ 296 /* The instruction is 1 byte so far, read the next byte. */
@@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu)
252 insn = lgread(cpu, physaddr + insnlen, u8); 298 insn = lgread(cpu, physaddr + insnlen, u8);
253 } 299 }
254 300
255 /* We can ignore the lower bit for the moment and decode the 4 opcodes 301 /*
256 * we need to emulate. */ 302 * We can ignore the lower bit for the moment and decode the 4 opcodes
303 * we need to emulate.
304 */
257 switch (insn & 0xFE) { 305 switch (insn & 0xFE) {
258 case 0xE4: /* in <next byte>,%al */ 306 case 0xE4: /* in <next byte>,%al */
259 insnlen += 2; 307 insnlen += 2;
@@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu)
274 return 0; 322 return 0;
275 } 323 }
276 324
277 /* If it was an "IN" instruction, they expect the result to be read 325 /*
326 * If it was an "IN" instruction, they expect the result to be read
278 * into %eax, so we change %eax. We always return all-ones, which 327 * into %eax, so we change %eax. We always return all-ones, which
279 * traditionally means "there's nothing there". */ 328 * traditionally means "there's nothing there".
329 */
280 if (in) { 330 if (in) {
281 /* Lower bit tells is whether it's a 16 or 32 bit access */ 331 /* Lower bit tells is whether it's a 16 or 32 bit access */
282 if (insn & 0x1) 332 if (insn & 0x1)
@@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu)
290 return 1; 340 return 1;
291} 341}
292 342
293/* Our hypercalls mechanism used to be based on direct software interrupts. 343/*
344 * Our hypercalls mechanism used to be based on direct software interrupts.
294 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to 345 * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to
295 * change over to using kvm hypercalls. 346 * change over to using kvm hypercalls.
296 * 347 *
@@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu)
318 */ 369 */
319static void rewrite_hypercall(struct lg_cpu *cpu) 370static void rewrite_hypercall(struct lg_cpu *cpu)
320{ 371{
321 /* This are the opcodes we use to patch the Guest. The opcode for "int 372 /*
373 * This are the opcodes we use to patch the Guest. The opcode for "int
322 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we 374 * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we
323 * complete the sequence with a NOP (0x90). */ 375 * complete the sequence with a NOP (0x90).
376 */
324 u8 insn[3] = {0xcd, 0x1f, 0x90}; 377 u8 insn[3] = {0xcd, 0x1f, 0x90};
325 378
326 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); 379 __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn));
327 /* The above write might have caused a copy of that page to be made 380 /*
381 * The above write might have caused a copy of that page to be made
328 * (if it was read-only). We need to make sure the Guest has 382 * (if it was read-only). We need to make sure the Guest has
329 * up-to-date pagetables. As this doesn't happen often, we can just 383 * up-to-date pagetables. As this doesn't happen often, we can just
330 * drop them all. */ 384 * drop them all.
385 */
331 guest_pagetable_clear_all(cpu); 386 guest_pagetable_clear_all(cpu);
332} 387}
333 388
@@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu)
335{ 390{
336 u8 insn[3]; 391 u8 insn[3];
337 392
338 /* This must be the Guest kernel trying to do something. 393 /*
394 * This must be the Guest kernel trying to do something.
339 * The bottom two bits of the CS segment register are the privilege 395 * The bottom two bits of the CS segment register are the privilege
340 * level. */ 396 * level.
397 */
341 if ((cpu->regs->cs & 3) != GUEST_PL) 398 if ((cpu->regs->cs & 3) != GUEST_PL)
342 return false; 399 return false;
343 400
@@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
351{ 408{
352 switch (cpu->regs->trapnum) { 409 switch (cpu->regs->trapnum) {
353 case 13: /* We've intercepted a General Protection Fault. */ 410 case 13: /* We've intercepted a General Protection Fault. */
354 /* Check if this was one of those annoying IN or OUT 411 /*
412 * Check if this was one of those annoying IN or OUT
355 * instructions which we need to emulate. If so, we just go 413 * instructions which we need to emulate. If so, we just go
356 * back into the Guest after we've done it. */ 414 * back into the Guest after we've done it.
415 */
357 if (cpu->regs->errcode == 0) { 416 if (cpu->regs->errcode == 0) {
358 if (emulate_insn(cpu)) 417 if (emulate_insn(cpu))
359 return; 418 return;
360 } 419 }
361 /* If KVM is active, the vmcall instruction triggers a 420 /*
362 * General Protection Fault. Normally it triggers an 421 * If KVM is active, the vmcall instruction triggers a General
363 * invalid opcode fault (6): */ 422 * Protection Fault. Normally it triggers an invalid opcode
423 * fault (6):
424 */
364 case 6: 425 case 6:
365 /* We need to check if ring == GUEST_PL and 426 /*
366 * faulting instruction == vmcall. */ 427 * We need to check if ring == GUEST_PL and faulting
428 * instruction == vmcall.
429 */
367 if (is_hypercall(cpu)) { 430 if (is_hypercall(cpu)) {
368 rewrite_hypercall(cpu); 431 rewrite_hypercall(cpu);
369 return; 432 return;
370 } 433 }
371 break; 434 break;
372 case 14: /* We've intercepted a Page Fault. */ 435 case 14: /* We've intercepted a Page Fault. */
373 /* The Guest accessed a virtual address that wasn't mapped. 436 /*
437 * The Guest accessed a virtual address that wasn't mapped.
374 * This happens a lot: we don't actually set up most of the page 438 * This happens a lot: we don't actually set up most of the page
375 * tables for the Guest at all when we start: as it runs it asks 439 * tables for the Guest at all when we start: as it runs it asks
376 * for more and more, and we set them up as required. In this 440 * for more and more, and we set them up as required. In this
377 * case, we don't even tell the Guest that the fault happened. 441 * case, we don't even tell the Guest that the fault happened.
378 * 442 *
379 * The errcode tells whether this was a read or a write, and 443 * The errcode tells whether this was a read or a write, and
380 * whether kernel or userspace code. */ 444 * whether kernel or userspace code.
445 */
381 if (demand_page(cpu, cpu->arch.last_pagefault, 446 if (demand_page(cpu, cpu->arch.last_pagefault,
382 cpu->regs->errcode)) 447 cpu->regs->errcode))
383 return; 448 return;
384 449
385 /* OK, it's really not there (or not OK): the Guest needs to 450 /*
451 * OK, it's really not there (or not OK): the Guest needs to
386 * know. We write out the cr2 value so it knows where the 452 * know. We write out the cr2 value so it knows where the
387 * fault occurred. 453 * fault occurred.
388 * 454 *
389 * Note that if the Guest were really messed up, this could 455 * Note that if the Guest were really messed up, this could
390 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so 456 * happen before it's done the LHCALL_LGUEST_INIT hypercall, so
391 * lg->lguest_data could be NULL */ 457 * lg->lguest_data could be NULL
458 */
392 if (cpu->lg->lguest_data && 459 if (cpu->lg->lguest_data &&
393 put_user(cpu->arch.last_pagefault, 460 put_user(cpu->arch.last_pagefault,
394 &cpu->lg->lguest_data->cr2)) 461 &cpu->lg->lguest_data->cr2))
395 kill_guest(cpu, "Writing cr2"); 462 kill_guest(cpu, "Writing cr2");
396 break; 463 break;
397 case 7: /* We've intercepted a Device Not Available fault. */ 464 case 7: /* We've intercepted a Device Not Available fault. */
398 /* If the Guest doesn't want to know, we already restored the 465 /*
399 * Floating Point Unit, so we just continue without telling 466 * If the Guest doesn't want to know, we already restored the
400 * it. */ 467 * Floating Point Unit, so we just continue without telling it.
468 */
401 if (!cpu->ts) 469 if (!cpu->ts)
402 return; 470 return;
403 break; 471 break;
404 case 32 ... 255: 472 case 32 ... 255:
405 /* These values mean a real interrupt occurred, in which case 473 /*
474 * These values mean a real interrupt occurred, in which case
406 * the Host handler has already been run. We just do a 475 * the Host handler has already been run. We just do a
407 * friendly check if another process should now be run, then 476 * friendly check if another process should now be run, then
408 * return to run the Guest again */ 477 * return to run the Guest again
478 */
409 cond_resched(); 479 cond_resched();
410 return; 480 return;
411 case LGUEST_TRAP_ENTRY: 481 case LGUEST_TRAP_ENTRY:
412 /* Our 'struct hcall_args' maps directly over our regs: we set 482 /*
413 * up the pointer now to indicate a hypercall is pending. */ 483 * Our 'struct hcall_args' maps directly over our regs: we set
484 * up the pointer now to indicate a hypercall is pending.
485 */
414 cpu->hcall = (struct hcall_args *)cpu->regs; 486 cpu->hcall = (struct hcall_args *)cpu->regs;
415 return; 487 return;
416 } 488 }
417 489
418 /* We didn't handle the trap, so it needs to go to the Guest. */ 490 /* We didn't handle the trap, so it needs to go to the Guest. */
419 if (!deliver_trap(cpu, cpu->regs->trapnum)) 491 if (!deliver_trap(cpu, cpu->regs->trapnum))
420 /* If the Guest doesn't have a handler (either it hasn't 492 /*
493 * If the Guest doesn't have a handler (either it hasn't
421 * registered any yet, or it's one of the faults we don't let 494 * registered any yet, or it's one of the faults we don't let
422 * it handle), it dies with this cryptic error message. */ 495 * it handle), it dies with this cryptic error message.
496 */
423 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", 497 kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)",
424 cpu->regs->trapnum, cpu->regs->eip, 498 cpu->regs->trapnum, cpu->regs->eip,
425 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault 499 cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault
426 : cpu->regs->errcode); 500 : cpu->regs->errcode);
427} 501}
428 502
429/* Now we can look at each of the routines this calls, in increasing order of 503/*
504 * Now we can look at each of the routines this calls, in increasing order of
430 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), 505 * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(),
431 * deliver_trap() and demand_page(). After all those, we'll be ready to 506 * deliver_trap() and demand_page(). After all those, we'll be ready to
432 * examine the Switcher, and our philosophical understanding of the Host/Guest 507 * examine the Switcher, and our philosophical understanding of the Host/Guest
433 * duality will be complete. :*/ 508 * duality will be complete.
509:*/
434static void adjust_pge(void *on) 510static void adjust_pge(void *on)
435{ 511{
436 if (on) 512 if (on)
@@ -439,13 +515,16 @@ static void adjust_pge(void *on)
439 write_cr4(read_cr4() & ~X86_CR4_PGE); 515 write_cr4(read_cr4() & ~X86_CR4_PGE);
440} 516}
441 517
442/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do 518/*H:020
443 * some more i386-specific initialization. */ 519 * Now the Switcher is mapped and every thing else is ready, we need to do
520 * some more i386-specific initialization.
521 */
444void __init lguest_arch_host_init(void) 522void __init lguest_arch_host_init(void)
445{ 523{
446 int i; 524 int i;
447 525
448 /* Most of the i386/switcher.S doesn't care that it's been moved; on 526 /*
527 * Most of the i386/switcher.S doesn't care that it's been moved; on
449 * Intel, jumps are relative, and it doesn't access any references to 528 * Intel, jumps are relative, and it doesn't access any references to
450 * external code or data. 529 * external code or data.
451 * 530 *
@@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void)
453 * addresses are placed in a table (default_idt_entries), so we need to 532 * addresses are placed in a table (default_idt_entries), so we need to
454 * update the table with the new addresses. switcher_offset() is a 533 * update the table with the new addresses. switcher_offset() is a
455 * convenience function which returns the distance between the 534 * convenience function which returns the distance between the
456 * compiled-in switcher code and the high-mapped copy we just made. */ 535 * compiled-in switcher code and the high-mapped copy we just made.
536 */
457 for (i = 0; i < IDT_ENTRIES; i++) 537 for (i = 0; i < IDT_ENTRIES; i++)
458 default_idt_entries[i] += switcher_offset(); 538 default_idt_entries[i] += switcher_offset();
459 539
@@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void)
468 for_each_possible_cpu(i) { 548 for_each_possible_cpu(i) {
469 /* lguest_pages() returns this CPU's two pages. */ 549 /* lguest_pages() returns this CPU's two pages. */
470 struct lguest_pages *pages = lguest_pages(i); 550 struct lguest_pages *pages = lguest_pages(i);
471 /* This is a convenience pointer to make the code fit one 551 /* This is a convenience pointer to make the code neater. */
472 * statement to a line. */
473 struct lguest_ro_state *state = &pages->state; 552 struct lguest_ro_state *state = &pages->state;
474 553
475 /* The Global Descriptor Table: the Host has a different one 554 /*
555 * The Global Descriptor Table: the Host has a different one
476 * for each CPU. We keep a descriptor for the GDT which says 556 * for each CPU. We keep a descriptor for the GDT which says
477 * where it is and how big it is (the size is actually the last 557 * where it is and how big it is (the size is actually the last
478 * byte, not the size, hence the "-1"). */ 558 * byte, not the size, hence the "-1").
559 */
479 state->host_gdt_desc.size = GDT_SIZE-1; 560 state->host_gdt_desc.size = GDT_SIZE-1;
480 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); 561 state->host_gdt_desc.address = (long)get_cpu_gdt_table(i);
481 562
482 /* All CPUs on the Host use the same Interrupt Descriptor 563 /*
564 * All CPUs on the Host use the same Interrupt Descriptor
483 * Table, so we just use store_idt(), which gets this CPU's IDT 565 * Table, so we just use store_idt(), which gets this CPU's IDT
484 * descriptor. */ 566 * descriptor.
567 */
485 store_idt(&state->host_idt_desc); 568 store_idt(&state->host_idt_desc);
486 569
487 /* The descriptors for the Guest's GDT and IDT can be filled 570 /*
571 * The descriptors for the Guest's GDT and IDT can be filled
488 * out now, too. We copy the GDT & IDT into ->guest_gdt and 572 * out now, too. We copy the GDT & IDT into ->guest_gdt and
489 * ->guest_idt before actually running the Guest. */ 573 * ->guest_idt before actually running the Guest.
574 */
490 state->guest_idt_desc.size = sizeof(state->guest_idt)-1; 575 state->guest_idt_desc.size = sizeof(state->guest_idt)-1;
491 state->guest_idt_desc.address = (long)&state->guest_idt; 576 state->guest_idt_desc.address = (long)&state->guest_idt;
492 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; 577 state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1;
493 state->guest_gdt_desc.address = (long)&state->guest_gdt; 578 state->guest_gdt_desc.address = (long)&state->guest_gdt;
494 579
495 /* We know where we want the stack to be when the Guest enters 580 /*
581 * We know where we want the stack to be when the Guest enters
496 * the Switcher: in pages->regs. The stack grows upwards, so 582 * the Switcher: in pages->regs. The stack grows upwards, so
497 * we start it at the end of that structure. */ 583 * we start it at the end of that structure.
584 */
498 state->guest_tss.sp0 = (long)(&pages->regs + 1); 585 state->guest_tss.sp0 = (long)(&pages->regs + 1);
499 /* And this is the GDT entry to use for the stack: we keep a 586 /*
500 * couple of special LGUEST entries. */ 587 * And this is the GDT entry to use for the stack: we keep a
588 * couple of special LGUEST entries.
589 */
501 state->guest_tss.ss0 = LGUEST_DS; 590 state->guest_tss.ss0 = LGUEST_DS;
502 591
503 /* x86 can have a finegrained bitmap which indicates what I/O 592 /*
593 * x86 can have a finegrained bitmap which indicates what I/O
504 * ports the process can use. We set it to the end of our 594 * ports the process can use. We set it to the end of our
505 * structure, meaning "none". */ 595 * structure, meaning "none".
596 */
506 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); 597 state->guest_tss.io_bitmap_base = sizeof(state->guest_tss);
507 598
508 /* Some GDT entries are the same across all Guests, so we can 599 /*
509 * set them up now. */ 600 * Some GDT entries are the same across all Guests, so we can
601 * set them up now.
602 */
510 setup_default_gdt_entries(state); 603 setup_default_gdt_entries(state);
511 /* Most IDT entries are the same for all Guests, too.*/ 604 /* Most IDT entries are the same for all Guests, too.*/
512 setup_default_idt_entries(state, default_idt_entries); 605 setup_default_idt_entries(state, default_idt_entries);
513 606
514 /* The Host needs to be able to use the LGUEST segments on this 607 /*
515 * CPU, too, so put them in the Host GDT. */ 608 * The Host needs to be able to use the LGUEST segments on this
609 * CPU, too, so put them in the Host GDT.
610 */
516 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; 611 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT;
517 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; 612 get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT;
518 } 613 }
519 614
520 /* In the Switcher, we want the %cs segment register to use the 615 /*
616 * In the Switcher, we want the %cs segment register to use the
521 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so 617 * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so
522 * it will be undisturbed when we switch. To change %cs and jump we 618 * it will be undisturbed when we switch. To change %cs and jump we
523 * need this structure to feed to Intel's "lcall" instruction. */ 619 * need this structure to feed to Intel's "lcall" instruction.
620 */
524 lguest_entry.offset = (long)switch_to_guest + switcher_offset(); 621 lguest_entry.offset = (long)switch_to_guest + switcher_offset();
525 lguest_entry.segment = LGUEST_CS; 622 lguest_entry.segment = LGUEST_CS;
526 623
527 /* Finally, we need to turn off "Page Global Enable". PGE is an 624 /*
625 * Finally, we need to turn off "Page Global Enable". PGE is an
528 * optimization where page table entries are specially marked to show 626 * optimization where page table entries are specially marked to show
529 * they never change. The Host kernel marks all the kernel pages this 627 * they never change. The Host kernel marks all the kernel pages this
530 * way because it's always present, even when userspace is running. 628 * way because it's always present, even when userspace is running.
@@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void)
534 * you'll get really weird bugs that you'll chase for two days. 632 * you'll get really weird bugs that you'll chase for two days.
535 * 633 *
536 * I used to turn PGE off every time we switched to the Guest and back 634 * I used to turn PGE off every time we switched to the Guest and back
537 * on when we return, but that slowed the Switcher down noticibly. */ 635 * on when we return, but that slowed the Switcher down noticibly.
636 */
538 637
539 /* We don't need the complexity of CPUs coming and going while we're 638 /*
540 * doing this. */ 639 * We don't need the complexity of CPUs coming and going while we're
640 * doing this.
641 */
541 get_online_cpus(); 642 get_online_cpus();
542 if (cpu_has_pge) { /* We have a broader idea of "global". */ 643 if (cpu_has_pge) { /* We have a broader idea of "global". */
543 /* Remember that this was originally set (for cleanup). */ 644 /* Remember that this was originally set (for cleanup). */
544 cpu_had_pge = 1; 645 cpu_had_pge = 1;
545 /* adjust_pge is a helper function which sets or unsets the PGE 646 /*
546 * bit on its CPU, depending on the argument (0 == unset). */ 647 * adjust_pge is a helper function which sets or unsets the PGE
648 * bit on its CPU, depending on the argument (0 == unset).
649 */
547 on_each_cpu(adjust_pge, (void *)0, 1); 650 on_each_cpu(adjust_pge, (void *)0, 1);
548 /* Turn off the feature in the global feature set. */ 651 /* Turn off the feature in the global feature set. */
549 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); 652 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
@@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
590{ 693{
591 u32 tsc_speed; 694 u32 tsc_speed;
592 695
593 /* The pointer to the Guest's "struct lguest_data" is the only argument. 696 /*
594 * We check that address now. */ 697 * The pointer to the Guest's "struct lguest_data" is the only argument.
698 * We check that address now.
699 */
595 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, 700 if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1,
596 sizeof(*cpu->lg->lguest_data))) 701 sizeof(*cpu->lg->lguest_data)))
597 return -EFAULT; 702 return -EFAULT;
598 703
599 /* Having checked it, we simply set lg->lguest_data to point straight 704 /*
705 * Having checked it, we simply set lg->lguest_data to point straight
600 * into the Launcher's memory at the right place and then use 706 * into the Launcher's memory at the right place and then use
601 * copy_to_user/from_user from now on, instead of lgread/write. I put 707 * copy_to_user/from_user from now on, instead of lgread/write. I put
602 * this in to show that I'm not immune to writing stupid 708 * this in to show that I'm not immune to writing stupid
603 * optimizations. */ 709 * optimizations.
710 */
604 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; 711 cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1;
605 712
606 /* We insist that the Time Stamp Counter exist and doesn't change with 713 /*
714 * We insist that the Time Stamp Counter exist and doesn't change with
607 * cpu frequency. Some devious chip manufacturers decided that TSC 715 * cpu frequency. Some devious chip manufacturers decided that TSC
608 * changes could be handled in software. I decided that time going 716 * changes could be handled in software. I decided that time going
609 * backwards might be good for benchmarks, but it's bad for users. 717 * backwards might be good for benchmarks, but it's bad for users.
610 * 718 *
611 * We also insist that the TSC be stable: the kernel detects unreliable 719 * We also insist that the TSC be stable: the kernel detects unreliable
612 * TSCs for its own purposes, and we use that here. */ 720 * TSCs for its own purposes, and we use that here.
721 */
613 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) 722 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable())
614 tsc_speed = tsc_khz; 723 tsc_speed = tsc_khz;
615 else 724 else
@@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu)
625} 734}
626/*:*/ 735/*:*/
627 736
628/*L:030 lguest_arch_setup_regs() 737/*L:030
738 * lguest_arch_setup_regs()
629 * 739 *
630 * Most of the Guest's registers are left alone: we used get_zeroed_page() to 740 * Most of the Guest's registers are left alone: we used get_zeroed_page() to
631 * allocate the structure, so they will be 0. */ 741 * allocate the structure, so they will be 0.
742 */
632void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) 743void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start)
633{ 744{
634 struct lguest_regs *regs = cpu->regs; 745 struct lguest_regs *regs = cpu->regs;
635 746
636 /* There are four "segment" registers which the Guest needs to boot: 747 /*
748 * There are four "segment" registers which the Guest needs to boot:
637 * The "code segment" register (cs) refers to the kernel code segment 749 * The "code segment" register (cs) refers to the kernel code segment
638 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers 750 * __KERNEL_CS, and the "data", "extra" and "stack" segment registers
639 * refer to the kernel data segment __KERNEL_DS. 751 * refer to the kernel data segment __KERNEL_DS.
640 * 752 *
641 * The privilege level is packed into the lower bits. The Guest runs 753 * The privilege level is packed into the lower bits. The Guest runs
642 * at privilege level 1 (GUEST_PL).*/ 754 * at privilege level 1 (GUEST_PL).
755 */
643 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; 756 regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL;
644 regs->cs = __KERNEL_CS|GUEST_PL; 757 regs->cs = __KERNEL_CS|GUEST_PL;
645 758
646 /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) 759 /*
760 * The "eflags" register contains miscellaneous flags. Bit 1 (0x002)
647 * is supposed to always be "1". Bit 9 (0x200) controls whether 761 * is supposed to always be "1". Bit 9 (0x200) controls whether
648 * interrupts are enabled. We always leave interrupts enabled while 762 * interrupts are enabled. We always leave interrupts enabled while
649 * running the Guest. */ 763 * running the Guest.
764 */
650 regs->eflags = X86_EFLAGS_IF | 0x2; 765 regs->eflags = X86_EFLAGS_IF | 0x2;
651 766
652 /* The "Extended Instruction Pointer" register says where the Guest is 767 /*
653 * running. */ 768 * The "Extended Instruction Pointer" register says where the Guest is
769 * running.
770 */
654 regs->eip = start; 771 regs->eip = start;
655 772
656 /* %esi points to our boot information, at physical address 0, so don't 773 /*
657 * touch it. */ 774 * %esi points to our boot information, at physical address 0, so don't
775 * touch it.
776 */
658 777
659 /* There are a couple of GDT entries the Guest expects when first 778 /* There are a couple of GDT entries the Guest expects at boot. */
660 * booting. */
661 setup_guest_gdt(cpu); 779 setup_guest_gdt(cpu);
662} 780}
diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S
index 3fc15318a80f..40634b0db9f7 100644
--- a/drivers/lguest/x86/switcher_32.S
+++ b/drivers/lguest/x86/switcher_32.S
@@ -1,12 +1,15 @@
1/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the 1/*P:900
2 * Host and Guest to do the low-level Guest<->Host switch. It is as simple as 2 * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride
3 * it can be made, but it's naturally very specific to x86. 3 * both the Host and Guest to do the low-level Guest<->Host switch. It is as
4 * simple as it can be made, but it's naturally very specific to x86.
4 * 5 *
5 * You have now completed Preparation. If this has whet your appetite; if you 6 * You have now completed Preparation. If this has whet your appetite; if you
6 * are feeling invigorated and refreshed then the next, more challenging stage 7 * are feeling invigorated and refreshed then the next, more challenging stage
7 * can be found in "make Guest". :*/ 8 * can be found in "make Guest".
9 :*/
8 10
9/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must 11/*M:012
12 * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must
10 * gain at least 1% more performance. Since neither LOC nor performance can be 13 * gain at least 1% more performance. Since neither LOC nor performance can be
11 * measured beforehand, it generally means implementing a feature then deciding 14 * measured beforehand, it generally means implementing a feature then deciding
12 * if it's worth it. And once it's implemented, who can say no? 15 * if it's worth it. And once it's implemented, who can say no?
@@ -31,11 +34,14 @@
31 * Host (which is actually really easy). 34 * Host (which is actually really easy).
32 * 35 *
33 * Two questions remain. Would the performance gain outweigh the complexity? 36 * Two questions remain. Would the performance gain outweigh the complexity?
34 * And who would write the verse documenting it? :*/ 37 * And who would write the verse documenting it?
38:*/
35 39
36/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their 40/*M:011
41 * Lguest64 handles NMI. This gave me NMI envy (until I looked at their
37 * code). It's worth doing though, since it would let us use oprofile in the 42 * code). It's worth doing though, since it would let us use oprofile in the
38 * Host when a Guest is running. :*/ 43 * Host when a Guest is running.
44:*/
39 45
40/*S:100 46/*S:100
41 * Welcome to the Switcher itself! 47 * Welcome to the Switcher itself!
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 5810fa906af0..5fe39c2a3d2b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev)
220 mddev->queue->unplug_fn = linear_unplug; 220 mddev->queue->unplug_fn = linear_unplug;
221 mddev->queue->backing_dev_info.congested_fn = linear_congested; 221 mddev->queue->backing_dev_info.congested_fn = linear_congested;
222 mddev->queue->backing_dev_info.congested_data = mddev; 222 mddev->queue->backing_dev_info.congested_data = mddev;
223 md_integrity_register(mddev);
223 return 0; 224 return 0;
224} 225}
225 226
@@ -256,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
256 rcu_assign_pointer(mddev->private, newconf); 257 rcu_assign_pointer(mddev->private, newconf);
257 md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); 258 md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
258 set_capacity(mddev->gendisk, mddev->array_sectors); 259 set_capacity(mddev->gendisk, mddev->array_sectors);
260 revalidate_disk(mddev->gendisk);
259 call_rcu(&oldconf->rcu, free_conf); 261 call_rcu(&oldconf->rcu, free_conf);
260 return 0; 262 return 0;
261} 263}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index d4351ff0849f..5b98bea4ff9b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1308 } 1308 }
1309 if (mddev->level != LEVEL_MULTIPATH) { 1309 if (mddev->level != LEVEL_MULTIPATH) {
1310 int role; 1310 int role;
1311 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1311 if (rdev->desc_nr < 0 ||
1312 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1313 role = 0xffff;
1314 rdev->desc_nr = -1;
1315 } else
1316 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1312 switch(role) { 1317 switch(role) {
1313 case 0xffff: /* spare */ 1318 case 0xffff: /* spare */
1314 break; 1319 break;
@@ -1394,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1394 if (rdev2->desc_nr+1 > max_dev) 1399 if (rdev2->desc_nr+1 > max_dev)
1395 max_dev = rdev2->desc_nr+1; 1400 max_dev = rdev2->desc_nr+1;
1396 1401
1397 if (max_dev > le32_to_cpu(sb->max_dev)) 1402 if (max_dev > le32_to_cpu(sb->max_dev)) {
1403 int bmask;
1398 sb->max_dev = cpu_to_le32(max_dev); 1404 sb->max_dev = cpu_to_le32(max_dev);
1405 rdev->sb_size = max_dev * 2 + 256;
1406 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1407 if (rdev->sb_size & bmask)
1408 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1409 }
1399 for (i=0; i<max_dev;i++) 1410 for (i=0; i<max_dev;i++)
1400 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1411 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1401 1412
@@ -1487,37 +1498,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1487 1498
1488static LIST_HEAD(pending_raid_disks); 1499static LIST_HEAD(pending_raid_disks);
1489 1500
1490static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) 1501/*
1502 * Try to register data integrity profile for an mddev
1503 *
1504 * This is called when an array is started and after a disk has been kicked
1505 * from the array. It only succeeds if all working and active component devices
1506 * are integrity capable with matching profiles.
1507 */
1508int md_integrity_register(mddev_t *mddev)
1509{
1510 mdk_rdev_t *rdev, *reference = NULL;
1511
1512 if (list_empty(&mddev->disks))
1513 return 0; /* nothing to do */
1514 if (blk_get_integrity(mddev->gendisk))
1515 return 0; /* already registered */
1516 list_for_each_entry(rdev, &mddev->disks, same_set) {
1517 /* skip spares and non-functional disks */
1518 if (test_bit(Faulty, &rdev->flags))
1519 continue;
1520 if (rdev->raid_disk < 0)
1521 continue;
1522 /*
1523 * If at least one rdev is not integrity capable, we can not
1524 * enable data integrity for the md device.
1525 */
1526 if (!bdev_get_integrity(rdev->bdev))
1527 return -EINVAL;
1528 if (!reference) {
1529 /* Use the first rdev as the reference */
1530 reference = rdev;
1531 continue;
1532 }
1533 /* does this rdev's profile match the reference profile? */
1534 if (blk_integrity_compare(reference->bdev->bd_disk,
1535 rdev->bdev->bd_disk) < 0)
1536 return -EINVAL;
1537 }
1538 /*
1539 * All component devices are integrity capable and have matching
1540 * profiles, register the common profile for the md device.
1541 */
1542 if (blk_integrity_register(mddev->gendisk,
1543 bdev_get_integrity(reference->bdev)) != 0) {
1544 printk(KERN_ERR "md: failed to register integrity for %s\n",
1545 mdname(mddev));
1546 return -EINVAL;
1547 }
1548 printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1549 mdname(mddev));
1550 return 0;
1551}
1552EXPORT_SYMBOL(md_integrity_register);
1553
1554/* Disable data integrity if non-capable/non-matching disk is being added */
1555void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1491{ 1556{
1492 struct mdk_personality *pers = mddev->pers;
1493 struct gendisk *disk = mddev->gendisk;
1494 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1557 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1495 struct blk_integrity *bi_mddev = blk_get_integrity(disk); 1558 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1496 1559
1497 /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ 1560 if (!bi_mddev) /* nothing to do */
1498 if (pers && pers->level >= 4 && pers->level <= 6)
1499 return; 1561 return;
1500 1562 if (rdev->raid_disk < 0) /* skip spares */
1501 /* If rdev is integrity capable, register profile for mddev */
1502 if (!bi_mddev && bi_rdev) {
1503 if (blk_integrity_register(disk, bi_rdev))
1504 printk(KERN_ERR "%s: %s Could not register integrity!\n",
1505 __func__, disk->disk_name);
1506 else
1507 printk(KERN_NOTICE "Enabling data integrity on %s\n",
1508 disk->disk_name);
1509 return; 1563 return;
1510 } 1564 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1511 1565 rdev->bdev->bd_disk) >= 0)
1512 /* Check that mddev and rdev have matching profiles */ 1566 return;
1513 if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { 1567 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1514 printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, 1568 blk_integrity_unregister(mddev->gendisk);
1515 disk->disk_name, rdev->bdev->bd_disk->disk_name);
1516 printk(KERN_NOTICE "Disabling data integrity on %s\n",
1517 disk->disk_name);
1518 blk_integrity_unregister(disk);
1519 }
1520} 1569}
1570EXPORT_SYMBOL(md_integrity_add_rdev);
1521 1571
1522static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1572static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1523{ 1573{
@@ -1591,7 +1641,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1591 /* May as well allow recovery to be retried once */ 1641 /* May as well allow recovery to be retried once */
1592 mddev->recovery_disabled = 0; 1642 mddev->recovery_disabled = 0;
1593 1643
1594 md_integrity_check(rdev, mddev);
1595 return 0; 1644 return 0;
1596 1645
1597 fail: 1646 fail:
@@ -2657,6 +2706,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2657 ssize_t rv = len; 2706 ssize_t rv = len;
2658 struct mdk_personality *pers; 2707 struct mdk_personality *pers;
2659 void *priv; 2708 void *priv;
2709 mdk_rdev_t *rdev;
2660 2710
2661 if (mddev->pers == NULL) { 2711 if (mddev->pers == NULL) {
2662 if (len == 0) 2712 if (len == 0)
@@ -2736,6 +2786,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
2736 mddev_suspend(mddev); 2786 mddev_suspend(mddev);
2737 mddev->pers->stop(mddev); 2787 mddev->pers->stop(mddev);
2738 module_put(mddev->pers->owner); 2788 module_put(mddev->pers->owner);
2789 /* Invalidate devices that are now superfluous */
2790 list_for_each_entry(rdev, &mddev->disks, same_set)
2791 if (rdev->raid_disk >= mddev->raid_disks) {
2792 rdev->raid_disk = -1;
2793 clear_bit(In_sync, &rdev->flags);
2794 }
2739 mddev->pers = pers; 2795 mddev->pers = pers;
2740 mddev->private = priv; 2796 mddev->private = priv;
2741 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2797 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
@@ -3685,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len)
3685 3741
3686 mddev->array_sectors = sectors; 3742 mddev->array_sectors = sectors;
3687 set_capacity(mddev->gendisk, mddev->array_sectors); 3743 set_capacity(mddev->gendisk, mddev->array_sectors);
3688 if (mddev->pers) { 3744 if (mddev->pers)
3689 struct block_device *bdev = bdget_disk(mddev->gendisk, 0); 3745 revalidate_disk(mddev->gendisk);
3690
3691 if (bdev) {
3692 mutex_lock(&bdev->bd_inode->i_mutex);
3693 i_size_write(bdev->bd_inode,
3694 (loff_t)mddev->array_sectors << 9);
3695 mutex_unlock(&bdev->bd_inode->i_mutex);
3696 bdput(bdev);
3697 }
3698 }
3699 3746
3700 return len; 3747 return len;
3701} 3748}
@@ -4048,10 +4095,6 @@ static int do_md_run(mddev_t * mddev)
4048 } 4095 }
4049 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4096 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4050 4097
4051 if (pers->level >= 4 && pers->level <= 6)
4052 /* Cannot support integrity (yet) */
4053 blk_integrity_unregister(mddev->gendisk);
4054
4055 if (mddev->reshape_position != MaxSector && 4098 if (mddev->reshape_position != MaxSector &&
4056 pers->start_reshape == NULL) { 4099 pers->start_reshape == NULL) {
4057 /* This personality cannot handle reshaping... */ 4100 /* This personality cannot handle reshaping... */
@@ -4189,6 +4232,7 @@ static int do_md_run(mddev_t * mddev)
4189 md_wakeup_thread(mddev->thread); 4232 md_wakeup_thread(mddev->thread);
4190 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4233 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4191 4234
4235 revalidate_disk(mddev->gendisk);
4192 mddev->changed = 1; 4236 mddev->changed = 1;
4193 md_new_event(mddev); 4237 md_new_event(mddev);
4194 sysfs_notify_dirent(mddev->sysfs_state); 4238 sysfs_notify_dirent(mddev->sysfs_state);
@@ -5087,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
5087 return -ENOSPC; 5131 return -ENOSPC;
5088 } 5132 }
5089 rv = mddev->pers->resize(mddev, num_sectors); 5133 rv = mddev->pers->resize(mddev, num_sectors);
5090 if (!rv) { 5134 if (!rv)
5091 struct block_device *bdev; 5135 revalidate_disk(mddev->gendisk);
5092
5093 bdev = bdget_disk(mddev->gendisk, 0);
5094 if (bdev) {
5095 mutex_lock(&bdev->bd_inode->i_mutex);
5096 i_size_write(bdev->bd_inode,
5097 (loff_t)mddev->array_sectors << 9);
5098 mutex_unlock(&bdev->bd_inode->i_mutex);
5099 bdput(bdev);
5100 }
5101 }
5102 return rv; 5136 return rv;
5103} 5137}
5104 5138
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 9430a110db93..78f03168baf9 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -431,5 +431,7 @@ extern int md_allow_write(mddev_t *mddev);
431extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); 431extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
432extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); 432extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
433extern int md_check_no_bitmap(mddev_t *mddev); 433extern int md_check_no_bitmap(mddev_t *mddev);
434extern int md_integrity_register(mddev_t *mddev);
435void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
434 436
435#endif /* _MD_MD_H */ 437#endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 237fe3fd235c..7140909f6662 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
313 set_bit(In_sync, &rdev->flags); 313 set_bit(In_sync, &rdev->flags);
314 rcu_assign_pointer(p->rdev, rdev); 314 rcu_assign_pointer(p->rdev, rdev);
315 err = 0; 315 err = 0;
316 md_integrity_add_rdev(rdev, mddev);
316 break; 317 break;
317 } 318 }
318 319
@@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number)
345 /* lost the race, try later */ 346 /* lost the race, try later */
346 err = -EBUSY; 347 err = -EBUSY;
347 p->rdev = rdev; 348 p->rdev = rdev;
349 goto abort;
348 } 350 }
351 md_integrity_register(mddev);
349 } 352 }
350abort: 353abort:
351 354
@@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev)
519 mddev->queue->unplug_fn = multipath_unplug; 522 mddev->queue->unplug_fn = multipath_unplug;
520 mddev->queue->backing_dev_info.congested_fn = multipath_congested; 523 mddev->queue->backing_dev_info.congested_fn = multipath_congested;
521 mddev->queue->backing_dev_info.congested_data = mddev; 524 mddev->queue->backing_dev_info.congested_data = mddev;
522 525 md_integrity_register(mddev);
523 return 0; 526 return 0;
524 527
525out_free_conf: 528out_free_conf:
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 335f490dcad6..898e2bdfee47 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev)
351 351
352 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); 352 blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
353 dump_zones(mddev); 353 dump_zones(mddev);
354 md_integrity_register(mddev);
354 return 0; 355 return 0;
355} 356}
356 357
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0569efba0c02..8726fd7ebce5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1144 rcu_assign_pointer(p->rdev, rdev); 1144 rcu_assign_pointer(p->rdev, rdev);
1145 break; 1145 break;
1146 } 1146 }
1147 1147 md_integrity_add_rdev(rdev, mddev);
1148 print_conf(conf); 1148 print_conf(conf);
1149 return err; 1149 return err;
1150} 1150}
@@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number)
1178 /* lost the race, try later */ 1178 /* lost the race, try later */
1179 err = -EBUSY; 1179 err = -EBUSY;
1180 p->rdev = rdev; 1180 p->rdev = rdev;
1181 goto abort;
1181 } 1182 }
1183 md_integrity_register(mddev);
1182 } 1184 }
1183abort: 1185abort:
1184 1186
@@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev)
2067 mddev->queue->unplug_fn = raid1_unplug; 2069 mddev->queue->unplug_fn = raid1_unplug;
2068 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2070 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2069 mddev->queue->backing_dev_info.congested_data = mddev; 2071 mddev->queue->backing_dev_info.congested_data = mddev;
2070 2072 md_integrity_register(mddev);
2071 return 0; 2073 return 0;
2072 2074
2073out_no_mem: 2075out_no_mem:
@@ -2132,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
2132 return -EINVAL; 2134 return -EINVAL;
2133 set_capacity(mddev->gendisk, mddev->array_sectors); 2135 set_capacity(mddev->gendisk, mddev->array_sectors);
2134 mddev->changed = 1; 2136 mddev->changed = 1;
2137 revalidate_disk(mddev->gendisk);
2135 if (sectors > mddev->dev_sectors && 2138 if (sectors > mddev->dev_sectors &&
2136 mddev->recovery_cp == MaxSector) { 2139 mddev->recovery_cp == MaxSector) {
2137 mddev->recovery_cp = mddev->dev_sectors; 2140 mddev->recovery_cp = mddev->dev_sectors;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 7298a5e5a183..3d9020cf6f6e 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1170 break; 1170 break;
1171 } 1171 }
1172 1172
1173 md_integrity_add_rdev(rdev, mddev);
1173 print_conf(conf); 1174 print_conf(conf);
1174 return err; 1175 return err;
1175} 1176}
@@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number)
1203 /* lost the race, try later */ 1204 /* lost the race, try later */
1204 err = -EBUSY; 1205 err = -EBUSY;
1205 p->rdev = rdev; 1206 p->rdev = rdev;
1207 goto abort;
1206 } 1208 }
1209 md_integrity_register(mddev);
1207 } 1210 }
1208abort: 1211abort:
1209 1212
@@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev)
2225 2228
2226 if (conf->near_copies < mddev->raid_disks) 2229 if (conf->near_copies < mddev->raid_disks)
2227 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 2230 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
2231 md_integrity_register(mddev);
2228 return 0; 2232 return 0;
2229 2233
2230out_free_conf: 2234out_free_conf:
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 37835538b58e..2b521ee67dfa 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski
3999 return 0; 3999 return 0;
4000 } 4000 }
4001 4001
4002 /* Allow raid5_quiesce to complete */
4003 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4004
4002 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4005 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4003 return reshape_request(mddev, sector_nr, skipped); 4006 return reshape_request(mddev, sector_nr, skipped);
4004 4007
@@ -4316,6 +4319,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4316 return sectors * (raid_disks - conf->max_degraded); 4319 return sectors * (raid_disks - conf->max_degraded);
4317} 4320}
4318 4321
4322static void free_conf(raid5_conf_t *conf)
4323{
4324 shrink_stripes(conf);
4325 safe_put_page(conf->spare_page);
4326 kfree(conf->disks);
4327 kfree(conf->stripe_hashtbl);
4328 kfree(conf);
4329}
4330
4319static raid5_conf_t *setup_conf(mddev_t *mddev) 4331static raid5_conf_t *setup_conf(mddev_t *mddev)
4320{ 4332{
4321 raid5_conf_t *conf; 4333 raid5_conf_t *conf;
@@ -4447,11 +4459,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
4447 4459
4448 abort: 4460 abort:
4449 if (conf) { 4461 if (conf) {
4450 shrink_stripes(conf); 4462 free_conf(conf);
4451 safe_put_page(conf->spare_page);
4452 kfree(conf->disks);
4453 kfree(conf->stripe_hashtbl);
4454 kfree(conf);
4455 return ERR_PTR(-EIO); 4463 return ERR_PTR(-EIO);
4456 } else 4464 } else
4457 return ERR_PTR(-ENOMEM); 4465 return ERR_PTR(-ENOMEM);
@@ -4629,12 +4637,8 @@ abort:
4629 md_unregister_thread(mddev->thread); 4637 md_unregister_thread(mddev->thread);
4630 mddev->thread = NULL; 4638 mddev->thread = NULL;
4631 if (conf) { 4639 if (conf) {
4632 shrink_stripes(conf);
4633 print_raid5_conf(conf); 4640 print_raid5_conf(conf);
4634 safe_put_page(conf->spare_page); 4641 free_conf(conf);
4635 kfree(conf->disks);
4636 kfree(conf->stripe_hashtbl);
4637 kfree(conf);
4638 } 4642 }
4639 mddev->private = NULL; 4643 mddev->private = NULL;
4640 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 4644 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
@@ -4649,13 +4653,10 @@ static int stop(mddev_t *mddev)
4649 4653
4650 md_unregister_thread(mddev->thread); 4654 md_unregister_thread(mddev->thread);
4651 mddev->thread = NULL; 4655 mddev->thread = NULL;
4652 shrink_stripes(conf);
4653 kfree(conf->stripe_hashtbl);
4654 mddev->queue->backing_dev_info.congested_fn = NULL; 4656 mddev->queue->backing_dev_info.congested_fn = NULL;
4655 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 4657 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
4656 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 4658 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
4657 kfree(conf->disks); 4659 free_conf(conf);
4658 kfree(conf);
4659 mddev->private = NULL; 4660 mddev->private = NULL;
4660 return 0; 4661 return 0;
4661} 4662}
@@ -4857,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
4857 return -EINVAL; 4858 return -EINVAL;
4858 set_capacity(mddev->gendisk, mddev->array_sectors); 4859 set_capacity(mddev->gendisk, mddev->array_sectors);
4859 mddev->changed = 1; 4860 mddev->changed = 1;
4861 revalidate_disk(mddev->gendisk);
4860 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 4862 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
4861 mddev->recovery_cp = mddev->dev_sectors; 4863 mddev->recovery_cp = mddev->dev_sectors;
4862 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4864 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5002,7 +5004,7 @@ static int raid5_start_reshape(mddev_t *mddev)
5002 spin_unlock_irqrestore(&conf->device_lock, flags); 5004 spin_unlock_irqrestore(&conf->device_lock, flags);
5003 } 5005 }
5004 mddev->raid_disks = conf->raid_disks; 5006 mddev->raid_disks = conf->raid_disks;
5005 mddev->reshape_position = 0; 5007 mddev->reshape_position = conf->reshape_progress;
5006 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5008 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5007 5009
5008 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5010 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5057,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf)
5057 */ 5059 */
5058static void raid5_finish_reshape(mddev_t *mddev) 5060static void raid5_finish_reshape(mddev_t *mddev)
5059{ 5061{
5060 struct block_device *bdev;
5061 raid5_conf_t *conf = mddev->private; 5062 raid5_conf_t *conf = mddev->private;
5062 5063
5063 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5064 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
@@ -5066,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
5066 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5067 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5067 set_capacity(mddev->gendisk, mddev->array_sectors); 5068 set_capacity(mddev->gendisk, mddev->array_sectors);
5068 mddev->changed = 1; 5069 mddev->changed = 1;
5069 5070 revalidate_disk(mddev->gendisk);
5070 bdev = bdget_disk(mddev->gendisk, 0);
5071 if (bdev) {
5072 mutex_lock(&bdev->bd_inode->i_mutex);
5073 i_size_write(bdev->bd_inode,
5074 (loff_t)mddev->array_sectors << 9);
5075 mutex_unlock(&bdev->bd_inode->i_mutex);
5076 bdput(bdev);
5077 }
5078 } else { 5071 } else {
5079 int d; 5072 int d;
5080 mddev->degraded = conf->raid_disks; 5073 mddev->degraded = conf->raid_disks;
@@ -5106,12 +5099,18 @@ static void raid5_quiesce(mddev_t *mddev, int state)
5106 5099
5107 case 1: /* stop all writes */ 5100 case 1: /* stop all writes */
5108 spin_lock_irq(&conf->device_lock); 5101 spin_lock_irq(&conf->device_lock);
5109 conf->quiesce = 1; 5102 /* '2' tells resync/reshape to pause so that all
5103 * active stripes can drain
5104 */
5105 conf->quiesce = 2;
5110 wait_event_lock_irq(conf->wait_for_stripe, 5106 wait_event_lock_irq(conf->wait_for_stripe,
5111 atomic_read(&conf->active_stripes) == 0 && 5107 atomic_read(&conf->active_stripes) == 0 &&
5112 atomic_read(&conf->active_aligned_reads) == 0, 5108 atomic_read(&conf->active_aligned_reads) == 0,
5113 conf->device_lock, /* nothing */); 5109 conf->device_lock, /* nothing */);
5110 conf->quiesce = 1;
5114 spin_unlock_irq(&conf->device_lock); 5111 spin_unlock_irq(&conf->device_lock);
5112 /* allow reshape to continue */
5113 wake_up(&conf->wait_for_overlap);
5115 break; 5114 break;
5116 5115
5117 case 0: /* re-enable writes */ 5116 case 0: /* re-enable writes */
diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c
index bae61b22501c..7d430835655f 100644
--- a/drivers/mfd/twl4030-irq.c
+++ b/drivers/mfd/twl4030-irq.c
@@ -180,14 +180,9 @@ static struct completion irq_event;
180static int twl4030_irq_thread(void *data) 180static int twl4030_irq_thread(void *data)
181{ 181{
182 long irq = (long)data; 182 long irq = (long)data;
183 struct irq_desc *desc = irq_to_desc(irq);
184 static unsigned i2c_errors; 183 static unsigned i2c_errors;
185 static const unsigned max_i2c_errors = 100; 184 static const unsigned max_i2c_errors = 100;
186 185
187 if (!desc) {
188 pr_err("twl4030: Invalid IRQ: %ld\n", irq);
189 return -EINVAL;
190 }
191 186
192 current->flags |= PF_NOFREEZE; 187 current->flags |= PF_NOFREEZE;
193 188
@@ -240,7 +235,7 @@ static int twl4030_irq_thread(void *data)
240 } 235 }
241 local_irq_enable(); 236 local_irq_enable();
242 237
243 desc->chip->unmask(irq); 238 enable_irq(irq);
244 } 239 }
245 240
246 return 0; 241 return 0;
@@ -255,25 +250,13 @@ static int twl4030_irq_thread(void *data)
255 * thread. All we do here is acknowledge and mask the interrupt and wakeup 250 * thread. All we do here is acknowledge and mask the interrupt and wakeup
256 * the kernel thread. 251 * the kernel thread.
257 */ 252 */
258static void handle_twl4030_pih(unsigned int irq, struct irq_desc *desc) 253static irqreturn_t handle_twl4030_pih(int irq, void *devid)
259{ 254{
260 /* Acknowledge, clear *AND* mask the interrupt... */ 255 /* Acknowledge, clear *AND* mask the interrupt... */
261 desc->chip->ack(irq); 256 disable_irq_nosync(irq);
262 complete(&irq_event); 257 complete(devid);
263} 258 return IRQ_HANDLED;
264
265static struct task_struct *start_twl4030_irq_thread(long irq)
266{
267 struct task_struct *thread;
268
269 init_completion(&irq_event);
270 thread = kthread_run(twl4030_irq_thread, (void *)irq, "twl4030-irq");
271 if (!thread)
272 pr_err("twl4030: could not create irq %ld thread!\n", irq);
273
274 return thread;
275} 259}
276
277/*----------------------------------------------------------------------*/ 260/*----------------------------------------------------------------------*/
278 261
279/* 262/*
@@ -734,18 +717,28 @@ int twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end)
734 } 717 }
735 718
736 /* install an irq handler to demultiplex the TWL4030 interrupt */ 719 /* install an irq handler to demultiplex the TWL4030 interrupt */
737 task = start_twl4030_irq_thread(irq_num);
738 if (!task) {
739 pr_err("twl4030: irq thread FAIL\n");
740 status = -ESRCH;
741 goto fail;
742 }
743 720
744 set_irq_data(irq_num, task);
745 set_irq_chained_handler(irq_num, handle_twl4030_pih);
746 721
747 return status; 722 init_completion(&irq_event);
748 723
724 status = request_irq(irq_num, handle_twl4030_pih, IRQF_DISABLED,
725 "TWL4030-PIH", &irq_event);
726 if (status < 0) {
727 pr_err("twl4030: could not claim irq%d: %d\n", irq_num, status);
728 goto fail_rqirq;
729 }
730
731 task = kthread_run(twl4030_irq_thread, (void *)irq_num, "twl4030-irq");
732 if (IS_ERR(task)) {
733 pr_err("twl4030: could not create irq %d thread!\n", irq_num);
734 status = PTR_ERR(task);
735 goto fail_kthread;
736 }
737 return status;
738fail_kthread:
739 free_irq(irq_num, &irq_event);
740fail_rqirq:
741 /* clean up twl4030_sih_setup */
749fail: 742fail:
750 for (i = irq_base; i < irq_end; i++) 743 for (i = irq_base; i < irq_end; i++)
751 set_irq_chip_and_handler(i, NULL, NULL); 744 set_irq_chip_and_handler(i, NULL, NULL);
diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c
index d38a7acdb6ec..d019746551f3 100644
--- a/drivers/misc/cb710/sgbuf2.c
+++ b/drivers/misc/cb710/sgbuf2.c
@@ -114,7 +114,6 @@ static void sg_dwiter_write_slow(struct sg_mapping_iter *miter, uint32_t data)
114 if (!left) 114 if (!left)
115 return; 115 return;
116 addr += len; 116 addr += len;
117 flush_kernel_dcache_page(miter->page);
118 } while (sg_dwiter_next(miter)); 117 } while (sg_dwiter_next(miter));
119} 118}
120 119
@@ -142,9 +141,6 @@ void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t da
142 return; 141 return;
143 } else 142 } else
144 sg_dwiter_write_slow(miter, data); 143 sg_dwiter_write_slow(miter, data);
145
146 if (miter->length == miter->consumed)
147 flush_kernel_dcache_page(miter->page);
148} 144}
149EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block); 145EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block);
150 146
diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c
index 11efefb1af51..4e72964a7b43 100644
--- a/drivers/mmc/host/cb710-mmc.c
+++ b/drivers/mmc/host/cb710-mmc.c
@@ -278,7 +278,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data)
278 if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8))) 278 if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8)))
279 return -EINVAL; 279 return -EINVAL;
280 280
281 sg_miter_start(&miter, data->sg, data->sg_len, 0); 281 sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_TO_SG);
282 282
283 cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 283 cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT,
284 15, CB710_MMC_C2_READ_PIO_SIZE_MASK); 284 15, CB710_MMC_C2_READ_PIO_SIZE_MASK);
@@ -307,7 +307,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data)
307 goto out; 307 goto out;
308 } 308 }
309out: 309out:
310 cb710_sg_miter_stop_writing(&miter); 310 sg_miter_stop(&miter);
311 return err; 311 return err;
312} 312}
313 313
@@ -322,7 +322,7 @@ static int cb710_mmc_send(struct cb710_slot *slot, struct mmc_data *data)
322 if (unlikely(data->blocks > 1 && data->blksz & 15)) 322 if (unlikely(data->blocks > 1 && data->blksz & 15))
323 return -EINVAL; 323 return -EINVAL;
324 324
325 sg_miter_start(&miter, data->sg, data->sg_len, 0); 325 sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_FROM_SG);
326 326
327 cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 327 cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT,
328 0, CB710_MMC_C2_READ_PIO_SIZE_MASK); 328 0, CB710_MMC_C2_READ_PIO_SIZE_MASK);
diff --git a/drivers/mmc/host/imxmmc.c b/drivers/mmc/host/imxmmc.c
index e0be21a4a696..bf98d7cc928a 100644
--- a/drivers/mmc/host/imxmmc.c
+++ b/drivers/mmc/host/imxmmc.c
@@ -652,7 +652,7 @@ static irqreturn_t imxmci_irq(int irq, void *devid)
652 set_bit(IMXMCI_PEND_STARTED_b, &host->pending_events); 652 set_bit(IMXMCI_PEND_STARTED_b, &host->pending_events);
653 tasklet_schedule(&host->tasklet); 653 tasklet_schedule(&host->tasklet);
654 654
655 return IRQ_RETVAL(handled);; 655 return IRQ_RETVAL(handled);
656} 656}
657 657
658static void imxmci_tasklet_fnc(unsigned long data) 658static void imxmci_tasklet_fnc(unsigned long data)
diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c
index 62041c7e9246..fc96f8cb9c0b 100644
--- a/drivers/mmc/host/sdhci.c
+++ b/drivers/mmc/host/sdhci.c
@@ -773,8 +773,14 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data)
773 } 773 }
774 774
775 if (!(host->flags & SDHCI_REQ_USE_DMA)) { 775 if (!(host->flags & SDHCI_REQ_USE_DMA)) {
776 sg_miter_start(&host->sg_miter, 776 int flags;
777 data->sg, data->sg_len, SG_MITER_ATOMIC); 777
778 flags = SG_MITER_ATOMIC;
779 if (host->data->flags & MMC_DATA_READ)
780 flags |= SG_MITER_TO_SG;
781 else
782 flags |= SG_MITER_FROM_SG;
783 sg_miter_start(&host->sg_miter, data->sg, data->sg_len, flags);
778 host->blocks = data->blocks; 784 host->blocks = data->blocks;
779 } 785 }
780 786
diff --git a/drivers/net/3c515.c b/drivers/net/3c515.c
index 3e00fa8ea65f..4a7c32895be5 100644
--- a/drivers/net/3c515.c
+++ b/drivers/net/3c515.c
@@ -832,7 +832,9 @@ static int corkscrew_open(struct net_device *dev)
832 skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */ 832 skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */
833 vp->rx_ring[i].addr = isa_virt_to_bus(skb->data); 833 vp->rx_ring[i].addr = isa_virt_to_bus(skb->data);
834 } 834 }
835 vp->rx_ring[i - 1].next = isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */ 835 if (i != 0)
836 vp->rx_ring[i - 1].next =
837 isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */
836 outl(isa_virt_to_bus(&vp->rx_ring[0]), ioaddr + UpListPtr); 838 outl(isa_virt_to_bus(&vp->rx_ring[0]), ioaddr + UpListPtr);
837 } 839 }
838 if (vp->full_bus_master_tx) { /* Boomerang bus master Tx. */ 840 if (vp->full_bus_master_tx) { /* Boomerang bus master Tx. */
diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c
index c34aee91250b..c20416850948 100644
--- a/drivers/net/3c59x.c
+++ b/drivers/net/3c59x.c
@@ -2721,13 +2721,15 @@ dump_tx_ring(struct net_device *dev)
2721 &vp->tx_ring[vp->dirty_tx % TX_RING_SIZE]); 2721 &vp->tx_ring[vp->dirty_tx % TX_RING_SIZE]);
2722 issue_and_wait(dev, DownStall); 2722 issue_and_wait(dev, DownStall);
2723 for (i = 0; i < TX_RING_SIZE; i++) { 2723 for (i = 0; i < TX_RING_SIZE; i++) {
2724 pr_err(" %d: @%p length %8.8x status %8.8x\n", i, 2724 unsigned int length;
2725 &vp->tx_ring[i], 2725
2726#if DO_ZEROCOPY 2726#if DO_ZEROCOPY
2727 le32_to_cpu(vp->tx_ring[i].frag[0].length), 2727 length = le32_to_cpu(vp->tx_ring[i].frag[0].length);
2728#else 2728#else
2729 le32_to_cpu(vp->tx_ring[i].length), 2729 length = le32_to_cpu(vp->tx_ring[i].length);
2730#endif 2730#endif
2731 pr_err(" %d: @%p length %8.8x status %8.8x\n",
2732 i, &vp->tx_ring[i], length,
2731 le32_to_cpu(vp->tx_ring[i].status)); 2733 le32_to_cpu(vp->tx_ring[i].status));
2732 } 2734 }
2733 if (!stalled) 2735 if (!stalled)
diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c
index 1686dca28748..1f016d66684a 100644
--- a/drivers/net/eexpress.c
+++ b/drivers/net/eexpress.c
@@ -1474,13 +1474,13 @@ static void eexp_hw_init586(struct net_device *dev)
1474 outw(0x0000, ioaddr + 0x800c); 1474 outw(0x0000, ioaddr + 0x800c);
1475 outw(0x0000, ioaddr + 0x800e); 1475 outw(0x0000, ioaddr + 0x800e);
1476 1476
1477 for (i = 0; i < (sizeof(start_code)); i+=32) { 1477 for (i = 0; i < ARRAY_SIZE(start_code) * 2; i+=32) {
1478 int j; 1478 int j;
1479 outw(i, ioaddr + SM_PTR); 1479 outw(i, ioaddr + SM_PTR);
1480 for (j = 0; j < 16; j+=2) 1480 for (j = 0; j < 16 && (i+j)/2 < ARRAY_SIZE(start_code); j+=2)
1481 outw(start_code[(i+j)/2], 1481 outw(start_code[(i+j)/2],
1482 ioaddr+0x4000+j); 1482 ioaddr+0x4000+j);
1483 for (j = 0; j < 16; j+=2) 1483 for (j = 0; j < 16 && (i+j+16)/2 < ARRAY_SIZE(start_code); j+=2)
1484 outw(start_code[(i+j+16)/2], 1484 outw(start_code[(i+j+16)/2],
1485 ioaddr+0x8000+j); 1485 ioaddr+0x8000+j);
1486 } 1486 }
diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h
index 78952f8324e2..fa311a950996 100644
--- a/drivers/net/ehea/ehea.h
+++ b/drivers/net/ehea/ehea.h
@@ -40,7 +40,7 @@
40#include <asm/io.h> 40#include <asm/io.h>
41 41
42#define DRV_NAME "ehea" 42#define DRV_NAME "ehea"
43#define DRV_VERSION "EHEA_0101" 43#define DRV_VERSION "EHEA_0102"
44 44
45/* eHEA capability flags */ 45/* eHEA capability flags */
46#define DLPAR_PORT_ADD_REM 1 46#define DLPAR_PORT_ADD_REM 1
diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c
index e8d46cc1bec2..977c3d358279 100644
--- a/drivers/net/ehea/ehea_main.c
+++ b/drivers/net/ehea/ehea_main.c
@@ -1545,6 +1545,9 @@ static int ehea_clean_portres(struct ehea_port *port, struct ehea_port_res *pr)
1545{ 1545{
1546 int ret, i; 1546 int ret, i;
1547 1547
1548 if (pr->qp)
1549 netif_napi_del(&pr->napi);
1550
1548 ret = ehea_destroy_qp(pr->qp); 1551 ret = ehea_destroy_qp(pr->qp);
1549 1552
1550 if (!ret) { 1553 if (!ret) {
diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c
index dbf06e9313cc..2234118eedbb 100644
--- a/drivers/net/gianfar_ethtool.c
+++ b/drivers/net/gianfar_ethtool.c
@@ -366,9 +366,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals
366 return -EINVAL; 366 return -EINVAL;
367 } 367 }
368 368
369 priv->rxic = mk_ic_value( 369 priv->rxic = mk_ic_value(cvals->rx_max_coalesced_frames,
370 gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs), 370 gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs));
371 cvals->rx_max_coalesced_frames);
372 371
373 /* Set up tx coalescing */ 372 /* Set up tx coalescing */
374 if ((cvals->tx_coalesce_usecs == 0) || 373 if ((cvals->tx_coalesce_usecs == 0) ||
@@ -390,9 +389,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals
390 return -EINVAL; 389 return -EINVAL;
391 } 390 }
392 391
393 priv->txic = mk_ic_value( 392 priv->txic = mk_ic_value(cvals->tx_max_coalesced_frames,
394 gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs), 393 gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs));
395 cvals->tx_max_coalesced_frames);
396 394
397 gfar_write(&priv->regs->rxic, 0); 395 gfar_write(&priv->regs->rxic, 0);
398 if (priv->rxcoalescing) 396 if (priv->rxcoalescing)
diff --git a/drivers/net/igbvf/vf.c b/drivers/net/igbvf/vf.c
index 2a4faf9ade69..a9a61efa964c 100644
--- a/drivers/net/igbvf/vf.c
+++ b/drivers/net/igbvf/vf.c
@@ -274,6 +274,8 @@ static s32 e1000_set_vfta_vf(struct e1000_hw *hw, u16 vid, bool set)
274 274
275 err = mbx->ops.read_posted(hw, msgbuf, 2); 275 err = mbx->ops.read_posted(hw, msgbuf, 2);
276 276
277 msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS;
278
277 /* if nacked the vlan was rejected */ 279 /* if nacked the vlan was rejected */
278 if (!err && (msgbuf[0] == (E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK))) 280 if (!err && (msgbuf[0] == (E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK)))
279 err = -E1000_ERR_MAC_INIT; 281 err = -E1000_ERR_MAC_INIT;
@@ -317,6 +319,8 @@ static void e1000_rar_set_vf(struct e1000_hw *hw, u8 * addr, u32 index)
317 if (!ret_val) 319 if (!ret_val)
318 ret_val = mbx->ops.read_posted(hw, msgbuf, 3); 320 ret_val = mbx->ops.read_posted(hw, msgbuf, 3);
319 321
322 msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS;
323
320 /* if nacked the address was rejected, use "perm_addr" */ 324 /* if nacked the address was rejected, use "perm_addr" */
321 if (!ret_val && 325 if (!ret_val &&
322 (msgbuf[0] == (E1000_VF_SET_MAC_ADDR | E1000_VT_MSGTYPE_NACK))) 326 (msgbuf[0] == (E1000_VF_SET_MAC_ADDR | E1000_VT_MSGTYPE_NACK)))
diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h
index 1b12c7ba275f..e11d83d5852b 100644
--- a/drivers/net/ixgbe/ixgbe.h
+++ b/drivers/net/ixgbe/ixgbe.h
@@ -96,6 +96,8 @@
96#define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000 96#define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000
97#define IXGBE_TX_FLAGS_VLAN_SHIFT 16 97#define IXGBE_TX_FLAGS_VLAN_SHIFT 16
98 98
99#define IXGBE_MAX_RSC_INT_RATE 162760
100
99/* wrapper around a pointer to a socket buffer, 101/* wrapper around a pointer to a socket buffer,
100 * so a DMA handle can be stored along with the buffer */ 102 * so a DMA handle can be stored along with the buffer */
101struct ixgbe_tx_buffer { 103struct ixgbe_tx_buffer {
diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c
index b9923047ce11..522c03bc1dad 100644
--- a/drivers/net/ixgbe/ixgbe_82598.c
+++ b/drivers/net/ixgbe/ixgbe_82598.c
@@ -50,6 +50,51 @@ static s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset,
50 u8 *eeprom_data); 50 u8 *eeprom_data);
51 51
52/** 52/**
53 * ixgbe_set_pcie_completion_timeout - set pci-e completion timeout
54 * @hw: pointer to the HW structure
55 *
56 * The defaults for 82598 should be in the range of 50us to 50ms,
57 * however the hardware default for these parts is 500us to 1ms which is less
58 * than the 10ms recommended by the pci-e spec. To address this we need to
59 * increase the value to either 10ms to 250ms for capability version 1 config,
60 * or 16ms to 55ms for version 2.
61 **/
62void ixgbe_set_pcie_completion_timeout(struct ixgbe_hw *hw)
63{
64 struct ixgbe_adapter *adapter = hw->back;
65 u32 gcr = IXGBE_READ_REG(hw, IXGBE_GCR);
66 u16 pcie_devctl2;
67
68 /* only take action if timeout value is defaulted to 0 */
69 if (gcr & IXGBE_GCR_CMPL_TMOUT_MASK)
70 goto out;
71
72 /*
73 * if capababilities version is type 1 we can write the
74 * timeout of 10ms to 250ms through the GCR register
75 */
76 if (!(gcr & IXGBE_GCR_CAP_VER2)) {
77 gcr |= IXGBE_GCR_CMPL_TMOUT_10ms;
78 goto out;
79 }
80
81 /*
82 * for version 2 capabilities we need to write the config space
83 * directly in order to set the completion timeout value for
84 * 16ms to 55ms
85 */
86 pci_read_config_word(adapter->pdev,
87 IXGBE_PCI_DEVICE_CONTROL2, &pcie_devctl2);
88 pcie_devctl2 |= IXGBE_PCI_DEVICE_CONTROL2_16ms;
89 pci_write_config_word(adapter->pdev,
90 IXGBE_PCI_DEVICE_CONTROL2, pcie_devctl2);
91out:
92 /* disable completion timeout resend */
93 gcr &= ~IXGBE_GCR_CMPL_TMOUT_RESEND;
94 IXGBE_WRITE_REG(hw, IXGBE_GCR, gcr);
95}
96
97/**
53 * ixgbe_get_pcie_msix_count_82598 - Gets MSI-X vector count 98 * ixgbe_get_pcie_msix_count_82598 - Gets MSI-X vector count
54 * @hw: pointer to hardware structure 99 * @hw: pointer to hardware structure
55 * 100 *
@@ -153,6 +198,26 @@ out:
153} 198}
154 199
155/** 200/**
201 * ixgbe_start_hw_82598 - Prepare hardware for Tx/Rx
202 * @hw: pointer to hardware structure
203 *
204 * Starts the hardware using the generic start_hw function.
205 * Then set pcie completion timeout
206 **/
207s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw)
208{
209 s32 ret_val = 0;
210
211 ret_val = ixgbe_start_hw_generic(hw);
212
213 /* set the completion timeout for interface */
214 if (ret_val == 0)
215 ixgbe_set_pcie_completion_timeout(hw);
216
217 return ret_val;
218}
219
220/**
156 * ixgbe_get_link_capabilities_82598 - Determines link capabilities 221 * ixgbe_get_link_capabilities_82598 - Determines link capabilities
157 * @hw: pointer to hardware structure 222 * @hw: pointer to hardware structure
158 * @speed: pointer to link speed 223 * @speed: pointer to link speed
@@ -1085,7 +1150,7 @@ out:
1085static struct ixgbe_mac_operations mac_ops_82598 = { 1150static struct ixgbe_mac_operations mac_ops_82598 = {
1086 .init_hw = &ixgbe_init_hw_generic, 1151 .init_hw = &ixgbe_init_hw_generic,
1087 .reset_hw = &ixgbe_reset_hw_82598, 1152 .reset_hw = &ixgbe_reset_hw_82598,
1088 .start_hw = &ixgbe_start_hw_generic, 1153 .start_hw = &ixgbe_start_hw_82598,
1089 .clear_hw_cntrs = &ixgbe_clear_hw_cntrs_generic, 1154 .clear_hw_cntrs = &ixgbe_clear_hw_cntrs_generic,
1090 .get_media_type = &ixgbe_get_media_type_82598, 1155 .get_media_type = &ixgbe_get_media_type_82598,
1091 .get_supported_physical_layer = &ixgbe_get_supported_physical_layer_82598, 1156 .get_supported_physical_layer = &ixgbe_get_supported_physical_layer_82598,
diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c
index 2a978008fd6e..79144e950a34 100644
--- a/drivers/net/ixgbe/ixgbe_ethtool.c
+++ b/drivers/net/ixgbe/ixgbe_ethtool.c
@@ -1975,7 +1975,10 @@ static int ixgbe_set_coalesce(struct net_device *netdev,
1975 * any other value means disable eitr, which is best 1975 * any other value means disable eitr, which is best
1976 * served by setting the interrupt rate very high 1976 * served by setting the interrupt rate very high
1977 */ 1977 */
1978 adapter->eitr_param = IXGBE_MAX_INT_RATE; 1978 if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
1979 adapter->eitr_param = IXGBE_MAX_RSC_INT_RATE;
1980 else
1981 adapter->eitr_param = IXGBE_MAX_INT_RATE;
1979 adapter->itr_setting = 0; 1982 adapter->itr_setting = 0;
1980 } 1983 }
1981 1984
@@ -1999,13 +2002,13 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data)
1999 2002
2000 ethtool_op_set_flags(netdev, data); 2003 ethtool_op_set_flags(netdev, data);
2001 2004
2002 if (!(adapter->flags & IXGBE_FLAG2_RSC_CAPABLE)) 2005 if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE))
2003 return 0; 2006 return 0;
2004 2007
2005 /* if state changes we need to update adapter->flags and reset */ 2008 /* if state changes we need to update adapter->flags and reset */
2006 if ((!!(data & ETH_FLAG_LRO)) != 2009 if ((!!(data & ETH_FLAG_LRO)) !=
2007 (!!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED))) { 2010 (!!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED))) {
2008 adapter->flags ^= IXGBE_FLAG2_RSC_ENABLED; 2011 adapter->flags2 ^= IXGBE_FLAG2_RSC_ENABLED;
2009 if (netif_running(netdev)) 2012 if (netif_running(netdev))
2010 ixgbe_reinit_locked(adapter); 2013 ixgbe_reinit_locked(adapter);
2011 else 2014 else
diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c
index 200454f30f6a..110c65ab5cb5 100644
--- a/drivers/net/ixgbe/ixgbe_main.c
+++ b/drivers/net/ixgbe/ixgbe_main.c
@@ -780,7 +780,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector,
780 prefetch(next_rxd); 780 prefetch(next_rxd);
781 cleaned_count++; 781 cleaned_count++;
782 782
783 if (adapter->flags & IXGBE_FLAG2_RSC_CAPABLE) 783 if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)
784 rsc_count = ixgbe_get_rsc_count(rx_desc); 784 rsc_count = ixgbe_get_rsc_count(rx_desc);
785 785
786 if (rsc_count) { 786 if (rsc_count) {
@@ -2036,7 +2036,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
2036 IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); 2036 IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype);
2037 } 2037 }
2038 } else { 2038 } else {
2039 if (!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED) && 2039 if (!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) &&
2040 (netdev->mtu <= ETH_DATA_LEN)) 2040 (netdev->mtu <= ETH_DATA_LEN))
2041 rx_buf_len = MAXIMUM_ETHERNET_VLAN_SIZE; 2041 rx_buf_len = MAXIMUM_ETHERNET_VLAN_SIZE;
2042 else 2042 else
@@ -2165,7 +2165,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter)
2165 IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); 2165 IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl);
2166 } 2166 }
2167 2167
2168 if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) { 2168 if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) {
2169 /* Enable 82599 HW-RSC */ 2169 /* Enable 82599 HW-RSC */
2170 for (i = 0; i < adapter->num_rx_queues; i++) { 2170 for (i = 0; i < adapter->num_rx_queues; i++) {
2171 j = adapter->rx_ring[i].reg_idx; 2171 j = adapter->rx_ring[i].reg_idx;
@@ -3812,8 +3812,8 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter)
3812 adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82598; 3812 adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82598;
3813 } else if (hw->mac.type == ixgbe_mac_82599EB) { 3813 } else if (hw->mac.type == ixgbe_mac_82599EB) {
3814 adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599; 3814 adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599;
3815 adapter->flags |= IXGBE_FLAG2_RSC_CAPABLE; 3815 adapter->flags2 |= IXGBE_FLAG2_RSC_CAPABLE;
3816 adapter->flags |= IXGBE_FLAG2_RSC_ENABLED; 3816 adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED;
3817 adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE; 3817 adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE;
3818 adapter->ring_feature[RING_F_FDIR].indices = 3818 adapter->ring_feature[RING_F_FDIR].indices =
3819 IXGBE_MAX_FDIR_INDICES; 3819 IXGBE_MAX_FDIR_INDICES;
@@ -5360,12 +5360,19 @@ static int ixgbe_del_sanmac_netdev(struct net_device *dev)
5360static void ixgbe_netpoll(struct net_device *netdev) 5360static void ixgbe_netpoll(struct net_device *netdev)
5361{ 5361{
5362 struct ixgbe_adapter *adapter = netdev_priv(netdev); 5362 struct ixgbe_adapter *adapter = netdev_priv(netdev);
5363 int i;
5363 5364
5364 disable_irq(adapter->pdev->irq);
5365 adapter->flags |= IXGBE_FLAG_IN_NETPOLL; 5365 adapter->flags |= IXGBE_FLAG_IN_NETPOLL;
5366 ixgbe_intr(adapter->pdev->irq, netdev); 5366 if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED) {
5367 int num_q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS;
5368 for (i = 0; i < num_q_vectors; i++) {
5369 struct ixgbe_q_vector *q_vector = adapter->q_vector[i];
5370 ixgbe_msix_clean_many(0, q_vector);
5371 }
5372 } else {
5373 ixgbe_intr(adapter->pdev->irq, netdev);
5374 }
5367 adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL; 5375 adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL;
5368 enable_irq(adapter->pdev->irq);
5369} 5376}
5370#endif 5377#endif
5371 5378
@@ -5611,7 +5618,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev,
5611 if (pci_using_dac) 5618 if (pci_using_dac)
5612 netdev->features |= NETIF_F_HIGHDMA; 5619 netdev->features |= NETIF_F_HIGHDMA;
5613 5620
5614 if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) 5621 if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED)
5615 netdev->features |= NETIF_F_LRO; 5622 netdev->features |= NETIF_F_LRO;
5616 5623
5617 /* make sure the EEPROM is good */ 5624 /* make sure the EEPROM is good */
diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h
index fa87309dc087..be90eb4575f6 100644
--- a/drivers/net/ixgbe/ixgbe_type.h
+++ b/drivers/net/ixgbe/ixgbe_type.h
@@ -718,6 +718,12 @@
718#define IXGBE_ECC_STATUS_82599 0x110E0 718#define IXGBE_ECC_STATUS_82599 0x110E0
719#define IXGBE_BAR_CTRL_82599 0x110F4 719#define IXGBE_BAR_CTRL_82599 0x110F4
720 720
721/* PCI Express Control */
722#define IXGBE_GCR_CMPL_TMOUT_MASK 0x0000F000
723#define IXGBE_GCR_CMPL_TMOUT_10ms 0x00001000
724#define IXGBE_GCR_CMPL_TMOUT_RESEND 0x00010000
725#define IXGBE_GCR_CAP_VER2 0x00040000
726
721/* Time Sync Registers */ 727/* Time Sync Registers */
722#define IXGBE_TSYNCRXCTL 0x05188 /* Rx Time Sync Control register - RW */ 728#define IXGBE_TSYNCRXCTL 0x05188 /* Rx Time Sync Control register - RW */
723#define IXGBE_TSYNCTXCTL 0x08C00 /* Tx Time Sync Control register - RW */ 729#define IXGBE_TSYNCTXCTL 0x08C00 /* Tx Time Sync Control register - RW */
@@ -1521,6 +1527,7 @@
1521 1527
1522/* PCI Bus Info */ 1528/* PCI Bus Info */
1523#define IXGBE_PCI_LINK_STATUS 0xB2 1529#define IXGBE_PCI_LINK_STATUS 0xB2
1530#define IXGBE_PCI_DEVICE_CONTROL2 0xC8
1524#define IXGBE_PCI_LINK_WIDTH 0x3F0 1531#define IXGBE_PCI_LINK_WIDTH 0x3F0
1525#define IXGBE_PCI_LINK_WIDTH_1 0x10 1532#define IXGBE_PCI_LINK_WIDTH_1 0x10
1526#define IXGBE_PCI_LINK_WIDTH_2 0x20 1533#define IXGBE_PCI_LINK_WIDTH_2 0x20
@@ -1531,6 +1538,7 @@
1531#define IXGBE_PCI_LINK_SPEED_5000 0x2 1538#define IXGBE_PCI_LINK_SPEED_5000 0x2
1532#define IXGBE_PCI_HEADER_TYPE_REGISTER 0x0E 1539#define IXGBE_PCI_HEADER_TYPE_REGISTER 0x0E
1533#define IXGBE_PCI_HEADER_TYPE_MULTIFUNC 0x80 1540#define IXGBE_PCI_HEADER_TYPE_MULTIFUNC 0x80
1541#define IXGBE_PCI_DEVICE_CONTROL2_16ms 0x0005
1534 1542
1535/* Number of 100 microseconds we wait for PCI Express master disable */ 1543/* Number of 100 microseconds we wait for PCI Express master disable */
1536#define IXGBE_PCI_MASTER_DISABLE_TIMEOUT 800 1544#define IXGBE_PCI_MASTER_DISABLE_TIMEOUT 800
diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c
index 08c43f2ae72b..5a88b3f57693 100644
--- a/drivers/net/mlx4/en_tx.c
+++ b/drivers/net/mlx4/en_tx.c
@@ -249,6 +249,7 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv,
249 pci_unmap_page(mdev->pdev, 249 pci_unmap_page(mdev->pdev,
250 (dma_addr_t) be64_to_cpu(data->addr), 250 (dma_addr_t) be64_to_cpu(data->addr),
251 frag->size, PCI_DMA_TODEVICE); 251 frag->size, PCI_DMA_TODEVICE);
252 ++data;
252 } 253 }
253 } 254 }
254 /* Stamp the freed descriptor */ 255 /* Stamp the freed descriptor */
diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c
index 637ac8b89bac..3cd8cfcf627b 100644
--- a/drivers/net/netxen/netxen_nic_main.c
+++ b/drivers/net/netxen/netxen_nic_main.c
@@ -221,7 +221,7 @@ netxen_napi_disable(struct netxen_adapter *adapter)
221 } 221 }
222} 222}
223 223
224static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id) 224static int nx_set_dma_mask(struct netxen_adapter *adapter)
225{ 225{
226 struct pci_dev *pdev = adapter->pdev; 226 struct pci_dev *pdev = adapter->pdev;
227 uint64_t mask, cmask; 227 uint64_t mask, cmask;
@@ -229,19 +229,17 @@ static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id)
229 adapter->pci_using_dac = 0; 229 adapter->pci_using_dac = 0;
230 230
231 mask = DMA_BIT_MASK(32); 231 mask = DMA_BIT_MASK(32);
232 /*
233 * Consistent DMA mask is set to 32 bit because it cannot be set to
234 * 35 bits. For P3 also leave it at 32 bits for now. Only the rings
235 * come off this pool.
236 */
237 cmask = DMA_BIT_MASK(32); 232 cmask = DMA_BIT_MASK(32);
238 233
234 if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) {
239#ifndef CONFIG_IA64 235#ifndef CONFIG_IA64
240 if (revision_id >= NX_P3_B0)
241 mask = DMA_BIT_MASK(39);
242 else if (revision_id == NX_P2_C1)
243 mask = DMA_BIT_MASK(35); 236 mask = DMA_BIT_MASK(35);
244#endif 237#endif
238 } else {
239 mask = DMA_BIT_MASK(39);
240 cmask = mask;
241 }
242
245 if (pci_set_dma_mask(pdev, mask) == 0 && 243 if (pci_set_dma_mask(pdev, mask) == 0 &&
246 pci_set_consistent_dma_mask(pdev, cmask) == 0) { 244 pci_set_consistent_dma_mask(pdev, cmask) == 0) {
247 adapter->pci_using_dac = 1; 245 adapter->pci_using_dac = 1;
@@ -256,7 +254,7 @@ static int
256nx_update_dma_mask(struct netxen_adapter *adapter) 254nx_update_dma_mask(struct netxen_adapter *adapter)
257{ 255{
258 int change, shift, err; 256 int change, shift, err;
259 uint64_t mask, old_mask; 257 uint64_t mask, old_mask, old_cmask;
260 struct pci_dev *pdev = adapter->pdev; 258 struct pci_dev *pdev = adapter->pdev;
261 259
262 change = 0; 260 change = 0;
@@ -272,14 +270,29 @@ nx_update_dma_mask(struct netxen_adapter *adapter)
272 270
273 if (change) { 271 if (change) {
274 old_mask = pdev->dma_mask; 272 old_mask = pdev->dma_mask;
273 old_cmask = pdev->dev.coherent_dma_mask;
274
275 mask = (1ULL<<(32+shift)) - 1; 275 mask = (1ULL<<(32+shift)) - 1;
276 276
277 err = pci_set_dma_mask(pdev, mask); 277 err = pci_set_dma_mask(pdev, mask);
278 if (err) 278 if (err)
279 return pci_set_dma_mask(pdev, old_mask); 279 goto err_out;
280
281 if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) {
282
283 err = pci_set_consistent_dma_mask(pdev, mask);
284 if (err)
285 goto err_out;
286 }
287 dev_info(&pdev->dev, "using %d-bit dma mask\n", 32+shift);
280 } 288 }
281 289
282 return 0; 290 return 0;
291
292err_out:
293 pci_set_dma_mask(pdev, old_mask);
294 pci_set_consistent_dma_mask(pdev, old_cmask);
295 return err;
283} 296}
284 297
285static void netxen_check_options(struct netxen_adapter *adapter) 298static void netxen_check_options(struct netxen_adapter *adapter)
@@ -1006,7 +1019,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
1006 revision_id = pdev->revision; 1019 revision_id = pdev->revision;
1007 adapter->ahw.revision_id = revision_id; 1020 adapter->ahw.revision_id = revision_id;
1008 1021
1009 err = nx_set_dma_mask(adapter, revision_id); 1022 err = nx_set_dma_mask(adapter);
1010 if (err) 1023 if (err)
1011 goto err_out_free_netdev; 1024 goto err_out_free_netdev;
1012 1025
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index 28368157dac4..a646a445fda9 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -1611,8 +1611,11 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1611 if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 1611 if (pcnet32_dwio_read_csr(ioaddr, 0) == 4
1612 && pcnet32_dwio_check(ioaddr)) { 1612 && pcnet32_dwio_check(ioaddr)) {
1613 a = &pcnet32_dwio; 1613 a = &pcnet32_dwio;
1614 } else 1614 } else {
1615 if (pcnet32_debug & NETIF_MSG_PROBE)
1616 printk(KERN_ERR PFX "No access methods\n");
1615 goto err_release_region; 1617 goto err_release_region;
1618 }
1616 } 1619 }
1617 1620
1618 chip_version = 1621 chip_version =
@@ -1719,7 +1722,9 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1719 ret = -ENOMEM; 1722 ret = -ENOMEM;
1720 goto err_release_region; 1723 goto err_release_region;
1721 } 1724 }
1722 SET_NETDEV_DEV(dev, &pdev->dev); 1725
1726 if (pdev)
1727 SET_NETDEV_DEV(dev, &pdev->dev);
1723 1728
1724 if (pcnet32_debug & NETIF_MSG_PROBE) 1729 if (pcnet32_debug & NETIF_MSG_PROBE)
1725 printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); 1730 printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr);
@@ -1818,7 +1823,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1818 1823
1819 spin_lock_init(&lp->lock); 1824 spin_lock_init(&lp->lock);
1820 1825
1821 SET_NETDEV_DEV(dev, &pdev->dev);
1822 lp->name = chipname; 1826 lp->name = chipname;
1823 lp->shared_irq = shared; 1827 lp->shared_irq = shared;
1824 lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ 1828 lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */
@@ -1852,12 +1856,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1852 ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) 1856 ((cards_found >= MAX_UNITS) || full_duplex[cards_found]))
1853 lp->options |= PCNET32_PORT_FD; 1857 lp->options |= PCNET32_PORT_FD;
1854 1858
1855 if (!a) {
1856 if (pcnet32_debug & NETIF_MSG_PROBE)
1857 printk(KERN_ERR PFX "No access methods\n");
1858 ret = -ENODEV;
1859 goto err_free_consistent;
1860 }
1861 lp->a = *a; 1859 lp->a = *a;
1862 1860
1863 /* prior to register_netdev, dev->name is not yet correct */ 1861 /* prior to register_netdev, dev->name is not yet correct */
@@ -1973,14 +1971,13 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev)
1973 1971
1974 return 0; 1972 return 0;
1975 1973
1976 err_free_ring: 1974err_free_ring:
1977 pcnet32_free_ring(dev); 1975 pcnet32_free_ring(dev);
1978 err_free_consistent:
1979 pci_free_consistent(lp->pci_dev, sizeof(*lp->init_block), 1976 pci_free_consistent(lp->pci_dev, sizeof(*lp->init_block),
1980 lp->init_block, lp->init_dma_addr); 1977 lp->init_block, lp->init_dma_addr);
1981 err_free_netdev: 1978err_free_netdev:
1982 free_netdev(dev); 1979 free_netdev(dev);
1983 err_release_region: 1980err_release_region:
1984 release_region(ioaddr, PCNET32_TOTAL_SIZE); 1981 release_region(ioaddr, PCNET32_TOTAL_SIZE);
1985 return ret; 1982 return ret;
1986} 1983}
@@ -2089,6 +2086,7 @@ static void pcnet32_free_ring(struct net_device *dev)
2089static int pcnet32_open(struct net_device *dev) 2086static int pcnet32_open(struct net_device *dev)
2090{ 2087{
2091 struct pcnet32_private *lp = netdev_priv(dev); 2088 struct pcnet32_private *lp = netdev_priv(dev);
2089 struct pci_dev *pdev = lp->pci_dev;
2092 unsigned long ioaddr = dev->base_addr; 2090 unsigned long ioaddr = dev->base_addr;
2093 u16 val; 2091 u16 val;
2094 int i; 2092 int i;
@@ -2149,9 +2147,9 @@ static int pcnet32_open(struct net_device *dev)
2149 lp->a.write_csr(ioaddr, 124, val); 2147 lp->a.write_csr(ioaddr, 124, val);
2150 2148
2151 /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ 2149 /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */
2152 if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && 2150 if (pdev && pdev->subsystem_vendor == PCI_VENDOR_ID_AT &&
2153 (lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || 2151 (pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX ||
2154 lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { 2152 pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) {
2155 if (lp->options & PCNET32_PORT_ASEL) { 2153 if (lp->options & PCNET32_PORT_ASEL) {
2156 lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; 2154 lp->options = PCNET32_PORT_FD | PCNET32_PORT_100;
2157 if (netif_msg_link(lp)) 2155 if (netif_msg_link(lp))
diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c
index 639d11bc444e..cd37d739ac74 100644
--- a/drivers/net/ppp_generic.c
+++ b/drivers/net/ppp_generic.c
@@ -1384,7 +1384,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
1384 1384
1385 /* create a fragment for each channel */ 1385 /* create a fragment for each channel */
1386 bits = B; 1386 bits = B;
1387 while (nfree > 0 && len > 0) { 1387 while (len > 0) {
1388 list = list->next; 1388 list = list->next;
1389 if (list == &ppp->channels) { 1389 if (list == &ppp->channels) {
1390 i = 0; 1390 i = 0;
@@ -1431,29 +1431,31 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
1431 *otherwise divide it according to the speed 1431 *otherwise divide it according to the speed
1432 *of the channel we are going to transmit on 1432 *of the channel we are going to transmit on
1433 */ 1433 */
1434 if (pch->speed == 0) { 1434 if (nfree > 0) {
1435 flen = totlen/nfree ; 1435 if (pch->speed == 0) {
1436 if (nbigger > 0) { 1436 flen = totlen/nfree ;
1437 flen++; 1437 if (nbigger > 0) {
1438 nbigger--; 1438 flen++;
1439 } 1439 nbigger--;
1440 } else { 1440 }
1441 flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / 1441 } else {
1442 ((totspeed*totfree)/pch->speed)) - hdrlen; 1442 flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) /
1443 if (nbigger > 0) { 1443 ((totspeed*totfree)/pch->speed)) - hdrlen;
1444 flen += ((totfree - nzero)*pch->speed)/totspeed; 1444 if (nbigger > 0) {
1445 nbigger -= ((totfree - nzero)*pch->speed)/ 1445 flen += ((totfree - nzero)*pch->speed)/totspeed;
1446 nbigger -= ((totfree - nzero)*pch->speed)/
1446 totspeed; 1447 totspeed;
1448 }
1447 } 1449 }
1450 nfree--;
1448 } 1451 }
1449 nfree--;
1450 1452
1451 /* 1453 /*
1452 *check if we are on the last channel or 1454 *check if we are on the last channel or
1453 *we exceded the lenght of the data to 1455 *we exceded the lenght of the data to
1454 *fragment 1456 *fragment
1455 */ 1457 */
1456 if ((nfree == 0) || (flen > len)) 1458 if ((nfree <= 0) || (flen > len))
1457 flen = len; 1459 flen = len;
1458 /* 1460 /*
1459 *it is not worth to tx on slow channels: 1461 *it is not worth to tx on slow channels:
@@ -1467,7 +1469,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb)
1467 continue; 1469 continue;
1468 } 1470 }
1469 1471
1470 mtu = pch->chan->mtu + 2 - hdrlen; 1472 mtu = pch->chan->mtu - hdrlen;
1471 if (mtu < 4) 1473 if (mtu < 4)
1472 mtu = 4; 1474 mtu = 4;
1473 if (flen > mtu) 1475 if (flen > mtu)
diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c
index f0031f1f97e5..5f2090233d7b 100644
--- a/drivers/net/pppoe.c
+++ b/drivers/net/pppoe.c
@@ -1063,6 +1063,7 @@ static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1063 else { 1063 else {
1064 int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); 1064 int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote);
1065 1065
1066 po = NULL;
1066 while (++hash < PPPOE_HASH_SIZE) { 1067 while (++hash < PPPOE_HASH_SIZE) {
1067 po = pn->hash_table[hash]; 1068 po = pn->hash_table[hash];
1068 if (po) 1069 if (po)
diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c
index e7935d09c896..e0f9219a0aea 100644
--- a/drivers/net/pppol2tp.c
+++ b/drivers/net/pppol2tp.c
@@ -2680,6 +2680,7 @@ out_unregister_pppol2tp_proto:
2680static void __exit pppol2tp_exit(void) 2680static void __exit pppol2tp_exit(void)
2681{ 2681{
2682 unregister_pppox_proto(PX_PROTO_OL2TP); 2682 unregister_pppox_proto(PX_PROTO_OL2TP);
2683 unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops);
2683 proto_unregister(&pppol2tp_sk_proto); 2684 proto_unregister(&pppol2tp_sk_proto);
2684} 2685}
2685 2686
diff --git a/drivers/net/s6gmac.c b/drivers/net/s6gmac.c
index 5345e47b35ac..4525cbe8dd69 100644
--- a/drivers/net/s6gmac.c
+++ b/drivers/net/s6gmac.c
@@ -793,7 +793,7 @@ static inline int s6gmac_phy_start(struct net_device *dev)
793 struct s6gmac *pd = netdev_priv(dev); 793 struct s6gmac *pd = netdev_priv(dev);
794 int i = 0; 794 int i = 0;
795 struct phy_device *p = NULL; 795 struct phy_device *p = NULL;
796 while ((!(p = pd->mii.bus->phy_map[i])) && (i < PHY_MAX_ADDR)) 796 while ((i < PHY_MAX_ADDR) && (!(p = pd->mii.bus->phy_map[i])))
797 i++; 797 i++;
798 p = phy_connect(dev, dev_name(&p->dev), &s6gmac_adjust_link, 0, 798 p = phy_connect(dev, dev_name(&p->dev), &s6gmac_adjust_link, 0,
799 PHY_INTERFACE_MODE_RGMII); 799 PHY_INTERFACE_MODE_RGMII);
diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c
index 3550c5dcd93c..0a551d8f5d95 100644
--- a/drivers/net/sky2.c
+++ b/drivers/net/sky2.c
@@ -1488,6 +1488,8 @@ static int sky2_up(struct net_device *dev)
1488 sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL); 1488 sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL);
1489#endif 1489#endif
1490 1490
1491 sky2->restarting = 0;
1492
1491 err = sky2_rx_start(sky2); 1493 err = sky2_rx_start(sky2);
1492 if (err) 1494 if (err)
1493 goto err_out; 1495 goto err_out;
@@ -1500,6 +1502,9 @@ static int sky2_up(struct net_device *dev)
1500 1502
1501 sky2_set_multicast(dev); 1503 sky2_set_multicast(dev);
1502 1504
1505 /* wake queue incase we are restarting */
1506 netif_wake_queue(dev);
1507
1503 if (netif_msg_ifup(sky2)) 1508 if (netif_msg_ifup(sky2))
1504 printk(KERN_INFO PFX "%s: enabling interface\n", dev->name); 1509 printk(KERN_INFO PFX "%s: enabling interface\n", dev->name);
1505 return 0; 1510 return 0;
@@ -1533,6 +1538,8 @@ static inline int tx_dist(unsigned tail, unsigned head)
1533/* Number of list elements available for next tx */ 1538/* Number of list elements available for next tx */
1534static inline int tx_avail(const struct sky2_port *sky2) 1539static inline int tx_avail(const struct sky2_port *sky2)
1535{ 1540{
1541 if (unlikely(sky2->restarting))
1542 return 0;
1536 return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod); 1543 return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod);
1537} 1544}
1538 1545
@@ -1818,6 +1825,10 @@ static int sky2_down(struct net_device *dev)
1818 if (netif_msg_ifdown(sky2)) 1825 if (netif_msg_ifdown(sky2))
1819 printk(KERN_INFO PFX "%s: disabling interface\n", dev->name); 1826 printk(KERN_INFO PFX "%s: disabling interface\n", dev->name);
1820 1827
1828 /* explicitly shut off tx incase we're restarting */
1829 sky2->restarting = 1;
1830 netif_tx_disable(dev);
1831
1821 /* Force flow control off */ 1832 /* Force flow control off */
1822 sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF); 1833 sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF);
1823 1834
@@ -2359,7 +2370,7 @@ static inline void sky2_tx_done(struct net_device *dev, u16 last)
2359{ 2370{
2360 struct sky2_port *sky2 = netdev_priv(dev); 2371 struct sky2_port *sky2 = netdev_priv(dev);
2361 2372
2362 if (netif_running(dev)) { 2373 if (likely(netif_running(dev) && !sky2->restarting)) {
2363 netif_tx_lock(dev); 2374 netif_tx_lock(dev);
2364 sky2_tx_complete(sky2, last); 2375 sky2_tx_complete(sky2, last);
2365 netif_tx_unlock(dev); 2376 netif_tx_unlock(dev);
@@ -4283,6 +4294,7 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw,
4283 spin_lock_init(&sky2->phy_lock); 4294 spin_lock_init(&sky2->phy_lock);
4284 sky2->tx_pending = TX_DEF_PENDING; 4295 sky2->tx_pending = TX_DEF_PENDING;
4285 sky2->rx_pending = RX_DEF_PENDING; 4296 sky2->rx_pending = RX_DEF_PENDING;
4297 sky2->restarting = 0;
4286 4298
4287 hw->dev[port] = dev; 4299 hw->dev[port] = dev;
4288 4300
diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h
index b5549c9e5107..4486b066b43f 100644
--- a/drivers/net/sky2.h
+++ b/drivers/net/sky2.h
@@ -2051,6 +2051,7 @@ struct sky2_port {
2051 u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */ 2051 u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */
2052 u8 rx_csum; 2052 u8 rx_csum;
2053 u8 wol; 2053 u8 wol;
2054 u8 restarting;
2054 enum flow_control flow_mode; 2055 enum flow_control flow_mode;
2055 enum flow_control flow_status; 2056 enum flow_control flow_status;
2056 2057
diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c
index eb72d2e9ab3d..acfdccd44567 100644
--- a/drivers/net/tulip/de4x5.c
+++ b/drivers/net/tulip/de4x5.c
@@ -5059,7 +5059,7 @@ mii_get_phy(struct net_device *dev)
5059 if ((id == 0) || (id == 65535)) continue; /* Valid ID? */ 5059 if ((id == 0) || (id == 65535)) continue; /* Valid ID? */
5060 for (j=0; j<limit; j++) { /* Search PHY table */ 5060 for (j=0; j<limit; j++) { /* Search PHY table */
5061 if (id != phy_info[j].id) continue; /* ID match? */ 5061 if (id != phy_info[j].id) continue; /* ID match? */
5062 for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++); 5062 for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++);
5063 if (k < DE4X5_MAX_PHY) { 5063 if (k < DE4X5_MAX_PHY) {
5064 memcpy((char *)&lp->phy[k], 5064 memcpy((char *)&lp->phy[k],
5065 (char *)&phy_info[j], sizeof(struct phy_table)); 5065 (char *)&phy_info[j], sizeof(struct phy_table));
@@ -5072,7 +5072,7 @@ mii_get_phy(struct net_device *dev)
5072 break; 5072 break;
5073 } 5073 }
5074 if ((j == limit) && (i < DE4X5_MAX_MII)) { 5074 if ((j == limit) && (i < DE4X5_MAX_MII)) {
5075 for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++); 5075 for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++);
5076 lp->phy[k].addr = i; 5076 lp->phy[k].addr = i;
5077 lp->phy[k].id = id; 5077 lp->phy[k].id = id;
5078 lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ 5078 lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */
@@ -5091,7 +5091,7 @@ mii_get_phy(struct net_device *dev)
5091 purgatory: 5091 purgatory:
5092 lp->active = 0; 5092 lp->active = 0;
5093 if (lp->phy[0].id) { /* Reset the PHY devices */ 5093 if (lp->phy[0].id) { /* Reset the PHY devices */
5094 for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++) { /*For each PHY*/ 5094 for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++) { /*For each PHY*/
5095 mii_wr(MII_CR_RST, MII_CR, lp->phy[k].addr, DE4X5_MII); 5095 mii_wr(MII_CR_RST, MII_CR, lp->phy[k].addr, DE4X5_MII);
5096 while (mii_rd(MII_CR, lp->phy[k].addr, DE4X5_MII) & MII_CR_RST); 5096 while (mii_rd(MII_CR, lp->phy[k].addr, DE4X5_MII) & MII_CR_RST);
5097 5097
diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c
index c70604f0329e..8ce5e4cee168 100644
--- a/drivers/net/wireless/airo.c
+++ b/drivers/net/wireless/airo.c
@@ -5918,20 +5918,19 @@ static int airo_set_essid(struct net_device *dev,
5918 readSsidRid(local, &SSID_rid); 5918 readSsidRid(local, &SSID_rid);
5919 5919
5920 /* Check if we asked for `any' */ 5920 /* Check if we asked for `any' */
5921 if(dwrq->flags == 0) { 5921 if (dwrq->flags == 0) {
5922 /* Just send an empty SSID list */ 5922 /* Just send an empty SSID list */
5923 memset(&SSID_rid, 0, sizeof(SSID_rid)); 5923 memset(&SSID_rid, 0, sizeof(SSID_rid));
5924 } else { 5924 } else {
5925 int index = (dwrq->flags & IW_ENCODE_INDEX) - 1; 5925 unsigned index = (dwrq->flags & IW_ENCODE_INDEX) - 1;
5926 5926
5927 /* Check the size of the string */ 5927 /* Check the size of the string */
5928 if(dwrq->length > IW_ESSID_MAX_SIZE) { 5928 if (dwrq->length > IW_ESSID_MAX_SIZE)
5929 return -E2BIG ; 5929 return -E2BIG ;
5930 } 5930
5931 /* Check if index is valid */ 5931 /* Check if index is valid */
5932 if((index < 0) || (index >= 4)) { 5932 if (index >= ARRAY_SIZE(SSID_rid.ssids))
5933 return -EINVAL; 5933 return -EINVAL;
5934 }
5935 5934
5936 /* Set the SSID */ 5935 /* Set the SSID */
5937 memset(SSID_rid.ssids[index].ssid, 0, 5936 memset(SSID_rid.ssids[index].ssid, 0,
@@ -6819,7 +6818,7 @@ static int airo_set_txpow(struct net_device *dev,
6819 return -EINVAL; 6818 return -EINVAL;
6820 } 6819 }
6821 clear_bit (FLAG_RADIO_OFF, &local->flags); 6820 clear_bit (FLAG_RADIO_OFF, &local->flags);
6822 for (i = 0; cap_rid.txPowerLevels[i] && (i < 8); i++) 6821 for (i = 0; i < 8 && cap_rid.txPowerLevels[i]; i++)
6823 if (v == cap_rid.txPowerLevels[i]) { 6822 if (v == cap_rid.txPowerLevels[i]) {
6824 readConfigRid(local, 1); 6823 readConfigRid(local, 1);
6825 local->config.txPower = v; 6824 local->config.txPower = v;
diff --git a/drivers/net/wireless/ath/ath9k/eeprom.c b/drivers/net/wireless/ath/ath9k/eeprom.c
index a2fda702b620..ce0e86c36a82 100644
--- a/drivers/net/wireless/ath/ath9k/eeprom.c
+++ b/drivers/net/wireless/ath/ath9k/eeprom.c
@@ -460,7 +460,7 @@ static int ath9k_hw_4k_check_eeprom(struct ath_hw *ah)
460 integer = swab32(eep->modalHeader.antCtrlCommon); 460 integer = swab32(eep->modalHeader.antCtrlCommon);
461 eep->modalHeader.antCtrlCommon = integer; 461 eep->modalHeader.antCtrlCommon = integer;
462 462
463 for (i = 0; i < AR5416_MAX_CHAINS; i++) { 463 for (i = 0; i < AR5416_EEP4K_MAX_CHAINS; i++) {
464 integer = swab32(eep->modalHeader.antCtrlChain[i]); 464 integer = swab32(eep->modalHeader.antCtrlChain[i]);
465 eep->modalHeader.antCtrlChain[i] = integer; 465 eep->modalHeader.antCtrlChain[i] = integer;
466 } 466 }
@@ -914,7 +914,7 @@ static void ath9k_hw_set_4k_power_per_rate_table(struct ath_hw *ah,
914 ctlMode, numCtlModes, isHt40CtlMode, 914 ctlMode, numCtlModes, isHt40CtlMode,
915 (pCtlMode[ctlMode] & EXT_ADDITIVE)); 915 (pCtlMode[ctlMode] & EXT_ADDITIVE));
916 916
917 for (i = 0; (i < AR5416_NUM_CTLS) && 917 for (i = 0; (i < AR5416_EEP4K_NUM_CTLS) &&
918 pEepData->ctlIndex[i]; i++) { 918 pEepData->ctlIndex[i]; i++) {
919 DPRINTF(ah->ah_sc, ATH_DBG_EEPROM, 919 DPRINTF(ah->ah_sc, ATH_DBG_EEPROM,
920 " LOOP-Ctlidx %d: cfgCtl 0x%2.2x " 920 " LOOP-Ctlidx %d: cfgCtl 0x%2.2x "
diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.h b/drivers/net/wireless/iwlwifi/iwl-3945.h
index fbb3a573463e..2de6471d4be9 100644
--- a/drivers/net/wireless/iwlwifi/iwl-3945.h
+++ b/drivers/net/wireless/iwlwifi/iwl-3945.h
@@ -112,7 +112,7 @@ enum iwl3945_antenna {
112#define IWL_TX_FIFO_NONE 7 112#define IWL_TX_FIFO_NONE 7
113 113
114/* Minimum number of queues. MAX_NUM is defined in hw specific files */ 114/* Minimum number of queues. MAX_NUM is defined in hw specific files */
115#define IWL_MIN_NUM_QUEUES 4 115#define IWL39_MIN_NUM_QUEUES 4
116 116
117#define IEEE80211_DATA_LEN 2304 117#define IEEE80211_DATA_LEN 2304
118#define IEEE80211_4ADDR_LEN 30 118#define IEEE80211_4ADDR_LEN 30
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c
index 6ab07165ea28..18b135f510e5 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.c
+++ b/drivers/net/wireless/iwlwifi/iwl-core.c
@@ -1332,6 +1332,9 @@ int iwl_setup_mac(struct iwl_priv *priv)
1332 1332
1333 hw->wiphy->custom_regulatory = true; 1333 hw->wiphy->custom_regulatory = true;
1334 1334
1335 /* Firmware does not support this */
1336 hw->wiphy->disable_beacon_hints = true;
1337
1335 hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX; 1338 hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX;
1336 /* we create the 802.11 header and a zero-length SSID element */ 1339 /* we create the 802.11 header and a zero-length SSID element */
1337 hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; 1340 hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2;
diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c
index 11e08c068917..ca00cc8ad4c7 100644
--- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c
+++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c
@@ -308,18 +308,18 @@ static ssize_t iwl_dbgfs_nvm_read(struct file *file,
308 return -ENODATA; 308 return -ENODATA;
309 } 309 }
310 310
311 ptr = priv->eeprom;
312 if (!ptr) {
313 IWL_ERR(priv, "Invalid EEPROM/OTP memory\n");
314 return -ENOMEM;
315 }
316
311 /* 4 characters for byte 0xYY */ 317 /* 4 characters for byte 0xYY */
312 buf = kzalloc(buf_size, GFP_KERNEL); 318 buf = kzalloc(buf_size, GFP_KERNEL);
313 if (!buf) { 319 if (!buf) {
314 IWL_ERR(priv, "Can not allocate Buffer\n"); 320 IWL_ERR(priv, "Can not allocate Buffer\n");
315 return -ENOMEM; 321 return -ENOMEM;
316 } 322 }
317
318 ptr = priv->eeprom;
319 if (!ptr) {
320 IWL_ERR(priv, "Invalid EEPROM/OTP memory\n");
321 return -ENOMEM;
322 }
323 pos += scnprintf(buf + pos, buf_size - pos, "NVM Type: %s\n", 323 pos += scnprintf(buf + pos, buf_size - pos, "NVM Type: %s\n",
324 (priv->nvm_device_type == NVM_DEVICE_TYPE_OTP) 324 (priv->nvm_device_type == NVM_DEVICE_TYPE_OTP)
325 ? "OTP" : "EEPROM"); 325 ? "OTP" : "EEPROM");
diff --git a/drivers/net/wireless/iwlwifi/iwl-dev.h b/drivers/net/wireless/iwlwifi/iwl-dev.h
index e2d620f0b6e8..650e20af20fa 100644
--- a/drivers/net/wireless/iwlwifi/iwl-dev.h
+++ b/drivers/net/wireless/iwlwifi/iwl-dev.h
@@ -258,8 +258,10 @@ struct iwl_channel_info {
258#define IWL_TX_FIFO_HCCA_2 6 258#define IWL_TX_FIFO_HCCA_2 6
259#define IWL_TX_FIFO_NONE 7 259#define IWL_TX_FIFO_NONE 7
260 260
261/* Minimum number of queues. MAX_NUM is defined in hw specific files */ 261/* Minimum number of queues. MAX_NUM is defined in hw specific files.
262#define IWL_MIN_NUM_QUEUES 4 262 * Set the minimum to accommodate the 4 standard TX queues, 1 command
263 * queue, 2 (unused) HCCA queues, and 4 HT queues (one for each AC) */
264#define IWL_MIN_NUM_QUEUES 10
263 265
264/* Power management (not Tx power) structures */ 266/* Power management (not Tx power) structures */
265 267
diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c
index 2addf735b193..ffd5c61a7553 100644
--- a/drivers/net/wireless/iwlwifi/iwl-sta.c
+++ b/drivers/net/wireless/iwlwifi/iwl-sta.c
@@ -566,6 +566,8 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv,
566 unsigned long flags; 566 unsigned long flags;
567 567
568 spin_lock_irqsave(&priv->sta_lock, flags); 568 spin_lock_irqsave(&priv->sta_lock, flags);
569 IWL_DEBUG_WEP(priv, "Removing default WEP key: idx=%d\n",
570 keyconf->keyidx);
569 571
570 if (!test_and_clear_bit(keyconf->keyidx, &priv->ucode_key_table)) 572 if (!test_and_clear_bit(keyconf->keyidx, &priv->ucode_key_table))
571 IWL_ERR(priv, "index %d not used in uCode key table.\n", 573 IWL_ERR(priv, "index %d not used in uCode key table.\n",
@@ -573,6 +575,11 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv,
573 575
574 priv->default_wep_key--; 576 priv->default_wep_key--;
575 memset(&priv->wep_keys[keyconf->keyidx], 0, sizeof(priv->wep_keys[0])); 577 memset(&priv->wep_keys[keyconf->keyidx], 0, sizeof(priv->wep_keys[0]));
578 if (iwl_is_rfkill(priv)) {
579 IWL_DEBUG_WEP(priv, "Not sending REPLY_WEPKEY command due to RFKILL.\n");
580 spin_unlock_irqrestore(&priv->sta_lock, flags);
581 return 0;
582 }
576 ret = iwl_send_static_wepkey_cmd(priv, 1); 583 ret = iwl_send_static_wepkey_cmd(priv, 1);
577 IWL_DEBUG_WEP(priv, "Remove default WEP key: idx=%d ret=%d\n", 584 IWL_DEBUG_WEP(priv, "Remove default WEP key: idx=%d ret=%d\n",
578 keyconf->keyidx, ret); 585 keyconf->keyidx, ret);
@@ -853,6 +860,11 @@ int iwl_remove_dynamic_key(struct iwl_priv *priv,
853 priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK; 860 priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK;
854 priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK; 861 priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK;
855 862
863 if (iwl_is_rfkill(priv)) {
864 IWL_DEBUG_WEP(priv, "Not sending REPLY_ADD_STA command because RFKILL enabled. \n");
865 spin_unlock_irqrestore(&priv->sta_lock, flags);
866 return 0;
867 }
856 ret = iwl_send_add_sta(priv, &priv->stations[sta_id].sta, CMD_ASYNC); 868 ret = iwl_send_add_sta(priv, &priv->stations[sta_id].sta, CMD_ASYNC);
857 spin_unlock_irqrestore(&priv->sta_lock, flags); 869 spin_unlock_irqrestore(&priv->sta_lock, flags);
858 return ret; 870 return ret;
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c
index 9bbeec9427f0..2e89040e63be 100644
--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
@@ -720,8 +720,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb)
720 goto drop_unlock; 720 goto drop_unlock;
721 } 721 }
722 722
723 spin_unlock_irqrestore(&priv->lock, flags);
724
725 hdr_len = ieee80211_hdrlen(fc); 723 hdr_len = ieee80211_hdrlen(fc);
726 724
727 /* Find (or create) index into station table for destination station */ 725 /* Find (or create) index into station table for destination station */
@@ -729,7 +727,7 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb)
729 if (sta_id == IWL_INVALID_STATION) { 727 if (sta_id == IWL_INVALID_STATION) {
730 IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", 728 IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n",
731 hdr->addr1); 729 hdr->addr1);
732 goto drop; 730 goto drop_unlock;
733 } 731 }
734 732
735 IWL_DEBUG_TX(priv, "station Id %d\n", sta_id); 733 IWL_DEBUG_TX(priv, "station Id %d\n", sta_id);
@@ -750,14 +748,17 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb)
750 txq_id = priv->stations[sta_id].tid[tid].agg.txq_id; 748 txq_id = priv->stations[sta_id].tid[tid].agg.txq_id;
751 swq_id = iwl_virtual_agg_queue_num(swq_id, txq_id); 749 swq_id = iwl_virtual_agg_queue_num(swq_id, txq_id);
752 } 750 }
753 priv->stations[sta_id].tid[tid].tfds_in_queue++;
754 } 751 }
755 752
756 txq = &priv->txq[txq_id]; 753 txq = &priv->txq[txq_id];
757 q = &txq->q; 754 q = &txq->q;
758 txq->swq_id = swq_id; 755 txq->swq_id = swq_id;
759 756
760 spin_lock_irqsave(&priv->lock, flags); 757 if (unlikely(iwl_queue_space(q) < q->high_mark))
758 goto drop_unlock;
759
760 if (ieee80211_is_data_qos(fc))
761 priv->stations[sta_id].tid[tid].tfds_in_queue++;
761 762
762 /* Set up driver data for this TFD */ 763 /* Set up driver data for this TFD */
763 memset(&(txq->txb[q->write_ptr]), 0, sizeof(struct iwl_tx_info)); 764 memset(&(txq->txb[q->write_ptr]), 0, sizeof(struct iwl_tx_info));
@@ -902,7 +903,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb)
902 903
903drop_unlock: 904drop_unlock:
904 spin_unlock_irqrestore(&priv->lock, flags); 905 spin_unlock_irqrestore(&priv->lock, flags);
905drop:
906 return -1; 906 return -1;
907} 907}
908EXPORT_SYMBOL(iwl_tx_skb); 908EXPORT_SYMBOL(iwl_tx_skb);
@@ -1171,6 +1171,8 @@ int iwl_tx_agg_start(struct iwl_priv *priv, const u8 *ra, u16 tid, u16 *ssn)
1171 IWL_ERR(priv, "Start AGG on invalid station\n"); 1171 IWL_ERR(priv, "Start AGG on invalid station\n");
1172 return -ENXIO; 1172 return -ENXIO;
1173 } 1173 }
1174 if (unlikely(tid >= MAX_TID_COUNT))
1175 return -EINVAL;
1174 1176
1175 if (priv->stations[sta_id].tid[tid].agg.state != IWL_AGG_OFF) { 1177 if (priv->stations[sta_id].tid[tid].agg.state != IWL_AGG_OFF) {
1176 IWL_ERR(priv, "Start AGG when state is not IWL_AGG_OFF !\n"); 1178 IWL_ERR(priv, "Start AGG when state is not IWL_AGG_OFF !\n");
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index 956798f2c80c..523843369ca2 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -3968,6 +3968,9 @@ static int iwl3945_setup_mac(struct iwl_priv *priv)
3968 3968
3969 hw->wiphy->custom_regulatory = true; 3969 hw->wiphy->custom_regulatory = true;
3970 3970
3971 /* Firmware does not support this */
3972 hw->wiphy->disable_beacon_hints = true;
3973
3971 hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX_3945; 3974 hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX_3945;
3972 /* we create the 802.11 header and a zero-length SSID element */ 3975 /* we create the 802.11 header and a zero-length SSID element */
3973 hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; 3976 hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2;
@@ -4018,10 +4021,10 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e
4018 SET_IEEE80211_DEV(hw, &pdev->dev); 4021 SET_IEEE80211_DEV(hw, &pdev->dev);
4019 4022
4020 if ((iwl3945_mod_params.num_of_queues > IWL39_MAX_NUM_QUEUES) || 4023 if ((iwl3945_mod_params.num_of_queues > IWL39_MAX_NUM_QUEUES) ||
4021 (iwl3945_mod_params.num_of_queues < IWL_MIN_NUM_QUEUES)) { 4024 (iwl3945_mod_params.num_of_queues < IWL39_MIN_NUM_QUEUES)) {
4022 IWL_ERR(priv, 4025 IWL_ERR(priv,
4023 "invalid queues_num, should be between %d and %d\n", 4026 "invalid queues_num, should be between %d and %d\n",
4024 IWL_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES); 4027 IWL39_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES);
4025 err = -EINVAL; 4028 err = -EINVAL;
4026 goto out_ieee80211_free_hw; 4029 goto out_ieee80211_free_hw;
4027 } 4030 }
diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c
index 834a7f544e5d..e2334d123599 100644
--- a/drivers/net/wireless/iwmc3200wifi/commands.c
+++ b/drivers/net/wireless/iwmc3200wifi/commands.c
@@ -220,6 +220,7 @@ int iwm_store_rxiq_calib_result(struct iwm_priv *iwm)
220 eeprom_rxiq = iwm_eeprom_access(iwm, IWM_EEPROM_CALIB_RXIQ); 220 eeprom_rxiq = iwm_eeprom_access(iwm, IWM_EEPROM_CALIB_RXIQ);
221 if (IS_ERR(eeprom_rxiq)) { 221 if (IS_ERR(eeprom_rxiq)) {
222 IWM_ERR(iwm, "Couldn't access EEPROM RX IQ entry\n"); 222 IWM_ERR(iwm, "Couldn't access EEPROM RX IQ entry\n");
223 kfree(rxiq);
223 return PTR_ERR(eeprom_rxiq); 224 return PTR_ERR(eeprom_rxiq);
224 } 225 }
225 226
diff --git a/drivers/net/wireless/iwmc3200wifi/netdev.c b/drivers/net/wireless/iwmc3200wifi/netdev.c
index aea5ccf24ccf..bf294e41753b 100644
--- a/drivers/net/wireless/iwmc3200wifi/netdev.c
+++ b/drivers/net/wireless/iwmc3200wifi/netdev.c
@@ -106,10 +106,8 @@ void *iwm_if_alloc(int sizeof_bus, struct device *dev,
106 int ret = 0; 106 int ret = 0;
107 107
108 wdev = iwm_wdev_alloc(sizeof_bus, dev); 108 wdev = iwm_wdev_alloc(sizeof_bus, dev);
109 if (!wdev) { 109 if (IS_ERR(wdev))
110 dev_err(dev, "no memory for wireless device instance\n"); 110 return wdev;
111 return ERR_PTR(-ENOMEM);
112 }
113 111
114 iwm = wdev_to_iwm(wdev); 112 iwm = wdev_to_iwm(wdev);
115 iwm->bus_ops = if_ops; 113 iwm->bus_ops = if_ops;
diff --git a/drivers/net/wireless/libertas/11d.c b/drivers/net/wireless/libertas/11d.c
index 9a5408e7d94a..5c6968101f0d 100644
--- a/drivers/net/wireless/libertas/11d.c
+++ b/drivers/net/wireless/libertas/11d.c
@@ -47,7 +47,7 @@ static u8 lbs_region_2_code(u8 *region)
47{ 47{
48 u8 i; 48 u8 i;
49 49
50 for (i = 0; region[i] && i < COUNTRY_CODE_LEN; i++) 50 for (i = 0; i < COUNTRY_CODE_LEN && region[i]; i++)
51 region[i] = toupper(region[i]); 51 region[i] = toupper(region[i]);
52 52
53 for (i = 0; i < ARRAY_SIZE(region_code_mapping); i++) { 53 for (i = 0; i < ARRAY_SIZE(region_code_mapping); i++) {
diff --git a/drivers/net/wireless/libertas/assoc.c b/drivers/net/wireless/libertas/assoc.c
index b9b374119033..d6997371c27e 100644
--- a/drivers/net/wireless/libertas/assoc.c
+++ b/drivers/net/wireless/libertas/assoc.c
@@ -1,6 +1,7 @@
1/* Copyright (C) 2006, Red Hat, Inc. */ 1/* Copyright (C) 2006, Red Hat, Inc. */
2 2
3#include <linux/types.h> 3#include <linux/types.h>
4#include <linux/kernel.h>
4#include <linux/etherdevice.h> 5#include <linux/etherdevice.h>
5#include <linux/ieee80211.h> 6#include <linux/ieee80211.h>
6#include <linux/if_arp.h> 7#include <linux/if_arp.h>
@@ -43,21 +44,21 @@ static int get_common_rates(struct lbs_private *priv,
43 u16 *rates_size) 44 u16 *rates_size)
44{ 45{
45 u8 *card_rates = lbs_bg_rates; 46 u8 *card_rates = lbs_bg_rates;
46 size_t num_card_rates = sizeof(lbs_bg_rates);
47 int ret = 0, i, j; 47 int ret = 0, i, j;
48 u8 tmp[30]; 48 u8 tmp[(ARRAY_SIZE(lbs_bg_rates) - 1) * (*rates_size - 1)];
49 size_t tmp_size = 0; 49 size_t tmp_size = 0;
50 50
51 /* For each rate in card_rates that exists in rate1, copy to tmp */ 51 /* For each rate in card_rates that exists in rate1, copy to tmp */
52 for (i = 0; card_rates[i] && (i < num_card_rates); i++) { 52 for (i = 0; i < ARRAY_SIZE(lbs_bg_rates) && card_rates[i]; i++) {
53 for (j = 0; rates[j] && (j < *rates_size); j++) { 53 for (j = 0; j < *rates_size && rates[j]; j++) {
54 if (rates[j] == card_rates[i]) 54 if (rates[j] == card_rates[i])
55 tmp[tmp_size++] = card_rates[i]; 55 tmp[tmp_size++] = card_rates[i];
56 } 56 }
57 } 57 }
58 58
59 lbs_deb_hex(LBS_DEB_JOIN, "AP rates ", rates, *rates_size); 59 lbs_deb_hex(LBS_DEB_JOIN, "AP rates ", rates, *rates_size);
60 lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates, num_card_rates); 60 lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates,
61 ARRAY_SIZE(lbs_bg_rates));
61 lbs_deb_hex(LBS_DEB_JOIN, "common rates", tmp, tmp_size); 62 lbs_deb_hex(LBS_DEB_JOIN, "common rates", tmp, tmp_size);
62 lbs_deb_join("TX data rate 0x%02x\n", priv->cur_rate); 63 lbs_deb_join("TX data rate 0x%02x\n", priv->cur_rate);
63 64
@@ -69,10 +70,7 @@ static int get_common_rates(struct lbs_private *priv,
69 lbs_pr_alert("Previously set fixed data rate %#x isn't " 70 lbs_pr_alert("Previously set fixed data rate %#x isn't "
70 "compatible with the network.\n", priv->cur_rate); 71 "compatible with the network.\n", priv->cur_rate);
71 ret = -1; 72 ret = -1;
72 goto done;
73 } 73 }
74 ret = 0;
75
76done: 74done:
77 memset(rates, 0, *rates_size); 75 memset(rates, 0, *rates_size);
78 *rates_size = min_t(int, tmp_size, *rates_size); 76 *rates_size = min_t(int, tmp_size, *rates_size);
@@ -322,7 +320,7 @@ static int lbs_associate(struct lbs_private *priv,
322 rates = (struct mrvl_ie_rates_param_set *) pos; 320 rates = (struct mrvl_ie_rates_param_set *) pos;
323 rates->header.type = cpu_to_le16(TLV_TYPE_RATES); 321 rates->header.type = cpu_to_le16(TLV_TYPE_RATES);
324 memcpy(&rates->rates, &bss->rates, MAX_RATES); 322 memcpy(&rates->rates, &bss->rates, MAX_RATES);
325 tmplen = MAX_RATES; 323 tmplen = min_t(u16, ARRAY_SIZE(rates->rates), MAX_RATES);
326 if (get_common_rates(priv, rates->rates, &tmplen)) { 324 if (get_common_rates(priv, rates->rates, &tmplen)) {
327 ret = -1; 325 ret = -1;
328 goto done; 326 goto done;
@@ -598,7 +596,7 @@ static int lbs_adhoc_join(struct lbs_private *priv,
598 596
599 /* Copy Data rates from the rates recorded in scan response */ 597 /* Copy Data rates from the rates recorded in scan response */
600 memset(cmd.bss.rates, 0, sizeof(cmd.bss.rates)); 598 memset(cmd.bss.rates, 0, sizeof(cmd.bss.rates));
601 ratesize = min_t(u16, sizeof(cmd.bss.rates), MAX_RATES); 599 ratesize = min_t(u16, ARRAY_SIZE(cmd.bss.rates), MAX_RATES);
602 memcpy(cmd.bss.rates, bss->rates, ratesize); 600 memcpy(cmd.bss.rates, bss->rates, ratesize);
603 if (get_common_rates(priv, cmd.bss.rates, &ratesize)) { 601 if (get_common_rates(priv, cmd.bss.rates, &ratesize)) {
604 lbs_deb_join("ADHOC_JOIN: get_common_rates returned error.\n"); 602 lbs_deb_join("ADHOC_JOIN: get_common_rates returned error.\n");
diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c
index 601b54249677..6c95af3023cc 100644
--- a/drivers/net/wireless/libertas/scan.c
+++ b/drivers/net/wireless/libertas/scan.c
@@ -5,6 +5,7 @@
5 * for sending scan commands to the firmware. 5 * for sending scan commands to the firmware.
6 */ 6 */
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h>
8#include <linux/etherdevice.h> 9#include <linux/etherdevice.h>
9#include <linux/if_arp.h> 10#include <linux/if_arp.h>
10#include <asm/unaligned.h> 11#include <asm/unaligned.h>
@@ -876,7 +877,7 @@ static inline char *lbs_translate_scan(struct lbs_private *priv,
876 iwe.u.bitrate.disabled = 0; 877 iwe.u.bitrate.disabled = 0;
877 iwe.u.bitrate.value = 0; 878 iwe.u.bitrate.value = 0;
878 879
879 for (j = 0; bss->rates[j] && (j < sizeof(bss->rates)); j++) { 880 for (j = 0; j < ARRAY_SIZE(bss->rates) && bss->rates[j]; j++) {
880 /* Bit rate given in 500 kb/s units */ 881 /* Bit rate given in 500 kb/s units */
881 iwe.u.bitrate.value = bss->rates[j] * 500000; 882 iwe.u.bitrate.value = bss->rates[j] * 500000;
882 current_val = iwe_stream_add_value(info, start, current_val, 883 current_val = iwe_stream_add_value(info, start, current_val,
diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c
index 40b07b988224..3bd3c779fff3 100644
--- a/drivers/net/wireless/zd1211rw/zd_mac.c
+++ b/drivers/net/wireless/zd1211rw/zd_mac.c
@@ -698,7 +698,7 @@ int zd_mac_rx(struct ieee80211_hw *hw, const u8 *buffer, unsigned int length)
698 && !mac->pass_ctrl) 698 && !mac->pass_ctrl)
699 return 0; 699 return 0;
700 700
701 fc = *(__le16 *)buffer; 701 fc = get_unaligned((__le16*)buffer);
702 need_padding = ieee80211_is_data_qos(fc) ^ ieee80211_has_a4(fc); 702 need_padding = ieee80211_is_data_qos(fc) ^ ieee80211_has_a4(fc);
703 703
704 skb = dev_alloc_skb(length + (need_padding ? 2 : 0)); 704 skb = dev_alloc_skb(length + (need_padding ? 2 : 0));
diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index 0f0e0b919ef4..a45b0c0d574e 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -70,7 +70,6 @@
70#undef CCIO_COLLECT_STATS 70#undef CCIO_COLLECT_STATS
71#endif 71#endif
72 72
73#include <linux/proc_fs.h>
74#include <asm/runway.h> /* for proc_runway_root */ 73#include <asm/runway.h> /* for proc_runway_root */
75 74
76#ifdef DEBUG_CCIO_INIT 75#ifdef DEBUG_CCIO_INIT
diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c
index c590974e9815..d69bde6a2343 100644
--- a/drivers/parisc/dino.c
+++ b/drivers/parisc/dino.c
@@ -614,7 +614,7 @@ dino_fixup_bus(struct pci_bus *bus)
614 dev_name(&bus->self->dev), i, 614 dev_name(&bus->self->dev), i,
615 bus->self->resource[i].start, 615 bus->self->resource[i].start,
616 bus->self->resource[i].end); 616 bus->self->resource[i].end);
617 pci_assign_resource(bus->self, i); 617 WARN_ON(pci_assign_resource(bus->self, i));
618 DBG("DEBUG %s after assign %d [0x%lx,0x%lx]\n", 618 DBG("DEBUG %s after assign %d [0x%lx,0x%lx]\n",
619 dev_name(&bus->self->dev), i, 619 dev_name(&bus->self->dev), i,
620 bus->self->resource[i].start, 620 bus->self->resource[i].start,
diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c
index 685d94e69d44..8c0b26e9b98a 100644
--- a/drivers/parisc/eisa_eeprom.c
+++ b/drivers/parisc/eisa_eeprom.c
@@ -55,7 +55,7 @@ static ssize_t eisa_eeprom_read(struct file * file,
55 ssize_t ret; 55 ssize_t ret;
56 int i; 56 int i;
57 57
58 if (*ppos >= HPEE_MAX_LENGTH) 58 if (*ppos < 0 || *ppos >= HPEE_MAX_LENGTH)
59 return 0; 59 return 0;
60 60
61 count = *ppos + count < HPEE_MAX_LENGTH ? count : HPEE_MAX_LENGTH - *ppos; 61 count = *ppos + count < HPEE_MAX_LENGTH ? count : HPEE_MAX_LENGTH - *ppos;
diff --git a/drivers/parisc/hppb.c b/drivers/parisc/hppb.c
index 13856415b432..815db175d427 100644
--- a/drivers/parisc/hppb.c
+++ b/drivers/parisc/hppb.c
@@ -62,7 +62,8 @@ static int hppb_probe(struct parisc_device *dev)
62 } 62 }
63 card = card->next; 63 card = card->next;
64 } 64 }
65 printk(KERN_INFO "Found GeckoBoa at 0x%x\n", dev->hpa.start); 65 printk(KERN_INFO "Found GeckoBoa at 0x%llx\n",
66 (unsigned long long) dev->hpa.start);
66 67
67 card->hpa = dev->hpa.start; 68 card->hpa = dev->hpa.start;
68 card->mmio_region.name = "HP-PB Bus"; 69 card->mmio_region.name = "HP-PB Bus";
@@ -73,8 +74,10 @@ static int hppb_probe(struct parisc_device *dev)
73 74
74 status = ccio_request_resource(dev, &card->mmio_region); 75 status = ccio_request_resource(dev, &card->mmio_region);
75 if(status < 0) { 76 if(status < 0) {
76 printk(KERN_ERR "%s: failed to claim HP-PB bus space (%08x, %08x)\n", 77 printk(KERN_ERR "%s: failed to claim HP-PB "
77 __FILE__, card->mmio_region.start, card->mmio_region.end); 78 "bus space (0x%08llx, 0x%08llx)\n",
79 __FILE__, (unsigned long long) card->mmio_region.start,
80 (unsigned long long) card->mmio_region.end);
78 } 81 }
79 82
80 return 0; 83 return 0;
diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c
index ede614616f8e..3aeb3279c92a 100644
--- a/drivers/parisc/lba_pci.c
+++ b/drivers/parisc/lba_pci.c
@@ -992,7 +992,7 @@ lba_pat_resources(struct parisc_device *pa_dev, struct lba_device *lba_dev)
992 return; 992 return;
993 993
994 io_pdc_cell = kzalloc(sizeof(pdc_pat_cell_mod_maddr_block_t), GFP_KERNEL); 994 io_pdc_cell = kzalloc(sizeof(pdc_pat_cell_mod_maddr_block_t), GFP_KERNEL);
995 if (!pa_pdc_cell) { 995 if (!io_pdc_cell) {
996 kfree(pa_pdc_cell); 996 kfree(pa_pdc_cell);
997 return; 997 return;
998 } 998 }
diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c
index f9f9a5f1bbd0..13a64bc081b6 100644
--- a/drivers/parisc/pdc_stable.c
+++ b/drivers/parisc/pdc_stable.c
@@ -370,7 +370,7 @@ pdcspath_layer_read(struct pdcspath_entry *entry, char *buf)
370 if (!i) /* entry is not ready */ 370 if (!i) /* entry is not ready */
371 return -ENODATA; 371 return -ENODATA;
372 372
373 for (i = 0; devpath->layers[i] && (likely(i < 6)); i++) 373 for (i = 0; i < 6 && devpath->layers[i]; i++)
374 out += sprintf(out, "%u ", devpath->layers[i]); 374 out += sprintf(out, "%u ", devpath->layers[i]);
375 375
376 out += sprintf(out, "\n"); 376 out += sprintf(out, "\n");
diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c
index b711fb7181e2..1898c7b47907 100644
--- a/drivers/pci/setup-res.c
+++ b/drivers/pci/setup-res.c
@@ -100,16 +100,16 @@ int pci_claim_resource(struct pci_dev *dev, int resource)
100{ 100{
101 struct resource *res = &dev->resource[resource]; 101 struct resource *res = &dev->resource[resource];
102 struct resource *root; 102 struct resource *root;
103 char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge";
104 int err; 103 int err;
105 104
106 root = pci_find_parent_resource(dev, res); 105 root = pci_find_parent_resource(dev, res);
107 106
108 err = -EINVAL; 107 err = -EINVAL;
109 if (root != NULL) 108 if (root != NULL)
110 err = insert_resource(root, res); 109 err = request_resource(root, res);
111 110
112 if (err) { 111 if (err) {
112 const char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge";
113 dev_err(&dev->dev, "BAR %d: %s of %s %pR\n", 113 dev_err(&dev->dev, "BAR %d: %s of %s %pR\n",
114 resource, 114 resource,
115 root ? "address space collision on" : 115 root ? "address space collision on" :
diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig
index 46dad12f952f..77c6097ced80 100644
--- a/drivers/platform/x86/Kconfig
+++ b/drivers/platform/x86/Kconfig
@@ -277,31 +277,6 @@ config THINKPAD_ACPI_UNSAFE_LEDS
277 Say N here, unless you are building a kernel for your own 277 Say N here, unless you are building a kernel for your own
278 use, and need to control the important firmware LEDs. 278 use, and need to control the important firmware LEDs.
279 279
280config THINKPAD_ACPI_DOCK
281 bool "Legacy Docking Station Support"
282 depends on THINKPAD_ACPI
283 depends on ACPI_DOCK=n
284 default n
285 ---help---
286 Allows the thinkpad_acpi driver to handle docking station events.
287 This support was made obsolete by the generic ACPI docking station
288 support (CONFIG_ACPI_DOCK). It will allow locking and removing the
289 laptop from the docking station, but will not properly connect PCI
290 devices.
291
292 If you are not sure, say N here.
293
294config THINKPAD_ACPI_BAY
295 bool "Legacy Removable Bay Support"
296 depends on THINKPAD_ACPI
297 default y
298 ---help---
299 Allows the thinkpad_acpi driver to handle removable bays. It will
300 electrically disable the device in the bay, and also generate
301 notifications when the bay lever is ejected or inserted.
302
303 If you are not sure, say Y here.
304
305config THINKPAD_ACPI_VIDEO 280config THINKPAD_ACPI_VIDEO
306 bool "Video output control support" 281 bool "Video output control support"
307 depends on THINKPAD_ACPI 282 depends on THINKPAD_ACPI
diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c
index ec560f16d720..222ffb892f22 100644
--- a/drivers/platform/x86/eeepc-laptop.c
+++ b/drivers/platform/x86/eeepc-laptop.c
@@ -143,6 +143,7 @@ struct eeepc_hotk {
143 struct rfkill *bluetooth_rfkill; 143 struct rfkill *bluetooth_rfkill;
144 struct rfkill *wwan3g_rfkill; 144 struct rfkill *wwan3g_rfkill;
145 struct hotplug_slot *hotplug_slot; 145 struct hotplug_slot *hotplug_slot;
146 struct work_struct hotplug_work;
146}; 147};
147 148
148/* The actual device the driver binds to */ 149/* The actual device the driver binds to */
@@ -660,7 +661,7 @@ static int eeepc_get_adapter_status(struct hotplug_slot *hotplug_slot,
660 return 0; 661 return 0;
661} 662}
662 663
663static void eeepc_rfkill_hotplug(void) 664static void eeepc_hotplug_work(struct work_struct *work)
664{ 665{
665 struct pci_dev *dev; 666 struct pci_dev *dev;
666 struct pci_bus *bus = pci_find_bus(0, 1); 667 struct pci_bus *bus = pci_find_bus(0, 1);
@@ -701,7 +702,7 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data)
701 if (event != ACPI_NOTIFY_BUS_CHECK) 702 if (event != ACPI_NOTIFY_BUS_CHECK)
702 return; 703 return;
703 704
704 eeepc_rfkill_hotplug(); 705 schedule_work(&ehotk->hotplug_work);
705} 706}
706 707
707static void eeepc_hotk_notify(struct acpi_device *device, u32 event) 708static void eeepc_hotk_notify(struct acpi_device *device, u32 event)
@@ -892,7 +893,7 @@ static int eeepc_hotk_resume(struct acpi_device *device)
892 893
893 rfkill_set_sw_state(ehotk->wlan_rfkill, wlan != 1); 894 rfkill_set_sw_state(ehotk->wlan_rfkill, wlan != 1);
894 895
895 eeepc_rfkill_hotplug(); 896 schedule_work(&ehotk->hotplug_work);
896 } 897 }
897 898
898 if (ehotk->bluetooth_rfkill) 899 if (ehotk->bluetooth_rfkill)
@@ -1093,6 +1094,8 @@ static int eeepc_rfkill_init(struct device *dev)
1093{ 1094{
1094 int result = 0; 1095 int result = 0;
1095 1096
1097 INIT_WORK(&ehotk->hotplug_work, eeepc_hotplug_work);
1098
1096 eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P6"); 1099 eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P6");
1097 eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P7"); 1100 eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P7");
1098 1101
diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c
index ca508564a181..a2ad53e15874 100644
--- a/drivers/platform/x86/hp-wmi.c
+++ b/drivers/platform/x86/hp-wmi.c
@@ -520,11 +520,13 @@ static int hp_wmi_resume_handler(struct platform_device *device)
520 * the input layer will only actually pass it on if the state 520 * the input layer will only actually pass it on if the state
521 * changed. 521 * changed.
522 */ 522 */
523 523 if (hp_wmi_input_dev) {
524 input_report_switch(hp_wmi_input_dev, SW_DOCK, hp_wmi_dock_state()); 524 input_report_switch(hp_wmi_input_dev, SW_DOCK,
525 input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE, 525 hp_wmi_dock_state());
526 hp_wmi_tablet_state()); 526 input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE,
527 input_sync(hp_wmi_input_dev); 527 hp_wmi_tablet_state());
528 input_sync(hp_wmi_input_dev);
529 }
528 530
529 return 0; 531 return 0;
530} 532}
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index a463fd72c495..e85600852502 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -239,12 +239,6 @@ struct ibm_init_struct {
239}; 239};
240 240
241static struct { 241static struct {
242#ifdef CONFIG_THINKPAD_ACPI_BAY
243 u32 bay_status:1;
244 u32 bay_eject:1;
245 u32 bay_status2:1;
246 u32 bay_eject2:1;
247#endif
248 u32 bluetooth:1; 242 u32 bluetooth:1;
249 u32 hotkey:1; 243 u32 hotkey:1;
250 u32 hotkey_mask:1; 244 u32 hotkey_mask:1;
@@ -589,18 +583,6 @@ static int acpi_ec_write(int i, u8 v)
589 return 1; 583 return 1;
590} 584}
591 585
592#if defined(CONFIG_THINKPAD_ACPI_DOCK) || defined(CONFIG_THINKPAD_ACPI_BAY)
593static int _sta(acpi_handle handle)
594{
595 int status;
596
597 if (!handle || !acpi_evalf(handle, &status, "_STA", "d"))
598 status = 0;
599
600 return status;
601}
602#endif
603
604static int issue_thinkpad_cmos_command(int cmos_cmd) 586static int issue_thinkpad_cmos_command(int cmos_cmd)
605{ 587{
606 if (!cmos_handle) 588 if (!cmos_handle)
@@ -784,6 +766,8 @@ static int dispatch_procfs_write(struct file *file,
784 766
785 if (!ibm || !ibm->write) 767 if (!ibm || !ibm->write)
786 return -EINVAL; 768 return -EINVAL;
769 if (count > PAGE_SIZE - 2)
770 return -EINVAL;
787 771
788 kernbuf = kmalloc(count + 2, GFP_KERNEL); 772 kernbuf = kmalloc(count + 2, GFP_KERNEL);
789 if (!kernbuf) 773 if (!kernbuf)
@@ -4442,293 +4426,6 @@ static struct ibm_struct light_driver_data = {
4442}; 4426};
4443 4427
4444/************************************************************************* 4428/*************************************************************************
4445 * Dock subdriver
4446 */
4447
4448#ifdef CONFIG_THINKPAD_ACPI_DOCK
4449
4450static void dock_notify(struct ibm_struct *ibm, u32 event);
4451static int dock_read(char *p);
4452static int dock_write(char *buf);
4453
4454TPACPI_HANDLE(dock, root, "\\_SB.GDCK", /* X30, X31, X40 */
4455 "\\_SB.PCI0.DOCK", /* 600e/x,770e,770x,A2xm/p,T20-22,X20-21 */
4456 "\\_SB.PCI0.PCI1.DOCK", /* all others */
4457 "\\_SB.PCI.ISA.SLCE", /* 570 */
4458 ); /* A21e,G4x,R30,R31,R32,R40,R40e,R50e */
4459
4460/* don't list other alternatives as we install a notify handler on the 570 */
4461TPACPI_HANDLE(pci, root, "\\_SB.PCI"); /* 570 */
4462
4463static const struct acpi_device_id ibm_pci_device_ids[] = {
4464 {PCI_ROOT_HID_STRING, 0},
4465 {"", 0},
4466};
4467
4468static struct tp_acpi_drv_struct ibm_dock_acpidriver[2] = {
4469 {
4470 .notify = dock_notify,
4471 .handle = &dock_handle,
4472 .type = ACPI_SYSTEM_NOTIFY,
4473 },
4474 {
4475 /* THIS ONE MUST NEVER BE USED FOR DRIVER AUTOLOADING.
4476 * We just use it to get notifications of dock hotplug
4477 * in very old thinkpads */
4478 .hid = ibm_pci_device_ids,
4479 .notify = dock_notify,
4480 .handle = &pci_handle,
4481 .type = ACPI_SYSTEM_NOTIFY,
4482 },
4483};
4484
4485static struct ibm_struct dock_driver_data[2] = {
4486 {
4487 .name = "dock",
4488 .read = dock_read,
4489 .write = dock_write,
4490 .acpi = &ibm_dock_acpidriver[0],
4491 },
4492 {
4493 .name = "dock",
4494 .acpi = &ibm_dock_acpidriver[1],
4495 },
4496};
4497
4498#define dock_docked() (_sta(dock_handle) & 1)
4499
4500static int __init dock_init(struct ibm_init_struct *iibm)
4501{
4502 vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver\n");
4503
4504 TPACPI_ACPIHANDLE_INIT(dock);
4505
4506 vdbg_printk(TPACPI_DBG_INIT, "dock is %s\n",
4507 str_supported(dock_handle != NULL));
4508
4509 return (dock_handle)? 0 : 1;
4510}
4511
4512static int __init dock_init2(struct ibm_init_struct *iibm)
4513{
4514 int dock2_needed;
4515
4516 vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver part 2\n");
4517
4518 if (dock_driver_data[0].flags.acpi_driver_registered &&
4519 dock_driver_data[0].flags.acpi_notify_installed) {
4520 TPACPI_ACPIHANDLE_INIT(pci);
4521 dock2_needed = (pci_handle != NULL);
4522 vdbg_printk(TPACPI_DBG_INIT,
4523 "dock PCI handler for the TP 570 is %s\n",
4524 str_supported(dock2_needed));
4525 } else {
4526 vdbg_printk(TPACPI_DBG_INIT,
4527 "dock subdriver part 2 not required\n");
4528 dock2_needed = 0;
4529 }
4530
4531 return (dock2_needed)? 0 : 1;
4532}
4533
4534static void dock_notify(struct ibm_struct *ibm, u32 event)
4535{
4536 int docked = dock_docked();
4537 int pci = ibm->acpi->hid && ibm->acpi->device &&
4538 acpi_match_device_ids(ibm->acpi->device, ibm_pci_device_ids);
4539 int data;
4540
4541 if (event == 1 && !pci) /* 570 */
4542 data = 1; /* button */
4543 else if (event == 1 && pci) /* 570 */
4544 data = 3; /* dock */
4545 else if (event == 3 && docked)
4546 data = 1; /* button */
4547 else if (event == 3 && !docked)
4548 data = 2; /* undock */
4549 else if (event == 0 && docked)
4550 data = 3; /* dock */
4551 else {
4552 printk(TPACPI_ERR "unknown dock event %d, status %d\n",
4553 event, _sta(dock_handle));
4554 data = 0; /* unknown */
4555 }
4556 acpi_bus_generate_proc_event(ibm->acpi->device, event, data);
4557 acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class,
4558 dev_name(&ibm->acpi->device->dev),
4559 event, data);
4560}
4561
4562static int dock_read(char *p)
4563{
4564 int len = 0;
4565 int docked = dock_docked();
4566
4567 if (!dock_handle)
4568 len += sprintf(p + len, "status:\t\tnot supported\n");
4569 else if (!docked)
4570 len += sprintf(p + len, "status:\t\tundocked\n");
4571 else {
4572 len += sprintf(p + len, "status:\t\tdocked\n");
4573 len += sprintf(p + len, "commands:\tdock, undock\n");
4574 }
4575
4576 return len;
4577}
4578
4579static int dock_write(char *buf)
4580{
4581 char *cmd;
4582
4583 if (!dock_docked())
4584 return -ENODEV;
4585
4586 while ((cmd = next_cmd(&buf))) {
4587 if (strlencmp(cmd, "undock") == 0) {
4588 if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 0) ||
4589 !acpi_evalf(dock_handle, NULL, "_EJ0", "vd", 1))
4590 return -EIO;
4591 } else if (strlencmp(cmd, "dock") == 0) {
4592 if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 1))
4593 return -EIO;
4594 } else
4595 return -EINVAL;
4596 }
4597
4598 return 0;
4599}
4600
4601#endif /* CONFIG_THINKPAD_ACPI_DOCK */
4602
4603/*************************************************************************
4604 * Bay subdriver
4605 */
4606
4607#ifdef CONFIG_THINKPAD_ACPI_BAY
4608
4609TPACPI_HANDLE(bay, root, "\\_SB.PCI.IDE.SECN.MAST", /* 570 */
4610 "\\_SB.PCI0.IDE0.IDES.IDSM", /* 600e/x, 770e, 770x */
4611 "\\_SB.PCI0.SATA.SCND.MSTR", /* T60, X60, Z60 */
4612 "\\_SB.PCI0.IDE0.SCND.MSTR", /* all others */
4613 ); /* A21e, R30, R31 */
4614TPACPI_HANDLE(bay_ej, bay, "_EJ3", /* 600e/x, A2xm/p, A3x */
4615 "_EJ0", /* all others */
4616 ); /* 570,A21e,G4x,R30,R31,R32,R40e,R50e */
4617TPACPI_HANDLE(bay2, root, "\\_SB.PCI0.IDE0.PRIM.SLAV", /* A3x, R32 */
4618 "\\_SB.PCI0.IDE0.IDEP.IDPS", /* 600e/x, 770e, 770x */
4619 ); /* all others */
4620TPACPI_HANDLE(bay2_ej, bay2, "_EJ3", /* 600e/x, 770e, A3x */
4621 "_EJ0", /* 770x */
4622 ); /* all others */
4623
4624static int __init bay_init(struct ibm_init_struct *iibm)
4625{
4626 vdbg_printk(TPACPI_DBG_INIT, "initializing bay subdriver\n");
4627
4628 TPACPI_ACPIHANDLE_INIT(bay);
4629 if (bay_handle)
4630 TPACPI_ACPIHANDLE_INIT(bay_ej);
4631 TPACPI_ACPIHANDLE_INIT(bay2);
4632 if (bay2_handle)
4633 TPACPI_ACPIHANDLE_INIT(bay2_ej);
4634
4635 tp_features.bay_status = bay_handle &&
4636 acpi_evalf(bay_handle, NULL, "_STA", "qv");
4637 tp_features.bay_status2 = bay2_handle &&
4638 acpi_evalf(bay2_handle, NULL, "_STA", "qv");
4639
4640 tp_features.bay_eject = bay_handle && bay_ej_handle &&
4641 (strlencmp(bay_ej_path, "_EJ0") == 0 || experimental);
4642 tp_features.bay_eject2 = bay2_handle && bay2_ej_handle &&
4643 (strlencmp(bay2_ej_path, "_EJ0") == 0 || experimental);
4644
4645 vdbg_printk(TPACPI_DBG_INIT,
4646 "bay 1: status %s, eject %s; bay 2: status %s, eject %s\n",
4647 str_supported(tp_features.bay_status),
4648 str_supported(tp_features.bay_eject),
4649 str_supported(tp_features.bay_status2),
4650 str_supported(tp_features.bay_eject2));
4651
4652 return (tp_features.bay_status || tp_features.bay_eject ||
4653 tp_features.bay_status2 || tp_features.bay_eject2)? 0 : 1;
4654}
4655
4656static void bay_notify(struct ibm_struct *ibm, u32 event)
4657{
4658 acpi_bus_generate_proc_event(ibm->acpi->device, event, 0);
4659 acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class,
4660 dev_name(&ibm->acpi->device->dev),
4661 event, 0);
4662}
4663
4664#define bay_occupied(b) (_sta(b##_handle) & 1)
4665
4666static int bay_read(char *p)
4667{
4668 int len = 0;
4669 int occupied = bay_occupied(bay);
4670 int occupied2 = bay_occupied(bay2);
4671 int eject, eject2;
4672
4673 len += sprintf(p + len, "status:\t\t%s\n",
4674 tp_features.bay_status ?
4675 (occupied ? "occupied" : "unoccupied") :
4676 "not supported");
4677 if (tp_features.bay_status2)
4678 len += sprintf(p + len, "status2:\t%s\n", occupied2 ?
4679 "occupied" : "unoccupied");
4680
4681 eject = tp_features.bay_eject && occupied;
4682 eject2 = tp_features.bay_eject2 && occupied2;
4683
4684 if (eject && eject2)
4685 len += sprintf(p + len, "commands:\teject, eject2\n");
4686 else if (eject)
4687 len += sprintf(p + len, "commands:\teject\n");
4688 else if (eject2)
4689 len += sprintf(p + len, "commands:\teject2\n");
4690
4691 return len;
4692}
4693
4694static int bay_write(char *buf)
4695{
4696 char *cmd;
4697
4698 if (!tp_features.bay_eject && !tp_features.bay_eject2)
4699 return -ENODEV;
4700
4701 while ((cmd = next_cmd(&buf))) {
4702 if (tp_features.bay_eject && strlencmp(cmd, "eject") == 0) {
4703 if (!acpi_evalf(bay_ej_handle, NULL, NULL, "vd", 1))
4704 return -EIO;
4705 } else if (tp_features.bay_eject2 &&
4706 strlencmp(cmd, "eject2") == 0) {
4707 if (!acpi_evalf(bay2_ej_handle, NULL, NULL, "vd", 1))
4708 return -EIO;
4709 } else
4710 return -EINVAL;
4711 }
4712
4713 return 0;
4714}
4715
4716static struct tp_acpi_drv_struct ibm_bay_acpidriver = {
4717 .notify = bay_notify,
4718 .handle = &bay_handle,
4719 .type = ACPI_SYSTEM_NOTIFY,
4720};
4721
4722static struct ibm_struct bay_driver_data = {
4723 .name = "bay",
4724 .read = bay_read,
4725 .write = bay_write,
4726 .acpi = &ibm_bay_acpidriver,
4727};
4728
4729#endif /* CONFIG_THINKPAD_ACPI_BAY */
4730
4731/*************************************************************************
4732 * CMOS subdriver 4429 * CMOS subdriver
4733 */ 4430 */
4734 4431
@@ -5945,14 +5642,48 @@ static struct backlight_ops ibm_backlight_data = {
5945 5642
5946/* --------------------------------------------------------------------- */ 5643/* --------------------------------------------------------------------- */
5947 5644
5645/*
5646 * These are only useful for models that have only one possibility
5647 * of GPU. If the BIOS model handles both ATI and Intel, don't use
5648 * these quirks.
5649 */
5650#define TPACPI_BRGHT_Q_NOEC 0x0001 /* Must NOT use EC HBRV */
5651#define TPACPI_BRGHT_Q_EC 0x0002 /* Should or must use EC HBRV */
5652#define TPACPI_BRGHT_Q_ASK 0x8000 /* Ask for user report */
5653
5654static const struct tpacpi_quirk brightness_quirk_table[] __initconst = {
5655 /* Models with ATI GPUs known to require ECNVRAM mode */
5656 TPACPI_Q_IBM('1', 'Y', TPACPI_BRGHT_Q_EC), /* T43/p ATI */
5657
5658 /* Models with ATI GPUs (waiting confirmation) */
5659 TPACPI_Q_IBM('1', 'R', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC),
5660 TPACPI_Q_IBM('1', 'Q', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC),
5661 TPACPI_Q_IBM('7', '6', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC),
5662 TPACPI_Q_IBM('7', '8', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC),
5663
5664 /* Models with Intel Extreme Graphics 2 (waiting confirmation) */
5665 TPACPI_Q_IBM('1', 'V', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC),
5666 TPACPI_Q_IBM('1', 'W', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC),
5667 TPACPI_Q_IBM('1', 'U', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC),
5668
5669 /* Models with Intel GMA900 */
5670 TPACPI_Q_IBM('7', '0', TPACPI_BRGHT_Q_NOEC), /* T43, R52 */
5671 TPACPI_Q_IBM('7', '4', TPACPI_BRGHT_Q_NOEC), /* X41 */
5672 TPACPI_Q_IBM('7', '5', TPACPI_BRGHT_Q_NOEC), /* X41 Tablet */
5673};
5674
5948static int __init brightness_init(struct ibm_init_struct *iibm) 5675static int __init brightness_init(struct ibm_init_struct *iibm)
5949{ 5676{
5950 int b; 5677 int b;
5678 unsigned long quirks;
5951 5679
5952 vdbg_printk(TPACPI_DBG_INIT, "initializing brightness subdriver\n"); 5680 vdbg_printk(TPACPI_DBG_INIT, "initializing brightness subdriver\n");
5953 5681
5954 mutex_init(&brightness_mutex); 5682 mutex_init(&brightness_mutex);
5955 5683
5684 quirks = tpacpi_check_quirks(brightness_quirk_table,
5685 ARRAY_SIZE(brightness_quirk_table));
5686
5956 /* 5687 /*
5957 * We always attempt to detect acpi support, so as to switch 5688 * We always attempt to detect acpi support, so as to switch
5958 * Lenovo Vista BIOS to ACPI brightness mode even if we are not 5689 * Lenovo Vista BIOS to ACPI brightness mode even if we are not
@@ -6009,23 +5740,13 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
6009 /* TPACPI_BRGHT_MODE_AUTO not implemented yet, just use default */ 5740 /* TPACPI_BRGHT_MODE_AUTO not implemented yet, just use default */
6010 if (brightness_mode == TPACPI_BRGHT_MODE_AUTO || 5741 if (brightness_mode == TPACPI_BRGHT_MODE_AUTO ||
6011 brightness_mode == TPACPI_BRGHT_MODE_MAX) { 5742 brightness_mode == TPACPI_BRGHT_MODE_MAX) {
6012 if (thinkpad_id.vendor == PCI_VENDOR_ID_IBM) { 5743 if (quirks & TPACPI_BRGHT_Q_EC)
6013 /* 5744 brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM;
6014 * IBM models that define HBRV probably have 5745 else
6015 * EC-based backlight level control
6016 */
6017 if (acpi_evalf(ec_handle, NULL, "HBRV", "qd"))
6018 /* T40-T43, R50-R52, R50e, R51e, X31-X41 */
6019 brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM;
6020 else
6021 /* all other IBM ThinkPads */
6022 brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP;
6023 } else
6024 /* All Lenovo ThinkPads */
6025 brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP; 5746 brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP;
6026 5747
6027 dbg_printk(TPACPI_DBG_BRGHT, 5748 dbg_printk(TPACPI_DBG_BRGHT,
6028 "selected brightness_mode=%d\n", 5749 "driver auto-selected brightness_mode=%d\n",
6029 brightness_mode); 5750 brightness_mode);
6030 } 5751 }
6031 5752
@@ -6052,6 +5773,15 @@ static int __init brightness_init(struct ibm_init_struct *iibm)
6052 vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_BRGHT, 5773 vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_BRGHT,
6053 "brightness is supported\n"); 5774 "brightness is supported\n");
6054 5775
5776 if (quirks & TPACPI_BRGHT_Q_ASK) {
5777 printk(TPACPI_NOTICE
5778 "brightness: will use unverified default: "
5779 "brightness_mode=%d\n", brightness_mode);
5780 printk(TPACPI_NOTICE
5781 "brightness: please report to %s whether it works well "
5782 "or not on your ThinkPad\n", TPACPI_MAIL);
5783 }
5784
6055 ibm_backlight_device->props.max_brightness = 5785 ibm_backlight_device->props.max_brightness =
6056 (tp_features.bright_16levels)? 15 : 7; 5786 (tp_features.bright_16levels)? 15 : 7;
6057 ibm_backlight_device->props.brightness = b & TP_EC_BACKLIGHT_LVLMSK; 5787 ibm_backlight_device->props.brightness = b & TP_EC_BACKLIGHT_LVLMSK;
@@ -7854,22 +7584,6 @@ static struct ibm_init_struct ibms_init[] __initdata = {
7854 .init = light_init, 7584 .init = light_init,
7855 .data = &light_driver_data, 7585 .data = &light_driver_data,
7856 }, 7586 },
7857#ifdef CONFIG_THINKPAD_ACPI_DOCK
7858 {
7859 .init = dock_init,
7860 .data = &dock_driver_data[0],
7861 },
7862 {
7863 .init = dock_init2,
7864 .data = &dock_driver_data[1],
7865 },
7866#endif
7867#ifdef CONFIG_THINKPAD_ACPI_BAY
7868 {
7869 .init = bay_init,
7870 .data = &bay_driver_data,
7871 },
7872#endif
7873 { 7587 {
7874 .init = cmos_init, 7588 .init = cmos_init,
7875 .data = &cmos_driver_data, 7589 .data = &cmos_driver_data,
@@ -7968,12 +7682,6 @@ TPACPI_PARAM(hotkey);
7968TPACPI_PARAM(bluetooth); 7682TPACPI_PARAM(bluetooth);
7969TPACPI_PARAM(video); 7683TPACPI_PARAM(video);
7970TPACPI_PARAM(light); 7684TPACPI_PARAM(light);
7971#ifdef CONFIG_THINKPAD_ACPI_DOCK
7972TPACPI_PARAM(dock);
7973#endif
7974#ifdef CONFIG_THINKPAD_ACPI_BAY
7975TPACPI_PARAM(bay);
7976#endif /* CONFIG_THINKPAD_ACPI_BAY */
7977TPACPI_PARAM(cmos); 7685TPACPI_PARAM(cmos);
7978TPACPI_PARAM(led); 7686TPACPI_PARAM(led);
7979TPACPI_PARAM(beep); 7687TPACPI_PARAM(beep);
diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig
index 7eda34838bfe..bdbc4f73fcdc 100644
--- a/drivers/power/Kconfig
+++ b/drivers/power/Kconfig
@@ -43,6 +43,13 @@ config BATTERY_DS2760
43 help 43 help
44 Say Y here to enable support for batteries with ds2760 chip. 44 Say Y here to enable support for batteries with ds2760 chip.
45 45
46config BATTERY_DS2782
47 tristate "DS2782 standalone gas-gauge"
48 depends on I2C
49 help
50 Say Y here to enable support for the DS2782 standalone battery
51 gas-gauge.
52
46config BATTERY_PMU 53config BATTERY_PMU
47 tristate "Apple PMU battery" 54 tristate "Apple PMU battery"
48 depends on PPC32 && ADB_PMU 55 depends on PPC32 && ADB_PMU
diff --git a/drivers/power/Makefile b/drivers/power/Makefile
index daf3179689aa..380d17c9ae29 100644
--- a/drivers/power/Makefile
+++ b/drivers/power/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_APM_POWER) += apm_power.o
19obj-$(CONFIG_WM8350_POWER) += wm8350_power.o 19obj-$(CONFIG_WM8350_POWER) += wm8350_power.o
20 20
21obj-$(CONFIG_BATTERY_DS2760) += ds2760_battery.o 21obj-$(CONFIG_BATTERY_DS2760) += ds2760_battery.o
22obj-$(CONFIG_BATTERY_DS2782) += ds2782_battery.o
22obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o 23obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o
23obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o 24obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o
24obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o 25obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o
diff --git a/drivers/power/ds2782_battery.c b/drivers/power/ds2782_battery.c
new file mode 100644
index 000000000000..da14f374cb60
--- /dev/null
+++ b/drivers/power/ds2782_battery.c
@@ -0,0 +1,330 @@
1/*
2 * I2C client/driver for the Maxim/Dallas DS2782 Stand-Alone Fuel Gauge IC
3 *
4 * Copyright (C) 2009 Bluewater Systems Ltd
5 *
6 * Author: Ryan Mallon <ryan@bluewatersys.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation.
11 *
12 */
13
14#include <linux/kernel.h>
15#include <linux/module.h>
16#include <linux/types.h>
17#include <linux/errno.h>
18#include <linux/swab.h>
19#include <linux/i2c.h>
20#include <linux/idr.h>
21#include <linux/power_supply.h>
22
23#define DS2782_REG_RARC 0x06 /* Remaining active relative capacity */
24
25#define DS2782_REG_VOLT_MSB 0x0c
26#define DS2782_REG_TEMP_MSB 0x0a
27#define DS2782_REG_CURRENT_MSB 0x0e
28
29/* EEPROM Block */
30#define DS2782_REG_RSNSP 0x69 /* Sense resistor value */
31
32/* Current unit measurement in uA for a 1 milli-ohm sense resistor */
33#define DS2782_CURRENT_UNITS 1563
34
35#define to_ds2782_info(x) container_of(x, struct ds2782_info, battery)
36
37struct ds2782_info {
38 struct i2c_client *client;
39 struct power_supply battery;
40 int id;
41};
42
43static DEFINE_IDR(battery_id);
44static DEFINE_MUTEX(battery_lock);
45
46static inline int ds2782_read_reg(struct ds2782_info *info, int reg, u8 *val)
47{
48 int ret;
49
50 ret = i2c_smbus_read_byte_data(info->client, reg);
51 if (ret < 0) {
52 dev_err(&info->client->dev, "register read failed\n");
53 return ret;
54 }
55
56 *val = ret;
57 return 0;
58}
59
60static inline int ds2782_read_reg16(struct ds2782_info *info, int reg_msb,
61 s16 *val)
62{
63 int ret;
64
65 ret = swab16(i2c_smbus_read_word_data(info->client, reg_msb));
66 if (ret < 0) {
67 dev_err(&info->client->dev, "register read failed\n");
68 return ret;
69 }
70
71 *val = ret;
72 return 0;
73}
74
75static int ds2782_get_temp(struct ds2782_info *info, int *temp)
76{
77 s16 raw;
78 int err;
79
80 /*
81 * Temperature is measured in units of 0.125 degrees celcius, the
82 * power_supply class measures temperature in tenths of degrees
83 * celsius. The temperature value is stored as a 10 bit number, plus
84 * sign in the upper bits of a 16 bit register.
85 */
86 err = ds2782_read_reg16(info, DS2782_REG_TEMP_MSB, &raw);
87 if (err)
88 return err;
89 *temp = ((raw / 32) * 125) / 100;
90 return 0;
91}
92
93static int ds2782_get_current(struct ds2782_info *info, int *current_uA)
94{
95 int sense_res;
96 int err;
97 u8 sense_res_raw;
98 s16 raw;
99
100 /*
101 * The units of measurement for current are dependent on the value of
102 * the sense resistor.
103 */
104 err = ds2782_read_reg(info, DS2782_REG_RSNSP, &sense_res_raw);
105 if (err)
106 return err;
107 if (sense_res_raw == 0) {
108 dev_err(&info->client->dev, "sense resistor value is 0\n");
109 return -ENXIO;
110 }
111 sense_res = 1000 / sense_res_raw;
112
113 dev_dbg(&info->client->dev, "sense resistor = %d milli-ohms\n",
114 sense_res);
115 err = ds2782_read_reg16(info, DS2782_REG_CURRENT_MSB, &raw);
116 if (err)
117 return err;
118 *current_uA = raw * (DS2782_CURRENT_UNITS / sense_res);
119 return 0;
120}
121
122static int ds2782_get_voltage(struct ds2782_info *info, int *voltage_uA)
123{
124 s16 raw;
125 int err;
126
127 /*
128 * Voltage is measured in units of 4.88mV. The voltage is stored as
129 * a 10-bit number plus sign, in the upper bits of a 16-bit register
130 */
131 err = ds2782_read_reg16(info, DS2782_REG_VOLT_MSB, &raw);
132 if (err)
133 return err;
134 *voltage_uA = (raw / 32) * 4800;
135 return 0;
136}
137
138static int ds2782_get_capacity(struct ds2782_info *info, int *capacity)
139{
140 int err;
141 u8 raw;
142
143 err = ds2782_read_reg(info, DS2782_REG_RARC, &raw);
144 if (err)
145 return err;
146 *capacity = raw;
147 return raw;
148}
149
150static int ds2782_get_status(struct ds2782_info *info, int *status)
151{
152 int err;
153 int current_uA;
154 int capacity;
155
156 err = ds2782_get_current(info, &current_uA);
157 if (err)
158 return err;
159
160 err = ds2782_get_capacity(info, &capacity);
161 if (err)
162 return err;
163
164 if (capacity == 100)
165 *status = POWER_SUPPLY_STATUS_FULL;
166 else if (current_uA == 0)
167 *status = POWER_SUPPLY_STATUS_NOT_CHARGING;
168 else if (current_uA < 0)
169 *status = POWER_SUPPLY_STATUS_DISCHARGING;
170 else
171 *status = POWER_SUPPLY_STATUS_CHARGING;
172
173 return 0;
174}
175
176static int ds2782_battery_get_property(struct power_supply *psy,
177 enum power_supply_property prop,
178 union power_supply_propval *val)
179{
180 struct ds2782_info *info = to_ds2782_info(psy);
181 int ret;
182
183 switch (prop) {
184 case POWER_SUPPLY_PROP_STATUS:
185 ret = ds2782_get_status(info, &val->intval);
186 break;
187
188 case POWER_SUPPLY_PROP_CAPACITY:
189 ret = ds2782_get_capacity(info, &val->intval);
190 break;
191
192 case POWER_SUPPLY_PROP_VOLTAGE_NOW:
193 ret = ds2782_get_voltage(info, &val->intval);
194 break;
195
196 case POWER_SUPPLY_PROP_CURRENT_NOW:
197 ret = ds2782_get_current(info, &val->intval);
198 break;
199
200 case POWER_SUPPLY_PROP_TEMP:
201 ret = ds2782_get_temp(info, &val->intval);
202 break;
203
204 default:
205 ret = -EINVAL;
206 }
207
208 return ret;
209}
210
211static enum power_supply_property ds2782_battery_props[] = {
212 POWER_SUPPLY_PROP_STATUS,
213 POWER_SUPPLY_PROP_CAPACITY,
214 POWER_SUPPLY_PROP_VOLTAGE_NOW,
215 POWER_SUPPLY_PROP_CURRENT_NOW,
216 POWER_SUPPLY_PROP_TEMP,
217};
218
219static void ds2782_power_supply_init(struct power_supply *battery)
220{
221 battery->type = POWER_SUPPLY_TYPE_BATTERY;
222 battery->properties = ds2782_battery_props;
223 battery->num_properties = ARRAY_SIZE(ds2782_battery_props);
224 battery->get_property = ds2782_battery_get_property;
225 battery->external_power_changed = NULL;
226}
227
228static int ds2782_battery_remove(struct i2c_client *client)
229{
230 struct ds2782_info *info = i2c_get_clientdata(client);
231
232 power_supply_unregister(&info->battery);
233 kfree(info->battery.name);
234
235 mutex_lock(&battery_lock);
236 idr_remove(&battery_id, info->id);
237 mutex_unlock(&battery_lock);
238
239 i2c_set_clientdata(client, info);
240
241 kfree(info);
242 return 0;
243}
244
245static int ds2782_battery_probe(struct i2c_client *client,
246 const struct i2c_device_id *id)
247{
248 struct ds2782_info *info;
249 int ret;
250 int num;
251
252 /* Get an ID for this battery */
253 ret = idr_pre_get(&battery_id, GFP_KERNEL);
254 if (ret == 0) {
255 ret = -ENOMEM;
256 goto fail_id;
257 }
258
259 mutex_lock(&battery_lock);
260 ret = idr_get_new(&battery_id, client, &num);
261 mutex_unlock(&battery_lock);
262 if (ret < 0)
263 goto fail_id;
264
265 info = kzalloc(sizeof(*info), GFP_KERNEL);
266 if (!info) {
267 ret = -ENOMEM;
268 goto fail_info;
269 }
270
271 info->battery.name = kasprintf(GFP_KERNEL, "ds2782-%d", num);
272 if (!info->battery.name) {
273 ret = -ENOMEM;
274 goto fail_name;
275 }
276
277 i2c_set_clientdata(client, info);
278 info->client = client;
279 ds2782_power_supply_init(&info->battery);
280
281 ret = power_supply_register(&client->dev, &info->battery);
282 if (ret) {
283 dev_err(&client->dev, "failed to register battery\n");
284 goto fail_register;
285 }
286
287 return 0;
288
289fail_register:
290 kfree(info->battery.name);
291fail_name:
292 i2c_set_clientdata(client, info);
293 kfree(info);
294fail_info:
295 mutex_lock(&battery_lock);
296 idr_remove(&battery_id, num);
297 mutex_unlock(&battery_lock);
298fail_id:
299 return ret;
300}
301
302static const struct i2c_device_id ds2782_id[] = {
303 {"ds2782", 0},
304 {},
305};
306
307static struct i2c_driver ds2782_battery_driver = {
308 .driver = {
309 .name = "ds2782-battery",
310 },
311 .probe = ds2782_battery_probe,
312 .remove = ds2782_battery_remove,
313 .id_table = ds2782_id,
314};
315
316static int __init ds2782_init(void)
317{
318 return i2c_add_driver(&ds2782_battery_driver);
319}
320module_init(ds2782_init);
321
322static void __exit ds2782_exit(void)
323{
324 i2c_del_driver(&ds2782_battery_driver);
325}
326module_exit(ds2782_exit);
327
328MODULE_AUTHOR("Ryan Mallon <ryan@bluewatersys.com>");
329MODULE_DESCRIPTION("Maxim/Dallas DS2782 Stand-Alone Fuel Gauage IC driver");
330MODULE_LICENSE("GPL");
diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c
index 5fbca2681baa..58e419299cd6 100644
--- a/drivers/power/olpc_battery.c
+++ b/drivers/power/olpc_battery.c
@@ -8,6 +8,7 @@
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 */ 9 */
10 10
11#include <linux/kernel.h>
11#include <linux/module.h> 12#include <linux/module.h>
12#include <linux/err.h> 13#include <linux/err.h>
13#include <linux/platform_device.h> 14#include <linux/platform_device.h>
@@ -35,6 +36,7 @@
35#define BAT_STAT_AC 0x10 36#define BAT_STAT_AC 0x10
36#define BAT_STAT_CHARGING 0x20 37#define BAT_STAT_CHARGING 0x20
37#define BAT_STAT_DISCHARGING 0x40 38#define BAT_STAT_DISCHARGING 0x40
39#define BAT_STAT_TRICKLE 0x80
38 40
39#define BAT_ERR_INFOFAIL 0x02 41#define BAT_ERR_INFOFAIL 0x02
40#define BAT_ERR_OVERVOLTAGE 0x04 42#define BAT_ERR_OVERVOLTAGE 0x04
@@ -89,7 +91,7 @@ static char bat_serial[17]; /* Ick */
89static int olpc_bat_get_status(union power_supply_propval *val, uint8_t ec_byte) 91static int olpc_bat_get_status(union power_supply_propval *val, uint8_t ec_byte)
90{ 92{
91 if (olpc_platform_info.ecver > 0x44) { 93 if (olpc_platform_info.ecver > 0x44) {
92 if (ec_byte & BAT_STAT_CHARGING) 94 if (ec_byte & (BAT_STAT_CHARGING | BAT_STAT_TRICKLE))
93 val->intval = POWER_SUPPLY_STATUS_CHARGING; 95 val->intval = POWER_SUPPLY_STATUS_CHARGING;
94 else if (ec_byte & BAT_STAT_DISCHARGING) 96 else if (ec_byte & BAT_STAT_DISCHARGING)
95 val->intval = POWER_SUPPLY_STATUS_DISCHARGING; 97 val->intval = POWER_SUPPLY_STATUS_DISCHARGING;
@@ -219,7 +221,8 @@ static int olpc_bat_get_property(struct power_supply *psy,
219 It doesn't matter though -- the EC will return the last-known 221 It doesn't matter though -- the EC will return the last-known
220 information, and it's as if we just ran that _little_ bit faster 222 information, and it's as if we just ran that _little_ bit faster
221 and managed to read it out before the battery went away. */ 223 and managed to read it out before the battery went away. */
222 if (!(ec_byte & BAT_STAT_PRESENT) && psp != POWER_SUPPLY_PROP_PRESENT) 224 if (!(ec_byte & (BAT_STAT_PRESENT | BAT_STAT_TRICKLE)) &&
225 psp != POWER_SUPPLY_PROP_PRESENT)
223 return -ENODEV; 226 return -ENODEV;
224 227
225 switch (psp) { 228 switch (psp) {
@@ -229,7 +232,8 @@ static int olpc_bat_get_property(struct power_supply *psy,
229 return ret; 232 return ret;
230 break; 233 break;
231 case POWER_SUPPLY_PROP_PRESENT: 234 case POWER_SUPPLY_PROP_PRESENT:
232 val->intval = !!(ec_byte & BAT_STAT_PRESENT); 235 val->intval = !!(ec_byte & (BAT_STAT_PRESENT |
236 BAT_STAT_TRICKLE));
233 break; 237 break;
234 238
235 case POWER_SUPPLY_PROP_HEALTH: 239 case POWER_SUPPLY_PROP_HEALTH:
@@ -334,21 +338,21 @@ static ssize_t olpc_bat_eeprom_read(struct kobject *kobj,
334 struct bin_attribute *attr, char *buf, loff_t off, size_t count) 338 struct bin_attribute *attr, char *buf, loff_t off, size_t count)
335{ 339{
336 uint8_t ec_byte; 340 uint8_t ec_byte;
337 int ret, end; 341 int ret;
342 int i;
338 343
339 if (off >= EEPROM_SIZE) 344 if (off >= EEPROM_SIZE)
340 return 0; 345 return 0;
341 if (off + count > EEPROM_SIZE) 346 if (off + count > EEPROM_SIZE)
342 count = EEPROM_SIZE - off; 347 count = EEPROM_SIZE - off;
343 348
344 end = EEPROM_START + off + count; 349 for (i = 0; i < count; i++) {
345 for (ec_byte = EEPROM_START + off; ec_byte < end; ec_byte++) { 350 ec_byte = EEPROM_START + off + i;
346 ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, 351 ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, &buf[i], 1);
347 &buf[ec_byte - EEPROM_START], 1);
348 if (ret) { 352 if (ret) {
349 printk(KERN_ERR "olpc-battery: EC command " 353 pr_err("olpc-battery: "
350 "EC_BAT_EEPROM @ 0x%x failed -" 354 "EC_BAT_EEPROM cmd @ 0x%x failed - %d!\n",
351 " %d!\n", ec_byte, ret); 355 ec_byte, ret);
352 return -EIO; 356 return -EIO;
353 } 357 }
354 } 358 }
diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c
index 8030e25152fb..c75d6f35cb5f 100644
--- a/drivers/s390/scsi/zfcp_erp.c
+++ b/drivers/s390/scsi/zfcp_erp.c
@@ -553,40 +553,35 @@ static void _zfcp_erp_unit_reopen_all(struct zfcp_port *port, int clear,
553 _zfcp_erp_unit_reopen(unit, clear, id, ref); 553 _zfcp_erp_unit_reopen(unit, clear, id, ref);
554} 554}
555 555
556static void zfcp_erp_strategy_followup_actions(struct zfcp_erp_action *act) 556static void zfcp_erp_strategy_followup_failed(struct zfcp_erp_action *act)
557{ 557{
558 struct zfcp_adapter *adapter = act->adapter;
559 struct zfcp_port *port = act->port;
560 struct zfcp_unit *unit = act->unit;
561 u32 status = act->status;
562
563 /* initiate follow-up actions depending on success of finished action */
564 switch (act->action) { 558 switch (act->action) {
565
566 case ZFCP_ERP_ACTION_REOPEN_ADAPTER: 559 case ZFCP_ERP_ACTION_REOPEN_ADAPTER:
567 if (status == ZFCP_ERP_SUCCEEDED) 560 _zfcp_erp_adapter_reopen(act->adapter, 0, "ersff_1", NULL);
568 _zfcp_erp_port_reopen_all(adapter, 0, "ersfa_1", NULL);
569 else
570 _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_2", NULL);
571 break; 561 break;
572
573 case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: 562 case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED:
574 if (status == ZFCP_ERP_SUCCEEDED) 563 _zfcp_erp_port_forced_reopen(act->port, 0, "ersff_2", NULL);
575 _zfcp_erp_port_reopen(port, 0, "ersfa_3", NULL);
576 else
577 _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_4", NULL);
578 break; 564 break;
579
580 case ZFCP_ERP_ACTION_REOPEN_PORT: 565 case ZFCP_ERP_ACTION_REOPEN_PORT:
581 if (status == ZFCP_ERP_SUCCEEDED) 566 _zfcp_erp_port_reopen(act->port, 0, "ersff_3", NULL);
582 _zfcp_erp_unit_reopen_all(port, 0, "ersfa_5", NULL);
583 else
584 _zfcp_erp_port_forced_reopen(port, 0, "ersfa_6", NULL);
585 break; 567 break;
586
587 case ZFCP_ERP_ACTION_REOPEN_UNIT: 568 case ZFCP_ERP_ACTION_REOPEN_UNIT:
588 if (status != ZFCP_ERP_SUCCEEDED) 569 _zfcp_erp_unit_reopen(act->unit, 0, "ersff_4", NULL);
589 _zfcp_erp_port_reopen(unit->port, 0, "ersfa_7", NULL); 570 break;
571 }
572}
573
574static void zfcp_erp_strategy_followup_success(struct zfcp_erp_action *act)
575{
576 switch (act->action) {
577 case ZFCP_ERP_ACTION_REOPEN_ADAPTER:
578 _zfcp_erp_port_reopen_all(act->adapter, 0, "ersfs_1", NULL);
579 break;
580 case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED:
581 _zfcp_erp_port_reopen(act->port, 0, "ersfs_2", NULL);
582 break;
583 case ZFCP_ERP_ACTION_REOPEN_PORT:
584 _zfcp_erp_unit_reopen_all(act->port, 0, "ersfs_3", NULL);
590 break; 585 break;
591 } 586 }
592} 587}
@@ -801,7 +796,7 @@ static int zfcp_erp_port_forced_strategy(struct zfcp_erp_action *erp_action)
801 return ZFCP_ERP_FAILED; 796 return ZFCP_ERP_FAILED;
802 797
803 case ZFCP_ERP_STEP_PHYS_PORT_CLOSING: 798 case ZFCP_ERP_STEP_PHYS_PORT_CLOSING:
804 if (status & ZFCP_STATUS_PORT_PHYS_OPEN) 799 if (!(status & ZFCP_STATUS_PORT_PHYS_OPEN))
805 return ZFCP_ERP_SUCCEEDED; 800 return ZFCP_ERP_SUCCEEDED;
806 } 801 }
807 return ZFCP_ERP_FAILED; 802 return ZFCP_ERP_FAILED;
@@ -853,11 +848,17 @@ void zfcp_erp_port_strategy_open_lookup(struct work_struct *work)
853 gid_pn_work); 848 gid_pn_work);
854 849
855 retval = zfcp_fc_ns_gid_pn(&port->erp_action); 850 retval = zfcp_fc_ns_gid_pn(&port->erp_action);
856 if (retval == -ENOMEM) 851 if (!retval) {
857 zfcp_erp_notify(&port->erp_action, ZFCP_ERP_NOMEM); 852 port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP;
858 port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; 853 goto out;
859 if (retval) 854 }
860 zfcp_erp_notify(&port->erp_action, ZFCP_ERP_FAILED); 855 if (retval == -ENOMEM) {
856 zfcp_erp_notify(&port->erp_action, ZFCP_STATUS_ERP_LOWMEM);
857 goto out;
858 }
859 /* all other error condtions */
860 zfcp_erp_notify(&port->erp_action, 0);
861out:
861 zfcp_port_put(port); 862 zfcp_port_put(port);
862} 863}
863 864
@@ -1289,7 +1290,10 @@ static int zfcp_erp_strategy(struct zfcp_erp_action *erp_action)
1289 retval = zfcp_erp_strategy_statechange(erp_action, retval); 1290 retval = zfcp_erp_strategy_statechange(erp_action, retval);
1290 if (retval == ZFCP_ERP_EXIT) 1291 if (retval == ZFCP_ERP_EXIT)
1291 goto unlock; 1292 goto unlock;
1292 zfcp_erp_strategy_followup_actions(erp_action); 1293 if (retval == ZFCP_ERP_SUCCEEDED)
1294 zfcp_erp_strategy_followup_success(erp_action);
1295 if (retval == ZFCP_ERP_FAILED)
1296 zfcp_erp_strategy_followup_failed(erp_action);
1293 1297
1294 unlock: 1298 unlock:
1295 write_unlock(&adapter->erp_lock); 1299 write_unlock(&adapter->erp_lock);
diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c
index 2f0705d76b72..47daebfa7e59 100644
--- a/drivers/s390/scsi/zfcp_fc.c
+++ b/drivers/s390/scsi/zfcp_fc.c
@@ -79,11 +79,9 @@ static int zfcp_wka_port_get(struct zfcp_wka_port *wka_port)
79 79
80 mutex_unlock(&wka_port->mutex); 80 mutex_unlock(&wka_port->mutex);
81 81
82 wait_event_timeout( 82 wait_event(wka_port->completion_wq,
83 wka_port->completion_wq, 83 wka_port->status == ZFCP_WKA_PORT_ONLINE ||
84 wka_port->status == ZFCP_WKA_PORT_ONLINE || 84 wka_port->status == ZFCP_WKA_PORT_OFFLINE);
85 wka_port->status == ZFCP_WKA_PORT_OFFLINE,
86 HZ >> 1);
87 85
88 if (wka_port->status == ZFCP_WKA_PORT_ONLINE) { 86 if (wka_port->status == ZFCP_WKA_PORT_ONLINE) {
89 atomic_inc(&wka_port->refcount); 87 atomic_inc(&wka_port->refcount);
diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c
index c57658f3d34f..47795fbf081f 100644
--- a/drivers/s390/scsi/zfcp_fsf.c
+++ b/drivers/s390/scsi/zfcp_fsf.c
@@ -670,8 +670,11 @@ static int zfcp_fsf_req_sbal_get(struct zfcp_adapter *adapter)
670 zfcp_fsf_sbal_check(adapter), 5 * HZ); 670 zfcp_fsf_sbal_check(adapter), 5 * HZ);
671 if (ret > 0) 671 if (ret > 0)
672 return 0; 672 return 0;
673 if (!ret) 673 if (!ret) {
674 atomic_inc(&adapter->qdio_outb_full); 674 atomic_inc(&adapter->qdio_outb_full);
675 /* assume hanging outbound queue, try queue recovery */
676 zfcp_erp_adapter_reopen(adapter, 0, "fsrsg_1", NULL);
677 }
675 678
676 spin_lock_bh(&adapter->req_q_lock); 679 spin_lock_bh(&adapter->req_q_lock);
677 return -EIO; 680 return -EIO;
@@ -722,7 +725,7 @@ static struct zfcp_fsf_req *zfcp_fsf_req_create(struct zfcp_adapter *adapter,
722 req = zfcp_fsf_alloc_qtcb(pool); 725 req = zfcp_fsf_alloc_qtcb(pool);
723 726
724 if (unlikely(!req)) 727 if (unlikely(!req))
725 return ERR_PTR(-EIO); 728 return ERR_PTR(-ENOMEM);
726 729
727 if (adapter->req_no == 0) 730 if (adapter->req_no == 0)
728 adapter->req_no++; 731 adapter->req_no++;
@@ -1010,6 +1013,23 @@ skip_fsfstatus:
1010 send_ct->handler(send_ct->handler_data); 1013 send_ct->handler(send_ct->handler_data);
1011} 1014}
1012 1015
1016static void zfcp_fsf_setup_ct_els_unchained(struct qdio_buffer_element *sbale,
1017 struct scatterlist *sg_req,
1018 struct scatterlist *sg_resp)
1019{
1020 sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ;
1021 sbale[2].addr = sg_virt(sg_req);
1022 sbale[2].length = sg_req->length;
1023 sbale[3].addr = sg_virt(sg_resp);
1024 sbale[3].length = sg_resp->length;
1025 sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY;
1026}
1027
1028static int zfcp_fsf_one_sbal(struct scatterlist *sg)
1029{
1030 return sg_is_last(sg) && sg->length <= PAGE_SIZE;
1031}
1032
1013static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, 1033static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req,
1014 struct scatterlist *sg_req, 1034 struct scatterlist *sg_req,
1015 struct scatterlist *sg_resp, 1035 struct scatterlist *sg_resp,
@@ -1020,30 +1040,30 @@ static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req,
1020 int bytes; 1040 int bytes;
1021 1041
1022 if (!(feat & FSF_FEATURE_ELS_CT_CHAINED_SBALS)) { 1042 if (!(feat & FSF_FEATURE_ELS_CT_CHAINED_SBALS)) {
1023 if (sg_req->length > PAGE_SIZE || sg_resp->length > PAGE_SIZE || 1043 if (!zfcp_fsf_one_sbal(sg_req) || !zfcp_fsf_one_sbal(sg_resp))
1024 !sg_is_last(sg_req) || !sg_is_last(sg_resp))
1025 return -EOPNOTSUPP; 1044 return -EOPNOTSUPP;
1026 1045
1027 sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ; 1046 zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp);
1028 sbale[2].addr = sg_virt(sg_req); 1047 return 0;
1029 sbale[2].length = sg_req->length; 1048 }
1030 sbale[3].addr = sg_virt(sg_resp); 1049
1031 sbale[3].length = sg_resp->length; 1050 /* use single, unchained SBAL if it can hold the request */
1032 sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY; 1051 if (zfcp_fsf_one_sbal(sg_req) && zfcp_fsf_one_sbal(sg_resp)) {
1052 zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp);
1033 return 0; 1053 return 0;
1034 } 1054 }
1035 1055
1036 bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, 1056 bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ,
1037 sg_req, max_sbals); 1057 sg_req, max_sbals);
1038 if (bytes <= 0) 1058 if (bytes <= 0)
1039 return -ENOMEM; 1059 return -EIO;
1040 req->qtcb->bottom.support.req_buf_length = bytes; 1060 req->qtcb->bottom.support.req_buf_length = bytes;
1041 req->sbale_curr = ZFCP_LAST_SBALE_PER_SBAL; 1061 req->sbale_curr = ZFCP_LAST_SBALE_PER_SBAL;
1042 1062
1043 bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, 1063 bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ,
1044 sg_resp, max_sbals); 1064 sg_resp, max_sbals);
1045 if (bytes <= 0) 1065 if (bytes <= 0)
1046 return -ENOMEM; 1066 return -EIO;
1047 req->qtcb->bottom.support.resp_buf_length = bytes; 1067 req->qtcb->bottom.support.resp_buf_length = bytes;
1048 1068
1049 return 0; 1069 return 0;
@@ -1607,10 +1627,10 @@ static void zfcp_fsf_open_wka_port_handler(struct zfcp_fsf_req *req)
1607 case FSF_ACCESS_DENIED: 1627 case FSF_ACCESS_DENIED:
1608 wka_port->status = ZFCP_WKA_PORT_OFFLINE; 1628 wka_port->status = ZFCP_WKA_PORT_OFFLINE;
1609 break; 1629 break;
1610 case FSF_PORT_ALREADY_OPEN:
1611 break;
1612 case FSF_GOOD: 1630 case FSF_GOOD:
1613 wka_port->handle = header->port_handle; 1631 wka_port->handle = header->port_handle;
1632 /* fall through */
1633 case FSF_PORT_ALREADY_OPEN:
1614 wka_port->status = ZFCP_WKA_PORT_ONLINE; 1634 wka_port->status = ZFCP_WKA_PORT_ONLINE;
1615 } 1635 }
1616out: 1636out:
@@ -1731,15 +1751,16 @@ static void zfcp_fsf_close_physical_port_handler(struct zfcp_fsf_req *req)
1731 zfcp_fsf_access_denied_port(req, port); 1751 zfcp_fsf_access_denied_port(req, port);
1732 break; 1752 break;
1733 case FSF_PORT_BOXED: 1753 case FSF_PORT_BOXED:
1734 zfcp_erp_port_boxed(port, "fscpph2", req);
1735 req->status |= ZFCP_STATUS_FSFREQ_ERROR |
1736 ZFCP_STATUS_FSFREQ_RETRY;
1737 /* can't use generic zfcp_erp_modify_port_status because 1754 /* can't use generic zfcp_erp_modify_port_status because
1738 * ZFCP_STATUS_COMMON_OPEN must not be reset for the port */ 1755 * ZFCP_STATUS_COMMON_OPEN must not be reset for the port */
1739 atomic_clear_mask(ZFCP_STATUS_PORT_PHYS_OPEN, &port->status); 1756 atomic_clear_mask(ZFCP_STATUS_PORT_PHYS_OPEN, &port->status);
1740 list_for_each_entry(unit, &port->unit_list_head, list) 1757 list_for_each_entry(unit, &port->unit_list_head, list)
1741 atomic_clear_mask(ZFCP_STATUS_COMMON_OPEN, 1758 atomic_clear_mask(ZFCP_STATUS_COMMON_OPEN,
1742 &unit->status); 1759 &unit->status);
1760 zfcp_erp_port_boxed(port, "fscpph2", req);
1761 req->status |= ZFCP_STATUS_FSFREQ_ERROR |
1762 ZFCP_STATUS_FSFREQ_RETRY;
1763
1743 break; 1764 break;
1744 case FSF_ADAPTER_STATUS_AVAILABLE: 1765 case FSF_ADAPTER_STATUS_AVAILABLE:
1745 switch (header->fsf_status_qual.word[0]) { 1766 switch (header->fsf_status_qual.word[0]) {
@@ -2541,7 +2562,6 @@ struct zfcp_fsf_req *zfcp_fsf_control_file(struct zfcp_adapter *adapter,
2541 bytes = zfcp_qdio_sbals_from_sg(req, direction, fsf_cfdc->sg, 2562 bytes = zfcp_qdio_sbals_from_sg(req, direction, fsf_cfdc->sg,
2542 FSF_MAX_SBALS_PER_REQ); 2563 FSF_MAX_SBALS_PER_REQ);
2543 if (bytes != ZFCP_CFDC_MAX_SIZE) { 2564 if (bytes != ZFCP_CFDC_MAX_SIZE) {
2544 retval = -ENOMEM;
2545 zfcp_fsf_req_free(req); 2565 zfcp_fsf_req_free(req);
2546 goto out; 2566 goto out;
2547 } 2567 }
diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c
index 967ede73f4c5..6925a1784682 100644
--- a/drivers/s390/scsi/zfcp_scsi.c
+++ b/drivers/s390/scsi/zfcp_scsi.c
@@ -167,20 +167,21 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
167 struct zfcp_unit *unit = scpnt->device->hostdata; 167 struct zfcp_unit *unit = scpnt->device->hostdata;
168 struct zfcp_fsf_req *old_req, *abrt_req; 168 struct zfcp_fsf_req *old_req, *abrt_req;
169 unsigned long flags; 169 unsigned long flags;
170 unsigned long old_req_id = (unsigned long) scpnt->host_scribble; 170 unsigned long old_reqid = (unsigned long) scpnt->host_scribble;
171 int retval = SUCCESS; 171 int retval = SUCCESS;
172 int retry = 3; 172 int retry = 3;
173 char *dbf_tag;
173 174
174 /* avoid race condition between late normal completion and abort */ 175 /* avoid race condition between late normal completion and abort */
175 write_lock_irqsave(&adapter->abort_lock, flags); 176 write_lock_irqsave(&adapter->abort_lock, flags);
176 177
177 spin_lock(&adapter->req_list_lock); 178 spin_lock(&adapter->req_list_lock);
178 old_req = zfcp_reqlist_find(adapter, old_req_id); 179 old_req = zfcp_reqlist_find(adapter, old_reqid);
179 spin_unlock(&adapter->req_list_lock); 180 spin_unlock(&adapter->req_list_lock);
180 if (!old_req) { 181 if (!old_req) {
181 write_unlock_irqrestore(&adapter->abort_lock, flags); 182 write_unlock_irqrestore(&adapter->abort_lock, flags);
182 zfcp_scsi_dbf_event_abort("lte1", adapter, scpnt, NULL, 183 zfcp_scsi_dbf_event_abort("lte1", adapter, scpnt, NULL,
183 old_req_id); 184 old_reqid);
184 return FAILED; /* completion could be in progress */ 185 return FAILED; /* completion could be in progress */
185 } 186 }
186 old_req->data = NULL; 187 old_req->data = NULL;
@@ -189,7 +190,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
189 write_unlock_irqrestore(&adapter->abort_lock, flags); 190 write_unlock_irqrestore(&adapter->abort_lock, flags);
190 191
191 while (retry--) { 192 while (retry--) {
192 abrt_req = zfcp_fsf_abort_fcp_command(old_req_id, unit); 193 abrt_req = zfcp_fsf_abort_fcp_command(old_reqid, unit);
193 if (abrt_req) 194 if (abrt_req)
194 break; 195 break;
195 196
@@ -197,7 +198,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
197 if (!(atomic_read(&adapter->status) & 198 if (!(atomic_read(&adapter->status) &
198 ZFCP_STATUS_COMMON_RUNNING)) { 199 ZFCP_STATUS_COMMON_RUNNING)) {
199 zfcp_scsi_dbf_event_abort("nres", adapter, scpnt, NULL, 200 zfcp_scsi_dbf_event_abort("nres", adapter, scpnt, NULL,
200 old_req_id); 201 old_reqid);
201 return SUCCESS; 202 return SUCCESS;
202 } 203 }
203 } 204 }
@@ -208,13 +209,14 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt)
208 abrt_req->status & ZFCP_STATUS_FSFREQ_COMPLETED); 209 abrt_req->status & ZFCP_STATUS_FSFREQ_COMPLETED);
209 210
210 if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED) 211 if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED)
211 zfcp_scsi_dbf_event_abort("okay", adapter, scpnt, abrt_req, 0); 212 dbf_tag = "okay";
212 else if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED) 213 else if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED)
213 zfcp_scsi_dbf_event_abort("lte2", adapter, scpnt, abrt_req, 0); 214 dbf_tag = "lte2";
214 else { 215 else {
215 zfcp_scsi_dbf_event_abort("fail", adapter, scpnt, abrt_req, 0); 216 dbf_tag = "fail";
216 retval = FAILED; 217 retval = FAILED;
217 } 218 }
219 zfcp_scsi_dbf_event_abort(dbf_tag, adapter, scpnt, abrt_req, old_reqid);
218 zfcp_fsf_req_free(abrt_req); 220 zfcp_fsf_req_free(abrt_req);
219 return retval; 221 return retval;
220} 222}
@@ -534,6 +536,9 @@ static void zfcp_scsi_rport_register(struct zfcp_port *port)
534 struct fc_rport_identifiers ids; 536 struct fc_rport_identifiers ids;
535 struct fc_rport *rport; 537 struct fc_rport *rport;
536 538
539 if (port->rport)
540 return;
541
537 ids.node_name = port->wwnn; 542 ids.node_name = port->wwnn;
538 ids.port_name = port->wwpn; 543 ids.port_name = port->wwpn;
539 ids.port_id = port->d_id; 544 ids.port_id = port->d_id;
@@ -557,8 +562,10 @@ static void zfcp_scsi_rport_block(struct zfcp_port *port)
557{ 562{
558 struct fc_rport *rport = port->rport; 563 struct fc_rport *rport = port->rport;
559 564
560 if (rport) 565 if (rport) {
561 fc_remote_port_delete(rport); 566 fc_remote_port_delete(rport);
567 port->rport = NULL;
568 }
562} 569}
563 570
564void zfcp_scsi_schedule_rport_register(struct zfcp_port *port) 571void zfcp_scsi_schedule_rport_register(struct zfcp_port *port)
diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c
index 3e51e64d1108..0fe5cce818cb 100644
--- a/drivers/s390/scsi/zfcp_sysfs.c
+++ b/drivers/s390/scsi/zfcp_sysfs.c
@@ -494,9 +494,14 @@ static ssize_t zfcp_sysfs_adapter_q_full_show(struct device *dev,
494 struct Scsi_Host *scsi_host = class_to_shost(dev); 494 struct Scsi_Host *scsi_host = class_to_shost(dev);
495 struct zfcp_adapter *adapter = 495 struct zfcp_adapter *adapter =
496 (struct zfcp_adapter *) scsi_host->hostdata[0]; 496 (struct zfcp_adapter *) scsi_host->hostdata[0];
497 u64 util;
498
499 spin_lock_bh(&adapter->qdio_stat_lock);
500 util = adapter->req_q_util;
501 spin_unlock_bh(&adapter->qdio_stat_lock);
497 502
498 return sprintf(buf, "%d %llu\n", atomic_read(&adapter->qdio_outb_full), 503 return sprintf(buf, "%d %llu\n", atomic_read(&adapter->qdio_outb_full),
499 (unsigned long long)adapter->req_q_util); 504 (unsigned long long)util);
500} 505}
501static DEVICE_ATTR(queue_full, S_IRUGO, zfcp_sysfs_adapter_q_full_show, NULL); 506static DEVICE_ATTR(queue_full, S_IRUGO, zfcp_sysfs_adapter_q_full_show, NULL);
502 507
diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c
index 2bc22be5f849..145ab9ba55ea 100644
--- a/drivers/scsi/libfc/fc_exch.c
+++ b/drivers/scsi/libfc/fc_exch.c
@@ -415,9 +415,9 @@ static void fc_exch_timeout(struct work_struct *work)
415 e_stat = ep->esb_stat; 415 e_stat = ep->esb_stat;
416 if (e_stat & ESB_ST_COMPLETE) { 416 if (e_stat & ESB_ST_COMPLETE) {
417 ep->esb_stat = e_stat & ~ESB_ST_REC_QUAL; 417 ep->esb_stat = e_stat & ~ESB_ST_REC_QUAL;
418 spin_unlock_bh(&ep->ex_lock);
418 if (e_stat & ESB_ST_REC_QUAL) 419 if (e_stat & ESB_ST_REC_QUAL)
419 fc_exch_rrq(ep); 420 fc_exch_rrq(ep);
420 spin_unlock_bh(&ep->ex_lock);
421 goto done; 421 goto done;
422 } else { 422 } else {
423 resp = ep->resp; 423 resp = ep->resp;
@@ -1624,14 +1624,14 @@ static void fc_exch_rrq(struct fc_exch *ep)
1624 struct fc_lport *lp; 1624 struct fc_lport *lp;
1625 struct fc_els_rrq *rrq; 1625 struct fc_els_rrq *rrq;
1626 struct fc_frame *fp; 1626 struct fc_frame *fp;
1627 struct fc_seq *rrq_sp;
1628 u32 did; 1627 u32 did;
1629 1628
1630 lp = ep->lp; 1629 lp = ep->lp;
1631 1630
1632 fp = fc_frame_alloc(lp, sizeof(*rrq)); 1631 fp = fc_frame_alloc(lp, sizeof(*rrq));
1633 if (!fp) 1632 if (!fp)
1634 return; 1633 goto retry;
1634
1635 rrq = fc_frame_payload_get(fp, sizeof(*rrq)); 1635 rrq = fc_frame_payload_get(fp, sizeof(*rrq));
1636 memset(rrq, 0, sizeof(*rrq)); 1636 memset(rrq, 0, sizeof(*rrq));
1637 rrq->rrq_cmd = ELS_RRQ; 1637 rrq->rrq_cmd = ELS_RRQ;
@@ -1647,13 +1647,20 @@ static void fc_exch_rrq(struct fc_exch *ep)
1647 fc_host_port_id(lp->host), FC_TYPE_ELS, 1647 fc_host_port_id(lp->host), FC_TYPE_ELS,
1648 FC_FC_FIRST_SEQ | FC_FC_END_SEQ | FC_FC_SEQ_INIT, 0); 1648 FC_FC_FIRST_SEQ | FC_FC_END_SEQ | FC_FC_SEQ_INIT, 0);
1649 1649
1650 rrq_sp = fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, 1650 if (fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, lp->e_d_tov))
1651 lp->e_d_tov); 1651 return;
1652 if (!rrq_sp) { 1652
1653 ep->esb_stat |= ESB_ST_REC_QUAL; 1653retry:
1654 fc_exch_timer_set_locked(ep, ep->r_a_tov); 1654 spin_lock_bh(&ep->ex_lock);
1655 if (ep->state & (FC_EX_RST_CLEANUP | FC_EX_DONE)) {
1656 spin_unlock_bh(&ep->ex_lock);
1657 /* drop hold for rec qual */
1658 fc_exch_release(ep);
1655 return; 1659 return;
1656 } 1660 }
1661 ep->esb_stat |= ESB_ST_REC_QUAL;
1662 fc_exch_timer_set_locked(ep, ep->r_a_tov);
1663 spin_unlock_bh(&ep->ex_lock);
1657} 1664}
1658 1665
1659 1666
diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c
index 716cc344c5df..a751f6230c22 100644
--- a/drivers/scsi/libiscsi.c
+++ b/drivers/scsi/libiscsi.c
@@ -1974,10 +1974,10 @@ int iscsi_eh_abort(struct scsi_cmnd *sc)
1974 * good and have never sent us a successful tmf response 1974 * good and have never sent us a successful tmf response
1975 * then sent more data for the cmd. 1975 * then sent more data for the cmd.
1976 */ 1976 */
1977 spin_lock(&session->lock); 1977 spin_lock_bh(&session->lock);
1978 fail_scsi_task(task, DID_ABORT); 1978 fail_scsi_task(task, DID_ABORT);
1979 conn->tmf_state = TMF_INITIAL; 1979 conn->tmf_state = TMF_INITIAL;
1980 spin_unlock(&session->lock); 1980 spin_unlock_bh(&session->lock);
1981 iscsi_start_tx(conn); 1981 iscsi_start_tx(conn);
1982 goto success_unlocked; 1982 goto success_unlocked;
1983 case TMF_TIMEDOUT: 1983 case TMF_TIMEDOUT:
diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c
index 54fa1e42dc4d..b3381959acce 100644
--- a/drivers/scsi/libsas/sas_expander.c
+++ b/drivers/scsi/libsas/sas_expander.c
@@ -766,6 +766,7 @@ static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id)
766 if (!memcmp(phy->attached_sas_addr, ephy->attached_sas_addr, 766 if (!memcmp(phy->attached_sas_addr, ephy->attached_sas_addr,
767 SAS_ADDR_SIZE) && ephy->port) { 767 SAS_ADDR_SIZE) && ephy->port) {
768 sas_port_add_phy(ephy->port, phy->phy); 768 sas_port_add_phy(ephy->port, phy->phy);
769 phy->port = ephy->port;
769 phy->phy_state = PHY_DEVICE_DISCOVERED; 770 phy->phy_state = PHY_DEVICE_DISCOVERED;
770 return 0; 771 return 0;
771 } 772 }
@@ -945,11 +946,21 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id)
945 if (ex->ex_phy[i].phy_state == PHY_VACANT || 946 if (ex->ex_phy[i].phy_state == PHY_VACANT ||
946 ex->ex_phy[i].phy_state == PHY_NOT_PRESENT) 947 ex->ex_phy[i].phy_state == PHY_NOT_PRESENT)
947 continue; 948 continue;
948 949 /*
950 * Due to races, the phy might not get added to the
951 * wide port, so we add the phy to the wide port here.
952 */
949 if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) == 953 if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) ==
950 SAS_ADDR(child->sas_addr)) 954 SAS_ADDR(child->sas_addr)) {
951 ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED; 955 ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED;
956 res = sas_ex_join_wide_port(dev, i);
957 if (!res)
958 SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n",
959 i, SAS_ADDR(ex->ex_phy[i].attached_sas_addr));
960
961 }
952 } 962 }
963 res = 0;
953 } 964 }
954 965
955 return res; 966 return res;
@@ -1598,7 +1609,7 @@ static int sas_get_phy_attached_sas_addr(struct domain_device *dev,
1598} 1609}
1599 1610
1600static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id, 1611static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id,
1601 int from_phy) 1612 int from_phy, bool update)
1602{ 1613{
1603 struct expander_device *ex = &dev->ex_dev; 1614 struct expander_device *ex = &dev->ex_dev;
1604 int res = 0; 1615 int res = 0;
@@ -1611,7 +1622,9 @@ static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id,
1611 if (res) 1622 if (res)
1612 goto out; 1623 goto out;
1613 else if (phy_change_count != ex->ex_phy[i].phy_change_count) { 1624 else if (phy_change_count != ex->ex_phy[i].phy_change_count) {
1614 ex->ex_phy[i].phy_change_count = phy_change_count; 1625 if (update)
1626 ex->ex_phy[i].phy_change_count =
1627 phy_change_count;
1615 *phy_id = i; 1628 *phy_id = i;
1616 return 0; 1629 return 0;
1617 } 1630 }
@@ -1653,31 +1666,52 @@ out:
1653 kfree(rg_req); 1666 kfree(rg_req);
1654 return res; 1667 return res;
1655} 1668}
1669/**
1670 * sas_find_bcast_dev - find the device issue BROADCAST(CHANGE).
1671 * @dev:domain device to be detect.
1672 * @src_dev: the device which originated BROADCAST(CHANGE).
1673 *
1674 * Add self-configuration expander suport. Suppose two expander cascading,
1675 * when the first level expander is self-configuring, hotplug the disks in
1676 * second level expander, BROADCAST(CHANGE) will not only be originated
1677 * in the second level expander, but also be originated in the first level
1678 * expander (see SAS protocol SAS 2r-14, 7.11 for detail), it is to say,
1679 * expander changed count in two level expanders will all increment at least
1680 * once, but the phy which chang count has changed is the source device which
1681 * we concerned.
1682 */
1656 1683
1657static int sas_find_bcast_dev(struct domain_device *dev, 1684static int sas_find_bcast_dev(struct domain_device *dev,
1658 struct domain_device **src_dev) 1685 struct domain_device **src_dev)
1659{ 1686{
1660 struct expander_device *ex = &dev->ex_dev; 1687 struct expander_device *ex = &dev->ex_dev;
1661 int ex_change_count = -1; 1688 int ex_change_count = -1;
1689 int phy_id = -1;
1662 int res; 1690 int res;
1691 struct domain_device *ch;
1663 1692
1664 res = sas_get_ex_change_count(dev, &ex_change_count); 1693 res = sas_get_ex_change_count(dev, &ex_change_count);
1665 if (res) 1694 if (res)
1666 goto out; 1695 goto out;
1667 if (ex_change_count != -1 && 1696 if (ex_change_count != -1 && ex_change_count != ex->ex_change_count) {
1668 ex_change_count != ex->ex_change_count) { 1697 /* Just detect if this expander phys phy change count changed,
1669 *src_dev = dev; 1698 * in order to determine if this expander originate BROADCAST,
1670 ex->ex_change_count = ex_change_count; 1699 * and do not update phy change count field in our structure.
1671 } else { 1700 */
1672 struct domain_device *ch; 1701 res = sas_find_bcast_phy(dev, &phy_id, 0, false);
1673 1702 if (phy_id != -1) {
1674 list_for_each_entry(ch, &ex->children, siblings) { 1703 *src_dev = dev;
1675 if (ch->dev_type == EDGE_DEV || 1704 ex->ex_change_count = ex_change_count;
1676 ch->dev_type == FANOUT_DEV) { 1705 SAS_DPRINTK("Expander phy change count has changed\n");
1677 res = sas_find_bcast_dev(ch, src_dev); 1706 return res;
1678 if (src_dev) 1707 } else
1679 return res; 1708 SAS_DPRINTK("Expander phys DID NOT change\n");
1680 } 1709 }
1710 list_for_each_entry(ch, &ex->children, siblings) {
1711 if (ch->dev_type == EDGE_DEV || ch->dev_type == FANOUT_DEV) {
1712 res = sas_find_bcast_dev(ch, src_dev);
1713 if (src_dev)
1714 return res;
1681 } 1715 }
1682 } 1716 }
1683out: 1717out:
@@ -1700,24 +1734,26 @@ static void sas_unregister_ex_tree(struct domain_device *dev)
1700} 1734}
1701 1735
1702static void sas_unregister_devs_sas_addr(struct domain_device *parent, 1736static void sas_unregister_devs_sas_addr(struct domain_device *parent,
1703 int phy_id) 1737 int phy_id, bool last)
1704{ 1738{
1705 struct expander_device *ex_dev = &parent->ex_dev; 1739 struct expander_device *ex_dev = &parent->ex_dev;
1706 struct ex_phy *phy = &ex_dev->ex_phy[phy_id]; 1740 struct ex_phy *phy = &ex_dev->ex_phy[phy_id];
1707 struct domain_device *child, *n; 1741 struct domain_device *child, *n;
1708 1742 if (last) {
1709 list_for_each_entry_safe(child, n, &ex_dev->children, siblings) { 1743 list_for_each_entry_safe(child, n,
1710 if (SAS_ADDR(child->sas_addr) == 1744 &ex_dev->children, siblings) {
1711 SAS_ADDR(phy->attached_sas_addr)) { 1745 if (SAS_ADDR(child->sas_addr) ==
1712 if (child->dev_type == EDGE_DEV || 1746 SAS_ADDR(phy->attached_sas_addr)) {
1713 child->dev_type == FANOUT_DEV) 1747 if (child->dev_type == EDGE_DEV ||
1714 sas_unregister_ex_tree(child); 1748 child->dev_type == FANOUT_DEV)
1715 else 1749 sas_unregister_ex_tree(child);
1716 sas_unregister_dev(child); 1750 else
1717 break; 1751 sas_unregister_dev(child);
1752 break;
1753 }
1718 } 1754 }
1755 sas_disable_routing(parent, phy->attached_sas_addr);
1719 } 1756 }
1720 sas_disable_routing(parent, phy->attached_sas_addr);
1721 memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE); 1757 memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE);
1722 sas_port_delete_phy(phy->port, phy->phy); 1758 sas_port_delete_phy(phy->port, phy->phy);
1723 if (phy->port->num_phys == 0) 1759 if (phy->port->num_phys == 0)
@@ -1770,15 +1806,31 @@ static int sas_discover_new(struct domain_device *dev, int phy_id)
1770{ 1806{
1771 struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id]; 1807 struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id];
1772 struct domain_device *child; 1808 struct domain_device *child;
1773 int res; 1809 bool found = false;
1810 int res, i;
1774 1811
1775 SAS_DPRINTK("ex %016llx phy%d new device attached\n", 1812 SAS_DPRINTK("ex %016llx phy%d new device attached\n",
1776 SAS_ADDR(dev->sas_addr), phy_id); 1813 SAS_ADDR(dev->sas_addr), phy_id);
1777 res = sas_ex_phy_discover(dev, phy_id); 1814 res = sas_ex_phy_discover(dev, phy_id);
1778 if (res) 1815 if (res)
1779 goto out; 1816 goto out;
1817 /* to support the wide port inserted */
1818 for (i = 0; i < dev->ex_dev.num_phys; i++) {
1819 struct ex_phy *ex_phy_temp = &dev->ex_dev.ex_phy[i];
1820 if (i == phy_id)
1821 continue;
1822 if (SAS_ADDR(ex_phy_temp->attached_sas_addr) ==
1823 SAS_ADDR(ex_phy->attached_sas_addr)) {
1824 found = true;
1825 break;
1826 }
1827 }
1828 if (found) {
1829 sas_ex_join_wide_port(dev, phy_id);
1830 return 0;
1831 }
1780 res = sas_ex_discover_devices(dev, phy_id); 1832 res = sas_ex_discover_devices(dev, phy_id);
1781 if (res) 1833 if (!res)
1782 goto out; 1834 goto out;
1783 list_for_each_entry(child, &dev->ex_dev.children, siblings) { 1835 list_for_each_entry(child, &dev->ex_dev.children, siblings) {
1784 if (SAS_ADDR(child->sas_addr) == 1836 if (SAS_ADDR(child->sas_addr) ==
@@ -1793,7 +1845,7 @@ out:
1793 return res; 1845 return res;
1794} 1846}
1795 1847
1796static int sas_rediscover_dev(struct domain_device *dev, int phy_id) 1848static int sas_rediscover_dev(struct domain_device *dev, int phy_id, bool last)
1797{ 1849{
1798 struct expander_device *ex = &dev->ex_dev; 1850 struct expander_device *ex = &dev->ex_dev;
1799 struct ex_phy *phy = &ex->ex_phy[phy_id]; 1851 struct ex_phy *phy = &ex->ex_phy[phy_id];
@@ -1804,11 +1856,11 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id)
1804 switch (res) { 1856 switch (res) {
1805 case SMP_RESP_NO_PHY: 1857 case SMP_RESP_NO_PHY:
1806 phy->phy_state = PHY_NOT_PRESENT; 1858 phy->phy_state = PHY_NOT_PRESENT;
1807 sas_unregister_devs_sas_addr(dev, phy_id); 1859 sas_unregister_devs_sas_addr(dev, phy_id, last);
1808 goto out; break; 1860 goto out; break;
1809 case SMP_RESP_PHY_VACANT: 1861 case SMP_RESP_PHY_VACANT:
1810 phy->phy_state = PHY_VACANT; 1862 phy->phy_state = PHY_VACANT;
1811 sas_unregister_devs_sas_addr(dev, phy_id); 1863 sas_unregister_devs_sas_addr(dev, phy_id, last);
1812 goto out; break; 1864 goto out; break;
1813 case SMP_RESP_FUNC_ACC: 1865 case SMP_RESP_FUNC_ACC:
1814 break; 1866 break;
@@ -1816,7 +1868,7 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id)
1816 1868
1817 if (SAS_ADDR(attached_sas_addr) == 0) { 1869 if (SAS_ADDR(attached_sas_addr) == 0) {
1818 phy->phy_state = PHY_EMPTY; 1870 phy->phy_state = PHY_EMPTY;
1819 sas_unregister_devs_sas_addr(dev, phy_id); 1871 sas_unregister_devs_sas_addr(dev, phy_id, last);
1820 } else if (SAS_ADDR(attached_sas_addr) == 1872 } else if (SAS_ADDR(attached_sas_addr) ==
1821 SAS_ADDR(phy->attached_sas_addr)) { 1873 SAS_ADDR(phy->attached_sas_addr)) {
1822 SAS_DPRINTK("ex %016llx phy 0x%x broadcast flutter\n", 1874 SAS_DPRINTK("ex %016llx phy 0x%x broadcast flutter\n",
@@ -1828,12 +1880,27 @@ out:
1828 return res; 1880 return res;
1829} 1881}
1830 1882
1883/**
1884 * sas_rediscover - revalidate the domain.
1885 * @dev:domain device to be detect.
1886 * @phy_id: the phy id will be detected.
1887 *
1888 * NOTE: this process _must_ quit (return) as soon as any connection
1889 * errors are encountered. Connection recovery is done elsewhere.
1890 * Discover process only interrogates devices in order to discover the
1891 * domain.For plugging out, we un-register the device only when it is
1892 * the last phy in the port, for other phys in this port, we just delete it
1893 * from the port.For inserting, we do discovery when it is the
1894 * first phy,for other phys in this port, we add it to the port to
1895 * forming the wide-port.
1896 */
1831static int sas_rediscover(struct domain_device *dev, const int phy_id) 1897static int sas_rediscover(struct domain_device *dev, const int phy_id)
1832{ 1898{
1833 struct expander_device *ex = &dev->ex_dev; 1899 struct expander_device *ex = &dev->ex_dev;
1834 struct ex_phy *changed_phy = &ex->ex_phy[phy_id]; 1900 struct ex_phy *changed_phy = &ex->ex_phy[phy_id];
1835 int res = 0; 1901 int res = 0;
1836 int i; 1902 int i;
1903 bool last = true; /* is this the last phy of the port */
1837 1904
1838 SAS_DPRINTK("ex %016llx phy%d originated BROADCAST(CHANGE)\n", 1905 SAS_DPRINTK("ex %016llx phy%d originated BROADCAST(CHANGE)\n",
1839 SAS_ADDR(dev->sas_addr), phy_id); 1906 SAS_ADDR(dev->sas_addr), phy_id);
@@ -1848,13 +1915,13 @@ static int sas_rediscover(struct domain_device *dev, const int phy_id)
1848 SAS_ADDR(changed_phy->attached_sas_addr)) { 1915 SAS_ADDR(changed_phy->attached_sas_addr)) {
1849 SAS_DPRINTK("phy%d part of wide port with " 1916 SAS_DPRINTK("phy%d part of wide port with "
1850 "phy%d\n", phy_id, i); 1917 "phy%d\n", phy_id, i);
1851 goto out; 1918 last = false;
1919 break;
1852 } 1920 }
1853 } 1921 }
1854 res = sas_rediscover_dev(dev, phy_id); 1922 res = sas_rediscover_dev(dev, phy_id, last);
1855 } else 1923 } else
1856 res = sas_discover_new(dev, phy_id); 1924 res = sas_discover_new(dev, phy_id);
1857out:
1858 return res; 1925 return res;
1859} 1926}
1860 1927
@@ -1881,7 +1948,7 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev)
1881 1948
1882 do { 1949 do {
1883 phy_id = -1; 1950 phy_id = -1;
1884 res = sas_find_bcast_phy(dev, &phy_id, i); 1951 res = sas_find_bcast_phy(dev, &phy_id, i, true);
1885 if (phy_id == -1) 1952 if (phy_id == -1)
1886 break; 1953 break;
1887 res = sas_rediscover(dev, phy_id); 1954 res = sas_rediscover(dev, phy_id);
diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c
index e6ac59c023f1..fe8b74c706d2 100644
--- a/drivers/scsi/libsas/sas_port.c
+++ b/drivers/scsi/libsas/sas_port.c
@@ -56,7 +56,7 @@ static void sas_form_port(struct asd_sas_phy *phy)
56 } 56 }
57 } 57 }
58 58
59 /* find a port */ 59 /* see if the phy should be part of a wide port */
60 spin_lock_irqsave(&sas_ha->phy_port_lock, flags); 60 spin_lock_irqsave(&sas_ha->phy_port_lock, flags);
61 for (i = 0; i < sas_ha->num_phys; i++) { 61 for (i = 0; i < sas_ha->num_phys; i++) {
62 port = sas_ha->sas_port[i]; 62 port = sas_ha->sas_port[i];
@@ -69,12 +69,23 @@ static void sas_form_port(struct asd_sas_phy *phy)
69 SAS_DPRINTK("phy%d matched wide port%d\n", phy->id, 69 SAS_DPRINTK("phy%d matched wide port%d\n", phy->id,
70 port->id); 70 port->id);
71 break; 71 break;
72 } else if (*(u64 *) port->sas_addr == 0 && port->num_phys==0) {
73 memcpy(port->sas_addr, phy->sas_addr, SAS_ADDR_SIZE);
74 break;
75 } 72 }
76 spin_unlock(&port->phy_list_lock); 73 spin_unlock(&port->phy_list_lock);
77 } 74 }
75 /* The phy does not match any existing port, create a new one */
76 if (i == sas_ha->num_phys) {
77 for (i = 0; i < sas_ha->num_phys; i++) {
78 port = sas_ha->sas_port[i];
79 spin_lock(&port->phy_list_lock);
80 if (*(u64 *)port->sas_addr == 0
81 && port->num_phys == 0) {
82 memcpy(port->sas_addr, phy->sas_addr,
83 SAS_ADDR_SIZE);
84 break;
85 }
86 spin_unlock(&port->phy_list_lock);
87 }
88 }
78 89
79 if (i >= sas_ha->num_phys) { 90 if (i >= sas_ha->num_phys) {
80 printk(KERN_NOTICE "%s: couldn't find a free port, bug?\n", 91 printk(KERN_NOTICE "%s: couldn't find a free port, bug?\n",
diff --git a/drivers/scsi/qla4xxx/ql4_dbg.c b/drivers/scsi/qla4xxx/ql4_dbg.c
index fcc184cd066d..cbceb0ebabf7 100644
--- a/drivers/scsi/qla4xxx/ql4_dbg.c
+++ b/drivers/scsi/qla4xxx/ql4_dbg.c
@@ -15,19 +15,18 @@ void qla4xxx_dump_buffer(void *b, uint32_t size)
15 uint32_t cnt; 15 uint32_t cnt;
16 uint8_t *c = b; 16 uint8_t *c = b;
17 17
18 printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh " 18 printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh "
19 "Fh\n"); 19 "Fh\n");
20 printk("------------------------------------------------------------" 20 printk("------------------------------------------------------------"
21 "--\n"); 21 "--\n");
22 for (cnt = 0; cnt < size; cnt++, c++) { 22 for (cnt = 0; cnt < size; c++) {
23 printk(KERN_DEBUG "%02x", *c); 23 printk(KERN_INFO "%02x", *c);
24 if (!(cnt % 16)) 24 if (!(++cnt % 16))
25 printk(KERN_DEBUG "\n"); 25 printk(KERN_INFO "\n");
26 26
27 else 27 else
28 printk(KERN_DEBUG " "); 28 printk(KERN_INFO " ");
29 } 29 }
30 if (cnt % 16) 30 printk(KERN_INFO "\n");
31 printk(KERN_DEBUG "\n");
32} 31}
33 32
diff --git a/drivers/scsi/qla4xxx/ql4_def.h b/drivers/scsi/qla4xxx/ql4_def.h
index b586f27c3bd4..81b5f29254e2 100644
--- a/drivers/scsi/qla4xxx/ql4_def.h
+++ b/drivers/scsi/qla4xxx/ql4_def.h
@@ -100,7 +100,6 @@
100#define MAX_SRBS MAX_CMDS_TO_RISC 100#define MAX_SRBS MAX_CMDS_TO_RISC
101#define MBOX_AEN_REG_COUNT 5 101#define MBOX_AEN_REG_COUNT 5
102#define MAX_INIT_RETRIES 5 102#define MAX_INIT_RETRIES 5
103#define IOCB_HIWAT_CUSHION 16
104 103
105/* 104/*
106 * Buffer sizes 105 * Buffer sizes
@@ -184,6 +183,11 @@ struct srb {
184 uint16_t cc_stat; 183 uint16_t cc_stat;
185 u_long r_start; /* Time we recieve a cmd from OS */ 184 u_long r_start; /* Time we recieve a cmd from OS */
186 u_long u_start; /* Time when we handed the cmd to F/W */ 185 u_long u_start; /* Time when we handed the cmd to F/W */
186
187 /* Used for extended sense / status continuation */
188 uint8_t *req_sense_ptr;
189 uint16_t req_sense_len;
190 uint16_t reserved2;
187}; 191};
188 192
189/* 193/*
@@ -302,7 +306,6 @@ struct scsi_qla_host {
302 uint32_t tot_ddbs; 306 uint32_t tot_ddbs;
303 307
304 uint16_t iocb_cnt; 308 uint16_t iocb_cnt;
305 uint16_t iocb_hiwat;
306 309
307 /* SRB cache. */ 310 /* SRB cache. */
308#define SRB_MIN_REQ 128 311#define SRB_MIN_REQ 128
@@ -436,6 +439,8 @@ struct scsi_qla_host {
436 /* Map ddb_list entry by FW ddb index */ 439 /* Map ddb_list entry by FW ddb index */
437 struct ddb_entry *fw_ddb_index_map[MAX_DDB_ENTRIES]; 440 struct ddb_entry *fw_ddb_index_map[MAX_DDB_ENTRIES];
438 441
442 /* Saved srb for status continuation entry processing */
443 struct srb *status_srb;
439}; 444};
440 445
441static inline int is_qla4010(struct scsi_qla_host *ha) 446static inline int is_qla4010(struct scsi_qla_host *ha)
diff --git a/drivers/scsi/qla4xxx/ql4_fw.h b/drivers/scsi/qla4xxx/ql4_fw.h
index 1b667a70cffa..9cd7a608df38 100644
--- a/drivers/scsi/qla4xxx/ql4_fw.h
+++ b/drivers/scsi/qla4xxx/ql4_fw.h
@@ -572,6 +572,7 @@ struct conn_event_log_entry {
572 *************************************************************************/ 572 *************************************************************************/
573#define IOCB_MAX_CDB_LEN 16 /* Bytes in a CBD */ 573#define IOCB_MAX_CDB_LEN 16 /* Bytes in a CBD */
574#define IOCB_MAX_SENSEDATA_LEN 32 /* Bytes of sense data */ 574#define IOCB_MAX_SENSEDATA_LEN 32 /* Bytes of sense data */
575#define IOCB_MAX_EXT_SENSEDATA_LEN 60 /* Bytes of extended sense data */
575 576
576/* IOCB header structure */ 577/* IOCB header structure */
577struct qla4_header { 578struct qla4_header {
@@ -733,6 +734,12 @@ struct status_entry {
733 734
734}; 735};
735 736
737/* Status Continuation entry */
738struct status_cont_entry {
739 struct qla4_header hdr; /* 00-03 */
740 uint8_t ext_sense_data[IOCB_MAX_EXT_SENSEDATA_LEN]; /* 04-63 */
741};
742
736struct passthru0 { 743struct passthru0 {
737 struct qla4_header hdr; /* 00-03 */ 744 struct qla4_header hdr; /* 00-03 */
738 uint32_t handle; /* 04-07 */ 745 uint32_t handle; /* 04-07 */
diff --git a/drivers/scsi/qla4xxx/ql4_iocb.c b/drivers/scsi/qla4xxx/ql4_iocb.c
index 912a67494adf..e0c32159749c 100644
--- a/drivers/scsi/qla4xxx/ql4_iocb.c
+++ b/drivers/scsi/qla4xxx/ql4_iocb.c
@@ -10,9 +10,42 @@
10#include "ql4_dbg.h" 10#include "ql4_dbg.h"
11#include "ql4_inline.h" 11#include "ql4_inline.h"
12 12
13
14#include <scsi/scsi_tcq.h> 13#include <scsi/scsi_tcq.h>
15 14
15static int
16qla4xxx_space_in_req_ring(struct scsi_qla_host *ha, uint16_t req_cnt)
17{
18 uint16_t cnt;
19
20 /* Calculate number of free request entries. */
21 if ((req_cnt + 2) >= ha->req_q_count) {
22 cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out);
23 if (ha->request_in < cnt)
24 ha->req_q_count = cnt - ha->request_in;
25 else
26 ha->req_q_count = REQUEST_QUEUE_DEPTH -
27 (ha->request_in - cnt);
28 }
29
30 /* Check if room for request in request ring. */
31 if ((req_cnt + 2) < ha->req_q_count)
32 return 1;
33 else
34 return 0;
35}
36
37static void qla4xxx_advance_req_ring_ptr(struct scsi_qla_host *ha)
38{
39 /* Advance request queue pointer */
40 if (ha->request_in == (REQUEST_QUEUE_DEPTH - 1)) {
41 ha->request_in = 0;
42 ha->request_ptr = ha->request_ring;
43 } else {
44 ha->request_in++;
45 ha->request_ptr++;
46 }
47}
48
16/** 49/**
17 * qla4xxx_get_req_pkt - returns a valid entry in request queue. 50 * qla4xxx_get_req_pkt - returns a valid entry in request queue.
18 * @ha: Pointer to host adapter structure. 51 * @ha: Pointer to host adapter structure.
@@ -26,35 +59,18 @@
26static int qla4xxx_get_req_pkt(struct scsi_qla_host *ha, 59static int qla4xxx_get_req_pkt(struct scsi_qla_host *ha,
27 struct queue_entry **queue_entry) 60 struct queue_entry **queue_entry)
28{ 61{
29 uint16_t request_in; 62 uint16_t req_cnt = 1;
30 uint8_t status = QLA_SUCCESS;
31
32 *queue_entry = ha->request_ptr;
33 63
34 /* get the latest request_in and request_out index */ 64 if (qla4xxx_space_in_req_ring(ha, req_cnt)) {
35 request_in = ha->request_in; 65 *queue_entry = ha->request_ptr;
36 ha->request_out = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out);
37
38 /* Advance request queue pointer and check for queue full */
39 if (request_in == (REQUEST_QUEUE_DEPTH - 1)) {
40 request_in = 0;
41 ha->request_ptr = ha->request_ring;
42 } else {
43 request_in++;
44 ha->request_ptr++;
45 }
46
47 /* request queue is full, try again later */
48 if ((ha->iocb_cnt + 1) >= ha->iocb_hiwat) {
49 /* restore request pointer */
50 ha->request_ptr = *queue_entry;
51 status = QLA_ERROR;
52 } else {
53 ha->request_in = request_in;
54 memset(*queue_entry, 0, sizeof(**queue_entry)); 66 memset(*queue_entry, 0, sizeof(**queue_entry));
67
68 qla4xxx_advance_req_ring_ptr(ha);
69 ha->req_q_count -= req_cnt;
70 return QLA_SUCCESS;
55 } 71 }
56 72
57 return status; 73 return QLA_ERROR;
58} 74}
59 75
60/** 76/**
@@ -100,21 +116,14 @@ exit_send_marker:
100 return status; 116 return status;
101} 117}
102 118
103static struct continuation_t1_entry* qla4xxx_alloc_cont_entry( 119static struct continuation_t1_entry *
104 struct scsi_qla_host *ha) 120qla4xxx_alloc_cont_entry(struct scsi_qla_host *ha)
105{ 121{
106 struct continuation_t1_entry *cont_entry; 122 struct continuation_t1_entry *cont_entry;
107 123
108 cont_entry = (struct continuation_t1_entry *)ha->request_ptr; 124 cont_entry = (struct continuation_t1_entry *)ha->request_ptr;
109 125
110 /* Advance request queue pointer */ 126 qla4xxx_advance_req_ring_ptr(ha);
111 if (ha->request_in == (REQUEST_QUEUE_DEPTH - 1)) {
112 ha->request_in = 0;
113 ha->request_ptr = ha->request_ring;
114 } else {
115 ha->request_in++;
116 ha->request_ptr++;
117 }
118 127
119 /* Load packet defaults */ 128 /* Load packet defaults */
120 cont_entry->hdr.entryType = ET_CONTINUE; 129 cont_entry->hdr.entryType = ET_CONTINUE;
@@ -197,13 +206,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb)
197 struct scsi_cmnd *cmd = srb->cmd; 206 struct scsi_cmnd *cmd = srb->cmd;
198 struct ddb_entry *ddb_entry; 207 struct ddb_entry *ddb_entry;
199 struct command_t3_entry *cmd_entry; 208 struct command_t3_entry *cmd_entry;
200
201 int nseg; 209 int nseg;
202 uint16_t tot_dsds; 210 uint16_t tot_dsds;
203 uint16_t req_cnt; 211 uint16_t req_cnt;
204
205 unsigned long flags; 212 unsigned long flags;
206 uint16_t cnt;
207 uint32_t index; 213 uint32_t index;
208 char tag[2]; 214 char tag[2];
209 215
@@ -217,6 +223,19 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb)
217 223
218 index = (uint32_t)cmd->request->tag; 224 index = (uint32_t)cmd->request->tag;
219 225
226 /*
227 * Check to see if adapter is online before placing request on
228 * request queue. If a reset occurs and a request is in the queue,
229 * the firmware will still attempt to process the request, retrieving
230 * garbage for pointers.
231 */
232 if (!test_bit(AF_ONLINE, &ha->flags)) {
233 DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! "
234 "Do not issue command.\n",
235 ha->host_no, __func__));
236 goto queuing_error;
237 }
238
220 /* Calculate the number of request entries needed. */ 239 /* Calculate the number of request entries needed. */
221 nseg = scsi_dma_map(cmd); 240 nseg = scsi_dma_map(cmd);
222 if (nseg < 0) 241 if (nseg < 0)
@@ -224,17 +243,7 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb)
224 tot_dsds = nseg; 243 tot_dsds = nseg;
225 244
226 req_cnt = qla4xxx_calc_request_entries(tot_dsds); 245 req_cnt = qla4xxx_calc_request_entries(tot_dsds);
227 246 if (!qla4xxx_space_in_req_ring(ha, req_cnt))
228 if (ha->req_q_count < (req_cnt + 2)) {
229 cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out);
230 if (ha->request_in < cnt)
231 ha->req_q_count = cnt - ha->request_in;
232 else
233 ha->req_q_count = REQUEST_QUEUE_DEPTH -
234 (ha->request_in - cnt);
235 }
236
237 if (ha->req_q_count < (req_cnt + 2))
238 goto queuing_error; 247 goto queuing_error;
239 248
240 /* total iocbs active */ 249 /* total iocbs active */
@@ -286,32 +295,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb)
286 break; 295 break;
287 } 296 }
288 297
289 298 qla4xxx_advance_req_ring_ptr(ha);
290 /* Advance request queue pointer */
291 ha->request_in++;
292 if (ha->request_in == REQUEST_QUEUE_DEPTH) {
293 ha->request_in = 0;
294 ha->request_ptr = ha->request_ring;
295 } else
296 ha->request_ptr++;
297
298
299 qla4xxx_build_scsi_iocbs(srb, cmd_entry, tot_dsds); 299 qla4xxx_build_scsi_iocbs(srb, cmd_entry, tot_dsds);
300 wmb(); 300 wmb();
301 301
302 /*
303 * Check to see if adapter is online before placing request on
304 * request queue. If a reset occurs and a request is in the queue,
305 * the firmware will still attempt to process the request, retrieving
306 * garbage for pointers.
307 */
308 if (!test_bit(AF_ONLINE, &ha->flags)) {
309 DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! "
310 "Do not issue command.\n",
311 ha->host_no, __func__));
312 goto queuing_error;
313 }
314
315 srb->cmd->host_scribble = (unsigned char *)srb; 302 srb->cmd->host_scribble = (unsigned char *)srb;
316 303
317 /* update counters */ 304 /* update counters */
diff --git a/drivers/scsi/qla4xxx/ql4_isr.c b/drivers/scsi/qla4xxx/ql4_isr.c
index 799120fcb9be..8025ee16588e 100644
--- a/drivers/scsi/qla4xxx/ql4_isr.c
+++ b/drivers/scsi/qla4xxx/ql4_isr.c
@@ -11,6 +11,98 @@
11#include "ql4_inline.h" 11#include "ql4_inline.h"
12 12
13/** 13/**
14 * qla4xxx_copy_sense - copy sense data into cmd sense buffer
15 * @ha: Pointer to host adapter structure.
16 * @sts_entry: Pointer to status entry structure.
17 * @srb: Pointer to srb structure.
18 **/
19static void qla4xxx_copy_sense(struct scsi_qla_host *ha,
20 struct status_entry *sts_entry,
21 struct srb *srb)
22{
23 struct scsi_cmnd *cmd = srb->cmd;
24 uint16_t sense_len;
25
26 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
27 sense_len = le16_to_cpu(sts_entry->senseDataByteCnt);
28 if (sense_len == 0)
29 return;
30
31 /* Save total available sense length,
32 * not to exceed cmd's sense buffer size */
33 sense_len = min_t(uint16_t, sense_len, SCSI_SENSE_BUFFERSIZE);
34 srb->req_sense_ptr = cmd->sense_buffer;
35 srb->req_sense_len = sense_len;
36
37 /* Copy sense from sts_entry pkt */
38 sense_len = min_t(uint16_t, sense_len, IOCB_MAX_SENSEDATA_LEN);
39 memcpy(cmd->sense_buffer, sts_entry->senseData, sense_len);
40
41 DEBUG2(printk(KERN_INFO "scsi%ld:%d:%d:%d: %s: sense key = %x, "
42 "ASL= %02x, ASC/ASCQ = %02x/%02x\n", ha->host_no,
43 cmd->device->channel, cmd->device->id,
44 cmd->device->lun, __func__,
45 sts_entry->senseData[2] & 0x0f,
46 sts_entry->senseData[7],
47 sts_entry->senseData[12],
48 sts_entry->senseData[13]));
49
50 DEBUG5(qla4xxx_dump_buffer(cmd->sense_buffer, sense_len));
51 srb->flags |= SRB_GOT_SENSE;
52
53 /* Update srb, in case a sts_cont pkt follows */
54 srb->req_sense_ptr += sense_len;
55 srb->req_sense_len -= sense_len;
56 if (srb->req_sense_len != 0)
57 ha->status_srb = srb;
58 else
59 ha->status_srb = NULL;
60}
61
62/**
63 * qla4xxx_status_cont_entry - Process a Status Continuations entry.
64 * @ha: SCSI driver HA context
65 * @sts_cont: Entry pointer
66 *
67 * Extended sense data.
68 */
69static void
70qla4xxx_status_cont_entry(struct scsi_qla_host *ha,
71 struct status_cont_entry *sts_cont)
72{
73 struct srb *srb = ha->status_srb;
74 struct scsi_cmnd *cmd;
75 uint8_t sense_len;
76
77 if (srb == NULL)
78 return;
79
80 cmd = srb->cmd;
81 if (cmd == NULL) {
82 DEBUG2(printk(KERN_INFO "scsi%ld: %s: Cmd already returned "
83 "back to OS srb=%p srb->state:%d\n", ha->host_no,
84 __func__, srb, srb->state));
85 ha->status_srb = NULL;
86 return;
87 }
88
89 /* Copy sense data. */
90 sense_len = min_t(uint16_t, srb->req_sense_len,
91 IOCB_MAX_EXT_SENSEDATA_LEN);
92 memcpy(srb->req_sense_ptr, sts_cont->ext_sense_data, sense_len);
93 DEBUG5(qla4xxx_dump_buffer(srb->req_sense_ptr, sense_len));
94
95 srb->req_sense_ptr += sense_len;
96 srb->req_sense_len -= sense_len;
97
98 /* Place command on done queue. */
99 if (srb->req_sense_len == 0) {
100 qla4xxx_srb_compl(ha, srb);
101 ha->status_srb = NULL;
102 }
103}
104
105/**
14 * qla4xxx_status_entry - processes status IOCBs 106 * qla4xxx_status_entry - processes status IOCBs
15 * @ha: Pointer to host adapter structure. 107 * @ha: Pointer to host adapter structure.
16 * @sts_entry: Pointer to status entry structure. 108 * @sts_entry: Pointer to status entry structure.
@@ -23,7 +115,6 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha,
23 struct srb *srb; 115 struct srb *srb;
24 struct ddb_entry *ddb_entry; 116 struct ddb_entry *ddb_entry;
25 uint32_t residual; 117 uint32_t residual;
26 uint16_t sensebytecnt;
27 118
28 srb = qla4xxx_del_from_active_array(ha, le32_to_cpu(sts_entry->handle)); 119 srb = qla4xxx_del_from_active_array(ha, le32_to_cpu(sts_entry->handle));
29 if (!srb) { 120 if (!srb) {
@@ -92,24 +183,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha,
92 break; 183 break;
93 184
94 /* Copy Sense Data into sense buffer. */ 185 /* Copy Sense Data into sense buffer. */
95 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 186 qla4xxx_copy_sense(ha, sts_entry, srb);
96
97 sensebytecnt = le16_to_cpu(sts_entry->senseDataByteCnt);
98 if (sensebytecnt == 0)
99 break;
100
101 memcpy(cmd->sense_buffer, sts_entry->senseData,
102 min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE));
103
104 DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, "
105 "ASC/ASCQ = %02x/%02x\n", ha->host_no,
106 cmd->device->channel, cmd->device->id,
107 cmd->device->lun, __func__,
108 sts_entry->senseData[2] & 0x0f,
109 sts_entry->senseData[12],
110 sts_entry->senseData[13]));
111
112 srb->flags |= SRB_GOT_SENSE;
113 break; 187 break;
114 188
115 case SCS_INCOMPLETE: 189 case SCS_INCOMPLETE:
@@ -176,23 +250,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha,
176 break; 250 break;
177 251
178 /* Copy Sense Data into sense buffer. */ 252 /* Copy Sense Data into sense buffer. */
179 memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 253 qla4xxx_copy_sense(ha, sts_entry, srb);
180
181 sensebytecnt =
182 le16_to_cpu(sts_entry->senseDataByteCnt);
183 if (sensebytecnt == 0)
184 break;
185
186 memcpy(cmd->sense_buffer, sts_entry->senseData,
187 min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE));
188
189 DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, "
190 "ASC/ASCQ = %02x/%02x\n", ha->host_no,
191 cmd->device->channel, cmd->device->id,
192 cmd->device->lun, __func__,
193 sts_entry->senseData[2] & 0x0f,
194 sts_entry->senseData[12],
195 sts_entry->senseData[13]));
196 } else { 254 } else {
197 /* 255 /*
198 * If RISC reports underrun and target does not 256 * If RISC reports underrun and target does not
@@ -268,9 +326,10 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha,
268 326
269status_entry_exit: 327status_entry_exit:
270 328
271 /* complete the request */ 329 /* complete the request, if not waiting for status_continuation pkt */
272 srb->cc_stat = sts_entry->completionStatus; 330 srb->cc_stat = sts_entry->completionStatus;
273 qla4xxx_srb_compl(ha, srb); 331 if (ha->status_srb == NULL)
332 qla4xxx_srb_compl(ha, srb);
274} 333}
275 334
276/** 335/**
@@ -305,10 +364,7 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha)
305 /* process entry */ 364 /* process entry */
306 switch (sts_entry->hdr.entryType) { 365 switch (sts_entry->hdr.entryType) {
307 case ET_STATUS: 366 case ET_STATUS:
308 /* 367 /* Common status */
309 * Common status - Single completion posted in single
310 * IOSB.
311 */
312 qla4xxx_status_entry(ha, sts_entry); 368 qla4xxx_status_entry(ha, sts_entry);
313 break; 369 break;
314 370
@@ -316,9 +372,8 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha)
316 break; 372 break;
317 373
318 case ET_STATUS_CONTINUATION: 374 case ET_STATUS_CONTINUATION:
319 /* Just throw away the status continuation entries */ 375 qla4xxx_status_cont_entry(ha,
320 DEBUG2(printk("scsi%ld: %s: Status Continuation entry " 376 (struct status_cont_entry *) sts_entry);
321 "- ignoring\n", ha->host_no, __func__));
322 break; 377 break;
323 378
324 case ET_COMMAND: 379 case ET_COMMAND:
diff --git a/drivers/scsi/qla4xxx/ql4_mbx.c b/drivers/scsi/qla4xxx/ql4_mbx.c
index 051b0f5e8c8e..09d6d4b76f39 100644
--- a/drivers/scsi/qla4xxx/ql4_mbx.c
+++ b/drivers/scsi/qla4xxx/ql4_mbx.c
@@ -385,16 +385,6 @@ int qla4xxx_get_firmware_status(struct scsi_qla_host * ha)
385 mbox_sts[0])); 385 mbox_sts[0]));
386 return QLA_ERROR; 386 return QLA_ERROR;
387 } 387 }
388
389 /* High-water mark of IOCBs */
390 ha->iocb_hiwat = mbox_sts[2];
391 if (ha->iocb_hiwat > IOCB_HIWAT_CUSHION)
392 ha->iocb_hiwat -= IOCB_HIWAT_CUSHION;
393 else
394 dev_info(&ha->pdev->dev, "WARNING!!! You have less than %d "
395 "firmware IOCBs available (%d).\n",
396 IOCB_HIWAT_CUSHION, ha->iocb_hiwat);
397
398 return QLA_SUCCESS; 388 return QLA_SUCCESS;
399} 389}
400 390
diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c
index ec9da6ce8489..40e3cafb3a9c 100644
--- a/drivers/scsi/qla4xxx/ql4_os.c
+++ b/drivers/scsi/qla4xxx/ql4_os.c
@@ -66,6 +66,7 @@ static int qla4xxx_sess_get_param(struct iscsi_cls_session *sess,
66static int qla4xxx_host_get_param(struct Scsi_Host *shost, 66static int qla4xxx_host_get_param(struct Scsi_Host *shost,
67 enum iscsi_host_param param, char *buf); 67 enum iscsi_host_param param, char *buf);
68static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session); 68static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session);
69static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc);
69 70
70/* 71/*
71 * SCSI host template entry points 72 * SCSI host template entry points
@@ -89,6 +90,7 @@ static struct scsi_host_template qla4xxx_driver_template = {
89 .eh_device_reset_handler = qla4xxx_eh_device_reset, 90 .eh_device_reset_handler = qla4xxx_eh_device_reset,
90 .eh_target_reset_handler = qla4xxx_eh_target_reset, 91 .eh_target_reset_handler = qla4xxx_eh_target_reset,
91 .eh_host_reset_handler = qla4xxx_eh_host_reset, 92 .eh_host_reset_handler = qla4xxx_eh_host_reset,
93 .eh_timed_out = qla4xxx_eh_cmd_timed_out,
92 94
93 .slave_configure = qla4xxx_slave_configure, 95 .slave_configure = qla4xxx_slave_configure,
94 .slave_alloc = qla4xxx_slave_alloc, 96 .slave_alloc = qla4xxx_slave_alloc,
@@ -124,6 +126,21 @@ static struct iscsi_transport qla4xxx_iscsi_transport = {
124 126
125static struct scsi_transport_template *qla4xxx_scsi_transport; 127static struct scsi_transport_template *qla4xxx_scsi_transport;
126 128
129static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc)
130{
131 struct iscsi_cls_session *session;
132 struct ddb_entry *ddb_entry;
133
134 session = starget_to_session(scsi_target(sc->device));
135 ddb_entry = session->dd_data;
136
137 /* if we are not logged in then the LLD is going to clean up the cmd */
138 if (atomic_read(&ddb_entry->state) != DDB_STATE_ONLINE)
139 return BLK_EH_RESET_TIMER;
140 else
141 return BLK_EH_NOT_HANDLED;
142}
143
127static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session) 144static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session)
128{ 145{
129 struct ddb_entry *ddb_entry = session->dd_data; 146 struct ddb_entry *ddb_entry = session->dd_data;
@@ -904,18 +921,17 @@ static int qla4xxx_recover_adapter(struct scsi_qla_host *ha,
904 /* Flush any pending ddb changed AENs */ 921 /* Flush any pending ddb changed AENs */
905 qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); 922 qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS);
906 923
924 qla4xxx_flush_active_srbs(ha);
925
907 /* Reset the firmware. If successful, function 926 /* Reset the firmware. If successful, function
908 * returns with ISP interrupts enabled. 927 * returns with ISP interrupts enabled.
909 */ 928 */
910 if (status == QLA_SUCCESS) { 929 DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n",
911 DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n", 930 ha->host_no, __func__));
912 ha->host_no, __func__)); 931 if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS)
913 qla4xxx_flush_active_srbs(ha); 932 status = qla4xxx_soft_reset(ha);
914 if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS) 933 else
915 status = qla4xxx_soft_reset(ha); 934 status = QLA_ERROR;
916 else
917 status = QLA_ERROR;
918 }
919 935
920 /* Flush any pending ddb changed AENs */ 936 /* Flush any pending ddb changed AENs */
921 qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); 937 qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS);
@@ -1527,11 +1543,9 @@ static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd)
1527{ 1543{
1528 struct scsi_qla_host *ha = to_qla_host(cmd->device->host); 1544 struct scsi_qla_host *ha = to_qla_host(cmd->device->host);
1529 struct ddb_entry *ddb_entry = cmd->device->hostdata; 1545 struct ddb_entry *ddb_entry = cmd->device->hostdata;
1530 struct srb *sp;
1531 int ret = FAILED, stat; 1546 int ret = FAILED, stat;
1532 1547
1533 sp = (struct srb *) cmd->SCp.ptr; 1548 if (!ddb_entry)
1534 if (!sp || !ddb_entry)
1535 return ret; 1549 return ret;
1536 1550
1537 dev_info(&ha->pdev->dev, 1551 dev_info(&ha->pdev->dev,
@@ -1644,7 +1658,7 @@ static int qla4xxx_eh_host_reset(struct scsi_cmnd *cmd)
1644 ha = (struct scsi_qla_host *) cmd->device->host->hostdata; 1658 ha = (struct scsi_qla_host *) cmd->device->host->hostdata;
1645 1659
1646 dev_info(&ha->pdev->dev, 1660 dev_info(&ha->pdev->dev,
1647 "scsi(%ld:%d:%d:%d): ADAPTER RESET ISSUED.\n", ha->host_no, 1661 "scsi(%ld:%d:%d:%d): HOST RESET ISSUED.\n", ha->host_no,
1648 cmd->device->channel, cmd->device->id, cmd->device->lun); 1662 cmd->device->channel, cmd->device->id, cmd->device->lun);
1649 1663
1650 if (qla4xxx_wait_for_hba_online(ha) != QLA_SUCCESS) { 1664 if (qla4xxx_wait_for_hba_online(ha) != QLA_SUCCESS) {
diff --git a/drivers/scsi/qla4xxx/ql4_version.h b/drivers/scsi/qla4xxx/ql4_version.h
index ab984cb89cea..6980cb279c81 100644
--- a/drivers/scsi/qla4xxx/ql4_version.h
+++ b/drivers/scsi/qla4xxx/ql4_version.h
@@ -5,5 +5,5 @@
5 * See LICENSE.qla4xxx for copyright and licensing details. 5 * See LICENSE.qla4xxx for copyright and licensing details.
6 */ 6 */
7 7
8#define QLA4XXX_DRIVER_VERSION "5.01.00-k8" 8#define QLA4XXX_DRIVER_VERSION "5.01.00-k9"
9 9
diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c
index 783e33c65eb7..b47240ca4b19 100644
--- a/drivers/scsi/scsi_transport_iscsi.c
+++ b/drivers/scsi/scsi_transport_iscsi.c
@@ -990,7 +990,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
990 struct iscsi_uevent *ev; 990 struct iscsi_uevent *ev;
991 int len = NLMSG_SPACE(sizeof(*ev) + data_size); 991 int len = NLMSG_SPACE(sizeof(*ev) + data_size);
992 992
993 skb = alloc_skb(len, GFP_NOIO); 993 skb = alloc_skb(len, GFP_ATOMIC);
994 if (!skb) { 994 if (!skb) {
995 printk(KERN_ERR "can not deliver iscsi offload message:OOM\n"); 995 printk(KERN_ERR "can not deliver iscsi offload message:OOM\n");
996 return -ENOMEM; 996 return -ENOMEM;
@@ -1012,7 +1012,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost,
1012 1012
1013 memcpy((char *)ev + sizeof(*ev), data, data_size); 1013 memcpy((char *)ev + sizeof(*ev), data, data_size);
1014 1014
1015 return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_NOIO); 1015 return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC);
1016} 1016}
1017EXPORT_SYMBOL_GPL(iscsi_offload_mesg); 1017EXPORT_SYMBOL_GPL(iscsi_offload_mesg);
1018 1018
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 5616cd780ff3..b7b9fec67a98 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1840,6 +1840,18 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp)
1840 kfree(buffer); 1840 kfree(buffer);
1841} 1841}
1842 1842
1843static int sd_try_extended_inquiry(struct scsi_device *sdp)
1844{
1845 /*
1846 * Although VPD inquiries can go to SCSI-2 type devices,
1847 * some USB ones crash on receiving them, and the pages
1848 * we currently ask for are for SPC-3 and beyond
1849 */
1850 if (sdp->scsi_level > SCSI_SPC_2)
1851 return 1;
1852 return 0;
1853}
1854
1843/** 1855/**
1844 * sd_revalidate_disk - called the first time a new disk is seen, 1856 * sd_revalidate_disk - called the first time a new disk is seen,
1845 * performs disk spin up, read_capacity, etc. 1857 * performs disk spin up, read_capacity, etc.
@@ -1877,8 +1889,12 @@ static int sd_revalidate_disk(struct gendisk *disk)
1877 */ 1889 */
1878 if (sdkp->media_present) { 1890 if (sdkp->media_present) {
1879 sd_read_capacity(sdkp, buffer); 1891 sd_read_capacity(sdkp, buffer);
1880 sd_read_block_limits(sdkp); 1892
1881 sd_read_block_characteristics(sdkp); 1893 if (sd_try_extended_inquiry(sdp)) {
1894 sd_read_block_limits(sdkp);
1895 sd_read_block_characteristics(sdkp);
1896 }
1897
1882 sd_read_write_protect_flag(sdkp, buffer); 1898 sd_read_write_protect_flag(sdkp, buffer);
1883 sd_read_cache_type(sdkp, buffer); 1899 sd_read_cache_type(sdkp, buffer);
1884 sd_read_app_tag_own(sdkp, buffer); 1900 sd_read_app_tag_own(sdkp, buffer);
diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
index 141c0a3333ad..a9802e76b5fa 100644
--- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c
+++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c
@@ -132,7 +132,7 @@ int cpm_uart_allocbuf(struct uart_cpm_port *pinfo, unsigned int is_con)
132 memsz = L1_CACHE_ALIGN(pinfo->rx_nrfifos * pinfo->rx_fifosize) + 132 memsz = L1_CACHE_ALIGN(pinfo->rx_nrfifos * pinfo->rx_fifosize) +
133 L1_CACHE_ALIGN(pinfo->tx_nrfifos * pinfo->tx_fifosize); 133 L1_CACHE_ALIGN(pinfo->tx_nrfifos * pinfo->tx_fifosize);
134 if (is_con) { 134 if (is_con) {
135 mem_addr = alloc_bootmem(memsz); 135 mem_addr = kzalloc(memsz, GFP_NOWAIT);
136 dma_addr = virt_to_bus(mem_addr); 136 dma_addr = virt_to_bus(mem_addr);
137 } 137 }
138 else 138 else
diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c
index ef7870f5ea08..857b3668b3ba 100644
--- a/drivers/video/console/sticore.c
+++ b/drivers/video/console/sticore.c
@@ -957,9 +957,14 @@ static int __devinit sticore_pci_init(struct pci_dev *pd,
957#ifdef CONFIG_PCI 957#ifdef CONFIG_PCI
958 unsigned long fb_base, rom_base; 958 unsigned long fb_base, rom_base;
959 unsigned int fb_len, rom_len; 959 unsigned int fb_len, rom_len;
960 int err;
960 struct sti_struct *sti; 961 struct sti_struct *sti;
961 962
962 pci_enable_device(pd); 963 err = pci_enable_device(pd);
964 if (err < 0) {
965 dev_err(&pd->dev, "Cannot enable PCI device\n");
966 return err;
967 }
963 968
964 fb_base = pci_resource_start(pd, 0); 969 fb_base = pci_resource_start(pd, 0);
965 fb_len = pci_resource_len(pd, 0); 970 fb_len = pci_resource_len(pd, 0);
@@ -1048,7 +1053,7 @@ static void __devinit sti_init_roms(void)
1048 1053
1049 /* Register drivers for native & PCI cards */ 1054 /* Register drivers for native & PCI cards */
1050 register_parisc_driver(&pa_sti_driver); 1055 register_parisc_driver(&pa_sti_driver);
1051 pci_register_driver(&pci_sti_driver); 1056 WARN_ON(pci_register_driver(&pci_sti_driver));
1052 1057
1053 /* if we didn't find the given default sti, take the first one */ 1058 /* if we didn't find the given default sti, take the first one */
1054 if (!default_sti) 1059 if (!default_sti)
diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c
index bcec78ffc765..248e00ec4dc1 100644
--- a/drivers/virtio/virtio_pci.c
+++ b/drivers/virtio/virtio_pci.c
@@ -52,8 +52,10 @@ struct virtio_pci_device
52 char (*msix_names)[256]; 52 char (*msix_names)[256];
53 /* Number of available vectors */ 53 /* Number of available vectors */
54 unsigned msix_vectors; 54 unsigned msix_vectors;
55 /* Vectors allocated */ 55 /* Vectors allocated, excluding per-vq vectors if any */
56 unsigned msix_used_vectors; 56 unsigned msix_used_vectors;
57 /* Whether we have vector per vq */
58 bool per_vq_vectors;
57}; 59};
58 60
59/* Constants for MSI-X */ 61/* Constants for MSI-X */
@@ -258,7 +260,6 @@ static void vp_free_vectors(struct virtio_device *vdev)
258 260
259 for (i = 0; i < vp_dev->msix_used_vectors; ++i) 261 for (i = 0; i < vp_dev->msix_used_vectors; ++i)
260 free_irq(vp_dev->msix_entries[i].vector, vp_dev); 262 free_irq(vp_dev->msix_entries[i].vector, vp_dev);
261 vp_dev->msix_used_vectors = 0;
262 263
263 if (vp_dev->msix_enabled) { 264 if (vp_dev->msix_enabled) {
264 /* Disable the vector used for configuration */ 265 /* Disable the vector used for configuration */
@@ -267,80 +268,77 @@ static void vp_free_vectors(struct virtio_device *vdev)
267 /* Flush the write out to device */ 268 /* Flush the write out to device */
268 ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); 269 ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
269 270
270 vp_dev->msix_enabled = 0;
271 pci_disable_msix(vp_dev->pci_dev); 271 pci_disable_msix(vp_dev->pci_dev);
272 vp_dev->msix_enabled = 0;
273 vp_dev->msix_vectors = 0;
272 } 274 }
273}
274 275
275static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, 276 vp_dev->msix_used_vectors = 0;
276 int *options, int noptions) 277 kfree(vp_dev->msix_names);
277{ 278 vp_dev->msix_names = NULL;
278 int i; 279 kfree(vp_dev->msix_entries);
279 for (i = 0; i < noptions; ++i) 280 vp_dev->msix_entries = NULL;
280 if (!pci_enable_msix(dev, entries, options[i]))
281 return options[i];
282 return -EBUSY;
283} 281}
284 282
285static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) 283static int vp_request_vectors(struct virtio_device *vdev, int nvectors,
284 bool per_vq_vectors)
286{ 285{
287 struct virtio_pci_device *vp_dev = to_vp_device(vdev); 286 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
288 const char *name = dev_name(&vp_dev->vdev.dev); 287 const char *name = dev_name(&vp_dev->vdev.dev);
289 unsigned i, v; 288 unsigned i, v;
290 int err = -ENOMEM; 289 int err = -ENOMEM;
291 /* We want at most one vector per queue and one for config changes. 290
292 * Fallback to separate vectors for config and a shared for queues. 291 if (!nvectors) {
293 * Finally fall back to regular interrupts. */ 292 /* Can't allocate MSI-X vectors, use regular interrupt */
294 int options[] = { max_vqs + 1, 2 }; 293 vp_dev->msix_vectors = 0;
295 int nvectors = max(options[0], options[1]); 294 err = request_irq(vp_dev->pci_dev->irq, vp_interrupt,
295 IRQF_SHARED, name, vp_dev);
296 if (err)
297 return err;
298 vp_dev->intx_enabled = 1;
299 return 0;
300 }
296 301
297 vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, 302 vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries,
298 GFP_KERNEL); 303 GFP_KERNEL);
299 if (!vp_dev->msix_entries) 304 if (!vp_dev->msix_entries)
300 goto error_entries; 305 goto error;
301 vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, 306 vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names,
302 GFP_KERNEL); 307 GFP_KERNEL);
303 if (!vp_dev->msix_names) 308 if (!vp_dev->msix_names)
304 goto error_names; 309 goto error;
305 310
306 for (i = 0; i < nvectors; ++i) 311 for (i = 0; i < nvectors; ++i)
307 vp_dev->msix_entries[i].entry = i; 312 vp_dev->msix_entries[i].entry = i;
308 313
309 err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, 314 err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors);
310 options, ARRAY_SIZE(options)); 315 if (err > 0)
311 if (err < 0) { 316 err = -ENOSPC;
312 /* Can't allocate enough MSI-X vectors, use regular interrupt */ 317 if (err)
313 vp_dev->msix_vectors = 0; 318 goto error;
314 err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, 319 vp_dev->msix_vectors = nvectors;
315 IRQF_SHARED, name, vp_dev); 320 vp_dev->msix_enabled = 1;
316 if (err) 321
317 goto error_irq; 322 /* Set the vector used for configuration */
318 vp_dev->intx_enabled = 1; 323 v = vp_dev->msix_used_vectors;
319 } else { 324 snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
320 vp_dev->msix_vectors = err; 325 "%s-config", name);
321 vp_dev->msix_enabled = 1; 326 err = request_irq(vp_dev->msix_entries[v].vector,
322 327 vp_config_changed, 0, vp_dev->msix_names[v],
323 /* Set the vector used for configuration */ 328 vp_dev);
324 v = vp_dev->msix_used_vectors; 329 if (err)
325 snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, 330 goto error;
326 "%s-config", name); 331 ++vp_dev->msix_used_vectors;
327 err = request_irq(vp_dev->msix_entries[v].vector, 332
328 vp_config_changed, 0, vp_dev->msix_names[v], 333 iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
329 vp_dev); 334 /* Verify we had enough resources to assign the vector */
330 if (err) 335 v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
331 goto error_irq; 336 if (v == VIRTIO_MSI_NO_VECTOR) {
332 ++vp_dev->msix_used_vectors; 337 err = -EBUSY;
333 338 goto error;
334 iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
335 /* Verify we had enough resources to assign the vector */
336 v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR);
337 if (v == VIRTIO_MSI_NO_VECTOR) {
338 err = -EBUSY;
339 goto error_irq;
340 }
341 } 339 }
342 340
343 if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { 341 if (!per_vq_vectors) {
344 /* Shared vector for all VQs */ 342 /* Shared vector for all VQs */
345 v = vp_dev->msix_used_vectors; 343 v = vp_dev->msix_used_vectors;
346 snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, 344 snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names,
@@ -349,28 +347,25 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs)
349 vp_vring_interrupt, 0, vp_dev->msix_names[v], 347 vp_vring_interrupt, 0, vp_dev->msix_names[v],
350 vp_dev); 348 vp_dev);
351 if (err) 349 if (err)
352 goto error_irq; 350 goto error;
353 ++vp_dev->msix_used_vectors; 351 ++vp_dev->msix_used_vectors;
354 } 352 }
355 return 0; 353 return 0;
356error_irq: 354error:
357 vp_free_vectors(vdev); 355 vp_free_vectors(vdev);
358 kfree(vp_dev->msix_names);
359error_names:
360 kfree(vp_dev->msix_entries);
361error_entries:
362 return err; 356 return err;
363} 357}
364 358
365static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, 359static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
366 void (*callback)(struct virtqueue *vq), 360 void (*callback)(struct virtqueue *vq),
367 const char *name) 361 const char *name,
362 u16 vector)
368{ 363{
369 struct virtio_pci_device *vp_dev = to_vp_device(vdev); 364 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
370 struct virtio_pci_vq_info *info; 365 struct virtio_pci_vq_info *info;
371 struct virtqueue *vq; 366 struct virtqueue *vq;
372 unsigned long flags, size; 367 unsigned long flags, size;
373 u16 num, vector; 368 u16 num;
374 int err; 369 int err;
375 370
376 /* Select the queue we're interested in */ 371 /* Select the queue we're interested in */
@@ -389,7 +384,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
389 384
390 info->queue_index = index; 385 info->queue_index = index;
391 info->num = num; 386 info->num = num;
392 info->vector = VIRTIO_MSI_NO_VECTOR; 387 info->vector = vector;
393 388
394 size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); 389 size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN));
395 info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); 390 info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO);
@@ -413,22 +408,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
413 vq->priv = info; 408 vq->priv = info;
414 info->vq = vq; 409 info->vq = vq;
415 410
416 /* allocate per-vq vector if available and necessary */ 411 if (vector != VIRTIO_MSI_NO_VECTOR) {
417 if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) {
418 vector = vp_dev->msix_used_vectors;
419 snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names,
420 "%s-%s", dev_name(&vp_dev->vdev.dev), name);
421 err = request_irq(vp_dev->msix_entries[vector].vector,
422 vring_interrupt, 0,
423 vp_dev->msix_names[vector], vq);
424 if (err)
425 goto out_request_irq;
426 info->vector = vector;
427 ++vp_dev->msix_used_vectors;
428 } else
429 vector = VP_MSIX_VQ_VECTOR;
430
431 if (callback && vp_dev->msix_enabled) {
432 iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); 412 iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
433 vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); 413 vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR);
434 if (vector == VIRTIO_MSI_NO_VECTOR) { 414 if (vector == VIRTIO_MSI_NO_VECTOR) {
@@ -444,11 +424,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index,
444 return vq; 424 return vq;
445 425
446out_assign: 426out_assign:
447 if (info->vector != VIRTIO_MSI_NO_VECTOR) {
448 free_irq(vp_dev->msix_entries[info->vector].vector, vq);
449 --vp_dev->msix_used_vectors;
450 }
451out_request_irq:
452 vring_del_virtqueue(vq); 427 vring_del_virtqueue(vq);
453out_activate_queue: 428out_activate_queue:
454 iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); 429 iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN);
@@ -462,12 +437,13 @@ static void vp_del_vq(struct virtqueue *vq)
462{ 437{
463 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); 438 struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev);
464 struct virtio_pci_vq_info *info = vq->priv; 439 struct virtio_pci_vq_info *info = vq->priv;
465 unsigned long size; 440 unsigned long flags, size;
466 441
467 iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); 442 spin_lock_irqsave(&vp_dev->lock, flags);
443 list_del(&info->node);
444 spin_unlock_irqrestore(&vp_dev->lock, flags);
468 445
469 if (info->vector != VIRTIO_MSI_NO_VECTOR) 446 iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL);
470 free_irq(vp_dev->msix_entries[info->vector].vector, vq);
471 447
472 if (vp_dev->msix_enabled) { 448 if (vp_dev->msix_enabled) {
473 iowrite16(VIRTIO_MSI_NO_VECTOR, 449 iowrite16(VIRTIO_MSI_NO_VECTOR,
@@ -489,36 +465,62 @@ static void vp_del_vq(struct virtqueue *vq)
489/* the config->del_vqs() implementation */ 465/* the config->del_vqs() implementation */
490static void vp_del_vqs(struct virtio_device *vdev) 466static void vp_del_vqs(struct virtio_device *vdev)
491{ 467{
468 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
492 struct virtqueue *vq, *n; 469 struct virtqueue *vq, *n;
470 struct virtio_pci_vq_info *info;
493 471
494 list_for_each_entry_safe(vq, n, &vdev->vqs, list) 472 list_for_each_entry_safe(vq, n, &vdev->vqs, list) {
473 info = vq->priv;
474 if (vp_dev->per_vq_vectors)
475 free_irq(vp_dev->msix_entries[info->vector].vector, vq);
495 vp_del_vq(vq); 476 vp_del_vq(vq);
477 }
478 vp_dev->per_vq_vectors = false;
496 479
497 vp_free_vectors(vdev); 480 vp_free_vectors(vdev);
498} 481}
499 482
500/* the config->find_vqs() implementation */ 483static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs,
501static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, 484 struct virtqueue *vqs[],
502 struct virtqueue *vqs[], 485 vq_callback_t *callbacks[],
503 vq_callback_t *callbacks[], 486 const char *names[],
504 const char *names[]) 487 int nvectors,
488 bool per_vq_vectors)
505{ 489{
506 int vectors = 0; 490 struct virtio_pci_device *vp_dev = to_vp_device(vdev);
507 int i, err; 491 u16 vector;
508 492 int i, err, allocated_vectors;
509 /* How many vectors would we like? */
510 for (i = 0; i < nvqs; ++i)
511 if (callbacks[i])
512 ++vectors;
513 493
514 err = vp_request_vectors(vdev, vectors); 494 err = vp_request_vectors(vdev, nvectors, per_vq_vectors);
515 if (err) 495 if (err)
516 goto error_request; 496 goto error_request;
517 497
498 vp_dev->per_vq_vectors = per_vq_vectors;
499 allocated_vectors = vp_dev->msix_used_vectors;
518 for (i = 0; i < nvqs; ++i) { 500 for (i = 0; i < nvqs; ++i) {
519 vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); 501 if (!callbacks[i] || !vp_dev->msix_enabled)
520 if (IS_ERR(vqs[i])) 502 vector = VIRTIO_MSI_NO_VECTOR;
503 else if (vp_dev->per_vq_vectors)
504 vector = allocated_vectors++;
505 else
506 vector = VP_MSIX_VQ_VECTOR;
507 vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector);
508 if (IS_ERR(vqs[i])) {
509 err = PTR_ERR(vqs[i]);
521 goto error_find; 510 goto error_find;
511 }
512 /* allocate per-vq irq if available and necessary */
513 if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) {
514 snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names,
515 "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]);
516 err = request_irq(vp_dev->msix_entries[vector].vector,
517 vring_interrupt, 0,
518 vp_dev->msix_names[vector], vqs[i]);
519 if (err) {
520 vp_del_vq(vqs[i]);
521 goto error_find;
522 }
523 }
522 } 524 }
523 return 0; 525 return 0;
524 526
@@ -526,7 +528,37 @@ error_find:
526 vp_del_vqs(vdev); 528 vp_del_vqs(vdev);
527 529
528error_request: 530error_request:
529 return PTR_ERR(vqs[i]); 531 return err;
532}
533
534/* the config->find_vqs() implementation */
535static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs,
536 struct virtqueue *vqs[],
537 vq_callback_t *callbacks[],
538 const char *names[])
539{
540 int vectors = 0;
541 int i, uninitialized_var(err);
542
543 /* How many vectors would we like? */
544 for (i = 0; i < nvqs; ++i)
545 if (callbacks[i])
546 ++vectors;
547
548 /* We want at most one vector per queue and one for config changes. */
549 err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
550 vectors + 1, true);
551 if (!err)
552 return 0;
553 /* Fallback to separate vectors for config and a shared for queues. */
554 err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
555 2, false);
556 if (!err)
557 return 0;
558 /* Finally fall back to regular interrupts. */
559 err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names,
560 0, false);
561 return err;
530} 562}
531 563
532static struct virtio_config_ops virtio_pci_config_ops = { 564static struct virtio_config_ops virtio_pci_config_ops = {
diff --git a/drivers/watchdog/coh901327_wdt.c b/drivers/watchdog/coh901327_wdt.c
index fecb307d28e9..aec7cefdef21 100644
--- a/drivers/watchdog/coh901327_wdt.c
+++ b/drivers/watchdog/coh901327_wdt.c
@@ -18,6 +18,7 @@
18#include <linux/bitops.h> 18#include <linux/bitops.h>
19#include <linux/uaccess.h> 19#include <linux/uaccess.h>
20#include <linux/clk.h> 20#include <linux/clk.h>
21#include <linux/delay.h>
21 22
22#define DRV_NAME "WDOG COH 901 327" 23#define DRV_NAME "WDOG COH 901 327"
23 24
@@ -92,6 +93,8 @@ static struct clk *clk;
92static void coh901327_enable(u16 timeout) 93static void coh901327_enable(u16 timeout)
93{ 94{
94 u16 val; 95 u16 val;
96 unsigned long freq;
97 unsigned long delay_ns;
95 98
96 clk_enable(clk); 99 clk_enable(clk);
97 /* Restart timer if it is disabled */ 100 /* Restart timer if it is disabled */
@@ -102,6 +105,14 @@ static void coh901327_enable(u16 timeout)
102 /* Acknowledge any pending interrupt so it doesn't just fire off */ 105 /* Acknowledge any pending interrupt so it doesn't just fire off */
103 writew(U300_WDOG_IER_WILL_BARK_IRQ_ACK_ENABLE, 106 writew(U300_WDOG_IER_WILL_BARK_IRQ_ACK_ENABLE,
104 virtbase + U300_WDOG_IER); 107 virtbase + U300_WDOG_IER);
108 /*
109 * The interrupt is cleared in the 32 kHz clock domain.
110 * Wait 3 32 kHz cycles for it to take effect
111 */
112 freq = clk_get_rate(clk);
113 delay_ns = (1000000000 + freq - 1) / freq; /* Freq to ns and round up */
114 delay_ns = 3 * delay_ns; /* Wait 3 cycles */
115 ndelay(delay_ns);
105 /* Enable the watchdog interrupt */ 116 /* Enable the watchdog interrupt */
106 writew(U300_WDOG_IMR_WILL_BARK_IRQ_ENABLE, virtbase + U300_WDOG_IMR); 117 writew(U300_WDOG_IMR_WILL_BARK_IRQ_ENABLE, virtbase + U300_WDOG_IMR);
107 /* Activate the watchdog timer */ 118 /* Activate the watchdog timer */