author		Shawn Lin <shawn.lin@rock-chips.com>	2018-05-30 21:12:37 -0400
committer	Bjorn Helgaas <bhelgaas@google.com>	2018-06-08 08:50:11 -0400
commit		6e0832fa432ec99c94caee733c8f5851cf85560b (patch)
tree		c4326f9e2d8ff1a6cb17e959fc5268c9e577ca94 /drivers/pci/controller/pci-hyperv.c
parent		3a3869f1c443383ef8354ffa0e5fb8df65d8b549 (diff)
PCI: Collect all native drivers under drivers/pci/controller/
Native PCI drivers for root complex devices were originally all in
drivers/pci/host/.  Some of these devices can also be operated in endpoint
mode.  Drivers for endpoint mode didn't seem to fit in the "host" directory,
so we put both the root complex and endpoint drivers in per-device
directories, e.g., drivers/pci/dwc/, drivers/pci/cadence/, etc.

These per-device directories contain trivial Kconfig and Makefiles and
clutter drivers/pci/.  Make a new drivers/pci/controller/ directory and
collect all the device-specific drivers there.

No functional change intended.

Link: https://lkml.kernel.org/r/1520304202-232891-1-git-send-email-shawn.lin@rock-chips.com
Signed-off-by: Shawn Lin <shawn.lin@rock-chips.com>
[bhelgaas: changelog]
Signed-off-by: Bjorn Helgaas <bhelgaas@google.com>
Diffstat (limited to 'drivers/pci/controller/pci-hyperv.c')
-rw-r--r--	drivers/pci/controller/pci-hyperv.c	2694
1 file changed, 2694 insertions, 0 deletions
diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c
new file mode 100644
index 000000000000..6cc5036ac83c
--- /dev/null
+++ b/drivers/pci/controller/pci-hyperv.c
@@ -0,0 +1,2694 @@
1// SPDX-License-Identifier: GPL-2.0
2/*
3 * Copyright (c) Microsoft Corporation.
4 *
5 * Author:
6 * Jake Oshins <jakeo@microsoft.com>
7 *
8 * This driver acts as a paravirtual front-end for PCI Express root buses.
9 * When a PCI Express function (either an entire device or an SR-IOV
10 * Virtual Function) is being passed through to the VM, this driver exposes
11 * a new bus to the guest VM. This is modeled as a root PCI bus because
12 * no bridges are being exposed to the VM. In fact, with a "Generation 2"
13 * VM within Hyper-V, there may seem to be no PCI bus at all in the VM
14 * until a device has been exposed using this driver.
15 *
16 * Each root PCI bus has its own PCI domain, which is called "Segment" in
17 * the PCI Firmware Specifications. Thus while each device passed through
18 * to the VM using this front-end will appear at "device 0", the domain will
19 * be unique. Typically, each bus will have one PCI function on it, though
20 * this driver does support more than one.
21 *
22 * In order to map the interrupts from the device through to the guest VM,
23 * this driver also implements an IRQ Domain, which handles interrupts (either
24 * MSI or MSI-X) associated with the functions on the bus. As interrupts are
25 * set up, torn down, or reaffined, this driver communicates with the
26 * underlying hypervisor to adjust the mappings in the I/O MMU so that each
27 * interrupt will be delivered to the correct virtual processor at the right
28 * vector. This driver does not support level-triggered (line-based)
29 * interrupts, and will report that the Interrupt Line register in the
30 * function's configuration space is zero.
31 *
32 * The rest of this driver mostly maps PCI concepts onto underlying Hyper-V
33 * facilities. For instance, the configuration space of a function exposed
34 * by Hyper-V is mapped into a single page of memory space, and the
35 * read and write handlers for config space must be aware of this mechanism.
36 * Similarly, device setup and teardown involves messages sent to and from
37 * the PCI back-end driver in Hyper-V.
38 */
39
40#include <linux/kernel.h>
41#include <linux/module.h>
42#include <linux/pci.h>
43#include <linux/delay.h>
44#include <linux/semaphore.h>
45#include <linux/irqdomain.h>
46#include <asm/irqdomain.h>
47#include <asm/apic.h>
48#include <linux/msi.h>
49#include <linux/hyperv.h>
50#include <linux/refcount.h>
51#include <asm/mshyperv.h>
52
53/*
54 * Protocol versions. The low word is the minor version, the high word the
55 * major version.
56 */
57
58#define PCI_MAKE_VERSION(major, minor) ((u32)(((major) << 16) | (minor)))
59#define PCI_MAJOR_VERSION(version) ((u32)(version) >> 16)
60#define PCI_MINOR_VERSION(version) ((u32)(version) & 0xff)
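A quick worked example of this encoding (a standalone sketch, not part of the driver; it mirrors the three macros above so it compiles on its own — note that PCI_MINOR_VERSION masks only the low byte despite the "low word" comment, which is harmless while minor versions stay below 256):

#include <assert.h>
#include <stdint.h>

#define PCI_MAKE_VERSION(major, minor) ((uint32_t)(((major) << 16) | (minor)))
#define PCI_MAJOR_VERSION(version)     ((uint32_t)(version) >> 16)
#define PCI_MINOR_VERSION(version)     ((uint32_t)(version) & 0xff)

int main(void)
{
	/* Protocol 1.2 encodes as 0x00010002: major in the high word. */
	assert(PCI_MAKE_VERSION(1, 2) == 0x00010002);
	assert(PCI_MAJOR_VERSION(PCI_MAKE_VERSION(1, 2)) == 1);
	assert(PCI_MINOR_VERSION(PCI_MAKE_VERSION(1, 2)) == 2);
	return 0;
}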
61
62enum pci_protocol_version_t {
63 PCI_PROTOCOL_VERSION_1_1 = PCI_MAKE_VERSION(1, 1), /* Win10 */
64 PCI_PROTOCOL_VERSION_1_2 = PCI_MAKE_VERSION(1, 2), /* RS1 */
65};
66
67#define CPU_AFFINITY_ALL -1ULL
68
69/*
70 * Supported protocol versions, in the order of probing - highest goes
71 * first.
72 */
73static enum pci_protocol_version_t pci_protocol_versions[] = {
74 PCI_PROTOCOL_VERSION_1_2,
75 PCI_PROTOCOL_VERSION_1_1,
76};
77
78/*
79 * Protocol version negotiated by hv_pci_protocol_negotiation().
80 */
81static enum pci_protocol_version_t pci_protocol_version;
82
83#define PCI_CONFIG_MMIO_LENGTH 0x2000
84#define CFG_PAGE_OFFSET 0x1000
85#define CFG_PAGE_SIZE (PCI_CONFIG_MMIO_LENGTH - CFG_PAGE_OFFSET)
86
87#define MAX_SUPPORTED_MSI_MESSAGES 0x400
88
89#define STATUS_REVISION_MISMATCH 0xC0000059
90
91/*
92 * Message Types
93 */
94
95enum pci_message_type {
96 /*
97 * Version 1.1
98 */
99 PCI_MESSAGE_BASE = 0x42490000,
100 PCI_BUS_RELATIONS = PCI_MESSAGE_BASE + 0,
101 PCI_QUERY_BUS_RELATIONS = PCI_MESSAGE_BASE + 1,
102 PCI_POWER_STATE_CHANGE = PCI_MESSAGE_BASE + 4,
103 PCI_QUERY_RESOURCE_REQUIREMENTS = PCI_MESSAGE_BASE + 5,
104 PCI_QUERY_RESOURCE_RESOURCES = PCI_MESSAGE_BASE + 6,
105 PCI_BUS_D0ENTRY = PCI_MESSAGE_BASE + 7,
106 PCI_BUS_D0EXIT = PCI_MESSAGE_BASE + 8,
107 PCI_READ_BLOCK = PCI_MESSAGE_BASE + 9,
108 PCI_WRITE_BLOCK = PCI_MESSAGE_BASE + 0xA,
109 PCI_EJECT = PCI_MESSAGE_BASE + 0xB,
110 PCI_QUERY_STOP = PCI_MESSAGE_BASE + 0xC,
111 PCI_REENABLE = PCI_MESSAGE_BASE + 0xD,
112 PCI_QUERY_STOP_FAILED = PCI_MESSAGE_BASE + 0xE,
113 PCI_EJECTION_COMPLETE = PCI_MESSAGE_BASE + 0xF,
114 PCI_RESOURCES_ASSIGNED = PCI_MESSAGE_BASE + 0x10,
115 PCI_RESOURCES_RELEASED = PCI_MESSAGE_BASE + 0x11,
116 PCI_INVALIDATE_BLOCK = PCI_MESSAGE_BASE + 0x12,
117 PCI_QUERY_PROTOCOL_VERSION = PCI_MESSAGE_BASE + 0x13,
118 PCI_CREATE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x14,
119 PCI_DELETE_INTERRUPT_MESSAGE = PCI_MESSAGE_BASE + 0x15,
120 PCI_RESOURCES_ASSIGNED2 = PCI_MESSAGE_BASE + 0x16,
121 PCI_CREATE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x17,
122 PCI_DELETE_INTERRUPT_MESSAGE2 = PCI_MESSAGE_BASE + 0x18, /* unused */
123 PCI_MESSAGE_MAXIMUM
124};
125
126/*
127 * Structures defining the virtual PCI Express protocol.
128 */
129
130union pci_version {
131 struct {
132 u16 minor_version;
133 u16 major_version;
134 } parts;
135 u32 version;
136} __packed;
137
138/*
139 * Function numbers are 8 bits wide on Express, as interpreted through ARI,
140 * which is all this driver does. This representation is the one used in
141 * Windows, which is what is expected when sending this back and forth with
142 * the Hyper-V parent partition.
143 */
144union win_slot_encoding {
145 struct {
146 u32 dev:5;
147 u32 func:3;
148 u32 reserved:24;
149 } bits;
150 u32 slot;
151} __packed;
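To make the layout concrete, here is a standalone sketch of the encoding (illustrative only; it assumes the GCC/Clang little-endian bit-field allocation this driver is built with): device 0, function 3 yields a wire value of 0x60, because dev occupies bits 0-4 and func bits 5-7.

#include <assert.h>
#include <stdint.h>

union win_slot_encoding_example {	/* local mirror of the union above */
	struct {
		uint32_t dev:5;
		uint32_t func:3;
		uint32_t reserved:24;
	} bits;
	uint32_t slot;
};

int main(void)
{
	union win_slot_encoding_example w = { .slot = 0 };

	w.bits.dev = 0;
	w.bits.func = 3;
	assert(w.slot == 0x60);	/* 0 | (3 << 5) */
	return 0;
}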
152
153/*
154 * Pretty much as defined in the PCI Specifications.
155 */
156struct pci_function_description {
157 u16 v_id; /* vendor ID */
158 u16 d_id; /* device ID */
159 u8 rev;
160 u8 prog_intf;
161 u8 subclass;
162 u8 base_class;
163 u32 subsystem_id;
164 union win_slot_encoding win_slot;
165 u32 ser; /* serial number */
166} __packed;
167
168/**
169 * struct hv_msi_desc
170 * @vector: IDT entry
171 * @delivery_mode: As defined in Intel's Programmer's
172 * Reference Manual, Volume 3, Chapter 8.
173 * @vector_count: Number of contiguous entries in the
174 * Interrupt Descriptor Table that are
175 * occupied by this Message-Signaled
176 * Interrupt. For "MSI", as first defined
177 * in PCI 2.2, this can be between 1 and
178 * 32. For "MSI-X," as first defined in PCI
179 * 3.0, this must be 1, as each MSI-X table
180 * entry would have its own descriptor.
181 * @reserved: Empty space
182 * @cpu_mask: All the target virtual processors.
183 */
184struct hv_msi_desc {
185 u8 vector;
186 u8 delivery_mode;
187 u16 vector_count;
188 u32 reserved;
189 u64 cpu_mask;
190} __packed;
191
192/**
193 * struct hv_msi_desc2 - 1.2 version of hv_msi_desc
194 * @vector: IDT entry
195 * @delivery_mode: As defined in Intel's Programmer's
196 * Reference Manual, Volume 3, Chapter 8.
197 * @vector_count: Number of contiguous entries in the
198 * Interrupt Descriptor Table that are
199 * occupied by this Message-Signaled
200 * Interrupt. For "MSI", as first defined
201 * in PCI 2.2, this can be between 1 and
202 * 32. For "MSI-X," as first defined in PCI
203 * 3.0, this must be 1, as each MSI-X table
204 * entry would have its own descriptor.
205 * @processor_count: number of bits enabled in array.
206 * @processor_array: All the target virtual processors.
207 */
208struct hv_msi_desc2 {
209 u8 vector;
210 u8 delivery_mode;
211 u16 vector_count;
212 u16 processor_count;
213 u16 processor_array[32];
214} __packed;
215
216/**
217 * struct tran_int_desc
218 * @reserved: unused, padding
219 * @vector_count: same as in hv_msi_desc
220 * @data: This is the "data payload" value that is
221 * written by the device when it generates
222 * a message-signaled interrupt, either MSI
223 * or MSI-X.
224 * @address: This is the address to which the data
225 * payload is written on interrupt
226 * generation.
227 */
228struct tran_int_desc {
229 u16 reserved;
230 u16 vector_count;
231 u32 data;
232 u64 address;
233} __packed;
234
235/*
236 * A generic message format for virtual PCI.
237 * Specific message formats are defined later in the file.
238 */
239
240struct pci_message {
241 u32 type;
242} __packed;
243
244struct pci_child_message {
245 struct pci_message message_type;
246 union win_slot_encoding wslot;
247} __packed;
248
249struct pci_incoming_message {
250 struct vmpacket_descriptor hdr;
251 struct pci_message message_type;
252} __packed;
253
254struct pci_response {
255 struct vmpacket_descriptor hdr;
256 s32 status; /* negative values are failures */
257} __packed;
258
259struct pci_packet {
260 void (*completion_func)(void *context, struct pci_response *resp,
261 int resp_packet_size);
262 void *compl_ctxt;
263
264 struct pci_message message[0];
265};
266
267/*
268 * Specific message types supporting the PCI protocol.
269 */
270
271/*
272 * Version negotiation message. Sent from the guest to the host.
273 * The guest is free to try different versions until the host
274 * accepts the version.
275 *
276 * pci_version: The protocol version requested.
277 * is_last_attempt: If TRUE, this is the last version guest will request.
278 * reservedz: Reserved field, set to zero.
279 */
280
281struct pci_version_request {
282 struct pci_message message_type;
283 u32 protocol_version;
284} __packed;
285
286/*
287 * Bus D0 Entry. This is sent from the guest to the host when the virtual
288 * bus (PCI Express port) is ready for action.
289 */
290
291struct pci_bus_d0_entry {
292 struct pci_message message_type;
293 u32 reserved;
294 u64 mmio_base;
295} __packed;
296
297struct pci_bus_relations {
298 struct pci_incoming_message incoming;
299 u32 device_count;
300 struct pci_function_description func[0];
301} __packed;
302
303struct pci_q_res_req_response {
304 struct vmpacket_descriptor hdr;
305 s32 status; /* negative values are failures */
306 u32 probed_bar[6];
307} __packed;
308
309struct pci_set_power {
310 struct pci_message message_type;
311 union win_slot_encoding wslot;
312 u32 power_state; /* In Windows terms */
313 u32 reserved;
314} __packed;
315
316struct pci_set_power_response {
317 struct vmpacket_descriptor hdr;
318 s32 status; /* negative values are failures */
319 union win_slot_encoding wslot;
320 u32 resultant_state; /* In Windows terms */
321 u32 reserved;
322} __packed;
323
324struct pci_resources_assigned {
325 struct pci_message message_type;
326 union win_slot_encoding wslot;
327 u8 memory_range[0x14][6]; /* not used here */
328 u32 msi_descriptors;
329 u32 reserved[4];
330} __packed;
331
332struct pci_resources_assigned2 {
333 struct pci_message message_type;
334 union win_slot_encoding wslot;
335 u8 memory_range[0x14][6]; /* not used here */
336 u32 msi_descriptor_count;
337 u8 reserved[70];
338} __packed;
339
340struct pci_create_interrupt {
341 struct pci_message message_type;
342 union win_slot_encoding wslot;
343 struct hv_msi_desc int_desc;
344} __packed;
345
346struct pci_create_int_response {
347 struct pci_response response;
348 u32 reserved;
349 struct tran_int_desc int_desc;
350} __packed;
351
352struct pci_create_interrupt2 {
353 struct pci_message message_type;
354 union win_slot_encoding wslot;
355 struct hv_msi_desc2 int_desc;
356} __packed;
357
358struct pci_delete_interrupt {
359 struct pci_message message_type;
360 union win_slot_encoding wslot;
361 struct tran_int_desc int_desc;
362} __packed;
363
364struct pci_dev_incoming {
365 struct pci_incoming_message incoming;
366 union win_slot_encoding wslot;
367} __packed;
368
369struct pci_eject_response {
370 struct pci_message message_type;
371 union win_slot_encoding wslot;
372 u32 status;
373} __packed;
374
375static int pci_ring_size = (4 * PAGE_SIZE);
376
377/*
378 * Definitions for the interrupt steering hypercall.
379 */
380#define HV_PARTITION_ID_SELF ((u64)-1)
381#define HVCALL_RETARGET_INTERRUPT 0x7e
382
383struct hv_interrupt_entry {
384 u32 source; /* 1 for MSI(-X) */
385 u32 reserved1;
386 u32 address;
387 u32 data;
388};
389
390#define HV_VP_SET_BANK_COUNT_MAX 5 /* current implementation limit */
391
392struct hv_vp_set {
393 u64 format; /* 0 (HvGenericSetSparse4k) */
394 u64 valid_banks;
395 u64 masks[HV_VP_SET_BANK_COUNT_MAX];
396};
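In this sparse format, VP n lives in bank n / 64, bit n % 64. As an illustration (a hypothetical fragment, mirroring what hv_irq_unmask() does further down), marking virtual processor 70 sets bit 6 of bank 1:

/* Illustration only: add VP 70 to a sparse VP set (bank 1, bit 6). */
static void __maybe_unused hv_vp_set_example(struct hv_vp_set *vp_set)
{
	int cpu_vmbus = 70;

	vp_set->format = 0;	/* HvGenericSetSparse4k */
	vp_set->valid_banks = (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;
	vp_set->masks[cpu_vmbus / 64] |= 1ULL << (cpu_vmbus & 63);
}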
397
398/*
399 * flags for hv_device_interrupt_target.flags
400 */
401#define HV_DEVICE_INTERRUPT_TARGET_MULTICAST 1
402#define HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET 2
403
404struct hv_device_interrupt_target {
405 u32 vector;
406 u32 flags;
407 union {
408 u64 vp_mask;
409 struct hv_vp_set vp_set;
410 };
411};
412
413struct retarget_msi_interrupt {
414 u64 partition_id; /* use "self" */
415 u64 device_id;
416 struct hv_interrupt_entry int_entry;
417 u64 reserved2;
418 struct hv_device_interrupt_target int_target;
419} __packed;
420
421/*
422 * Driver specific state.
423 */
424
425enum hv_pcibus_state {
426 hv_pcibus_init = 0,
427 hv_pcibus_probed,
428 hv_pcibus_installed,
429 hv_pcibus_removed,
430 hv_pcibus_maximum
431};
432
433struct hv_pcibus_device {
434 struct pci_sysdata sysdata;
435 enum hv_pcibus_state state;
436 refcount_t remove_lock;
437 struct hv_device *hdev;
438 resource_size_t low_mmio_space;
439 resource_size_t high_mmio_space;
440 struct resource *mem_config;
441 struct resource *low_mmio_res;
442 struct resource *high_mmio_res;
443 struct completion *survey_event;
444 struct completion remove_event;
445 struct pci_bus *pci_bus;
446 spinlock_t config_lock; /* Avoid two threads writing index page */
447 spinlock_t device_list_lock; /* Protect lists below */
448 void __iomem *cfg_addr;
449
450 struct list_head resources_for_children;
451
452 struct list_head children;
453 struct list_head dr_list;
454
455 struct msi_domain_info msi_info;
456 struct msi_controller msi_chip;
457 struct irq_domain *irq_domain;
458
459 /* hypercall arg, must not cross page boundary */
460 struct retarget_msi_interrupt retarget_msi_interrupt_params;
461
462 spinlock_t retarget_msi_interrupt_lock;
463
464 struct workqueue_struct *wq;
465};
466
467/*
468 * Tracks "Device Relations" messages from the host, which must be both
469 * processed in order and deferred so that they don't run in the context
470 * of the incoming packet callback.
471 */
472struct hv_dr_work {
473 struct work_struct wrk;
474 struct hv_pcibus_device *bus;
475};
476
477struct hv_dr_state {
478 struct list_head list_entry;
479 u32 device_count;
480 struct pci_function_description func[0];
481};
482
483enum hv_pcichild_state {
484 hv_pcichild_init = 0,
485 hv_pcichild_requirements,
486 hv_pcichild_resourced,
487 hv_pcichild_ejecting,
488 hv_pcichild_maximum
489};
490
491struct hv_pci_dev {
492 /* List protected by pci_rescan_remove_lock */
493 struct list_head list_entry;
494 refcount_t refs;
495 enum hv_pcichild_state state;
496 struct pci_function_description desc;
497 bool reported_missing;
498 struct hv_pcibus_device *hbus;
499 struct work_struct wrk;
500
501 /*
502 * What would be observed if one wrote 0xFFFFFFFF to a BAR and then
503 * read it back, for each of the BAR offsets within config space.
504 */
505 u32 probed_bar[6];
506};
507
508struct hv_pci_compl {
509 struct completion host_event;
510 s32 completion_status;
511};
512
513static void hv_pci_onchannelcallback(void *context);
514
515/**
516 * hv_pci_generic_compl() - Invoked for a completion packet
517 * @context: Set up by the sender of the packet.
518 * @resp: The response packet
519 * @resp_packet_size: Size in bytes of the packet
520 *
521 * This function is used to trigger an event and report status
522 * for any message for which the completion packet contains a
523 * status and nothing else.
524 */
525static void hv_pci_generic_compl(void *context, struct pci_response *resp,
526 int resp_packet_size)
527{
528 struct hv_pci_compl *comp_pkt = context;
529
530 if (resp_packet_size >= offsetofend(struct pci_response, status))
531 comp_pkt->completion_status = resp->status;
532 else
533 comp_pkt->completion_status = -1;
534
535 complete(&comp_pkt->host_event);
536}
537
538static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
539 u32 wslot);
540
541static void get_pcichild(struct hv_pci_dev *hpdev)
542{
543 refcount_inc(&hpdev->refs);
544}
545
546static void put_pcichild(struct hv_pci_dev *hpdev)
547{
548 if (refcount_dec_and_test(&hpdev->refs))
549 kfree(hpdev);
550}
551
552static void get_hvpcibus(struct hv_pcibus_device *hv_pcibus);
553static void put_hvpcibus(struct hv_pcibus_device *hv_pcibus);
554
555/*
556 * There is no good way to get notified from vmbus_onoffer_rescind(),
557 * so let's use polling here, since this is not a hot path.
558 */
559static int wait_for_response(struct hv_device *hdev,
560 struct completion *comp)
561{
562 while (true) {
563 if (hdev->channel->rescind) {
564 dev_warn_once(&hdev->device, "The device is gone.\n");
565 return -ENODEV;
566 }
567
568 if (wait_for_completion_timeout(comp, HZ / 10))
569 break;
570 }
571
572 return 0;
573}
574
575/**
576 * devfn_to_wslot() - Convert from Linux PCI slot to Windows
577 * @devfn: The Linux representation of PCI slot
578 *
579 * Windows uses a slightly different representation of PCI slot.
580 *
581 * Return: The Windows representation
582 */
583static u32 devfn_to_wslot(int devfn)
584{
585 union win_slot_encoding wslot;
586
587 wslot.slot = 0;
588 wslot.bits.dev = PCI_SLOT(devfn);
589 wslot.bits.func = PCI_FUNC(devfn);
590
591 return wslot.slot;
592}
593
594/**
595 * wslot_to_devfn() - Convert from Windows PCI slot to Linux
596 * @wslot: The Windows representation of PCI slot
597 *
598 * Windows uses a slightly different representation of PCI slot.
599 *
600 * Return: The Linux representation
601 */
602static int wslot_to_devfn(u32 wslot)
603{
604 union win_slot_encoding slot_no;
605
606 slot_no.slot = wslot;
607 return PCI_DEVFN(slot_no.bits.dev, slot_no.bits.func);
608}
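A hypothetical round trip through the two helpers (an illustrative self-check, not part of the driver): Linux devfn 0x0B is device 1, function 3, which encodes as wslot 0x61 and converts back unchanged.

static void __maybe_unused wslot_selftest(void)
{
	u32 wslot = devfn_to_wslot(0x0B);	/* device 1, function 3 */

	/* dev fills bits 0-4, func bits 5-7: 0x01 | (3 << 5) == 0x61 */
	WARN_ON(wslot != 0x61);
	WARN_ON(wslot_to_devfn(wslot) != 0x0B);
}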
609
610/*
611 * PCI Configuration Space for these root PCI buses is implemented as a pair
612 * of pages in memory-mapped I/O space. Writing to the first page chooses
613 * the PCI function being written or read. Once the first page has been
614 * written to, the following page maps in the entire configuration space of
615 * the function.
616 */
617
618/**
619 * _hv_pcifront_read_config() - Internal PCI config read
620 * @hpdev: The PCI driver's representation of the device
621 * @where: Offset within config space
622 * @size: Size of the transfer
623 * @val: Pointer to the buffer receiving the data
624 */
625static void _hv_pcifront_read_config(struct hv_pci_dev *hpdev, int where,
626 int size, u32 *val)
627{
628 unsigned long flags;
629 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
630
631 /*
632 * If the attempt is to read the IDs or the ROM BAR, simulate that.
633 */
634 if (where + size <= PCI_COMMAND) {
635 memcpy(val, ((u8 *)&hpdev->desc.v_id) + where, size);
636 } else if (where >= PCI_CLASS_REVISION && where + size <=
637 PCI_CACHE_LINE_SIZE) {
638 memcpy(val, ((u8 *)&hpdev->desc.rev) + where -
639 PCI_CLASS_REVISION, size);
640 } else if (where >= PCI_SUBSYSTEM_VENDOR_ID && where + size <=
641 PCI_ROM_ADDRESS) {
642 memcpy(val, (u8 *)&hpdev->desc.subsystem_id + where -
643 PCI_SUBSYSTEM_VENDOR_ID, size);
644 } else if (where >= PCI_ROM_ADDRESS && where + size <=
645 PCI_CAPABILITY_LIST) {
646 /* ROM BARs are unimplemented */
647 *val = 0;
648 } else if (where >= PCI_INTERRUPT_LINE && where + size <=
649 PCI_INTERRUPT_PIN) {
650 /*
651 * Interrupt Line and Interrupt PIN are hard-wired to zero
652 * because this front-end only supports message-signaled
653 * interrupts.
654 */
655 *val = 0;
656 } else if (where + size <= CFG_PAGE_SIZE) {
657 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
658 /* Choose the function to be read. (See comment above) */
659 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
660 /* Make sure the function was chosen before we start reading. */
661 mb();
662 /* Read from that function's config space. */
663 switch (size) {
664 case 1:
665 *val = readb(addr);
666 break;
667 case 2:
668 *val = readw(addr);
669 break;
670 default:
671 *val = readl(addr);
672 break;
673 }
674 /*
675 * Make sure the read was done before we release the spinlock
676 * allowing consecutive reads/writes.
677 */
678 mb();
679 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
680 } else {
681 dev_err(&hpdev->hbus->hdev->device,
682 "Attempt to read beyond a function's config space.\n");
683 }
684}
685
686static u16 hv_pcifront_get_vendor_id(struct hv_pci_dev *hpdev)
687{
688 u16 ret;
689 unsigned long flags;
690 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET +
691 PCI_VENDOR_ID;
692
693 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
694
695 /* Choose the function to be read. (See comment above) */
696 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
697 /* Make sure the function was chosen before we start reading. */
698 mb();
699 /* Read from that function's config space. */
700 ret = readw(addr);
701 /*
702 * mb() is not required here, because the spin_unlock_irqrestore()
703 * is a barrier.
704 */
705
706 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
707
708 return ret;
709}
710
711/**
712 * _hv_pcifront_write_config() - Internal PCI config write
713 * @hpdev: The PCI driver's representation of the device
714 * @where: Offset within config space
715 * @size: Size of the transfer
716 * @val: The data being transferred
717 */
718static void _hv_pcifront_write_config(struct hv_pci_dev *hpdev, int where,
719 int size, u32 val)
720{
721 unsigned long flags;
722 void __iomem *addr = hpdev->hbus->cfg_addr + CFG_PAGE_OFFSET + where;
723
724 if (where >= PCI_SUBSYSTEM_VENDOR_ID &&
725 where + size <= PCI_CAPABILITY_LIST) {
726 /* SSIDs and ROM BARs are read-only */
727 } else if (where >= PCI_COMMAND && where + size <= CFG_PAGE_SIZE) {
728 spin_lock_irqsave(&hpdev->hbus->config_lock, flags);
729 /* Choose the function to be written. (See comment above) */
730 writel(hpdev->desc.win_slot.slot, hpdev->hbus->cfg_addr);
731 /* Make sure the function was chosen before we start writing. */
732 wmb();
733 /* Write to that function's config space. */
734 switch (size) {
735 case 1:
736 writeb(val, addr);
737 break;
738 case 2:
739 writew(val, addr);
740 break;
741 default:
742 writel(val, addr);
743 break;
744 }
745 /*
746 * Make sure the write was done before we release the spinlock
747 * allowing consecutive reads/writes.
748 */
749 mb();
750 spin_unlock_irqrestore(&hpdev->hbus->config_lock, flags);
751 } else {
752 dev_err(&hpdev->hbus->hdev->device,
753 "Attempt to write beyond a function's config space.\n");
754 }
755}
756
757/**
758 * hv_pcifront_read_config() - Read configuration space
759 * @bus: PCI Bus structure
760 * @devfn: Device/function
761 * @where: Offset from base
762 * @size: Byte/word/dword
763 * @val: Value to be read
764 *
765 * Return: PCIBIOS_SUCCESSFUL on success
766 * PCIBIOS_DEVICE_NOT_FOUND on failure
767 */
768static int hv_pcifront_read_config(struct pci_bus *bus, unsigned int devfn,
769 int where, int size, u32 *val)
770{
771 struct hv_pcibus_device *hbus =
772 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
773 struct hv_pci_dev *hpdev;
774
775 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
776 if (!hpdev)
777 return PCIBIOS_DEVICE_NOT_FOUND;
778
779 _hv_pcifront_read_config(hpdev, where, size, val);
780
781 put_pcichild(hpdev);
782 return PCIBIOS_SUCCESSFUL;
783}
784
785/**
786 * hv_pcifront_write_config() - Write configuration space
787 * @bus: PCI Bus structure
788 * @devfn: Device/function
789 * @where: Offset from base
790 * @size: Byte/word/dword
791 * @val: Value to be written to device
792 *
793 * Return: PCIBIOS_SUCCESSFUL on success
794 * PCIBIOS_DEVICE_NOT_FOUND on failure
795 */
796static int hv_pcifront_write_config(struct pci_bus *bus, unsigned int devfn,
797 int where, int size, u32 val)
798{
799 struct hv_pcibus_device *hbus =
800 container_of(bus->sysdata, struct hv_pcibus_device, sysdata);
801 struct hv_pci_dev *hpdev;
802
803 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(devfn));
804 if (!hpdev)
805 return PCIBIOS_DEVICE_NOT_FOUND;
806
807 _hv_pcifront_write_config(hpdev, where, size, val);
808
809 put_pcichild(hpdev);
810 return PCIBIOS_SUCCESSFUL;
811}
812
813/* PCIe operations */
814static struct pci_ops hv_pcifront_ops = {
815 .read = hv_pcifront_read_config,
816 .write = hv_pcifront_write_config,
817};
818
819/* Interrupt management hooks */
820static void hv_int_desc_free(struct hv_pci_dev *hpdev,
821 struct tran_int_desc *int_desc)
822{
823 struct pci_delete_interrupt *int_pkt;
824 struct {
825 struct pci_packet pkt;
826 u8 buffer[sizeof(struct pci_delete_interrupt)];
827 } ctxt;
828
829 memset(&ctxt, 0, sizeof(ctxt));
830 int_pkt = (struct pci_delete_interrupt *)&ctxt.pkt.message;
831 int_pkt->message_type.type =
832 PCI_DELETE_INTERRUPT_MESSAGE;
833 int_pkt->wslot.slot = hpdev->desc.win_slot.slot;
834 int_pkt->int_desc = *int_desc;
835 vmbus_sendpacket(hpdev->hbus->hdev->channel, int_pkt, sizeof(*int_pkt),
836 (unsigned long)&ctxt.pkt, VM_PKT_DATA_INBAND, 0);
837 kfree(int_desc);
838}
839
840/**
841 * hv_msi_free() - Free the MSI.
842 * @domain: The interrupt domain pointer
843 * @info: Extra MSI-related context
844 * @irq: Identifies the IRQ.
845 *
846 * The Hyper-V parent partition and hypervisor are tracking the
847 * messages that are in use, keeping the interrupt redirection
848 * table up to date. This callback sends a message that frees
849 * the IRT entry and related tracking nonsense.
850 */
851static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
852 unsigned int irq)
853{
854 struct hv_pcibus_device *hbus;
855 struct hv_pci_dev *hpdev;
856 struct pci_dev *pdev;
857 struct tran_int_desc *int_desc;
858 struct irq_data *irq_data = irq_domain_get_irq_data(domain, irq);
859 struct msi_desc *msi = irq_data_get_msi_desc(irq_data);
860
861 pdev = msi_desc_to_pci_dev(msi);
862 hbus = info->data;
863 int_desc = irq_data_get_irq_chip_data(irq_data);
864 if (!int_desc)
865 return;
866
867 irq_data->chip_data = NULL;
868 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
869 if (!hpdev) {
870 kfree(int_desc);
871 return;
872 }
873
874 hv_int_desc_free(hpdev, int_desc);
875 put_pcichild(hpdev);
876}
877
878static int hv_set_affinity(struct irq_data *data, const struct cpumask *dest,
879 bool force)
880{
881 struct irq_data *parent = data->parent_data;
882
883 return parent->chip->irq_set_affinity(parent, dest, force);
884}
885
886static void hv_irq_mask(struct irq_data *data)
887{
888 pci_msi_mask_irq(data);
889}
890
891/**
892 * hv_irq_unmask() - "Unmask" the IRQ by setting its current
893 * affinity.
894 * @data: Describes the IRQ
895 *
896 * Build a new destination for the MSI and make a hypercall to
897 * update the Interrupt Redirection Table. "Device Logical ID"
898 * is built out of this PCI bus's instance GUID and the function
899 * number of the device.
900 */
901static void hv_irq_unmask(struct irq_data *data)
902{
903 struct msi_desc *msi_desc = irq_data_get_msi_desc(data);
904 struct irq_cfg *cfg = irqd_cfg(data);
905 struct retarget_msi_interrupt *params;
906 struct hv_pcibus_device *hbus;
907 struct cpumask *dest;
908 struct pci_bus *pbus;
909 struct pci_dev *pdev;
910 unsigned long flags;
911 u32 var_size = 0;
912 int cpu_vmbus;
913 int cpu;
914 u64 res;
915
916 dest = irq_data_get_effective_affinity_mask(data);
917 pdev = msi_desc_to_pci_dev(msi_desc);
918 pbus = pdev->bus;
919 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
920
921 spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
922
923 params = &hbus->retarget_msi_interrupt_params;
924 memset(params, 0, sizeof(*params));
925 params->partition_id = HV_PARTITION_ID_SELF;
926 params->int_entry.source = 1; /* MSI(-X) */
927 params->int_entry.address = msi_desc->msg.address_lo;
928 params->int_entry.data = msi_desc->msg.data;
929 params->device_id = (hbus->hdev->dev_instance.b[5] << 24) |
930 (hbus->hdev->dev_instance.b[4] << 16) |
931 (hbus->hdev->dev_instance.b[7] << 8) |
932 (hbus->hdev->dev_instance.b[6] & 0xf8) |
933 PCI_FUNC(pdev->devfn);
934 params->int_target.vector = cfg->vector;
935
936 /*
937 * Honoring apic->irq_delivery_mode set to dest_Fixed by
938 * setting the HV_DEVICE_INTERRUPT_TARGET_MULTICAST flag results in a
939 * spurious interrupt storm. Not doing so does not seem to have a
940 * negative effect (yet?).
941 */
942
943 if (pci_protocol_version >= PCI_PROTOCOL_VERSION_1_2) {
944 /*
945 * PCI_PROTOCOL_VERSION_1_2 supports the VP_SET version of the
946 * HVCALL_RETARGET_INTERRUPT hypercall, which also coincides
947 * with >64 VP support.
948 * ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED
949 * is not sufficient for this hypercall.
950 */
951 params->int_target.flags |=
952 HV_DEVICE_INTERRUPT_TARGET_PROCESSOR_SET;
953 params->int_target.vp_set.valid_banks =
954 (1ull << HV_VP_SET_BANK_COUNT_MAX) - 1;
955
956 /*
957 * var-sized hypercall, var-size starts after vp_mask (thus
958 * vp_set.format does not count, but vp_set.valid_banks does).
959 */
960 var_size = 1 + HV_VP_SET_BANK_COUNT_MAX;
961
962 for_each_cpu_and(cpu, dest, cpu_online_mask) {
963 cpu_vmbus = hv_cpu_number_to_vp_number(cpu);
964
965 if (cpu_vmbus >= HV_VP_SET_BANK_COUNT_MAX * 64) {
966 dev_err(&hbus->hdev->device,
967 "too high CPU %d", cpu_vmbus);
968 res = 1;
969 goto exit_unlock;
970 }
971
972 params->int_target.vp_set.masks[cpu_vmbus / 64] |=
973 (1ULL << (cpu_vmbus & 63));
974 }
975 } else {
976 for_each_cpu_and(cpu, dest, cpu_online_mask) {
977 params->int_target.vp_mask |=
978 (1ULL << hv_cpu_number_to_vp_number(cpu));
979 }
980 }
981
982 res = hv_do_hypercall(HVCALL_RETARGET_INTERRUPT | (var_size << 17),
983 params, NULL);
984
985exit_unlock:
986 spin_unlock_irqrestore(&hbus->retarget_msi_interrupt_lock, flags);
987
988 if (res) {
989 dev_err(&hbus->hdev->device,
990 "%s() failed: %#llx", __func__, res);
991 return;
992 }
993
994 pci_msi_unmask_irq(data);
995}
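The control word passed to hv_do_hypercall() above packs more than the call code: per the Hyper-V TLFS hypercall input layout, bits 0-15 carry the call code and bits 17-26 the size of the variable header in 8-byte units (six here: valid_banks plus the five bank masks). A hypothetical helper spelling that out:

/* Illustration only: compose the retarget hypercall control word. */
static u64 __maybe_unused retarget_control_word(u32 var_size)
{
	/* call code in bits 0-15, variable header size from bit 17 */
	return HVCALL_RETARGET_INTERRUPT | ((u64)var_size << 17);
}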
996
997struct compose_comp_ctxt {
998 struct hv_pci_compl comp_pkt;
999 struct tran_int_desc int_desc;
1000};
1001
1002static void hv_pci_compose_compl(void *context, struct pci_response *resp,
1003 int resp_packet_size)
1004{
1005 struct compose_comp_ctxt *comp_pkt = context;
1006 struct pci_create_int_response *int_resp =
1007 (struct pci_create_int_response *)resp;
1008
1009 comp_pkt->comp_pkt.completion_status = resp->status;
1010 comp_pkt->int_desc = int_resp->int_desc;
1011 complete(&comp_pkt->comp_pkt.host_event);
1012}
1013
1014static u32 hv_compose_msi_req_v1(
1015 struct pci_create_interrupt *int_pkt, struct cpumask *affinity,
1016 u32 slot, u8 vector)
1017{
1018 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE;
1019 int_pkt->wslot.slot = slot;
1020 int_pkt->int_desc.vector = vector;
1021 int_pkt->int_desc.vector_count = 1;
1022 int_pkt->int_desc.delivery_mode = dest_Fixed;
1023
1024 /*
1025 * Create MSI w/ dummy vCPU set, overwritten by subsequent retarget in
1026 * hv_irq_unmask().
1027 */
1028 int_pkt->int_desc.cpu_mask = CPU_AFFINITY_ALL;
1029
1030 return sizeof(*int_pkt);
1031}
1032
1033static u32 hv_compose_msi_req_v2(
1034 struct pci_create_interrupt2 *int_pkt, struct cpumask *affinity,
1035 u32 slot, u8 vector)
1036{
1037 int cpu;
1038
1039 int_pkt->message_type.type = PCI_CREATE_INTERRUPT_MESSAGE2;
1040 int_pkt->wslot.slot = slot;
1041 int_pkt->int_desc.vector = vector;
1042 int_pkt->int_desc.vector_count = 1;
1043 int_pkt->int_desc.delivery_mode = dest_Fixed;
1044
1045 /*
1046 * Create MSI w/ dummy vCPU set targeting just one vCPU, overwritten
1047 * by subsequent retarget in hv_irq_unmask().
1048 */
1049 cpu = cpumask_first_and(affinity, cpu_online_mask);
1050 int_pkt->int_desc.processor_array[0] =
1051 hv_cpu_number_to_vp_number(cpu);
1052 int_pkt->int_desc.processor_count = 1;
1053
1054 return sizeof(*int_pkt);
1055}
1056
1057/**
1058 * hv_compose_msi_msg() - Supplies a valid MSI address/data
1059 * @data: Everything about this MSI
1060 * @msg: Buffer that is filled in by this function
1061 *
1062 * This function unpacks the IRQ looking for target CPU set, IDT
1063 * vector and mode and sends a message to the parent partition
1064 * asking for a mapping for that tuple in this partition. The
1065 * response supplies a data value and address to which that data
1066 * should be written to trigger that interrupt.
1067 */
1068static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
1069{
1070 struct irq_cfg *cfg = irqd_cfg(data);
1071 struct hv_pcibus_device *hbus;
1072 struct hv_pci_dev *hpdev;
1073 struct pci_bus *pbus;
1074 struct pci_dev *pdev;
1075 struct cpumask *dest;
1076 struct compose_comp_ctxt comp;
1077 struct tran_int_desc *int_desc;
1078 struct {
1079 struct pci_packet pci_pkt;
1080 union {
1081 struct pci_create_interrupt v1;
1082 struct pci_create_interrupt2 v2;
1083 } int_pkts;
1084 } __packed ctxt;
1085
1086 u32 size;
1087 int ret;
1088
1089 pdev = msi_desc_to_pci_dev(irq_data_get_msi_desc(data));
1090 dest = irq_data_get_effective_affinity_mask(data);
1091 pbus = pdev->bus;
1092 hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
1093 hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
1094 if (!hpdev)
1095 goto return_null_message;
1096
1097 /* Free any previous message that might have already been composed. */
1098 if (data->chip_data) {
1099 int_desc = data->chip_data;
1100 data->chip_data = NULL;
1101 hv_int_desc_free(hpdev, int_desc);
1102 }
1103
1104 int_desc = kzalloc(sizeof(*int_desc), GFP_ATOMIC);
1105 if (!int_desc)
1106 goto drop_reference;
1107
1108 memset(&ctxt, 0, sizeof(ctxt));
1109 init_completion(&comp.comp_pkt.host_event);
1110 ctxt.pci_pkt.completion_func = hv_pci_compose_compl;
1111 ctxt.pci_pkt.compl_ctxt = &comp;
1112
1113 switch (pci_protocol_version) {
1114 case PCI_PROTOCOL_VERSION_1_1:
1115 size = hv_compose_msi_req_v1(&ctxt.int_pkts.v1,
1116 dest,
1117 hpdev->desc.win_slot.slot,
1118 cfg->vector);
1119 break;
1120
1121 case PCI_PROTOCOL_VERSION_1_2:
1122 size = hv_compose_msi_req_v2(&ctxt.int_pkts.v2,
1123 dest,
1124 hpdev->desc.win_slot.slot,
1125 cfg->vector);
1126 break;
1127
1128 default:
1129 /* As we only negotiate protocol versions known to this driver,
1130 * this path should never be hit. However, it is not a hot
1131 * path so we print a message to aid future updates.
1132 */
1133 dev_err(&hbus->hdev->device,
1134 "Unexpected vPCI protocol, update driver.");
1135 goto free_int_desc;
1136 }
1137
1138 ret = vmbus_sendpacket(hpdev->hbus->hdev->channel, &ctxt.int_pkts,
1139 size, (unsigned long)&ctxt.pci_pkt,
1140 VM_PKT_DATA_INBAND,
1141 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1142 if (ret) {
1143 dev_err(&hbus->hdev->device,
1144 "Sending request for interrupt failed: 0x%x",
1145 comp.comp_pkt.completion_status);
1146 goto free_int_desc;
1147 }
1148
1149 /*
1150 * Since this function is called with IRQ locks held, can't
1151 * do normal wait for completion; instead poll.
1152 */
1153 while (!try_wait_for_completion(&comp.comp_pkt.host_event)) {
1154 /* 0xFFFF means an invalid PCI VENDOR ID. */
1155 if (hv_pcifront_get_vendor_id(hpdev) == 0xFFFF) {
1156 dev_err_once(&hbus->hdev->device,
1157 "the device has gone\n");
1158 goto free_int_desc;
1159 }
1160
1161 /*
1162 * When the higher level interrupt code calls us with
1163 * interrupt disabled, we must poll the channel by calling
1164 * the channel callback directly when channel->target_cpu is
1165 * the current CPU. When the higher level interrupt code
1166 * calls us with interrupt enabled, let's add the
1167 * local_bh_disable()/enable() to avoid race.
1168 */
1169 local_bh_disable();
1170
1171 if (hbus->hdev->channel->target_cpu == smp_processor_id())
1172 hv_pci_onchannelcallback(hbus);
1173
1174 local_bh_enable();
1175
1176 if (hpdev->state == hv_pcichild_ejecting) {
1177 dev_err_once(&hbus->hdev->device,
1178 "the device is being ejected\n");
1179 goto free_int_desc;
1180 }
1181
1182 udelay(100);
1183 }
1184
1185 if (comp.comp_pkt.completion_status < 0) {
1186 dev_err(&hbus->hdev->device,
1187 "Request for interrupt failed: 0x%x",
1188 comp.comp_pkt.completion_status);
1189 goto free_int_desc;
1190 }
1191
1192 /*
1193 * Record the assignment so that this can be unwound later. Using
1194 * irq_set_chip_data() here would be appropriate, but the lock it takes
1195 * is already held.
1196 */
1197 *int_desc = comp.int_desc;
1198 data->chip_data = int_desc;
1199
1200 /* Pass up the result. */
1201 msg->address_hi = comp.int_desc.address >> 32;
1202 msg->address_lo = comp.int_desc.address & 0xffffffff;
1203 msg->data = comp.int_desc.data;
1204
1205 put_pcichild(hpdev);
1206 return;
1207
1208free_int_desc:
1209 kfree(int_desc);
1210drop_reference:
1211 put_pcichild(hpdev);
1212return_null_message:
1213 msg->address_hi = 0;
1214 msg->address_lo = 0;
1215 msg->data = 0;
1216}
1217
1218/* HW Interrupt Chip Descriptor */
1219static struct irq_chip hv_msi_irq_chip = {
1220 .name = "Hyper-V PCIe MSI",
1221 .irq_compose_msi_msg = hv_compose_msi_msg,
1222 .irq_set_affinity = hv_set_affinity,
1223 .irq_ack = irq_chip_ack_parent,
1224 .irq_mask = hv_irq_mask,
1225 .irq_unmask = hv_irq_unmask,
1226};
1227
1228static irq_hw_number_t hv_msi_domain_ops_get_hwirq(struct msi_domain_info *info,
1229 msi_alloc_info_t *arg)
1230{
1231 return arg->msi_hwirq;
1232}
1233
1234static struct msi_domain_ops hv_msi_ops = {
1235 .get_hwirq = hv_msi_domain_ops_get_hwirq,
1236 .msi_prepare = pci_msi_prepare,
1237 .set_desc = pci_msi_set_desc,
1238 .msi_free = hv_msi_free,
1239};
1240
1241/**
1242 * hv_pcie_init_irq_domain() - Initialize IRQ domain
1243 * @hbus: The root PCI bus
1244 *
1245 * This function creates an IRQ domain which will be used for
1246 * interrupts from devices that have been passed through. These
1247 * devices only support MSI and MSI-X, not line-based interrupts
1248 * or simulations of line-based interrupts through PCIe's
1249 * fabric-layer messages. Because interrupts are remapped, we
1250 * can support multi-message MSI here.
1251 *
1252 * Return: '0' on success and error value on failure
1253 */
1254static int hv_pcie_init_irq_domain(struct hv_pcibus_device *hbus)
1255{
1256 hbus->msi_info.chip = &hv_msi_irq_chip;
1257 hbus->msi_info.ops = &hv_msi_ops;
1258 hbus->msi_info.flags = (MSI_FLAG_USE_DEF_DOM_OPS |
1259 MSI_FLAG_USE_DEF_CHIP_OPS | MSI_FLAG_MULTI_PCI_MSI |
1260 MSI_FLAG_PCI_MSIX);
1261 hbus->msi_info.handler = handle_edge_irq;
1262 hbus->msi_info.handler_name = "edge";
1263 hbus->msi_info.data = hbus;
1264 hbus->irq_domain = pci_msi_create_irq_domain(hbus->sysdata.fwnode,
1265 &hbus->msi_info,
1266 x86_vector_domain);
1267 if (!hbus->irq_domain) {
1268 dev_err(&hbus->hdev->device,
1269 "Failed to build an MSI IRQ domain\n");
1270 return -ENODEV;
1271 }
1272
1273 return 0;
1274}
1275
1276/**
1277 * get_bar_size() - Get the address space consumed by a BAR
1278 * @bar_val: Value that a BAR returned after -1 was written
1279 * to it.
1280 *
1281 * This function returns the size of the BAR, rounded up to 1
1282 * page. It has to be rounded up because the hypervisor's page
1283 * table entry that maps the BAR into the VM can't specify an
1284 * offset within a page. The invariant is that the hypervisor
1285 * must place any BARs smaller than page length at the
1286 * beginning of a page.
1287 *
1288 * Return: Size in bytes of the consumed MMIO space.
1289 */
1290static u64 get_bar_size(u64 bar_val)
1291{
1292 return round_up((1 + ~(bar_val & PCI_BASE_ADDRESS_MEM_MASK)),
1293 PAGE_SIZE);
1294}
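A worked example (illustrative, assuming 4 KiB pages and a 64-bit build): a 4 KiB, 32-bit memory BAR probes as 0xfffff00c, where the low bits are type flags. After the 64-bit extension applied in survey_child_resources() below, masking off the flag bits and taking the two's complement yields exactly one page.

static void __maybe_unused get_bar_size_example(void)
{
	u64 bar_val = 0xfffff00cULL | 0xffffffff00000000ULL;

	/* ~0xfffffffffffff000 + 1 == 0x1000, then round_up to a page */
	WARN_ON(get_bar_size(bar_val) != 0x1000);
}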
1295
1296/**
1297 * survey_child_resources() - Total all MMIO requirements
1298 * @hbus: Root PCI bus, as understood by this driver
1299 */
1300static void survey_child_resources(struct hv_pcibus_device *hbus)
1301{
1302 struct hv_pci_dev *hpdev;
1303 resource_size_t bar_size = 0;
1304 unsigned long flags;
1305 struct completion *event;
1306 u64 bar_val;
1307 int i;
1308
1309 /* If nobody is waiting on the answer, don't compute it. */
1310 event = xchg(&hbus->survey_event, NULL);
1311 if (!event)
1312 return;
1313
1314 /* If the answer has already been computed, go with it. */
1315 if (hbus->low_mmio_space || hbus->high_mmio_space) {
1316 complete(event);
1317 return;
1318 }
1319
1320 spin_lock_irqsave(&hbus->device_list_lock, flags);
1321
1322 /*
1323 * Due to an interesting quirk of the PCI spec, all memory regions
1324 * for a child device are a power of 2 in size and aligned in memory,
1325 * so it's sufficient to just add them up without tracking alignment.
1326 */
1327 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1328 for (i = 0; i < 6; i++) {
1329 if (hpdev->probed_bar[i] & PCI_BASE_ADDRESS_SPACE_IO)
1330 dev_err(&hbus->hdev->device,
1331 "There's an I/O BAR in this list!\n");
1332
1333 if (hpdev->probed_bar[i] != 0) {
1334 /*
1335 * A probed BAR has all the upper bits set that
1336 * can be changed.
1337 */
1338
1339 bar_val = hpdev->probed_bar[i];
1340 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1341 bar_val |=
1342 ((u64)hpdev->probed_bar[++i] << 32);
1343 else
1344 bar_val |= 0xffffffff00000000ULL;
1345
1346 bar_size = get_bar_size(bar_val);
1347
1348 if (bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64)
1349 hbus->high_mmio_space += bar_size;
1350 else
1351 hbus->low_mmio_space += bar_size;
1352 }
1353 }
1354 }
1355
1356 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1357 complete(event);
1358}
1359
1360/**
1361 * prepopulate_bars() - Fill in BARs with defaults
1362 * @hbus: Root PCI bus, as understood by this driver
1363 *
1364 * The core PCI driver code seems much, much happier if the BARs
1365 * for a device have values upon first scan. So fill them in.
1366 * The algorithm below works down from large sizes to small,
1367 * attempting to pack the assignments optimally. The assumption,
1368 * enforced in other parts of the code, is that the beginning of
1369 * the memory-mapped I/O space will be aligned on the largest
1370 * BAR size.
1371 */
1372static void prepopulate_bars(struct hv_pcibus_device *hbus)
1373{
1374 resource_size_t high_size = 0;
1375 resource_size_t low_size = 0;
1376 resource_size_t high_base = 0;
1377 resource_size_t low_base = 0;
1378 resource_size_t bar_size;
1379 struct hv_pci_dev *hpdev;
1380 unsigned long flags;
1381 u64 bar_val;
1382 u32 command;
1383 bool high;
1384 int i;
1385
1386 if (hbus->low_mmio_space) {
1387 low_size = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
1388 low_base = hbus->low_mmio_res->start;
1389 }
1390
1391 if (hbus->high_mmio_space) {
1392 high_size = 1ULL <<
1393 (63 - __builtin_clzll(hbus->high_mmio_space));
1394 high_base = hbus->high_mmio_res->start;
1395 }
1396
1397 spin_lock_irqsave(&hbus->device_list_lock, flags);
1398
1399 /* Pick addresses for the BARs. */
1400 do {
1401 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1402 for (i = 0; i < 6; i++) {
1403 bar_val = hpdev->probed_bar[i];
1404 if (bar_val == 0)
1405 continue;
1406 high = bar_val & PCI_BASE_ADDRESS_MEM_TYPE_64;
1407 if (high) {
1408 bar_val |=
1409 ((u64)hpdev->probed_bar[i + 1]
1410 << 32);
1411 } else {
1412 bar_val |= 0xffffffffULL << 32;
1413 }
1414 bar_size = get_bar_size(bar_val);
1415 if (high) {
1416 if (high_size != bar_size) {
1417 i++;
1418 continue;
1419 }
1420 _hv_pcifront_write_config(hpdev,
1421 PCI_BASE_ADDRESS_0 + (4 * i),
1422 4,
1423 (u32)(high_base & 0xffffff00));
1424 i++;
1425 _hv_pcifront_write_config(hpdev,
1426 PCI_BASE_ADDRESS_0 + (4 * i),
1427 4, (u32)(high_base >> 32));
1428 high_base += bar_size;
1429 } else {
1430 if (low_size != bar_size)
1431 continue;
1432 _hv_pcifront_write_config(hpdev,
1433 PCI_BASE_ADDRESS_0 + (4 * i),
1434 4,
1435 (u32)(low_base & 0xffffff00));
1436 low_base += bar_size;
1437 }
1438 }
1439 if (high_size <= 1 && low_size <= 1) {
1440 /* Set the memory enable bit. */
1441 _hv_pcifront_read_config(hpdev, PCI_COMMAND, 2,
1442 &command);
1443 command |= PCI_COMMAND_MEMORY;
1444 _hv_pcifront_write_config(hpdev, PCI_COMMAND, 2,
1445 command);
1446 break;
1447 }
1448 }
1449
1450 high_size >>= 1;
1451 low_size >>= 1;
1452 } while (high_size || low_size);
1453
1454 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1455}
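The scan above starts at the largest power of two not exceeding the total MMIO request and halves each pass; the expression 1ULL << (63 - __builtin_clzll(x)) rounds down to a power of two, so a 0x180000-byte request starts the scan at 0x100000. A standalone sketch of that computation (illustration, not driver code):

static u64 __maybe_unused pow2_floor(u64 x)
{
	/* e.g. pow2_floor(0x180000) == 0x100000 */
	return 1ULL << (63 - __builtin_clzll(x));
}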
1456
1457/**
1458 * create_root_hv_pci_bus() - Expose a new root PCI bus
1459 * @hbus: Root PCI bus, as understood by this driver
1460 *
1461 * Return: 0 on success, -errno on failure
1462 */
1463static int create_root_hv_pci_bus(struct hv_pcibus_device *hbus)
1464{
1465 /* Register the device */
1466 hbus->pci_bus = pci_create_root_bus(&hbus->hdev->device,
1467 0, /* bus number is always zero */
1468 &hv_pcifront_ops,
1469 &hbus->sysdata,
1470 &hbus->resources_for_children);
1471 if (!hbus->pci_bus)
1472 return -ENODEV;
1473
1474 hbus->pci_bus->msi = &hbus->msi_chip;
1475 hbus->pci_bus->msi->dev = &hbus->hdev->device;
1476
1477 pci_lock_rescan_remove();
1478 pci_scan_child_bus(hbus->pci_bus);
1479 pci_bus_assign_resources(hbus->pci_bus);
1480 pci_bus_add_devices(hbus->pci_bus);
1481 pci_unlock_rescan_remove();
1482 hbus->state = hv_pcibus_installed;
1483 return 0;
1484}
1485
1486struct q_res_req_compl {
1487 struct completion host_event;
1488 struct hv_pci_dev *hpdev;
1489};
1490
1491/**
1492 * q_resource_requirements() - Query Resource Requirements
1493 * @context: The completion context.
1494 * @resp: The response that came from the host.
1495 * @resp_packet_size: The size in bytes of resp.
1496 *
1497 * This function is invoked on completion of a Query Resource
1498 * Requirements packet.
1499 */
1500static void q_resource_requirements(void *context, struct pci_response *resp,
1501 int resp_packet_size)
1502{
1503 struct q_res_req_compl *completion = context;
1504 struct pci_q_res_req_response *q_res_req =
1505 (struct pci_q_res_req_response *)resp;
1506 int i;
1507
1508 if (resp->status < 0) {
1509 dev_err(&completion->hpdev->hbus->hdev->device,
1510 "query resource requirements failed: %x\n",
1511 resp->status);
1512 } else {
1513 for (i = 0; i < 6; i++) {
1514 completion->hpdev->probed_bar[i] =
1515 q_res_req->probed_bar[i];
1516 }
1517 }
1518
1519 complete(&completion->host_event);
1520}
1521
1522/**
1523 * new_pcichild_device() - Create a new child device
1524 * @hbus: The internal struct tracking this root PCI bus.
1525 * @desc: The information supplied so far from the host
1526 * about the device.
1527 *
1528 * This function creates the tracking structure for a new child
1529 * device and kicks off the process of figuring out what it is.
1530 *
1531 * Return: Pointer to the new tracking struct
1532 */
1533static struct hv_pci_dev *new_pcichild_device(struct hv_pcibus_device *hbus,
1534 struct pci_function_description *desc)
1535{
1536 struct hv_pci_dev *hpdev;
1537 struct pci_child_message *res_req;
1538 struct q_res_req_compl comp_pkt;
1539 struct {
1540 struct pci_packet init_packet;
1541 u8 buffer[sizeof(struct pci_child_message)];
1542 } pkt;
1543 unsigned long flags;
1544 int ret;
1545
1546 hpdev = kzalloc(sizeof(*hpdev), GFP_ATOMIC);
1547 if (!hpdev)
1548 return NULL;
1549
1550 hpdev->hbus = hbus;
1551
1552 memset(&pkt, 0, sizeof(pkt));
1553 init_completion(&comp_pkt.host_event);
1554 comp_pkt.hpdev = hpdev;
1555 pkt.init_packet.compl_ctxt = &comp_pkt;
1556 pkt.init_packet.completion_func = q_resource_requirements;
1557 res_req = (struct pci_child_message *)&pkt.init_packet.message;
1558 res_req->message_type.type = PCI_QUERY_RESOURCE_REQUIREMENTS;
1559 res_req->wslot.slot = desc->win_slot.slot;
1560
1561 ret = vmbus_sendpacket(hbus->hdev->channel, res_req,
1562 sizeof(struct pci_child_message),
1563 (unsigned long)&pkt.init_packet,
1564 VM_PKT_DATA_INBAND,
1565 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
1566 if (ret)
1567 goto error;
1568
1569 if (wait_for_response(hbus->hdev, &comp_pkt.host_event))
1570 goto error;
1571
1572 hpdev->desc = *desc;
1573 refcount_set(&hpdev->refs, 1);
1574 get_pcichild(hpdev);
1575 spin_lock_irqsave(&hbus->device_list_lock, flags);
1576
1577 list_add_tail(&hpdev->list_entry, &hbus->children);
1578 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1579 return hpdev;
1580
1581error:
1582 kfree(hpdev);
1583 return NULL;
1584}
1585
1586/**
1587 * get_pcichild_wslot() - Find device from slot
1588 * @hbus: Root PCI bus, as understood by this driver
1589 * @wslot: Location on the bus
1590 *
1591 * This function looks up a PCI device and returns the internal
1592 * representation of it. It acquires a reference on it, so that
1593 * the device won't be deleted while somebody is using it. The
1594 * caller is responsible for calling put_pcichild() to release
1595 * this reference.
1596 *
1597 * Return: Internal representation of a PCI device
1598 */
1599static struct hv_pci_dev *get_pcichild_wslot(struct hv_pcibus_device *hbus,
1600 u32 wslot)
1601{
1602 unsigned long flags;
1603 struct hv_pci_dev *iter, *hpdev = NULL;
1604
1605 spin_lock_irqsave(&hbus->device_list_lock, flags);
1606 list_for_each_entry(iter, &hbus->children, list_entry) {
1607 if (iter->desc.win_slot.slot == wslot) {
1608 hpdev = iter;
1609 get_pcichild(hpdev);
1610 break;
1611 }
1612 }
1613 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1614
1615 return hpdev;
1616}
1617
1618/**
1619 * pci_devices_present_work() - Handle new list of child devices
1620 * @work: Work struct embedded in struct hv_dr_work
1621 *
1622 * "Bus Relations" is the Windows term for "children of this
1623 * bus." The terminology is preserved here for people trying to
1624 * debug the interaction between Hyper-V and Linux. This
1625 * function is called when the parent partition reports a list
1626 * of functions that should be observed under this PCI Express
1627 * port (bus).
1628 *
1629 * This function updates the list, and must tolerate being
1630 * called multiple times with the same information. The typical
1631 * number of child devices is one, with very atypical cases
1632 * involving three or four, so the algorithms used here can be
1633 * simple and inefficient.
1634 *
1635 * It must also treat the omission of a previously observed device as
1636 * notification that the device no longer exists.
1637 *
1638 * Note that this function is serialized with hv_eject_device_work(),
1639 * because both are pushed to the ordered workqueue hbus->wq.
1640 */
1641static void pci_devices_present_work(struct work_struct *work)
1642{
1643 u32 child_no;
1644 bool found;
1645 struct pci_function_description *new_desc;
1646 struct hv_pci_dev *hpdev;
1647 struct hv_pcibus_device *hbus;
1648 struct list_head removed;
1649 struct hv_dr_work *dr_wrk;
1650 struct hv_dr_state *dr = NULL;
1651 unsigned long flags;
1652
1653 dr_wrk = container_of(work, struct hv_dr_work, wrk);
1654 hbus = dr_wrk->bus;
1655 kfree(dr_wrk);
1656
1657 INIT_LIST_HEAD(&removed);
1658
1659 /* Pull this off the queue and process it if it was the last one. */
1660 spin_lock_irqsave(&hbus->device_list_lock, flags);
1661 while (!list_empty(&hbus->dr_list)) {
1662 dr = list_first_entry(&hbus->dr_list, struct hv_dr_state,
1663 list_entry);
1664 list_del(&dr->list_entry);
1665
1666 /* Throw this away if the list still has stuff in it. */
1667 if (!list_empty(&hbus->dr_list)) {
1668 kfree(dr);
1669 continue;
1670 }
1671 }
1672 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1673
1674 if (!dr) {
1675 put_hvpcibus(hbus);
1676 return;
1677 }
1678
1679 /* First, mark all existing children as reported missing. */
1680 spin_lock_irqsave(&hbus->device_list_lock, flags);
1681 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1682 hpdev->reported_missing = true;
1683 }
1684 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1685
1686 /* Next, add back any reported devices. */
1687 for (child_no = 0; child_no < dr->device_count; child_no++) {
1688 found = false;
1689 new_desc = &dr->func[child_no];
1690
1691 spin_lock_irqsave(&hbus->device_list_lock, flags);
1692 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1693 if ((hpdev->desc.win_slot.slot == new_desc->win_slot.slot) &&
1694 (hpdev->desc.v_id == new_desc->v_id) &&
1695 (hpdev->desc.d_id == new_desc->d_id) &&
1696 (hpdev->desc.ser == new_desc->ser)) {
1697 hpdev->reported_missing = false;
1698 found = true;
1699 }
1700 }
1701 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1702
1703 if (!found) {
1704 hpdev = new_pcichild_device(hbus, new_desc);
1705 if (!hpdev)
1706 dev_err(&hbus->hdev->device,
1707 "couldn't record a child device.\n");
1708 }
1709 }
1710
1711 /* Move missing children to a list on the stack. */
1712 spin_lock_irqsave(&hbus->device_list_lock, flags);
1713 do {
1714 found = false;
1715 list_for_each_entry(hpdev, &hbus->children, list_entry) {
1716 if (hpdev->reported_missing) {
1717 found = true;
1718 put_pcichild(hpdev);
1719 list_move_tail(&hpdev->list_entry, &removed);
1720 break;
1721 }
1722 }
1723 } while (found);
1724 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1725
1726 /* Delete everything that should no longer exist. */
1727 while (!list_empty(&removed)) {
1728 hpdev = list_first_entry(&removed, struct hv_pci_dev,
1729 list_entry);
1730 list_del(&hpdev->list_entry);
1731 put_pcichild(hpdev);
1732 }
1733
1734 switch (hbus->state) {
1735 case hv_pcibus_installed:
1736 /*
1737 * Tell the core to rescan bus
1738 * because there may have been changes.
1739 */
1740 pci_lock_rescan_remove();
1741 pci_scan_child_bus(hbus->pci_bus);
1742 pci_unlock_rescan_remove();
1743 break;
1744
1745 case hv_pcibus_init:
1746 case hv_pcibus_probed:
1747 survey_child_resources(hbus);
1748 break;
1749
1750 default:
1751 break;
1752 }
1753
1754 put_hvpcibus(hbus);
1755 kfree(dr);
1756}
1757
1758/**
1759 * hv_pci_devices_present() - Handles list of new children
1760 * @hbus: Root PCI bus, as understood by this driver
1761 * @relations: Packet from host listing children
1762 *
1763 * This function is invoked whenever a new list of devices for
1764 * this bus appears.
1765 */
1766static void hv_pci_devices_present(struct hv_pcibus_device *hbus,
1767 struct pci_bus_relations *relations)
1768{
1769 struct hv_dr_state *dr;
1770 struct hv_dr_work *dr_wrk;
1771 unsigned long flags;
1772 bool pending_dr;
1773
1774 dr_wrk = kzalloc(sizeof(*dr_wrk), GFP_NOWAIT);
1775 if (!dr_wrk)
1776 return;
1777
1778 dr = kzalloc(offsetof(struct hv_dr_state, func) +
1779 (sizeof(struct pci_function_description) *
1780 (relations->device_count)), GFP_NOWAIT);
1781 if (!dr) {
1782 kfree(dr_wrk);
1783 return;
1784 }
1785
1786 INIT_WORK(&dr_wrk->wrk, pci_devices_present_work);
1787 dr_wrk->bus = hbus;
1788 dr->device_count = relations->device_count;
1789 if (dr->device_count != 0) {
1790 memcpy(dr->func, relations->func,
1791 sizeof(struct pci_function_description) *
1792 dr->device_count);
1793 }
1794
1795 spin_lock_irqsave(&hbus->device_list_lock, flags);
1796 /*
1797 * If pending_dr is true, we have already queued a work,
1798 * which will see the new dr. Otherwise, we need to
1799 * queue a new work.
1800 */
1801 pending_dr = !list_empty(&hbus->dr_list);
1802 list_add_tail(&dr->list_entry, &hbus->dr_list);
1803 spin_unlock_irqrestore(&hbus->device_list_lock, flags);
1804
1805 if (pending_dr) {
1806 kfree(dr_wrk);
1807 } else {
1808 get_hvpcibus(hbus);
1809 queue_work(hbus->wq, &dr_wrk->wrk);
1810 }
1811}
1812
1813/**
1814 * hv_eject_device_work() - Asynchronously handles ejection
1815 * @work: Work struct embedded in internal device struct
1816 *
1817 * This function handles ejecting a device. Windows will
1818 * attempt to gracefully eject a device, waiting 60 seconds to
1819 * hear back from the guest OS that this completed successfully.
1820 * If this timer expires, the device will be forcibly removed.
1821 */
1822static void hv_eject_device_work(struct work_struct *work)
1823{
1824 struct pci_eject_response *ejct_pkt;
1825 struct hv_pci_dev *hpdev;
1826 struct pci_dev *pdev;
1827 unsigned long flags;
1828 int wslot;
1829 struct {
1830 struct pci_packet pkt;
1831 u8 buffer[sizeof(struct pci_eject_response)];
1832 } ctxt;
1833
1834 hpdev = container_of(work, struct hv_pci_dev, wrk);
1835
1836 WARN_ON(hpdev->state != hv_pcichild_ejecting);
1837
1838 /*
1839 * Ejection can come before or after the PCI bus has been set up, so
1840 * attempt to find it and tear down the bus state, if it exists. This
1841 * must be done without constructs like pci_domain_nr(hbus->pci_bus)
1842 * because hbus->pci_bus may not exist yet.
1843 */
1844 wslot = wslot_to_devfn(hpdev->desc.win_slot.slot);
1845 pdev = pci_get_domain_bus_and_slot(hpdev->hbus->sysdata.domain, 0,
1846 wslot);
1847 if (pdev) {
1848 pci_lock_rescan_remove();
1849 pci_stop_and_remove_bus_device(pdev);
1850 pci_dev_put(pdev);
1851 pci_unlock_rescan_remove();
1852 }
1853
1854 spin_lock_irqsave(&hpdev->hbus->device_list_lock, flags);
1855 list_del(&hpdev->list_entry);
1856 spin_unlock_irqrestore(&hpdev->hbus->device_list_lock, flags);
1857
1858 memset(&ctxt, 0, sizeof(ctxt));
1859 ejct_pkt = (struct pci_eject_response *)&ctxt.pkt.message;
1860 ejct_pkt->message_type.type = PCI_EJECTION_COMPLETE;
1861 ejct_pkt->wslot.slot = hpdev->desc.win_slot.slot;
1862 vmbus_sendpacket(hpdev->hbus->hdev->channel, ejct_pkt,
1863 sizeof(*ejct_pkt), (unsigned long)&ctxt.pkt,
1864 VM_PKT_DATA_INBAND, 0);
1865
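	/*
	 * The two puts are intended to drop the reference taken in
	 * hv_pci_eject_device() and the reference held for the
	 * hbus->children entry unlinked above; put_hvpcibus() balances
	 * the get_hvpcibus() performed when this work was queued.
	 */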
1866 put_pcichild(hpdev);
1867 put_pcichild(hpdev);
1868 put_hvpcibus(hpdev->hbus);
1869}
1870
1871/**
1872 * hv_pci_eject_device() - Handles device ejection
1873 * @hpdev: Internal device tracking struct
1874 *
1875 * This function is invoked when an ejection packet arrives. It
1876 * just schedules work so that we don't re-enter the packet
1877 * delivery code handling the ejection.
1878 */
1879static void hv_pci_eject_device(struct hv_pci_dev *hpdev)
1880{
1881 hpdev->state = hv_pcichild_ejecting;
1882 get_pcichild(hpdev);
1883 INIT_WORK(&hpdev->wrk, hv_eject_device_work);
1884 get_hvpcibus(hpdev->hbus);
1885 queue_work(hpdev->hbus->wq, &hpdev->wrk);
1886}
1887
1888/**
1889 * hv_pci_onchannelcallback() - Handles incoming packets
1890 * @context: Internal bus tracking struct
1891 *
1892 * This function is invoked whenever the host sends a packet to
1893 * this channel (which is private to this root PCI bus).
1894 */
1895static void hv_pci_onchannelcallback(void *context)
1896{
1897 const int packet_size = 0x100;
1898 int ret;
1899 struct hv_pcibus_device *hbus = context;
1900 u32 bytes_recvd;
1901 u64 req_id;
1902 struct vmpacket_descriptor *desc;
1903 unsigned char *buffer;
1904 int bufferlen = packet_size;
1905 struct pci_packet *comp_packet;
1906 struct pci_response *response;
1907 struct pci_incoming_message *new_message;
1908 struct pci_bus_relations *bus_rel;
1909 struct pci_dev_incoming *dev_message;
1910 struct hv_pci_dev *hpdev;
1911
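	/*
	 * The VMBus channel callback can run in interrupt context, so
	 * any allocation here must not sleep (GFP_ATOMIC).
	 */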
1912 buffer = kmalloc(bufferlen, GFP_ATOMIC);
1913 if (!buffer)
1914 return;
1915
1916 while (1) {
1917 ret = vmbus_recvpacket_raw(hbus->hdev->channel, buffer,
1918 bufferlen, &bytes_recvd, &req_id);
1919
1920 if (ret == -ENOBUFS) {
1921 kfree(buffer);
1922 /* Handle large packet */
1923 bufferlen = bytes_recvd;
1924 buffer = kmalloc(bytes_recvd, GFP_ATOMIC);
1925 if (!buffer)
1926 return;
1927 continue;
1928 }
1929
1930 /* Zero length indicates there are no more packets. */
1931 if (ret || !bytes_recvd)
1932 break;
1933
1934 /*
1935 * All incoming packets must be at least as large as a
1936 * response.
1937 */
1938 if (bytes_recvd <= sizeof(struct pci_response))
1939 continue;
1940 desc = (struct vmpacket_descriptor *)buffer;
1941
1942 switch (desc->type) {
1943 case VM_PKT_COMP:
1944
1945 /*
1946 * The host is trusted, and thus it's safe to interpret
1947 * this transaction ID as a pointer.
1948 */
1949 comp_packet = (struct pci_packet *)req_id;
1950 response = (struct pci_response *)buffer;
1951 comp_packet->completion_func(comp_packet->compl_ctxt,
1952 response,
1953 bytes_recvd);
1954 break;
1955
1956 case VM_PKT_DATA_INBAND:
1957
1958 new_message = (struct pci_incoming_message *)buffer;
1959 switch (new_message->message_type.type) {
1960 case PCI_BUS_RELATIONS:
1961
1962 bus_rel = (struct pci_bus_relations *)buffer;
1963 if (bytes_recvd <
1964 offsetof(struct pci_bus_relations, func) +
1965 (sizeof(struct pci_function_description) *
1966 (bus_rel->device_count))) {
1967 dev_err(&hbus->hdev->device,
1968 "bus relations too small\n");
1969 break;
1970 }
1971
1972 hv_pci_devices_present(hbus, bus_rel);
1973 break;
1974
1975 case PCI_EJECT:
1976
1977 dev_message = (struct pci_dev_incoming *)buffer;
1978 hpdev = get_pcichild_wslot(hbus,
1979 dev_message->wslot.slot);
1980 if (hpdev) {
1981 hv_pci_eject_device(hpdev);
1982 put_pcichild(hpdev);
1983 }
1984 break;
1985
1986 default:
1987 dev_warn(&hbus->hdev->device,
1988 "Unimplemented protocol message %x\n",
1989 new_message->message_type.type);
1990 break;
1991 }
1992 break;
1993
1994 default:
1995 dev_err(&hbus->hdev->device,
1996 "unhandled packet type %d, tid %llx len %d\n",
1997 desc->type, req_id, bytes_recvd);
1998 break;
1999 }
2000 }
2001
2002 kfree(buffer);
2003}
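/*
 * Illustrative sketch, mirroring the sender-side pattern used by
 * hv_pci_protocol_negotiation() and hv_pci_enter_d0() below: requests
 * that expect a completion pass the address of their struct pci_packet
 * as the VMBus transaction ID, which comes back to the VM_PKT_COMP
 * case above as req_id:
 *
 *	pkt->completion_func = hv_pci_generic_compl;
 *	pkt->compl_ctxt = &comp_pkt;
 *	vmbus_sendpacket(hdev->channel, &pkt->message, size,
 *			 (unsigned long)pkt, VM_PKT_DATA_INBAND,
 *			 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
 */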
2004
2005/**
2006 * hv_pci_protocol_negotiation() - Set up protocol
2007 * @hdev: VMBus's tracking struct for this root PCI bus
2008 *
2009 * This driver is intended to support running on Windows 10
2010 * (server) and later versions. It will not run on earlier
2011 * versions, as they assume that many of the operations which
2012 * Linux needs accomplished with a spinlock held were done through
2013 * asynchronous messaging over VMBus. Windows 10 increases the
2014 * surface area of PCI emulation so that these actions can take
2015 * place by suspending a virtual processor for their duration.
2016 *
2017 * This function negotiates the channel protocol version,
2018 * failing if the host doesn't support the necessary protocol
2019 * level.
2020 */
2021static int hv_pci_protocol_negotiation(struct hv_device *hdev)
2022{
2023 struct pci_version_request *version_req;
2024 struct hv_pci_compl comp_pkt;
2025 struct pci_packet *pkt;
2026 int ret;
2027 int i;
2028
2029 /*
2030 * Initiate the handshake with the host and negotiate
2031 * a version that the host can support. We start with the
2032 * highest version number and go down if the host cannot
2033 * support it.
2034 */
2035 pkt = kzalloc(sizeof(*pkt) + sizeof(*version_req), GFP_KERNEL);
2036 if (!pkt)
2037 return -ENOMEM;
2038
2039 init_completion(&comp_pkt.host_event);
2040 pkt->completion_func = hv_pci_generic_compl;
2041 pkt->compl_ctxt = &comp_pkt;
2042 version_req = (struct pci_version_request *)&pkt->message;
2043 version_req->message_type.type = PCI_QUERY_PROTOCOL_VERSION;
2044
2045 for (i = 0; i < ARRAY_SIZE(pci_protocol_versions); i++) {
2046 version_req->protocol_version = pci_protocol_versions[i];
2047 ret = vmbus_sendpacket(hdev->channel, version_req,
2048 sizeof(struct pci_version_request),
2049 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2050 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2051 if (!ret)
2052 ret = wait_for_response(hdev, &comp_pkt.host_event);
2053
2054 if (ret) {
2055 dev_err(&hdev->device,
2056				"PCI Pass-through VSP failed to request version: %d\n",
2057 ret);
2058 goto exit;
2059 }
2060
2061 if (comp_pkt.completion_status >= 0) {
2062 pci_protocol_version = pci_protocol_versions[i];
2063 dev_info(&hdev->device,
2064 "PCI VMBus probing: Using version %#x\n",
2065 pci_protocol_version);
2066 goto exit;
2067 }
2068
2069 if (comp_pkt.completion_status != STATUS_REVISION_MISMATCH) {
2070 dev_err(&hdev->device,
2071				"PCI Pass-through VSP failed version request: %#x\n",
2072 comp_pkt.completion_status);
2073 ret = -EPROTO;
2074 goto exit;
2075 }
2076
2077 reinit_completion(&comp_pkt.host_event);
2078 }
2079
2080 dev_err(&hdev->device,
2081		"PCI Pass-through VSP failed to find supported version\n");
2082 ret = -EPROTO;
2083
2084exit:
2085 kfree(pkt);
2086 return ret;
2087}
2088
2089/**
2090 * hv_pci_free_bridge_windows() - Release memory regions for the
2091 * bus
2092 * @hbus: Root PCI bus, as understood by this driver
2093 */
2094static void hv_pci_free_bridge_windows(struct hv_pcibus_device *hbus)
2095{
2096 /*
2097 * Set the resources back to the way they looked when they
2098 * were allocated by setting IORESOURCE_BUSY again.
2099 */
2100
2101 if (hbus->low_mmio_space && hbus->low_mmio_res) {
2102 hbus->low_mmio_res->flags |= IORESOURCE_BUSY;
2103 vmbus_free_mmio(hbus->low_mmio_res->start,
2104 resource_size(hbus->low_mmio_res));
2105 }
2106
2107 if (hbus->high_mmio_space && hbus->high_mmio_res) {
2108 hbus->high_mmio_res->flags |= IORESOURCE_BUSY;
2109 vmbus_free_mmio(hbus->high_mmio_res->start,
2110 resource_size(hbus->high_mmio_res));
2111 }
2112}
2113
2114/**
2115 * hv_pci_allocate_bridge_windows() - Allocate memory regions
2116 * for the bus
2117 * @hbus: Root PCI bus, as understood by this driver
2118 *
2119 * This function calls vmbus_allocate_mmio(), which is itself a
2120 * bit of a compromise. Ideally, we might change the pnp layer
2121 * in the kernel so that it comprehends either PCI devices
2122 * which are "grandchildren of ACPI," with some intermediate bus
2123 * node (in this case, VMBus), or VMBus itself. The pnp layer,
2124 * however, has been declared deprecated and is not subject to
2125 * change.
2126 *
2127 * The workaround, implemented here, is to ask VMBus to allocate
2128 * MMIO space for this bus. VMBus itself knows which ranges are
2129 * appropriate by looking at its own ACPI objects. Then, after
2130 * these ranges are claimed, they're modified to look like they
2131 * would have looked if the ACPI and pnp code had allocated
2132 * bridge windows. These descriptors have to exist in this form
2133 * in order to satisfy the code which will get invoked when the
2134 * endpoint PCI function driver calls request_mem_region() or
2135 * request_mem_region_exclusive().
2136 *
2137 * Return: 0 on success, -errno on failure
2138 */
2139static int hv_pci_allocate_bridge_windows(struct hv_pcibus_device *hbus)
2140{
2141 resource_size_t align;
2142 int ret;
2143
2144 if (hbus->low_mmio_space) {
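		/*
		 * Use the largest power of two that does not exceed the
		 * window size as its alignment: __builtin_clzll()
		 * counts the leading zero bits of the 64-bit size.
		 */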
2145 align = 1ULL << (63 - __builtin_clzll(hbus->low_mmio_space));
2146 ret = vmbus_allocate_mmio(&hbus->low_mmio_res, hbus->hdev, 0,
2147 (u64)(u32)0xffffffff,
2148 hbus->low_mmio_space,
2149 align, false);
2150 if (ret) {
2151 dev_err(&hbus->hdev->device,
2152 "Need %#llx of low MMIO space. Consider reconfiguring the VM.\n",
2153 hbus->low_mmio_space);
2154 return ret;
2155 }
2156
2157 /* Modify this resource to become a bridge window. */
2158 hbus->low_mmio_res->flags |= IORESOURCE_WINDOW;
2159 hbus->low_mmio_res->flags &= ~IORESOURCE_BUSY;
2160 pci_add_resource(&hbus->resources_for_children,
2161 hbus->low_mmio_res);
2162 }
2163
2164 if (hbus->high_mmio_space) {
2165 align = 1ULL << (63 - __builtin_clzll(hbus->high_mmio_space));
2166 ret = vmbus_allocate_mmio(&hbus->high_mmio_res, hbus->hdev,
2167 0x100000000, -1,
2168 hbus->high_mmio_space, align,
2169 false);
2170 if (ret) {
2171 dev_err(&hbus->hdev->device,
2172 "Need %#llx of high MMIO space. Consider reconfiguring the VM.\n",
2173 hbus->high_mmio_space);
2174 goto release_low_mmio;
2175 }
2176
2177 /* Modify this resource to become a bridge window. */
2178 hbus->high_mmio_res->flags |= IORESOURCE_WINDOW;
2179 hbus->high_mmio_res->flags &= ~IORESOURCE_BUSY;
2180 pci_add_resource(&hbus->resources_for_children,
2181 hbus->high_mmio_res);
2182 }
2183
2184 return 0;
2185
2186release_low_mmio:
2187 if (hbus->low_mmio_res) {
2188 vmbus_free_mmio(hbus->low_mmio_res->start,
2189 resource_size(hbus->low_mmio_res));
2190 }
2191
2192 return ret;
2193}
2194
2195/**
2196 * hv_allocate_config_window() - Find MMIO space for PCI Config
2197 * @hbus: Root PCI bus, as understood by this driver
2198 *
2199 * This function claims memory-mapped I/O space for accessing
2200 * configuration space for the functions on this bus.
2201 *
2202 * Return: 0 on success, -errno on failure
2203 */
2204static int hv_allocate_config_window(struct hv_pcibus_device *hbus)
2205{
2206 int ret;
2207
2208 /*
2209 * Set up a region of MMIO space to use for accessing configuration
2210 * space.
2211 */
2212 ret = vmbus_allocate_mmio(&hbus->mem_config, hbus->hdev, 0, -1,
2213 PCI_CONFIG_MMIO_LENGTH, 0x1000, false);
2214 if (ret)
2215 return ret;
2216
2217 /*
2218 * vmbus_allocate_mmio() gets used for allocating both device endpoint
2219 * resource claims (those which cannot be overlapped) and the ranges
2220 * which are valid for the children of this bus, which are intended
2221 * to be overlapped by those children. Set the flag on this claim
2222 * meaning that this region can't be overlapped.
2223 */
2224
2225 hbus->mem_config->flags |= IORESOURCE_BUSY;
2226
2227 return 0;
2228}
2229
2230static void hv_free_config_window(struct hv_pcibus_device *hbus)
2231{
2232 vmbus_free_mmio(hbus->mem_config->start, PCI_CONFIG_MMIO_LENGTH);
2233}
2234
2235/**
2236 * hv_pci_enter_d0() - Bring the "bus" into the D0 power state
2237 * @hdev: VMBus's tracking struct for this root PCI bus
2238 *
2239 * Return: 0 on success, -errno on failure
2240 */
2241static int hv_pci_enter_d0(struct hv_device *hdev)
2242{
2243 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2244 struct pci_bus_d0_entry *d0_entry;
2245 struct hv_pci_compl comp_pkt;
2246 struct pci_packet *pkt;
2247 int ret;
2248
2249 /*
2250 * Tell the host that the bus is ready to use, and moved into the
2251 * powered-on state. This includes telling the host which region
2252 * of memory-mapped I/O space has been chosen for configuration space
2253 * access.
2254 */
2255 pkt = kzalloc(sizeof(*pkt) + sizeof(*d0_entry), GFP_KERNEL);
2256 if (!pkt)
2257 return -ENOMEM;
2258
2259 init_completion(&comp_pkt.host_event);
2260 pkt->completion_func = hv_pci_generic_compl;
2261 pkt->compl_ctxt = &comp_pkt;
2262 d0_entry = (struct pci_bus_d0_entry *)&pkt->message;
2263 d0_entry->message_type.type = PCI_BUS_D0ENTRY;
2264 d0_entry->mmio_base = hbus->mem_config->start;
2265
2266 ret = vmbus_sendpacket(hdev->channel, d0_entry, sizeof(*d0_entry),
2267 (unsigned long)pkt, VM_PKT_DATA_INBAND,
2268 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2269 if (!ret)
2270 ret = wait_for_response(hdev, &comp_pkt.host_event);
2271
2272 if (ret)
2273 goto exit;
2274
2275 if (comp_pkt.completion_status < 0) {
2276 dev_err(&hdev->device,
2277 "PCI Pass-through VSP failed D0 Entry with status %x\n",
2278 comp_pkt.completion_status);
2279 ret = -EPROTO;
2280 goto exit;
2281 }
2282
2283 ret = 0;
2284
2285exit:
2286 kfree(pkt);
2287 return ret;
2288}
2289
2290/**
2291 * hv_pci_query_relations() - Ask host to send list of child
2292 * devices
2293 * @hdev: VMBus's tracking struct for this root PCI bus
2294 *
2295 * Return: 0 on success, -errno on failure
2296 */
2297static int hv_pci_query_relations(struct hv_device *hdev)
2298{
2299 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2300 struct pci_message message;
2301 struct completion comp;
2302 int ret;
2303
2304 /* Ask the host to send along the list of child devices */
2305 init_completion(&comp);
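	/*
	 * Publish the completion via hbus->survey_event; if one is
	 * already installed, a previous survey is still in flight and
	 * this query must not race with it.
	 */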
2306 if (cmpxchg(&hbus->survey_event, NULL, &comp))
2307 return -ENOTEMPTY;
2308
2309 memset(&message, 0, sizeof(message));
2310 message.type = PCI_QUERY_BUS_RELATIONS;
2311
2312 ret = vmbus_sendpacket(hdev->channel, &message, sizeof(message),
2313 0, VM_PKT_DATA_INBAND, 0);
2314 if (!ret)
2315 ret = wait_for_response(hdev, &comp);
2316
2317 return ret;
2318}
2319
2320/**
2321 * hv_send_resources_allocated() - Report local resource choices
2322 * @hdev: VMBus's tracking struct for this root PCI bus
2323 *
2324 * The host OS is expecting to be sent a request as a message
2325 * which contains all the resources that the device will use.
2326 * The response contains those same resources, "translated"
2327 * The response contains those same resources, "translated,"
2328 * which is to say, the values which should be used by the
2329 * hardware when it delivers an interrupt. (MMIO resources are
2330 * with the FDO/PDO split, which doesn't exist in Linux. Linux
2331 * is deeply expecting to scan an emulated PCI configuration
2332 * space. So this message is sent here only to drive the state
2333 * machine on the host forward.
2334 *
2335 * Return: 0 on success, -errno on failure
2336 */
2337static int hv_send_resources_allocated(struct hv_device *hdev)
2338{
2339 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2340 struct pci_resources_assigned *res_assigned;
2341 struct pci_resources_assigned2 *res_assigned2;
2342 struct hv_pci_compl comp_pkt;
2343 struct hv_pci_dev *hpdev;
2344 struct pci_packet *pkt;
2345 size_t size_res;
2346 u32 wslot;
2347 int ret;
2348
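	/* Protocol 1.2 and later report resources with the larger
	 * PCI_RESOURCES_ASSIGNED2 message.
	 */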
2349 size_res = (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2)
2350 ? sizeof(*res_assigned) : sizeof(*res_assigned2);
2351
2352 pkt = kmalloc(sizeof(*pkt) + size_res, GFP_KERNEL);
2353 if (!pkt)
2354 return -ENOMEM;
2355
2356 ret = 0;
2357
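	/* Walk all 256 possible Windows slot numbers; only slots that
	 * currently have a child device produce a message.
	 */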
2358 for (wslot = 0; wslot < 256; wslot++) {
2359 hpdev = get_pcichild_wslot(hbus, wslot);
2360 if (!hpdev)
2361 continue;
2362
2363 memset(pkt, 0, sizeof(*pkt) + size_res);
2364 init_completion(&comp_pkt.host_event);
2365 pkt->completion_func = hv_pci_generic_compl;
2366 pkt->compl_ctxt = &comp_pkt;
2367
2368 if (pci_protocol_version < PCI_PROTOCOL_VERSION_1_2) {
2369 res_assigned =
2370 (struct pci_resources_assigned *)&pkt->message;
2371 res_assigned->message_type.type =
2372 PCI_RESOURCES_ASSIGNED;
2373 res_assigned->wslot.slot = hpdev->desc.win_slot.slot;
2374 } else {
2375 res_assigned2 =
2376 (struct pci_resources_assigned2 *)&pkt->message;
2377 res_assigned2->message_type.type =
2378 PCI_RESOURCES_ASSIGNED2;
2379 res_assigned2->wslot.slot = hpdev->desc.win_slot.slot;
2380 }
2381 put_pcichild(hpdev);
2382
2383 ret = vmbus_sendpacket(hdev->channel, &pkt->message,
2384 size_res, (unsigned long)pkt,
2385 VM_PKT_DATA_INBAND,
2386 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2387 if (!ret)
2388 ret = wait_for_response(hdev, &comp_pkt.host_event);
2389 if (ret)
2390 break;
2391
2392 if (comp_pkt.completion_status < 0) {
2393 ret = -EPROTO;
2394 dev_err(&hdev->device,
2395				"resource allocation returned 0x%x\n",
2396 comp_pkt.completion_status);
2397 break;
2398 }
2399 }
2400
2401 kfree(pkt);
2402 return ret;
2403}
2404
2405/**
2406 * hv_send_resources_released() - Report local resources
2407 * released
2408 * @hdev: VMBus's tracking struct for this root PCI bus
2409 *
2410 * Return: 0 on success, -errno on failure
2411 */
2412static int hv_send_resources_released(struct hv_device *hdev)
2413{
2414 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2415 struct pci_child_message pkt;
2416 struct hv_pci_dev *hpdev;
2417 u32 wslot;
2418 int ret;
2419
2420 for (wslot = 0; wslot < 256; wslot++) {
2421 hpdev = get_pcichild_wslot(hbus, wslot);
2422 if (!hpdev)
2423 continue;
2424
2425 memset(&pkt, 0, sizeof(pkt));
2426 pkt.message_type.type = PCI_RESOURCES_RELEASED;
2427 pkt.wslot.slot = hpdev->desc.win_slot.slot;
2428
2429 put_pcichild(hpdev);
2430
2431 ret = vmbus_sendpacket(hdev->channel, &pkt, sizeof(pkt), 0,
2432 VM_PKT_DATA_INBAND, 0);
2433 if (ret)
2434 return ret;
2435 }
2436
2437 return 0;
2438}
2439
2440static void get_hvpcibus(struct hv_pcibus_device *hbus)
2441{
2442 refcount_inc(&hbus->remove_lock);
2443}
2444
2445static void put_hvpcibus(struct hv_pcibus_device *hbus)
2446{
2447 if (refcount_dec_and_test(&hbus->remove_lock))
2448 complete(&hbus->remove_event);
2449}
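/*
 * Note that remove_lock is a refcount rather than a lock:
 * hv_pci_probe() initializes it to 1, each queued work item takes a
 * reference through get_hvpcibus(), and hv_pci_remove() blocks on
 * remove_event until the final put drops it to zero.
 */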
2450
2451/**
2452 * hv_pci_probe() - New VMBus channel probe, for a root PCI bus
2453 * @hdev: VMBus's tracking struct for this root PCI bus
2454 * @dev_id: Identifies the device itself
2455 *
2456 * Return: 0 on success, -errno on failure
2457 */
2458static int hv_pci_probe(struct hv_device *hdev,
2459 const struct hv_vmbus_device_id *dev_id)
2460{
2461 struct hv_pcibus_device *hbus;
2462 int ret;
2463
2464 /*
2465 * hv_pcibus_device contains the hypercall arguments for retargeting in
2466 * hv_irq_unmask(). Those must not cross a page boundary.
2467 */
2468 BUILD_BUG_ON(sizeof(*hbus) > PAGE_SIZE);
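	/*
	 * get_zeroed_page() returns page-aligned memory, so a structure
	 * that fits in PAGE_SIZE cannot straddle a page boundary.
	 */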
2469
2470 hbus = (struct hv_pcibus_device *)get_zeroed_page(GFP_KERNEL);
2471 if (!hbus)
2472 return -ENOMEM;
2473 hbus->state = hv_pcibus_init;
2474
2475 /*
2476 * The PCI bus "domain" is what is called "segment" in ACPI and
2477 * other specs. Pull it from the instance ID, to get something
2478	 * unique. Bytes 8 and 9 are what Windows guests use, so do the
2479	 * same here for consistency. Note that, since this code
2480 * only runs in a Hyper-V VM, Hyper-V can (and does) guarantee
2481 * that (1) the only domain in use for something that looks like
2482 * a physical PCI bus (which is actually emulated by the
2483 * hypervisor) is domain 0 and (2) there will be no overlap
2484 * between domains derived from these instance IDs in the same
2485 * VM.
2486 */
2487 hbus->sysdata.domain = hdev->dev_instance.b[9] |
2488 hdev->dev_instance.b[8] << 8;
2489
2490 hbus->hdev = hdev;
2491 refcount_set(&hbus->remove_lock, 1);
2492 INIT_LIST_HEAD(&hbus->children);
2493 INIT_LIST_HEAD(&hbus->dr_list);
2494 INIT_LIST_HEAD(&hbus->resources_for_children);
2495 spin_lock_init(&hbus->config_lock);
2496 spin_lock_init(&hbus->device_list_lock);
2497 spin_lock_init(&hbus->retarget_msi_interrupt_lock);
2498 init_completion(&hbus->remove_event);
2499 hbus->wq = alloc_ordered_workqueue("hv_pci_%x", 0,
2500 hbus->sysdata.domain);
2501 if (!hbus->wq) {
2502 ret = -ENOMEM;
2503 goto free_bus;
2504 }
2505
2506 ret = vmbus_open(hdev->channel, pci_ring_size, pci_ring_size, NULL, 0,
2507 hv_pci_onchannelcallback, hbus);
2508 if (ret)
2509 goto destroy_wq;
2510
2511 hv_set_drvdata(hdev, hbus);
2512
2513 ret = hv_pci_protocol_negotiation(hdev);
2514 if (ret)
2515 goto close;
2516
2517 ret = hv_allocate_config_window(hbus);
2518 if (ret)
2519 goto close;
2520
2521 hbus->cfg_addr = ioremap(hbus->mem_config->start,
2522 PCI_CONFIG_MMIO_LENGTH);
2523 if (!hbus->cfg_addr) {
2524 dev_err(&hdev->device,
2525 "Unable to map a virtual address for config space\n");
2526 ret = -ENOMEM;
2527 goto free_config;
2528 }
2529
2530 hbus->sysdata.fwnode = irq_domain_alloc_fwnode(hbus);
2531 if (!hbus->sysdata.fwnode) {
2532 ret = -ENOMEM;
2533 goto unmap;
2534 }
2535
2536 ret = hv_pcie_init_irq_domain(hbus);
2537 if (ret)
2538 goto free_fwnode;
2539
2540 ret = hv_pci_query_relations(hdev);
2541 if (ret)
2542 goto free_irq_domain;
2543
2544 ret = hv_pci_enter_d0(hdev);
2545 if (ret)
2546 goto free_irq_domain;
2547
2548 ret = hv_pci_allocate_bridge_windows(hbus);
2549 if (ret)
2550 goto free_irq_domain;
2551
2552 ret = hv_send_resources_allocated(hdev);
2553 if (ret)
2554 goto free_windows;
2555
2556 prepopulate_bars(hbus);
2557
2558 hbus->state = hv_pcibus_probed;
2559
2560 ret = create_root_hv_pci_bus(hbus);
2561 if (ret)
2562 goto free_windows;
2563
2564 return 0;
2565
2566free_windows:
2567 hv_pci_free_bridge_windows(hbus);
2568free_irq_domain:
2569 irq_domain_remove(hbus->irq_domain);
2570free_fwnode:
2571 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2572unmap:
2573 iounmap(hbus->cfg_addr);
2574free_config:
2575 hv_free_config_window(hbus);
2576close:
2577 vmbus_close(hdev->channel);
2578destroy_wq:
2579 destroy_workqueue(hbus->wq);
2580free_bus:
2581 free_page((unsigned long)hbus);
2582 return ret;
2583}
2584
2585static void hv_pci_bus_exit(struct hv_device *hdev)
2586{
2587 struct hv_pcibus_device *hbus = hv_get_drvdata(hdev);
2588 struct {
2589 struct pci_packet teardown_packet;
2590 u8 buffer[sizeof(struct pci_message)];
2591 } pkt;
2592 struct pci_bus_relations relations;
2593 struct hv_pci_compl comp_pkt;
2594 int ret;
2595
2596 /*
2597 * After the host sends the RESCIND_CHANNEL message, it doesn't
2598 * access the per-channel ringbuffer any longer.
2599 */
2600 if (hdev->channel->rescind)
2601 return;
2602
2603 /* Delete any children which might still exist. */
2604 memset(&relations, 0, sizeof(relations));
2605 hv_pci_devices_present(hbus, &relations);
2606
2607 ret = hv_send_resources_released(hdev);
2608 if (ret)
2609 dev_err(&hdev->device,
2610 "Couldn't send resources released packet(s)\n");
2611
2612 memset(&pkt.teardown_packet, 0, sizeof(pkt.teardown_packet));
2613 init_completion(&comp_pkt.host_event);
2614 pkt.teardown_packet.completion_func = hv_pci_generic_compl;
2615 pkt.teardown_packet.compl_ctxt = &comp_pkt;
2616 pkt.teardown_packet.message[0].type = PCI_BUS_D0EXIT;
2617
2618 ret = vmbus_sendpacket(hdev->channel, &pkt.teardown_packet.message,
2619 sizeof(struct pci_message),
2620 (unsigned long)&pkt.teardown_packet,
2621 VM_PKT_DATA_INBAND,
2622 VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
2623 if (!ret)
2624 wait_for_completion_timeout(&comp_pkt.host_event, 10 * HZ);
2625}
2626
2627/**
2628 * hv_pci_remove() - Remove routine for this VMBus channel
2629 * @hdev: VMBus's tracking struct for this root PCI bus
2630 *
2631 * Return: 0 on success, -errno on failure
2632 */
2633static int hv_pci_remove(struct hv_device *hdev)
2634{
2635 struct hv_pcibus_device *hbus;
2636
2637 hbus = hv_get_drvdata(hdev);
2638 if (hbus->state == hv_pcibus_installed) {
2639 /* Remove the bus from PCI's point of view. */
2640 pci_lock_rescan_remove();
2641 pci_stop_root_bus(hbus->pci_bus);
2642 pci_remove_root_bus(hbus->pci_bus);
2643 pci_unlock_rescan_remove();
2644 hbus->state = hv_pcibus_removed;
2645 }
2646
2647 hv_pci_bus_exit(hdev);
2648
2649 vmbus_close(hdev->channel);
2650
2651 iounmap(hbus->cfg_addr);
2652 hv_free_config_window(hbus);
2653 pci_free_resource_list(&hbus->resources_for_children);
2654 hv_pci_free_bridge_windows(hbus);
2655 irq_domain_remove(hbus->irq_domain);
2656 irq_domain_free_fwnode(hbus->sysdata.fwnode);
2657 put_hvpcibus(hbus);
2658 wait_for_completion(&hbus->remove_event);
2659 destroy_workqueue(hbus->wq);
2660 free_page((unsigned long)hbus);
2661 return 0;
2662}
2663
2664static const struct hv_vmbus_device_id hv_pci_id_table[] = {
2665 /* PCI Pass-through Class ID */
2666 /* 44C4F61D-4444-4400-9D52-802E27EDE19F */
2667 { HV_PCIE_GUID, },
2668 { },
2669};
2670
2671MODULE_DEVICE_TABLE(vmbus, hv_pci_id_table);
2672
2673static struct hv_driver hv_pci_drv = {
2674 .name = "hv_pci",
2675 .id_table = hv_pci_id_table,
2676 .probe = hv_pci_probe,
2677 .remove = hv_pci_remove,
2678};
2679
2680static void __exit exit_hv_pci_drv(void)
2681{
2682 vmbus_driver_unregister(&hv_pci_drv);
2683}
2684
2685static int __init init_hv_pci_drv(void)
2686{
2687 return vmbus_driver_register(&hv_pci_drv);
2688}
2689
2690module_init(init_hv_pci_drv);
2691module_exit(exit_hv_pci_drv);
2692
2693MODULE_DESCRIPTION("Hyper-V PCI");
2694MODULE_LICENSE("GPL v2");