summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorYashomati <ygodbole@nvidia.com>2019-05-31 21:59:52 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2019-12-24 14:56:43 -0500
commit87dc30edda5936afa82b0afa821c8be2e44343c5 (patch)
treee1f61e27e96e88880626426db82dbe21c85e6053 /drivers
parentcda3f78dc40d0f21b1108a4087b6198fb53bde02 (diff)
inject-vm-err: handlers for injected errors
If Linux/EBP causes an error that HV can't handle, then instead of freezing the guest, HV injects the error back into the guest. This enables the guest to handle the error as gracefully as it can/needs. This changeset provides 2 parts: 1. sample handlers: minimal placeholder handlers that just dump the error information on to the console. This is to be used as a reference for any customized elaborate error handling that may be needed. 2. library module: it comes into existence only if/when any error handler is registered. Its main responsibilities: - map memory that's shared with HV where HV dumps all information about the errors. - register handlers for interrupts used by HV to inject errors - invoke custom error handlers when HV injects error JIRA ESV-312 Bug 2580803 Change-Id: Ia8c6484d423fd33cabbfd901f0f6ebb0da95cb40 Signed-off-by: Yashomati <ygodbole@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2214402 Reviewed-on: https://git-master.nvidia.com/r/2128765 GVS: Gerrit_Virtual_Submit Reviewed-by: Dmitry Pervushin <dpervushin@nvidia.com> Reviewed-by: Hardik T Shah <hardikts@nvidia.com> Reviewed-by: Rohit Upadhyay <rupadhyay@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/virt/tegra/Kconfig9
-rw-r--r--drivers/virt/tegra/Makefile3
-rw-r--r--drivers/virt/tegra/vm_err.c535
-rw-r--r--drivers/virt/tegra/vm_err_sample_handler.c315
4 files changed, 862 insertions, 0 deletions
diff --git a/drivers/virt/tegra/Kconfig b/drivers/virt/tegra/Kconfig
index a29ffe03f..222ea3eb9 100644
--- a/drivers/virt/tegra/Kconfig
+++ b/drivers/virt/tegra/Kconfig
@@ -50,3 +50,12 @@ config TEGRA_HV_SYSFS
50 Can be made a module (=m) to save boot time 50 Can be made a module (=m) to save boot time
51 If unsure, say Y here 51 If unsure, say Y here
52 52
53config TEGRA_VM_ERR_HANDLER
54 tristate "Nvidia Tegra handler for VM error notifications"
55 depends on TEGRA_VIRTUALIZATION
56 default y
57 help
58 Provides a handler that receives VM error notifications
59 from the Hypervisor.
60 If unsure, keep Y
61
diff --git a/drivers/virt/tegra/Makefile b/drivers/virt/tegra/Makefile
index c03414695..c36d2c8b0 100644
--- a/drivers/virt/tegra/Makefile
+++ b/drivers/virt/tegra/Makefile
@@ -8,3 +8,6 @@ obj-$(CONFIG_TEGRA_HV_MANAGER) += tegra_hv.o ivc-cdev.o
8obj-$(CONFIG_TEGRA_HV_MANAGER) += userspace_ivc_mempool.o 8obj-$(CONFIG_TEGRA_HV_MANAGER) += userspace_ivc_mempool.o
9obj-$(CONFIG_TEGRA_HV_SYSFS) += hvc_sysfs.o 9obj-$(CONFIG_TEGRA_HV_SYSFS) += hvc_sysfs.o
10obj-$(CONFIG_TEGRA_HV_WDT_HANDLER) += tegra_hv_wdt_handler.o 10obj-$(CONFIG_TEGRA_HV_WDT_HANDLER) += tegra_hv_wdt_handler.o
11
12obj-$(CONFIG_TEGRA_VM_ERR_HANDLER) += vm_err.o
13obj-$(CONFIG_TEGRA_VM_ERR_HANDLER) += vm_err_sample_handler.o
diff --git a/drivers/virt/tegra/vm_err.c b/drivers/virt/tegra/vm_err.c
new file mode 100644
index 000000000..d9f11248c
--- /dev/null
+++ b/drivers/virt/tegra/vm_err.c
@@ -0,0 +1,535 @@
1/*
2 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#define pr_fmt(fmt) "vm-err: " fmt
14
15#include <linux/interrupt.h>
16#include <linux/of_irq.h>
17#include <linux/platform_device.h>
18#include <linux/vm_err.h>
19#include <asm/traps.h>
20#include <asm-generic/irq_regs.h>
21#include <asm/system_misc.h>
22#include <soc/tegra/virt/syscalls.h>
23#include <soc/tegra/chip-id.h>
24
/* Per-device state for the VM error-notification library. */
struct tegra_hv_err_ctrl {
	struct device *dev;			/* bound platform device */
	struct errInfo *err_info;		/* mapped HV-shared error memory */
	unsigned int async_err_arr_items;	/* slots in the async error ring */
	int hv_peer_err_irq_id;			/* HV-provided IRQ id; -1 if none */
	unsigned int vcpu_cnt;			/* VCPUs in this VM (one sync slot each) */
	struct serr_hook hook;			/* SError hook for synchronous errors */
	struct vm_err_handlers handlers;	/* registered custom error handlers */
};
34
/* Own guest id and guest count, filled in by hyp_config_init(). */
static struct tegra_hv_config config;

/* Cells of the dynamically added "interrupts" DT property:
 * <irq-type irq-number trigger-flags>.
 */
static unsigned int intr_info[3]; /* intr_property_size = 3 */

/* Property object added to the device node by virq_handler_init(). */
static struct property interrupts_prop = {
	.name = "interrupts",
};
42
/* Inspect this VCPU's synchronous-error slot in HV-shared memory.
 *
 * On return, *send_sync_err_ack tells the caller whether an ack must be
 * sent to the hypervisor.  Returns true when the guest should enter bad
 * mode, false when execution may continue.
 */
static bool check_sync_err(const unsigned int vcpu_id,
		const struct tegra_hv_err_ctrl *const ctrl,
		bool *send_sync_err_ack)
{
	uint64_t rd_idx;
	const struct errData *err_data;

	if (vcpu_id >= ctrl->vcpu_cnt) {
		dev_crit(ctrl->dev, "%s: Invalid vcpu id %u\n", __func__,
			vcpu_id);
		*send_sync_err_ack = false;
		/* Unexpected vcpu id. Enter bad mode. */
		return true;
	}

	/* Shared memory layout is:
	 * |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
	 * Size of async errors array = Max errors + 1(to avoid same empty
	 * and full conditions of the buffer)
	 * Size of sync errors array = 1 error per VCPU * number of VCPUs in VM
	 */
	rd_idx = ctrl->async_err_arr_items + vcpu_id;
	/* It's already validated at init time that sufficient memory is
	 * allocated to hold async_err_arr_items + sync error per vcpu. Hence,
	 * after validating the vcpu_id above, no need to validate rd_idx here.
	 */
	err_data = &(ctrl->err_info->errData[rd_idx]);
	if (!err_data->sync_dataAbort.isFilled) {
		*send_sync_err_ack = false;
		dev_info(ctrl->dev, "No synchronous error data on vcpu %u\n",
			vcpu_id);
		/* No sync error. No need to enter bad mode. */
		return false;
	}

	if (err_data->errType != SYNC) {
		dev_crit(ctrl->dev, "%s: unexpected error Type %d\n",
			__func__, err_data->errType);
		*send_sync_err_ack = true;
		/* Unexpected error id. Enter bad mode. */
		return true;
	}

	if (err_data->offendingGuestId != config.guest_id_self) {
		dev_crit(ctrl->dev, "%s: invalid offender id %u\n", __func__,
			err_data->offendingGuestId);
		*send_sync_err_ack = true;
		/* Invalid id of offending guest. Enter bad mode. */
		return true;
	}
	dev_err(ctrl->dev, "Synchronous error on vcpu %u\n", vcpu_id);

	if (ctrl->handlers.fn_self_sync) {
		*send_sync_err_ack = true;
		/* Enter bad_mode (or otherwise) as custom handler dictates */
		return ctrl->handlers.fn_self_sync(err_data);
	}

	/* NOTE(review): this point IS reachable — it is taken whenever no
	 * fn_self_sync handler has been registered. Ack and enter bad mode.
	 */
	*send_sync_err_ack = true;
	/* Reaching here is unexpected. Enter bad mode. */
	return true;
}
106
/* IRQ handler for HV-injected asynchronous errors.
 *
 * Drains the async error ring in shared memory (rdIdx..wrIdx), dispatches
 * each entry to the self/peer handler based on the offending guest id,
 * acks the batch to HV, and enters bad mode if any handler (or a failed
 * ack) demands it.  Only VCPU0 is expected to receive this vIRQ.
 */
static irqreturn_t async_err_handler(int irq, void *context)
{
	unsigned int num_async_errs_read = 0;
	bool enter_bad_mode = false;
	const struct tegra_hv_err_ctrl *const ctrl = context;
	const unsigned int vcpu_id = hyp_read_vcpu_id();
	uint64_t local_rd_idx, next_rd_idx;
	const struct errData *err_data;
	bool (*fn_self_async)(const struct errData *const err_data);
	bool (*fn_peer)(const struct errData *const err_data);
	bool (*handler)(const struct errData *const err_data);
	struct pt_regs *regs;

	if (vcpu_id != 0) {
		dev_err(ctrl->dev, "Asynchronous error on vcpu %u\n", vcpu_id);
		/* Only VCPU0 is expected to receive async error vIRQ */
		return IRQ_HANDLED;
	}

	/* Snapshot the handlers once; they may be registered lazily. */
	fn_self_async = ctrl->handlers.fn_self_async;
	fn_peer = ctrl->handlers.fn_peer;

	if ((fn_self_async == NULL) && (fn_peer == NULL)) {
		dev_err(ctrl->dev, "Asynchronous error handlers absent\n");
		return IRQ_HANDLED;
	}

	local_rd_idx = ctrl->err_info->async_metaData.rdIdx;
	dev_dbg(ctrl->dev, "Local Rd Idx = %llu, shared Wr Idx = %llu\n",
		local_rd_idx, ctrl->err_info->async_metaData.wrIdx);

	/* Check async error. Read until error queue gets empty */
	while (local_rd_idx != ctrl->err_info->async_metaData.wrIdx) {
		/* Ring advance: one slot past rd, wrapping at array size */
		next_rd_idx = (local_rd_idx + 1) % ctrl->async_err_arr_items;

		err_data = &(ctrl->err_info->errData[next_rd_idx]);
		if (err_data->offendingGuestId == config.guest_id_self)
			handler = fn_self_async;
		else
			handler = fn_peer;

		if (handler) {
			if (handler(err_data) == true)
				enter_bad_mode = true;
		}

		local_rd_idx = next_rd_idx;
		num_async_errs_read++;
		dev_dbg(ctrl->dev, "Local Rd Idx = %llu\n", local_rd_idx);
	}

	if (num_async_errs_read) {
		dev_err(ctrl->dev, "%u asynchronous error(s) read\n",
			num_async_errs_read);

		/* Send ack for async error(s) to HV */
		if (hyp_send_async_err_ack(local_rd_idx) != 0) {
			dev_crit(ctrl->dev,
				"%s: Sending ack failed. Setting bad mode\n",
				__func__);
			/* Unexpected */
			enter_bad_mode = true;
		}
	}

	if (enter_bad_mode) {
		regs = get_irq_regs();
		die("Oops - bad mode", regs, 0);
		panic("bad mode");
	}

	return IRQ_HANDLED;
}
180
181static int sync_err_handler(struct pt_regs *regs, int reason,
182 uint32_t esr, void *context)
183{
184 bool enter_bad_mode = false;
185 bool send_sync_err_ack = false;
186 const struct tegra_hv_err_ctrl *const ctrl = context;
187 const unsigned int vcpu_id = hyp_read_vcpu_id();
188
189 /* Check sync error */
190 if (check_sync_err(vcpu_id, ctrl, &send_sync_err_ack) == true)
191 enter_bad_mode = true;
192
193 /* Send ack for error to HV. */
194 if (send_sync_err_ack) {
195 if (hyp_send_sync_err_ack(send_sync_err_ack) != 0) {
196 dev_crit(ctrl->dev,
197 "%s: Sending ack failed. Setting bad mode\n",
198 __func__);
199 /* Unexpected */
200 enter_bad_mode = true;
201 }
202 }
203
204 /* Caller expects 0 to enter bad mode */
205 return (!enter_bad_mode);
206}
207
208void tegra_hv_get_config(struct tegra_hv_config *cfg)
209{
210 cfg->guest_id_self = config.guest_id_self;
211 cfg->num_guests = config.num_guests;
212}
213EXPORT_SYMBOL(tegra_hv_get_config);
214
215static int virq_handler_init(const struct platform_device *pdev)
216{
217 int ret;
218 struct irq_data *peer_err_irq_data;
219 int lin_peer_err_irq_id;
220 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
221 struct device dev = pdev->dev;
222
223 dev_info(ctrl->dev, "Error notification HV IRQ id: %d\n",
224 ctrl->hv_peer_err_irq_id);
225
226 /* Ensure HV returned valid irq */
227 if (ctrl->hv_peer_err_irq_id == -1)
228 return 0;
229
230 /* Set indicate irq type 0 to indicate Shared Peripheral Irq */
231 intr_info[0] = cpu_to_be32(0);
232 /* Id in SPI namespace - subtract number of PPIs
233 * (Private Peripheral Irqs) which is = 32
234 */
235 intr_info[1] = cpu_to_be32(ctrl->hv_peer_err_irq_id - 32);
236 /* Trigger irq on low-to-high edge (0x1) */
237 intr_info[2] = cpu_to_be32(IRQF_TRIGGER_RISING);
238
239 interrupts_prop.length = sizeof(intr_info);
240 dev_info(ctrl->dev, "interrupts_prop.length %u\n",
241 interrupts_prop.length);
242
243 interrupts_prop.value = intr_info;
244
245 if (of_add_property(dev.of_node, &interrupts_prop)) {
246 dev_err(ctrl->dev, "%s: failed to add interrupts property\n",
247 __func__);
248 return -EACCES;
249 }
250
251 lin_peer_err_irq_id = of_irq_get(dev.of_node, 0);
252 if (lin_peer_err_irq_id < 0) {
253 dev_err(ctrl->dev, "%s: Unable to get Linux irq for id %d\n",
254 __func__, ctrl->hv_peer_err_irq_id);
255 return lin_peer_err_irq_id;
256 }
257
258 peer_err_irq_data = irq_get_irq_data(lin_peer_err_irq_id);
259 if (peer_err_irq_data == NULL) {
260 dev_err(ctrl->dev, "%s: Failed to get data for Linux irq %d\n",
261 __func__, lin_peer_err_irq_id);
262 return -ENODEV;
263 }
264
265 ret = devm_request_irq(&dev, lin_peer_err_irq_id, async_err_handler,
266 IRQ_NOTHREAD, dev_name(&dev), ctrl);
267 if (ret < 0) {
268 dev_err(ctrl->dev,
269 "%s: failed to register IRQ %d, Err %d, %s\n",
270 __func__, lin_peer_err_irq_id, ret, pdev->name);
271 return ret;
272 }
273 dev_info(ctrl->dev, "Registered Linux IRQ %d for peer notification\n",
274 lin_peer_err_irq_id);
275
276 return 0;
277}
278
279static int serr_handler_init(struct platform_device *pdev)
280{
281 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
282
283 ctrl->hook.fn = sync_err_handler;
284 ctrl->hook.priv = platform_get_drvdata(pdev);
285 register_serr_hook(&ctrl->hook);
286
287 return 0;
288}
289
/* Query HV for the error-info shared memory parameters, validate its
 * size against the declared ring/VCPU layout, and ioremap it into
 * ctrl->err_info.  Returns 0 on success or a negative error code.
 */
static int shared_mem_map(struct platform_device *pdev)
{
	uint64_t ipa, buff_size, required_size;
	int ret;
	struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);

	/* Get error info details */
	ret = hyp_read_err_info_get(&ipa, &buff_size,
		&ctrl->async_err_arr_items, &ctrl->hv_peer_err_irq_id,
		&ctrl->vcpu_cnt);
	if (ret != 0) {
		/* It could come here if DTS and defconfig enable execution
		 * of this code, but HV hasn't implemented the hypercall.
		 * Flag error.
		 */
		dev_err(ctrl->dev,
			"%s: failed to get err memory address. Err %d\n",
			__func__, ret);
		return -ENODEV;
	}

	if ((ipa == 0) || (buff_size == 0) ||
		(ctrl->async_err_arr_items == 0)) {
		/* It could come here if DTS and defconfig enable execution
		 * of this code, but PCT hasn't enabled error injection.
		 * A warning should suffice.
		 */
		dev_warn(ctrl->dev, "%s: invalid shared memory parameters\n",
			__func__);
		return -ENOMEM;
	}

	/* Shared memory layout is:
	 * |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
	 * Size of async errors array = Max errors + 1 (to avoid same empty and
	 * full conditions of the buffer)
	 * Size of sync errors array = 1 error per VCPU * number of VCPUs on
	 * a VM
	 */
	required_size = sizeof(struct async_metaData) +
		(sizeof(struct errData) *
		(ctrl->async_err_arr_items + ctrl->vcpu_cnt));
	if (buff_size < required_size) {
		dev_err(ctrl->dev,
			"%s:invalid params. size %llu. required size %llu\n",
			__func__, buff_size, required_size);
		dev_err(ctrl->dev, "%s: async arr size %u. vcpus %u\n",
			__func__, ctrl->async_err_arr_items, ctrl->vcpu_cnt);
		return -ENOMEM;
	}

	dev_info(ctrl->dev, "%s: Err info IPA for guest %u: 0x%llx\n",
		__func__, config.guest_id_self, ipa);
	dev_info(ctrl->dev, "Err info buf size 0x%llX\n", buff_size);
	dev_info(ctrl->dev, "Async err arr size %u. Number of VCPUs %u\n",
		ctrl->async_err_arr_items, ctrl->vcpu_cnt);

	/* Map shared memory (cacheable; HV writes, guest reads) */
	ctrl->err_info = (struct errInfo *) ioremap_cache(ipa, buff_size);
	if (ctrl->err_info == NULL)
		return -ENOMEM;

	return 0;
}
354
355static int hyp_config_init(struct device *dev)
356{
357 int ret = hyp_read_gid(&config.guest_id_self);
358
359 if (ret != 0) {
360 dev_err(dev, "%s: failed to read guest id. Err %d\n",
361 __func__, ret);
362 return ret;
363 }
364
365 ret = hyp_read_nguests(&config.num_guests);
366 if (ret != 0) {
367 /* Only privileged guest can query number of guests */
368 dev_warn(dev, "%s: can't read number of guests. Err %d\n",
369 __func__, ret);
370 }
371
372 dev_info(dev, "%s: guest id %u num guests %u\n", __func__,
373 config.guest_id_self, config.num_guests);
374
375 return 0;
376}
377
378static void shared_structs_check(struct device *dev)
379{
380 /* Ensure coherency with common header */
381 BUILD_BUG_ON(REASON_ENUM_SIZE != (ARRAY_SIZE(fault_reason_desc)));
382
383 /* Manually compare these sizes with HV console dump to ensure
384 * common structures shared by HV and Linux are in sync
385 */
386 dev_info(dev, "async_metaData size 0x%lx\n",
387 sizeof(struct async_metaData));
388 dev_info(dev, "async_bridgeErr size 0x%lx\n",
389 sizeof(struct async_bridgeErr));
390 dev_info(dev, "async_smmuErr size 0x%lx\n",
391 sizeof(struct async_smmuErr));
392 dev_info(dev, "async_mcErr size 0x%lx\n",
393 sizeof(struct async_mcErr));
394 dev_info(dev, "sync_dataAbort size 0x%lx\n",
395 sizeof(struct sync_dataAbort));
396 dev_info(dev, "errData size 0x%lx\n", sizeof(struct errData));
397}
398
399static int vm_err_handler_init(struct platform_device *pdev)
400{
401 int ret;
402 struct tegra_hv_err_ctrl *ctrl;
403 struct device *dev = &pdev->dev;
404
405 if (!is_tegra_hypervisor_mode()) {
406 dev_err(dev, "%s: hypervisor is not present\n", __func__);
407 return -ENODEV;
408 }
409
410 shared_structs_check(dev);
411
412 ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL);
413 if (!ctrl)
414 return -ENOMEM;
415
416 ctrl->dev = dev;
417 platform_set_drvdata(pdev, ctrl);
418
419 ret = hyp_config_init(dev);
420 if (ret)
421 return ret;
422
423 ret = shared_mem_map(pdev);
424 if (ret)
425 return -ENOMEM;
426
427 ret = serr_handler_init(pdev);
428 if (ret)
429 return ret;
430
431 ret = virq_handler_init(pdev);
432 if (ret)
433 return ret;
434
435 return 0;
436}
437
438static int vm_err_handler_remove(struct platform_device *pdev)
439{
440 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
441 struct device_node *node = pdev->dev.of_node;
442
443 if (of_remove_property(node,
444 of_find_property(node, "interrupts", NULL))) {
445 dev_err(ctrl->dev, "%s: failed to add interrupts property\n",
446 __func__);
447 return -EACCES;
448 }
449
450 unregister_serr_hook(&ctrl->hook);
451 iounmap(ctrl->err_info);
452
453 dev_info(ctrl->dev, "%s: cleaned up and unregistered handler\n",
454 __func__);
455
456 return 0;
457}
458
/* Device-tree match table: a "nvidia,tegra-hv-err" node binds this driver. */
static const struct of_device_id tegra_hv_err_match[] = {
	{ .compatible = "nvidia,tegra-hv-err", .data = NULL},
	{},
};

/* Registered lazily by tegra_hv_register_vm_err_hooks(), not at boot. */
static struct platform_driver tegra_hv_err_pdriver = {
	.driver = {
		.name = "tegra-hv-err-handler",
		.owner = THIS_MODULE,
		.of_match_table = of_match_ptr(tegra_hv_err_match),
	},
	.probe = vm_err_handler_init,
	.remove = vm_err_handler_remove,
};
473
474static int tegra_hv_register_hooks_for_device(struct device *dev,
475 void *handlers)
476{
477 struct tegra_hv_err_ctrl *ctrl;
478 const struct platform_device *pd = container_of(dev,
479 struct platform_device, dev);
480 const struct vm_err_handlers *_handlers =
481 (struct vm_err_handlers *) handlers;
482
483 ctrl = platform_get_drvdata(pd);
484 if (!ctrl) {
485 dev_err(dev, "%s: no platform data", __func__);
486 return 0;
487 }
488
489 if (ctrl->handlers.fn_self_async == NULL)
490 ctrl->handlers.fn_self_async = _handlers->fn_self_async;
491
492 if (ctrl->handlers.fn_self_sync == NULL)
493 ctrl->handlers.fn_self_sync = _handlers->fn_self_sync;
494
495 if (ctrl->handlers.fn_peer == NULL)
496 ctrl->handlers.fn_peer = _handlers->fn_peer;
497
498 return 0;
499}
500
501int tegra_hv_register_vm_err_hooks(struct vm_err_handlers *handlers)
502{
503 int ret;
504
505 if (!handlers) {
506 pr_err("%s: invalid error handlers\n", __func__);
507 return 1;
508 }
509
510 if (!handlers->fn_self_async && !handlers->fn_self_sync
511 && !handlers->fn_peer) {
512 platform_driver_unregister(&tegra_hv_err_pdriver);
513 return 0;
514 }
515
516 if (!tegra_hv_err_pdriver.driver.p) {
517 /* Not registered/bound yet */
518 ret = platform_driver_register(&tegra_hv_err_pdriver);
519 if (ret) {
520 pr_err("%s: failed to register driver. Err %d\n",
521 __func__, ret);
522 return ret;
523 }
524 }
525
526 ret = driver_for_each_device(&tegra_hv_err_pdriver.driver, NULL,
527 handlers, tegra_hv_register_hooks_for_device);
528 if (ret) {
529 pr_err("%s: failed to attach driver. Err %d\n", __func__, ret);
530 return ret;
531 }
532
533 return 0;
534}
535EXPORT_SYMBOL(tegra_hv_register_vm_err_hooks);
diff --git a/drivers/virt/tegra/vm_err_sample_handler.c b/drivers/virt/tegra/vm_err_sample_handler.c
new file mode 100644
index 000000000..fea81363e
--- /dev/null
+++ b/drivers/virt/tegra/vm_err_sample_handler.c
@@ -0,0 +1,315 @@
1/*
2 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#define pr_fmt(fmt) "vm-err-sample-handler: " fmt
14
15#include <linux/module.h>
16#include <linux/vm_err.h>
17
/* Bridge error details:
 * Note: These are redefined here only to allow user friendly messages
 * describing the error.
 * This must match with "Timeout error" value in t18x_axi_errors[]
 * in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_ERROR_TIMEOUT = 18;

/* This must match with "CCPLEX" value in src_ids
 * in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_SRC_ID_CCPLEX = 1;

/* This must match with corresponding HV definition in pct.h */
static const unsigned int GUEST_UNASSIGNED = 18;

/* Hook table handed to the vm_err library, and the guest configuration
 * read back from it after registration.
 */
static struct vm_err_handlers handlers;
static struct tegra_hv_config config;
36
37static void print_bridge_error(const struct errData * const err_data)
38{
39 const struct async_bridgeErr * const br_err_data =
40 &err_data->async_bridgeErr;
41 unsigned int protection;
42
43 pr_crit("Bridge error details\n");
44 pr_crit("--------------------------------------\n");
45 pr_crit("Err count %d: %s FAULT ADDR 0x%x status1 0x%x status2 0x%x\n",
46 br_err_data->count, br_err_data->br_name, br_err_data->err_addr,
47 br_err_data->err_status1, br_err_data->err_status2);
48
49 pr_crit("\tDirection: %s\n", br_err_data->rw ? "READ" : "WRITE");
50 pr_crit("\tBridge ID: 0x%x\n", br_err_data->br_id);
51 pr_crit("\tError type: %u %s\n",
52 br_err_data->err_type,
53 (br_err_data->err_type == BRIDGE_ERROR_TIMEOUT) ?
54 "(Timeout)" : "");
55
56 pr_crit("\tLength: %d\n", br_err_data->length);
57 protection = br_err_data->protection;
58 pr_crit("\tProtection: 0x%x %s %s %s access\n", protection,
59 (protection & 0x4) ? "Instruction" : "Data",
60 (protection & 0x2) ? "Non-Secure" : "Secure",
61 (protection & 0x1) ? "Privileged" : "Unprivileged");
62
63 pr_crit("\tSource ID: 0x%x -- %s\n",
64 br_err_data->src_id,
65 (br_err_data->src_id == BRIDGE_SRC_ID_CCPLEX) ?
66 " (CCPLEX)" : "");
67
68 pr_crit("\tAXI_ID: 0x%x\n", br_err_data->axi_id);
69 pr_crit("\tCache: 0x%x\n", br_err_data->cache);
70 pr_crit("\tBurst: 0x%x\n", br_err_data->burst);
71 pr_crit("--------------------------------------\n");
72}
73
74static void print_smmu_error(const struct errData * const err_data,
75 const enum errReason reason)
76{
77 const struct async_smmuErr * const smmu_err_data =
78 &err_data->async_smmuErr;
79
80 pr_crit("SMMU error details\n");
81 pr_crit("--------------------------------------\n");
82 if (reason == REASON_ASYNC_SMMU_CB) {
83 pr_crit("SMMU Context Bank %u error. StreamID: %d\n",
84 smmu_err_data->cb_id, smmu_err_data->stream_id);
85 } else if (reason == REASON_ASYNC_SMMU_GLOBAL) {
86 pr_crit("Global SMMU fault. CB: %u. StreamID: %d\n",
87 smmu_err_data->cb_id, smmu_err_data->stream_id);
88 } else {
89 pr_crit("Unexpected fault reason %d\n", reason);
90 }
91 pr_crit("FSR: 0x%x; FAR: 0x%llx; FSYND0: 0x%x; FSYND1: 0x%x\n",
92 smmu_err_data->fsr, smmu_err_data->far,
93 smmu_err_data->fsynr0, smmu_err_data->fsynr1);
94 pr_crit("--------------------------------------\n");
95}
96
97static void print_mc_error(const struct errData * const err_data)
98{
99 const struct async_mcErr * const mc_err_data = &err_data->async_mcErr;
100
101 pr_crit("Memory Controller error details\n");
102 pr_crit("--------------------------------------\n");
103 pr_crit("mc_err: base: 0x%llx, int_status: 0x%08x; err_status: 0x%08x;"
104 " fault_addr: 0x%llx\n",
105 mc_err_data->ch_base, mc_err_data->int_status,
106 mc_err_data->err_status, mc_err_data->fault_addr);
107 pr_crit("vcpuid %u, client_id %u, peripheral_id %d\n",
108 mc_err_data->vcpuid, mc_err_data->client_id,
109 mc_err_data->peripheral_id);
110 pr_crit("--------------------------------------\n");
111}
112
113static void print_data_abort(const struct errData *const err_data)
114{
115 const struct sync_dataAbort * const data_abort =
116 &err_data->sync_dataAbort;
117
118 pr_crit("Data abort details\n");
119 pr_crit("--------------------------------------\n");
120 pr_crit("offending VCpu Id %u\n", data_abort->offendingVCpuId);
121 (data_abort->isWrite) ?
122 pr_crit("write access\n") : pr_crit("read access\n");
123 pr_crit("access size %u\n", data_abort->accessSize);
124 pr_crit("fault address: 0x%llx\n", data_abort->faultAddr);
125 pr_crit("esr: 0x%x\n", data_abort->esrEl2);
126 pr_crit("spsr_el2: 0x%llx\n", data_abort->spsrEl2);
127 pr_crit("elr_el1: 0x%llx\n", data_abort->elrEl1);
128 pr_crit("gprArray[0]: 0x%llx\n", data_abort->gprArray[0]);
129 pr_crit("gprArray[15]: 0x%llx\n", data_abort->gprArray[15]);
130 pr_crit("gprArray[30]: 0x%llx\n", data_abort->gprArray[30]);
131 pr_crit("--------------------------------------\n");
132}
133
134static bool handle_async_err_details(const struct errData * const err_data)
135{
136 bool enter_bad_mode;
137
138 if (err_data->errType != ASYNC) {
139 pr_crit("%s: incorrect error type: %d\n", __func__,
140 err_data->errType);
141 /* Unexpected error type. Enter bad mode. */
142 return true;
143 }
144
145 pr_info("%s: error reason: %s\n", __func__,
146 fault_reason_desc[err_data->errReason]);
147 switch (err_data->errReason) {
148 case REASON_ASYNC_BRIDGE:
149 print_bridge_error(err_data);
150 /* Bridge error may not be fatal */
151 enter_bad_mode = false;
152 break;
153
154 case REASON_ASYNC_SMMU_CB:
155 print_smmu_error(err_data, err_data->errReason);
156 /* SMMU context bank error may not be fatal */
157 enter_bad_mode = false;
158 break;
159
160 case REASON_ASYNC_SMMU_GLOBAL:
161 print_smmu_error(err_data, err_data->errReason);
162 /* Can't recover from global SMMU error. */
163 enter_bad_mode = true;
164 break;
165
166 case REASON_ASYNC_MC:
167 print_mc_error(err_data);
168 enter_bad_mode = false;
169 break;
170
171 default:
172 pr_crit("%s: unhandled error. Reason id %d\n", __func__,
173 err_data->errReason);
174 enter_bad_mode = true;
175 break;
176 }
177
178 return enter_bad_mode;
179}
180
181static bool handle_sync_err_details(const struct errData * const err_data)
182{
183 /* Currently only data abort error injection is supported */
184 if (err_data->errReason != REASON_SYNC_DATA_ABORT) {
185 pr_crit("%s: unexpected reason id %u\n", __func__,
186 err_data->errReason);
187 /* Invalid reason. Enter bad mode. */
188 return true;
189 }
190 pr_info("%s: error reason: %s\n", __func__,
191 fault_reason_desc[err_data->errReason]);
192 print_data_abort(err_data);
193
194 /* Recovery from sync error could be impossible. Enter bad mode. */
195 return true;
196}
197
/* Dump details of an error attributed to a peer guest (or to no guest
 * at all).  Returns true when the guest should enter bad mode: for an
 * invalid offender id, an out-of-range reason, or an unattributable
 * (GUEST_UNASSIGNED) error; peer-attributed errors otherwise return
 * false, since the offending guest — not this one — must react.
 */
static bool handle_peer_err_details(const struct errData * const err_data)
{
	bool enter_bad_mode;
	const unsigned int offender = err_data->offendingGuestId;

	if (offender >= config.num_guests) {
		if (offender != GUEST_UNASSIGNED) {
			pr_crit("%s: invalid offending peer guest id %u\n",
				__func__, offender);
			/* Unexpected. Cause reboot. */
			return true;
		}
		pr_crit("%s: HV can't attribute error to any guest\n",
			__func__);
	} else
		pr_crit("Peer error. Offending guest id = %u\n", offender);

	pr_crit("Error Type: %s\n", (err_data->errType == SYNC) ?
		"Synchronous" : "Asynchronous");

	/* Validate before indexing fault_reason_desc[] */
	if (err_data->errReason >= REASON_ENUM_SIZE) {
		pr_crit("%s: unexpected reason id %u\n", __func__,
			err_data->errReason);
		/* Unexpected. Cause reboot. */
		return true;
	}
	pr_crit("%s: error reason: %s\n", __func__,
		fault_reason_desc[err_data->errReason]);

	switch (err_data->errReason) {
	case REASON_ASYNC_BRIDGE:
		print_bridge_error(err_data);
		enter_bad_mode = false;
		break;

	case REASON_ASYNC_SMMU_CB:
	case REASON_ASYNC_SMMU_GLOBAL:
		print_smmu_error(err_data, err_data->errReason);
		enter_bad_mode = false;
		break;

	case REASON_ASYNC_MC:
		print_mc_error(err_data);
		enter_bad_mode = false;
		break;

	case REASON_SYNC_DATA_ABORT:
		print_data_abort(err_data);
		enter_bad_mode = false;
		break;

	default:
		pr_crit("%s: unhandled error. Reason id %d\n", __func__,
			err_data->errReason);
		enter_bad_mode = false;
		break;
	}

	/* An unattributable error may be ours: be conservative */
	if (offender == GUEST_UNASSIGNED)
		enter_bad_mode = true;

	return enter_bad_mode;
}
261
/* Hook entry points registered with the vm_err library.  Each returns
 * true when the guest should enter bad mode.
 */
static bool self_async_err_handler(const struct errData *const err_data)
{
	return handle_async_err_details(err_data);
}

static bool self_sync_err_handler(const struct errData *const err_data)
{
	return handle_sync_err_details(err_data);
}

static bool peer_err_handler(const struct errData *const err_data)
{
	return handle_peer_err_details(err_data);
}
276
277static int hooks_init(void)
278{
279 int ret;
280
281 handlers.fn_self_async = self_async_err_handler;
282 handlers.fn_self_sync = self_sync_err_handler;
283 handlers.fn_peer =
284 IS_ENABLED(CONFIG_TEGRA_EBP) ? NULL : peer_err_handler;
285
286 ret = tegra_hv_register_vm_err_hooks(&handlers);
287 if (ret)
288 return ret;
289
290 tegra_hv_get_config(&config);
291 pr_info("%s: Guest Id %u\n", __func__, config.guest_id_self);
292
293 /* EBP, being unprivileged, doesn't know about total guests */
294 if (IS_ENABLED(CONFIG_TEGRA_EBP) == 0)
295 pr_info("%s: Total guests %u\n", __func__, config.num_guests);
296
297 return 0;
298}
299
300static void hooks_exit(void)
301{
302 struct vm_err_handlers handlers;
303
304 handlers.fn_self_async = NULL;
305 handlers.fn_self_sync = NULL;
306 handlers.fn_peer = NULL;
307
308 tegra_hv_register_vm_err_hooks(&handlers);
309}
/* subsys_initcall: register early so error handlers are in place before
 * most drivers probe.
 */
subsys_initcall(hooks_init);
module_exit(hooks_exit);

MODULE_AUTHOR("Nvidia Corporation");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Sample VM Error Handler");