inject-vm-err: handlers for injected errors

If Linux/EBP causes an error that HV can't handle, then instead of freezing the guest, HV injects the error back into the guest. This enables the guest to handle the error as gracefully as it can/needs.

This changeset provides 2 parts:

1. sample handlers: minimal placeholder handlers that just dump the error information onto the console. This is to be used as a reference for any customized, elaborate error handling that may be needed.
2. library module: it comes into existence only if/when any error handler is registered. Its main responsibilities:
   - map the memory that's shared with HV, where HV dumps all information about the errors.
   - register handlers for the interrupts used by HV to inject errors.
   - invoke custom error handlers when HV injects an error.

JIRA ESV-312
Bug 2580803

Change-Id: Ia8c6484d423fd33cabbfd901f0f6ebb0da95cb40
Signed-off-by: Yashomati <ygodbole@nvidia.com>
Reviewed-on: https://git-master.nvidia.com/r/2214402
Reviewed-on: https://git-master.nvidia.com/r/2128765
GVS: Gerrit_Virtual_Submit
Reviewed-by: Dmitry Pervushin <dpervushin@nvidia.com>
Reviewed-by: Hardik T Shah <hardikts@nvidia.com>
Reviewed-by: Rohit Upadhyay <rupadhyay@nvidia.com>
Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com>
Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
author: Yashomati <ygodbole@nvidia.com> 2019-05-31 21:59:52 -0400
committer: mobile promotions <svcmobile_promotions@nvidia.com> 2019-12-24 14:56:43 -0500
commit: 87dc30edda5936afa82b0afa821c8be2e44343c5 (patch)
tree: e1f61e27e96e88880626426db82dbe21c85e6053 /include/linux
parent: cda3f78dc40d0f21b1108a4087b6198fb53bde02 (diff)
2 files changed, 182 insertions, 0 deletions
diff --git a/include/linux/errinfo.h b/include/linux/errinfo.h
new file mode 100644
index 000000000..eca3a9bcb
--- /dev/null
+++ b/include/linux/errinfo.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ */
+#ifndef __INCLUDED_ERRINFO_H__
+#define __INCLUDED_ERRINFO_H__
+enum errReason {        /* reason code carried in errData.errReason */
+        REASON_UNDEFINED = 0UL, /* entries line up, in order, with fault_reason_desc[] in vm_err.h */
+        REASON_ASYNC_SMMU_CB,
+        REASON_ASYNC_SMMU_GLOBAL,
+        REASON_ASYNC_BRIDGE,
+        REASON_ASYNC_MC,
+        REASON_SYNC_INSTR_ABORT,
+        REASON_SYNC_DATA_ABORT,
+        REASON_SYNC_OTHER,
+        REASON_ENUM_SIZE        /* last entry; its value (8) equals the number of reasons above */
+};
+enum errType {          /* carried in errData.errType: synchronous vs. asynchronous error */
+        SYNC = 0UL,
+        ASYNC           /* implicitly 1 */
+};
+struct __attribute__((__packed__)) async_metaData {     /* first member of struct errInfo */
+        uint64_t        rdIdx;  /* read index; presumably into the async errData[] array - TODO confirm */
+        uint64_t        wrIdx;  /* write index; presumably into the async errData[] array - TODO confirm */
+};
+#define NAME_SIZE 64    /* size in bytes of async_bridgeErr.br_name */
+struct __attribute__((__packed__)) async_bridgeErr {    /* member of the errData union; by name, bridge error details */
+        char            br_name[NAME_SIZE];     /* NOTE(review): NUL termination is not guaranteed by this header - confirm with producer */
+        unsigned int    err_addr;
+        unsigned int    err_status1;
+        unsigned int    err_status2;
+        unsigned int    rw;
+        unsigned int    err_type;
+        unsigned int    length;
+        unsigned int    br_id;
+        unsigned int    src_id;
+        unsigned int    axi_id;
+        unsigned int    count;
+        unsigned int    protection;
+        unsigned int    burst;
+        unsigned int    cache;
+};      /* packed: 64 + 13 * 4 = 116 bytes where unsigned int is 32 bits wide */
+struct __attribute__((__packed__)) async_smmuErr {      /* member of the errData union; by name, SMMU fault details */
+        unsigned int    stream_id;
+        unsigned int    cb_id;          /* presumably the SMMU context bank id - TODO confirm */
+        unsigned int    fsynr0;         /* fsynr0/fsynr1/far/fsr: presumably raw SMMU fault register values - TODO confirm */
+        unsigned int    fsynr1;
+        uint64_t        far;
+        unsigned int    fsr;
+};
+struct __attribute__((__packed__))  async_mcErr {       /* member of the errData union; by name, memory controller error details */
+        uint64_t        ch_base;
+        unsigned int    int_status;
+        unsigned int    err_status;
+        uint64_t        fault_addr;
+        unsigned int    vcpuid;         //0xffffU; /* IDLE_vCPU_ID */
+        unsigned int    client_id;
+        int32_t         peripheral_id;  /* the only signed field in this record */
+};
+struct __attribute__((__packed__)) sync_dataAbort {     /* member of the errData union; by name, data abort details */
+        bool            isFilled;       //metadata field per VCpu
+        bool            isWrite;
+        uint8_t         accessSize;     /* unit (bytes vs. encoded size) is not shown here - TODO confirm */
+        unsigned int    offendingVCpuId;
+        unsigned int    esrEl2;         /* esrEl2/spsrEl2/elrEl1: by name, captured system register values - TODO confirm */
+        uint64_t        faultAddr;
+        uint64_t        spsrEl2;
+        uint64_t        elrEl1;
+        uint64_t        gprArray[31];   /* 31 entries; presumably x0..x30 - TODO confirm */
+};
+struct __attribute__((__packed__)) errData {    /* one error record; element type of errInfo.errData[] */
+        unsigned int    offendingGuestId;
+        enum errType    errType;        /* NOTE(review): enum width is compiler/ABI dependent; both sides of the shared memory must agree */
+        enum errReason  errReason;
+        union {         /* anonymous union: sized by its largest member; presumably errReason selects the valid member - TODO confirm */
+                // *A*synchronous
+                struct async_bridgeErr  async_bridgeErr;
+                struct async_smmuErr    async_smmuErr;
+                struct async_mcErr      async_mcErr;
+                // Synchronous
+                struct sync_dataAbort   sync_dataAbort;
+        };
+};
+/* VM shared memory for error information is allocated contiguously to store
+ * Asynchronous(async) error information followed by the Synchronous(sync)
+ * error information. HV has write access and the VM has read access to this
+ * shared memory. The shared memory layout looks like:
+ *
+ * |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
+ *
+ * Size of async errors array = Max errors + 1(to avoid same empty and full
+ * conditions of the buffer)
+ * Size of sync errors array = 1 error per VCPU * number of VCPUs on a VM
+ *
+ * So for a given VM, shared memory has:
+ *
+ * |--------ASyncErrInfo----------------|-------SyncErrInfo-------------------|
+ * |--------1bufferPerVM----------------|---VCpu0-buffer---|--VCpuN-buffer----|
+ * |---metaData----|---errData----------|-metaData+errData-|-metaData+errData-|
+ * |-rdIdx-|-wrIdx-|-Err1-|-Err2-|-ErrN-|-isFilled-|-Err1--|-isFilled-|-Err1--|
+ */
+struct __attribute__((__packed__)) errInfo {    /* overlay for the start of the shared memory described above */
+        struct async_metaData   async_metaData; /* rdIdx/wrIdx header */
+        struct errData          errData[];      /* flexible array member: records start right after the header; element count is not stored here */
+};
+#endif
diff --git a/include/linux/vm_err.h b/include/linux/vm_err.h
new file mode 100644
index 000000000..e8fcae8b6
--- /dev/null
+++ b/include/linux/vm_err.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2019 NVIDIA CORPORATION.  All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+#ifndef __VM_ERR_H_
+#define __VM_ERR_H_
+#if IS_ENABLED(CONFIG_TEGRA_VM_ERR_HANDLER)
+#include <linux/errinfo.h>
+struct vm_err_handlers {        /* callback table passed to tegra_hv_register_vm_err_hooks() */
+        /* return true, if error needs kernel to enter bad mode and reboot.
+         * return false, if error doesn't need reboot.
+         */
+        bool (*fn_self_async)(const struct errData *const err_data);    /* by name: async error attributed to this guest - TODO confirm */
+        bool (*fn_self_sync)(const struct errData *const err_data);     /* by name: sync error attributed to this guest - TODO confirm */
+        bool (*fn_peer)(const struct errData *const err_data);          /* by name: error attributed to another guest - TODO confirm */
+};
+struct tegra_hv_config {        /* output parameter type of tegra_hv_get_config() */
+        unsigned int guest_id_self;     /* by name: id of the guest this kernel runs in - TODO confirm */
+        unsigned int num_guests;        /* by name: number of guests known to HV - TODO confirm */
+};
+static const char * const fault_reason_desc[] = {       /* printable names, one per enum errReason value below REASON_ENUM_SIZE */
+        "Undefined",                    /* NOTE(review): static table in a header - every including file gets its own copy */
+        "SMMU CB",
+        "SMMU Global",
+        "Bridge",
+        "Memory Controller",
+        "Instruction Abort",
+        "Data Abort",
+        "Other synchronous exception",
+};      /* 8 entries: an errReason read from shared memory must be range-checked before indexing */
+int tegra_hv_register_vm_err_hooks(struct vm_err_handlers *custom_handlers);    /* implemented outside this header; the disabled-config stub returns -EINVAL */
+void tegra_hv_get_config(struct tegra_hv_config *config);       /* by name: fills *config - TODO confirm; NOTE(review): has no stub in the #else branch */
+#else
+/* Stub used when CONFIG_TEGRA_VM_ERR_HANDLER is disabled: nothing can be
+ * registered, so log an error and return -EINVAL.
+ *
+ * The forward declaration gives struct vm_err_handlers file scope. Without
+ * it the type would first be declared inside the parameter list below (the
+ * full definition only exists in the enabled branch) and would be
+ * incompatible with the type any caller passes.
+ */
+struct vm_err_handlers;
+static inline int tegra_hv_register_vm_err_hooks(
+        struct vm_err_handlers *custom_handlers)
+{
+        /* printk messages must end in a newline to be emitted as one complete line */
+        pr_err("Can you please enable CONFIG_TEGRA_VM_ERR_HANDLER?\n");
+        return -EINVAL;
+}
+#endif
+#endif  /* __VM_ERR_H_ */
author	Yashomati <ygodbole@nvidia.com>	2019-05-31 21:59:52 -0400
committer	mobile promotions <svcmobile_promotions@nvidia.com>	2019-12-24 14:56:43 -0500
commit	87dc30edda5936afa82b0afa821c8be2e44343c5 (patch)
tree	e1f61e27e96e88880626426db82dbe21c85e6053 /include/linux
parent	cda3f78dc40d0f21b1108a4087b6198fb53bde02 (diff)

diff --git a/include/linux/errinfo.h b/include/linux/errinfo.h new file mode 100644 index 000000000..eca3a9bcb --- /dev/null +++ b/include/linux/errinfo.h
@@ -0,0 +1,124 @@
	1	/*
	2	* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
	3	*
	4	* NVIDIA CORPORATION and its licensors retain all intellectual property
	5	* and proprietary rights in and to this software, related documentation
	6	* and any modifications thereto. Any use, reproduction, disclosure or
	7	* distribution of this software and related documentation without an express
	8	* license agreement from NVIDIA CORPORATION is strictly prohibited.
	9	*/
	10
	11	#ifndef __INCLUDED_ERRINFO_H__
	12	#define __INCLUDED_ERRINFO_H__
	13
	14	enum errReason {
	15	REASON_UNDEFINED = 0UL,
	16	REASON_ASYNC_SMMU_CB,
	17	REASON_ASYNC_SMMU_GLOBAL,
	18	REASON_ASYNC_BRIDGE,
	19	REASON_ASYNC_MC,
	20	REASON_SYNC_INSTR_ABORT,
	21	REASON_SYNC_DATA_ABORT,
	22	REASON_SYNC_OTHER,
	23	REASON_ENUM_SIZE
	24	};
	25
	26	enum errType {
	27	SYNC = 0UL,
	28	ASYNC
	29	};
	30
	31	struct __attribute__((__packed__)) async_metaData {
	32	uint64_t rdIdx;
	33	uint64_t wrIdx;
	34	};
	35
	36	#define NAME_SIZE 64
	37
	38	struct __attribute__((__packed__)) async_bridgeErr {
	39	char br_name[NAME_SIZE];
	40	unsigned int err_addr;
	41	unsigned int err_status1;
	42	unsigned int err_status2;
	43	unsigned int rw;
	44	unsigned int err_type;
	45	unsigned int length;
	46	unsigned int br_id;
	47	unsigned int src_id;
	48	unsigned int axi_id;
	49	unsigned int count;
	50	unsigned int protection;
	51	unsigned int burst;
	52	unsigned int cache;
	53	};
	54
	55	struct __attribute__((__packed__)) async_smmuErr {
	56	unsigned int stream_id;
	57	unsigned int cb_id;
	58	unsigned int fsynr0;
	59	unsigned int fsynr1;
	60	uint64_t far;
	61	unsigned int fsr;
	62	};
	63
	64	struct __attribute__((__packed__)) async_mcErr {
	65	uint64_t ch_base;
	66	unsigned int int_status;
	67	unsigned int err_status;
	68	uint64_t fault_addr;
	69	unsigned int vcpuid; //0xffffU; /* IDLE_vCPU_ID */
	70	unsigned int client_id;
	71	int32_t peripheral_id;
	72	};
	73
	74	struct __attribute__((__packed__)) sync_dataAbort {
	75	bool isFilled; //metadata field per VCpu
	76	bool isWrite;
	77	uint8_t accessSize;
	78	unsigned int offendingVCpuId;
	79	unsigned int esrEl2;
	80	uint64_t faultAddr;
	81	uint64_t spsrEl2;
	82	uint64_t elrEl1;
	83	uint64_t gprArray[31];
	84	};
	85
	86	struct __attribute__((__packed__)) errData {
	87	unsigned int offendingGuestId;
	88	enum errType errType;
	89	enum errReason errReason;
	90	union {
	91	// *A*synchronous
	92	struct async_bridgeErr async_bridgeErr;
	93	struct async_smmuErr async_smmuErr;
	94	struct async_mcErr async_mcErr;
	95	// Synchronous
	96	struct sync_dataAbort sync_dataAbort;
	97	};
	98	};
	99
	100	/* VM shared memory for error information is allocated contiguously to store
	101	* Asynchronous(async) error information followed by the Synchronous(sync)
	102	* error information. HV has write access and the VM has read access to this
	103	* shared memory. The shared memory layout looks like:
	104	*
	105	* |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
	106	*
	107	* Size of async errors array = Max errors + 1(to avoid same empty and full
	108	* conditions of the buffer)
	109	* Size of sync errors array = 1 error per VCPU * number of VCPUs on a VM
	110	*
	111	* So for a given VM, shared memory has:
	112	*
	113	* |--------ASyncErrInfo----------------|-------SyncErrInfo-------------------|
	114	* |--------1bufferPerVM----------------|---VCpu0-buffer---|--VCpuN-buffer----|
	115	* |---metaData----|---errData----------|-metaData+errData-|-metaData+errData-|
	116	* |-rdIdx-|-wrIdx-|-Err1-|-Err2-|-ErrN-|-isFilled-|-Err1--|-isFilled-|-Err1--|
	117	*/
	118
	119	struct __attribute__((__packed__)) errInfo {
	120	struct async_metaData async_metaData;
	121	struct errData errData[];
	122	};
	123
	124	#endif


diff --git a/include/linux/vm_err.h b/include/linux/vm_err.h new file mode 100644 index 000000000..e8fcae8b6 --- /dev/null +++ b/include/linux/vm_err.h
@@ -0,0 +1,58 @@
	1	/*
	2	* Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved.
	3	*
	4	* This software is licensed under the terms of the GNU General Public
	5	* License version 2, as published by the Free Software Foundation, and
	6	* may be copied, distributed, and modified under those terms.
	7	*
	8	* This program is distributed in the hope that it will be useful,
	9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
	10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	11	* GNU General Public License for more details.
	12	*
	13	*/
	14
	15	#ifndef __VM_ERR_H_
	16	#define __VM_ERR_H_
	17
	18	#if IS_ENABLED(CONFIG_TEGRA_VM_ERR_HANDLER)
	19	#include <linux/errinfo.h>
	20
	21	struct vm_err_handlers {
	22	/* return true, if error needs kernel to enter bad mode and reboot.
	23	* return false, if error doesn't need reboot.
	24	*/
	25	bool (*fn_self_async)(const struct errData *const err_data);
	26	bool (*fn_self_sync)(const struct errData *const err_data);
	27	bool (*fn_peer)(const struct errData *const err_data);
	28	};
	29
	30	struct tegra_hv_config {
	31	unsigned int guest_id_self;
	32	unsigned int num_guests;
	33	};
	34
	35	static const char * const fault_reason_desc[] = {
	36	"Undefined",
	37	"SMMU CB",
	38	"SMMU Global",
	39	"Bridge",
	40	"Memory Controller",
	41	"Instruction Abort",
	42	"Data Abort",
	43	"Other synchronous exception",
	44	};
	45
	46	int tegra_hv_register_vm_err_hooks(struct vm_err_handlers *custom_handlers);
	47	void tegra_hv_get_config(struct tegra_hv_config *config);
	48
	49	#else
	50	static inline int tegra_hv_register_vm_err_hooks(
	51	struct vm_err_handlers *custom_handlers)
	52	{
	53	pr_err("Can you please enable CONFIG_TEGRA_VM_ERR_HANDLER?");
	54	return -EINVAL;
	55	}
	56	#endif
	57
	58	#endif /* __VM_ERR_H_ */