summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
authorYashomati <ygodbole@nvidia.com>2019-05-31 21:59:52 -0400
committermobile promotions <svcmobile_promotions@nvidia.com>2019-12-24 14:56:43 -0500
commit87dc30edda5936afa82b0afa821c8be2e44343c5 (patch)
treee1f61e27e96e88880626426db82dbe21c85e6053 /drivers
parentcda3f78dc40d0f21b1108a4087b6198fb53bde02 (diff)
inject-vm-err: handlers for injected errors
If Linux/EBP causes an error that HV can't handle, then instead of freezing the guest, HV injects the error back into the guest. This enables the guest to handle the error as gracefully as it can/needs. This changeset provides 2 parts: 1. sample handlers: minimal placeholder handlers that just dump the error information on to the console. This is to be used as a reference for any customized elaborate error handling that may be needed. 2. library module: it comes into existence only if/when any error handler is registered. Its main responsibilities: - map memory that's shared with HV where HV dumps all information about the errors. - register handlers for interrupts used by HV to inject errors - invoke custom error handlers when HV injects error JIRA ESV-312 Bug 2580803 Change-Id: Ia8c6484d423fd33cabbfd901f0f6ebb0da95cb40 Signed-off-by: Yashomati <ygodbole@nvidia.com> Reviewed-on: https://git-master.nvidia.com/r/2214402 Reviewed-on: https://git-master.nvidia.com/r/2128765 GVS: Gerrit_Virtual_Submit Reviewed-by: Dmitry Pervushin <dpervushin@nvidia.com> Reviewed-by: Hardik T Shah <hardikts@nvidia.com> Reviewed-by: Rohit Upadhyay <rupadhyay@nvidia.com> Reviewed-by: mobile promotions <svcmobile_promotions@nvidia.com> Tested-by: mobile promotions <svcmobile_promotions@nvidia.com>
Diffstat (limited to 'drivers')
-rw-r--r--drivers/virt/tegra/Kconfig9
-rw-r--r--drivers/virt/tegra/Makefile3
-rw-r--r--drivers/virt/tegra/vm_err.c535
-rw-r--r--drivers/virt/tegra/vm_err_sample_handler.c315
4 files changed, 862 insertions, 0 deletions
diff --git a/drivers/virt/tegra/Kconfig b/drivers/virt/tegra/Kconfig
index a29ffe03f..222ea3eb9 100644
--- a/drivers/virt/tegra/Kconfig
+++ b/drivers/virt/tegra/Kconfig
@@ -50,3 +50,12 @@ config TEGRA_HV_SYSFS
50 Can be made a module (=m) to save boot time 50 Can be made a module (=m) to save boot time
51 If unsure, say Y here 51 If unsure, say Y here
52 52
53config TEGRA_VM_ERR_HANDLER
54 tristate "Nvidia Tegra handler for VM error notifications"
55 depends on TEGRA_VIRTUALIZATION
56 default y
57 help
58 Provides a handler that receives VM error notifications
59 from the Hypervisor.
60 If unsure, keep Y
61
diff --git a/drivers/virt/tegra/Makefile b/drivers/virt/tegra/Makefile
index c03414695..c36d2c8b0 100644
--- a/drivers/virt/tegra/Makefile
+++ b/drivers/virt/tegra/Makefile
@@ -8,3 +8,6 @@ obj-$(CONFIG_TEGRA_HV_MANAGER) += tegra_hv.o ivc-cdev.o
8obj-$(CONFIG_TEGRA_HV_MANAGER) += userspace_ivc_mempool.o 8obj-$(CONFIG_TEGRA_HV_MANAGER) += userspace_ivc_mempool.o
9obj-$(CONFIG_TEGRA_HV_SYSFS) += hvc_sysfs.o 9obj-$(CONFIG_TEGRA_HV_SYSFS) += hvc_sysfs.o
10obj-$(CONFIG_TEGRA_HV_WDT_HANDLER) += tegra_hv_wdt_handler.o 10obj-$(CONFIG_TEGRA_HV_WDT_HANDLER) += tegra_hv_wdt_handler.o
11
12obj-$(CONFIG_TEGRA_VM_ERR_HANDLER) += vm_err.o
13obj-$(CONFIG_TEGRA_VM_ERR_HANDLER) += vm_err_sample_handler.o
diff --git a/drivers/virt/tegra/vm_err.c b/drivers/virt/tegra/vm_err.c
new file mode 100644
index 000000000..d9f11248c
--- /dev/null
+++ b/drivers/virt/tegra/vm_err.c
@@ -0,0 +1,535 @@
1/*
2 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#define pr_fmt(fmt) "vm-err: " fmt
14
15#include <linux/interrupt.h>
16#include <linux/of_irq.h>
17#include <linux/platform_device.h>
18#include <linux/vm_err.h>
19#include <asm/traps.h>
20#include <asm-generic/irq_regs.h>
21#include <asm/system_misc.h>
22#include <soc/tegra/virt/syscalls.h>
23#include <soc/tegra/chip-id.h>
24
/* Per-device state for the VM error-notification library. */
struct tegra_hv_err_ctrl {
	struct device *dev;			/* bound platform device */
	struct errInfo *err_info;		/* mapped HV-shared error memory */
	unsigned int async_err_arr_items;	/* slots in the async error ring */
	int hv_peer_err_irq_id;			/* HV-provided IRQ id; -1 if none */
	unsigned int vcpu_cnt;			/* VCPUs in this VM (one sync slot each) */
	struct serr_hook hook;			/* SError hook for synchronous errors */
	struct vm_err_handlers handlers;	/* registered custom error handlers */
};
34
/* Own guest id and guest count, filled in by hyp_config_init(). */
static struct tegra_hv_config config;

/* Cells of the dynamically added "interrupts" DT property:
 * <irq-type irq-number trigger-flags>.
 */
static unsigned int intr_info[3]; /* intr_property_size = 3 */

/* Property object added to the device node by virq_handler_init(). */
static struct property interrupts_prop = {
	.name = "interrupts",
};
42
/* Inspect this VCPU's synchronous-error slot in HV-shared memory.
 *
 * On return, *send_sync_err_ack tells the caller whether an ack must be
 * sent to the hypervisor.  Returns true when the guest should enter bad
 * mode, false when execution may continue.
 */
static bool check_sync_err(const unsigned int vcpu_id,
		const struct tegra_hv_err_ctrl *const ctrl,
		bool *send_sync_err_ack)
{
	uint64_t rd_idx;
	const struct errData *err_data;

	if (vcpu_id >= ctrl->vcpu_cnt) {
		dev_crit(ctrl->dev, "%s: Invalid vcpu id %u\n", __func__,
			vcpu_id);
		*send_sync_err_ack = false;
		/* Unexpected vcpu id. Enter bad mode. */
		return true;
	}

	/* Shared memory layout is:
	 * |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
	 * Size of async errors array = Max errors + 1(to avoid same empty
	 * and full conditions of the buffer)
	 * Size of sync errors array = 1 error per VCPU * number of VCPUs in VM
	 */
	rd_idx = ctrl->async_err_arr_items + vcpu_id;
	/* It's already validated at init time that sufficient memory is
	 * allocated to hold async_err_arr_items + sync error per vcpu. Hence,
	 * after validating the vcpu_id above, no need to validate rd_idx here.
	 */
	err_data = &(ctrl->err_info->errData[rd_idx]);
	if (!err_data->sync_dataAbort.isFilled) {
		*send_sync_err_ack = false;
		dev_info(ctrl->dev, "No synchronous error data on vcpu %u\n",
			vcpu_id);
		/* No sync error. No need to enter bad mode. */
		return false;
	}

	if (err_data->errType != SYNC) {
		dev_crit(ctrl->dev, "%s: unexpected error Type %d\n",
			__func__, err_data->errType);
		*send_sync_err_ack = true;
		/* Unexpected error id. Enter bad mode. */
		return true;
	}

	if (err_data->offendingGuestId != config.guest_id_self) {
		dev_crit(ctrl->dev, "%s: invalid offender id %u\n", __func__,
			err_data->offendingGuestId);
		*send_sync_err_ack = true;
		/* Invalid id of offending guest. Enter bad mode. */
		return true;
	}
	dev_err(ctrl->dev, "Synchronous error on vcpu %u\n", vcpu_id);

	if (ctrl->handlers.fn_self_sync) {
		*send_sync_err_ack = true;
		/* Enter bad_mode (or otherwise) as custom handler dictates */
		return ctrl->handlers.fn_self_sync(err_data);
	}

	/* NOTE(review): this point IS reachable — it is taken whenever no
	 * fn_self_sync handler has been registered. Ack and enter bad mode.
	 */
	*send_sync_err_ack = true;
	/* Reaching here is unexpected. Enter bad mode. */
	return true;
}
106
/* IRQ handler for HV-injected asynchronous errors.
 *
 * Drains the async error ring in shared memory (rdIdx..wrIdx), dispatches
 * each entry to the self/peer handler based on the offending guest id,
 * acks the batch to HV, and enters bad mode if any handler (or a failed
 * ack) demands it.  Only VCPU0 is expected to receive this vIRQ.
 */
static irqreturn_t async_err_handler(int irq, void *context)
{
	unsigned int num_async_errs_read = 0;
	bool enter_bad_mode = false;
	const struct tegra_hv_err_ctrl *const ctrl = context;
	const unsigned int vcpu_id = hyp_read_vcpu_id();
	uint64_t local_rd_idx, next_rd_idx;
	const struct errData *err_data;
	bool (*fn_self_async)(const struct errData *const err_data);
	bool (*fn_peer)(const struct errData *const err_data);
	bool (*handler)(const struct errData *const err_data);
	struct pt_regs *regs;

	if (vcpu_id != 0) {
		dev_err(ctrl->dev, "Asynchronous error on vcpu %u\n", vcpu_id);
		/* Only VCPU0 is expected to receive async error vIRQ */
		return IRQ_HANDLED;
	}

	/* Snapshot the handlers once; they may be registered lazily. */
	fn_self_async = ctrl->handlers.fn_self_async;
	fn_peer = ctrl->handlers.fn_peer;

	if ((fn_self_async == NULL) && (fn_peer == NULL)) {
		dev_err(ctrl->dev, "Asynchronous error handlers absent\n");
		return IRQ_HANDLED;
	}

	local_rd_idx = ctrl->err_info->async_metaData.rdIdx;
	dev_dbg(ctrl->dev, "Local Rd Idx = %llu, shared Wr Idx = %llu\n",
		local_rd_idx, ctrl->err_info->async_metaData.wrIdx);

	/* Check async error. Read until error queue gets empty */
	while (local_rd_idx != ctrl->err_info->async_metaData.wrIdx) {
		/* Ring advance: one slot past rd, wrapping at array size */
		next_rd_idx = (local_rd_idx + 1) % ctrl->async_err_arr_items;

		err_data = &(ctrl->err_info->errData[next_rd_idx]);
		if (err_data->offendingGuestId == config.guest_id_self)
			handler = fn_self_async;
		else
			handler = fn_peer;

		if (handler) {
			if (handler(err_data) == true)
				enter_bad_mode = true;
		}

		local_rd_idx = next_rd_idx;
		num_async_errs_read++;
		dev_dbg(ctrl->dev, "Local Rd Idx = %llu\n", local_rd_idx);
	}

	if (num_async_errs_read) {
		dev_err(ctrl->dev, "%u asynchronous error(s) read\n",
			num_async_errs_read);

		/* Send ack for async error(s) to HV */
		if (hyp_send_async_err_ack(local_rd_idx) != 0) {
			dev_crit(ctrl->dev,
				"%s: Sending ack failed. Setting bad mode\n",
				__func__);
			/* Unexpected */
			enter_bad_mode = true;
		}
	}

	if (enter_bad_mode) {
		regs = get_irq_regs();
		die("Oops - bad mode", regs, 0);
		panic("bad mode");
	}

	return IRQ_HANDLED;
}
180
181static int sync_err_handler(struct pt_regs *regs, int reason,
182 uint32_t esr, void *context)
183{
184 bool enter_bad_mode = false;
185 bool send_sync_err_ack = false;
186 const struct tegra_hv_err_ctrl *const ctrl = context;
187 const unsigned int vcpu_id = hyp_read_vcpu_id();
188
189 /* Check sync error */
190 if (check_sync_err(vcpu_id, ctrl, &send_sync_err_ack) == true)
191 enter_bad_mode = true;
192
193 /* Send ack for error to HV. */
194 if (send_sync_err_ack) {
195 if (hyp_send_sync_err_ack(send_sync_err_ack) != 0) {
196 dev_crit(ctrl->dev,
197 "%s: Sending ack failed. Setting bad mode\n",
198 __func__);
199 /* Unexpected */
200 enter_bad_mode = true;
201 }
202 }
203
204 /* Caller expects 0 to enter bad mode */
205 return (!enter_bad_mode);
206}
207
208void tegra_hv_get_config(struct tegra_hv_config *cfg)
209{
210 cfg->guest_id_self = config.guest_id_self;
211 cfg->num_guests = config.num_guests;
212}
213EXPORT_SYMBOL(tegra_hv_get_config);
214
215static int virq_handler_init(const struct platform_device *pdev)
216{
217 int ret;
218 struct irq_data *peer_err_irq_data;
219 int lin_peer_err_irq_id;
220 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
221 struct device dev = pdev->dev;
222
223 dev_info(ctrl->dev, "Error notification HV IRQ id: %d\n",
224 ctrl->hv_peer_err_irq_id);
225
226 /* Ensure HV returned valid irq */
227 if (ctrl->hv_peer_err_irq_id == -1)
228 return 0;
229
230 /* Set indicate irq type 0 to indicate Shared Peripheral Irq */
231 intr_info[0] = cpu_to_be32(0);
232 /* Id in SPI namespace - subtract number of PPIs
233 * (Private Peripheral Irqs) which is = 32
234 */
235 intr_info[1] = cpu_to_be32(ctrl->hv_peer_err_irq_id - 32);
236 /* Trigger irq on low-to-high edge (0x1) */
237 intr_info[2] = cpu_to_be32(IRQF_TRIGGER_RISING);
238
239 interrupts_prop.length = sizeof(intr_info);
240 dev_info(ctrl->dev, "interrupts_prop.length %u\n",
241 interrupts_prop.length);
242
243 interrupts_prop.value = intr_info;
244
245 if (of_add_property(dev.of_node, &interrupts_prop)) {
246 dev_err(ctrl->dev, "%s: failed to add interrupts property\n",
247 __func__);
248 return -EACCES;
249 }
250
251 lin_peer_err_irq_id = of_irq_get(dev.of_node, 0);
252 if (lin_peer_err_irq_id < 0) {
253 dev_err(ctrl->dev, "%s: Unable to get Linux irq for id %d\n",
254 __func__, ctrl->hv_peer_err_irq_id);
255 return lin_peer_err_irq_id;
256 }
257
258 peer_err_irq_data = irq_get_irq_data(lin_peer_err_irq_id);
259 if (peer_err_irq_data == NULL) {
260 dev_err(ctrl->dev, "%s: Failed to get data for Linux irq %d\n",
261 __func__, lin_peer_err_irq_id);
262 return -ENODEV;
263 }
264
265 ret = devm_request_irq(&dev, lin_peer_err_irq_id, async_err_handler,
266 IRQ_NOTHREAD, dev_name(&dev), ctrl);
267 if (ret < 0) {
268 dev_err(ctrl->dev,
269 "%s: failed to register IRQ %d, Err %d, %s\n",
270 __func__, lin_peer_err_irq_id, ret, pdev->name);
271 return ret;
272 }
273 dev_info(ctrl->dev, "Registered Linux IRQ %d for peer notification\n",
274 lin_peer_err_irq_id);
275
276 return 0;
277}
278
279static int serr_handler_init(struct platform_device *pdev)
280{
281 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
282
283 ctrl->hook.fn = sync_err_handler;
284 ctrl->hook.priv = platform_get_drvdata(pdev);
285 register_serr_hook(&ctrl->hook);
286
287 return 0;
288}
289
/* Query HV for the error-info shared memory parameters, validate its
 * size against the declared ring/VCPU layout, and ioremap it into
 * ctrl->err_info.  Returns 0 on success or a negative error code.
 */
static int shared_mem_map(struct platform_device *pdev)
{
	uint64_t ipa, buff_size, required_size;
	int ret;
	struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);

	/* Get error info details */
	ret = hyp_read_err_info_get(&ipa, &buff_size,
		&ctrl->async_err_arr_items, &ctrl->hv_peer_err_irq_id,
		&ctrl->vcpu_cnt);
	if (ret != 0) {
		/* It could come here if DTS and defconfig enable execution
		 * of this code, but HV hasn't implemented the hypercall.
		 * Flag error.
		 */
		dev_err(ctrl->dev,
			"%s: failed to get err memory address. Err %d\n",
			__func__, ret);
		return -ENODEV;
	}

	if ((ipa == 0) || (buff_size == 0) ||
		(ctrl->async_err_arr_items == 0)) {
		/* It could come here if DTS and defconfig enable execution
		 * of this code, but PCT hasn't enabled error injection.
		 * A warning should suffice.
		 */
		dev_warn(ctrl->dev, "%s: invalid shared memory parameters\n",
			__func__);
		return -ENOMEM;
	}

	/* Shared memory layout is:
	 * |--async-err-metadata--|--async-errors-array-|--sync-errors-array-|
	 * Size of async errors array = Max errors + 1 (to avoid same empty and
	 * full conditions of the buffer)
	 * Size of sync errors array = 1 error per VCPU * number of VCPUs on
	 * a VM
	 */
	required_size = sizeof(struct async_metaData) +
		(sizeof(struct errData) *
		(ctrl->async_err_arr_items + ctrl->vcpu_cnt));
	if (buff_size < required_size) {
		dev_err(ctrl->dev,
			"%s:invalid params. size %llu. required size %llu\n",
			__func__, buff_size, required_size);
		dev_err(ctrl->dev, "%s: async arr size %u. vcpus %u\n",
			__func__, ctrl->async_err_arr_items, ctrl->vcpu_cnt);
		return -ENOMEM;
	}

	dev_info(ctrl->dev, "%s: Err info IPA for guest %u: 0x%llx\n",
		__func__, config.guest_id_self, ipa);
	dev_info(ctrl->dev, "Err info buf size 0x%llX\n", buff_size);
	dev_info(ctrl->dev, "Async err arr size %u. Number of VCPUs %u\n",
		ctrl->async_err_arr_items, ctrl->vcpu_cnt);

	/* Map shared memory (cacheable; HV writes, guest reads) */
	ctrl->err_info = (struct errInfo *) ioremap_cache(ipa, buff_size);
	if (ctrl->err_info == NULL)
		return -ENOMEM;

	return 0;
}
354
355static int hyp_config_init(struct device *dev)
356{
357 int ret = hyp_read_gid(&config.guest_id_self);
358
359 if (ret != 0) {
360 dev_err(dev, "%s: failed to read guest id. Err %d\n",
361 __func__, ret);
362 return ret;
363 }
364
365 ret = hyp_read_nguests(&config.num_guests);
366 if (ret != 0) {
367 /* Only privileged guest can query number of guests */
368 dev_warn(dev, "%s: can't read number of guests. Err %d\n",
369 __func__, ret);
370 }
371
372 dev_info(dev, "%s: guest id %u num guests %u\n", __func__,
373 config.guest_id_self, config.num_guests);
374
375 return 0;
376}
377
378static void shared_structs_check(struct device *dev)
379{
380 /* Ensure coherency with common header */
381 BUILD_BUG_ON(REASON_ENUM_SIZE != (ARRAY_SIZE(fault_reason_desc)));
382
383 /* Manually compare these sizes with HV console dump to ensure
384 * common structures shared by HV and Linux are in sync
385 */
386 dev_info(dev, "async_metaData size 0x%lx\n",
387 sizeof(struct async_metaData));
388 dev_info(dev, "async_bridgeErr size 0x%lx\n",
389 sizeof(struct async_bridgeErr));
390 dev_info(dev, "async_smmuErr size 0x%lx\n",
391 sizeof(struct async_smmuErr));
392 dev_info(dev, "async_mcErr size 0x%lx\n",
393 sizeof(struct async_mcErr));
394 dev_info(dev, "sync_dataAbort size 0x%lx\n",
395 sizeof(struct sync_dataAbort));
396 dev_info(dev, "errData size 0x%lx\n", sizeof(struct errData));
397}
398
399static int vm_err_handler_init(struct platform_device *pdev)
400{
401 int ret;
402 struct tegra_hv_err_ctrl *ctrl;
403 struct device *dev = &pdev->dev;
404
405 if (!is_tegra_hypervisor_mode()) {
406 dev_err(dev, "%s: hypervisor is not present\n", __func__);
407 return -ENODEV;
408 }
409
410 shared_structs_check(dev);
411
412 ctrl = devm_kzalloc(dev, sizeof(*ctrl), GFP_KERNEL);
413 if (!ctrl)
414 return -ENOMEM;
415
416 ctrl->dev = dev;
417 platform_set_drvdata(pdev, ctrl);
418
419 ret = hyp_config_init(dev);
420 if (ret)
421 return ret;
422
423 ret = shared_mem_map(pdev);
424 if (ret)
425 return -ENOMEM;
426
427 ret = serr_handler_init(pdev);
428 if (ret)
429 return ret;
430
431 ret = virq_handler_init(pdev);
432 if (ret)
433 return ret;
434
435 return 0;
436}
437
438static int vm_err_handler_remove(struct platform_device *pdev)
439{
440 struct tegra_hv_err_ctrl *ctrl = platform_get_drvdata(pdev);
441 struct device_node *node = pdev->dev.of_node;
442
443 if (of_remove_property(node,
444 of_find_property(node, "interrupts", NULL))) {
445 dev_err(ctrl->dev, "%s: failed to add interrupts property\n",
446 __func__);
447 return -EACCES;
448 }
449
450 unregister_serr_hook(&ctrl->hook);
451 iounmap(ctrl->err_info);
452
453 dev_info(ctrl->dev, "%s: cleaned up and unregistered handler\n",
454 __func__);
455
456 return 0;
457}
458
/* Device-tree match table: a "nvidia,tegra-hv-err" node binds this driver. */
static const struct of_device_id tegra_hv_err_match[] = {
	{ .compatible = "nvidia,tegra-hv-err", .data = NULL},
	{},
};

/* Registered lazily by tegra_hv_register_vm_err_hooks(), not at boot. */
static struct platform_driver tegra_hv_err_pdriver = {
	.driver = {
		.name = "tegra-hv-err-handler",
		.owner = THIS_MODULE,
		.of_match_table = of_match_ptr(tegra_hv_err_match),
	},
	.probe = vm_err_handler_init,
	.remove = vm_err_handler_remove,
};
473
474static int tegra_hv_register_hooks_for_device(struct device *dev,
475 void *handlers)
476{
477 struct tegra_hv_err_ctrl *ctrl;
478 const struct platform_device *pd = container_of(dev,
479 struct platform_device, dev);
480 const struct vm_err_handlers *_handlers =
481 (struct vm_err_handlers *) handlers;
482
483 ctrl = platform_get_drvdata(pd);
484 if (!ctrl) {
485 dev_err(dev, "%s: no platform data", __func__);
486 return 0;
487 }
488
489 if (ctrl->handlers.fn_self_async == NULL)
490 ctrl->handlers.fn_self_async = _handlers->fn_self_async;
491
492 if (ctrl->handlers.fn_self_sync == NULL)
493 ctrl->handlers.fn_self_sync = _handlers->fn_self_sync;
494
495 if (ctrl->handlers.fn_peer == NULL)
496 ctrl->handlers.fn_peer = _handlers->fn_peer;
497
498 return 0;
499}
500
501int tegra_hv_register_vm_err_hooks(struct vm_err_handlers *handlers)
502{
503 int ret;
504
505 if (!handlers) {
506 pr_err("%s: invalid error handlers\n", __func__);
507 return 1;
508 }
509
510 if (!handlers->fn_self_async && !handlers->fn_self_sync
511 && !handlers->fn_peer) {
512 platform_driver_unregister(&tegra_hv_err_pdriver);
513 return 0;
514 }
515
516 if (!tegra_hv_err_pdriver.driver.p) {
517 /* Not registered/bound yet */
518 ret = platform_driver_register(&tegra_hv_err_pdriver);
519 if (ret) {
520 pr_err("%s: failed to register driver. Err %d\n",
521 __func__, ret);
522 return ret;
523 }
524 }
525
526 ret = driver_for_each_device(&tegra_hv_err_pdriver.driver, NULL,
527 handlers, tegra_hv_register_hooks_for_device);
528 if (ret) {
529 pr_err("%s: failed to attach driver. Err %d\n", __func__, ret);
530 return ret;
531 }
532
533 return 0;
534}
535EXPORT_SYMBOL(tegra_hv_register_vm_err_hooks);
diff --git a/drivers/virt/tegra/vm_err_sample_handler.c b/drivers/virt/tegra/vm_err_sample_handler.c
new file mode 100644
index 000000000..fea81363e
--- /dev/null
+++ b/drivers/virt/tegra/vm_err_sample_handler.c
@@ -0,0 +1,315 @@
1/*
2 * Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms and conditions of the GNU General Public License,
6 * version 2, as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope it will be useful, but WITHOUT
9 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11 * more details.
12 */
13#define pr_fmt(fmt) "vm-err-sample-handler: " fmt
14
15#include <linux/module.h>
16#include <linux/vm_err.h>
17
/* Bridge error details:
 * Note: These are redefined here only to allow user friendly messages
 * describing the error.
 * This must match with "Timeout error" value in t18x_axi_errors[]
 * in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_ERROR_TIMEOUT = 18;

/* This must match with "CCPLEX" value in src_ids
 * in nvidia/drivers/platform/tegra/bridge_mca.c
 */
static const unsigned int BRIDGE_SRC_ID_CCPLEX = 1;

/* This must match with corresponding HV definition in pct.h */
static const unsigned int GUEST_UNASSIGNED = 18;

/* Hook table handed to the vm_err library, and the guest configuration
 * read back from it after registration.
 */
static struct vm_err_handlers handlers;
static struct tegra_hv_config config;
36
37static void print_bridge_error(const struct errData * const err_data)
38{
39 const struct async_bridgeErr * const br_err_data =
40 &err_data->async_bridgeErr;
41 unsigned int protection;
42
43 pr_crit("Bridge error details\n");
44 pr_crit("--------------------------------------\n");
45 pr_crit("Err count %d: %s FAULT ADDR 0x%x status1 0x%x status2 0x%x\n",
46 br_err_data->count, br_err_data->br_name, br_err_data->err_addr,
47 br_err_data->err_status1, br_err_data->err_status2);
48
49 pr_crit("\tDirection: %s\n", br_err_data->rw ? "READ" : "WRITE");
50 pr_crit("\tBridge ID: 0x%x\n", br_err_data->br_id);
51 pr_crit("\tError type: %u %s\n",
52 br_err_data->err_type,
53 (br_err_data->err_type == BRIDGE_ERROR_TIMEOUT) ?
54 "(Timeout)" : "");
55
56 pr_crit("\tLength: %d\n", br_err_data->length);
57 protection = br_err_data->protection;
58 pr_crit("\tProtection: 0x%x %s %s %s access\n", protection,
59 (protection & 0x4) ? "Instruction" : "Data",
60 (protection & 0x2) ? "Non-Secure" : "Secure",
61 (protection & 0x1) ? "Privileged" : "Unprivileged");
62
63 pr_crit("\tSource ID: 0x%x -- %s\n",
64 br_err_data->src_id,
65 (br_err_data->src_id == BRIDGE_SRC_ID_CCPLEX) ?
66 " (CCPLEX)" : "");
67
68 pr_crit("\tAXI_ID: 0x%x\n", br_err_data->axi_id);
69 pr_crit("\tCache: 0x%x\n", br_err_data->cache);
70 pr_crit("\tBurst: 0x%x\n", br_err_data->burst);
71 pr_crit("--------------------------------------\n");
72}
73
74static void print_smmu_error(const struct errData * const err_data,
75 const enum errReason reason)
76{
77 const struct async_smmuErr * const smmu_err_data =
78 &err_data->async_smmuErr;
79
80 pr_crit("SMMU error details\n");
81 pr_crit("--------------------------------------\n");
82 if (reason == REASON_ASYNC_SMMU_CB) {
83 pr_crit("SMMU Context Bank %u error. StreamID: %d\n",
84 smmu_err_data->cb_id, smmu_err_data->stream_id);
85 } else if (reason == REASON_ASYNC_SMMU_GLOBAL) {
86 pr_crit("Global SMMU fault. CB: %u. StreamID: %d\n",
87 smmu_err_data->cb_id, smmu_err_data->stream_id);
88 } else {
89 pr_crit("Unexpected fault reason %d\n", reason);
90 }
91 pr_crit("FSR: 0x%x; FAR: 0x%llx; FSYND0: 0x%x; FSYND1: 0x%x\n",
92 smmu_err_data->fsr, smmu_err_data->far,
93 smmu_err_data->fsynr0, smmu_err_data->fsynr1);
94 pr_crit("--------------------------------------\n");
95}
96
97static void print_mc_error(const struct errData * const err_data)
98{
99 const struct async_mcErr * const mc_err_data = &err_data->async_mcErr;
100
101 pr_crit("Memory Controller error details\n");
102 pr_crit("--------------------------------------\n");
103 pr_crit("mc_err: base: 0x%llx, int_status: 0x%08x; err_status: 0x%08x;"
104 " fault_addr: 0x%llx\n",
105 mc_err_data->ch_base, mc_err_data->int_status,
106 mc_err_data->err_status, mc_err_data->fault_addr);
107 pr_crit("vcpuid %u, client_id %u, peripheral_id %d\n",
108 mc_err_data->vcpuid, mc_err_data->client_id,
109 mc_err_data->peripheral_id);
110 pr_crit("--------------------------------------\n");
111}
112
113static void print_data_abort(const struct errData *const err_data)
114{
115 const struct sync_dataAbort * const data_abort =
116 &err_data->sync_dataAbort;
117
118 pr_crit("Data abort details\n");
119 pr_crit("--------------------------------------\n");
120 pr_crit("offending VCpu Id %u\n", data_abort->offendingVCpuId);
121 (data_abort->isWrite) ?
122 pr_crit("write access\n") : pr_crit("read access\n");
123 pr_crit("access size %u\n", data_abort->accessSize);
124 pr_crit("fault address: 0x%llx\n", data_abort->faultAddr);
125 pr_crit("esr: 0x%x\n", data_abort->esrEl2);
126 pr_crit("spsr_el2: 0x%llx\n", data_abort->spsrEl2);
127 pr_crit("elr_el1: 0x%llx\n", data_abort->elrEl1);
128 pr_crit("gprArray[0]: 0x%llx\n", data_abort->gprArray[0]);
129 pr_crit("gprArray[15]: 0x%llx\n", data_abort->gprArray[15]);
130 pr_crit("gprArray[30]: 0x%llx\n", data_abort->gprArray[30]);
131 pr_crit("--------------------------------------\n");
132}
133
134static bool handle_async_err_details(const struct errData * const err_data)
135{
136 bool enter_bad_mode;
137
138 if (err_data->errType != ASYNC) {
139 pr_crit("%s: incorrect error type: %d\n", __func__,
140 err_data->errType);
141 /* Unexpected error type. Enter bad mode. */
142 return true;
143 }
144
145 pr_info("%s: error reason: %s\n", __func__,
146 fault_reason_desc[err_data->errReason]);
147 switch (err_data->errReason) {
148 case REASON_ASYNC_BRIDGE:
149 print_bridge_error(err_data);
150 /* Bridge error may not be fatal */
151 enter_bad_mode = false;
152 break;
153
154 case REASON_ASYNC_SMMU_CB:
155 print_smmu_error(err_data, err_data->errReason);
156 /* SMMU context bank error may not be fatal */
157 enter_bad_mode = false;
158 break;
159
160 case REASON_ASYNC_SMMU_GLOBAL:
161 print_smmu_error(err_data, err_data->errReason);
162 /* Can't recover from global SMMU error. */
163 enter_bad_mode = true;
164 break;
165
166 case REASON_ASYNC_MC:
167 print_mc_error(err_data);
168 enter_bad_mode = false;
169 break;
170
171 default:
172 pr_crit("%s: unhandled error. Reason id %d\n", __func__,
173 err_data->errReason);
174 enter_bad_mode = true;
175 break;
176 }
177
178 return enter_bad_mode;
179}
180
181static bool handle_sync_err_details(const struct errData * const err_data)
182{
183 /* Currently only data abort error injection is supported */
184 if (err_data->errReason != REASON_SYNC_DATA_ABORT) {
185 pr_crit("%s: unexpected reason id %u\n", __func__,
186 err_data->errReason);
187 /* Invalid reason. Enter bad mode. */
188 return true;
189 }
190 pr_info("%s: error reason: %s\n", __func__,
191 fault_reason_desc[err_data->errReason]);
192 print_data_abort(err_data);
193
194 /* Recovery from sync error could be impossible. Enter bad mode. */
195 return true;
196}
197
/* Dump details of an error attributed to a peer guest (or to no guest
 * at all).  Returns true when the guest should enter bad mode: for an
 * invalid offender id, an out-of-range reason, or an unattributable
 * (GUEST_UNASSIGNED) error; peer-attributed errors otherwise return
 * false, since the offending guest — not this one — must react.
 */
static bool handle_peer_err_details(const struct errData * const err_data)
{
	bool enter_bad_mode;
	const unsigned int offender = err_data->offendingGuestId;

	if (offender >= config.num_guests) {
		if (offender != GUEST_UNASSIGNED) {
			pr_crit("%s: invalid offending peer guest id %u\n",
				__func__, offender);
			/* Unexpected. Cause reboot. */
			return true;
		}
		pr_crit("%s: HV can't attribute error to any guest\n",
			__func__);
	} else
		pr_crit("Peer error. Offending guest id = %u\n", offender);

	pr_crit("Error Type: %s\n", (err_data->errType == SYNC) ?
		"Synchronous" : "Asynchronous");

	/* Validate before indexing fault_reason_desc[] */
	if (err_data->errReason >= REASON_ENUM_SIZE) {
		pr_crit("%s: unexpected reason id %u\n", __func__,
			err_data->errReason);
		/* Unexpected. Cause reboot. */
		return true;
	}
	pr_crit("%s: error reason: %s\n", __func__,
		fault_reason_desc[err_data->errReason]);

	switch (err_data->errReason) {
	case REASON_ASYNC_BRIDGE:
		print_bridge_error(err_data);
		enter_bad_mode = false;
		break;

	case REASON_ASYNC_SMMU_CB:
	case REASON_ASYNC_SMMU_GLOBAL:
		print_smmu_error(err_data, err_data->errReason);
		enter_bad_mode = false;
		break;

	case REASON_ASYNC_MC:
		print_mc_error(err_data);
		enter_bad_mode = false;
		break;

	case REASON_SYNC_DATA_ABORT:
		print_data_abort(err_data);
		enter_bad_mode = false;
		break;

	default:
		pr_crit("%s: unhandled error. Reason id %d\n", __func__,
			err_data->errReason);
		enter_bad_mode = false;
		break;
	}

	/* An unattributable error may be ours: be conservative */
	if (offender == GUEST_UNASSIGNED)
		enter_bad_mode = true;

	return enter_bad_mode;
}
261
/* Hook entry points registered with the vm_err library.  Each returns
 * true when the guest should enter bad mode.
 */
static bool self_async_err_handler(const struct errData *const err_data)
{
	return handle_async_err_details(err_data);
}

static bool self_sync_err_handler(const struct errData *const err_data)
{
	return handle_sync_err_details(err_data);
}

static bool peer_err_handler(const struct errData *const err_data)
{
	return handle_peer_err_details(err_data);
}
276
277static int hooks_init(void)
278{
279 int ret;
280
281 handlers.fn_self_async = self_async_err_handler;
282 handlers.fn_self_sync = self_sync_err_handler;
283 handlers.fn_peer =
284 IS_ENABLED(CONFIG_TEGRA_EBP) ? NULL : peer_err_handler;
285
286 ret = tegra_hv_register_vm_err_hooks(&handlers);
287 if (ret)
288 return ret;
289
290 tegra_hv_get_config(&config);
291 pr_info("%s: Guest Id %u\n", __func__, config.guest_id_self);
292
293 /* EBP, being unprivileged, doesn't know about total guests */
294 if (IS_ENABLED(CONFIG_TEGRA_EBP) == 0)
295 pr_info("%s: Total guests %u\n", __func__, config.num_guests);
296
297 return 0;
298}
299
300static void hooks_exit(void)
301{
302 struct vm_err_handlers handlers;
303
304 handlers.fn_self_async = NULL;
305 handlers.fn_self_sync = NULL;
306 handlers.fn_peer = NULL;
307
308 tegra_hv_register_vm_err_hooks(&handlers);
309}
/* subsys_initcall: register early so error handlers are in place before
 * most drivers probe.
 */
subsys_initcall(hooks_init);
module_exit(hooks_exit);

MODULE_AUTHOR("Nvidia Corporation");
MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("Sample VM Error Handler");