summaryrefslogtreecommitdiffstats
path: root/tools/testing/selftests
diff options
context:
space:
mode:
authorOliver O'Halloran <oohall@gmail.com>2019-09-03 06:16:05 -0400
committerMichael Ellerman <mpe@ellerman.id.au>2019-09-05 00:22:40 -0400
commit85d86c8aa52eb5b3539eebe3adcc2f077118b412 (patch)
treefbaaba976f4d8f615dae79ec7033ece3e728228c /tools/testing/selftests
parentbd6461cc7b3c4fd12dcba4b0e95dfc612df872fd (diff)
selftests/powerpc: Add basic EEH selftest
Use the new eeh_dev_check and eeh_dev_break interfaces to test EEH recovery. Historically this has been done manually using platform specific EEH error injection facilities (e.g. via RTAS). However, documentation on how to use these facilities is haphazard at best and non-existent at worst so it's hard to develop a cross-platform test. The new debugfs interfaces allow the kernel to handle the platform specific details so we can write a more generic set of tests. This patch adds the most basic of recovery tests where: a) Errors are injected and recovered from sequentially, b) Errors are not injected into PCI-PCI bridges, such as PCIe switches. c) Errors are only injected into device function zero. d) No errors are injected into Virtual Functions. a), b) and c) are largely due to limitations of Linux's EEH support. EEH recovery is serialised in the EEH recovery thread which forces a). Similarly, for c), multi-function PCI devices are almost always grouped into the same PE so injecting an error on one function exercises the same code paths as injecting it on any other. b) is because we currently more or less ignore PCI bridges during recovery and assume that the recovered topology will be the same as the original. d) is due to the limits of the eeh_dev_break interface. With the current implementation we can't inject an error into a specific VF without potentially causing additional errors on other VFs. Due to the serialised recovery process we might end up timing out waiting for another function to recover before the function of interest is recovered. The platform specific error injection facilities are finer-grained and allow this capability, but doing that requires working out how to use those facilities first. Basically, it's better than nothing and it's a base to build on. Signed-off-by: Oliver O'Halloran <oohall@gmail.com> Signed-off-by: Michael Ellerman <mpe@ellerman.id.au> Link: https://lore.kernel.org/r/20190903101605.2890-15-oohall@gmail.com
Diffstat (limited to 'tools/testing/selftests')
-rw-r--r--tools/testing/selftests/powerpc/Makefile1
-rw-r--r--tools/testing/selftests/powerpc/eeh/Makefile9
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-basic.sh82
-rwxr-xr-xtools/testing/selftests/powerpc/eeh/eeh-functions.sh76
4 files changed, 168 insertions, 0 deletions
diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile
index b3ad909aefbc..644770c3b754 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -26,6 +26,7 @@ SUB_DIRS = alignment \
26 switch_endian \ 26 switch_endian \
27 syscalls \ 27 syscalls \
28 tm \ 28 tm \
29 eeh \
29 vphn \ 30 vphn \
30 math \ 31 math \
31 ptrace \ 32 ptrace \
diff --git a/tools/testing/selftests/powerpc/eeh/Makefile b/tools/testing/selftests/powerpc/eeh/Makefile
new file mode 100644
index 000000000000..b397babd569b
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/Makefile
@@ -0,0 +1,9 @@
1# SPDX-License-Identifier: GPL-2.0
2noarg:
3 $(MAKE) -C ../
4
5TEST_PROGS := eeh-basic.sh
6TEST_FILES := eeh-functions.sh
7
8top_srcdir = ../../../../..
9include ../../lib.mk
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-basic.sh b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
new file mode 100755
index 000000000000..f988d2f42e8f
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-basic.sh
@@ -0,0 +1,82 @@
#!/bin/sh
# SPDX-License-Identifier: GPL-2.0-only
#
# Basic EEH recovery selftest.
#
# Injects an EEH error into every eligible PCI device (function zero only,
# no bridges, no virtual functions), one device at a time, and checks that
# each one recovers. Relies on pe_ok, eeh_supported and eeh_one_dev from
# eeh-functions.sh.
#
# Exit status:
#   0 - every tested device recovered
#   1 - at least one device failed to recover, or the test could not be set up
#   4 - EEH is not supported on this system (kselftest "skip")

# Exit status the kselftest harness interprets as "test skipped".
KSELFTESTS_SKIP=4

debugfs_dir="/sys/kernel/debug/powerpc"

# Source the helpers relative to this script so the test also works when it
# is not started from its own directory.
. "$(dirname "$0")/eeh-functions.sh"

if ! eeh_supported ; then
	echo "EEH not supported on this system, skipping"
	exit "$KSELFTESTS_SKIP"
fi

# Both debugfs files are needed: one to inject the error, one to make the
# kernel notice it. Missing either one makes the test impossible.
if [ ! -e "$debugfs_dir/eeh_dev_check" ] || \
   [ ! -e "$debugfs_dir/eeh_dev_break" ] ; then
	echo "debugfs EEH testing files are missing. Is debugfs mounted?"
	exit 1
fi

pre_lspci="$(mktemp)" || exit 1
# Remove the snapshot on every exit path, not only the successful one.
trap 'rm -f "$pre_lspci"' EXIT
lspci > "$pre_lspci"

# Bump the max freeze count to something absurd so we don't
# trip over it while breaking things.
echo 5000 > "$debugfs_dir/eeh_max_freezes"

# Record the devices that we break in here. Assuming everything
# goes to plan we should get them back once the recovery process
# is finished.
devices=""
dev_count=0

# Build up a list of candidate devices (function zero only).
for dev_path in /sys/bus/pci/devices/*.0 ; do
	# An unmatched glob is left as the literal pattern; skip it.
	[ -e "$dev_path" ] || continue
	dev="${dev_path##*/}"

	# Skip bridges since we can't recover them (yet...)
	if [ -e "$dev_path/pci_bus" ] ; then
		echo "$dev, Skipped: bridge"
		continue
	fi

	# Skip VFs for now since we don't have a reliable way
	# to break them.
	if [ -e "$dev_path/physfn" ] ; then
		echo "$dev, Skipped: virtfn"
		continue
	fi

	# Don't inject errors into an already-frozen PE. This happens with
	# PEs that contain multiple PCI devices (e.g. multi-function cards)
	# and injecting new errors during the recovery process will probably
	# result in the recovery failing and the device being marked as
	# failed.
	if ! pe_ok "$dev" ; then
		echo "$dev, Skipped: Bad initial PE state"
		continue
	fi

	echo "$dev, Added"

	# Add to the list of devices to check. Device names never contain
	# whitespace, so a space-separated string is safe to iterate over.
	devices="$devices $dev"
	dev_count=$((dev_count + 1))
done

echo "Found ${dev_count} breakable devices..."

failed=0
for dev in $devices ; do
	echo "Breaking $dev..."

	# Breaking an earlier device may have taken this one down with it.
	if ! pe_ok "$dev" ; then
		echo "Skipping $dev, Initial PE state is not ok"
		failed=$((failed + 1))
		continue
	fi

	if ! eeh_one_dev "$dev" ; then
		failed=$((failed + 1))
	fi
done

echo "$failed devices failed to recover ($dev_count tested)"
# Show any difference in the visible PCI topology caused by the test.
lspci | diff -u "$pre_lspci" -

# Do not exit with the raw failure count: it wraps modulo 256 (256 failures
# would read as success) and a count of 4 would be mistaken for "skipped".
if [ "$failed" -ne 0 ] ; then
	exit 1
fi
exit 0
diff --git a/tools/testing/selftests/powerpc/eeh/eeh-functions.sh b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
new file mode 100755
index 000000000000..26112ab5cdf4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/eeh/eeh-functions.sh
@@ -0,0 +1,76 @@
1#!/bin/sh
2# SPDX-License-Identifier: GPL-2.0-only
3
# pe_ok <dev> - check whether the PE containing a PCI device is healthy.
#
# Arguments: $1 - sysfs PCI device name (DDDD:BB:DD.F)
# Returns:   0 if the PE is fully operational; 1 if it is isolated, being
#            recovered, not fully active, or if the device's eeh_pe_state
#            file is missing, unreadable or truncated.
pe_ok() {
	local dev="$1"
	local path="/sys/bus/pci/devices/$dev/eeh_pe_state"
	local fw_state=""
	local sw_state=""
	local rest=""

	if ! [ -e "$path" ] ; then
		return 1
	fi

	# Read both fields in one go so the firmware and software state come
	# from the same snapshot; the state can change between two separate
	# reads while a recovery is in progress. The status of read itself is
	# not used: the emptiness check below covers every failure mode.
	read -r fw_state sw_state rest < "$path"
	if [ -z "$fw_state" ] || [ -z "$sw_state" ] ; then
		return 1
	fi

	# If EEH_PE_ISOLATED or EEH_PE_RECOVERING are set then the PE is in an
	# error state or being recovered. Either way, not ok.
	if [ "$((sw_state & 0x3))" -ne 0 ] ; then
		return 1
	fi

	# A functioning PE should have the EEH_STATE_MMIO_ACTIVE and
	# EEH_STATE_DMA_ACTIVE flags set. The platform backends also set
	# these while the PE is in reset, but the RECOVERING check above
	# should stop any false positives from that.
	if [ "$((fw_state & 0x18))" -ne "$((0x18))" ] ; then
		return 1
	fi

	return 0
}
31
# eeh_supported - check whether the running kernel has EEH enabled.
#
# Returns: 0 if /proc/powerpc/eeh exists and reports the EEH subsystem as
#          enabled, non-zero otherwise.
eeh_supported() {
	local status_file=/proc/powerpc/eeh

	# No status file means no EEH support at all.
	if ! [ -e "$status_file" ] ; then
		return 1
	fi

	grep -q 'EEH Subsystem is enabled' "$status_file"
}
36
# eeh_one_dev <dev> - inject an EEH error into one device and wait for it
# to recover.
#
# Arguments: $1 - sysfs PCI device name (DDDD:BB:DD.F)
# Outputs:   progress and result messages on stdout
# Returns:   0 if the device's PE recovered within the timeout, 1 if the
#            argument is not a known device, the error could not be
#            injected, or recovery timed out.
eeh_one_dev() {
	local dev="$1"
	# Enforce a 30s timeout for recovery. Even the IPR, which is infamously
	# slow to reset, should recover within 30s.
	local max_wait=30
	local waited=0

	# Using this function from the command line is sometimes useful for
	# testing so check that the argument is a well-formed sysfs device
	# name. An empty name must be rejected explicitly: the devices
	# directory itself would otherwise satisfy the existence check.
	if [ -z "$dev" ] || ! [ -e "/sys/bus/pci/devices/$dev/" ] ; then
		echo "Error: '$dev' must be a sysfs device name (DDDD:BB:DD.F)"
		return 1
	fi

	# Break it. If the injection is rejected the PE stays healthy, which
	# would otherwise be reported as an instant recovery.
	if ! echo "$dev" > /sys/kernel/debug/powerpc/eeh_dev_break ; then
		echo "$dev, Failed to inject an error!"
		return 1
	fi

	# Force an EEH device check. If the kernel has already
	# noticed the EEH (due to a driver poll or whatever), this
	# is a no-op.
	echo "$dev" > /sys/kernel/debug/powerpc/eeh_dev_check

	# Poll once a second until the PE is healthy again or the timeout
	# expires.
	while ! pe_ok "$dev" ; do
		if [ "$waited" -ge "$max_wait" ] ; then
			echo "$dev, Failed to recover!"
			return 1
		fi
		echo "$dev, waited $waited/${max_wait}"
		sleep 1
		waited=$((waited + 1))
	done

	echo "$dev, Recovered after $waited seconds"
	return 0
}
76