diff options
-rw-r--r-- | Documentation/edac.txt | 56 |
1 files changed, 29 insertions, 27 deletions
diff --git a/Documentation/edac.txt b/Documentation/edac.txt index bd3f8a3905af..0b875e8da969 100644 --- a/Documentation/edac.txt +++ b/Documentation/edac.txt | |||
@@ -766,7 +766,7 @@ exports one | |||
766 | For injecting a memory error, there are some sysfs nodes, under | 766 | For injecting a memory error, there are some sysfs nodes, under |
767 | /sys/devices/system/edac/mc/mc?/: | 767 | /sys/devices/system/edac/mc/mc?/: |
768 | 768 | ||
769 | inject_addrmatch: | 769 | inject_addrmatch/*: |
770 | Controls the error injection mask register. It is possible to specify | 770 | Controls the error injection mask register. It is possible to specify |
771 | several characteristics of the address to match an error code: | 771 | several characteristics of the address to match an error code: |
772 | dimm = the affected dimm. Numbers are relative to a channel; | 772 | dimm = the affected dimm. Numbers are relative to a channel; |
@@ -781,10 +781,12 @@ exports one | |||
781 | 781 | ||
782 | For example, to generate an error at rank 1 of dimm 2, for any channel, | 782 | For example, to generate an error at rank 1 of dimm 2, for any channel, |
783 | any bank, any page, any column: | 783 | any bank, any page, any column: |
784 | echo "dimm:2 rank:1" >/sys/devices/system/edac/mc/mc0/inject_addrmatch | 784 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm |
785 | echo 1 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank | ||
785 | 786 | ||
786 | To return to the default behaviour of matching any, you can do: | 787 | To return to the default behaviour of matching any, you can do: |
787 | echo "dimm:any rank:any" >/sys/devices/system/edac/mc/mc0/inject_addrmatch | 788 | echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/dimm |
789 | echo any >/sys/devices/system/edac/mc/mc0/inject_addrmatch/rank | ||
788 | 790 | ||
789 | inject_eccmask: | 791 | inject_eccmask: |
790 | specifies what bits will have troubles, | 792 | specifies what bits will have troubles, |
@@ -813,7 +815,7 @@ exports one | |||
813 | For example, the following code will generate an error for any write access | 815 | For example, the following code will generate an error for any write access |
814 | at socket 0, on any DIMM/address on channel 2: | 816 | at socket 0, on any DIMM/address on channel 2: |
815 | 817 | ||
816 | echo "channel:2" > /sys/devices/system/edac/mc/mc0/inject_addrmatch | 818 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_addrmatch/channel |
817 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_type | 819 | echo 2 >/sys/devices/system/edac/mc/mc0/inject_type |
818 | echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask | 820 | echo 64 >/sys/devices/system/edac/mc/mc0/inject_eccmask |
819 | echo 3 >/sys/devices/system/edac/mc/mc0/inject_section | 821 | echo 3 >/sys/devices/system/edac/mc/mc0/inject_section |
@@ -829,18 +831,23 @@ exports one | |||
829 | 831 | ||
830 | 3) Nehalem specific Corrected Error memory counters | 832 | 3) Nehalem specific Corrected Error memory counters |
831 | 833 | ||
832 | Nehalem has some registers to count memory errors, reporting them in a | 834 | Nehalem has some registers to count memory errors. The driver uses those |
833 | way that is different from what the EDAC API allows. Due to that, a | 835 | registers to report Corrected Errors on devices with Registered Dimms. |
834 | separate sysfs node was created to handle such counters. | ||
835 | 836 | ||
836 | They can be read by looking at the contents of "corrected_error_counts" | 837 | However, those counters don't work with Unregistered Dimms. As the chipset |
837 | counter. Due to hardware limits, the output is different on machines | 838 | offers some counters that also work with UDIMMs (but with a worse level of |
838 | with unregistered memories and machines with registered ones. | 839 | granularity than the default ones), the driver exposes those registers for |
840 | UDIMM memories. | ||
839 | 841 | ||
840 | With unregistered memories, it outputs: | 842 | They can be read by looking at the contents of all_channel_counts/ |
841 | 843 | ||
842 | $ cat /sys/devices/system/edac/mc/mc0/corrected_error_counts | 844 | $ for i in /sys/devices/system/edac/mc/mc0/all_channel_counts/*; do echo $i; cat $i; done |
843 | all channels UDIMM0: 0 UDIMM1: 0 UDIMM2: 0 | 845 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm0 |
846 | 0 | ||
847 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm1 | ||
848 | 0 | ||
849 | /sys/devices/system/edac/mc/mc0/all_channel_counts/udimm2 | ||
850 | 0 | ||
844 | 851 | ||
845 | What happens here is that errors on different csrows, but at the same | 852 | What happens here is that errors on different csrows, but at the same |
846 | dimm number will increment the same counter. | 853 | dimm number will increment the same counter. |
@@ -849,21 +856,16 @@ exports one | |||
849 | csrow1: channel 0, dimm1 | 856 | csrow1: channel 0, dimm1 |
850 | csrow2: channel 1, dimm0 | 857 | csrow2: channel 1, dimm0 |
851 | csrow3: channel 2, dimm0 | 858 | csrow3: channel 2, dimm0 |
852 | The hardware will increment UDIMM0 for an error at either csrow0, csrow2 | 859 | The hardware will increment udimm0 for an error at the first dimm at either |
853 | or csrow3. | 860 | csrow0, csrow2 or csrow3; |
854 | 861 | The hardware will increment udimm1 for an error at the second dimm at either | |
855 | With registered memories, it outputs: | 862 | csrow0, csrow2 or csrow3; |
856 | 863 | The hardware will increment udimm2 for an error at the third dimm at either | |
857 | $cat /sys/devices/system/edac/mc/mc0/corrected_error_counts | 864 | csrow0, csrow2 or csrow3; |
858 | channel 0 RDIMM0: 0 RDIMM1: 0 RDIMM2: 0 | ||
859 | channel 1 RDIMM0: 0 RDIMM1: 0 RDIMM2: 0 | ||
860 | channel 2 RDIMM0: 0 RDIMM1: 0 RDIMM2: 0 | ||
861 | |||
862 | So, with registered memories, there's a direct map between a csrow and a | ||
863 | counter. | ||
864 | 865 | ||
865 | 4) Standard error counters | 866 | 4) Standard error counters |
866 | 867 | ||
867 | The standard error counters are generated when an mcelog error is received | 868 | The standard error counters are generated when an mcelog error is received |
868 | by the driver. Since it is counted by software, it is possible that some | 869 | by the driver. Since, with udimm, this is counted by software, it is |
869 | errors could be lost. | 870 | possible that some errors could be lost. With rdimms, they display the |
871 | contents of the registers. | ||