diff options
author | Jay Cornwall <Jay.Cornwall@amd.com> | 2019-07-01 16:46:56 -0400 |
---|---|---|
committer | Alex Deucher <alexander.deucher@amd.com> | 2019-07-18 15:18:06 -0400 |
commit | 37f86a9b3617d55ad8189e1b7e6468b85dba4b88 (patch) | |
tree | a2525b2957285fbb0ecf091c41455148ec44c953 | |
parent | 5ddd4a9a7c25a6a23a79f973e7a87b1403503719 (diff) |
drm/amdkfd: Merge gfx9/arcturus trap handlers, add ACC VGPR save
ACC VGPRs are a secondary VGPR set of same size as the primary VGPRs.
Save them as a block immediately following VGPRs.
Signed-off-by: Jay Cornwall <Jay.Cornwall@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | 627 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_arcturus.asm | 746 | ||||
-rw-r--r-- | drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | 83 |
3 files changed, 538 insertions, 918 deletions
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h index ee700a69c68e..c45ba0013ca5 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h | |||
@@ -861,200 +861,487 @@ static const uint32_t cwsr_trap_gfx10_hex[] = { | |||
861 | 0xbf9f0000, 0x00000000, | 861 | 0xbf9f0000, 0x00000000, |
862 | }; | 862 | }; |
863 | static const uint32_t cwsr_trap_arcturus_hex[] = { | 863 | static const uint32_t cwsr_trap_arcturus_hex[] = { |
864 | 0xbf820001, 0xbf8200ca, | 864 | 0xbf820001, 0xbf8202bd, |
865 | 0xb8f0f802, 0x89708670, | 865 | 0xb8f8f802, 0x89788678, |
866 | 0xb8f1f803, 0x8671ff71, | 866 | 0xb8fbf803, 0x866eff7b, |
867 | 0x00000400, 0xbf850008, | 867 | 0x00000400, 0xbf85003b, |
868 | 0xb8f1f803, 0x8671ff71, | 868 | 0x866eff7b, 0x00000800, |
869 | 0x000001ff, 0xbf850001, | 869 | 0xbf850003, 0x866eff7b, |
870 | 0x806c846c, 0x866dff6d, | 870 | 0x00000100, 0xbf84000c, |
871 | 0x0000ffff, 0xbe801f6c, | 871 | 0x866eff78, 0x00002000, |
872 | 0xb8f1f803, 0x8671ff71, | 872 | 0xbf840005, 0xbf8e0010, |
873 | 0x00000100, 0xbf840006, | 873 | 0xb8eef803, 0x866eff6e, |
874 | 0xbef60080, 0xb9760203, | 874 | 0x00000400, 0xbf84fffb, |
875 | 0x866dff6d, 0x0000ffff, | 875 | 0x8778ff78, 0x00002000, |
876 | 0x80ec886c, 0x82ed806d, | 876 | 0x80ec886c, 0x82ed806d, |
877 | 0xbef60080, 0xb9760283, | 877 | 0xb8eef807, 0x866fff6e, |
878 | 0xbef20068, 0xbef30069, | 878 | 0x001f8000, 0x8e6f8b6f, |
879 | 0xb8f62407, 0x8e769b76, | 879 | 0x8977ff77, 0xfc000000, |
880 | 0x876d766d, 0xb8f603c7, | 880 | 0x87776f77, 0x896eff6e, |
881 | 0x8e769a76, 0x876d766d, | 881 | 0x001f8000, 0xb96ef807, |
882 | 0xb8f6f807, 0x8676ff76, | 882 | 0xb8faf812, 0xb8fbf813, |
883 | 0x00007fff, 0xb976f807, | 883 | 0x8efa887a, 0xc0071bbd, |
884 | 0xbeee007e, 0xbeef007f, | 884 | 0x00000000, 0xbf8cc07f, |
885 | 0xbefe0180, 0xbf900004, | 885 | 0xc0071ebd, 0x00000008, |
886 | 0xbf8e0002, 0xbf88fffe, | 886 | 0xbf8cc07f, 0x86ee6e6e, |
887 | 0xbf840001, 0xbe801d6e, | ||
888 | 0xb8fbf803, 0x867bff7b, | ||
889 | 0x000001ff, 0xbf850002, | ||
890 | 0x806c846c, 0x826d806d, | ||
891 | 0x866dff6d, 0x0000ffff, | ||
892 | 0x8f6e8b77, 0x866eff6e, | ||
893 | 0x001f8000, 0xb96ef807, | ||
894 | 0x86fe7e7e, 0x86ea6a6a, | ||
895 | 0x8f6e8378, 0xb96ee0c2, | ||
896 | 0xbf800002, 0xb9780002, | ||
897 | 0xbe801f6c, 0x866dff6d, | ||
898 | 0x0000ffff, 0xbefa0080, | ||
899 | 0xb97a0283, 0xb8fa2407, | ||
900 | 0x8e7a9b7a, 0x876d7a6d, | ||
901 | 0xb8fa03c7, 0x8e7a9a7a, | ||
902 | 0x876d7a6d, 0xb8faf807, | ||
903 | 0x867aff7a, 0x00007fff, | ||
904 | 0xb97af807, 0xbeee007e, | ||
905 | 0xbeef007f, 0xbefe0180, | ||
906 | 0xbf900004, 0x877a8478, | ||
907 | 0xb97af802, 0xbf8e0002, | ||
908 | 0xbf88fffe, 0xb8fa2a05, | ||
909 | 0x807a817a, 0x8e7a8a7a, | ||
910 | 0x8e7a817a, 0xb8fb1605, | ||
911 | 0x807b817b, 0x8e7b867b, | ||
912 | 0x807a7b7a, 0x807a7e7a, | ||
913 | 0x827b807f, 0x867bff7b, | ||
914 | 0x0000ffff, 0xc04b1c3d, | ||
915 | 0x00000050, 0xbf8cc07f, | ||
916 | 0xc04b1d3d, 0x00000060, | ||
917 | 0xbf8cc07f, 0xc0431e7d, | ||
918 | 0x00000074, 0xbf8cc07f, | ||
887 | 0xbef4007e, 0x8675ff7f, | 919 | 0xbef4007e, 0x8675ff7f, |
888 | 0x0000ffff, 0x8775ff75, | 920 | 0x0000ffff, 0x8775ff75, |
889 | 0x00040000, 0xbef60080, | 921 | 0x00040000, 0xbef60080, |
890 | 0xbef700ff, 0x00807fac, | 922 | 0xbef700ff, 0x00807fac, |
891 | 0x8676ff7f, 0x08000000, | 923 | 0x867aff7f, 0x08000000, |
892 | 0x8f768376, 0x87777677, | 924 | 0x8f7a837a, 0x87777a77, |
893 | 0x8676ff7f, 0x70000000, | 925 | 0x867aff7f, 0x70000000, |
894 | 0x8f768176, 0x87777677, | 926 | 0x8f7a817a, 0x87777a77, |
895 | 0xbefb007c, 0xbefa0080, | 927 | 0xbef1007c, 0xbef00080, |
896 | 0xbf8a0000, 0x8676ff7f, | 928 | 0xb8f02a05, 0x80708170, |
897 | 0x04000000, 0xbf840012, | 929 | 0x8e708a70, 0x8e708170, |
898 | 0xbefe00c1, 0xbeff00c1, | 930 | 0xb8fa1605, 0x807a817a, |
899 | 0xb8f14306, 0x8671c171, | 931 | 0x8e7a867a, 0x80707a70, |
900 | 0xbf84000d, 0x8e718671, | 932 | 0xbef60084, 0xbef600ff, |
901 | 0x8e718271, 0xbef60071, | 933 | 0x01000000, 0xbefe007c, |
902 | 0xbef600ff, 0x01000000, | 934 | 0xbefc0070, 0xc0611c7a, |
935 | 0x0000007c, 0xbf8cc07f, | ||
936 | 0x80708470, 0xbefc007e, | ||
937 | 0xbefe007c, 0xbefc0070, | ||
938 | 0xc0611b3a, 0x0000007c, | ||
939 | 0xbf8cc07f, 0x80708470, | ||
940 | 0xbefc007e, 0xbefe007c, | ||
941 | 0xbefc0070, 0xc0611b7a, | ||
942 | 0x0000007c, 0xbf8cc07f, | ||
943 | 0x80708470, 0xbefc007e, | ||
944 | 0xbefe007c, 0xbefc0070, | ||
945 | 0xc0611bba, 0x0000007c, | ||
946 | 0xbf8cc07f, 0x80708470, | ||
947 | 0xbefc007e, 0xbefe007c, | ||
948 | 0xbefc0070, 0xc0611bfa, | ||
949 | 0x0000007c, 0xbf8cc07f, | ||
950 | 0x80708470, 0xbefc007e, | ||
951 | 0xbefe007c, 0xbefc0070, | ||
952 | 0xc0611e3a, 0x0000007c, | ||
953 | 0xbf8cc07f, 0x80708470, | ||
954 | 0xbefc007e, 0xb8fbf803, | ||
955 | 0xbefe007c, 0xbefc0070, | ||
956 | 0xc0611efa, 0x0000007c, | ||
957 | 0xbf8cc07f, 0x80708470, | ||
958 | 0xbefc007e, 0xbefe007c, | ||
959 | 0xbefc0070, 0xc0611a3a, | ||
960 | 0x0000007c, 0xbf8cc07f, | ||
961 | 0x80708470, 0xbefc007e, | ||
962 | 0xbefe007c, 0xbefc0070, | ||
963 | 0xc0611a7a, 0x0000007c, | ||
964 | 0xbf8cc07f, 0x80708470, | ||
965 | 0xbefc007e, 0xb8f1f801, | ||
966 | 0xbefe007c, 0xbefc0070, | ||
967 | 0xc0611c7a, 0x0000007c, | ||
968 | 0xbf8cc07f, 0x80708470, | ||
969 | 0xbefc007e, 0x867aff7f, | ||
970 | 0x04000000, 0xbeef0080, | ||
971 | 0x876f6f7a, 0xb8f02a05, | ||
972 | 0x80708170, 0x8e708a70, | ||
973 | 0x8e708170, 0xb8fb1605, | ||
974 | 0x807b817b, 0x8e7b847b, | ||
975 | 0x8e76827b, 0xbef600ff, | ||
976 | 0x01000000, 0xbef20174, | ||
977 | 0x80747074, 0x82758075, | ||
903 | 0xbefc0080, 0xbf800000, | 978 | 0xbefc0080, 0xbf800000, |
904 | 0x807cff7c, 0x00000100, | 979 | 0xbe802b00, 0xbe822b02, |
905 | 0x807aff7a, 0x00000100, | 980 | 0xbe842b04, 0xbe862b06, |
906 | 0xbf0a717c, 0xbf85fffa, | 981 | 0xbe882b08, 0xbe8a2b0a, |
982 | 0xbe8c2b0c, 0xbe8e2b0e, | ||
983 | 0xc06b003a, 0x00000000, | ||
984 | 0xbf8cc07f, 0xc06b013a, | ||
985 | 0x00000010, 0xbf8cc07f, | ||
986 | 0xc06b023a, 0x00000020, | ||
987 | 0xbf8cc07f, 0xc06b033a, | ||
988 | 0x00000030, 0xbf8cc07f, | ||
989 | 0x8074c074, 0x82758075, | ||
990 | 0x807c907c, 0xbf0a7b7c, | ||
991 | 0xbf85ffe7, 0xbef40172, | ||
992 | 0xbef00080, 0xbefe00c1, | ||
993 | 0xbeff00c1, 0xbee80080, | ||
994 | 0xbee90080, 0xbef600ff, | ||
995 | 0x01000000, 0x867aff78, | ||
996 | 0x00400000, 0xbf850003, | ||
997 | 0xb8faf803, 0x897a7aff, | ||
998 | 0x10000000, 0xbf85004d, | ||
999 | 0xbe840080, 0xd2890000, | ||
1000 | 0x00000900, 0x80048104, | ||
1001 | 0xd2890001, 0x00000900, | ||
1002 | 0x80048104, 0xd2890002, | ||
1003 | 0x00000900, 0x80048104, | ||
1004 | 0xd2890003, 0x00000900, | ||
1005 | 0x80048104, 0xc069003a, | ||
1006 | 0x00000070, 0xbf8cc07f, | ||
1007 | 0x80709070, 0xbf06c004, | ||
1008 | 0xbf84ffee, 0xbe840080, | ||
1009 | 0xd2890000, 0x00000901, | ||
1010 | 0x80048104, 0xd2890001, | ||
1011 | 0x00000901, 0x80048104, | ||
1012 | 0xd2890002, 0x00000901, | ||
1013 | 0x80048104, 0xd2890003, | ||
1014 | 0x00000901, 0x80048104, | ||
1015 | 0xc069003a, 0x00000070, | ||
1016 | 0xbf8cc07f, 0x80709070, | ||
1017 | 0xbf06c004, 0xbf84ffee, | ||
1018 | 0xbe840080, 0xd2890000, | ||
1019 | 0x00000902, 0x80048104, | ||
1020 | 0xd2890001, 0x00000902, | ||
1021 | 0x80048104, 0xd2890002, | ||
1022 | 0x00000902, 0x80048104, | ||
1023 | 0xd2890003, 0x00000902, | ||
1024 | 0x80048104, 0xc069003a, | ||
1025 | 0x00000070, 0xbf8cc07f, | ||
1026 | 0x80709070, 0xbf06c004, | ||
1027 | 0xbf84ffee, 0xbe840080, | ||
1028 | 0xd2890000, 0x00000903, | ||
1029 | 0x80048104, 0xd2890001, | ||
1030 | 0x00000903, 0x80048104, | ||
1031 | 0xd2890002, 0x00000903, | ||
1032 | 0x80048104, 0xd2890003, | ||
1033 | 0x00000903, 0x80048104, | ||
1034 | 0xc069003a, 0x00000070, | ||
1035 | 0xbf8cc07f, 0x80709070, | ||
1036 | 0xbf06c004, 0xbf84ffee, | ||
1037 | 0xbf820008, 0xe0724000, | ||
1038 | 0x701d0000, 0xe0724100, | ||
1039 | 0x701d0100, 0xe0724200, | ||
1040 | 0x701d0200, 0xe0724300, | ||
1041 | 0x701d0300, 0xbefe00c1, | ||
1042 | 0xbeff00c1, 0xb8fb4306, | ||
1043 | 0x867bc17b, 0xbf840064, | ||
1044 | 0xbf8a0000, 0x867aff6f, | ||
1045 | 0x04000000, 0xbf840060, | ||
1046 | 0x8e7b867b, 0x8e7b827b, | ||
1047 | 0xbef6007b, 0xb8f02a05, | ||
1048 | 0x80708170, 0x8e708a70, | ||
1049 | 0x8e708170, 0xb8fa1605, | ||
1050 | 0x807a817a, 0x8e7a867a, | ||
1051 | 0x80707a70, 0x8070ff70, | ||
1052 | 0x00000080, 0xbef600ff, | ||
1053 | 0x01000000, 0xbefc0080, | ||
1054 | 0xd28c0002, 0x000100c1, | ||
1055 | 0xd28d0003, 0x000204c1, | ||
1056 | 0x867aff78, 0x00400000, | ||
1057 | 0xbf850003, 0xb8faf803, | ||
1058 | 0x897a7aff, 0x10000000, | ||
1059 | 0xbf850030, 0x24040682, | ||
1060 | 0xd86e4000, 0x00000002, | ||
1061 | 0xbf8cc07f, 0xbe840080, | ||
1062 | 0xd2890000, 0x00000900, | ||
1063 | 0x80048104, 0xd2890001, | ||
1064 | 0x00000900, 0x80048104, | ||
1065 | 0xd2890002, 0x00000900, | ||
1066 | 0x80048104, 0xd2890003, | ||
1067 | 0x00000900, 0x80048104, | ||
1068 | 0xc069003a, 0x00000070, | ||
1069 | 0xbf8cc07f, 0x80709070, | ||
1070 | 0xbf06c004, 0xbf84ffee, | ||
1071 | 0xbe840080, 0xd2890000, | ||
1072 | 0x00000901, 0x80048104, | ||
1073 | 0xd2890001, 0x00000901, | ||
1074 | 0x80048104, 0xd2890002, | ||
1075 | 0x00000901, 0x80048104, | ||
1076 | 0xd2890003, 0x00000901, | ||
1077 | 0x80048104, 0xc069003a, | ||
1078 | 0x00000070, 0xbf8cc07f, | ||
1079 | 0x80709070, 0xbf06c004, | ||
1080 | 0xbf84ffee, 0x680404ff, | ||
1081 | 0x00000200, 0xd0c9006a, | ||
1082 | 0x0000f702, 0xbf87ffd2, | ||
1083 | 0xbf820015, 0xd1060002, | ||
1084 | 0x00011103, 0x7e0602ff, | ||
1085 | 0x00000200, 0xbefc00ff, | ||
1086 | 0x00010000, 0xbe800077, | ||
1087 | 0x8677ff77, 0xff7fffff, | ||
1088 | 0x8777ff77, 0x00058000, | ||
1089 | 0xd8ec0000, 0x00000002, | ||
1090 | 0xbf8cc07f, 0xe0765000, | ||
1091 | 0x701d0002, 0x68040702, | ||
1092 | 0xd0c9006a, 0x0000f702, | ||
1093 | 0xbf87fff7, 0xbef70000, | ||
1094 | 0xbef000ff, 0x00000400, | ||
907 | 0xbefe00c1, 0xbeff00c1, | 1095 | 0xbefe00c1, 0xbeff00c1, |
908 | 0xb8f12a05, 0x80718171, | 1096 | 0xb8fb2a05, 0x807b817b, |
909 | 0x8e718271, 0x8e768871, | 1097 | 0x8e7b827b, 0x8e76887b, |
910 | 0xbef600ff, 0x01000000, | 1098 | 0xbef600ff, 0x01000000, |
1099 | 0xbefc0084, 0xbf0a7b7c, | ||
1100 | 0xbf84006d, 0xbf11017c, | ||
1101 | 0x807bff7b, 0x00001000, | ||
1102 | 0x867aff78, 0x00400000, | ||
1103 | 0xbf850003, 0xb8faf803, | ||
1104 | 0x897a7aff, 0x10000000, | ||
1105 | 0xbf850051, 0xbe840080, | ||
1106 | 0xd2890000, 0x00000900, | ||
1107 | 0x80048104, 0xd2890001, | ||
1108 | 0x00000900, 0x80048104, | ||
1109 | 0xd2890002, 0x00000900, | ||
1110 | 0x80048104, 0xd2890003, | ||
1111 | 0x00000900, 0x80048104, | ||
1112 | 0xc069003a, 0x00000070, | ||
1113 | 0xbf8cc07f, 0x80709070, | ||
1114 | 0xbf06c004, 0xbf84ffee, | ||
1115 | 0xbe840080, 0xd2890000, | ||
1116 | 0x00000901, 0x80048104, | ||
1117 | 0xd2890001, 0x00000901, | ||
1118 | 0x80048104, 0xd2890002, | ||
1119 | 0x00000901, 0x80048104, | ||
1120 | 0xd2890003, 0x00000901, | ||
1121 | 0x80048104, 0xc069003a, | ||
1122 | 0x00000070, 0xbf8cc07f, | ||
1123 | 0x80709070, 0xbf06c004, | ||
1124 | 0xbf84ffee, 0xbe840080, | ||
1125 | 0xd2890000, 0x00000902, | ||
1126 | 0x80048104, 0xd2890001, | ||
1127 | 0x00000902, 0x80048104, | ||
1128 | 0xd2890002, 0x00000902, | ||
1129 | 0x80048104, 0xd2890003, | ||
1130 | 0x00000902, 0x80048104, | ||
1131 | 0xc069003a, 0x00000070, | ||
1132 | 0xbf8cc07f, 0x80709070, | ||
1133 | 0xbf06c004, 0xbf84ffee, | ||
1134 | 0xbe840080, 0xd2890000, | ||
1135 | 0x00000903, 0x80048104, | ||
1136 | 0xd2890001, 0x00000903, | ||
1137 | 0x80048104, 0xd2890002, | ||
1138 | 0x00000903, 0x80048104, | ||
1139 | 0xd2890003, 0x00000903, | ||
1140 | 0x80048104, 0xc069003a, | ||
1141 | 0x00000070, 0xbf8cc07f, | ||
1142 | 0x80709070, 0xbf06c004, | ||
1143 | 0xbf84ffee, 0x807c847c, | ||
1144 | 0xbf0a7b7c, 0xbf85ffb1, | ||
1145 | 0xbf9c0000, 0xbf820012, | ||
1146 | 0x7e000300, 0x7e020301, | ||
1147 | 0x7e040302, 0x7e060303, | ||
1148 | 0xe0724000, 0x701d0000, | ||
1149 | 0xe0724100, 0x701d0100, | ||
1150 | 0xe0724200, 0x701d0200, | ||
1151 | 0xe0724300, 0x701d0300, | ||
1152 | 0x807c847c, 0x8070ff70, | ||
1153 | 0x00000400, 0xbf0a7b7c, | ||
1154 | 0xbf85ffef, 0xbf9c0000, | ||
911 | 0xbefc0080, 0xbf11017c, | 1155 | 0xbefc0080, 0xbf11017c, |
912 | 0x8071ff71, 0x00001000, | 1156 | 0x867aff78, 0x00400000, |
913 | 0x7e000300, 0xe0724000, | 1157 | 0xbf850003, 0xb8faf803, |
914 | 0x7a1d0000, 0x807c817c, | 1158 | 0x897a7aff, 0x10000000, |
915 | 0x807aff7a, 0x00000100, | 1159 | 0xbf850059, 0xd3d84000, |
916 | 0xbf0a717c, 0xbf85fff8, | 1160 | 0x18000100, 0xd3d84001, |
917 | 0xbf9c0000, 0xbefe00c1, | 1161 | 0x18000101, 0xd3d84002, |
918 | 0xbeff00c1, 0xb8f12a05, | 1162 | 0x18000102, 0xd3d84003, |
919 | 0x80718171, 0x8e718271, | 1163 | 0x18000103, 0xbe840080, |
920 | 0x8e768871, 0xbef600ff, | 1164 | 0xd2890000, 0x00000900, |
921 | 0x01000000, 0xbefc0080, | 1165 | 0x80048104, 0xd2890001, |
922 | 0xbf11017c, 0x8071ff71, | 1166 | 0x00000900, 0x80048104, |
923 | 0x00001000, 0xd3d84000, | 1167 | 0xd2890002, 0x00000900, |
924 | 0x18000100, 0x7e000000, | 1168 | 0x80048104, 0xd2890003, |
925 | 0x7e000000, 0xe0724000, | 1169 | 0x00000900, 0x80048104, |
926 | 0x7a1d0000, 0x807c817c, | 1170 | 0xc069003a, 0x00000070, |
927 | 0x807aff7a, 0x00000100, | 1171 | 0xbf8cc07f, 0x80709070, |
928 | 0xbf0a717c, 0xbf85fff5, | 1172 | 0xbf06c004, 0xbf84ffee, |
929 | 0xbf9c0000, 0xb8f11605, | 1173 | 0xbe840080, 0xd2890000, |
930 | 0x80718171, 0x8e718471, | 1174 | 0x00000901, 0x80048104, |
931 | 0x8e768871, 0xbef600ff, | 1175 | 0xd2890001, 0x00000901, |
932 | 0x01000000, 0xbefc0080, | 1176 | 0x80048104, 0xd2890002, |
933 | 0xbf800000, 0xbe802a00, | 1177 | 0x00000901, 0x80048104, |
934 | 0x7e000200, 0xe0724000, | 1178 | 0xd2890003, 0x00000901, |
935 | 0x7a1d0000, 0x807aff7a, | 1179 | 0x80048104, 0xc069003a, |
936 | 0x00000100, 0x807c817c, | 1180 | 0x00000070, 0xbf8cc07f, |
937 | 0xbf0a717c, 0xbf85fff7, | 1181 | 0x80709070, 0xbf06c004, |
938 | 0xbef60084, 0xbef600ff, | 1182 | 0xbf84ffee, 0xbe840080, |
939 | 0x01000000, 0x7e00027b, | 1183 | 0xd2890000, 0x00000902, |
940 | 0xe0724000, 0x7a1d0000, | 1184 | 0x80048104, 0xd2890001, |
941 | 0x807aff7a, 0x00000100, | 1185 | 0x00000902, 0x80048104, |
942 | 0x7e00026c, 0xe0724000, | 1186 | 0xd2890002, 0x00000902, |
943 | 0x7a1d0000, 0x807aff7a, | 1187 | 0x80048104, 0xd2890003, |
944 | 0x00000100, 0x7e00026d, | 1188 | 0x00000902, 0x80048104, |
945 | 0xe0724000, 0x7a1d0000, | 1189 | 0xc069003a, 0x00000070, |
946 | 0x807aff7a, 0x00000100, | 1190 | 0xbf8cc07f, 0x80709070, |
947 | 0x7e00026e, 0xe0724000, | 1191 | 0xbf06c004, 0xbf84ffee, |
948 | 0x7a1d0000, 0x807aff7a, | 1192 | 0xbe840080, 0xd2890000, |
949 | 0x00000100, 0x7e00026f, | 1193 | 0x00000903, 0x80048104, |
950 | 0xe0724000, 0x7a1d0000, | 1194 | 0xd2890001, 0x00000903, |
951 | 0x807aff7a, 0x00000100, | 1195 | 0x80048104, 0xd2890002, |
952 | 0x7e000270, 0xe0724000, | 1196 | 0x00000903, 0x80048104, |
953 | 0x7a1d0000, 0x807aff7a, | 1197 | 0xd2890003, 0x00000903, |
954 | 0x00000100, 0xb8f1f803, | 1198 | 0x80048104, 0xc069003a, |
955 | 0x7e000271, 0xe0724000, | 1199 | 0x00000070, 0xbf8cc07f, |
956 | 0x7a1d0000, 0x807aff7a, | 1200 | 0x80709070, 0xbf06c004, |
957 | 0x00000100, 0x7e000272, | 1201 | 0xbf84ffee, 0x807c847c, |
958 | 0xe0724000, 0x7a1d0000, | 1202 | 0xbf0a7b7c, 0xbf85ffa9, |
959 | 0x807aff7a, 0x00000100, | 1203 | 0xbf9c0000, 0xbf820016, |
960 | 0x7e000273, 0xe0724000, | 1204 | 0xd3d84000, 0x18000100, |
961 | 0x7a1d0000, 0x807aff7a, | 1205 | 0xd3d84001, 0x18000101, |
962 | 0x00000100, 0xb8fbf801, | 1206 | 0xd3d84002, 0x18000102, |
963 | 0x7e00027b, 0xe0724000, | 1207 | 0xd3d84003, 0x18000103, |
964 | 0x7a1d0000, 0x807aff7a, | 1208 | 0xe0724000, 0x701d0000, |
965 | 0x00000100, 0xbf8200bb, | 1209 | 0xe0724100, 0x701d0100, |
966 | 0xbef4007e, 0x8675ff7f, | 1210 | 0xe0724200, 0x701d0200, |
967 | 0x0000ffff, 0x8775ff75, | 1211 | 0xe0724300, 0x701d0300, |
968 | 0x00040000, 0xbef60080, | 1212 | 0x807c847c, 0x8070ff70, |
969 | 0xbef700ff, 0x00807fac, | 1213 | 0x00000400, 0xbf0a7b7c, |
970 | 0x8672ff7f, 0x08000000, | 1214 | 0xbf85ffeb, 0xbf9c0000, |
971 | 0x8f728372, 0x87777277, | 1215 | 0xbf820106, 0xbef4007e, |
972 | 0x8672ff7f, 0x70000000, | 1216 | 0x8675ff7f, 0x0000ffff, |
973 | 0x8f728172, 0x87777277, | 1217 | 0x8775ff75, 0x00040000, |
974 | 0xbef80080, 0x8672ff7f, | 1218 | 0xbef60080, 0xbef700ff, |
975 | 0x04000000, 0xbf840011, | 1219 | 0x00807fac, 0x866eff7f, |
1220 | 0x08000000, 0x8f6e836e, | ||
1221 | 0x87776e77, 0x866eff7f, | ||
1222 | 0x70000000, 0x8f6e816e, | ||
1223 | 0x87776e77, 0x866eff7f, | ||
1224 | 0x04000000, 0xbf84001f, | ||
976 | 0xbefe00c1, 0xbeff00c1, | 1225 | 0xbefe00c1, 0xbeff00c1, |
977 | 0xb8ef4306, 0x866fc16f, | 1226 | 0xb8ef4306, 0x866fc16f, |
978 | 0xbf84000c, 0x8e6f866f, | 1227 | 0xbf84001a, 0x8e6f866f, |
979 | 0x8e6f826f, 0xbef6006f, | 1228 | 0x8e6f826f, 0xbef6006f, |
1229 | 0xb8f82a05, 0x80788178, | ||
1230 | 0x8e788a78, 0x8e788178, | ||
1231 | 0xb8ee1605, 0x806e816e, | ||
1232 | 0x8e6e866e, 0x80786e78, | ||
1233 | 0x8078ff78, 0x00000080, | ||
980 | 0xbef600ff, 0x01000000, | 1234 | 0xbef600ff, 0x01000000, |
981 | 0xbefc0080, 0x807cff7c, | 1235 | 0xbefc0080, 0xe0510000, |
982 | 0x00000100, 0x8078ff78, | 1236 | 0x781d0000, 0xe0510100, |
983 | 0x00000100, 0xbf0a6f7c, | 1237 | 0x781d0000, 0x807cff7c, |
984 | 0xbf85fffa, 0xbefe00c1, | 1238 | 0x00000200, 0x8078ff78, |
985 | 0xbeff00c1, 0xb8ef2a05, | 1239 | 0x00000200, 0xbf0a6f7c, |
986 | 0x806f816f, 0x8e6f826f, | 1240 | 0xbf85fff6, 0xbef80080, |
987 | 0x8e76886f, 0xbef600ff, | ||
988 | 0x01000000, 0xbef20078, | ||
989 | 0x8078ff78, 0x00000100, | ||
990 | 0xbefc0081, 0xbf11087c, | ||
991 | 0x806fff6f, 0x00008000, | ||
992 | 0xe0524000, 0x781d0000, | ||
993 | 0xbf8c0f70, 0x7e000300, | ||
994 | 0x807c817c, 0x8078ff78, | ||
995 | 0x00000100, 0xbf0a6f7c, | ||
996 | 0xbf85fff7, 0xbf9c0000, | ||
997 | 0xbefe00c1, 0xbeff00c1, | 1241 | 0xbefe00c1, 0xbeff00c1, |
998 | 0xb8ef2a05, 0x806f816f, | 1242 | 0xb8ef2a05, 0x806f816f, |
999 | 0x8e6f826f, 0x8e76886f, | 1243 | 0x8e6f826f, 0x8e76886f, |
1000 | 0xbef600ff, 0x01000000, | 1244 | 0xbef90076, 0xbef600ff, |
1001 | 0xbefc0080, 0xbf11087c, | 1245 | 0x01000000, 0xbeee0078, |
1002 | 0x806fff6f, 0x00008000, | 1246 | 0x8078ff78, 0x00000400, |
1247 | 0xbef30079, 0x8079ff79, | ||
1248 | 0x00000400, 0xbefc0084, | ||
1249 | 0xbf11087c, 0x806fff6f, | ||
1250 | 0x00008000, 0xe0524000, | ||
1251 | 0x791d0000, 0xe0524100, | ||
1252 | 0x791d0100, 0xe0524200, | ||
1253 | 0x791d0200, 0xe0524300, | ||
1254 | 0x791d0300, 0x8079ff79, | ||
1255 | 0x00000400, 0xbf8c0f70, | ||
1256 | 0xd3d94000, 0x18000100, | ||
1257 | 0xd3d94001, 0x18000101, | ||
1258 | 0xd3d94002, 0x18000102, | ||
1259 | 0xd3d94003, 0x18000103, | ||
1003 | 0xe0524000, 0x781d0000, | 1260 | 0xe0524000, 0x781d0000, |
1004 | 0xbf8c0f70, 0xd3d94000, | 1261 | 0xe0524100, 0x781d0100, |
1005 | 0x18000100, 0x807c817c, | 1262 | 0xe0524200, 0x781d0200, |
1006 | 0x8078ff78, 0x00000100, | 1263 | 0xe0524300, 0x781d0300, |
1007 | 0xbf0a6f7c, 0xbf85fff6, | 1264 | 0xbf8c0f70, 0x7e000300, |
1265 | 0x7e020301, 0x7e040302, | ||
1266 | 0x7e060303, 0x807c847c, | ||
1267 | 0x8078ff78, 0x00000400, | ||
1268 | 0xbf0a6f7c, 0xbf85ffdb, | ||
1008 | 0xbf9c0000, 0xe0524000, | 1269 | 0xbf9c0000, 0xe0524000, |
1009 | 0x721d0000, 0xb8ef1605, | 1270 | 0x731d0000, 0xe0524100, |
1271 | 0x731d0100, 0xe0524200, | ||
1272 | 0x731d0200, 0xe0524300, | ||
1273 | 0x731d0300, 0xbf8c0f70, | ||
1274 | 0xd3d94000, 0x18000100, | ||
1275 | 0xd3d94001, 0x18000101, | ||
1276 | 0xd3d94002, 0x18000102, | ||
1277 | 0xd3d94003, 0x18000103, | ||
1278 | 0xe0524000, 0x6e1d0000, | ||
1279 | 0xe0524100, 0x6e1d0100, | ||
1280 | 0xe0524200, 0x6e1d0200, | ||
1281 | 0xe0524300, 0x6e1d0300, | ||
1282 | 0xb8f82a05, 0x80788178, | ||
1283 | 0x8e788a78, 0x8e788178, | ||
1284 | 0xb8ee1605, 0x806e816e, | ||
1285 | 0x8e6e866e, 0x80786e78, | ||
1286 | 0x80f8c078, 0xb8ef1605, | ||
1010 | 0x806f816f, 0x8e6f846f, | 1287 | 0x806f816f, 0x8e6f846f, |
1011 | 0x8e76886f, 0xbef600ff, | 1288 | 0x8e76826f, 0xbef600ff, |
1012 | 0x01000000, 0xc0211cba, | 1289 | 0x01000000, 0xbefc006f, |
1013 | 0x00000078, 0x8078ff78, | 1290 | 0xc031003a, 0x00000078, |
1014 | 0x00000100, 0xbefc0081, | 1291 | 0x80f8c078, 0xbf8cc07f, |
1015 | 0xc021003a, 0x00000078, | 1292 | 0x80fc907c, 0xbf800000, |
1016 | 0x8078ff78, 0x00000100, | 1293 | 0xbe802d00, 0xbe822d02, |
1017 | 0xbf8cc07f, 0xbe802c00, | 1294 | 0xbe842d04, 0xbe862d06, |
1018 | 0xbf800000, 0x807c817c, | 1295 | 0xbe882d08, 0xbe8a2d0a, |
1019 | 0xbf0a6f7c, 0xbf85fff6, | 1296 | 0xbe8c2d0c, 0xbe8e2d0e, |
1020 | 0xbe800072, 0xbef60084, | 1297 | 0xbf06807c, 0xbf84fff0, |
1021 | 0xbef600ff, 0x01000000, | 1298 | 0xb8f82a05, 0x80788178, |
1022 | 0xc0211bfa, 0x00000078, | 1299 | 0x8e788a78, 0x8e788178, |
1023 | 0x8078ff78, 0x00000100, | 1300 | 0xb8ee1605, 0x806e816e, |
1301 | 0x8e6e866e, 0x80786e78, | ||
1302 | 0xbef60084, 0xbef600ff, | ||
1303 | 0x01000000, 0xc0211bfa, | ||
1304 | 0x00000078, 0x80788478, | ||
1024 | 0xc0211b3a, 0x00000078, | 1305 | 0xc0211b3a, 0x00000078, |
1025 | 0x8078ff78, 0x00000100, | 1306 | 0x80788478, 0xc0211b7a, |
1026 | 0xc0211b7a, 0x00000078, | 1307 | 0x00000078, 0x80788478, |
1027 | 0x8078ff78, 0x00000100, | ||
1028 | 0xc0211eba, 0x00000078, | ||
1029 | 0x8078ff78, 0x00000100, | ||
1030 | 0xc0211efa, 0x00000078, | ||
1031 | 0x8078ff78, 0x00000100, | ||
1032 | 0xc0211c3a, 0x00000078, | 1308 | 0xc0211c3a, 0x00000078, |
1033 | 0x8078ff78, 0x00000100, | 1309 | 0x80788478, 0xc0211c7a, |
1034 | 0xc0211c7a, 0x00000078, | 1310 | 0x00000078, 0x80788478, |
1035 | 0x8078ff78, 0x00000100, | 1311 | 0xc0211eba, 0x00000078, |
1312 | 0x80788478, 0xc0211efa, | ||
1313 | 0x00000078, 0x80788478, | ||
1036 | 0xc0211a3a, 0x00000078, | 1314 | 0xc0211a3a, 0x00000078, |
1037 | 0x8078ff78, 0x00000100, | 1315 | 0x80788478, 0xc0211a7a, |
1038 | 0xc0211a7a, 0x00000078, | 1316 | 0x00000078, 0x80788478, |
1039 | 0x8078ff78, 0x00000100, | ||
1040 | 0xc0211cfa, 0x00000078, | 1317 | 0xc0211cfa, 0x00000078, |
1041 | 0x8078ff78, 0x00000100, | 1318 | 0x80788478, 0xbf8cc07f, |
1042 | 0xbf8cc07f, 0xbef2006d, | 1319 | 0xbefc006f, 0xbefe0070, |
1043 | 0x866dff72, 0x0000ffff, | 1320 | 0xbeff0071, 0x866f7bff, |
1044 | 0xbefc006f, 0xbefe007a, | ||
1045 | 0xbeff007b, 0x866f71ff, | ||
1046 | 0x000003ff, 0xb96f4803, | 1321 | 0x000003ff, 0xb96f4803, |
1047 | 0x866f71ff, 0xfffff800, | 1322 | 0x866f7bff, 0xfffff800, |
1048 | 0x8f6f8b6f, 0xb96fa2c3, | 1323 | 0x8f6f8b6f, 0xb96fa2c3, |
1049 | 0xb973f801, 0x866fff72, | 1324 | 0xb973f801, 0xb8ee2a05, |
1050 | 0xf8000000, 0x8f6f9b6f, | 1325 | 0x806e816e, 0x8e6e8a6e, |
1051 | 0x8e6f906f, 0xbef30080, | 1326 | 0x8e6e816e, 0xb8ef1605, |
1052 | 0x87736f73, 0x866fff72, | 1327 | 0x806f816f, 0x8e6f866f, |
1053 | 0x04000000, 0x8f6f9a6f, | 1328 | 0x806e6f6e, 0x806e746e, |
1054 | 0x8e6f8f6f, 0x87736f73, | 1329 | 0x826f8075, 0x866fff6f, |
1055 | 0x866fff70, 0x00800000, | 1330 | 0x0000ffff, 0xc00b1c37, |
1056 | 0x8f6f976f, 0xb973f807, | 1331 | 0x00000050, 0xc00b1d37, |
1057 | 0x86fe7e7e, 0x86ea6a6a, | 1332 | 0x00000060, 0xc0031e77, |
1058 | 0xb970f802, 0xbf8a0000, | 1333 | 0x00000074, 0xbf8cc07f, |
1334 | 0x866fff6d, 0xf8000000, | ||
1335 | 0x8f6f9b6f, 0x8e6f906f, | ||
1336 | 0xbeee0080, 0x876e6f6e, | ||
1337 | 0x866fff6d, 0x04000000, | ||
1338 | 0x8f6f9a6f, 0x8e6f8f6f, | ||
1339 | 0x876e6f6e, 0x866fff7a, | ||
1340 | 0x00800000, 0x8f6f976f, | ||
1341 | 0xb96ef807, 0x866dff6d, | ||
1342 | 0x0000ffff, 0x86fe7e7e, | ||
1343 | 0x86ea6a6a, 0x8f6e837a, | ||
1344 | 0xb96ee0c2, 0xbf800002, | ||
1345 | 0xb97a0002, 0xbf8a0000, | ||
1059 | 0x95806f6c, 0xbf810000, | 1346 | 0x95806f6c, 0xbf810000, |
1060 | }; | 1347 | }; |
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_arcturus.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_arcturus.asm deleted file mode 100644 index b83e2a643ddb..000000000000 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_arcturus.asm +++ /dev/null | |||
@@ -1,746 +0,0 @@ | |||
1 | shader main | ||
2 | |||
3 | asic(DEFAULT) | ||
4 | |||
5 | type(CS) | ||
6 | |||
7 | /*************************************************************************/ | ||
8 | /* control on how to run the shader */ | ||
9 | /*************************************************************************/ | ||
10 | //any hack that needs to be made to run this code in EMU (either because various EMU code is not ready or there is no compute save & restore in EMU run) | ||
11 | var EMU_RUN_HACK = 0 | ||
12 | var EMU_RUN_HACK_RESTORE_NORMAL = 0 | ||
13 | var EMU_RUN_HACK_SAVE_NORMAL_EXIT = 0 | ||
14 | var EMU_RUN_HACK_SAVE_SINGLE_WAVE = 0 | ||
15 | var EMU_RUN_HACK_SAVE_FIRST_TIME = 0 //for interrupted restore in which the first save is through EMU_RUN_HACK | ||
16 | var SAVE_LDS = 0 | ||
17 | var WG_BASE_ADDR_LO = 0x9000a000 | ||
18 | var WG_BASE_ADDR_HI = 0x0 | ||
19 | var WAVE_SPACE = 0x6000 //memory size that each wave occupies in workgroup state mem | ||
20 | var CTX_SAVE_CONTROL = 0x0 | ||
21 | var CTX_RESTORE_CONTROL = CTX_SAVE_CONTROL | ||
22 | var SIM_RUN_HACK = 0 //any hack that needs to be made to run this code in SIM (either because various RTL code is not ready or there is no compute save & restore in RTL run) | ||
23 | var SGPR_SAVE_USE_SQC = 0 //use SQC D$ to do the write | ||
24 | var USE_MTBUF_INSTEAD_OF_MUBUF = 0 //need to change BUF_DATA_FORMAT in S_SAVE_BUF_RSRC_WORD3_MISC from 0 to BUF_DATA_FORMAT_32 if set to 1 (i.e. 0x00827FAC) | ||
25 | var SWIZZLE_EN = 0 //whether we use swizzled buffer addressing | ||
26 | |||
27 | /**************************************************************************/ | ||
28 | /* variables */ | ||
29 | /**************************************************************************/ | ||
30 | var SQ_WAVE_STATUS_INST_ATC_SHIFT = 23 | ||
31 | var SQ_WAVE_STATUS_INST_ATC_MASK = 0x00800000 | ||
32 | var SQ_WAVE_STATUS_SPI_PRIO_MASK = 0x00000006 | ||
33 | |||
34 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT = 12 | ||
35 | var SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE = 9 | ||
36 | var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT = 8 | ||
37 | var SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE = 6 | ||
38 | var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT = 24 | ||
39 | var SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE = 3 //FIXME sq.blk still has 4 bits at this time while SQ programming guide has 3 bits | ||
40 | |||
41 | var SQ_WAVE_TRAPSTS_SAVECTX_MASK = 0x400 | ||
42 | var SQ_WAVE_TRAPSTS_EXCE_MASK = 0x1FF // Exception mask | ||
43 | var SQ_WAVE_TRAPSTS_SAVECTX_SHIFT = 10 | ||
44 | var SQ_WAVE_TRAPSTS_MEM_VIOL_MASK = 0x100 | ||
45 | var SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT = 8 | ||
46 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK = 0x3FF | ||
47 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT = 0x0 | ||
48 | var SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE = 10 | ||
49 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK = 0xFFFFF800 | ||
50 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT = 11 | ||
51 | var SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE = 21 | ||
52 | |||
53 | var SQ_WAVE_IB_STS_RCNT_SHIFT = 16 //FIXME | ||
54 | var SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT = 15 //FIXME | ||
55 | var SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG = 0x00007FFF //FIXME | ||
56 | |||
57 | var SQ_BUF_RSRC_WORD1_ATC_SHIFT = 24 | ||
58 | var SQ_BUF_RSRC_WORD3_MTYPE_SHIFT = 27 | ||
59 | |||
60 | |||
61 | /* Save */ | ||
62 | var S_SAVE_BUF_RSRC_WORD1_STRIDE = 0x00040000 //stride is 4 bytes | ||
63 | var S_SAVE_BUF_RSRC_WORD3_MISC = 0x00807FAC //SQ_SEL_X/Y/Z/W, BUF_NUM_FORMAT_FLOAT, (0 for MUBUF stride[17:14] when ADD_TID_ENABLE and BUF_DATA_FORMAT_32 for MTBUF), ADD_TID_ENABLE | ||
64 | |||
65 | var S_SAVE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit | ||
66 | var S_SAVE_SPI_INIT_ATC_SHIFT = 27 | ||
67 | var S_SAVE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype | ||
68 | var S_SAVE_SPI_INIT_MTYPE_SHIFT = 28 | ||
69 | var S_SAVE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG | ||
70 | var S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT = 26 | ||
71 | |||
72 | var S_SAVE_PC_HI_RCNT_SHIFT = 27 //FIXME check with Brian to ensure all fields other than PC[47:0] can be used | ||
73 | var S_SAVE_PC_HI_RCNT_MASK = 0xF8000000 //FIXME | ||
74 | var S_SAVE_PC_HI_FIRST_REPLAY_SHIFT = 26 //FIXME | ||
75 | var S_SAVE_PC_HI_FIRST_REPLAY_MASK = 0x04000000 //FIXME | ||
76 | |||
77 | var s_save_spi_init_lo = exec_lo | ||
78 | var s_save_spi_init_hi = exec_hi | ||
79 | |||
80 | var s_save_pc_lo = ttmp0 //{TTMP1, TTMP0} = {3'h0,pc_rewind[3:0], HT[0],trapID[7:0], PC[47:0]} | ||
81 | var s_save_pc_hi = ttmp1 | ||
82 | var s_save_exec_lo = ttmp2 | ||
83 | var s_save_exec_hi = ttmp3 | ||
84 | var s_save_status = ttmp4 | ||
85 | var s_save_trapsts = ttmp5 //not really used until the end of the SAVE routine | ||
86 | var s_save_xnack_mask_lo = ttmp6 | ||
87 | var s_save_xnack_mask_hi = ttmp7 | ||
88 | var s_save_buf_rsrc0 = ttmp8 | ||
89 | var s_save_buf_rsrc1 = ttmp9 | ||
90 | var s_save_buf_rsrc2 = ttmp10 | ||
91 | var s_save_buf_rsrc3 = ttmp11 | ||
92 | |||
93 | var s_save_mem_offset = ttmp14 | ||
94 | var s_save_alloc_size = s_save_trapsts //conflict | ||
95 | var s_save_tmp = s_save_buf_rsrc2 //shared with s_save_buf_rsrc2 (conflict: should not use mem access with s_save_tmp at the same time) | ||
96 | var s_save_m0 = ttmp15 | ||
97 | |||
98 | /* Restore */ | ||
99 | var S_RESTORE_BUF_RSRC_WORD1_STRIDE = S_SAVE_BUF_RSRC_WORD1_STRIDE | ||
100 | var S_RESTORE_BUF_RSRC_WORD3_MISC = S_SAVE_BUF_RSRC_WORD3_MISC | ||
101 | |||
102 | var S_RESTORE_SPI_INIT_ATC_MASK = 0x08000000 //bit[27]: ATC bit | ||
103 | var S_RESTORE_SPI_INIT_ATC_SHIFT = 27 | ||
104 | var S_RESTORE_SPI_INIT_MTYPE_MASK = 0x70000000 //bit[30:28]: Mtype | ||
105 | var S_RESTORE_SPI_INIT_MTYPE_SHIFT = 28 | ||
106 | var S_RESTORE_SPI_INIT_FIRST_WAVE_MASK = 0x04000000 //bit[26]: FirstWaveInTG | ||
107 | var S_RESTORE_SPI_INIT_FIRST_WAVE_SHIFT = 26 | ||
108 | |||
109 | var S_RESTORE_PC_HI_RCNT_SHIFT = S_SAVE_PC_HI_RCNT_SHIFT | ||
110 | var S_RESTORE_PC_HI_RCNT_MASK = S_SAVE_PC_HI_RCNT_MASK | ||
111 | var S_RESTORE_PC_HI_FIRST_REPLAY_SHIFT = S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
112 | var S_RESTORE_PC_HI_FIRST_REPLAY_MASK = S_SAVE_PC_HI_FIRST_REPLAY_MASK | ||
113 | |||
114 | var s_restore_spi_init_lo = exec_lo | ||
115 | var s_restore_spi_init_hi = exec_hi | ||
116 | |||
117 | var s_restore_mem_offset = ttmp12 | ||
118 | var s_restore_alloc_size = ttmp3 | ||
119 | var s_restore_tmp = ttmp6 | ||
120 | var s_restore_mem_offset_save = s_restore_tmp //no conflict | ||
121 | |||
122 | var s_restore_m0 = s_restore_alloc_size //no conflict | ||
123 | |||
124 | var s_restore_mode = ttmp7 | ||
125 | |||
126 | var s_restore_pc_lo = ttmp0 | ||
127 | var s_restore_pc_hi = ttmp1 | ||
128 | var s_restore_exec_lo = ttmp14 | ||
129 | var s_restore_exec_hi = ttmp15 | ||
130 | var s_restore_status = ttmp4 | ||
131 | var s_restore_trapsts = ttmp5 | ||
132 | var s_restore_xnack_mask_lo = xnack_mask_lo | ||
133 | var s_restore_xnack_mask_hi = xnack_mask_hi | ||
134 | var s_restore_buf_rsrc0 = ttmp8 | ||
135 | var s_restore_buf_rsrc1 = ttmp9 | ||
136 | var s_restore_buf_rsrc2 = ttmp10 | ||
137 | var s_restore_buf_rsrc3 = ttmp11 | ||
138 | |||
139 | /**************************************************************************/ | ||
140 | /* trap handler entry points */ | ||
141 | /**************************************************************************/ | ||
142 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) //hack to use trap_id for determining save/restore | ||
143 | //FIXME VCCZ un-init assertion s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC | ||
144 | s_and_b32 s_save_tmp, s_save_pc_hi, 0xffff0000 //change SCC | ||
145 | s_cmp_eq_u32 s_save_tmp, 0x007e0000 //Save: trap_id = 0x7e. Restore: trap_id = 0x7f. | ||
146 | s_cbranch_scc0 L_JUMP_TO_RESTORE //do not need to recover STATUS here since we are going to RESTORE | ||
147 | //FIXME s_setreg_b32 hwreg(HW_REG_STATUS), s_save_status //need to recover STATUS since we are going to SAVE | ||
148 | s_branch L_SKIP_RESTORE //NOT restore, SAVE actually | ||
149 | else | ||
150 | s_branch L_SKIP_RESTORE //NOT restore. might be a regular trap or save | ||
151 | end | ||
152 | |||
153 | L_JUMP_TO_RESTORE: | ||
154 | s_branch L_RESTORE //restore | ||
155 | |||
156 | L_SKIP_RESTORE: | ||
157 | |||
158 | s_getreg_b32 s_save_status, hwreg(HW_REG_STATUS) //save STATUS since we will change SCC | ||
159 | s_andn2_b32 s_save_status, s_save_status, SQ_WAVE_STATUS_SPI_PRIO_MASK //check whether this is for save | ||
160 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
161 | s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_SAVECTX_MASK //check whether this is for save | ||
162 | s_cbranch_scc1 L_SAVE //this is the operation for save | ||
163 | |||
164 | // ********* Handle non-CWSR traps ******************* | ||
165 | if (!EMU_RUN_HACK) | ||
166 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
167 | s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_EXCE_MASK // Check whether it is an exception | ||
168 | s_cbranch_scc1 L_EXCP_CASE // Exception, jump back to the shader program directly. | ||
169 | s_add_u32 ttmp0, ttmp0, 4 // S_TRAP case, add 4 to ttmp0 | ||
170 | |||
171 | L_EXCP_CASE: | ||
172 | s_and_b32 ttmp1, ttmp1, 0xFFFF | ||
173 | s_rfe_b64 [ttmp0, ttmp1] | ||
174 | end | ||
175 | // ********* End handling of non-CWSR traps ******************* | ||
176 | |||
177 | /**************************************************************************/ | ||
178 | /* save routine */ | ||
179 | /**************************************************************************/ | ||
180 | |||
181 | L_SAVE: | ||
182 | |||
183 | //check whether there is mem_viol | ||
184 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
185 | s_and_b32 s_save_trapsts, s_save_trapsts, SQ_WAVE_TRAPSTS_MEM_VIOL_MASK | ||
186 | s_cbranch_scc0 L_NO_PC_REWIND | ||
187 | |||
188 | //if so, need rewind PC assuming GDS operation gets NACKed | ||
189 | s_mov_b32 s_save_tmp, 0 //clear mem_viol bit | ||
190 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_MEM_VIOL_SHIFT, 1), s_save_tmp //clear mem_viol bit | ||
191 | s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] | ||
192 | s_sub_u32 s_save_pc_lo, s_save_pc_lo, 8 //pc[31:0]-8 | ||
193 | s_subb_u32 s_save_pc_hi, s_save_pc_hi, 0x0 // -scc | ||
194 | |||
195 | L_NO_PC_REWIND: | ||
196 | s_mov_b32 s_save_tmp, 0 //clear saveCtx bit | ||
197 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_SAVECTX_SHIFT, 1), s_save_tmp //clear saveCtx bit | ||
198 | |||
199 | s_mov_b32 s_save_xnack_mask_lo, xnack_mask_lo //save XNACK_MASK | ||
200 | s_mov_b32 s_save_xnack_mask_hi, xnack_mask_hi | ||
201 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_RCNT_SHIFT, SQ_WAVE_IB_STS_RCNT_SIZE) //save RCNT | ||
202 | s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_RCNT_SHIFT | ||
203 | s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp | ||
204 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT, SQ_WAVE_IB_STS_FIRST_REPLAY_SIZE) //save FIRST_REPLAY | ||
205 | s_lshl_b32 s_save_tmp, s_save_tmp, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
206 | s_or_b32 s_save_pc_hi, s_save_pc_hi, s_save_tmp | ||
207 | s_getreg_b32 s_save_tmp, hwreg(HW_REG_IB_STS) //clear RCNT and FIRST_REPLAY in IB_STS | ||
208 | s_and_b32 s_save_tmp, s_save_tmp, SQ_WAVE_IB_STS_RCNT_FIRST_REPLAY_MASK_NEG | ||
209 | |||
210 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_save_tmp | ||
211 | |||
212 | /* inform SPI the readiness and wait for SPI's go signal */ | ||
213 | s_mov_b32 s_save_exec_lo, exec_lo //save EXEC and use EXEC for the go signal from SPI | ||
214 | s_mov_b32 s_save_exec_hi, exec_hi | ||
215 | s_mov_b64 exec, 0x0 //clear EXEC to get ready to receive | ||
216 | if (EMU_RUN_HACK) | ||
217 | |||
218 | else | ||
219 | s_sendmsg sendmsg(MSG_SAVEWAVE) //send SPI a message and wait for SPI's write to EXEC | ||
220 | end | ||
221 | |||
222 | L_SLEEP: | ||
223 | s_sleep 0x2 | ||
224 | |||
225 | if (EMU_RUN_HACK) | ||
226 | |||
227 | else | ||
228 | s_cbranch_execz L_SLEEP | ||
229 | end | ||
230 | |||
231 | |||
232 | /* setup Resource Constants */ | ||
233 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_SINGLE_WAVE)) | ||
234 | //calculate wd_addr using absolute thread id | ||
235 | v_readlane_b32 s_save_tmp, v9, 0 | ||
236 | s_lshr_b32 s_save_tmp, s_save_tmp, 6 | ||
237 | s_mul_i32 s_save_tmp, s_save_tmp, WAVE_SPACE | ||
238 | s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO | ||
239 | s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI | ||
240 | s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL | ||
241 | else | ||
242 | end | ||
243 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_SINGLE_WAVE)) | ||
244 | s_add_i32 s_save_spi_init_lo, s_save_tmp, WG_BASE_ADDR_LO | ||
245 | s_mov_b32 s_save_spi_init_hi, WG_BASE_ADDR_HI | ||
246 | s_and_b32 s_save_spi_init_hi, s_save_spi_init_hi, CTX_SAVE_CONTROL | ||
247 | else | ||
248 | end | ||
249 | |||
250 | |||
251 | s_mov_b32 s_save_buf_rsrc0, s_save_spi_init_lo //base_addr_lo | ||
252 | s_and_b32 s_save_buf_rsrc1, s_save_spi_init_hi, 0x0000FFFF //base_addr_hi | ||
253 | s_or_b32 s_save_buf_rsrc1, s_save_buf_rsrc1, S_SAVE_BUF_RSRC_WORD1_STRIDE | ||
254 | s_mov_b32 s_save_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) although not neccessarily inited | ||
255 | s_mov_b32 s_save_buf_rsrc3, S_SAVE_BUF_RSRC_WORD3_MISC | ||
256 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_ATC_MASK | ||
257 | s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position | ||
258 | s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or ATC TODO: ATC deprecated, no need anymore. | ||
259 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_MTYPE_MASK | ||
260 | s_lshr_b32 s_save_tmp, s_save_tmp, (S_SAVE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position | ||
261 | s_or_b32 s_save_buf_rsrc3, s_save_buf_rsrc3, s_save_tmp //or MTYPE | ||
262 | |||
263 | s_mov_b32 s_save_m0, m0 //save M0 | ||
264 | |||
265 | /* global mem offset */ | ||
266 | s_mov_b32 s_save_mem_offset, 0x0 //mem offset initial value = 0 | ||
267 | |||
268 | |||
269 | /* the first wave in the threadgroup */ | ||
270 | s_barrier //FIXME not performance-optimal "LDS is used? wait for other waves in the same TG" | ||
271 | s_and_b32 s_save_tmp, s_save_spi_init_hi, S_SAVE_SPI_INIT_FIRST_WAVE_MASK //exec is still used here | ||
272 | s_cbranch_scc0 L_SAVE_VGPR | ||
273 | |||
274 | /* save LDS */ | ||
275 | ////////////////////////////// | ||
276 | L_SAVE_LDS: | ||
277 | |||
278 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
279 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
280 | |||
281 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size | ||
282 | s_and_b32 s_save_alloc_size, s_save_alloc_size, 0xFFFFFFFF //lds_size is zero? | ||
283 | s_cbranch_scc0 L_SAVE_VGPR //no lds used? jump to L_SAVE_VGPR | ||
284 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 6 //LDS size in dwords = lds_size * 64dw | ||
285 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //LDS size in bytes | ||
286 | s_mov_b32 s_save_buf_rsrc2, s_save_alloc_size //NUM_RECORDS in bytes | ||
287 | if (SWIZZLE_EN) | ||
288 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
289 | else | ||
290 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
291 | end | ||
292 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 | ||
293 | s_nop 0x0 //Manually inserted wait states | ||
294 | |||
295 | L_SAVE_LDS_LOOP: | ||
296 | if (SAVE_LDS) | ||
297 | buffer_store_lds_dword s_save_buf_rsrc0, s_save_mem_offset lds:1 | ||
298 | end | ||
299 | s_add_u32 m0, m0, 256 //every buffer_store_lds does 256 bytes | ||
300 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //mem offset increased by 256 bytes | ||
301 | s_cmp_lt_u32 m0, s_save_alloc_size //scc=(m0 < s_save_alloc_size) ? 1 : 0 | ||
302 | s_cbranch_scc1 L_SAVE_LDS_LOOP //LDS save is complete? | ||
303 | |||
304 | |||
305 | /* save VGPRs */ | ||
306 | ////////////////////////////// | ||
307 | L_SAVE_VGPR: | ||
308 | |||
309 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
310 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
311 | |||
312 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
313 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 | ||
314 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible | ||
315 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
316 | if (SWIZZLE_EN) | ||
317 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
318 | else | ||
319 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
320 | end | ||
321 | |||
322 | s_mov_b32 m0, 0x0 //VGPR initial index value =0 | ||
323 | s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 | ||
324 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later | ||
325 | |||
326 | L_SAVE_VGPR_LOOP: | ||
327 | v_mov_b32 v0, v0 //v0 = v[0+m0] | ||
328 | |||
329 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
330 | tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
331 | else | ||
332 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
333 | end | ||
334 | |||
335 | s_add_u32 m0, m0, 1 //next vgpr index | ||
336 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes | ||
337 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 | ||
338 | s_cbranch_scc1 L_SAVE_VGPR_LOOP //VGPR save is complete? | ||
339 | s_set_gpr_idx_off | ||
340 | |||
341 | |||
342 | /* save ACC_VGPRs */ | ||
343 | ////////////////////////////// | ||
344 | L_SAVE_ACC_VGPR: | ||
345 | |||
346 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on | ||
347 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
348 | |||
349 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
350 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 | ||
351 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) //FIXME for GFX, zero is possible | ||
352 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
353 | if (SWIZZLE_EN) | ||
354 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
355 | else | ||
356 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
357 | end | ||
358 | |||
359 | s_mov_b32 m0, 0x0 //VGPR initial index value =0 | ||
360 | s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 | ||
361 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 0x1000 //add 0x1000 since we compare m0 against it later | ||
362 | |||
363 | L_SAVE_ACC_VGPR_LOOP: | ||
364 | v_accvgpr_read v0, v0 | ||
365 | v_nop | ||
366 | v_nop | ||
367 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
368 | tbuffer_store_format_x v0, v0, s_save_buf_rsrc0, s_save_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
369 | else | ||
370 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
371 | end | ||
372 | |||
373 | s_add_u32 m0, m0, 1 //next vgpr index | ||
374 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256 //every buffer_store_dword does 256 bytes | ||
375 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 | ||
376 | s_cbranch_scc1 L_SAVE_ACC_VGPR_LOOP //VGPR save is complete? | ||
377 | s_set_gpr_idx_off | ||
378 | |||
379 | |||
380 | /* save SGPRs */ | ||
381 | ////////////////////////////// | ||
382 | s_getreg_b32 s_save_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size | ||
383 | s_add_u32 s_save_alloc_size, s_save_alloc_size, 1 | ||
384 | s_lshl_b32 s_save_alloc_size, s_save_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) | ||
385 | |||
386 | if (SGPR_SAVE_USE_SQC) | ||
387 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 2 //NUM_RECORDS in bytes | ||
388 | else | ||
389 | s_lshl_b32 s_save_buf_rsrc2, s_save_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) | ||
390 | end | ||
391 | |||
392 | if (SWIZZLE_EN) | ||
393 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
394 | else | ||
395 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
396 | end | ||
397 | |||
398 | s_mov_b32 m0, 0x0 //SGPR initial index value =0 | ||
399 | s_nop 0x0 //Manually inserted wait states | ||
400 | |||
401 | L_SAVE_SGPR_LOOP: | ||
402 | s_movrels_b32 s0, s0 //s0 = s[0+m0] | ||
403 | write_sgpr_to_mem(s0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PV: the best performance should be using s_buffer_store_dwordx4 | ||
404 | s_add_u32 m0, m0, 1 //next sgpr index | ||
405 | s_cmp_lt_u32 m0, s_save_alloc_size //scc = (m0 < s_save_alloc_size) ? 1 : 0 | ||
406 | s_cbranch_scc1 L_SAVE_SGPR_LOOP //SGPR save is complete? | ||
407 | |||
408 | /* save HW registers */ | ||
409 | ////////////////////////////// | ||
410 | L_SAVE_HWREG: | ||
411 | s_mov_b32 s_save_buf_rsrc2, 0x4 //NUM_RECORDS in bytes | ||
412 | if (SWIZZLE_EN) | ||
413 | s_add_u32 s_save_buf_rsrc2, s_save_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
414 | else | ||
415 | s_mov_b32 s_save_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
416 | end | ||
417 | |||
418 | |||
419 | write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //M0 | ||
420 | |||
421 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_SAVE_FIRST_TIME)) | ||
422 | s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 | ||
423 | s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over | ||
424 | end | ||
425 | |||
426 | write_sgpr_to_mem(s_save_pc_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //PC | ||
427 | write_sgpr_to_mem(s_save_pc_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) | ||
428 | write_sgpr_to_mem(s_save_exec_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //EXEC | ||
429 | write_sgpr_to_mem(s_save_exec_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) | ||
430 | write_sgpr_to_mem(s_save_status, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //STATUS | ||
431 | |||
432 | //s_save_trapsts conflicts with s_save_alloc_size | ||
433 | s_getreg_b32 s_save_trapsts, hwreg(HW_REG_TRAPSTS) | ||
434 | write_sgpr_to_mem(s_save_trapsts, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //TRAPSTS | ||
435 | |||
436 | write_sgpr_to_mem(s_save_xnack_mask_lo, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_LO | ||
437 | write_sgpr_to_mem(s_save_xnack_mask_hi, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) //XNACK_MASK_HI | ||
438 | |||
439 | //use s_save_tmp would introduce conflict here between s_save_tmp and s_save_buf_rsrc2 | ||
440 | s_getreg_b32 s_save_m0, hwreg(HW_REG_MODE) //MODE | ||
441 | write_sgpr_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset, SGPR_SAVE_USE_SQC, USE_MTBUF_INSTEAD_OF_MUBUF) | ||
442 | |||
443 | /* S_PGM_END_SAVED */ //FIXME graphics ONLY | ||
444 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) | ||
445 | s_and_b32 s_save_pc_hi, s_save_pc_hi, 0x0000ffff //pc[47:32] | ||
446 | s_add_u32 s_save_pc_lo, s_save_pc_lo, 4 //pc[31:0]+4 | ||
447 | s_addc_u32 s_save_pc_hi, s_save_pc_hi, 0x0 //carry bit over | ||
448 | s_rfe_b64 s_save_pc_lo //Return to the main shader program | ||
449 | else | ||
450 | end | ||
451 | |||
452 | |||
453 | s_branch L_END_PGM | ||
454 | |||
455 | |||
456 | |||
457 | /**************************************************************************/ | ||
458 | /* restore routine */ | ||
459 | /**************************************************************************/ | ||
460 | |||
461 | L_RESTORE: | ||
462 | /* Setup Resource Constants */ | ||
463 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) | ||
464 | //calculate wd_addr using absolute thread id | ||
465 | v_readlane_b32 s_restore_tmp, v9, 0 | ||
466 | s_lshr_b32 s_restore_tmp, s_restore_tmp, 6 | ||
467 | s_mul_i32 s_restore_tmp, s_restore_tmp, WAVE_SPACE | ||
468 | s_add_i32 s_restore_spi_init_lo, s_restore_tmp, WG_BASE_ADDR_LO | ||
469 | s_mov_b32 s_restore_spi_init_hi, WG_BASE_ADDR_HI | ||
470 | s_and_b32 s_restore_spi_init_hi, s_restore_spi_init_hi, CTX_RESTORE_CONTROL | ||
471 | else | ||
472 | end | ||
473 | |||
474 | s_mov_b32 s_restore_buf_rsrc0, s_restore_spi_init_lo //base_addr_lo | ||
475 | s_and_b32 s_restore_buf_rsrc1, s_restore_spi_init_hi, 0x0000FFFF //base_addr_hi | ||
476 | s_or_b32 s_restore_buf_rsrc1, s_restore_buf_rsrc1, S_RESTORE_BUF_RSRC_WORD1_STRIDE | ||
477 | s_mov_b32 s_restore_buf_rsrc2, 0 //NUM_RECORDS initial value = 0 (in bytes) | ||
478 | s_mov_b32 s_restore_buf_rsrc3, S_RESTORE_BUF_RSRC_WORD3_MISC | ||
479 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_ATC_MASK | ||
480 | s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_ATC_SHIFT-SQ_BUF_RSRC_WORD1_ATC_SHIFT) //get ATC bit into position | ||
481 | s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or ATC TODO: ATC deprecated, no need anymore. | ||
482 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_MTYPE_MASK | ||
483 | s_lshr_b32 s_restore_tmp, s_restore_tmp, (S_RESTORE_SPI_INIT_MTYPE_SHIFT-SQ_BUF_RSRC_WORD3_MTYPE_SHIFT) //get MTYPE bits into position | ||
484 | s_or_b32 s_restore_buf_rsrc3, s_restore_buf_rsrc3, s_restore_tmp //or MTYPE | ||
485 | |||
486 | /* global mem offset */ | ||
487 | s_mov_b32 s_restore_mem_offset, 0x0 //mem offset initial value = 0 | ||
488 | |||
489 | /* the first wave in the threadgroup */ | ||
490 | s_and_b32 s_restore_tmp, s_restore_spi_init_hi, S_RESTORE_SPI_INIT_FIRST_WAVE_MASK | ||
491 | s_cbranch_scc0 L_RESTORE_VGPR | ||
492 | |||
493 | /* restore LDS */ | ||
494 | ////////////////////////////// | ||
495 | L_RESTORE_LDS: | ||
496 | |||
497 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead | ||
498 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
499 | |||
500 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_LDS_ALLOC,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SHIFT,SQ_WAVE_LDS_ALLOC_LDS_SIZE_SIZE) //lds_size | ||
501 | s_and_b32 s_restore_alloc_size, s_restore_alloc_size, 0xFFFFFFFF //lds_size is zero? | ||
502 | s_cbranch_scc0 L_RESTORE_VGPR //no lds used? jump to L_RESTORE_VGPR | ||
503 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 6 //LDS size in dwords = lds_size * 64dw | ||
504 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //LDS size in bytes | ||
505 | s_mov_b32 s_restore_buf_rsrc2, s_restore_alloc_size //NUM_RECORDS in bytes | ||
506 | if (SWIZZLE_EN) | ||
507 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
508 | else | ||
509 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
510 | end | ||
511 | s_mov_b32 m0, 0x0 //lds_offset initial value = 0 | ||
512 | |||
513 | L_RESTORE_LDS_LOOP: | ||
514 | if (SAVE_LDS) | ||
515 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset lds:1 | ||
516 | end | ||
517 | s_add_u32 m0, m0, 256 //every buffer_load_dword does 256 bytes | ||
518 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //mem offset increased by 256 bytes | ||
519 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc=(m0 < s_restore_alloc_size) ? 1 : 0 | ||
520 | s_cbranch_scc1 L_RESTORE_LDS_LOOP //LDS restore is complete? | ||
521 | |||
522 | |||
523 | /* restore VGPRs */ | ||
524 | ////////////////////////////// | ||
525 | L_RESTORE_VGPR: | ||
526 | |||
527 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead | ||
528 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
529 | |||
530 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
531 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | ||
532 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) | ||
533 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
534 | if (SWIZZLE_EN) | ||
535 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
536 | else | ||
537 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
538 | end | ||
539 | s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last | ||
540 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 | ||
541 | s_mov_b32 m0, 1 //VGPR initial index value = 1 | ||
542 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 | ||
543 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later | ||
544 | |||
545 | L_RESTORE_VGPR_LOOP: | ||
546 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
547 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
548 | else | ||
549 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 | ||
550 | end | ||
551 | s_waitcnt vmcnt(0) //ensure data ready | ||
552 | v_mov_b32 v0, v0 //v[0+m0] = v0 | ||
553 | s_add_u32 m0, m0, 1 //next vgpr index | ||
554 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes | ||
555 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 | ||
556 | s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? | ||
557 | s_set_gpr_idx_off | ||
558 | |||
559 | |||
560 | /* restore ACC_VGPRs */ | ||
561 | ////////////////////////////// | ||
562 | L_RESTORE_ACC_VGPR: | ||
563 | |||
564 | s_mov_b32 exec_lo, 0xFFFFFFFF //need every thread from now on //be consistent with SAVE although can be moved ahead | ||
565 | s_mov_b32 exec_hi, 0xFFFFFFFF | ||
566 | |||
567 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vpgr_size | ||
568 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | ||
569 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) | ||
570 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | ||
571 | if (SWIZZLE_EN) | ||
572 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
573 | else | ||
574 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
575 | end | ||
576 | s_mov_b32 m0, 0 //VGPR initial index value = 0 | ||
577 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 | ||
578 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later | ||
579 | |||
580 | L_RESTORE_ACC_VGPR_LOOP: | ||
581 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
582 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
583 | else | ||
584 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 | ||
585 | end | ||
586 | s_waitcnt vmcnt(0) //ensure data ready | ||
587 | v_accvgpr_write v0, v0 //v[0+m0] = v0 | ||
588 | s_add_u32 m0, m0, 1 //next vgpr index | ||
589 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256 //every buffer_load_dword does 256 bytes | ||
590 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 | ||
591 | s_cbranch_scc1 L_RESTORE_ACC_VGPR_LOOP //VGPR restore (except v0) is complete? | ||
592 | s_set_gpr_idx_off | ||
593 | /* VGPR restore on v0 */ | ||
594 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | ||
595 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
596 | else | ||
597 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save slc:1 glc:1 | ||
598 | end | ||
599 | |||
600 | /* restore SGPRs */ | ||
601 | ////////////////////////////// | ||
602 | s_getreg_b32 s_restore_alloc_size, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_SGPR_SIZE_SIZE) //spgr_size | ||
603 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | ||
604 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 4 //Number of SGPRs = (sgpr_size + 1) * 16 (non-zero value) | ||
605 | |||
606 | if (SGPR_SAVE_USE_SQC) | ||
607 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 2 //NUM_RECORDS in bytes | ||
608 | else | ||
609 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads) | ||
610 | end | ||
611 | if (SWIZZLE_EN) | ||
612 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
613 | else | ||
614 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
615 | end | ||
616 | read_sgpr_from_mem(s_restore_tmp, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //save s0 to s_restore_tmp | ||
617 | s_mov_b32 m0, 0x1 //SGPR initial index value =1 //go on with with s1 | ||
618 | |||
619 | L_RESTORE_SGPR_LOOP: | ||
620 | read_sgpr_from_mem(s0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PV: further performance improvement can be made | ||
621 | s_waitcnt lgkmcnt(0) //ensure data ready | ||
622 | s_movreld_b32 s0, s0 //s[0+m0] = s0 | ||
623 | s_nop 0 // hazard SALU M0=> S_MOVREL | ||
624 | s_add_u32 m0, m0, 1 //next sgpr index | ||
625 | s_cmp_lt_u32 m0, s_restore_alloc_size //scc = (m0 < s_restore_alloc_size) ? 1 : 0 | ||
626 | s_cbranch_scc1 L_RESTORE_SGPR_LOOP //SGPR restore (except s0) is complete? | ||
627 | s_mov_b32 s0, s_restore_tmp /* SGPR restore on s0 */ | ||
628 | |||
629 | /* restore HW registers */ | ||
630 | ////////////////////////////// | ||
631 | L_RESTORE_HWREG: | ||
632 | s_mov_b32 s_restore_buf_rsrc2, 0x4 //NUM_RECORDS in bytes | ||
633 | if (SWIZZLE_EN) | ||
634 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | ||
635 | else | ||
636 | s_mov_b32 s_restore_buf_rsrc2, 0x1000000 //NUM_RECORDS in bytes | ||
637 | end | ||
638 | |||
639 | read_sgpr_from_mem(s_restore_m0, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //M0 | ||
640 | read_sgpr_from_mem(s_restore_pc_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //PC | ||
641 | read_sgpr_from_mem(s_restore_pc_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) | ||
642 | read_sgpr_from_mem(s_restore_exec_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //EXEC | ||
643 | read_sgpr_from_mem(s_restore_exec_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) | ||
644 | read_sgpr_from_mem(s_restore_status, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //STATUS | ||
645 | read_sgpr_from_mem(s_restore_trapsts, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //TRAPSTS | ||
646 | read_sgpr_from_mem(xnack_mask_lo, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_LO | ||
647 | read_sgpr_from_mem(xnack_mask_hi, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //XNACK_MASK_HI | ||
648 | read_sgpr_from_mem(s_restore_mode, s_restore_buf_rsrc0, s_restore_mem_offset, SGPR_SAVE_USE_SQC) //MODE | ||
649 | |||
650 | s_waitcnt lgkmcnt(0) //from now on, it is safe to restore STATUS and IB_STS | ||
651 | |||
652 | s_mov_b32 s_restore_tmp, s_restore_pc_hi | ||
653 | s_and_b32 s_restore_pc_hi, s_restore_tmp, 0x0000ffff //pc[47:32] //Do it here in order not to affect STATUS | ||
654 | |||
655 | //for normal save & restore, the saved PC points to the next inst to execute, no adjustment needs to be made, otherwise: | ||
656 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_RESTORE_NORMAL)) | ||
657 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 8 //pc[31:0]+8 //two back-to-back s_trap are used (first for save and second for restore) | ||
658 | s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over | ||
659 | end | ||
660 | if ((EMU_RUN_HACK) && (EMU_RUN_HACK_RESTORE_NORMAL)) | ||
661 | s_add_u32 s_restore_pc_lo, s_restore_pc_lo, 4 //pc[31:0]+4 // save is hack through s_trap but restore is normal | ||
662 | s_addc_u32 s_restore_pc_hi, s_restore_pc_hi, 0x0 //carry bit over | ||
663 | end | ||
664 | |||
665 | s_mov_b32 m0, s_restore_m0 | ||
666 | s_mov_b32 exec_lo, s_restore_exec_lo | ||
667 | s_mov_b32 exec_hi, s_restore_exec_hi | ||
668 | |||
669 | s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_PRE_SAVECTX_MASK, s_restore_trapsts | ||
670 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_PRE_SAVECTX_SIZE), s_restore_m0 | ||
671 | s_and_b32 s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_MASK, s_restore_trapsts | ||
672 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT | ||
673 | s_setreg_b32 hwreg(HW_REG_TRAPSTS, SQ_WAVE_TRAPSTS_POST_SAVECTX_SHIFT, SQ_WAVE_TRAPSTS_POST_SAVECTX_SIZE), s_restore_m0 | ||
674 | //s_setreg_b32 hwreg(HW_REG_TRAPSTS), s_restore_trapsts //don't overwrite SAVECTX bit as it may be set through external SAVECTX during restore | ||
675 | s_setreg_b32 hwreg(HW_REG_MODE), s_restore_mode | ||
676 | //reuse s_restore_m0 as a temp register | ||
677 | s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_RCNT_MASK | ||
678 | s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_RCNT_SHIFT | ||
679 | s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_RCNT_SHIFT | ||
680 | s_mov_b32 s_restore_mode, 0x0 //IB_STS is zero | ||
681 | s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 | ||
682 | s_and_b32 s_restore_m0, s_restore_tmp, S_SAVE_PC_HI_FIRST_REPLAY_MASK | ||
683 | s_lshr_b32 s_restore_m0, s_restore_m0, S_SAVE_PC_HI_FIRST_REPLAY_SHIFT | ||
684 | s_lshl_b32 s_restore_m0, s_restore_m0, SQ_WAVE_IB_STS_FIRST_REPLAY_SHIFT | ||
685 | s_or_b32 s_restore_mode, s_restore_mode, s_restore_m0 | ||
686 | s_and_b32 s_restore_m0, s_restore_status, SQ_WAVE_STATUS_INST_ATC_MASK | ||
687 | s_lshr_b32 s_restore_m0, s_restore_m0, SQ_WAVE_STATUS_INST_ATC_SHIFT | ||
688 | s_setreg_b32 hwreg(HW_REG_IB_STS), s_restore_mode | ||
689 | |||
690 | s_and_b64 exec, exec, exec // Restore STATUS.EXECZ, not writable by s_setreg_b32 | ||
691 | s_and_b64 vcc, vcc, vcc // Restore STATUS.VCCZ, not writable by s_setreg_b32 | ||
692 | s_setreg_b32 hwreg(HW_REG_STATUS), s_restore_status | ||
693 | |||
694 | s_barrier //barrier to ensure the readiness of LDS before access attemps from any other wave in the same TG //FIXME not performance-optimal at this time | ||
695 | |||
696 | |||
697 | // s_rfe_b64 s_restore_pc_lo //Return to the main shader program and resume execution | ||
698 | s_rfe_restore_b64 s_restore_pc_lo, s_restore_m0 // s_restore_m0[0] is used to set STATUS.inst_atc | ||
699 | |||
700 | |||
701 | /**************************************************************************/ | ||
702 | /* the END */ | ||
703 | /**************************************************************************/ | ||
704 | L_END_PGM: | ||
705 | s_endpgm | ||
706 | |||
707 | end | ||
708 | |||
709 | |||
710 | /**************************************************************************/ | ||
711 | /* the helper functions */ | ||
712 | /**************************************************************************/ | ||
713 | |||
714 | function write_sgpr_to_mem(s, s_rsrc, s_mem_offset, use_sqc, use_mtbuf) | ||
715 | if (use_sqc) | ||
716 | s_mov_b32 exec_lo, m0 //assuming exec_lo is not needed anymore from this point on | ||
717 | s_mov_b32 m0, s_mem_offset | ||
718 | s_buffer_store_dword s, s_rsrc, m0 glc:1 | ||
719 | s_add_u32 s_mem_offset, s_mem_offset, 4 | ||
720 | s_mov_b32 m0, exec_lo | ||
721 | elsif (use_mtbuf) | ||
722 | v_mov_b32 v0, s | ||
723 | tbuffer_store_format_x v0, v0, s_rsrc, s_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | ||
724 | s_add_u32 s_mem_offset, s_mem_offset, 256 | ||
725 | else | ||
726 | v_mov_b32 v0, s | ||
727 | buffer_store_dword v0, v0, s_rsrc, s_mem_offset slc:1 glc:1 | ||
728 | s_add_u32 s_mem_offset, s_mem_offset, 256 | ||
729 | end | ||
730 | end | ||
731 | |||
732 | |||
733 | |||
// read_sgpr_from_mem: load one dword from the buffer described by `s_rsrc` at
// byte offset `s_mem_offset` into the scalar register `s`, then advance
// `s_mem_offset` to the next slot.
//
//   s            - destination scalar register
//   s_rsrc       - first register of the buffer resource descriptor
//   s_mem_offset - running byte offset; updated in place by this helper
//   use_sqc      - assemble-time switch selecting the slot stride only:
//                  4 bytes when set, 256 bytes otherwise (the same amounts
//                  write_sgpr_to_mem advances by on its corresponding paths)
//
// The load itself is always s_buffer_load_dword, regardless of use_sqc.
// No s_waitcnt is issued here: the loaded value is not guaranteed to be in `s`
// on return. The restore code in this file issues `s_waitcnt lgkmcnt(0)`
// before consuming the results.
734 | function read_sgpr_from_mem(s, s_rsrc, s_mem_offset, use_sqc) | ||
735 | s_buffer_load_dword s, s_rsrc, s_mem_offset glc:1 | ||
736 | if (use_sqc) | ||
737 | s_add_u32 s_mem_offset, s_mem_offset, 4 | ||
738 | else | ||
739 | s_add_u32 s_mem_offset, s_mem_offset, 256 | ||
740 | end | ||
741 | end | ||
742 | |||
743 | |||
744 | |||
745 | |||
746 | |||
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm index 6bae2e022c6e..871f2d431a44 100644 --- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm +++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx9.asm | |||
@@ -197,13 +197,15 @@ var s_restore_spi_init_lo = exec_lo | |||
197 | var s_restore_spi_init_hi = exec_hi | 197 | var s_restore_spi_init_hi = exec_hi |
198 | 198 | ||
199 | var s_restore_mem_offset = ttmp12 | 199 | var s_restore_mem_offset = ttmp12 |
200 | var s_restore_accvgpr_offset = ttmp13 | ||
200 | var s_restore_alloc_size = ttmp3 | 201 | var s_restore_alloc_size = ttmp3 |
201 | var s_restore_tmp = ttmp2 | 202 | var s_restore_tmp = ttmp2 |
202 | var s_restore_mem_offset_save = s_restore_tmp //no conflict | 203 | var s_restore_mem_offset_save = s_restore_tmp //no conflict |
204 | var s_restore_accvgpr_offset_save = ttmp7 | ||
203 | 205 | ||
204 | var s_restore_m0 = s_restore_alloc_size //no conflict | 206 | var s_restore_m0 = s_restore_alloc_size //no conflict |
205 | 207 | ||
206 | var s_restore_mode = ttmp7 | 208 | var s_restore_mode = s_restore_accvgpr_offset_save |
207 | 209 | ||
208 | var s_restore_pc_lo = ttmp0 | 210 | var s_restore_pc_lo = ttmp0 |
209 | var s_restore_pc_hi = ttmp1 | 211 | var s_restore_pc_hi = ttmp1 |
@@ -226,7 +228,7 @@ var s_restore_ttmps_hi = s_restore_alloc_size //no conflict | |||
226 | /* Shader Main*/ | 228 | /* Shader Main*/ |
227 | 229 | ||
228 | shader main | 230 | shader main |
229 | asic(GFX9) | 231 | asic(DEFAULT) |
230 | type(CS) | 232 | type(CS) |
231 | 233 | ||
232 | 234 | ||
@@ -791,10 +793,48 @@ end | |||
791 | 793 | ||
792 | L_SAVE_VGPR_END: | 794 | L_SAVE_VGPR_END: |
793 | 795 | ||
796 | if ASIC_TARGET_ARCTURUS | ||
797 | // Save ACC VGPRs | ||
798 | s_mov_b32 m0, 0x0 //VGPR initial index value =0 | ||
799 | s_set_gpr_idx_on m0, 0x1 //M0[7:0] = M0[7:0] and M0[15:12] = 0x1 | ||
794 | 800 | ||
801 | if SAVE_AFTER_XNACK_ERROR | ||
802 | check_if_tcp_store_ok() | ||
803 | s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP | ||
795 | 804 | ||
805 | L_SAVE_ACCVGPR_LOOP_SQC: | ||
806 | for var vgpr = 0; vgpr < 4; ++ vgpr | ||
807 | v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] | ||
808 | end | ||
809 | |||
810 | write_vgprs_to_mem_with_sqc(v0, 4, s_save_buf_rsrc0, s_save_mem_offset) | ||
811 | |||
812 | s_add_u32 m0, m0, 4 | ||
813 | s_cmp_lt_u32 m0, s_save_alloc_size | ||
814 | s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP_SQC | ||
796 | 815 | ||
816 | s_set_gpr_idx_off | ||
817 | s_branch L_SAVE_ACCVGPR_END | ||
818 | end | ||
797 | 819 | ||
820 | L_SAVE_ACCVGPR_LOOP: | ||
821 | for var vgpr = 0; vgpr < 4; ++ vgpr | ||
822 | v_accvgpr_read v[vgpr], acc[vgpr] // v[N] = acc[N+m0] | ||
823 | end | ||
824 | |||
825 | buffer_store_dword v0, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 | ||
826 | buffer_store_dword v1, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256 | ||
827 | buffer_store_dword v2, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*2 | ||
828 | buffer_store_dword v3, v0, s_save_buf_rsrc0, s_save_mem_offset slc:1 glc:1 offset:256*3 | ||
829 | |||
830 | s_add_u32 m0, m0, 4 | ||
831 | s_add_u32 s_save_mem_offset, s_save_mem_offset, 256*4 | ||
832 | s_cmp_lt_u32 m0, s_save_alloc_size | ||
833 | s_cbranch_scc1 L_SAVE_ACCVGPR_LOOP | ||
834 | s_set_gpr_idx_off | ||
835 | |||
836 | L_SAVE_ACCVGPR_END: | ||
837 | end | ||
798 | 838 | ||
799 | /* S_PGM_END_SAVED */ //FIXME graphics ONLY | 839 | /* S_PGM_END_SAVED */ //FIXME graphics ONLY |
800 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) | 840 | if ((EMU_RUN_HACK) && (!EMU_RUN_HACK_SAVE_NORMAL_EXIT)) |
@@ -921,6 +961,11 @@ end | |||
921 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 | 961 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 1 |
922 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) | 962 | s_lshl_b32 s_restore_alloc_size, s_restore_alloc_size, 2 //Number of VGPRs = (vgpr_size + 1) * 4 (non-zero value) |
923 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) | 963 | s_lshl_b32 s_restore_buf_rsrc2, s_restore_alloc_size, 8 //NUM_RECORDS in bytes (64 threads*4) |
964 | |||
965 | if ASIC_TARGET_ARCTURUS | ||
966 | s_mov_b32 s_restore_accvgpr_offset, s_restore_buf_rsrc2 //ACC VGPRs at end of VGPRs | ||
967 | end | ||
968 | |||
924 | if (SWIZZLE_EN) | 969 | if (SWIZZLE_EN) |
925 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? | 970 | s_add_u32 s_restore_buf_rsrc2, s_restore_buf_rsrc2, 0x0 //FIXME need to use swizzle to enable bounds checking? |
926 | else | 971 | else |
@@ -958,6 +1003,10 @@ else | |||
958 | // VGPR load using dw burst | 1003 | // VGPR load using dw burst |
959 | s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last | 1004 | s_mov_b32 s_restore_mem_offset_save, s_restore_mem_offset // restore start with v1, v0 will be the last |
960 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 | 1005 | s_add_u32 s_restore_mem_offset, s_restore_mem_offset, 256*4 |
1006 | if ASIC_TARGET_ARCTURUS | ||
1007 | s_mov_b32 s_restore_accvgpr_offset_save, s_restore_accvgpr_offset | ||
1008 | s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 | ||
1009 | end | ||
961 | s_mov_b32 m0, 4 //VGPR initial index value = 4 | 1010 | s_mov_b32 m0, 4 //VGPR initial index value = 4 |
962 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 | 1011 | s_set_gpr_idx_on m0, 0x8 //M0[7:0] = M0[7:0] and M0[15:12] = 0x8 |
963 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later | 1012 | s_add_u32 s_restore_alloc_size, s_restore_alloc_size, 0x8000 //add 0x8000 since we compare m0 against it later |
@@ -966,6 +1015,20 @@ else | |||
966 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | 1015 | if(USE_MTBUF_INSTEAD_OF_MUBUF) |
967 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | 1016 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 |
968 | else | 1017 | else |
1018 | |||
1019 | if ASIC_TARGET_ARCTURUS | ||
1020 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 | ||
1021 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256 | ||
1022 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*2 | ||
1023 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset slc:1 glc:1 offset:256*3 | ||
1024 | s_add_u32 s_restore_accvgpr_offset, s_restore_accvgpr_offset, 256*4 | ||
1025 | s_waitcnt vmcnt(0) | ||
1026 | |||
1027 | for var vgpr = 0; vgpr < 4; ++ vgpr | ||
1028 | v_accvgpr_write acc[vgpr], v[vgpr] | ||
1029 | end | ||
1030 | end | ||
1031 | |||
969 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 | 1032 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 |
970 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 | 1033 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256 |
971 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 | 1034 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_mem_offset slc:1 glc:1 offset:256*2 |
@@ -982,6 +1045,18 @@ else | |||
982 | s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? | 1045 | s_cbranch_scc1 L_RESTORE_VGPR_LOOP //VGPR restore (except v0) is complete? |
983 | s_set_gpr_idx_off | 1046 | s_set_gpr_idx_off |
984 | /* VGPR restore on v0 */ | 1047 | /* VGPR restore on v0 */ |
1048 | if ASIC_TARGET_ARCTURUS | ||
1049 | buffer_load_dword v0, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 | ||
1050 | buffer_load_dword v1, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256 | ||
1051 | buffer_load_dword v2, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*2 | ||
1052 | buffer_load_dword v3, v0, s_restore_buf_rsrc0, s_restore_accvgpr_offset_save slc:1 glc:1 offset:256*3 | ||
1053 | s_waitcnt vmcnt(0) | ||
1054 | |||
1055 | for var vgpr = 0; vgpr < 4; ++ vgpr | ||
1056 | v_accvgpr_write acc[vgpr], v[vgpr] | ||
1057 | end | ||
1058 | end | ||
1059 | |||
985 | if(USE_MTBUF_INSTEAD_OF_MUBUF) | 1060 | if(USE_MTBUF_INSTEAD_OF_MUBUF) |
986 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 | 1061 | tbuffer_load_format_x v0, v0, s_restore_buf_rsrc0, s_restore_mem_offset_save format:BUF_NUM_FORMAT_FLOAT format: BUF_DATA_FORMAT_32 slc:1 glc:1 |
987 | else | 1062 | else |
@@ -1202,6 +1277,10 @@ function get_vgpr_size_bytes(s_vgpr_size_byte) | |||
1202 | s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size | 1277 | s_getreg_b32 s_vgpr_size_byte, hwreg(HW_REG_GPR_ALLOC,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SHIFT,SQ_WAVE_GPR_ALLOC_VGPR_SIZE_SIZE) //vgpr_size |
1203 | s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 | 1278 | s_add_u32 s_vgpr_size_byte, s_vgpr_size_byte, 1 |
1204 | s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible | 1279 | s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, (2+8) //Number of VGPRs = (vgpr_size + 1) * 4 * 64 * 4 (non-zero value) //FIXME for GFX, zero is possible |
1280 | |||
1281 | if ASIC_TARGET_ARCTURUS | ||
1282 | s_lshl_b32 s_vgpr_size_byte, s_vgpr_size_byte, 1 // Double size for ACC VGPRs | ||
1283 | end | ||
1205 | end | 1284 | end |
1206 | 1285 | ||
1207 | function get_sgpr_size_bytes(s_sgpr_size_byte) | 1286 | function get_sgpr_size_bytes(s_sgpr_size_byte) |