author     Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>  2012-03-21 10:50:34 -0400
committer  Avi Kivity <avi@redhat.com>                        2012-04-08 09:08:27 -0400
commit     1e3f42f03c38c29c1814199a6f0a2f01b919ea3f
tree       0ceba904d28e89ded0954dcf40361a6ec235cce3
parent     220f773a0013bf6fe2eefd9718ac7471f368fd8e
KVM: MMU: Improve iteration through sptes from rmap
Iteration using rmap_next(), whose actual body is pte_list_next(), is
inefficient: every call restarts from the head of the rmap, checking again
whether the rmap holds a single spte or points to a descriptor which links
more sptes. In the case of shadow paging, this quadratic total iteration
cost is a real problem. Even for two dimensional paging, with EPT/NPT on,
in which we almost always have a single mapping, the redundant checks done
just to detect the end of the iteration should be eliminated.
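(Illustration only, not part of the patch: a self-contained userspace toy of
the restart-from-head pattern; toy_node, toy_next and toy_sum are hypothetical
names modelling the shape of pte_list_next(), not kernel APIs.)

/*
 * The only iteration primitive is "find the element after @prev, scanning
 * from the head".  Visiting all n entries therefore costs 1 + 2 + ... + n
 * comparisons, i.e. O(n^2) in total.
 */
#include <stddef.h>

struct toy_node {
	unsigned long long spte;
	struct toy_node *next;
};

static struct toy_node *toy_next(struct toy_node *head, struct toy_node *prev)
{
	struct toy_node *n;

	if (!prev)
		return head;		/* first call: O(1) */
	for (n = head; n; n = n->next)	/* every later call rescans */
		if (n == prev)
			return n->next;
	return NULL;
}

static unsigned long long toy_sum(struct toy_node *head)
{
	unsigned long long sum = 0;
	struct toy_node *n = NULL;

	while ((n = toy_next(head, n)))	/* O(n^2) walk overall */
		sum += n->spte;
	return sum;
}

int main(void)
{
	struct toy_node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };

	return toy_sum(&a) == 6 ? 0 : 1;
}

Keeping a cursor across calls turns the same walk into O(n); that is exactly
what the iterator introduced below does.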
This patch fixes the problem by introducing rmap_iterator, which keeps the
iteration context for the next search. Furthermore, the implementation of
rmap_next() is split into two functions, rmap_get_first() and
rmap_get_next(), to avoid repeatedly checking whether the rmap being
iterated on holds only one spte.
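(A minimal self-contained model of the new scheme, for illustration only.
get_first()/get_next() here mirror the rmap_get_first()/rmap_get_next()
added by the diff below; the tagged-word encoding — low bit clear: a single
spte pointer, low bit set: a pte_list_desc chain — follows the kernel's
pte_list layout, and the sketch assumes pointers fit in unsigned long, as on
the kernel's targets.)

#include <stdio.h>

#define PTE_LIST_EXT 4			/* matches the kernel's value */

struct pte_list_desc {
	unsigned long long *sptes[PTE_LIST_EXT];
	struct pte_list_desc *more;
};

struct rmap_iterator {
	struct pte_list_desc *desc;	/* current descriptor, NULL if single spte */
	int pos;			/* index within desc->sptes[] */
};

static unsigned long long *get_first(unsigned long rmap,
				     struct rmap_iterator *iter)
{
	if (!rmap)
		return NULL;			/* empty rmap */
	if (!(rmap & 1)) {
		iter->desc = NULL;		/* single spte, no descriptor */
		return (unsigned long long *)rmap;
	}
	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
	iter->pos = 0;
	return iter->desc->sptes[0];
}

static unsigned long long *get_next(struct rmap_iterator *iter)
{
	if (!iter->desc)
		return NULL;			/* single-spte rmap: done */
	if (iter->pos < PTE_LIST_EXT - 1 && iter->desc->sptes[iter->pos + 1])
		return iter->desc->sptes[++iter->pos];
	iter->desc = iter->desc->more;		/* next descriptor, if any */
	iter->pos = 0;
	return iter->desc ? iter->desc->sptes[0] : NULL;
}

int main(void)
{
	unsigned long long a = 1, b = 2, c = 3;
	struct pte_list_desc d = { { &a, &b, &c, NULL }, NULL };
	unsigned long rmap = (unsigned long)&d | 1;	/* tag: descriptor chain */
	struct rmap_iterator iter;
	unsigned long long *sptep;

	/* Each step is O(1): iter remembers (desc, pos), so nothing rescans. */
	for (sptep = get_first(rmap, &iter); sptep; sptep = get_next(&iter))
		printf("spte = %llu\n", *sptep);
	return 0;
}

Note that a caller which drops the spte it is standing on must restart with
get_first(); the patch does the same in kvm_unmap_rmapp() and
kvm_set_pte_rmapp().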
Although only a slight change was expected for EPT/NPT, the actual
improvement was significant: we observed that GET_DIRTY_LOG for 1GB of
dirty memory became 15% faster than before. This is probably because the
new code is easier for the CPU to branch-predict.
Note: we simply remove pte_list_next() because parent_ptes can be thought
of as a reverse mapping and iterated with the same helpers.
Signed-off-by: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Signed-off-by: Avi Kivity <avi@redhat.com>
 arch/x86/kvm/mmu.c       | 196
 arch/x86/kvm/mmu_audit.c |  10
 2 files changed, 124 insertions(+), 82 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3213348e3a93..29ad6f9c58a5 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -842,32 +842,6 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte,
 	return count;
 }
 
-static u64 *pte_list_next(unsigned long *pte_list, u64 *spte)
-{
-	struct pte_list_desc *desc;
-	u64 *prev_spte;
-	int i;
-
-	if (!*pte_list)
-		return NULL;
-	else if (!(*pte_list & 1)) {
-		if (!spte)
-			return (u64 *)*pte_list;
-		return NULL;
-	}
-	desc = (struct pte_list_desc *)(*pte_list & ~1ul);
-	prev_spte = NULL;
-	while (desc) {
-		for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
-			if (prev_spte == spte)
-				return desc->sptes[i];
-			prev_spte = desc->sptes[i];
-		}
-		desc = desc->more;
-	}
-	return NULL;
-}
-
 static void
 pte_list_desc_remove_entry(unsigned long *pte_list, struct pte_list_desc *desc,
 			   int i, struct pte_list_desc *prev_desc)
@@ -988,11 +962,6 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
 	return pte_list_add(vcpu, spte, rmapp);
 }
 
-static u64 *rmap_next(unsigned long *rmapp, u64 *spte)
-{
-	return pte_list_next(rmapp, spte);
-}
-
 static void rmap_remove(struct kvm *kvm, u64 *spte)
 {
 	struct kvm_mmu_page *sp;
@@ -1005,6 +974,67 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
 	pte_list_remove(spte, rmapp);
 }
 
+/*
+ * Used by the following functions to iterate through the sptes linked by a
+ * rmap.  All fields are private and not assumed to be used outside.
+ */
+struct rmap_iterator {
+	/* private fields */
+	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
+	int pos;			/* index of the sptep */
+};
+
+/*
+ * Iteration must be started by this function.  This should also be used after
+ * removing/dropping sptes from the rmap link because in such cases the
+ * information in the iterator may not be valid.
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_first(unsigned long rmap, struct rmap_iterator *iter)
+{
+	if (!rmap)
+		return NULL;
+
+	if (!(rmap & 1)) {
+		iter->desc = NULL;
+		return (u64 *)rmap;
+	}
+
+	iter->desc = (struct pte_list_desc *)(rmap & ~1ul);
+	iter->pos = 0;
+	return iter->desc->sptes[iter->pos];
+}
+
+/*
+ * Must be used with a valid iterator: e.g. after rmap_get_first().
+ *
+ * Returns sptep if found, NULL otherwise.
+ */
+static u64 *rmap_get_next(struct rmap_iterator *iter)
+{
+	if (iter->desc) {
+		if (iter->pos < PTE_LIST_EXT - 1) {
+			u64 *sptep;
+
+			++iter->pos;
+			sptep = iter->desc->sptes[iter->pos];
+			if (sptep)
+				return sptep;
+		}
+
+		iter->desc = iter->desc->more;
+
+		if (iter->desc) {
+			iter->pos = 0;
+			/* desc->sptes[0] cannot be NULL */
+			return iter->desc->sptes[iter->pos];
+		}
+	}
+
+	return NULL;
+}
+
 static void drop_spte(struct kvm *kvm, u64 *sptep)
 {
 	if (mmu_spte_clear_track_bits(sptep))
@@ -1013,23 +1043,27 @@ static void drop_spte(struct kvm *kvm, u64 *sptep)
 
 static int __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, int level)
 {
-	u64 *spte = NULL;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int write_protected = 0;
 
-	while ((spte = rmap_next(rmapp, spte))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
 
-		if (!is_writable_pte(*spte))
+		if (!is_writable_pte(*sptep)) {
+			sptep = rmap_get_next(&iter);
 			continue;
+		}
 
 		if (level == PT_PAGE_TABLE_LEVEL) {
-			mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK);
+			mmu_spte_update(sptep, *sptep & ~PT_WRITABLE_MASK);
+			sptep = rmap_get_next(&iter);
 		} else {
-			BUG_ON(!is_large_pte(*spte));
-			drop_spte(kvm, spte);
+			BUG_ON(!is_large_pte(*sptep));
+			drop_spte(kvm, sptep);
 			--kvm->stat.lpages;
-			spte = NULL;
+			sptep = rmap_get_first(*rmapp, &iter);
 		}
 
 		write_protected = 1;
@@ -1084,48 +1118,57 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
 static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			   unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_tlb_flush = 0;
 
-	while ((spte = rmap_next(rmapp, NULL))) {
-		BUG_ON(!(*spte & PT_PRESENT_MASK));
-		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte);
-		drop_spte(kvm, spte);
+	while ((sptep = rmap_get_first(*rmapp, &iter))) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+		rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", sptep, *sptep);
+
+		drop_spte(kvm, sptep);
 		need_tlb_flush = 1;
 	}
+
 	return need_tlb_flush;
 }
 
 static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			     unsigned long data)
 {
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int need_flush = 0;
-	u64 *spte, new_spte;
+	u64 new_spte;
 	pte_t *ptep = (pte_t *)data;
 	pfn_t new_pfn;
 
 	WARN_ON(pte_huge(*ptep));
 	new_pfn = pte_pfn(*ptep);
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		BUG_ON(!is_shadow_present_pte(*spte));
-		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte);
+
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;) {
+		BUG_ON(!is_shadow_present_pte(*sptep));
+		rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", sptep, *sptep);
+
 		need_flush = 1;
+
 		if (pte_write(*ptep)) {
-			drop_spte(kvm, spte);
-			spte = rmap_next(rmapp, NULL);
+			drop_spte(kvm, sptep);
+			sptep = rmap_get_first(*rmapp, &iter);
 		} else {
-			new_spte = *spte &~ (PT64_BASE_ADDR_MASK);
+			new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
 			new_spte |= (u64)new_pfn << PAGE_SHIFT;
 
 			new_spte &= ~PT_WRITABLE_MASK;
 			new_spte &= ~SPTE_HOST_WRITEABLE;
 			new_spte &= ~shadow_accessed_mask;
-			mmu_spte_clear_track_bits(spte);
-			mmu_spte_set(spte, new_spte);
-			spte = rmap_next(rmapp, spte);
+
+			mmu_spte_clear_track_bits(sptep);
+			mmu_spte_set(sptep, new_spte);
+			sptep = rmap_get_next(&iter);
 		}
 	}
+
 	if (need_flush)
 		kvm_flush_remote_tlbs(kvm);
 
@@ -1184,7 +1227,8 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
 static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			 unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1197,25 +1241,24 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		return kvm_unmap_rmapp(kvm, rmapp, data);
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		int _young;
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		_young = _spte & PT_ACCESSED_MASK;
-		if (_young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
-			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte);
+			clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)sptep);
 		}
-		spte = rmap_next(rmapp, spte);
 	}
+
 	return young;
 }
 
 static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 			      unsigned long data)
 {
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 	int young = 0;
 
 	/*
@@ -1226,16 +1269,14 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
 	if (!shadow_accessed_mask)
 		goto out;
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		u64 _spte = *spte;
-		BUG_ON(!(_spte & PT_PRESENT_MASK));
-		young = _spte & PT_ACCESSED_MASK;
-		if (young) {
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		BUG_ON(!(*sptep & PT_PRESENT_MASK));
+
+		if (*sptep & PT_ACCESSED_MASK) {
 			young = 1;
 			break;
 		}
-		spte = rmap_next(rmapp, spte);
 	}
 out:
 	return young;
@@ -1887,10 +1928,11 @@ static void kvm_mmu_put_page(struct kvm_mmu_page *sp, u64 *parent_pte)
 
 static void kvm_mmu_unlink_parents(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
-	u64 *parent_pte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
-	while ((parent_pte = pte_list_next(&sp->parent_ptes, NULL)))
-		drop_parent_pte(sp, parent_pte);
+	while ((sptep = rmap_get_first(sp->parent_ptes, &iter)))
+		drop_parent_pte(sp, sptep);
 }
 
 static int mmu_zap_unsync_children(struct kvm *kvm,
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
index 715da5a19a5b..7d7d0b9e23eb 100644
--- a/arch/x86/kvm/mmu_audit.c
+++ b/arch/x86/kvm/mmu_audit.c
@@ -192,7 +192,8 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 {
 	struct kvm_memory_slot *slot;
 	unsigned long *rmapp;
-	u64 *spte;
+	u64 *sptep;
+	struct rmap_iterator iter;
 
 	if (sp->role.direct || sp->unsync || sp->role.invalid)
 		return;
@@ -200,13 +201,12 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
 	slot = gfn_to_memslot(kvm, sp->gfn);
 	rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
 
-	spte = rmap_next(rmapp, NULL);
-	while (spte) {
-		if (is_writable_pte(*spte))
+	for (sptep = rmap_get_first(*rmapp, &iter); sptep;
+	     sptep = rmap_get_next(&iter)) {
+		if (is_writable_pte(*sptep))
 			audit_printk(kvm, "shadow page has writable "
 				     "mappings: gfn %llx role %x\n",
 				     sp->gfn, sp->role.word);
-		spte = rmap_next(rmapp, spte);
 	}
 }
 