author      Martin Schwidefsky <schwidefsky@de.ibm.com>   2013-07-26 09:04:02 -0400
committer   Paolo Bonzini <pbonzini@redhat.com>           2013-07-29 03:03:09 -0400
commit      3eabaee998c787e7e1565574821652548f7fc003 (patch)
tree        8e36fdfda46ec4c0a5b2a419a39fa2f5c1ba0f8e /arch/s390/mm
parent      663f4c61b8036fd3a80debbe00b58d198ae63e76 (diff)
KVM: s390: allow sie enablement for multi-threaded programs
Improve the code to upgrade the standard 2K page tables to 4K page tables
with PGSTEs to allow the operation to happen when the program is already
multi-threaded.
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
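
[Editor's note] In short: the old implementation created a second mm with dup_mm() and switched the task over to it, which is only safe while the process is single-threaded; the new implementation takes mmap_sem for writing and upgrades the existing address space in place. A condensed sketch of the per-pmd swap at the heart of the new page_table_realloc() walk, distilled from the hunks below (allocation-failure handling and the pgd/pud loops are omitted; this is not a drop-in copy):

        /* Condensed from page_table_realloc_pmd() in the diff below. */
        new = page_table_alloc_pgste(mm, addr);         /* 4K table with pgstes */
        spin_lock(&mm->page_table_lock);
        if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
                pmdp_flush_lazy(mm, addr, pmd);         /* nuke entry to the "short" table */
                pmd_clear(pmd);
                memcpy(new, table, PAGE_SIZE/2);        /* carry the ptes over */
                pmd_populate(mm, pmd, (pte_t *) new);   /* establish the new table */
                page_table_free_rcu(tlb, table);        /* rcu-free: there might be a walker */
                new = NULL;
        }
        spin_unlock(&mm->page_table_lock);
        if (new) {                                      /* pmd changed under us: retry */
                page_table_free_pgste(new);
                goto again;
        }

The old table is freed through RCU rather than immediately because lockless page-table walkers may still be traversing it; the retry handles a racing update of the pmd entry between allocation and the locked check.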
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--   arch/s390/mm/pgtable.c | 181
1 file changed, 117 insertions(+), 64 deletions(-)
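
[Editor's note] Why the free paths switch from a per-mm to a per-page test: while the upgrade walk is in progress, a single mm holds a mix of old 2K fragments and new 4K tables with PGSTEs, so mm_has_pgste() can no longer tell how a given table must be freed. The patch therefore marks full 4K PGSTE pages with page->_mapcount == 0 (fragment pages in use always carry FRAG_MASK bits) and tests each page individually. A condensed view of the resulting free path, distilled from the page_table_free() hunk below:

        /* Condensed from page_table_free(): decide per page, not per mm. */
        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
        if (page_table_with_pgste(page)) {
                /* full 4K page table with pgstes from page_table_alloc_pgste() */
                gmap_disconnect_pgtable(mm, table);
                return page_table_free_pgste(table);
        }
        /* otherwise a 1K/2K fragment of a 4K page, tracked via FRAG_MASK bits */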
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index a8154a1a2c94..6d332487f363 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -731,6 +731,11 @@ void gmap_do_ipte_notify(struct mm_struct *mm, unsigned long addr, pte_t *pte)
         spin_unlock(&gmap_notifier_lock);
 }
 
+static inline int page_table_with_pgste(struct page *page)
+{
+        return atomic_read(&page->_mapcount) == 0;
+}
+
 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
 {
@@ -750,7 +755,7 @@ static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
         mp->vmaddr = vmaddr & PMD_MASK;
         INIT_LIST_HEAD(&mp->mapper);
         page->index = (unsigned long) mp;
-        atomic_set(&page->_mapcount, 3);
+        atomic_set(&page->_mapcount, 0);
         table = (unsigned long *) page_to_phys(page);
         clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE/2);
         clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
@@ -821,6 +826,11 @@ EXPORT_SYMBOL(set_guest_storage_key);
 
 #else /* CONFIG_PGSTE */
 
+static inline int page_table_with_pgste(struct page *page)
+{
+        return 0;
+}
+
 static inline unsigned long *page_table_alloc_pgste(struct mm_struct *mm,
                                                     unsigned long vmaddr)
 {
@@ -897,12 +907,12 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
         struct page *page;
         unsigned int bit, mask;
 
-        if (mm_has_pgste(mm)) {
+        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+        if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 return page_table_free_pgste(table);
         }
         /* Free 1K/2K page table fragment of a 4K page */
-        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         bit = 1 << ((__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)));
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
@@ -940,14 +950,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table)
         unsigned int bit, mask;
 
         mm = tlb->mm;
-        if (mm_has_pgste(mm)) {
+        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+        if (page_table_with_pgste(page)) {
                 gmap_disconnect_pgtable(mm, table);
                 table = (unsigned long *) (__pa(table) | FRAG_MASK);
                 tlb_remove_table(tlb, table);
                 return;
         }
         bit = 1 << ((__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)));
-        page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
         spin_lock_bh(&mm->context.list_lock);
         if ((atomic_read(&page->_mapcount) & FRAG_MASK) != FRAG_MASK)
                 list_del(&page->lru);
@@ -1033,36 +1043,120 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-void thp_split_vma(struct vm_area_struct *vma)
+static inline void thp_split_vma(struct vm_area_struct *vma)
 {
         unsigned long addr;
-        struct page *page;
 
-        for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
-                page = follow_page(vma, addr, FOLL_SPLIT);
-        }
+        for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE)
+                follow_page(vma, addr, FOLL_SPLIT);
 }
 
-void thp_split_mm(struct mm_struct *mm)
+static inline void thp_split_mm(struct mm_struct *mm)
 {
-        struct vm_area_struct *vma = mm->mmap;
+        struct vm_area_struct *vma;
 
-        while (vma != NULL) {
+        for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
                 thp_split_vma(vma);
                 vma->vm_flags &= ~VM_HUGEPAGE;
                 vma->vm_flags |= VM_NOHUGEPAGE;
-                vma = vma->vm_next;
         }
+        mm->def_flags |= VM_NOHUGEPAGE;
+}
+#else
+static inline void thp_split_mm(struct mm_struct *mm)
+{
 }
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
+static unsigned long page_table_realloc_pmd(struct mmu_gather *tlb,
+                                struct mm_struct *mm, pud_t *pud,
+                                unsigned long addr, unsigned long end)
+{
+        unsigned long next, *table, *new;
+        struct page *page;
+        pmd_t *pmd;
+
+        pmd = pmd_offset(pud, addr);
+        do {
+                next = pmd_addr_end(addr, end);
+again:
+                if (pmd_none_or_clear_bad(pmd))
+                        continue;
+                table = (unsigned long *) pmd_deref(*pmd);
+                page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
+                if (page_table_with_pgste(page))
+                        continue;
+                /* Allocate new page table with pgstes */
+                new = page_table_alloc_pgste(mm, addr);
+                if (!new) {
+                        mm->context.has_pgste = 0;
+                        continue;
+                }
+                spin_lock(&mm->page_table_lock);
+                if (likely((unsigned long *) pmd_deref(*pmd) == table)) {
+                        /* Nuke pmd entry pointing to the "short" page table */
+                        pmdp_flush_lazy(mm, addr, pmd);
+                        pmd_clear(pmd);
+                        /* Copy ptes from old table to new table */
+                        memcpy(new, table, PAGE_SIZE/2);
+                        clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+                        /* Establish new table */
+                        pmd_populate(mm, pmd, (pte_t *) new);
+                        /* Free old table with rcu, there might be a walker! */
+                        page_table_free_rcu(tlb, table);
+                        new = NULL;
+                }
+                spin_unlock(&mm->page_table_lock);
+                if (new) {
+                        page_table_free_pgste(new);
+                        goto again;
+                }
+        } while (pmd++, addr = next, addr != end);
+
+        return addr;
+}
+
+static unsigned long page_table_realloc_pud(struct mmu_gather *tlb,
+                                struct mm_struct *mm, pgd_t *pgd,
+                                unsigned long addr, unsigned long end)
+{
+        unsigned long next;
+        pud_t *pud;
+
+        pud = pud_offset(pgd, addr);
+        do {
+                next = pud_addr_end(addr, end);
+                if (pud_none_or_clear_bad(pud))
+                        continue;
+                next = page_table_realloc_pmd(tlb, mm, pud, addr, next);
+        } while (pud++, addr = next, addr != end);
+
+        return addr;
+}
+
+static void page_table_realloc(struct mmu_gather *tlb, struct mm_struct *mm,
+                               unsigned long addr, unsigned long end)
+{
+        unsigned long next;
+        pgd_t *pgd;
+
+        pgd = pgd_offset(mm, addr);
+        do {
+                next = pgd_addr_end(addr, end);
+                if (pgd_none_or_clear_bad(pgd))
+                        continue;
+                next = page_table_realloc_pud(tlb, mm, pgd, addr, next);
+        } while (pgd++, addr = next, addr != end);
+}
+
 /*
  * switch on pgstes for its userspace process (for kvm)
  */
 int s390_enable_sie(void)
 {
         struct task_struct *tsk = current;
-        struct mm_struct *mm, *old_mm;
+        struct mm_struct *mm = tsk->mm;
+        struct mmu_gather tlb;
 
         /* Do we have switched amode? If no, we cannot do sie */
         if (s390_user_mode == HOME_SPACE_MODE)
@@ -1072,57 +1166,16 @@ int s390_enable_sie(void)
         if (mm_has_pgste(tsk->mm))
                 return 0;
 
-        /* lets check if we are allowed to replace the mm */
-        task_lock(tsk);
-        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-            !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-            tsk->mm != tsk->active_mm) {
-                task_unlock(tsk);
-                return -EINVAL;
-        }
-        task_unlock(tsk);
-
-        /* we copy the mm and let dup_mm create the page tables with_pgstes */
-        tsk->mm->context.alloc_pgste = 1;
-        /* make sure that both mms have a correct rss state */
-        sync_mm_rss(tsk->mm);
-        mm = dup_mm(tsk);
-        tsk->mm->context.alloc_pgste = 0;
-        if (!mm)
-                return -ENOMEM;
-
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+        down_write(&mm->mmap_sem);
         /* split thp mappings and disable thp for future mappings */
         thp_split_mm(mm);
-        mm->def_flags |= VM_NOHUGEPAGE;
-#endif
-
-        /* Now lets check again if something happened */
-        task_lock(tsk);
-        if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
-#ifdef CONFIG_AIO
-            !hlist_empty(&tsk->mm->ioctx_list) ||
-#endif
-            tsk->mm != tsk->active_mm) {
-                mmput(mm);
-                task_unlock(tsk);
-                return -EINVAL;
-        }
-
-        /* ok, we are alone. No ptrace, no threads, etc. */
-        old_mm = tsk->mm;
-        tsk->mm = tsk->active_mm = mm;
-        preempt_disable();
-        update_mm(mm, tsk);
-        atomic_inc(&mm->context.attach_count);
-        atomic_dec(&old_mm->context.attach_count);
-        cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
-        preempt_enable();
-        task_unlock(tsk);
-        mmput(old_mm);
-        return 0;
+        /* Reallocate the page tables with pgstes */
+        mm->context.has_pgste = 1;
+        tlb_gather_mmu(&tlb, mm, 0);
+        page_table_realloc(&tlb, mm, 0, TASK_SIZE);
+        tlb_finish_mmu(&tlb, 0, -1);
+        up_write(&mm->mmap_sem);
+        return mm->context.has_pgste ? 0 : -ENOMEM;
 }
 EXPORT_SYMBOL_GPL(s390_enable_sie);
 
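[Editor's note] For context, a hedged usage sketch: the in-tree caller of s390_enable_sie() is KVM's s390 VM-creation path (kvm_arch_init_vm in arch/s390/kvm/kvm-s390.c), and with this patch the call can succeed even after the qemu process has started additional threads. The function below is illustrative only; its name and surrounding logic are assumptions, not code from this patch.

        /*
         * Illustrative caller only -- modeled on KVM's s390 VM-creation path;
         * example_create_vm() is a hypothetical name, not a kernel function.
         */
        static int example_create_vm(void)
        {
                int rc;

                rc = s390_enable_sie();   /* may now run with threads already started */
                if (rc)
                        return rc;        /* e.g. -ENOMEM if a pgste table allocation failed */
                /* ... continue with SIE control block setup ... */
                return 0;
        }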