author | Gerald Schaefer <gerald.schaefer@de.ibm.com> | 2016-04-15 10:38:40 -0400
---|---|---
committer | Martin Schwidefsky <schwidefsky@de.ibm.com> | 2016-04-21 03:50:09 -0400
commit | 723cacbd9dc79582e562c123a0bacf8bfc69e72a (patch) |
tree | cae93823c3e8a37ef012b85b38d48e9e1bc19762 /arch/s390/mm |
parent | dba599091c191d209b1499511a524ad9657c0e5a (diff) |
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race in multi-threaded applications between context switch and
pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and
mm->context.asce_bits, without holding any locks. A concurrent mmap with a
pagetable upgrade on another thread in crst_table_upgrade() could already
have set the new asce_bits, but not yet the new mm->pgd. This would result
in a corrupt user_asce in switch_mm(), and eventually in a kernel panic from
a translation exception.
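
For illustration, a minimal sketch of the racy pattern (simplified;
switch_mm_sketch() and upgrade_sketch() are hypothetical stand-ins for the
real switch_mm()/set_user_asce() and crst_table_upgrade() code):

```c
/* Hypothetical sketch of the race, not the literal kernel code. */

/* Thread A, context switch: two separate unlocked reads. */
static void switch_mm_sketch(struct mm_struct *next)
{
	unsigned long user_asce;

	user_asce = next->context.asce_bits |	/* may read the NEW bits... */
		    __pa(next->pgd);		/* ...paired with the OLD pgd */
	S390_lowcore.user_asce = user_asce;	/* corrupt if the two mix */
}

/* Thread B, pagetable upgrade: the two fields are not updated as one unit. */
static void upgrade_sketch(struct mm_struct *mm, unsigned long *new_table)
{
	mm->context.asce_bits = _ASCE_TABLE_LENGTH | _ASCE_USER_BITS |
				_ASCE_TYPE_REGION2;	/* thread A can run here */
	mm->pgd = (pgd_t *) new_table;
}
```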
Fix this by storing the complete asce instead of just the asce_bits, which
can then be read atomically from switch_mm(), so that it sees either the old
value or the new value, but never a mixture. Both cases are OK: having the
old value would result in a page fault on access to the higher-level memory,
but the fault handler would see the new mm->pgd, if it was a valid access
after the mmap on the other thread has completed. So the worst-case scenario
is a page fault loop for the racing thread until the next time slice.
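
Sketched below is the fixed scheme, condensed from the pgalloc.c hunk of this
patch (crst_table_upgrade_sketch() and set_user_asce_sketch() are hypothetical
names; the fields and flags follow the patch):

```c
/* Writer, upgrade path: publish the complete ASCE as one word. */
static int crst_table_upgrade_sketch(struct mm_struct *mm)
{
	unsigned long *table = crst_table_alloc(mm);

	if (!table)
		return -ENOMEM;
	spin_lock_bh(&mm->page_table_lock);
	crst_table_init(table, _REGION2_ENTRY_EMPTY);
	pgd_populate(mm, (pgd_t *) table, (pud_t *) mm->pgd);
	mm->pgd = (pgd_t *) table;
	/* One word update: unlocked readers see old or new, never a mix. */
	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
	spin_unlock_bh(&mm->page_table_lock);
	return 0;
}

/* Reader, context switch: a single aligned load, no locks needed. */
static void set_user_asce_sketch(struct mm_struct *next)
{
	S390_lowcore.user_asce = next->context.asce;
}
```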
Also remove dead code and simplify the upgrade/downgrade path: there are no
upgrades from 2 levels, and only downgrades from 3 levels for compat tasks.
There are also no concurrent upgrades, because the mmap_sem is held with
down_write() in do_mmap(), so the flush and table checks during upgrade can
be removed.
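
As a sketch of why the retry loop and flush flag are unnecessary (call chain
simplified; exact entry points vary by kernel version):

```c
/*
 * All upgrades run with mmap_sem held for writing, so at most one
 * thread per mm can be inside crst_table_upgrade() at a time:
 *
 *   vm_mmap_pgoff()                  // down_write(&mm->mmap_sem)
 *     do_mmap()
 *       get_unmapped_area()
 *         s390_get_unmapped_area()
 *           crst_table_upgrade(mm)   // serialized per mm
 */
```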
Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'arch/s390/mm')
-rw-r--r-- | arch/s390/mm/init.c | 3
-rw-r--r-- | arch/s390/mm/mmap.c | 6
-rw-r--r-- | arch/s390/mm/pgalloc.c | 85
3 files changed, 33 insertions, 61 deletions
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index c7b0451397d6..2489b2e917c8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -89,7 +89,8 @@ void __init paging_init(void)
 		asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
 		pgd_type = _REGION3_ENTRY_EMPTY;
 	}
-	S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
+	init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
+	S390_lowcore.kernel_asce = init_mm.context.asce;
 	clear_table((unsigned long *) init_mm.pgd, pgd_type,
 		    sizeof(unsigned long)*2048);
 	vmem_map_init();
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 45c4daa49930..89cf09e5f168 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -174,7 +174,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
 	if (!(flags & MAP_FIXED))
 		addr = 0;
 	if ((addr + len) >= TASK_SIZE)
-		return crst_table_upgrade(current->mm, TASK_MAX_SIZE);
+		return crst_table_upgrade(current->mm);
 	return 0;
 }
 
@@ -191,7 +191,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
 		return area;
 	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
 		/* Upgrade the page table to 4 levels and retry. */
-		rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
+		rc = crst_table_upgrade(mm);
 		if (rc)
 			return (unsigned long) rc;
 		area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
@@ -213,7 +213,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
 		return area;
 	if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
 		/* Upgrade the page table to 4 levels and retry. */
-		rc = crst_table_upgrade(mm, TASK_MAX_SIZE);
+		rc = crst_table_upgrade(mm);
 		if (rc)
 			return (unsigned long) rc;
 		area = arch_get_unmapped_area_topdown(filp, addr, len,
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index f6c3de26cda8..e8b5962ac12a 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -76,81 +76,52 @@ static void __crst_table_upgrade(void *arg)
 	__tlb_flush_local();
 }
 
-int crst_table_upgrade(struct mm_struct *mm, unsigned long limit)
+int crst_table_upgrade(struct mm_struct *mm)
 {
 	unsigned long *table, *pgd;
-	unsigned long entry;
-	int flush;
 
-	BUG_ON(limit > TASK_MAX_SIZE);
-	flush = 0;
-repeat:
+	/* upgrade should only happen from 3 to 4 levels */
+	BUG_ON(mm->context.asce_limit != (1UL << 42));
+
 	table = crst_table_alloc(mm);
 	if (!table)
 		return -ENOMEM;
+
 	spin_lock_bh(&mm->page_table_lock);
-	if (mm->context.asce_limit < limit) {
-		pgd = (unsigned long *) mm->pgd;
-		if (mm->context.asce_limit <= (1UL << 31)) {
-			entry = _REGION3_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-		} else {
-			entry = _REGION2_ENTRY_EMPTY;
-			mm->context.asce_limit = 1UL << 53;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION2;
-		}
-		crst_table_init(table, entry);
-		pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
-		mm->pgd = (pgd_t *) table;
-		mm->task_size = mm->context.asce_limit;
-		table = NULL;
-		flush = 1;
-	}
+	pgd = (unsigned long *) mm->pgd;
+	crst_table_init(table, _REGION2_ENTRY_EMPTY);
+	pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
+	mm->pgd = (pgd_t *) table;
+	mm->context.asce_limit = 1UL << 53;
+	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+			   _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
+	mm->task_size = mm->context.asce_limit;
 	spin_unlock_bh(&mm->page_table_lock);
-	if (table)
-		crst_table_free(mm, table);
-	if (mm->context.asce_limit < limit)
-		goto repeat;
-	if (flush)
-		on_each_cpu(__crst_table_upgrade, mm, 0);
+
+	on_each_cpu(__crst_table_upgrade, mm, 0);
 	return 0;
 }
 
-void crst_table_downgrade(struct mm_struct *mm, unsigned long limit)
+void crst_table_downgrade(struct mm_struct *mm)
 {
 	pgd_t *pgd;
 
+	/* downgrade should only happen from 3 to 2 levels (compat only) */
+	BUG_ON(mm->context.asce_limit != (1UL << 42));
+
 	if (current->active_mm == mm) {
 		clear_user_asce();
 		__tlb_flush_mm(mm);
 	}
-	while (mm->context.asce_limit > limit) {
-		pgd = mm->pgd;
-		switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) {
-		case _REGION_ENTRY_TYPE_R2:
-			mm->context.asce_limit = 1UL << 42;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_REGION3;
-			break;
-		case _REGION_ENTRY_TYPE_R3:
-			mm->context.asce_limit = 1UL << 31;
-			mm->context.asce_bits = _ASCE_TABLE_LENGTH |
-						_ASCE_USER_BITS |
-						_ASCE_TYPE_SEGMENT;
-			break;
-		default:
-			BUG();
-		}
-		mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
-		mm->task_size = mm->context.asce_limit;
-		crst_table_free(mm, (unsigned long *) pgd);
-	}
+
+	pgd = mm->pgd;
+	mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
+	mm->context.asce_limit = 1UL << 31;
+	mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
+			   _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
+	mm->task_size = mm->context.asce_limit;
+	crst_table_free(mm, (unsigned long *) pgd);
+
 	if (current->active_mm == mm)
 		set_user_asce(mm);
 }