about summary refs log tree commit diff stats
path: root/arch/s390/mm
diff options
context:
space:
mode:
author: Gerald Schaefer <gerald.schaefer@de.ibm.com> 2016-04-15 10:38:40 -0400
committer: Martin Schwidefsky <schwidefsky@de.ibm.com> 2016-04-21 03:50:09 -0400
commit: 723cacbd9dc79582e562c123a0bacf8bfc69e72a (patch)
tree: cae93823c3e8a37ef012b85b38d48e9e1bc19762 /arch/s390/mm
parent: dba599091c191d209b1499511a524ad9657c0e5a (diff)
s390/mm: fix asce_bits handling with dynamic pagetable levels
There is a race with multi-threaded applications between context switch and pagetable upgrade. In switch_mm() a new user_asce is built from mm->pgd and mm->context.asce_bits, without holding any locks. A concurrent mmap with a pagetable upgrade on another thread in crst_table_upgrade() could already have set new asce_bits, but not yet the new mm->pgd. This would result in a corrupt user_asce in switch_mm(), and eventually in a kernel panic from a translation exception.

Fix this by storing the complete asce instead of just the asce_bits, which can then be read atomically from switch_mm(), so that it either sees the old value or the new value, but no mixture. Both cases are OK. Having the old value would result in a page fault on access to the higher level memory, but the fault handler would see the new mm->pgd, if it was a valid access after the mmap on the other thread has completed. So as worst-case scenario we would have a page fault loop for the racing thread until the next time slice.

Also remove dead code and simplify the upgrade/downgrade path: there are no upgrades from 2 levels, and only downgrades from 3 levels for compat tasks. There are also no concurrent upgrades, because the mmap_sem is held with down_write() in do_mmap, so the flush and table checks during upgrade can be removed.

Reported-by: Michael Munday <munday@ca.ibm.com>
Reviewed-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Gerald Schaefer <gerald.schaefer@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Diffstat (limited to 'arch/s390/mm')
-rw-r--r--arch/s390/mm/init.c3
-rw-r--r--arch/s390/mm/mmap.c6
-rw-r--r--arch/s390/mm/pgalloc.c85
3 files changed, 33 insertions, 61 deletions
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index c7b0451397d6..2489b2e917c8 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -89,7 +89,8 @@ void __init paging_init(void)
89 asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH; 89 asce_bits = _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
90 pgd_type = _REGION3_ENTRY_EMPTY; 90 pgd_type = _REGION3_ENTRY_EMPTY;
91 } 91 }
92 S390_lowcore.kernel_asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits; 92 init_mm.context.asce = (__pa(init_mm.pgd) & PAGE_MASK) | asce_bits;
93 S390_lowcore.kernel_asce = init_mm.context.asce;
93 clear_table((unsigned long *) init_mm.pgd, pgd_type, 94 clear_table((unsigned long *) init_mm.pgd, pgd_type,
94 sizeof(unsigned long)*2048); 95 sizeof(unsigned long)*2048);
95 vmem_map_init(); 96 vmem_map_init();
diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c
index 45c4daa49930..89cf09e5f168 100644
--- a/arch/s390/mm/mmap.c
+++ b/arch/s390/mm/mmap.c
@@ -174,7 +174,7 @@ int s390_mmap_check(unsigned long addr, unsigned long len, unsigned long flags)
174 if (!(flags & MAP_FIXED)) 174 if (!(flags & MAP_FIXED))
175 addr = 0; 175 addr = 0;
176 if ((addr + len) >= TASK_SIZE) 176 if ((addr + len) >= TASK_SIZE)
177 return crst_table_upgrade(current->mm, TASK_MAX_SIZE); 177 return crst_table_upgrade(current->mm);
178 return 0; 178 return 0;
179} 179}
180 180
@@ -191,7 +191,7 @@ s390_get_unmapped_area(struct file *filp, unsigned long addr,
191 return area; 191 return area;
192 if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) { 192 if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
193 /* Upgrade the page table to 4 levels and retry. */ 193 /* Upgrade the page table to 4 levels and retry. */
194 rc = crst_table_upgrade(mm, TASK_MAX_SIZE); 194 rc = crst_table_upgrade(mm);
195 if (rc) 195 if (rc)
196 return (unsigned long) rc; 196 return (unsigned long) rc;
197 area = arch_get_unmapped_area(filp, addr, len, pgoff, flags); 197 area = arch_get_unmapped_area(filp, addr, len, pgoff, flags);
@@ -213,7 +213,7 @@ s390_get_unmapped_area_topdown(struct file *filp, const unsigned long addr,
213 return area; 213 return area;
214 if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) { 214 if (area == -ENOMEM && !is_compat_task() && TASK_SIZE < TASK_MAX_SIZE) {
215 /* Upgrade the page table to 4 levels and retry. */ 215 /* Upgrade the page table to 4 levels and retry. */
216 rc = crst_table_upgrade(mm, TASK_MAX_SIZE); 216 rc = crst_table_upgrade(mm);
217 if (rc) 217 if (rc)
218 return (unsigned long) rc; 218 return (unsigned long) rc;
219 area = arch_get_unmapped_area_topdown(filp, addr, len, 219 area = arch_get_unmapped_area_topdown(filp, addr, len,
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index f6c3de26cda8..e8b5962ac12a 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -76,81 +76,52 @@ static void __crst_table_upgrade(void *arg)
76 __tlb_flush_local(); 76 __tlb_flush_local();
77} 77}
78 78
79int crst_table_upgrade(struct mm_struct *mm, unsigned long limit) 79int crst_table_upgrade(struct mm_struct *mm)
80{ 80{
81 unsigned long *table, *pgd; 81 unsigned long *table, *pgd;
82 unsigned long entry;
83 int flush;
84 82
85 BUG_ON(limit > TASK_MAX_SIZE); 83 /* upgrade should only happen from 3 to 4 levels */
86 flush = 0; 84 BUG_ON(mm->context.asce_limit != (1UL << 42));
87repeat: 85
88 table = crst_table_alloc(mm); 86 table = crst_table_alloc(mm);
89 if (!table) 87 if (!table)
90 return -ENOMEM; 88 return -ENOMEM;
89
91 spin_lock_bh(&mm->page_table_lock); 90 spin_lock_bh(&mm->page_table_lock);
92 if (mm->context.asce_limit < limit) { 91 pgd = (unsigned long *) mm->pgd;
93 pgd = (unsigned long *) mm->pgd; 92 crst_table_init(table, _REGION2_ENTRY_EMPTY);
94 if (mm->context.asce_limit <= (1UL << 31)) { 93 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
95 entry = _REGION3_ENTRY_EMPTY; 94 mm->pgd = (pgd_t *) table;
96 mm->context.asce_limit = 1UL << 42; 95 mm->context.asce_limit = 1UL << 53;
97 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 96 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
98 _ASCE_USER_BITS | 97 _ASCE_USER_BITS | _ASCE_TYPE_REGION2;
99 _ASCE_TYPE_REGION3; 98 mm->task_size = mm->context.asce_limit;
100 } else {
101 entry = _REGION2_ENTRY_EMPTY;
102 mm->context.asce_limit = 1UL << 53;
103 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
104 _ASCE_USER_BITS |
105 _ASCE_TYPE_REGION2;
106 }
107 crst_table_init(table, entry);
108 pgd_populate(mm, (pgd_t *) table, (pud_t *) pgd);
109 mm->pgd = (pgd_t *) table;
110 mm->task_size = mm->context.asce_limit;
111 table = NULL;
112 flush = 1;
113 }
114 spin_unlock_bh(&mm->page_table_lock); 99 spin_unlock_bh(&mm->page_table_lock);
115 if (table) 100
116 crst_table_free(mm, table); 101 on_each_cpu(__crst_table_upgrade, mm, 0);
117 if (mm->context.asce_limit < limit)
118 goto repeat;
119 if (flush)
120 on_each_cpu(__crst_table_upgrade, mm, 0);
121 return 0; 102 return 0;
122} 103}
123 104
124void crst_table_downgrade(struct mm_struct *mm, unsigned long limit) 105void crst_table_downgrade(struct mm_struct *mm)
125{ 106{
126 pgd_t *pgd; 107 pgd_t *pgd;
127 108
109 /* downgrade should only happen from 3 to 2 levels (compat only) */
110 BUG_ON(mm->context.asce_limit != (1UL << 42));
111
128 if (current->active_mm == mm) { 112 if (current->active_mm == mm) {
129 clear_user_asce(); 113 clear_user_asce();
130 __tlb_flush_mm(mm); 114 __tlb_flush_mm(mm);
131 } 115 }
132 while (mm->context.asce_limit > limit) { 116
133 pgd = mm->pgd; 117 pgd = mm->pgd;
134 switch (pgd_val(*pgd) & _REGION_ENTRY_TYPE_MASK) { 118 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
135 case _REGION_ENTRY_TYPE_R2: 119 mm->context.asce_limit = 1UL << 31;
136 mm->context.asce_limit = 1UL << 42; 120 mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH |
137 mm->context.asce_bits = _ASCE_TABLE_LENGTH | 121 _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT;
138 _ASCE_USER_BITS | 122 mm->task_size = mm->context.asce_limit;
139 _ASCE_TYPE_REGION3; 123 crst_table_free(mm, (unsigned long *) pgd);
140 break; 124
141 case _REGION_ENTRY_TYPE_R3:
142 mm->context.asce_limit = 1UL << 31;
143 mm->context.asce_bits = _ASCE_TABLE_LENGTH |
144 _ASCE_USER_BITS |
145 _ASCE_TYPE_SEGMENT;
146 break;
147 default:
148 BUG();
149 }
150 mm->pgd = (pgd_t *) (pgd_val(*pgd) & _REGION_ENTRY_ORIGIN);
151 mm->task_size = mm->context.asce_limit;
152 crst_table_free(mm, (unsigned long *) pgd);
153 }
154 if (current->active_mm == mm) 125 if (current->active_mm == mm)
155 set_user_asce(mm); 126 set_user_asce(mm);
156} 127}