author:    Jan Kara <jack@suse.cz>                   2016-10-19 08:34:31 -0400
committer: Dan Williams <dan.j.williams@intel.com>   2016-12-26 23:29:25 -0500
commit:    9f141d6ef6258a3a37a045842d9ba7e68f368956
tree:      99f00ca80593939bb78d826331c5b13aab014624
parent:    f449b936f1aff7696b24a338f493d5cee8d48d55
dax: Call ->iomap_begin without entry lock during dax fault
Currently the ->iomap_begin() handler is called with the entry lock held.
If the filesystem holds any locks between ->iomap_begin() and
->iomap_end() (such as ext4, which will want to keep a transaction open),
this causes a lock inversion against iomap_apply() on the standard IO
path, which first calls ->iomap_begin() and only then calls the ->actor()
callback, which grabs entry locks for DAX (if it faults while copying
from/to user-provided buffers).

Fix the problem by nesting the grabbing of the entry lock inside the
->iomap_begin() - ->iomap_end() pair.
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
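
The inversion is easiest to see as two lock-acquisition orders. Below is a
minimal userspace sketch, not kernel code: the pthread mutexes merely stand
in for the DAX radix tree entry lock and for whatever lock the filesystem
holds across ->iomap_begin()/->iomap_end() (e.g. an open ext4 transaction),
and every function name in it is illustrative.

/*
 * Illustrative sketch only -- not kernel code.  Two stand-in locks model
 * the ordering problem the patch fixes.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* DAX entry lock */
static pthread_mutex_t fs_lock = PTHREAD_MUTEX_INITIALIZER;    /* fs lock held begin..end */

/* Old fault path: entry lock first, filesystem lock taken inside ->iomap_begin(). */
static void fault_path_old(void)
{
	pthread_mutex_lock(&entry_lock);   /* grab_mapping_entry() */
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() */
	/* ... install PTE ... */
	pthread_mutex_unlock(&fs_lock);    /* ->iomap_end() */
	pthread_mutex_unlock(&entry_lock); /* put_locked_mapping_entry() */
}

/*
 * Standard IO path (iomap_apply()): filesystem lock first, entry lock only if
 * the ->actor() copy faults on a DAX mapping -- the opposite order (ABBA).
 */
static void io_path(void)
{
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);   /* page fault during copy from/to user buffer */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);    /* ->iomap_end() */
}

/* Fixed fault path: entry lock nested inside the ->iomap_begin()/->iomap_end() pair. */
static void fault_path_new(void)
{
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);   /* grab_mapping_entry() */
	/* ... install PTE ... */
	pthread_mutex_unlock(&entry_lock); /* put_locked_mapping_entry() */
	pthread_mutex_unlock(&fs_lock);    /* ->iomap_end() */
}

int main(void)
{
	/* Both paths now take fs_lock before entry_lock: no inversion. */
	io_path();
	fault_path_new();
	(void)fault_path_old; /* kept only to show the problematic old order */
	printf("lock order consistent: fs lock -> entry lock\n");
	return 0;
}

With the patch applied, both the fault path and the IO path take the
filesystem-side lock before the entry lock, so the ABBA pattern disappears.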
-rw-r--r--  fs/dax.c  121
1 file changed, 66 insertions(+), 55 deletions(-)
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;	/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,30 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1331,16 +1338,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here.  We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry.  If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);
