author     Jan Kara <jack@suse.cz>                   2016-10-19 08:34:31 -0400
committer  Dan Williams <dan.j.williams@intel.com>   2016-12-26 23:29:25 -0500
commit     9f141d6ef6258a3a37a045842d9ba7e68f368956
tree       99f00ca80593939bb78d826331c5b13aab014624
parent     f449b936f1aff7696b24a338f493d5cee8d48d55
dax: Call ->iomap_begin without entry lock during dax fault
Currently the ->iomap_begin() handler is called with the entry lock held.
If the filesystem holds any locks between ->iomap_begin() and
->iomap_end() (such as ext4, which will want to keep a transaction open),
this causes a lock inversion against iomap_apply() in the standard IO
path, which first calls ->iomap_begin() and only then calls the ->actor()
callback, which in turn grabs entry locks for DAX (if it faults while
copying from/to user-provided buffers).

Fix the problem by nesting the grabbing of the entry lock inside the
->iomap_begin() - ->iomap_end() pair.
Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
 fs/dax.c | 121
 1 file changed, 66 insertions(+), 55 deletions(-)
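For orientation, the locking-order change can be modelled outside the kernel. The sketch below is not part of the patch; it is a minimal userspace analogue in which two pthread mutexes stand in for the DAX radix-tree entry lock and for whatever lock the filesystem keeps held between ->iomap_begin() and ->iomap_end() (an ext4 transaction, for example). The function and variable names are illustrative only; the point is that the old fault path inverted the order used by the read/write path, while the new fault path matches it.

/*
 * Sketch only, not kernel code: models the lock ordering described in the
 * commit message with pthread mutexes.  Build with: cc sketch.c -lpthread
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER; /* DAX entry lock */
static pthread_mutex_t fs_lock    = PTHREAD_MUTEX_INITIALIZER; /* held across ->iomap_begin()/->iomap_end() */

/* Standard IO path: ->iomap_begin() first, entry lock taken inside ->actor(). */
static void iomap_rw_path(void)
{
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);   /* ->actor() faults on a DAX-mapped user buffer */
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_lock);    /* ->iomap_end() */
}

/* Old fault path: entry lock taken before ->iomap_begin() -- inverted order. */
static void fault_path_old(void)
{
	pthread_mutex_lock(&entry_lock);   /* grab_mapping_entry() */
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() -- inversion vs. the rw path */
	pthread_mutex_unlock(&fs_lock);
	pthread_mutex_unlock(&entry_lock);
}

/* New fault path after this patch: entry lock nested inside the begin/end pair. */
static void fault_path_new(void)
{
	pthread_mutex_lock(&fs_lock);      /* ->iomap_begin() */
	pthread_mutex_lock(&entry_lock);   /* grab_mapping_entry() */
	pthread_mutex_unlock(&entry_lock); /* put_locked_mapping_entry() */
	pthread_mutex_unlock(&fs_lock);    /* ->iomap_end() */
}

int main(void)
{
	iomap_rw_path();
	fault_path_new();        /* same order as the rw path: no inversion */
	(void)fault_path_old;    /* kept only to show the old, inverted order */
	puts("lock ordering sketch done");
	return 0;
}

With both paths taking the filesystem lock before the entry lock there is a single global lock order, so the inversion described above can no longer deadlock.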
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,30 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
 	}
-	return VM_FAULT_NOPAGE | major;
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1331,16 +1338,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);