author	Jan Kara <jack@suse.cz>	2016-10-19 08:34:31 -0400
committer	Dan Williams <dan.j.williams@intel.com>	2016-12-26 23:29:25 -0500
commit	9f141d6ef6258a3a37a045842d9ba7e68f368956 (patch)
tree	99f00ca80593939bb78d826331c5b13aab014624
parent	f449b936f1aff7696b24a338f493d5cee8d48d55 (diff)
dax: Call ->iomap_begin without entry lock during dax fault
Currently the ->iomap_begin() handler is called with the entry lock held. If the filesystem takes any locks between ->iomap_begin() and ->iomap_end() (such as ext4, which will want to hold a transaction open), this causes a lock inversion against iomap_apply() in the standard IO path, which first calls ->iomap_begin() and only then calls the ->actor() callback, which grabs entry locks for DAX (if it faults while copying from/to user-provided buffers).

Fix the problem by nesting the grabbing of the entry lock inside the ->iomap_begin() - ->iomap_end() pair.

Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
-rw-r--r--	fs/dax.c	121
1 file changed, 66 insertions(+), 55 deletions(-)
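The inversion described in the commit message can be sketched with ordinary userspace mutexes. The sketch below is illustrative only and is not kernel code: the two pthread mutexes stand in for the DAX radix-tree entry lock and the filesystem state (e.g. an ext4 transaction) held between ->iomap_begin() and ->iomap_end(); the function names fault_path_old, io_path and fault_path_new are hypothetical.

/* ABBA lock-inversion sketch (illustrative only, not kernel code). */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t entry_lock = PTHREAD_MUTEX_INITIALIZER;     /* stands in for the DAX entry lock */
static pthread_mutex_t fs_transaction = PTHREAD_MUTEX_INITIALIZER; /* stands in for ->iomap_begin() state */

/* Old fault path: entry lock first, then ->iomap_begin() (order A -> B). */
static void *fault_path_old(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&entry_lock);
	pthread_mutex_lock(&fs_transaction);
	pthread_mutex_unlock(&fs_transaction);
	pthread_mutex_unlock(&entry_lock);
	return NULL;
}

/*
 * Regular I/O path (iomap_apply()): ->iomap_begin() first; the ->actor()
 * callback may then fault on a user buffer and take the entry lock (B -> A).
 */
static void *io_path(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&fs_transaction);
	pthread_mutex_lock(&entry_lock);
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_transaction);
	return NULL;
}

/*
 * Fault path after this patch: the entry lock is taken inside the
 * ->iomap_begin()/->iomap_end() pair, so both paths use the order B -> A.
 */
static void *fault_path_new(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&fs_transaction);
	pthread_mutex_lock(&entry_lock);
	pthread_mutex_unlock(&entry_lock);
	pthread_mutex_unlock(&fs_transaction);
	return NULL;
}

int main(void)
{
	pthread_t t1, t2;

	/*
	 * Pairing fault_path_old with io_path can deadlock (opposite lock
	 * order); pairing fault_path_new with io_path cannot.
	 */
	pthread_create(&t1, NULL, fault_path_new, NULL);
	pthread_create(&t2, NULL, io_path, NULL);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("consistent lock order: no deadlock\n");
	return 0;
}

Build the sketch with "cc -pthread"; swapping fault_path_new for fault_path_old reproduces the opposite-order acquisition that the patch eliminates.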
diff --git a/fs/dax.c b/fs/dax.c
index bfec6f2ef613..5c74f60d0a50 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -1078,6 +1078,15 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 }
 EXPORT_SYMBOL_GPL(dax_iomap_rw);
 
+static int dax_fault_return(int error)
+{
+	if (error == 0)
+		return VM_FAULT_NOPAGE;
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	return VM_FAULT_SIGBUS;
+}
+
 /**
  * dax_iomap_fault - handle a page fault on a DAX file
  * @vma: The virtual memory area where the fault occurred
@@ -1110,12 +1119,6 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (pos >= i_size_read(inode))
 		return VM_FAULT_SIGBUS;
 
-	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
-	if (IS_ERR(entry)) {
-		error = PTR_ERR(entry);
-		goto out;
-	}
-
 	if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page)
 		flags |= IOMAP_WRITE;
 
@@ -1126,9 +1129,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	 */
 	error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		return dax_fault_return(error);
 	if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) {
-		error = -EIO;		/* fs corruption? */
+		vmf_ret = dax_fault_return(-EIO);	/* fs corruption? */
+		goto finish_iomap;
+	}
+
+	entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
+	if (IS_ERR(entry)) {
+		vmf_ret = dax_fault_return(PTR_ERR(entry));
 		goto finish_iomap;
 	}
 
@@ -1151,13 +1160,13 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 
 		if (error)
-			goto finish_iomap;
+			goto error_unlock_entry;
 
 		__SetPageUptodate(vmf->cow_page);
 		vmf_ret = finish_fault(vmf);
 		if (!vmf_ret)
 			vmf_ret = VM_FAULT_DONE_COW;
-		goto finish_iomap;
+		goto unlock_entry;
 	}
 
 	switch (iomap.type) {
@@ -1169,12 +1178,15 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		}
 		error = dax_insert_mapping(mapping, iomap.bdev, sector,
 				PAGE_SIZE, &entry, vma, vmf);
+		/* -EBUSY is fine, somebody else faulted on the same PTE */
+		if (error == -EBUSY)
+			error = 0;
 		break;
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (!(vmf->flags & FAULT_FLAG_WRITE)) {
 			vmf_ret = dax_load_hole(mapping, &entry, vmf);
-			goto finish_iomap;
+			goto unlock_entry;
 		}
 		/*FALLTHRU*/
 	default:
@@ -1183,30 +1195,25 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		break;
 	}
 
- finish_iomap:
-	if (ops->iomap_end) {
-		if (error || (vmf_ret & VM_FAULT_ERROR)) {
-			/* keep previous error */
-			ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PAGE_SIZE,
-					PAGE_SIZE, flags, &iomap);
-		}
-	}
+ error_unlock_entry:
+	vmf_ret = dax_fault_return(error) | major;
  unlock_entry:
 	put_locked_mapping_entry(mapping, vmf->pgoff, entry);
- out:
-	if (error == -ENOMEM)
-		return VM_FAULT_OOM | major;
-	/* -EBUSY is fine, somebody else faulted on the same PTE */
-	if (error < 0 && error != -EBUSY)
-		return VM_FAULT_SIGBUS | major;
-	if (vmf_ret) {
-		WARN_ON_ONCE(error);	/* -EBUSY from ops->iomap_end? */
-		return vmf_ret;
-	}
-	return VM_FAULT_NOPAGE | major;
+ finish_iomap:
+	if (ops->iomap_end) {
+		int copied = PAGE_SIZE;
+
+		if (vmf_ret & VM_FAULT_ERROR)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PTE we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap);
+	}
+	return vmf_ret;
 }
 EXPORT_SYMBOL_GPL(dax_iomap_fault);
 
@@ -1331,16 +1338,6 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		goto fallback;
 
 	/*
-	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
-	 * PMD or a HZP entry. If it can't (because a 4k page is already in
-	 * the tree, for instance), it will return -EEXIST and we just fall
-	 * back to 4k entries.
-	 */
-	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
-	if (IS_ERR(entry))
-		goto fallback;
-
-	/*
 	 * Note that we don't use iomap_apply here. We aren't doing I/O, only
 	 * setting up a mapping, so really we're using iomap_begin() as a way
 	 * to look up our filesystem block.
@@ -1348,10 +1345,21 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	pos = (loff_t)pgoff << PAGE_SHIFT;
 	error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap);
 	if (error)
-		goto unlock_entry;
+		goto fallback;
+
 	if (iomap.offset + iomap.length < pos + PMD_SIZE)
 		goto finish_iomap;
 
+	/*
+	 * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX
+	 * PMD or a HZP entry. If it can't (because a 4k page is already in
+	 * the tree, for instance), it will return -EEXIST and we just fall
+	 * back to 4k entries.
+	 */
+	entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD);
+	if (IS_ERR(entry))
+		goto finish_iomap;
+
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
 	vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO;
@@ -1364,7 +1372,7 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	case IOMAP_UNWRITTEN:
 	case IOMAP_HOLE:
 		if (WARN_ON_ONCE(write))
-			goto finish_iomap;
+			goto unlock_entry;
 		result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap,
 				&entry);
 		break;
@@ -1373,20 +1381,23 @@ int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		break;
 	}
 
+ unlock_entry:
+	put_locked_mapping_entry(mapping, pgoff, entry);
  finish_iomap:
 	if (ops->iomap_end) {
-		if (result == VM_FAULT_FALLBACK) {
-			ops->iomap_end(inode, pos, PMD_SIZE, 0, iomap_flags,
-					&iomap);
-		} else {
-			error = ops->iomap_end(inode, pos, PMD_SIZE, PMD_SIZE,
-					iomap_flags, &iomap);
-			if (error)
-				result = VM_FAULT_FALLBACK;
-		}
+		int copied = PMD_SIZE;
+
+		if (result == VM_FAULT_FALLBACK)
+			copied = 0;
+		/*
+		 * The fault is done by now and there's no way back (other
+		 * thread may be already happily using PMD we have installed).
+		 * Just ignore error from ->iomap_end since we cannot do much
+		 * with it.
+		 */
+		ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags,
+				&iomap);
 	}
- unlock_entry:
-	put_locked_mapping_entry(mapping, pgoff, entry);
  fallback:
 	if (result == VM_FAULT_FALLBACK) {
 		split_huge_pmd(vma, pmd, address);