aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/file.c
diff options
context:
space:
mode:
author Mark Fasheh <mark.fasheh@oracle.com> 2007-02-09 23:24:12 -0500
committer Mark Fasheh <mark.fasheh@oracle.com> 2007-04-26 18:02:08 -0400
commit 9517bac6cc7a7aa4fee63cb38a32cb6014e264c7 (patch)
tree 3cac0c18d0cacc316e0e8a60f483282d6f991779 /fs/ocfs2/file.c
parent 89488984ac23b0580f959b9ee549f2fcb1c2f194 (diff)
ocfs2: teach ocfs2_file_aio_write() about sparse files
Unfortunately, ocfs2 can no longer make use of generic_file_aio_write_nolock() because allocating writes will require zeroing of pages adjacent to the I/O for cluster sizes greater than page size. Implement a custom file write here, which can order page locks for zeroing. This also has the advantage that cluster locks can easily be ordered outside of the page locks. Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- fs/ocfs2/file.c 374
1 files changed, 335 insertions, 39 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3bcf3629265..667e5a869bf 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -485,13 +486,13 @@ leave:
485 * accessed, and lock them, reserving the appropriate number of bits. 486 * accessed, and lock them, reserving the appropriate number of bits.
486 * 487 *
487 * Called from ocfs2_extend_allocation() for file systems which don't 488 * Called from ocfs2_extend_allocation() for file systems which don't
488 * support holes, and from ocfs2_prepare_write() for file systems 489 * support holes, and from ocfs2_write() for file systems which
489 * which understand sparse inodes. 490 * understand sparse inodes.
490 */ 491 */
491static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, 492int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
492 u32 clusters_to_add, 493 u32 clusters_to_add,
493 struct ocfs2_alloc_context **data_ac, 494 struct ocfs2_alloc_context **data_ac,
494 struct ocfs2_alloc_context **meta_ac) 495 struct ocfs2_alloc_context **meta_ac)
495{ 496{
496 int ret, num_free_extents; 497 int ret, num_free_extents;
497 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 498 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -518,7 +519,7 @@ static int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
518 * a cluster lock (because we ran out of room for another 519 * a cluster lock (because we ran out of room for another
519 * extent) will violate ordering rules. 520 * extent) will violate ordering rules.
520 * 521 *
521 * Most of the time we'll only be seeing this 1 page at a time 522 * Most of the time we'll only be seeing this 1 cluster at a time
522 * anyway. 523 * anyway.
523 */ 524 */
524 if (!num_free_extents || 525 if (!num_free_extents ||
@@ -596,13 +597,6 @@ static int ocfs2_extend_allocation(struct inode *inode,
596restart_all: 597restart_all:
597 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 598 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
598 599
599 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
600 &meta_ac);
601 if (status) {
602 mlog_errno(status);
603 goto leave;
604 }
605
606 /* blocks people in read/write from reading our allocation 600 /* blocks people in read/write from reading our allocation
607 * until we're done changing it. We depend on i_mutex to block 601 * until we're done changing it. We depend on i_mutex to block
608 * other extend/truncate calls while we're here. Ordering wrt 602 * other extend/truncate calls while we're here. Ordering wrt
@@ -610,6 +604,13 @@ restart_all:
610 down_write(&OCFS2_I(inode)->ip_alloc_sem); 604 down_write(&OCFS2_I(inode)->ip_alloc_sem);
611 drop_alloc_sem = 1; 605 drop_alloc_sem = 1;
612 606
607 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
608 &meta_ac);
609 if (status) {
610 mlog_errno(status);
611 goto leave;
612 }
613
613 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 614 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
614 handle = ocfs2_start_trans(osb, credits); 615 handle = ocfs2_start_trans(osb, credits);
615 if (IS_ERR(handle)) { 616 if (IS_ERR(handle)) {
@@ -1088,10 +1089,49 @@ out:
1088 return ret; 1089 return ret;
1089} 1090}
1090 1091
1092/*
1093 * Will look for holes and unwritten extents in the range starting at
1094 * pos for count bytes (inclusive).
1095 */
1096static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1097 size_t count)
1098{
1099 int ret = 0;
1100 unsigned int extent_flags;
1101 u32 cpos, clusters, extent_len, phys_cpos;
1102 struct super_block *sb = inode->i_sb;
1103
1104 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1105 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1106
1107 while (clusters) {
1108 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1109 &extent_flags);
1110 if (ret < 0) {
1111 mlog_errno(ret);
1112 goto out;
1113 }
1114
1115 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1116 ret = 1;
1117 break;
1118 }
1119
1120 if (extent_len > clusters)
1121 extent_len = clusters;
1122
1123 clusters -= extent_len;
1124 cpos += extent_len;
1125 }
1126out:
1127 return ret;
1128}
1129
1091static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1130static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1092 loff_t *ppos, 1131 loff_t *ppos,
1093 size_t count, 1132 size_t count,
1094 int appending) 1133 int appending,
1134 int *direct_io)
1095{ 1135{
1096 int ret = 0, meta_level = appending; 1136 int ret = 0, meta_level = appending;
1097 struct inode *inode = dentry->d_inode; 1137 struct inode *inode = dentry->d_inode;
@@ -1143,12 +1183,47 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1143 saved_pos = *ppos; 1183 saved_pos = *ppos;
1144 } 1184 }
1145 1185
1186 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1187 loff_t end = saved_pos + count;
1188
1189 /*
1190 * Skip the O_DIRECT checks if we don't need
1191 * them.
1192 */
1193 if (!direct_io || !(*direct_io))
1194 break;
1195
1196 /*
1197 * Allowing concurrent direct writes means
1198 * i_size changes wouldn't be synchronized, so
1199 * one node could wind up truncating another
1200 * node's writes.
1201 */
1202 if (end > i_size_read(inode)) {
1203 *direct_io = 0;
1204 break;
1205 }
1206
1207 /*
1208 * We don't fill holes during direct io, so
1209 * check for them here. If any are found, the
1210 * caller will have to retake some cluster
1211 * locks and initiate the io as buffered.
1212 */
1213 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1214 count);
1215 if (ret == 1) {
1216 *direct_io = 0;
1217 ret = 0;
1218 } else if (ret < 0)
1219 mlog_errno(ret);
1220 break;
1221 }
1222
1146 /* 1223 /*
1147 * The rest of this loop is concerned with legacy file 1224 * The rest of this loop is concerned with legacy file
1148 * systems which don't support sparse files. 1225 * systems which don't support sparse files.
1149 */ 1226 */
1150 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
1151 break;
1152 1227
1153 newsize = count + saved_pos; 1228 newsize = count + saved_pos;
1154 1229
@@ -1202,55 +1277,264 @@ out:
1202 return ret; 1277 return ret;
1203} 1278}
1204 1279
1280static inline void
1281ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1282{
1283 const struct iovec *iov = *iovp;
1284 size_t base = *basep;
1285
1286 do {
1287 int copy = min(bytes, iov->iov_len - base);
1288
1289 bytes -= copy;
1290 base += copy;
1291 if (iov->iov_len == base) {
1292 iov++;
1293 base = 0;
1294 }
1295 } while (bytes);
1296 *iovp = iov;
1297 *basep = base;
1298}
1299
1300static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1301 const struct iovec *cur_iov,
1302 size_t iov_offset)
1303{
1304 int ret;
1305 char *buf;
1306 struct page *src_page = NULL;
1307
1308 buf = cur_iov->iov_base + iov_offset;
1309
1310 if (!segment_eq(get_fs(), KERNEL_DS)) {
1311 /*
1312 * Pull in the user page. We want to do this outside
1313 * of the meta data locks in order to preserve locking
1314 * order in case of page fault.
1315 */
1316 ret = get_user_pages(current, current->mm,
1317 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1318 0, 0, &src_page, NULL);
1319 if (ret == 1)
1320 bp->b_src_buf = kmap(src_page);
1321 else
1322 src_page = ERR_PTR(-EFAULT);
1323 } else {
1324 bp->b_src_buf = buf;
1325 }
1326
1327 return src_page;
1328}
1329
1330static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1331 struct page *page)
1332{
1333 if (page) {
1334 kunmap(page);
1335 page_cache_release(page);
1336 }
1337}
1338
1339static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1340 const struct iovec *iov,
1341 unsigned long nr_segs,
1342 size_t count,
1343 ssize_t o_direct_written)
1344{
1345 int ret = 0;
1346 ssize_t copied, total = 0;
1347 size_t iov_offset = 0;
1348 const struct iovec *cur_iov = iov;
1349 struct ocfs2_buffered_write_priv bp;
1350 struct page *page;
1351
1352 /*
1353 * handle partial DIO write. Adjust cur_iov if needed.
1354 */
1355 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1356
1357 do {
1358 bp.b_cur_off = iov_offset;
1359 bp.b_cur_iov = cur_iov;
1360
1361 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1362 if (IS_ERR(page)) {
1363 ret = PTR_ERR(page);
1364 goto out;
1365 }
1366
1367 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1368 ocfs2_map_and_write_user_data,
1369 &bp);
1370
1371 ocfs2_put_write_source(&bp, page);
1372
1373 if (copied < 0) {
1374 mlog_errno(copied);
1375 ret = copied;
1376 goto out;
1377 }
1378
1379 total += copied;
1380 *ppos = *ppos + copied;
1381 count -= copied;
1382
1383 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1384 } while(count);
1385
1386out:
1387 return total ? total : ret;
1388}
1389
1390static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1391 unsigned long *nr_segs)
1392{
1393 size_t ocount; /* original count */
1394 unsigned long seg;
1395
1396 ocount = 0;
1397 for (seg = 0; seg < *nr_segs; seg++) {
1398 const struct iovec *iv = &iov[seg];
1399
1400 /*
1401 * If any segment has a negative length, or the cumulative
1402 * length ever wraps negative then return -EINVAL.
1403 */
1404 ocount += iv->iov_len;
1405 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1406 return -EINVAL;
1407 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1408 continue;
1409 if (seg == 0)
1410 return -EFAULT;
1411 *nr_segs = seg;
1412 ocount -= iv->iov_len; /* This segment is no good */
1413 break;
1414 }
1415
1416 *counted = ocount;
1417 return 0;
1418}
1419
1205static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1420static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1206 const struct iovec *iov, 1421 const struct iovec *iov,
1207 unsigned long nr_segs, 1422 unsigned long nr_segs,
1208 loff_t pos) 1423 loff_t pos)
1209{ 1424{
1210 int ret, rw_level, have_alloc_sem = 0; 1425 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1211 struct file *filp = iocb->ki_filp; 1426 int can_do_direct, sync = 0;
1212 struct inode *inode = filp->f_path.dentry->d_inode; 1427 ssize_t written = 0;
1213 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1428 size_t ocount; /* original count */
1214 1429 size_t count; /* after file limit checks */
1215 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1430 loff_t *ppos = &iocb->ki_pos;
1431 struct file *file = iocb->ki_filp;
1432 struct inode *inode = file->f_path.dentry->d_inode;
1433
1434 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1216 (unsigned int)nr_segs, 1435 (unsigned int)nr_segs,
1217 filp->f_path.dentry->d_name.len, 1436 file->f_path.dentry->d_name.len,
1218 filp->f_path.dentry->d_name.name); 1437 file->f_path.dentry->d_name.name);
1219 1438
1220 /* happy write of zero bytes */
1221 if (iocb->ki_left == 0) 1439 if (iocb->ki_left == 0)
1222 return 0; 1440 return 0;
1223 1441
1442 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1443 if (ret)
1444 return ret;
1445
1446 count = ocount;
1447
1448 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1449
1450 appending = file->f_flags & O_APPEND ? 1 : 0;
1451 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1452
1224 mutex_lock(&inode->i_mutex); 1453 mutex_lock(&inode->i_mutex);
1454
1455relock:
1225 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1456 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1226 if (filp->f_flags & O_DIRECT) { 1457 if (direct_io) {
1227 have_alloc_sem = 1;
1228 down_read(&inode->i_alloc_sem); 1458 down_read(&inode->i_alloc_sem);
1459 have_alloc_sem = 1;
1229 } 1460 }
1230 1461
1231 /* concurrent O_DIRECT writes are allowed */ 1462 /* concurrent O_DIRECT writes are allowed */
1232 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1463 rw_level = !direct_io;
1233 ret = ocfs2_rw_lock(inode, rw_level); 1464 ret = ocfs2_rw_lock(inode, rw_level);
1234 if (ret < 0) { 1465 if (ret < 0) {
1235 rw_level = -1;
1236 mlog_errno(ret); 1466 mlog_errno(ret);
1237 goto out; 1467 goto out_sems;
1238 } 1468 }
1239 1469
1240 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1470 can_do_direct = direct_io;
1241 iocb->ki_left, appending); 1471 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1472 iocb->ki_left, appending,
1473 &can_do_direct);
1242 if (ret < 0) { 1474 if (ret < 0) {
1243 mlog_errno(ret); 1475 mlog_errno(ret);
1244 goto out; 1476 goto out;
1245 } 1477 }
1246 1478
1479 /*
1480 * We can't complete the direct I/O as requested, fall back to
1481 * buffered I/O.
1482 */
1483 if (direct_io && !can_do_direct) {
1484 ocfs2_rw_unlock(inode, rw_level);
1485 up_read(&inode->i_alloc_sem);
1486
1487 have_alloc_sem = 0;
1488 rw_level = -1;
1489
1490 direct_io = 0;
1491 sync = 1;
1492 goto relock;
1493 }
1494
1495 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1496 sync = 1;
1497
1498 /*
1499 * XXX: Is it ok to execute these checks a second time?
1500 */
1501 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1502 if (ret)
1503 goto out;
1504
1505 /*
1506 * Set pos so that sync_page_range_nolock() below understands
1507 * where to start from. We might've moved it around via the
1508 * calls above. The range we want to actually sync starts from
1509 * *ppos here.
1510 *
1511 */
1512 pos = *ppos;
1513
1247 /* communicate with ocfs2_dio_end_io */ 1514 /* communicate with ocfs2_dio_end_io */
1248 ocfs2_iocb_set_rw_locked(iocb); 1515 ocfs2_iocb_set_rw_locked(iocb);
1249 1516
1250 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1517 if (direct_io) {
1518 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1519 ppos, count, ocount);
1520 if (written < 0) {
1521 ret = written;
1522 goto out_dio;
1523 }
1524 } else {
1525 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1526 count, written);
1527 if (written < 0) {
1528 ret = written;
1529 if (ret != -EFAULT || ret != -ENOSPC)
1530 mlog_errno(ret);
1531 goto out;
1532 }
1533 }
1251 1534
1535out_dio:
1252 /* buffered aio wouldn't have proper lock coverage today */ 1536 /* buffered aio wouldn't have proper lock coverage today */
1253 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1537 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1254 1538
1255 /* 1539 /*
1256 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1540 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1268,14 +1552,25 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1268 } 1552 }
1269 1553
1270out: 1554out:
1555 if (rw_level != -1)
1556 ocfs2_rw_unlock(inode, rw_level);
1557
1558out_sems:
1271 if (have_alloc_sem) 1559 if (have_alloc_sem)
1272 up_read(&inode->i_alloc_sem); 1560 up_read(&inode->i_alloc_sem);
1273 if (rw_level != -1) 1561
1274 ocfs2_rw_unlock(inode, rw_level); 1562 if (written > 0 && sync) {
1563 ssize_t err;
1564
1565 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1566 if (err < 0)
1567 written = err;
1568 }
1569
1275 mutex_unlock(&inode->i_mutex); 1570 mutex_unlock(&inode->i_mutex);
1276 1571
1277 mlog_exit(ret); 1572 mlog_exit(ret);
1278 return ret; 1573 return written ? written : ret;
1279} 1574}
1280 1575
1281static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, 1576static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
@@ -1300,7 +1595,8 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1300 goto out; 1595 goto out;
1301 } 1596 }
1302 1597
1303 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1598 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1599 NULL);
1304 if (ret < 0) { 1600 if (ret < 0) {
1305 mlog_errno(ret); 1601 mlog_errno(ret);
1306 goto out_unlock; 1602 goto out_unlock;