about summary refs log tree commit diff stats
path: root/fs/ocfs2/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r--  fs/ocfs2/file.c  637
 1 file changed, 551 insertions(+), 86 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/pipe_fs_i.h> 34#include <linux/pipe_fs_i.h>
35#include <linux/mount.h> 35#include <linux/mount.h>
36#include <linux/writeback.h>
36 37
37#define MLOG_MASK_PREFIX ML_INODE 38#define MLOG_MASK_PREFIX ML_INODE
38#include <cluster/masklog.h> 39#include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
215 216
216 mlog_entry_void(); 217 mlog_entry_void();
217 i_size_write(inode, new_i_size); 218 i_size_write(inode, new_i_size);
218 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); 219 inode->i_blocks = ocfs2_inode_sector_count(inode);
219 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 220 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
220 221
221 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); 222 status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
261{ 262{
262 int status; 263 int status;
263 handle_t *handle; 264 handle_t *handle;
265 struct ocfs2_dinode *di;
264 266
265 mlog_entry_void(); 267 mlog_entry_void();
266 268
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
274 goto out; 276 goto out;
275 } 277 }
276 278
277 status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); 279 status = ocfs2_journal_access(handle, inode, fe_bh,
280 OCFS2_JOURNAL_ACCESS_WRITE);
281 if (status < 0) {
282 mlog_errno(status);
283 goto out_commit;
284 }
285
286 /*
287 * Do this before setting i_size.
288 */
289 status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
290 if (status) {
291 mlog_errno(status);
292 goto out_commit;
293 }
294
295 i_size_write(inode, new_i_size);
296 inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
298
299 di = (struct ocfs2_dinode *) fe_bh->b_data;
300 di->i_size = cpu_to_le64(new_i_size);
301 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
302 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
303
304 status = ocfs2_journal_dirty(handle, fe_bh);
278 if (status < 0) 305 if (status < 0)
279 mlog_errno(status); 306 mlog_errno(status);
280 307
308out_commit:
281 ocfs2_commit_trans(osb, handle); 309 ocfs2_commit_trans(osb, handle);
282out: 310out:
311
283 mlog_exit(status); 312 mlog_exit(status);
284 return status; 313 return status;
285} 314}
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
342 mlog_errno(status); 371 mlog_errno(status);
343 goto bail; 372 goto bail;
344 } 373 }
345 ocfs2_data_unlock(inode, 1);
346
347 if (le32_to_cpu(fe->i_clusters) ==
348 ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
349 mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
350 fe->i_clusters);
351 /* No allocation change is required, so lets fast path
352 * this truncate. */
353 status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
354 if (status < 0)
355 mlog_errno(status);
356 goto bail;
357 }
358 374
359 /* alright, we're going to need to do a full blown alloc size 375 /* alright, we're going to need to do a full blown alloc size
360 * change. Orphan the inode so that recovery can complete the 376 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
363 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); 379 status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
364 if (status < 0) { 380 if (status < 0) {
365 mlog_errno(status); 381 mlog_errno(status);
366 goto bail; 382 goto bail_unlock_data;
367 } 383 }
368 384
369 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); 385 status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
370 if (status < 0) { 386 if (status < 0) {
371 mlog_errno(status); 387 mlog_errno(status);
372 goto bail; 388 goto bail_unlock_data;
373 } 389 }
374 390
375 status = ocfs2_commit_truncate(osb, inode, di_bh, tc); 391 status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
376 if (status < 0) { 392 if (status < 0) {
377 mlog_errno(status); 393 mlog_errno(status);
378 goto bail; 394 goto bail_unlock_data;
379 } 395 }
380 396
381 /* TODO: orphan dir cleanup here. */ 397 /* TODO: orphan dir cleanup here. */
398bail_unlock_data:
399 ocfs2_data_unlock(inode, 1);
400
382bail: 401bail:
383 402
384 mlog_exit(status); 403 mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
397 */ 416 */
398int ocfs2_do_extend_allocation(struct ocfs2_super *osb, 417int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
399 struct inode *inode, 418 struct inode *inode,
419 u32 *logical_offset,
400 u32 clusters_to_add, 420 u32 clusters_to_add,
401 struct buffer_head *fe_bh, 421 struct buffer_head *fe_bh,
402 handle_t *handle, 422 handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
460 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 480 block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
461 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 481 mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
462 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 482 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
463 status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, 483 status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
464 num_bits, meta_ac); 484 *logical_offset, block, num_bits,
485 meta_ac);
465 if (status < 0) { 486 if (status < 0) {
466 mlog_errno(status); 487 mlog_errno(status);
467 goto leave; 488 goto leave;
468 } 489 }
469 490
470 le32_add_cpu(&fe->i_clusters, num_bits);
471 spin_lock(&OCFS2_I(inode)->ip_lock);
472 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
473 spin_unlock(&OCFS2_I(inode)->ip_lock);
474
475 status = ocfs2_journal_dirty(handle, fe_bh); 491 status = ocfs2_journal_dirty(handle, fe_bh);
476 if (status < 0) { 492 if (status < 0) {
477 mlog_errno(status); 493 mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
479 } 495 }
480 496
481 clusters_to_add -= num_bits; 497 clusters_to_add -= num_bits;
498 *logical_offset += num_bits;
482 499
483 if (clusters_to_add) { 500 if (clusters_to_add) {
484 mlog(0, "need to alloc once more, clusters = %u, wanted = " 501 mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
494 return status; 511 return status;
495} 512}
496 513
514/*
515 * For a given allocation, determine which allocators will need to be
516 * accessed, and lock them, reserving the appropriate number of bits.
517 *
518 * Called from ocfs2_extend_allocation() for file systems which don't
519 * support holes, and from ocfs2_write() for file systems which
520 * understand sparse inodes.
521 */
522int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
523 u32 clusters_to_add,
524 struct ocfs2_alloc_context **data_ac,
525 struct ocfs2_alloc_context **meta_ac)
526{
527 int ret, num_free_extents;
528 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
529
530 *meta_ac = NULL;
531 *data_ac = NULL;
532
533 mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
534 "clusters_to_add = %u\n",
535 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
536 le32_to_cpu(di->i_clusters), clusters_to_add);
537
538 num_free_extents = ocfs2_num_free_extents(osb, inode, di);
539 if (num_free_extents < 0) {
540 ret = num_free_extents;
541 mlog_errno(ret);
542 goto out;
543 }
544
545 /*
546 * Sparse allocation file systems need to be more conservative
547 * with reserving room for expansion - the actual allocation
548 * happens while we've got a journal handle open so re-taking
549 * a cluster lock (because we ran out of room for another
550 * extent) will violate ordering rules.
551 *
552 * Most of the time we'll only be seeing this 1 cluster at a time
553 * anyway.
554 */
555 if (!num_free_extents ||
556 (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
557 ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
558 if (ret < 0) {
559 if (ret != -ENOSPC)
560 mlog_errno(ret);
561 goto out;
562 }
563 }
564
565 ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
566 if (ret < 0) {
567 if (ret != -ENOSPC)
568 mlog_errno(ret);
569 goto out;
570 }
571
572out:
573 if (ret) {
574 if (*meta_ac) {
575 ocfs2_free_alloc_context(*meta_ac);
576 *meta_ac = NULL;
577 }
578
579 /*
580 * We cannot have an error and a non null *data_ac.
581 */
582 }
583
584 return ret;
585}
586
497static int ocfs2_extend_allocation(struct inode *inode, 587static int ocfs2_extend_allocation(struct inode *inode,
498 u32 clusters_to_add) 588 u32 clusters_to_add)
499{ 589{
500 int status = 0; 590 int status = 0;
501 int restart_func = 0; 591 int restart_func = 0;
502 int drop_alloc_sem = 0; 592 int drop_alloc_sem = 0;
503 int credits, num_free_extents; 593 int credits;
504 u32 prev_clusters; 594 u32 prev_clusters, logical_start;
505 struct buffer_head *bh = NULL; 595 struct buffer_head *bh = NULL;
506 struct ocfs2_dinode *fe = NULL; 596 struct ocfs2_dinode *fe = NULL;
507 handle_t *handle = NULL; 597 handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
512 602
513 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); 603 mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
514 604
605 /*
606 * This function only exists for file systems which don't
607 * support holes.
608 */
609 BUG_ON(ocfs2_sparse_alloc(osb));
610
515 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 611 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
516 OCFS2_BH_CACHED, inode); 612 OCFS2_BH_CACHED, inode);
517 if (status < 0) { 613 if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
526 goto leave; 622 goto leave;
527 } 623 }
528 624
625 logical_start = OCFS2_I(inode)->ip_clusters;
626
529restart_all: 627restart_all:
530 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); 628 BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
531 629
532 mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
533 "clusters_to_add = %u\n",
534 (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
535 fe->i_clusters, clusters_to_add);
536
537 num_free_extents = ocfs2_num_free_extents(osb,
538 inode,
539 fe);
540 if (num_free_extents < 0) {
541 status = num_free_extents;
542 mlog_errno(status);
543 goto leave;
544 }
545
546 if (!num_free_extents) {
547 status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
548 if (status < 0) {
549 if (status != -ENOSPC)
550 mlog_errno(status);
551 goto leave;
552 }
553 }
554
555 status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
556 if (status < 0) {
557 if (status != -ENOSPC)
558 mlog_errno(status);
559 goto leave;
560 }
561
562 /* blocks people in read/write from reading our allocation 630
563 * until we're done changing it. We depend on i_mutex to block 631 * until we're done changing it. We depend on i_mutex to block
564 * other extend/truncate calls while we're here. Ordering wrt 632 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
566 down_write(&OCFS2_I(inode)->ip_alloc_sem); 634 down_write(&OCFS2_I(inode)->ip_alloc_sem);
567 drop_alloc_sem = 1; 635 drop_alloc_sem = 1;
568 636
637 status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
638 &meta_ac);
639 if (status) {
640 mlog_errno(status);
641 goto leave;
642 }
643
569 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); 644 credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
570 handle = ocfs2_start_trans(osb, credits); 645 handle = ocfs2_start_trans(osb, credits);
571 if (IS_ERR(handle)) { 646 if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
590 665
591 status = ocfs2_do_extend_allocation(osb, 666 status = ocfs2_do_extend_allocation(osb,
592 inode, 667 inode,
668 &logical_start,
593 clusters_to_add, 669 clusters_to_add,
594 bh, 670 bh,
595 handle, 671 handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
778 size_t tail_to_skip) 854 size_t tail_to_skip)
779{ 855{
780 int ret = 0; 856 int ret = 0;
781 u32 clusters_to_add; 857 u32 clusters_to_add = 0;
782 858
783 BUG_ON(!tail_to_skip && !di_bh); 859 BUG_ON(!tail_to_skip && !di_bh);
784 860
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
790 goto out; 866 goto out;
791 BUG_ON(new_i_size < i_size_read(inode)); 867 BUG_ON(new_i_size < i_size_read(inode));
792 868
869 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
870 BUG_ON(tail_to_skip != 0);
871 goto out_update_size;
872 }
873
793 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 874 clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) -
794 OCFS2_I(inode)->ip_clusters; 875 OCFS2_I(inode)->ip_clusters;
795 876
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
825 goto out_unlock; 906 goto out_unlock;
826 } 907 }
827 908
909out_update_size:
828 if (!tail_to_skip) { 910 if (!tail_to_skip) {
829 /* We're being called from ocfs2_setattr() which wants 911 /* We're being called from ocfs2_setattr() which wants
830 * us to update i_size */ 912 * us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
834 } 916 }
835 917
836out_unlock: 918out_unlock:
837 ocfs2_data_unlock(inode, 1); 919 if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
920 ocfs2_data_unlock(inode, 1);
838 921
839out: 922out:
840 return ret; 923 return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
972 1055
973 ret = ocfs2_meta_lock(inode, NULL, 0); 1056 ret = ocfs2_meta_lock(inode, NULL, 0);
974 if (ret) { 1057 if (ret) {
975 mlog_errno(ret); 1058 if (ret != -ENOENT)
1059 mlog_errno(ret);
976 goto out; 1060 goto out;
977 } 1061 }
978 1062
@@ -1035,10 +1119,49 @@ out:
1035 return ret; 1119 return ret;
1036} 1120}
1037 1121
1122/*
1123 * Will look for holes and unwritten extents in the range starting at
1124 * pos for count bytes (inclusive).
1125 */
1126static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
1127 size_t count)
1128{
1129 int ret = 0;
1130 unsigned int extent_flags;
1131 u32 cpos, clusters, extent_len, phys_cpos;
1132 struct super_block *sb = inode->i_sb;
1133
1134 cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
1135 clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
1136
1137 while (clusters) {
1138 ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
1139 &extent_flags);
1140 if (ret < 0) {
1141 mlog_errno(ret);
1142 goto out;
1143 }
1144
1145 if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
1146 ret = 1;
1147 break;
1148 }
1149
1150 if (extent_len > clusters)
1151 extent_len = clusters;
1152
1153 clusters -= extent_len;
1154 cpos += extent_len;
1155 }
1156out:
1157 return ret;
1158}
1159
1038static int ocfs2_prepare_inode_for_write(struct dentry *dentry, 1160static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1039 loff_t *ppos, 1161 loff_t *ppos,
1040 size_t count, 1162 size_t count,
1041 int appending) 1163 int appending,
1164 int *direct_io)
1042{ 1165{
1043 int ret = 0, meta_level = appending; 1166 int ret = 0, meta_level = appending;
1044 struct inode *inode = dentry->d_inode; 1167 struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
1089 } else { 1212 } else {
1090 saved_pos = *ppos; 1213 saved_pos = *ppos;
1091 } 1214 }
1215
1216 if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
1217 loff_t end = saved_pos + count;
1218
1219 /*
1220 * Skip the O_DIRECT checks if we don't need
1221 * them.
1222 */
1223 if (!direct_io || !(*direct_io))
1224 break;
1225
1226 /*
1227 * Allowing concurrent direct writes means
1228 * i_size changes wouldn't be synchronized, so
1229 * one node could wind up truncating another
1230 * nodes writes.
1231 */
1232 if (end > i_size_read(inode)) {
1233 *direct_io = 0;
1234 break;
1235 }
1236
1237 /*
1238 * We don't fill holes during direct io, so
1239 * check for them here. If any are found, the
1240 * caller will have to retake some cluster
1241 * locks and initiate the io as buffered.
1242 */
1243 ret = ocfs2_check_range_for_holes(inode, saved_pos,
1244 count);
1245 if (ret == 1) {
1246 *direct_io = 0;
1247 ret = 0;
1248 } else if (ret < 0)
1249 mlog_errno(ret);
1250 break;
1251 }
1252
1253 /*
1254 * The rest of this loop is concerned with legacy file
1255 * systems which don't support sparse files.
1256 */
1257
1092 newsize = count + saved_pos; 1258 newsize = count + saved_pos;
1093 1259
1094 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", 1260 mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
1141 return ret; 1307 return ret;
1142} 1308}
1143 1309
1310static inline void
1311ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
1312{
1313 const struct iovec *iov = *iovp;
1314 size_t base = *basep;
1315
1316 do {
1317 int copy = min(bytes, iov->iov_len - base);
1318
1319 bytes -= copy;
1320 base += copy;
1321 if (iov->iov_len == base) {
1322 iov++;
1323 base = 0;
1324 }
1325 } while (bytes);
1326 *iovp = iov;
1327 *basep = base;
1328}
1329
1330static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
1331 const struct iovec *cur_iov,
1332 size_t iov_offset)
1333{
1334 int ret;
1335 char *buf;
1336 struct page *src_page = NULL;
1337
1338 buf = cur_iov->iov_base + iov_offset;
1339
1340 if (!segment_eq(get_fs(), KERNEL_DS)) {
1341 /*
1342 * Pull in the user page. We want to do this outside
1343 * of the meta data locks in order to preserve locking
1344 * order in case of page fault.
1345 */
1346 ret = get_user_pages(current, current->mm,
1347 (unsigned long)buf & PAGE_CACHE_MASK, 1,
1348 0, 0, &src_page, NULL);
1349 if (ret == 1)
1350 bp->b_src_buf = kmap(src_page);
1351 else
1352 src_page = ERR_PTR(-EFAULT);
1353 } else {
1354 bp->b_src_buf = buf;
1355 }
1356
1357 return src_page;
1358}
1359
1360static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
1361 struct page *page)
1362{
1363 if (page) {
1364 kunmap(page);
1365 page_cache_release(page);
1366 }
1367}
1368
1369static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
1370 const struct iovec *iov,
1371 unsigned long nr_segs,
1372 size_t count,
1373 ssize_t o_direct_written)
1374{
1375 int ret = 0;
1376 ssize_t copied, total = 0;
1377 size_t iov_offset = 0;
1378 const struct iovec *cur_iov = iov;
1379 struct ocfs2_buffered_write_priv bp;
1380 struct page *page;
1381
1382 /*
1383 * handle partial DIO write. Adjust cur_iov if needed.
1384 */
1385 ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
1386
1387 do {
1388 bp.b_cur_off = iov_offset;
1389 bp.b_cur_iov = cur_iov;
1390
1391 page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
1392 if (IS_ERR(page)) {
1393 ret = PTR_ERR(page);
1394 goto out;
1395 }
1396
1397 copied = ocfs2_buffered_write_cluster(file, *ppos, count,
1398 ocfs2_map_and_write_user_data,
1399 &bp);
1400
1401 ocfs2_put_write_source(&bp, page);
1402
1403 if (copied < 0) {
1404 mlog_errno(copied);
1405 ret = copied;
1406 goto out;
1407 }
1408
1409 total += copied;
1410 *ppos = *ppos + copied;
1411 count -= copied;
1412
1413 ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
1414 } while(count);
1415
1416out:
1417 return total ? total : ret;
1418}
1419
1420static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
1421 unsigned long *nr_segs)
1422{
1423 size_t ocount; /* original count */
1424 unsigned long seg;
1425
1426 ocount = 0;
1427 for (seg = 0; seg < *nr_segs; seg++) {
1428 const struct iovec *iv = &iov[seg];
1429
1430 /*
1431 * If any segment has a negative length, or the cumulative
1432 * length ever wraps negative then return -EINVAL.
1433 */
1434 ocount += iv->iov_len;
1435 if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
1436 return -EINVAL;
1437 if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
1438 continue;
1439 if (seg == 0)
1440 return -EFAULT;
1441 *nr_segs = seg;
1442 ocount -= iv->iov_len; /* This segment is no good */
1443 break;
1444 }
1445
1446 *counted = ocount;
1447 return 0;
1448}
1449
1144static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, 1450static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1145 const struct iovec *iov, 1451 const struct iovec *iov,
1146 unsigned long nr_segs, 1452 unsigned long nr_segs,
1147 loff_t pos) 1453 loff_t pos)
1148{ 1454{
1149 int ret, rw_level, have_alloc_sem = 0; 1455 int ret, direct_io, appending, rw_level, have_alloc_sem = 0;
1150 struct file *filp = iocb->ki_filp; 1456 int can_do_direct, sync = 0;
1151 struct inode *inode = filp->f_path.dentry->d_inode; 1457 ssize_t written = 0;
1152 int appending = filp->f_flags & O_APPEND ? 1 : 0; 1458 size_t ocount; /* original count */
1153 1459 size_t count; /* after file limit checks */
1154 mlog_entry("(0x%p, %u, '%.*s')\n", filp, 1460 loff_t *ppos = &iocb->ki_pos;
1461 struct file *file = iocb->ki_filp;
1462 struct inode *inode = file->f_path.dentry->d_inode;
1463
1464 mlog_entry("(0x%p, %u, '%.*s')\n", file,
1155 (unsigned int)nr_segs, 1465 (unsigned int)nr_segs,
1156 filp->f_path.dentry->d_name.len, 1466 file->f_path.dentry->d_name.len,
1157 filp->f_path.dentry->d_name.name); 1467 file->f_path.dentry->d_name.name);
1158 1468
1159 /* happy write of zero bytes */
1160 if (iocb->ki_left == 0) 1469 if (iocb->ki_left == 0)
1161 return 0; 1470 return 0;
1162 1471
1472 ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
1473 if (ret)
1474 return ret;
1475
1476 count = ocount;
1477
1478 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1479
1480 appending = file->f_flags & O_APPEND ? 1 : 0;
1481 direct_io = file->f_flags & O_DIRECT ? 1 : 0;
1482
1163 mutex_lock(&inode->i_mutex); 1483 mutex_lock(&inode->i_mutex);
1484
1485relock:
1164 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 1486 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
1165 if (filp->f_flags & O_DIRECT) { 1487 if (direct_io) {
1166 have_alloc_sem = 1;
1167 down_read(&inode->i_alloc_sem); 1488 down_read(&inode->i_alloc_sem);
1489 have_alloc_sem = 1;
1168 } 1490 }
1169 1491
1170 /* concurrent O_DIRECT writes are allowed */ 1492 /* concurrent O_DIRECT writes are allowed */
1171 rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; 1493 rw_level = !direct_io;
1172 ret = ocfs2_rw_lock(inode, rw_level); 1494 ret = ocfs2_rw_lock(inode, rw_level);
1173 if (ret < 0) { 1495 if (ret < 0) {
1174 rw_level = -1;
1175 mlog_errno(ret); 1496 mlog_errno(ret);
1176 goto out; 1497 goto out_sems;
1177 } 1498 }
1178 1499
1179 ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, 1500 can_do_direct = direct_io;
1180 iocb->ki_left, appending); 1501 ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
1502 iocb->ki_left, appending,
1503 &can_do_direct);
1181 if (ret < 0) { 1504 if (ret < 0) {
1182 mlog_errno(ret); 1505 mlog_errno(ret);
1183 goto out; 1506 goto out;
1184 } 1507 }
1185 1508
1186 /* communicate with ocfs2_dio_end_io */ 1509 /*
1187 ocfs2_iocb_set_rw_locked(iocb); 1510 * We can't complete the direct I/O as requested, fall back to
1511 * buffered I/O.
1512 */
1513 if (direct_io && !can_do_direct) {
1514 ocfs2_rw_unlock(inode, rw_level);
1515 up_read(&inode->i_alloc_sem);
1516
1517 have_alloc_sem = 0;
1518 rw_level = -1;
1188 1519
1189 ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); 1520 direct_io = 0;
1521 sync = 1;
1522 goto relock;
1523 }
1524
1525 if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
1526 sync = 1;
1527
1528 /*
1529 * XXX: Is it ok to execute these checks a second time?
1530 */
1531 ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
1532 if (ret)
1533 goto out;
1534
1535 /*
1536 * Set pos so that sync_page_range_nolock() below understands
1537 * where to start from. We might've moved it around via the
1538 * calls above. The range we want to actually sync starts from
1539 * *ppos here.
1540 *
1541 */
1542 pos = *ppos;
1543
1544 /* communicate with ocfs2_dio_end_io */
1545 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1546
1547 if (direct_io) {
1548 written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
1549 ppos, count, ocount);
1550 if (written < 0) {
1551 ret = written;
1552 goto out_dio;
1553 }
1554 } else {
1555 written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
1556 count, written);
1557 if (written < 0) {
1558 ret = written;
1559 if (ret != -EFAULT || ret != -ENOSPC)
1560 mlog_errno(ret);
1561 goto out;
1562 }
1563 }
1190 1564
1565out_dio:
1191 /* buffered aio wouldn't have proper lock coverage today */ 1566 /* buffered aio wouldn't have proper lock coverage today */
1192 BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); 1567 BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
1193 1568
1194 /* 1569 /*
1195 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 1570 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
1207 } 1582 }
1208 1583
1209out: 1584out:
1585 if (rw_level != -1)
1586 ocfs2_rw_unlock(inode, rw_level);
1587
1588out_sems:
1210 if (have_alloc_sem) 1589 if (have_alloc_sem)
1211 up_read(&inode->i_alloc_sem); 1590 up_read(&inode->i_alloc_sem);
1212 if (rw_level != -1) 1591
1213 ocfs2_rw_unlock(inode, rw_level); 1592 if (written > 0 && sync) {
1593 ssize_t err;
1594
1595 err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
1596 if (err < 0)
1597 written = err;
1598 }
1599
1214 mutex_unlock(&inode->i_mutex); 1600 mutex_unlock(&inode->i_mutex);
1215 1601
1216 mlog_exit(ret); 1602 mlog_exit(ret);
1603 return written ? written : ret;
1604}
1605
1606static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
1607 struct pipe_buffer *buf,
1608 struct splice_desc *sd)
1609{
1610 int ret, count, total = 0;
1611 ssize_t copied = 0;
1612 struct ocfs2_splice_write_priv sp;
1613
1614 ret = buf->ops->pin(pipe, buf);
1615 if (ret)
1616 goto out;
1617
1618 sp.s_sd = sd;
1619 sp.s_buf = buf;
1620 sp.s_pipe = pipe;
1621 sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
1622 sp.s_buf_offset = buf->offset;
1623
1624 count = sd->len;
1625 if (count + sp.s_offset > PAGE_CACHE_SIZE)
1626 count = PAGE_CACHE_SIZE - sp.s_offset;
1627
1628 do {
1629 /*
1630 * splice wants us to copy up to one page at a
1631 * time. For pagesize > cluster size, this means we
1632 * might enter ocfs2_buffered_write_cluster() more
1633 * than once, so keep track of our progress here.
1634 */
1635 copied = ocfs2_buffered_write_cluster(sd->file,
1636 (loff_t)sd->pos + total,
1637 count,
1638 ocfs2_map_and_write_splice_data,
1639 &sp);
1640 if (copied < 0) {
1641 mlog_errno(copied);
1642 ret = copied;
1643 goto out;
1644 }
1645
1646 count -= copied;
1647 sp.s_offset += copied;
1648 sp.s_buf_offset += copied;
1649 total += copied;
1650 } while (count);
1651
1652 ret = 0;
1653out:
1654
1655 return total ? total : ret;
1656}
1657
1658static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1659 struct file *out,
1660 loff_t *ppos,
1661 size_t len,
1662 unsigned int flags)
1663{
1664 int ret, err;
1665 struct address_space *mapping = out->f_mapping;
1666 struct inode *inode = mapping->host;
1667
1668 ret = __splice_from_pipe(pipe, out, ppos, len, flags,
1669 ocfs2_splice_write_actor);
1670 if (ret > 0) {
1671 *ppos += ret;
1672
1673 if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
1674 err = generic_osync_inode(inode, mapping,
1675 OSYNC_METADATA|OSYNC_DATA);
1676 if (err)
1677 ret = err;
1678 }
1679 }
1680
1217 return ret; 1681 return ret;
1218} 1682}
1219 1683
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1239 goto out; 1703 goto out;
1240 } 1704 }
1241 1705
1242 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); 1706 ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
1707 NULL);
1243 if (ret < 0) { 1708 if (ret < 0) {
1244 mlog_errno(ret); 1709 mlog_errno(ret);
1245 goto out_unlock; 1710 goto out_unlock;
1246 } 1711 }
1247 1712
1248 /* ok, we're done with i_size and alloc work */ 1713 /* ok, we're done with i_size and alloc work */
1249 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1714 ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
1250 1715
1251out_unlock: 1716out_unlock:
1252 ocfs2_rw_unlock(inode, 1); 1717 ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
1323 } 1788 }
1324 rw_level = 0; 1789 rw_level = 0;
1325 /* communicate with ocfs2_dio_end_io */ 1790 /* communicate with ocfs2_dio_end_io */
1326 ocfs2_iocb_set_rw_locked(iocb); 1791 ocfs2_iocb_set_rw_locked(iocb, rw_level);
1327 } 1792 }
1328 1793
1329 /* 1794 /*