diff options
Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r-- | fs/ocfs2/file.c | 637 |
1 files changed, 551 insertions, 86 deletions
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index f2cd3bf9efb2..520a2a6d7670 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/pipe_fs_i.h> | 34 | #include <linux/pipe_fs_i.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | ||
36 | 37 | ||
37 | #define MLOG_MASK_PREFIX ML_INODE | 38 | #define MLOG_MASK_PREFIX ML_INODE |
38 | #include <cluster/masklog.h> | 39 | #include <cluster/masklog.h> |
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle, | |||
215 | 216 | ||
216 | mlog_entry_void(); | 217 | mlog_entry_void(); |
217 | i_size_write(inode, new_i_size); | 218 | i_size_write(inode, new_i_size); |
218 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | 219 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
219 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 220 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
220 | 221 | ||
221 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 222 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
261 | { | 262 | { |
262 | int status; | 263 | int status; |
263 | handle_t *handle; | 264 | handle_t *handle; |
265 | struct ocfs2_dinode *di; | ||
264 | 266 | ||
265 | mlog_entry_void(); | 267 | mlog_entry_void(); |
266 | 268 | ||
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | |||
274 | goto out; | 276 | goto out; |
275 | } | 277 | } |
276 | 278 | ||
277 | status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size); | 279 | status = ocfs2_journal_access(handle, inode, fe_bh, |
280 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
281 | if (status < 0) { | ||
282 | mlog_errno(status); | ||
283 | goto out_commit; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * Do this before setting i_size. | ||
288 | */ | ||
289 | status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size); | ||
290 | if (status) { | ||
291 | mlog_errno(status); | ||
292 | goto out_commit; | ||
293 | } | ||
294 | |||
295 | i_size_write(inode, new_i_size); | ||
296 | inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size); | ||
297 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | ||
298 | |||
299 | di = (struct ocfs2_dinode *) fe_bh->b_data; | ||
300 | di->i_size = cpu_to_le64(new_i_size); | ||
301 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); | ||
302 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | ||
303 | |||
304 | status = ocfs2_journal_dirty(handle, fe_bh); | ||
278 | if (status < 0) | 305 | if (status < 0) |
279 | mlog_errno(status); | 306 | mlog_errno(status); |
280 | 307 | ||
308 | out_commit: | ||
281 | ocfs2_commit_trans(osb, handle); | 309 | ocfs2_commit_trans(osb, handle); |
282 | out: | 310 | out: |
311 | |||
283 | mlog_exit(status); | 312 | mlog_exit(status); |
284 | return status; | 313 | return status; |
285 | } | 314 | } |
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
342 | mlog_errno(status); | 371 | mlog_errno(status); |
343 | goto bail; | 372 | goto bail; |
344 | } | 373 | } |
345 | ocfs2_data_unlock(inode, 1); | ||
346 | |||
347 | if (le32_to_cpu(fe->i_clusters) == | ||
348 | ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { | ||
349 | mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", | ||
350 | fe->i_clusters); | ||
351 | /* No allocation change is required, so lets fast path | ||
352 | * this truncate. */ | ||
353 | status = ocfs2_simple_size_update(inode, di_bh, new_i_size); | ||
354 | if (status < 0) | ||
355 | mlog_errno(status); | ||
356 | goto bail; | ||
357 | } | ||
358 | 374 | ||
359 | /* alright, we're going to need to do a full blown alloc size | 375 | /* alright, we're going to need to do a full blown alloc size |
360 | * change. Orphan the inode so that recovery can complete the | 376 | * change. Orphan the inode so that recovery can complete the |
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode, | |||
363 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 379 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
364 | if (status < 0) { | 380 | if (status < 0) { |
365 | mlog_errno(status); | 381 | mlog_errno(status); |
366 | goto bail; | 382 | goto bail_unlock_data; |
367 | } | 383 | } |
368 | 384 | ||
369 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 385 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
370 | if (status < 0) { | 386 | if (status < 0) { |
371 | mlog_errno(status); | 387 | mlog_errno(status); |
372 | goto bail; | 388 | goto bail_unlock_data; |
373 | } | 389 | } |
374 | 390 | ||
375 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 391 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
376 | if (status < 0) { | 392 | if (status < 0) { |
377 | mlog_errno(status); | 393 | mlog_errno(status); |
378 | goto bail; | 394 | goto bail_unlock_data; |
379 | } | 395 | } |
380 | 396 | ||
381 | /* TODO: orphan dir cleanup here. */ | 397 | /* TODO: orphan dir cleanup here. */ |
398 | bail_unlock_data: | ||
399 | ocfs2_data_unlock(inode, 1); | ||
400 | |||
382 | bail: | 401 | bail: |
383 | 402 | ||
384 | mlog_exit(status); | 403 | mlog_exit(status); |
@@ -397,6 +416,7 @@ bail: | |||
397 | */ | 416 | */ |
398 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | 417 | int ocfs2_do_extend_allocation(struct ocfs2_super *osb, |
399 | struct inode *inode, | 418 | struct inode *inode, |
419 | u32 *logical_offset, | ||
400 | u32 clusters_to_add, | 420 | u32 clusters_to_add, |
401 | struct buffer_head *fe_bh, | 421 | struct buffer_head *fe_bh, |
402 | handle_t *handle, | 422 | handle_t *handle, |
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
460 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); | 480 | block = ocfs2_clusters_to_blocks(osb->sb, bit_off); |
461 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", | 481 | mlog(0, "Allocating %u clusters at block %u for inode %llu\n", |
462 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); | 482 | num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); |
463 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block, | 483 | status = ocfs2_insert_extent(osb, handle, inode, fe_bh, |
464 | num_bits, meta_ac); | 484 | *logical_offset, block, num_bits, |
485 | meta_ac); | ||
465 | if (status < 0) { | 486 | if (status < 0) { |
466 | mlog_errno(status); | 487 | mlog_errno(status); |
467 | goto leave; | 488 | goto leave; |
468 | } | 489 | } |
469 | 490 | ||
470 | le32_add_cpu(&fe->i_clusters, num_bits); | ||
471 | spin_lock(&OCFS2_I(inode)->ip_lock); | ||
472 | OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); | ||
473 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
474 | |||
475 | status = ocfs2_journal_dirty(handle, fe_bh); | 491 | status = ocfs2_journal_dirty(handle, fe_bh); |
476 | if (status < 0) { | 492 | if (status < 0) { |
477 | mlog_errno(status); | 493 | mlog_errno(status); |
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb, | |||
479 | } | 495 | } |
480 | 496 | ||
481 | clusters_to_add -= num_bits; | 497 | clusters_to_add -= num_bits; |
498 | *logical_offset += num_bits; | ||
482 | 499 | ||
483 | if (clusters_to_add) { | 500 | if (clusters_to_add) { |
484 | mlog(0, "need to alloc once more, clusters = %u, wanted = " | 501 | mlog(0, "need to alloc once more, clusters = %u, wanted = " |
@@ -494,14 +511,87 @@ leave: | |||
494 | return status; | 511 | return status; |
495 | } | 512 | } |
496 | 513 | ||
514 | /* | ||
515 | * For a given allocation, determine which allocators will need to be | ||
516 | * accessed, and lock them, reserving the appropriate number of bits. | ||
517 | * | ||
518 | * Called from ocfs2_extend_allocation() for file systems which don't | ||
519 | * support holes, and from ocfs2_write() for file systems which | ||
520 | * understand sparse inodes. | ||
521 | */ | ||
522 | int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, | ||
523 | u32 clusters_to_add, | ||
524 | struct ocfs2_alloc_context **data_ac, | ||
525 | struct ocfs2_alloc_context **meta_ac) | ||
526 | { | ||
527 | int ret, num_free_extents; | ||
528 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
529 | |||
530 | *meta_ac = NULL; | ||
531 | *data_ac = NULL; | ||
532 | |||
533 | mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " | ||
534 | "clusters_to_add = %u\n", | ||
535 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
536 | le32_to_cpu(di->i_clusters), clusters_to_add); | ||
537 | |||
538 | num_free_extents = ocfs2_num_free_extents(osb, inode, di); | ||
539 | if (num_free_extents < 0) { | ||
540 | ret = num_free_extents; | ||
541 | mlog_errno(ret); | ||
542 | goto out; | ||
543 | } | ||
544 | |||
545 | /* | ||
546 | * Sparse allocation file systems need to be more conservative | ||
547 | * with reserving room for expansion - the actual allocation | ||
548 | * happens while we've got a journal handle open so re-taking | ||
549 | * a cluster lock (because we ran out of room for another | ||
550 | * extent) will violate ordering rules. | ||
551 | * | ||
552 | * Most of the time we'll only be seeing this 1 cluster at a time | ||
553 | * anyway. | ||
554 | */ | ||
555 | if (!num_free_extents || | ||
556 | (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) { | ||
557 | ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); | ||
558 | if (ret < 0) { | ||
559 | if (ret != -ENOSPC) | ||
560 | mlog_errno(ret); | ||
561 | goto out; | ||
562 | } | ||
563 | } | ||
564 | |||
565 | ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); | ||
566 | if (ret < 0) { | ||
567 | if (ret != -ENOSPC) | ||
568 | mlog_errno(ret); | ||
569 | goto out; | ||
570 | } | ||
571 | |||
572 | out: | ||
573 | if (ret) { | ||
574 | if (*meta_ac) { | ||
575 | ocfs2_free_alloc_context(*meta_ac); | ||
576 | *meta_ac = NULL; | ||
577 | } | ||
578 | |||
579 | /* | ||
580 | * We cannot have an error and a non null *data_ac. | ||
581 | */ | ||
582 | } | ||
583 | |||
584 | return ret; | ||
585 | } | ||
586 | |||
497 | static int ocfs2_extend_allocation(struct inode *inode, | 587 | static int ocfs2_extend_allocation(struct inode *inode, |
498 | u32 clusters_to_add) | 588 | u32 clusters_to_add) |
499 | { | 589 | { |
500 | int status = 0; | 590 | int status = 0; |
501 | int restart_func = 0; | 591 | int restart_func = 0; |
502 | int drop_alloc_sem = 0; | 592 | int drop_alloc_sem = 0; |
503 | int credits, num_free_extents; | 593 | int credits; |
504 | u32 prev_clusters; | 594 | u32 prev_clusters, logical_start; |
505 | struct buffer_head *bh = NULL; | 595 | struct buffer_head *bh = NULL; |
506 | struct ocfs2_dinode *fe = NULL; | 596 | struct ocfs2_dinode *fe = NULL; |
507 | handle_t *handle = NULL; | 597 | handle_t *handle = NULL; |
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
512 | 602 | ||
513 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); | 603 | mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); |
514 | 604 | ||
605 | /* | ||
606 | * This function only exists for file systems which don't | ||
607 | * support holes. | ||
608 | */ | ||
609 | BUG_ON(ocfs2_sparse_alloc(osb)); | ||
610 | |||
515 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, | 611 | status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, |
516 | OCFS2_BH_CACHED, inode); | 612 | OCFS2_BH_CACHED, inode); |
517 | if (status < 0) { | 613 | if (status < 0) { |
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode, | |||
526 | goto leave; | 622 | goto leave; |
527 | } | 623 | } |
528 | 624 | ||
625 | logical_start = OCFS2_I(inode)->ip_clusters; | ||
626 | |||
529 | restart_all: | 627 | restart_all: |
530 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); | 628 | BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); |
531 | 629 | ||
532 | mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, " | ||
533 | "clusters_to_add = %u\n", | ||
534 | (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode), | ||
535 | fe->i_clusters, clusters_to_add); | ||
536 | |||
537 | num_free_extents = ocfs2_num_free_extents(osb, | ||
538 | inode, | ||
539 | fe); | ||
540 | if (num_free_extents < 0) { | ||
541 | status = num_free_extents; | ||
542 | mlog_errno(status); | ||
543 | goto leave; | ||
544 | } | ||
545 | |||
546 | if (!num_free_extents) { | ||
547 | status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); | ||
548 | if (status < 0) { | ||
549 | if (status != -ENOSPC) | ||
550 | mlog_errno(status); | ||
551 | goto leave; | ||
552 | } | ||
553 | } | ||
554 | |||
555 | status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac); | ||
556 | if (status < 0) { | ||
557 | if (status != -ENOSPC) | ||
558 | mlog_errno(status); | ||
559 | goto leave; | ||
560 | } | ||
561 | |||
562 | /* blocks people in read/write from reading our allocation | 630 | /* blocks people in read/write from reading our allocation |
563 | * until we're done changing it. We depend on i_mutex to block | 631 | * until we're done changing it. We depend on i_mutex to block |
564 | * other extend/truncate calls while we're here. Ordering wrt | 632 | * other extend/truncate calls while we're here. Ordering wrt |
@@ -566,6 +634,13 @@ restart_all: | |||
566 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 634 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
567 | drop_alloc_sem = 1; | 635 | drop_alloc_sem = 1; |
568 | 636 | ||
637 | status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac, | ||
638 | &meta_ac); | ||
639 | if (status) { | ||
640 | mlog_errno(status); | ||
641 | goto leave; | ||
642 | } | ||
643 | |||
569 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); | 644 | credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); |
570 | handle = ocfs2_start_trans(osb, credits); | 645 | handle = ocfs2_start_trans(osb, credits); |
571 | if (IS_ERR(handle)) { | 646 | if (IS_ERR(handle)) { |
@@ -590,6 +665,7 @@ restarted_transaction: | |||
590 | 665 | ||
591 | status = ocfs2_do_extend_allocation(osb, | 666 | status = ocfs2_do_extend_allocation(osb, |
592 | inode, | 667 | inode, |
668 | &logical_start, | ||
593 | clusters_to_add, | 669 | clusters_to_add, |
594 | bh, | 670 | bh, |
595 | handle, | 671 | handle, |
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
778 | size_t tail_to_skip) | 854 | size_t tail_to_skip) |
779 | { | 855 | { |
780 | int ret = 0; | 856 | int ret = 0; |
781 | u32 clusters_to_add; | 857 | u32 clusters_to_add = 0; |
782 | 858 | ||
783 | BUG_ON(!tail_to_skip && !di_bh); | 859 | BUG_ON(!tail_to_skip && !di_bh); |
784 | 860 | ||
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode, | |||
790 | goto out; | 866 | goto out; |
791 | BUG_ON(new_i_size < i_size_read(inode)); | 867 | BUG_ON(new_i_size < i_size_read(inode)); |
792 | 868 | ||
869 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
870 | BUG_ON(tail_to_skip != 0); | ||
871 | goto out_update_size; | ||
872 | } | ||
873 | |||
793 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - | 874 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - |
794 | OCFS2_I(inode)->ip_clusters; | 875 | OCFS2_I(inode)->ip_clusters; |
795 | 876 | ||
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode, | |||
825 | goto out_unlock; | 906 | goto out_unlock; |
826 | } | 907 | } |
827 | 908 | ||
909 | out_update_size: | ||
828 | if (!tail_to_skip) { | 910 | if (!tail_to_skip) { |
829 | /* We're being called from ocfs2_setattr() which wants | 911 | /* We're being called from ocfs2_setattr() which wants |
830 | * us to update i_size */ | 912 | * us to update i_size */ |
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode, | |||
834 | } | 916 | } |
835 | 917 | ||
836 | out_unlock: | 918 | out_unlock: |
837 | ocfs2_data_unlock(inode, 1); | 919 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) |
920 | ocfs2_data_unlock(inode, 1); | ||
838 | 921 | ||
839 | out: | 922 | out: |
840 | return ret; | 923 | return ret; |
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd) | |||
972 | 1055 | ||
973 | ret = ocfs2_meta_lock(inode, NULL, 0); | 1056 | ret = ocfs2_meta_lock(inode, NULL, 0); |
974 | if (ret) { | 1057 | if (ret) { |
975 | mlog_errno(ret); | 1058 | if (ret != -ENOENT) |
1059 | mlog_errno(ret); | ||
976 | goto out; | 1060 | goto out; |
977 | } | 1061 | } |
978 | 1062 | ||
@@ -1035,10 +1119,49 @@ out: | |||
1035 | return ret; | 1119 | return ret; |
1036 | } | 1120 | } |
1037 | 1121 | ||
1122 | /* | ||
1123 | * Will look for holes and unwritten extents in the range starting at | ||
1124 | * pos for count bytes (inclusive). | ||
1125 | */ | ||
1126 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | ||
1127 | size_t count) | ||
1128 | { | ||
1129 | int ret = 0; | ||
1130 | unsigned int extent_flags; | ||
1131 | u32 cpos, clusters, extent_len, phys_cpos; | ||
1132 | struct super_block *sb = inode->i_sb; | ||
1133 | |||
1134 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | ||
1135 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | ||
1136 | |||
1137 | while (clusters) { | ||
1138 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | ||
1139 | &extent_flags); | ||
1140 | if (ret < 0) { | ||
1141 | mlog_errno(ret); | ||
1142 | goto out; | ||
1143 | } | ||
1144 | |||
1145 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | ||
1146 | ret = 1; | ||
1147 | break; | ||
1148 | } | ||
1149 | |||
1150 | if (extent_len > clusters) | ||
1151 | extent_len = clusters; | ||
1152 | |||
1153 | clusters -= extent_len; | ||
1154 | cpos += extent_len; | ||
1155 | } | ||
1156 | out: | ||
1157 | return ret; | ||
1158 | } | ||
1159 | |||
1038 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1160 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1039 | loff_t *ppos, | 1161 | loff_t *ppos, |
1040 | size_t count, | 1162 | size_t count, |
1041 | int appending) | 1163 | int appending, |
1164 | int *direct_io) | ||
1042 | { | 1165 | { |
1043 | int ret = 0, meta_level = appending; | 1166 | int ret = 0, meta_level = appending; |
1044 | struct inode *inode = dentry->d_inode; | 1167 | struct inode *inode = dentry->d_inode; |
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | |||
1089 | } else { | 1212 | } else { |
1090 | saved_pos = *ppos; | 1213 | saved_pos = *ppos; |
1091 | } | 1214 | } |
1215 | |||
1216 | if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { | ||
1217 | loff_t end = saved_pos + count; | ||
1218 | |||
1219 | /* | ||
1220 | * Skip the O_DIRECT checks if we don't need | ||
1221 | * them. | ||
1222 | */ | ||
1223 | if (!direct_io || !(*direct_io)) | ||
1224 | break; | ||
1225 | |||
1226 | /* | ||
1227 | * Allowing concurrent direct writes means | ||
1228 | * i_size changes wouldn't be synchronized, so | ||
1229 | * one node could wind up truncating another | ||
1230 | * node's writes. | ||
1231 | */ | ||
1232 | if (end > i_size_read(inode)) { | ||
1233 | *direct_io = 0; | ||
1234 | break; | ||
1235 | } | ||
1236 | |||
1237 | /* | ||
1238 | * We don't fill holes during direct io, so | ||
1239 | * check for them here. If any are found, the | ||
1240 | * caller will have to retake some cluster | ||
1241 | * locks and initiate the io as buffered. | ||
1242 | */ | ||
1243 | ret = ocfs2_check_range_for_holes(inode, saved_pos, | ||
1244 | count); | ||
1245 | if (ret == 1) { | ||
1246 | *direct_io = 0; | ||
1247 | ret = 0; | ||
1248 | } else if (ret < 0) | ||
1249 | mlog_errno(ret); | ||
1250 | break; | ||
1251 | } | ||
1252 | |||
1253 | /* | ||
1254 | * The rest of this loop is concerned with legacy file | ||
1255 | * systems which don't support sparse files. | ||
1256 | */ | ||
1257 | |||
1092 | newsize = count + saved_pos; | 1258 | newsize = count + saved_pos; |
1093 | 1259 | ||
1094 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", | 1260 | mlog(0, "pos=%lld newsize=%lld cursize=%lld\n", |
@@ -1141,55 +1307,264 @@ out: | |||
1141 | return ret; | 1307 | return ret; |
1142 | } | 1308 | } |
1143 | 1309 | ||
/*
 * Advance an iovec cursor by @bytes.
 *
 * @iovp:  in/out, the segment the cursor currently points into
 * @basep: in/out, byte offset of the cursor within *iovp
 * @bytes: distance to advance
 *
 * The loop body always runs at least once, so a segment which is
 * already fully consumed (including a zero-length one) is stepped over
 * even when @bytes is 0.
 *
 * No bounds checking is done against the end of the iovec array; the
 * caller must not ask to advance past the data the array describes.
 *
 * The per-step amount is kept in a size_t. Storing it in an int
 * truncates (or turns negative) for segments longer than INT_MAX,
 * which corrupts both the remaining byte count and the offset.
 */
static inline void
ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
{
	const struct iovec *iov = *iovp;
	size_t base = *basep;

	do {
		size_t left = iov->iov_len - base;
		size_t copy = bytes < left ? bytes : left;

		bytes -= copy;
		base += copy;
		if (iov->iov_len == base) {
			/* Segment used up: move to the start of the next. */
			iov++;
			base = 0;
		}
	} while (bytes);

	*iovp = iov;
	*basep = base;
}
1329 | |||
1330 | static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp, | ||
1331 | const struct iovec *cur_iov, | ||
1332 | size_t iov_offset) | ||
1333 | { | ||
1334 | int ret; | ||
1335 | char *buf; | ||
1336 | struct page *src_page = NULL; | ||
1337 | |||
1338 | buf = cur_iov->iov_base + iov_offset; | ||
1339 | |||
1340 | if (!segment_eq(get_fs(), KERNEL_DS)) { | ||
1341 | /* | ||
1342 | * Pull in the user page. We want to do this outside | ||
1343 | * of the meta data locks in order to preserve locking | ||
1344 | * order in case of page fault. | ||
1345 | */ | ||
1346 | ret = get_user_pages(current, current->mm, | ||
1347 | (unsigned long)buf & PAGE_CACHE_MASK, 1, | ||
1348 | 0, 0, &src_page, NULL); | ||
1349 | if (ret == 1) | ||
1350 | bp->b_src_buf = kmap(src_page); | ||
1351 | else | ||
1352 | src_page = ERR_PTR(-EFAULT); | ||
1353 | } else { | ||
1354 | bp->b_src_buf = buf; | ||
1355 | } | ||
1356 | |||
1357 | return src_page; | ||
1358 | } | ||
1359 | |||
/*
 * Undo ocfs2_get_write_source(): unmap and drop the reference on the
 * pinned source page. A NULL page means there is nothing to release.
 * bp is accepted but not used.
 */
static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
				   struct page *page)
{
	if (!page)
		return;

	kunmap(page);
	page_cache_release(page);
}
1368 | |||
1369 | static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos, | ||
1370 | const struct iovec *iov, | ||
1371 | unsigned long nr_segs, | ||
1372 | size_t count, | ||
1373 | ssize_t o_direct_written) | ||
1374 | { | ||
1375 | int ret = 0; | ||
1376 | ssize_t copied, total = 0; | ||
1377 | size_t iov_offset = 0; | ||
1378 | const struct iovec *cur_iov = iov; | ||
1379 | struct ocfs2_buffered_write_priv bp; | ||
1380 | struct page *page; | ||
1381 | |||
1382 | /* | ||
1383 | * handle partial DIO write. Adjust cur_iov if needed. | ||
1384 | */ | ||
1385 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written); | ||
1386 | |||
1387 | do { | ||
1388 | bp.b_cur_off = iov_offset; | ||
1389 | bp.b_cur_iov = cur_iov; | ||
1390 | |||
1391 | page = ocfs2_get_write_source(&bp, cur_iov, iov_offset); | ||
1392 | if (IS_ERR(page)) { | ||
1393 | ret = PTR_ERR(page); | ||
1394 | goto out; | ||
1395 | } | ||
1396 | |||
1397 | copied = ocfs2_buffered_write_cluster(file, *ppos, count, | ||
1398 | ocfs2_map_and_write_user_data, | ||
1399 | &bp); | ||
1400 | |||
1401 | ocfs2_put_write_source(&bp, page); | ||
1402 | |||
1403 | if (copied < 0) { | ||
1404 | mlog_errno(copied); | ||
1405 | ret = copied; | ||
1406 | goto out; | ||
1407 | } | ||
1408 | |||
1409 | total += copied; | ||
1410 | *ppos = *ppos + copied; | ||
1411 | count -= copied; | ||
1412 | |||
1413 | ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied); | ||
1414 | } while(count); | ||
1415 | |||
1416 | out: | ||
1417 | return total ? total : ret; | ||
1418 | } | ||
1419 | |||
1420 | static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted, | ||
1421 | unsigned long *nr_segs) | ||
1422 | { | ||
1423 | size_t ocount; /* original count */ | ||
1424 | unsigned long seg; | ||
1425 | |||
1426 | ocount = 0; | ||
1427 | for (seg = 0; seg < *nr_segs; seg++) { | ||
1428 | const struct iovec *iv = &iov[seg]; | ||
1429 | |||
1430 | /* | ||
1431 | * If any segment has a negative length, or the cumulative | ||
1432 | * length ever wraps negative then return -EINVAL. | ||
1433 | */ | ||
1434 | ocount += iv->iov_len; | ||
1435 | if (unlikely((ssize_t)(ocount|iv->iov_len) < 0)) | ||
1436 | return -EINVAL; | ||
1437 | if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) | ||
1438 | continue; | ||
1439 | if (seg == 0) | ||
1440 | return -EFAULT; | ||
1441 | *nr_segs = seg; | ||
1442 | ocount -= iv->iov_len; /* This segment is no good */ | ||
1443 | break; | ||
1444 | } | ||
1445 | |||
1446 | *counted = ocount; | ||
1447 | return 0; | ||
1448 | } | ||
1449 | |||
1144 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1450 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1145 | const struct iovec *iov, | 1451 | const struct iovec *iov, |
1146 | unsigned long nr_segs, | 1452 | unsigned long nr_segs, |
1147 | loff_t pos) | 1453 | loff_t pos) |
1148 | { | 1454 | { |
1149 | int ret, rw_level, have_alloc_sem = 0; | 1455 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
1150 | struct file *filp = iocb->ki_filp; | 1456 | int can_do_direct, sync = 0; |
1151 | struct inode *inode = filp->f_path.dentry->d_inode; | 1457 | ssize_t written = 0; |
1152 | int appending = filp->f_flags & O_APPEND ? 1 : 0; | 1458 | size_t ocount; /* original count */ |
1153 | 1459 | size_t count; /* after file limit checks */ | |
1154 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1460 | loff_t *ppos = &iocb->ki_pos; |
1461 | struct file *file = iocb->ki_filp; | ||
1462 | struct inode *inode = file->f_path.dentry->d_inode; | ||
1463 | |||
1464 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | ||
1155 | (unsigned int)nr_segs, | 1465 | (unsigned int)nr_segs, |
1156 | filp->f_path.dentry->d_name.len, | 1466 | file->f_path.dentry->d_name.len, |
1157 | filp->f_path.dentry->d_name.name); | 1467 | file->f_path.dentry->d_name.name); |
1158 | 1468 | ||
1159 | /* happy write of zero bytes */ | ||
1160 | if (iocb->ki_left == 0) | 1469 | if (iocb->ki_left == 0) |
1161 | return 0; | 1470 | return 0; |
1162 | 1471 | ||
1472 | ret = ocfs2_check_iovec(iov, &ocount, &nr_segs); | ||
1473 | if (ret) | ||
1474 | return ret; | ||
1475 | |||
1476 | count = ocount; | ||
1477 | |||
1478 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | ||
1479 | |||
1480 | appending = file->f_flags & O_APPEND ? 1 : 0; | ||
1481 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | ||
1482 | |||
1163 | mutex_lock(&inode->i_mutex); | 1483 | mutex_lock(&inode->i_mutex); |
1484 | |||
1485 | relock: | ||
1164 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1486 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1165 | if (filp->f_flags & O_DIRECT) { | 1487 | if (direct_io) { |
1166 | have_alloc_sem = 1; | ||
1167 | down_read(&inode->i_alloc_sem); | 1488 | down_read(&inode->i_alloc_sem); |
1489 | have_alloc_sem = 1; | ||
1168 | } | 1490 | } |
1169 | 1491 | ||
1170 | /* concurrent O_DIRECT writes are allowed */ | 1492 | /* concurrent O_DIRECT writes are allowed */ |
1171 | rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1; | 1493 | rw_level = !direct_io; |
1172 | ret = ocfs2_rw_lock(inode, rw_level); | 1494 | ret = ocfs2_rw_lock(inode, rw_level); |
1173 | if (ret < 0) { | 1495 | if (ret < 0) { |
1174 | rw_level = -1; | ||
1175 | mlog_errno(ret); | 1496 | mlog_errno(ret); |
1176 | goto out; | 1497 | goto out_sems; |
1177 | } | 1498 | } |
1178 | 1499 | ||
1179 | ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos, | 1500 | can_do_direct = direct_io; |
1180 | iocb->ki_left, appending); | 1501 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
1502 | iocb->ki_left, appending, | ||
1503 | &can_do_direct); | ||
1181 | if (ret < 0) { | 1504 | if (ret < 0) { |
1182 | mlog_errno(ret); | 1505 | mlog_errno(ret); |
1183 | goto out; | 1506 | goto out; |
1184 | } | 1507 | } |
1185 | 1508 | ||
1186 | /* communicate with ocfs2_dio_end_io */ | 1509 | /* |
1187 | ocfs2_iocb_set_rw_locked(iocb); | 1510 | * We can't complete the direct I/O as requested, fall back to |
1511 | * buffered I/O. | ||
1512 | */ | ||
1513 | if (direct_io && !can_do_direct) { | ||
1514 | ocfs2_rw_unlock(inode, rw_level); | ||
1515 | up_read(&inode->i_alloc_sem); | ||
1516 | |||
1517 | have_alloc_sem = 0; | ||
1518 | rw_level = -1; | ||
1188 | 1519 | ||
1189 | ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos); | 1520 | direct_io = 0; |
1521 | sync = 1; | ||
1522 | goto relock; | ||
1523 | } | ||
1524 | |||
1525 | if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) | ||
1526 | sync = 1; | ||
1527 | |||
1528 | /* | ||
1529 | * XXX: Is it ok to execute these checks a second time? | ||
1530 | */ | ||
1531 | ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); | ||
1532 | if (ret) | ||
1533 | goto out; | ||
1534 | |||
1535 | /* | ||
1536 | * Set pos so that sync_page_range_nolock() below understands | ||
1537 | * where to start from. We might've moved it around via the | ||
1538 | * calls above. The range we want to actually sync starts from | ||
1539 | * *ppos here. | ||
1540 | * | ||
1541 | */ | ||
1542 | pos = *ppos; | ||
1543 | |||
1544 | /* communicate with ocfs2_dio_end_io */ | ||
1545 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | ||
1546 | |||
1547 | if (direct_io) { | ||
1548 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | ||
1549 | ppos, count, ocount); | ||
1550 | if (written < 0) { | ||
1551 | ret = written; | ||
1552 | goto out_dio; | ||
1553 | } | ||
1554 | } else { | ||
1555 | written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs, | ||
1556 | count, written); | ||
1557 | if (written < 0) { | ||
1558 | ret = written; | ||
1559 | if (ret != -EFAULT || ret != -ENOSPC) | ||
1560 | mlog_errno(ret); | ||
1561 | goto out; | ||
1562 | } | ||
1563 | } | ||
1190 | 1564 | ||
1565 | out_dio: | ||
1191 | /* buffered aio wouldn't have proper lock coverage today */ | 1566 | /* buffered aio wouldn't have proper lock coverage today */ |
1192 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 1567 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1193 | 1568 | ||
1194 | /* | 1569 | /* |
1195 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1570 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | |||
1207 | } | 1582 | } |
1208 | 1583 | ||
1209 | out: | 1584 | out: |
1585 | if (rw_level != -1) | ||
1586 | ocfs2_rw_unlock(inode, rw_level); | ||
1587 | |||
1588 | out_sems: | ||
1210 | if (have_alloc_sem) | 1589 | if (have_alloc_sem) |
1211 | up_read(&inode->i_alloc_sem); | 1590 | up_read(&inode->i_alloc_sem); |
1212 | if (rw_level != -1) | 1591 | |
1213 | ocfs2_rw_unlock(inode, rw_level); | 1592 | if (written > 0 && sync) { |
1593 | ssize_t err; | ||
1594 | |||
1595 | err = sync_page_range_nolock(inode, file->f_mapping, pos, count); | ||
1596 | if (err < 0) | ||
1597 | written = err; | ||
1598 | } | ||
1599 | |||
1214 | mutex_unlock(&inode->i_mutex); | 1600 | mutex_unlock(&inode->i_mutex); |
1215 | 1601 | ||
1216 | mlog_exit(ret); | 1602 | mlog_exit(ret); |
1603 | return written ? written : ret; | ||
1604 | } | ||
1605 | |||
1606 | static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe, | ||
1607 | struct pipe_buffer *buf, | ||
1608 | struct splice_desc *sd) | ||
1609 | { | ||
1610 | int ret, count, total = 0; | ||
1611 | ssize_t copied = 0; | ||
1612 | struct ocfs2_splice_write_priv sp; | ||
1613 | |||
1614 | ret = buf->ops->pin(pipe, buf); | ||
1615 | if (ret) | ||
1616 | goto out; | ||
1617 | |||
1618 | sp.s_sd = sd; | ||
1619 | sp.s_buf = buf; | ||
1620 | sp.s_pipe = pipe; | ||
1621 | sp.s_offset = sd->pos & ~PAGE_CACHE_MASK; | ||
1622 | sp.s_buf_offset = buf->offset; | ||
1623 | |||
1624 | count = sd->len; | ||
1625 | if (count + sp.s_offset > PAGE_CACHE_SIZE) | ||
1626 | count = PAGE_CACHE_SIZE - sp.s_offset; | ||
1627 | |||
1628 | do { | ||
1629 | /* | ||
1630 | * splice wants us to copy up to one page at a | ||
1631 | * time. For pagesize > cluster size, this means we | ||
1632 | * might enter ocfs2_buffered_write_cluster() more | ||
1633 | * than once, so keep track of our progress here. | ||
1634 | */ | ||
1635 | copied = ocfs2_buffered_write_cluster(sd->file, | ||
1636 | (loff_t)sd->pos + total, | ||
1637 | count, | ||
1638 | ocfs2_map_and_write_splice_data, | ||
1639 | &sp); | ||
1640 | if (copied < 0) { | ||
1641 | mlog_errno(copied); | ||
1642 | ret = copied; | ||
1643 | goto out; | ||
1644 | } | ||
1645 | |||
1646 | count -= copied; | ||
1647 | sp.s_offset += copied; | ||
1648 | sp.s_buf_offset += copied; | ||
1649 | total += copied; | ||
1650 | } while (count); | ||
1651 | |||
1652 | ret = 0; | ||
1653 | out: | ||
1654 | |||
1655 | return total ? total : ret; | ||
1656 | } | ||
1657 | |||
1658 | static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe, | ||
1659 | struct file *out, | ||
1660 | loff_t *ppos, | ||
1661 | size_t len, | ||
1662 | unsigned int flags) | ||
1663 | { | ||
1664 | int ret, err; | ||
1665 | struct address_space *mapping = out->f_mapping; | ||
1666 | struct inode *inode = mapping->host; | ||
1667 | |||
1668 | ret = __splice_from_pipe(pipe, out, ppos, len, flags, | ||
1669 | ocfs2_splice_write_actor); | ||
1670 | if (ret > 0) { | ||
1671 | *ppos += ret; | ||
1672 | |||
1673 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | ||
1674 | err = generic_osync_inode(inode, mapping, | ||
1675 | OSYNC_METADATA|OSYNC_DATA); | ||
1676 | if (err) | ||
1677 | ret = err; | ||
1678 | } | ||
1679 | } | ||
1680 | |||
1217 | return ret; | 1681 | return ret; |
1218 | } | 1682 | } |
1219 | 1683 | ||
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
1239 | goto out; | 1703 | goto out; |
1240 | } | 1704 | } |
1241 | 1705 | ||
1242 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0); | 1706 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
1707 | NULL); | ||
1243 | if (ret < 0) { | 1708 | if (ret < 0) { |
1244 | mlog_errno(ret); | 1709 | mlog_errno(ret); |
1245 | goto out_unlock; | 1710 | goto out_unlock; |
1246 | } | 1711 | } |
1247 | 1712 | ||
1248 | /* ok, we're done with i_size and alloc work */ | 1713 | /* ok, we're done with i_size and alloc work */ |
1249 | ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); | 1714 | ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags); |
1250 | 1715 | ||
1251 | out_unlock: | 1716 | out_unlock: |
1252 | ocfs2_rw_unlock(inode, 1); | 1717 | ocfs2_rw_unlock(inode, 1); |
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | |||
1323 | } | 1788 | } |
1324 | rw_level = 0; | 1789 | rw_level = 0; |
1325 | /* communicate with ocfs2_dio_end_io */ | 1790 | /* communicate with ocfs2_dio_end_io */ |
1326 | ocfs2_iocb_set_rw_locked(iocb); | 1791 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
1327 | } | 1792 | } |
1328 | 1793 | ||
1329 | /* | 1794 | /* |