Diffstat (limited to 'fs/ocfs2/file.c')
-rw-r--r--   fs/ocfs2/file.c   702
1 file changed, 601 insertions(+), 101 deletions(-)
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4979b6675717..f04c7aa834cb 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -263,6 +263,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	int status;
 	handle_t *handle;
 	struct ocfs2_dinode *di;
+	u64 cluster_bytes;
 
 	mlog_entry_void();
 
@@ -286,7 +287,9 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 	/*
 	 * Do this before setting i_size.
 	 */
-	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size,
+					       cluster_bytes);
 	if (status) {
 		mlog_errno(status);
 		goto out_commit;
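
For illustration only, not part of the patch: the effect of the cluster alignment added above, assuming a hypothetical 64 KB cluster size.

/* Hypothetical numbers: with osb->s_clustersize == 65536 and
 * new_i_size == 100000, ocfs2_align_bytes_to_clusters() rounds up to
 * 131072, so ocfs2_zero_range_for_truncate() zeroes bytes
 * 100000..131071 -- the unused tail of the last cluster kept by the
 * truncate. */
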
@@ -326,9 +329,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 	     (unsigned long long)new_i_size);
 
-	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
-	truncate_inode_pages(inode->i_mapping, new_i_size);
-
 	fe = (struct ocfs2_dinode *) di_bh->b_data;
 	if (!OCFS2_IS_VALID_DINODE(fe)) {
 		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
@@ -363,16 +363,23 @@ static int ocfs2_truncate_file(struct inode *inode,
 	if (new_i_size == le64_to_cpu(fe->i_size))
 		goto bail;
 
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 	/* This forces other nodes to sync and drop their pages. Do
 	 * this even if we have a truncate without allocation change -
 	 * ocfs2 cluster sizes can be much greater than page size, so
 	 * we have to truncate them anyway. */
 	status = ocfs2_data_lock(inode, 1);
 	if (status < 0) {
+		up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 		mlog_errno(status);
 		goto bail;
 	}
 
+	unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1);
+	truncate_inode_pages(inode->i_mapping, new_i_size);
+
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
 	 * truncate if necessary. This does the task of marking
@@ -399,6 +406,8 @@ static int ocfs2_truncate_file(struct inode *inode,
 bail_unlock_data:
 	ocfs2_data_unlock(inode, 1);
 
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
 bail:
 
 	mlog_exit(status);
@@ -419,6 +428,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
 			       u32 *logical_offset,
 			       u32 clusters_to_add,
+			       int mark_unwritten,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
@@ -431,9 +441,13 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
 	u64 block;
+	u8 flags = 0;
 
 	BUG_ON(!clusters_to_add);
 
+	if (mark_unwritten)
+		flags = OCFS2_EXT_UNWRITTEN;
+
 	free_extents = ocfs2_num_free_extents(osb, inode, fe);
 	if (free_extents < 0) {
 		status = free_extents;
@@ -483,7 +497,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
 	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
 				     *logical_offset, block, num_bits,
-				     meta_ac);
+				     flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
@@ -516,25 +530,31 @@ leave:
  * For a given allocation, determine which allocators will need to be
  * accessed, and lock them, reserving the appropriate number of bits.
  *
- * Called from ocfs2_extend_allocation() for file systems which don't
- * support holes, and from ocfs2_write() for file systems which
- * understand sparse inodes.
+ * Sparse file systems call this from ocfs2_write_begin_nolock()
+ * and ocfs2_allocate_unwritten_extents().
+ *
+ * File systems which don't support holes call this from
+ * ocfs2_extend_allocation().
  */
 int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
-			  u32 clusters_to_add,
+			  u32 clusters_to_add, u32 extents_to_split,
 			  struct ocfs2_alloc_context **data_ac,
 			  struct ocfs2_alloc_context **meta_ac)
 {
-	int ret, num_free_extents;
+	int ret = 0, num_free_extents;
+	unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	*meta_ac = NULL;
-	*data_ac = NULL;
+	if (data_ac)
+		*data_ac = NULL;
+
+	BUG_ON(clusters_to_add != 0 && data_ac == NULL);
 
 	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
+	     "clusters_to_add = %u, extents_to_split = %u\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     le32_to_cpu(di->i_clusters), clusters_to_add);
+	     le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split);
 
 	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
 	if (num_free_extents < 0) {
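
For illustration only, not part of the patch: why max_recs_needed reserves two extra extent records per requested split.

/* Splitting an unwritten extent somewhere in its middle replaces one
 * extent record with up to three (unwritten left remainder, newly
 * written middle, unwritten right remainder) -- at most two additional
 * records per split, hence the 2 * extents_to_split term above. */
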
@@ -552,9 +572,12 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 	 *
 	 * Most of the time we'll only be seeing this 1 cluster at a time
 	 * anyway.
+	 *
+	 * Always lock for any unwritten extents - we might want to
+	 * add blocks during a split.
 	 */
 	if (!num_free_extents ||
-	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) {
 		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
 		if (ret < 0) {
 			if (ret != -ENOSPC)
@@ -563,6 +586,9 @@ int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
 		}
 	}
 
+	if (clusters_to_add == 0)
+		goto out;
+
 	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
 	if (ret < 0) {
 		if (ret != -ENOSPC)
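
For illustration only, not part of the patch: the two call patterns the reworked signature serves, mirroring callers added elsewhere in this patch.

/* Extending a file: reserve data clusters plus whatever metadata the
 * extent insert might need. */
status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0,
			       &data_ac, &meta_ac);

/* Splitting an unwritten extent (e.g. while punching a hole): no new
 * data clusters, possibly extra extent records, so data_ac is NULL. */
ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
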
@@ -585,14 +611,13 @@ out:
 	return ret;
 }
 
-static int ocfs2_extend_allocation(struct inode *inode,
-				   u32 clusters_to_add)
+static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				     u32 clusters_to_add, int mark_unwritten)
 {
 	int status = 0;
 	int restart_func = 0;
-	int drop_alloc_sem = 0;
 	int credits;
-	u32 prev_clusters, logical_start;
+	u32 prev_clusters;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -607,7 +632,7 @@ static int ocfs2_extend_allocation(struct inode *inode,
 	 * This function only exists for file systems which don't
 	 * support holes.
 	 */
-	BUG_ON(ocfs2_sparse_alloc(osb));
+	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
 
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
@@ -623,19 +648,10 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
-	logical_start = OCFS2_I(inode)->ip_clusters;
-
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	/* blocks peope in read/write from reading our allocation
-	 * until we're done changing it. We depend on i_mutex to block
-	 * other extend/truncate calls while we're here. Ordering wrt
-	 * start_trans is important here -- always do it before! */
-	down_write(&OCFS2_I(inode)->ip_alloc_sem);
-	drop_alloc_sem = 1;
-
-	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac,
 				       &meta_ac);
 	if (status) {
 		mlog_errno(status);
@@ -668,6 +684,7 @@ restarted_transaction:
 					     inode,
 					     &logical_start,
 					     clusters_to_add,
+					     mark_unwritten,
 					     bh,
 					     handle,
 					     data_ac,
@@ -720,10 +737,6 @@ restarted_transaction:
 	     OCFS2_I(inode)->ip_clusters, i_size_read(inode));
 
 leave:
-	if (drop_alloc_sem) {
-		up_write(&OCFS2_I(inode)->ip_alloc_sem);
-		drop_alloc_sem = 0;
-	}
 	if (handle) {
 		ocfs2_commit_trans(osb, handle);
 		handle = NULL;
@@ -749,6 +762,25 @@ leave:
 	return status;
 }
 
+static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+				   u32 clusters_to_add, int mark_unwritten)
+{
+	int ret;
+
+	/*
+	 * The alloc sem blocks people in read/write from reading our
+	 * allocation until we're done changing it. We depend on
+	 * i_mutex to block other extend/truncate calls while we're
+	 * here.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ret = __ocfs2_extend_allocation(inode, logical_start, clusters_to_add,
+					mark_unwritten);
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	return ret;
+}
+
 /* Some parts of this taken from generic_cont_expand, which turned out
  * to be too fragile to do exactly what we need without us having to
  * worry about recursive locking in ->prepare_write() and
@@ -890,7 +922,9 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 	if (clusters_to_add) {
-		ret = ocfs2_extend_allocation(inode, clusters_to_add);
+		ret = ocfs2_extend_allocation(inode,
+					      OCFS2_I(inode)->ip_clusters,
+					      clusters_to_add, 0);
 		if (ret < 0) {
 			mlog_errno(ret);
 			goto out_unlock;
@@ -995,6 +1029,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
 		goto bail_unlock;
 	}
 
+	/*
+	 * This will intentionally not wind up calling vmtruncate(),
+	 * since all the work for a size change has been done above.
+	 * Otherwise, we could get into problems with truncate as
+	 * ip_alloc_sem is used there to protect against i_size
+	 * changes.
+	 */
 	status = inode_setattr(inode, attr);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1070,17 +1111,16 @@ out:
 	return ret;
 }
 
-static int ocfs2_write_remove_suid(struct inode *inode)
+static int __ocfs2_write_remove_suid(struct inode *inode,
+				     struct buffer_head *bh)
 {
 	int ret;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 	handle_t *handle;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_dinode *di;
 
 	mlog_entry("(Inode %llu, mode 0%o)\n",
-		   (unsigned long long)oi->ip_blkno, inode->i_mode);
+		   (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode);
 
 	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
 	if (handle == NULL) {
@@ -1089,17 +1129,11 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 		goto out;
 	}
 
-	ret = ocfs2_read_block(osb, oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_trans;
-	}
-
 	ret = ocfs2_journal_access(handle, inode, bh,
 				   OCFS2_JOURNAL_ACCESS_WRITE);
 	if (ret < 0) {
 		mlog_errno(ret);
-		goto out_bh;
+		goto out_trans;
 	}
 
 	inode->i_mode &= ~S_ISUID;
@@ -1112,8 +1146,7 @@ static int ocfs2_write_remove_suid(struct inode *inode)
 	ret = ocfs2_journal_dirty(handle, bh);
 	if (ret < 0)
 		mlog_errno(ret);
-out_bh:
-	brelse(bh);
+
 out_trans:
 	ocfs2_commit_trans(osb, handle);
 out:
@@ -1159,6 +1192,460 @@ out:
 	return ret;
 }
 
+static int ocfs2_write_remove_suid(struct inode *inode)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+			       oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = __ocfs2_write_remove_suid(inode, bh);
+out:
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Allocate enough extents to cover the region starting at byte offset
+ * start for len bytes. Existing extents are skipped, any extents
+ * added are marked as "unwritten".
+ */
+static int ocfs2_allocate_unwritten_extents(struct inode *inode,
+					    u64 start, u64 len)
+{
+	int ret;
+	u32 cpos, phys_cpos, clusters, alloc_size;
+
+	/*
+	 * We consider both start and len to be inclusive.
+	 */
+	cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len);
+	clusters -= cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * Hole or existing extent len can be arbitrary, so
+		 * cap it to our own allocation request.
+		 */
+		if (alloc_size > clusters)
+			alloc_size = clusters;
+
+		if (phys_cpos) {
+			/*
+			 * We already have an allocation at this
+			 * region so we can safely skip it.
+			 */
+			goto next;
+		}
+
+		ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1);
+		if (ret) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+
+next:
+		cpos += alloc_size;
+		clusters -= alloc_size;
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int __ocfs2_remove_inode_range(struct inode *inode,
+				      struct buffer_head *di_bh,
+				      u32 cpos, u32 phys_cpos, u32 len,
+				      struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	OCFS2_I(inode)->ip_clusters -= len;
+	di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
+/*
+ * Truncate a byte range, avoiding pages within partial clusters. This
+ * preserves those pages for the zeroing code to write to.
+ */
+static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start,
+					 u64 byte_len)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	loff_t start, end;
+	struct address_space *mapping = inode->i_mapping;
+
+	start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start);
+	end = byte_start + byte_len;
+	end = end & ~(osb->s_clustersize - 1);
+
+	if (start < end) {
+		unmap_mapping_range(mapping, start, end - start, 0);
+		truncate_inode_pages_range(mapping, start, end - 1);
+	}
+}
+
+static int ocfs2_zero_partial_clusters(struct inode *inode,
+				       u64 start, u64 len)
+{
+	int ret = 0;
+	u64 tmpend, end = start + len;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	unsigned int csize = osb->s_clustersize;
+	handle_t *handle;
+
+	/*
+	 * The "start" and "end" values are NOT necessarily part of
+	 * the range whose allocation is being deleted. Rather, this
+	 * is what the user passed in with the request. We must zero
+	 * partial clusters here. There's no need to worry about
+	 * physical allocation - the zeroing code knows to skip holes.
+	 */
+	mlog(0, "byte start: %llu, end: %llu\n",
+	     (unsigned long long)start, (unsigned long long)end);
+
+	/*
+	 * If both edges are on a cluster boundary then there's no
+	 * zeroing required as the region is part of the allocation to
+	 * be truncated.
+	 */
+	if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0)
+		goto out;
+
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (handle == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * We want to get the byte offset of the end of the 1st cluster.
+	 */
+	tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1));
+	if (tmpend > end)
+		tmpend = end;
+
+	mlog(0, "1st range: start: %llu, tmpend: %llu\n",
+	     (unsigned long long)start, (unsigned long long)tmpend);
+
+	ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend);
+	if (ret)
+		mlog_errno(ret);
+
+	if (tmpend < end) {
+		/*
+		 * This may make start and end equal, but the zeroing
+		 * code will skip any work in that case so there's no
+		 * need to catch it up here.
+		 */
+		start = end & ~(osb->s_clustersize - 1);
+
+		mlog(0, "2nd range: start: %llu, end: %llu\n",
+		     (unsigned long long)start, (unsigned long long)end);
+
+		ret = ocfs2_zero_range_for_truncate(inode, handle, start, end);
+		if (ret)
+			mlog_errno(ret);
+	}
+
+	ocfs2_commit_trans(osb, handle);
+out:
+	return ret;
+}
+
+static int ocfs2_remove_inode_range(struct inode *inode,
+				    struct buffer_head *di_bh, u64 byte_start,
+				    u64 byte_len)
+{
+	int ret = 0;
+	u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_cached_dealloc_ctxt dealloc;
+
+	ocfs2_init_dealloc_ctxt(&dealloc);
+
+	if (byte_len == 0)
+		return 0;
+
+	trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start);
+	trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits;
+	if (trunc_len >= trunc_start)
+		trunc_len -= trunc_start;
+	else
+		trunc_len = 0;
+
+	mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     (unsigned long long)byte_start,
+	     (unsigned long long)byte_len, trunc_start, trunc_len);
+
+	ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	cpos = trunc_start;
+	while (trunc_len) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos,
+					 &alloc_size, NULL);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (alloc_size > trunc_len)
+			alloc_size = trunc_len;
+
+		/* Only do work for non-holes */
+		if (phys_cpos != 0) {
+			ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
+							 phys_cpos, alloc_size,
+							 &dealloc);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+
+		cpos += alloc_size;
+		trunc_len -= alloc_size;
+	}
+
+	ocfs2_truncate_cluster_pages(inode, byte_start, byte_len);
+
+out:
+	ocfs2_schedule_truncate_log_flush(osb, 1);
+	ocfs2_run_deallocs(osb, &dealloc);
+
+	return ret;
+}
+
+/*
+ * Parts of this function taken from xfs_change_file_space()
+ */
+int ocfs2_change_file_space(struct file *file, unsigned int cmd,
+			    struct ocfs2_space_resv *sr)
+{
+	int ret;
+	s64 llen;
+	struct inode *inode = file->f_path.dentry->d_inode;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	handle_t *handle;
+	unsigned long long max_off = ocfs2_max_file_offset(inode->i_sb->s_blocksize_bits);
+
+	if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) &&
+	    !ocfs2_writes_unwritten_extents(osb))
+		return -ENOTTY;
+	else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) &&
+		 !ocfs2_sparse_alloc(osb))
+		return -ENOTTY;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	if (!(file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
+		return -EROFS;
+
+	mutex_lock(&inode->i_mutex);
+
+	/*
+	 * This prevents concurrent writes on other nodes
+	 */
+	ret = ocfs2_rw_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_rw_unlock;
+	}
+
+	if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
+		ret = -EPERM;
+		goto out_meta_unlock;
+	}
+
+	switch (sr->l_whence) {
+	case 0: /*SEEK_SET*/
+		break;
+	case 1: /*SEEK_CUR*/
+		sr->l_start += file->f_pos;
+		break;
+	case 2: /*SEEK_END*/
+		sr->l_start += i_size_read(inode);
+		break;
+	default:
+		ret = -EINVAL;
+		goto out_meta_unlock;
+	}
+	sr->l_whence = 0;
+
+	llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len;
+
+	if (sr->l_start < 0
+	    || sr->l_start > max_off
+	    || (sr->l_start + llen) < 0
+	    || (sr->l_start + llen) > max_off) {
+		ret = -EINVAL;
+		goto out_meta_unlock;
+	}
+
+	if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) {
+		if (sr->l_len <= 0) {
+			ret = -EINVAL;
+			goto out_meta_unlock;
+		}
+	}
+
+	if (should_remove_suid(file->f_path.dentry)) {
+		ret = __ocfs2_write_remove_suid(inode, di_bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_meta_unlock;
+		}
+	}
+
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+	switch (cmd) {
+	case OCFS2_IOC_RESVSP:
+	case OCFS2_IOC_RESVSP64:
+		/*
+		 * This takes unsigned offsets, but the signed ones we
+		 * pass have been checked against overflow above.
+		 */
+		ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start,
+						       sr->l_len);
+		break;
+	case OCFS2_IOC_UNRESVSP:
+	case OCFS2_IOC_UNRESVSP64:
+		ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start,
+					       sr->l_len);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta_unlock;
+	}
+
+	/*
+	 * We update c/mtime for these changes
+	 */
+	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_meta_unlock;
+	}
+
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	ret = ocfs2_mark_inode_dirty(handle, inode, di_bh);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, handle);
+
+out_meta_unlock:
+	brelse(di_bh);
+	ocfs2_meta_unlock(inode, 1);
+out_rw_unlock:
+	ocfs2_rw_unlock(inode, 1);
+
+	mutex_unlock(&inode->i_mutex);
+out:
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
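
For illustration only, not part of the patch: a userspace sketch of how the new ocfs2_change_file_space() path would be driven. It assumes the ocfs2 userspace headers export struct ocfs2_space_resv and the OCFS2_IOC_RESVSP64/OCFS2_IOC_UNRESVSP64 ioctls; the include path and helper name are hypothetical.

#include <string.h>
#include <sys/ioctl.h>
#include <ocfs2/ocfs2_fs.h>	/* assumed header location */

/* Preallocate (RESVSP64) or punch out (UNRESVSP64) a byte range. */
static int ocfs2_space_ioctl(int fd, unsigned int cmd,
			     long long start, long long len)
{
	struct ocfs2_space_resv sr;

	memset(&sr, 0, sizeof(sr));
	sr.l_whence = 0;	/* SEEK_SET: l_start is an absolute offset */
	sr.l_start = start;
	sr.l_len = len;		/* must be > 0 for RESVSP */

	return ioctl(fd, cmd, &sr);
}

/* Usage: ocfs2_space_ioctl(fd, OCFS2_IOC_RESVSP64, 0, 1 << 20); */
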
@@ -1329,15 +1816,16 @@ ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
 	*basep = base;
 }
 
-static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+static struct page * ocfs2_get_write_source(char **ret_src_buf,
 					    const struct iovec *cur_iov,
 					    size_t iov_offset)
 {
 	int ret;
-	char *buf;
+	char *buf = cur_iov->iov_base + iov_offset;
 	struct page *src_page = NULL;
+	unsigned long off;
 
-	buf = cur_iov->iov_base + iov_offset;
+	off = (unsigned long)(buf) & ~PAGE_CACHE_MASK;
 
 	if (!segment_eq(get_fs(), KERNEL_DS)) {
 		/*
@@ -1349,18 +1837,17 @@ static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp
 				       (unsigned long)buf & PAGE_CACHE_MASK, 1,
 				       0, 0, &src_page, NULL);
 		if (ret == 1)
-			bp->b_src_buf = kmap(src_page);
+			*ret_src_buf = kmap(src_page) + off;
 		else
 			src_page = ERR_PTR(-EFAULT);
 	} else {
-		bp->b_src_buf = buf;
+		*ret_src_buf = buf;
 	}
 
 	return src_page;
 }
 
-static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
-				   struct page *page)
+static void ocfs2_put_write_source(struct page *page)
 {
 	if (page) {
 		kunmap(page);
@@ -1376,10 +1863,12 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
 {
 	int ret = 0;
 	ssize_t copied, total = 0;
-	size_t iov_offset = 0;
+	size_t iov_offset = 0, bytes;
+	loff_t pos;
 	const struct iovec *cur_iov = iov;
-	struct ocfs2_buffered_write_priv bp;
-	struct page *page;
+	struct page *user_page, *page;
+	char *buf, *dst;
+	void *fsdata;
 
 	/*
 	 * handle partial DIO write. Adjust cur_iov if needed.
@@ -1387,21 +1876,38 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
 	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
 
 	do {
-		bp.b_cur_off = iov_offset;
-		bp.b_cur_iov = cur_iov;
+		pos = *ppos;
 
-		page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
-		if (IS_ERR(page)) {
-			ret = PTR_ERR(page);
+		user_page = ocfs2_get_write_source(&buf, cur_iov, iov_offset);
+		if (IS_ERR(user_page)) {
+			ret = PTR_ERR(user_page);
 			goto out;
 		}
 
-		copied = ocfs2_buffered_write_cluster(file, *ppos, count,
-						      ocfs2_map_and_write_user_data,
-						      &bp);
+		/* Stay within our page boundaries */
+		bytes = min((PAGE_CACHE_SIZE - ((unsigned long)pos & ~PAGE_CACHE_MASK)),
+			    (PAGE_CACHE_SIZE - ((unsigned long)buf & ~PAGE_CACHE_MASK)));
+		/* Stay within the vector boundary */
+		bytes = min_t(size_t, bytes, cur_iov->iov_len - iov_offset);
+		/* Stay within count */
+		bytes = min(bytes, count);
+
+		page = NULL;
+		ret = ocfs2_write_begin(file, file->f_mapping, pos, bytes, 0,
+					&page, &fsdata);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
 
-		ocfs2_put_write_source(&bp, page);
+		dst = kmap_atomic(page, KM_USER0);
+		memcpy(dst + (pos & (PAGE_CACHE_SIZE - 1)), buf, bytes);
+		kunmap_atomic(dst, KM_USER0);
+		flush_dcache_page(page);
+		ocfs2_put_write_source(user_page);
 
+		copied = ocfs2_write_end(file, file->f_mapping, pos, bytes,
+					 bytes, page, fsdata);
 		if (copied < 0) {
 			mlog_errno(copied);
 			ret = copied;
@@ -1409,7 +1915,7 @@ static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
 		}
 
 		total += copied;
-		*ppos = *ppos + copied;
+		*ppos = pos + copied;
 		count -= copied;
 
 		ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
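
For illustration only, not part of the patch: a worked instance of the bytes clamping in the loop above, assuming 4 KB pages.

/* Hypothetical values with PAGE_CACHE_SIZE == 4096:
 *   pos = 0x1f00       -> 4096 - 0xf00 = 256 bytes left in the target page
 *   buf ends in 0xffc  -> 4096 - 0xffc =   4 bytes left in the source page
 *   iov_len - iov_offset = 1000, count = 5000
 * bytes = min(256, 4, 1000, 5000) = 4, so this pass copies 4 bytes and the
 * next iteration restarts with a fresh write_begin/write_end pair. */
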
@@ -1579,52 +2085,46 @@ static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
 				   struct pipe_buffer *buf,
 				   struct splice_desc *sd)
 {
-	int ret, count, total = 0;
+	int ret, count;
 	ssize_t copied = 0;
-	struct ocfs2_splice_write_priv sp;
+	struct file *file = sd->u.file;
+	unsigned int offset;
+	struct page *page = NULL;
+	void *fsdata;
+	char *src, *dst;
 
 	ret = buf->ops->confirm(pipe, buf);
 	if (ret)
 		goto out;
 
-	sp.s_sd = sd;
-	sp.s_buf = buf;
-	sp.s_pipe = pipe;
-	sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
-	sp.s_buf_offset = buf->offset;
-
+	offset = sd->pos & ~PAGE_CACHE_MASK;
 	count = sd->len;
-	if (count + sp.s_offset > PAGE_CACHE_SIZE)
-		count = PAGE_CACHE_SIZE - sp.s_offset;
+	if (count + offset > PAGE_CACHE_SIZE)
+		count = PAGE_CACHE_SIZE - offset;
 
-	do {
-		/*
-		 * splice wants us to copy up to one page at a
-		 * time. For pagesize > cluster size, this means we
-		 * might enter ocfs2_buffered_write_cluster() more
-		 * than once, so keep track of our progress here.
-		 */
-		copied = ocfs2_buffered_write_cluster(sd->u.file,
-						      (loff_t)sd->pos + total,
-						      count,
-						      ocfs2_map_and_write_splice_data,
-						      &sp);
-		if (copied < 0) {
-			mlog_errno(copied);
-			ret = copied;
-			goto out;
-		}
+	ret = ocfs2_write_begin(file, file->f_mapping, sd->pos, count, 0,
+				&page, &fsdata);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		count -= copied;
-		sp.s_offset += copied;
-		sp.s_buf_offset += copied;
-		total += copied;
-	} while (count);
+	src = buf->ops->map(pipe, buf, 1);
+	dst = kmap_atomic(page, KM_USER1);
+	memcpy(dst + offset, src + buf->offset, count);
+	kunmap_atomic(page, KM_USER1);
+	buf->ops->unmap(pipe, buf, src);
 
-	ret = 0;
+	copied = ocfs2_write_end(file, file->f_mapping, sd->pos, count, count,
+				 page, fsdata);
+	if (copied < 0) {
+		mlog_errno(copied);
+		ret = copied;
+		goto out;
+	}
 out:
 
-	return total ? total : ret;
+	return copied ? copied : ret;
 }
 
 static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,