Diffstat (limited to 'fs/ocfs2/aops.c')
 -rw-r--r--   fs/ocfs2/aops.c   1011
 1 file changed, 820 insertions, 191 deletions

diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -37,6 +39,7 @@
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
 	int err = 0;
+	unsigned int ext_flags;
 	u64 p_blkno, past_eof;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	/* this can happen if another node truncs after our extend! */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
-					       OCFS2_I(inode)->ip_clusters))
-		err = -EIO;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-	if (err)
-		goto bail;
-
-	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+					  &ext_flags);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	map_bh(bh_result, inode->i_sb, p_blkno);
-
-	if (bh_result->b_blocknr == 0) {
-		err = -EIO;
-		mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
-		     (unsigned long long)iblock,
-		     (unsigned long long)p_blkno,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
+	/*
+	 * ocfs2 never allocates in this function - the only time we
+	 * need to use BH_New is when we're extending i_size on a file
+	 * system which doesn't support holes, in which case BH_New
+	 * allows block_prepare_write() to zero.
+	 */
+	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
+			"ino %lu, iblock %llu\n", inode->i_ino,
+			(unsigned long long)iblock);
+
+	/* Treat the unwritten extent as a hole for zeroing purposes. */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (!ocfs2_sparse_alloc(osb)) {
+		if (p_blkno == 0) {
+			err = -EIO;
+			mlog(ML_ERROR,
+			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+			     (unsigned long long)iblock,
+			     (unsigned long long)p_blkno,
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+			dump_stack();
+		}
 
 		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
 		     (unsigned long long)past_eof);
 
 		if (create && (iblock >= past_eof))
			set_buffer_new(bh_result);
+	}
 
 bail:
 	if (err < 0)
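The rework above splits what the old code conflated into three cases: a real mapping, a hole (p_blkno == 0), and an allocated-but-unwritten extent, which must be treated like a hole so it gets zeroed. Only the first case hands a mapped buffer back; the other two leave bh_result unmapped. A standalone userspace sketch of that decision (the flag value here is an assumption for the example; the real OCFS2_EXT_UNWRITTEN is defined in ocfs2's on-disk headers):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative value; the real OCFS2_EXT_UNWRITTEN lives in ocfs2_fs.h. */
#define EXT_UNWRITTEN 0x01

/* Mirrors the hunk: map only allocated, written extents. */
static bool should_map(uint64_t p_blkno, unsigned int ext_flags)
{
	return p_blkno != 0 && !(ext_flags & EXT_UNWRITTEN);
}

int main(void)
{
	printf("hole:      %d\n", should_map(0, 0));                /* 0 - stays unmapped */
	printf("unwritten: %d\n", should_map(1234, EXT_UNWRITTEN)); /* 0 - zeroed like a hole */
	printf("written:   %d\n", should_map(1234, 0));             /* 1 - map_bh() */
	return 0;
}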
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
-/* This can also be called from ocfs2_write_zero_page() which has done
- * its own cluster locking. */
+/*
+ * This is called from ocfs2_write_zero_page() which has handled its
+ * own cluster locking and has ensured allocation exists for those
+ * blocks to be written.
+ */
 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 			       unsigned from, unsigned to)
 {
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 	return ret;
 }
 
-/*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback. It must be able to perform its own locking around
- * ocfs2_get_block().
- */
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	int ret;
-
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
-
-	ocfs2_meta_unlock(inode, 0);
-out:
-	mlog_exit(ret);
-	return ret;
-}
-
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark */
-static int walk_page_buffers(	handle_t *handle,
+int walk_page_buffers(	handle_t *handle,
 			struct buffer_head *head,
 			unsigned from,
 			unsigned to,
 			int *partial,
 			int (*fn)(	handle_t *handle,
 					struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
 	return handle;
 }
 
-static int ocfs2_commit_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
-{
-	int ret;
-	struct buffer_head *di_bh = NULL;
-	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	struct ocfs2_dinode *di;
-
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-	 * us to continue here without rechecking the I/O against
-	 * changed inode values.
-	 *
-	 * 1) We're currently holding the inode alloc lock, so no
-	 *    nodes can change it underneath us.
-	 *
-	 * 2) We've had to take the metadata lock at least once
-	 *    already to check for extending writes, suid removal, etc.
-	 *    The meta data update code then ensures that we don't get a
-	 *    stale inode allocation image (i_size, i_clusters, etc).
-	 */
-
-	ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_data_lock_with_page(inode, 1, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out_unlock_meta;
-	}
-
-	handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_unlock_data;
-	}
-
-	/* Mark our buffer early. We'd rather catch this error up here
-	 * as opposed to after a successful commit_write which would
-	 * require us to set back inode->i_size. */
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	/* might update i_size */
-	ret = generic_commit_write(file, page, from, to);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-
-	/* ocfs2_mark_inode_dirty() is too heavy to use here. */
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
-	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
-	di->i_size = cpu_to_le64((u64)i_size_read(inode));
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-out_unlock_data:
-	ocfs2_data_unlock(inode, 1);
-out_unlock_meta:
-	ocfs2_meta_unlock(inode, 1);
-out:
-	if (di_bh)
-		brelse(di_bh);
-
-	mlog_exit(ret);
-	return ret;
-}
-
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 					  struct buffer_head *bh_result, int create)
 {
 	int ret;
-	u64 p_blkno, inode_blocks;
-	int contig_blocks;
+	u64 p_blkno, inode_blocks, contig_blocks;
+	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
-						OCFS2_I(inode)->ip_clusters);
-
-	/*
-	 * For a read which begins past the end of file, we return a hole.
-	 */
-	if (!create && (iblock >= inode_blocks)) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		ret = 0;
-		goto bail;
-	}
+	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 
 	/*
 	 * Any write past EOF is not allowed because we'd be extending.
 	 */
 	if (create && (iblock + max_blocks) > inode_blocks) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		ret = -EIO;
 		goto bail;
 	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
-	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-					  &contig_blocks);
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
					  &contig_blocks, &ext_flags);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 		     (unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	map_bh(bh_result, inode->i_sb, p_blkno);
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has a hole at block %llu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)iblock);
+		ret = -EROFS;
+		goto bail;
+	}
+
+	/*
+	 * get_more_blocks() expects us to describe a hole by clearing
+	 * the mapped bit on bh_result().
+	 *
+	 * Consider an unwritten extent as a hole.
+	 */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+	else {
+		/*
+		 * ocfs2_prepare_inode_for_write() should have caught
+		 * the case where we'd be filling a hole and triggered
+		 * a buffered write instead.
+		 */
+		if (create) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		clear_buffer_mapped(bh_result);
+	}
 
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
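The direct I/O path draws the same hole/unwritten distinction with different consequences: on a non-sparse filesystem a hole inside i_size is corruption (-EROFS above), a write that lands in a hole should already have been diverted to the buffered path (-EIO), and a read over a hole or unwritten extent just leaves the buffer unmapped so the kernel zero-fills the user buffer. A minimal sketch of that decision table (userspace C; the error value is written out literally, EIO being 5 on Linux, and the flag value is an assumption):

#include <stdio.h>

#define EXT_UNWRITTEN 0x01	/* illustrative value */

/* 1 = map the buffer, 0 = describe a hole, negative = error. */
static int dio_map_decision(unsigned long long p_blkno,
			    unsigned int ext_flags, int create)
{
	if (p_blkno && !(ext_flags & EXT_UNWRITTEN))
		return 1;	/* allocated and written: map_bh() */
	if (create)
		return -5;	/* -EIO: direct writes must not fill holes here */
	return 0;		/* read over hole/unwritten: left unmapped, zero-filled */
}

int main(void)
{
	printf("%d\n", dio_map_decision(500, 0, 0));             /* 1 */
	printf("%d\n", dio_map_decision(0, 0, 0));               /* 0 */
	printf("%d\n", dio_map_decision(0, 0, 1));               /* -5 */
	printf("%d\n", dio_map_decision(500, EXT_UNWRITTEN, 1)); /* -5 */
	return 0;
}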
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     void *private)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int level;
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
 	ocfs2_iocb_clear_rw_locked(iocb);
-	up_read(&inode->i_alloc_sem);
-	ocfs2_rw_unlock(inode, 0);
+
+	level = ocfs2_iocb_rw_locked_level(iocb);
+	if (!level)
+		up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, level);
 }
 
 /*
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	mlog_entry_void();
 
-	/*
-	 * We get PR data locks even for O_DIRECT. This allows
-	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
-	 * extending and buffered zeroing writes race. If they did
-	 * race then the buffered zeroing could be written back after
-	 * the O_DIRECT I/O. It's one thing to tell people not to mix
-	 * buffered and O_DIRECT writes, but expecting them to
-	 * understand that file extension is also an implicit buffered
-	 * write is too much. By getting the PR we force writeback of
-	 * the buffered zeroing before proceeding.
-	 */
-	ret = ocfs2_data_lock(inode, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+		/*
+		 * We get PR data locks even for O_DIRECT. This
+		 * allows concurrent O_DIRECT I/O but doesn't let
+		 * O_DIRECT with extending and buffered zeroing writes
+		 * race. If they did race then the buffered zeroing
+		 * could be written back after the O_DIRECT I/O. It's
+		 * one thing to tell people not to mix buffered and
+		 * O_DIRECT writes, but expecting them to understand
+		 * that file extension is also an implicit buffered
+		 * write is too much. By getting the PR we force
+		 * writeback of the buffered zeroing before
+		 * proceeding.
+		 */
+		ret = ocfs2_data_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ocfs2_data_unlock(inode, 0);
 	}
-	ocfs2_data_unlock(inode, 0);
 
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +600,715 @@ out:
 	return ret;
 }
 
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+					    u32 cpos,
+					    unsigned int *start,
+					    unsigned int *end)
+{
+	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+		unsigned int cpp;
+
+		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+		cluster_start = cpos % cpp;
+		cluster_start = cluster_start << osb->s_clustersize_bits;
+
+		cluster_end = cluster_start + osb->s_clustersize;
+	}
+
+	BUG_ON(cluster_start > PAGE_SIZE);
+	BUG_ON(cluster_end > PAGE_SIZE);
+
+	if (start)
+		*start = cluster_start;
+	if (end)
+		*end = cluster_end;
+}
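The unlikely() branch above only fires when pages are larger than clusters. Worked example: with 64 KB pages (PAGE_CACHE_SHIFT = 16) and 4 KB clusters (s_clustersize_bits = 12) there are cpp = 16 clusters per page, so cpos = 37 gives cluster_start = (37 % 16) << 12 = 20480 and cluster_end = 24576. The same arithmetic as a standalone program (the geometry constants are assumptions for the example):

#include <stdio.h>

/* Assumed geometry: 64KB pages, 4KB clusters. */
#define PAGE_SHIFT_EX	16u
#define CLUSTER_BITS_EX	12u
#define PAGE_SIZE_EX	(1u << PAGE_SHIFT_EX)

static void figure_boundaries(unsigned int cpos,
			      unsigned int *start, unsigned int *end)
{
	unsigned int cluster_start = 0, cluster_end = PAGE_SIZE_EX;

	if (PAGE_SHIFT_EX > CLUSTER_BITS_EX) {
		unsigned int cpp = 1u << (PAGE_SHIFT_EX - CLUSTER_BITS_EX);

		cluster_start = (cpos % cpp) << CLUSTER_BITS_EX;
		cluster_end = cluster_start + (1u << CLUSTER_BITS_EX);
	}
	*start = cluster_start;
	*end = cluster_end;
}

int main(void)
{
	unsigned int s, e;

	figure_boundaries(37, &s, &e);
	printf("cpos 37 -> [%u, %u)\n", s, e);	/* [20480, 24576) */
	return 0;
}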
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+				     struct ocfs2_super *osb, u32 cpos,
+				     unsigned from, unsigned to)
+{
+	void *kaddr;
+	unsigned int cluster_start, cluster_end;
+
+	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+
+	if (from || to) {
+		if (from > cluster_start)
+			memset(kaddr + cluster_start, 0, from - cluster_start);
+		if (to < cluster_end)
+			memset(kaddr + to, 0, cluster_end - to);
+	} else {
+		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+	}
+
+	kunmap_atomic(kaddr, KM_USER0);
+}
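The zeroing rule in ocfs2_clear_page_regions() is easiest to see on a tiny buffer: everything inside the cluster region except [from, to) is cleared, and from == to == 0 clears the whole region. A miniature userspace reproduction of the same memset logic (16-byte "page", made-up boundaries, '0' standing in for zero bytes):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Cluster region [4, 12) within the page; keep [6, 9) untouched. */
	char page[16];
	unsigned int cluster_start = 4, cluster_end = 12, from = 6, to = 9;

	memset(page, 'x', sizeof(page));

	if (from || to) {
		if (from > cluster_start)
			memset(page + cluster_start, '0', from - cluster_start);
		if (to < cluster_end)
			memset(page + to, '0', cluster_end - to);
	} else {
		memset(page + cluster_start, '0', cluster_end - cluster_start);
	}

	fwrite(page, 1, sizeof(page), stdout);	/* xxxx00xxx000xxxx */
	putchar('\n');
	return 0;
}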
+
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new)
+{
+	int ret = 0;
+	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+	unsigned int block_end, block_start;
+	unsigned int bsize = 1 << inode->i_blkbits;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, bsize, 0);
+
+	head = page_buffers(page);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start += bsize) {
+		block_end = block_start + bsize;
+
+		/*
+		 * Ignore blocks outside of our i/o range -
+		 * they may belong to unallocated clusters.
+		 */
+		if (block_start >= to || block_end <= from) {
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			continue;
+		}
+
+		/*
+		 * For an allocating write with cluster size >= page
+		 * size, we always write the entire page.
+		 */
+
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+
+		if (!buffer_mapped(bh)) {
+			map_bh(bh, inode->i_sb, *p_blkno);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+			   (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+
+		*p_blkno = *p_blkno + 1;
+	}
+
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			ret = -EIO;
+	}
+
+	if (ret == 0 || !new)
+		return ret;
+
+	/*
+	 * If we get -EIO above, zero out any newly allocated blocks
+	 * to avoid exposing stale data.
+	 */
+	bh = head;
+	block_start = 0;
+	do {
+		void *kaddr;
+
+		block_end = block_start + bsize;
+		if (block_end <= from)
+			goto next_bh;
+		if (block_start >= to)
+			break;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr+block_start, 0, bh->b_size);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+
+next_bh:
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
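The skip test at the top of the loop is the load-bearing part of ocfs2_map_page_blocks(): only sub-blocks of the page that intersect [from, to) are mapped, because blocks outside the i/o range may belong to clusters that were never allocated. The interval arithmetic in isolation (4 KB page and 1 KB blocks are assumptions for the example):

#include <stdio.h>

#define PAGE_SIZE_EX	4096u	/* assumed page size */
#define BSIZE_EX	1024u	/* assumed block size */

int main(void)
{
	unsigned int from = 1500, to = 3000;	/* i/o range within the page */
	unsigned int block_start, block_end;

	for (block_start = 0; block_start < PAGE_SIZE_EX; block_start += BSIZE_EX) {
		block_end = block_start + BSIZE_EX;

		/* Same test as the patch: outside [from, to) means skip. */
		if (block_start >= to || block_end <= from)
			printf("[%4u, %4u) skipped\n", block_start, block_end);
		else
			printf("[%4u, %4u) mapped\n", block_start, block_end);
	}
	return 0;
}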
+
+/*
+ * This will copy user data from the buffer page in the splice
+ * context.
+ *
+ * For now, we ignore SPLICE_F_MOVE as that would require some extra
+ * communication out all the way to ocfs2_write().
+ */
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+				    struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+				    unsigned int *ret_from, unsigned int *ret_to)
+{
+	int ret;
+	unsigned int to, from, cluster_start, cluster_end;
+	char *src, *dst;
+	struct ocfs2_splice_write_priv *sp = wc->w_private;
+	struct pipe_buffer *buf = sp->s_buf;
+	unsigned long bytes, src_from;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+					&cluster_end);
+
+	from = sp->s_offset;
+	src_from = sp->s_buf_offset;
+	bytes = wc->w_count;
+
+	if (wc->w_large_pages) {
+		/*
+		 * For cluster size < page size, we have to
+		 * calculate pos within the cluster and obey
+		 * the rightmost boundary.
+		 */
+		bytes = min(bytes, (unsigned long)(osb->s_clustersize
+				   - (wc->w_pos & (osb->s_clustersize - 1))));
+	}
+	to = from + bytes;
+
+	if (wc->w_this_page_new)
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+	else
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    from, to, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > osb->s_clustersize);
+	BUG_ON(to > osb->s_clustersize);
+
+	src = buf->ops->map(sp->s_pipe, buf, 1);
+	dst = kmap_atomic(wc->w_this_page, KM_USER1);
+	memcpy(dst + from, src + src_from, bytes);
+	kunmap_atomic(wc->w_this_page, KM_USER1);
+	buf->ops->unmap(sp->s_pipe, buf, src);
+
+	wc->w_finished_copy = 1;
+
+	*ret_from = from;
+	*ret_to = to;
+out:
+
+	return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * This will copy user data from the iovec in the buffered write
+ * context.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+				  unsigned int *ret_from, unsigned int *ret_to)
+{
+	int ret;
+	unsigned int to, from, cluster_start, cluster_end;
+	unsigned long bytes, src_from;
+	char *dst;
+	struct ocfs2_buffered_write_priv *bp = wc->w_private;
+	const struct iovec *cur_iov = bp->b_cur_iov;
+	char __user *buf;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+					&cluster_end);
+
+	buf = cur_iov->iov_base + bp->b_cur_off;
+	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+
+	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * This is a lot of comparisons, but it reads quite
+	 * easily, which is important here.
+	 */
+	/* Stay within the src page */
+	bytes = PAGE_SIZE - src_from;
+	/* Stay within the vector */
+	bytes = min(bytes,
+		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+	/* Stay within count */
+	bytes = min(bytes, (unsigned long)wc->w_count);
+	/*
+	 * For clustersize > page size, just stay within
+	 * target page, otherwise we have to calculate pos
+	 * within the cluster and obey the rightmost
+	 * boundary.
+	 */
+	if (wc->w_large_pages) {
+		/*
+		 * For cluster size < page size, we have to
+		 * calculate pos within the cluster and obey
+		 * the rightmost boundary.
+		 */
+		bytes = min(bytes, (unsigned long)(osb->s_clustersize
+				   - (wc->w_pos & (osb->s_clustersize - 1))));
+	} else {
+		/*
+		 * cluster size > page size is the most common
+		 * case - we just stay within the target page
+		 * boundary.
+		 */
+		bytes = min(bytes, PAGE_CACHE_SIZE - from);
+	}
+
+	to = from + bytes;
+
+	if (wc->w_this_page_new)
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+	else
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    from, to, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > osb->s_clustersize);
+	BUG_ON(to > osb->s_clustersize);
+
+	dst = kmap(wc->w_this_page);
+	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+	kunmap(wc->w_this_page);
+
+	/*
+	 * XXX: This is slow, but simple. The caller of
+	 * ocfs2_buffered_write_cluster() is responsible for
+	 * passing through the iovecs, so it's difficult to
+	 * predict what our next step is in here after our
+	 * initial write. A future version should be pushing
+	 * that iovec manipulation further down.
+	 *
+	 * By setting this, we indicate that a copy from user
+	 * data was done, and subsequent calls for this
+	 * cluster will skip copying more data.
+	 */
+	wc->w_finished_copy = 1;
+
+	*ret_from = from;
+	*ret_to = to;
+out:
+
+	return bytes ? (unsigned int)bytes : ret;
+}
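The chain of min() clamps above is plain arithmetic and can be checked in isolation: the copy length starts as what remains of the source page and is successively limited by the iovec remainder, the total count, and the destination page (or cluster, when clusters are smaller than pages) boundary. A sketch with made-up numbers, taking the common clustersize >= pagesize branch:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Illustrative inputs: 4KB pages, w_large_pages == 0. */
	unsigned long page_size = 4096, src_from = 1000, from = 512;
	unsigned long iov_remaining = 10000, w_count = 6000;

	unsigned long bytes = page_size - src_from;	/* stay in src page: 3096 */
	bytes = min_ul(bytes, iov_remaining);		/* stay in vector:   3096 */
	bytes = min_ul(bytes, w_count);			/* stay in count:    3096 */
	bytes = min_ul(bytes, page_size - from);	/* stay in dst page: 3096 */

	printf("copy %lu bytes, to = %lu\n", bytes, from + bytes);
	return 0;
}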
+
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback. Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+			  u64 *p_blkno, struct page *page,
+			  struct ocfs2_write_ctxt *wc, int new)
+{
+	int ret, copied = 0;
+	unsigned int from = 0, to = 0;
+	unsigned int cluster_start, cluster_end;
+	unsigned int zero_from = 0, zero_to = 0;
+
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+					&cluster_start, &cluster_end);
+
+	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+	    && !wc->w_finished_copy) {
+
+		wc->w_this_page = page;
+		wc->w_this_page_new = new;
+		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied = ret;
+
+		zero_from = from;
+		zero_to = to;
+		if (new) {
+			from = cluster_start;
+			to = cluster_end;
+		}
+	} else {
+		/*
+		 * If we haven't allocated the new page yet, we
+		 * shouldn't be writing it out without copying user
+		 * data. This is likely a math error from the caller.
+		 */
+		BUG_ON(!new);
+
+		from = cluster_start;
+		to = cluster_end;
+
+		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Parts of newly allocated pages need to be zero'd.
+	 *
+	 * Above, we have also rewritten 'to' and 'from' - as far as
+	 * the rest of the function is concerned, the entire cluster
+	 * range inside of a page needs to be written.
+	 *
+	 * We can skip this if the page is up to date - it's already
+	 * been zero'd from being read in as a hole.
+	 */
+	if (new && !PageUptodate(page))
+		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+					 wc->w_cpos, zero_from, zero_to);
+
+	flush_dcache_page(page);
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/*
+	 * We don't use generic_commit_write() because we need to
+	 * handle our own i_size update.
+	 */
+	ret = block_commit_write(page, from, to);
+	if (ret)
+		mlog_errno(ret);
+out:
+
+	return copied ? copied : ret;
+}
+
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+			   struct buffer_head *di_bh,
+			   struct ocfs2_alloc_context *data_ac,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_write_ctxt *wc)
+{
+	int ret, i, numpages = 1, new;
+	unsigned int copied = 0;
+	u32 tmp_pos;
+	u64 v_blkno, p_blkno;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long index, start;
+	struct page **cpages;
+
+	new = phys == 0 ? 1 : 0;
+
+	/*
+	 * Figure out how many pages we'll be manipulating here. For
+	 * non allocating write, we just change the one
+	 * page. Otherwise, we'll need a whole clusters worth.
+	 */
+	if (new)
+		numpages = ocfs2_pages_per_cluster(inode->i_sb);
+
+	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+	if (!cpages) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	if (new) {
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+							   wc->w_cpos);
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+	} else {
+		start = wc->w_pos >> PAGE_CACHE_SHIFT;
+		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+	}
+
+	for(i = 0; i < numpages; i++) {
+		index = start + i;
+
+		cpages[i] = grab_cache_page(mapping, index);
+		if (!cpages[i]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (new) {
+		/*
+		 * This is safe to call with the page locks - it won't take
+		 * any additional semaphores or cluster locks.
+		 */
+		tmp_pos = wc->w_cpos;
+		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+						 &tmp_pos, 1, di_bh, handle,
+						 data_ac, meta_ac, NULL);
+		/*
+		 * This shouldn't happen because we must have already
+		 * calculated the correct meta data allocation required. The
+		 * internal tree allocation code should know how to increase
+		 * transaction credits itself.
+		 *
+		 * If need be, we could handle -EAGAIN for a
+		 * RESTART_TRANS here.
+		 */
+		mlog_bug_on_msg(ret == -EAGAIN,
+				"Inode %llu: EAGAIN return during allocation.\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+					  NULL);
+	if (ret < 0) {
+
+		/*
+		 * XXX: Should we go readonly here?
+		 */
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0);
+
+	for(i = 0; i < numpages; i++) {
+		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+					    wc, new);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied += ret;
+	}
+
+out:
+	for(i = 0; i < numpages; i++) {
+		unlock_page(cpages[i]);
+		mark_page_accessed(cpages[i]);
+		page_cache_release(cpages[i]);
+	}
+	kfree(cpages);
+
+	return copied ? copied : ret;
+}
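An allocating write always touches a whole cluster's worth of pages, so numpages and the starting page index fall straight out of the geometry. A sketch of the arithmetic that ocfs2_pages_per_cluster() and ocfs2_align_clusters_to_page_index() presumably perform (those helpers aren't quoted here; 4 KB pages and 32 KB clusters are assumptions for the example):

#include <stdio.h>

#define PAGE_SHIFT_EX	12u	/* assumed: 4KB pages */
#define CLUSTER_BITS_EX	15u	/* assumed: 32KB clusters */

int main(void)
{
	unsigned int cpos = 3;	/* logical cluster being written */

	/* Pages needed to cover one cluster. */
	unsigned int numpages = 1u << (CLUSTER_BITS_EX - PAGE_SHIFT_EX);

	/* Page cache index of the cluster's first page. */
	unsigned long start = ((unsigned long)cpos << CLUSTER_BITS_EX) >> PAGE_SHIFT_EX;

	printf("numpages = %u, start index = %lu\n", numpages, start);	/* 8, 24 */
	return 0;
}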
+
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+				  struct ocfs2_super *osb, loff_t pos,
+				  size_t count, ocfs2_page_writer *cb,
+				  void *cb_priv)
+{
+	wc->w_count = count;
+	wc->w_pos = pos;
+	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+	wc->w_finished_copy = 0;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
+
+	wc->w_write_data_page = cb;
+	wc->w_private = cb_priv;
+}
+
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv)
+{
+	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	ssize_t written = 0;
+	u32 phys;
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle;
+	struct ocfs2_write_ctxt wc;
+
+	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	/* phys == 0 means that allocation is required. */
+	if (phys == 0) {
+		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_meta;
+		}
+
+		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+	}
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_data;
+	}
+
+	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+			      meta_ac, &wc);
+	if (written < 0) {
+		ret = written;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	pos += written;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_data:
+	ocfs2_data_unlock(inode, 1);
+
+out_meta:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 1);
+
+out:
+	brelse(di_bh);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return written ? written : ret;
+}
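Two conventions worth noting in the function above. First, the locks nest as meta lock -> ip_alloc_sem -> data lock -> journal transaction, and the out_commit/out_data/out_meta labels unwind them in exactly the reverse order. Second, ocfs2_write(), ocfs2_write_data_page() and ocfs2_buffered_write_cluster() all share the same return convention: a partial copy wins over a later error, so the caller sees a byte count unless nothing at all was copied. That convention in isolation:

#include <stdio.h>

/* If any bytes were written before the error, report the bytes;
 * the error only surfaces when nothing was copied. */
static long finish(long written, int ret)
{
	return written ? written : ret;
}

int main(void)
{
	printf("%ld\n", finish(4096, -5));	/* partial success: 4096 */
	printf("%ld\n", finish(0, -5));		/* nothing copied:  -5 */
	printf("%ld\n", finish(0, 0));		/* clean zero-length write: 0 */
	return 0;
}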
+
 const struct address_space_operations ocfs2_aops = {
 	.readpage		= ocfs2_readpage,
 	.writepage		= ocfs2_writepage,
-	.prepare_write		= ocfs2_prepare_write,
-	.commit_write		= ocfs2_commit_write,
 	.bmap			= ocfs2_bmap,
 	.sync_page		= block_sync_page,
 	.direct_IO		= ocfs2_direct_IO,
