Diffstat (limited to 'fs/ocfs2/aops.c')
-rw-r--r-- | fs/ocfs2/aops.c | 1011 |
1 file changed, 820 insertions, 191 deletions
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@ | |||
24 | #include <linux/highmem.h> | 24 | #include <linux/highmem.h> |
25 | #include <linux/pagemap.h> | 25 | #include <linux/pagemap.h> |
26 | #include <asm/byteorder.h> | 26 | #include <asm/byteorder.h> |
27 | #include <linux/swap.h> | ||
28 | #include <linux/pipe_fs_i.h> | ||
27 | 29 | ||
28 | #define MLOG_MASK_PREFIX ML_FILE_IO | 30 | #define MLOG_MASK_PREFIX ML_FILE_IO |
29 | #include <cluster/masklog.h> | 31 | #include <cluster/masklog.h> |
@@ -37,6 +39,7 @@ | |||
37 | #include "file.h" | 39 | #include "file.h" |
38 | #include "inode.h" | 40 | #include "inode.h" |
39 | #include "journal.h" | 41 | #include "journal.h" |
42 | #include "suballoc.h" | ||
40 | #include "super.h" | 43 | #include "super.h" |
41 | #include "symlink.h" | 44 | #include "symlink.h" |
42 | 45 | ||
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
134 | struct buffer_head *bh_result, int create) | 137 | struct buffer_head *bh_result, int create) |
135 | { | 138 | { |
136 | int err = 0; | 139 | int err = 0; |
140 | unsigned int ext_flags; | ||
137 | u64 p_blkno, past_eof; | 141 | u64 p_blkno, past_eof; |
142 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
138 | 143 | ||
139 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, | 144 | mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode, |
140 | (unsigned long long)iblock, bh_result, create); | 145 | (unsigned long long)iblock, bh_result, create); |
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
149 | goto bail; | 154 | goto bail; |
150 | } | 155 | } |
151 | 156 | ||
152 | /* this can happen if another node truncs after our extend! */ | 157 | err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL, |
153 | spin_lock(&OCFS2_I(inode)->ip_lock); | 158 | &ext_flags); |
154 | if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb, | ||
155 | OCFS2_I(inode)->ip_clusters)) | ||
156 | err = -EIO; | ||
157 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
158 | if (err) | ||
159 | goto bail; | ||
160 | |||
161 | err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | ||
162 | NULL); | ||
163 | if (err) { | 159 | if (err) { |
164 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " | 160 | mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, " |
165 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, | 161 | "%llu, NULL)\n", err, inode, (unsigned long long)iblock, |
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
167 | goto bail; | 163 | goto bail; |
168 | } | 164 | } |
169 | 165 | ||
170 | map_bh(bh_result, inode->i_sb, p_blkno); | 166 | /* |
171 | 167 | * ocfs2 never allocates in this function - the only time we | |
172 | if (bh_result->b_blocknr == 0) { | 168 | * need to use BH_New is when we're extending i_size on a file |
173 | err = -EIO; | 169 | * system which doesn't support holes, in which case BH_New |
174 | mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | 170 | * allows block_prepare_write() to zero. |
175 | (unsigned long long)iblock, | 171 | */ |
176 | (unsigned long long)p_blkno, | 172 | mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb), |
177 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 173 | "ino %lu, iblock %llu\n", inode->i_ino, |
178 | } | 174 | (unsigned long long)iblock); |
175 | |||
176 | /* Treat the unwritten extent as a hole for zeroing purposes. */ | ||
177 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
178 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
179 | |||
180 | if (!ocfs2_sparse_alloc(osb)) { | ||
181 | if (p_blkno == 0) { | ||
182 | err = -EIO; | ||
183 | mlog(ML_ERROR, | ||
184 | "iblock = %llu p_blkno = %llu blkno=(%llu)\n", | ||
185 | (unsigned long long)iblock, | ||
186 | (unsigned long long)p_blkno, | ||
187 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
188 | mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); | ||
189 | dump_stack(); | ||
190 | } | ||
179 | 191 | ||
180 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 192 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
181 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, | 193 | mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino, |
182 | (unsigned long long)past_eof); | 194 | (unsigned long long)past_eof); |
183 | 195 | ||
184 | if (create && (iblock >= past_eof)) | 196 | if (create && (iblock >= past_eof)) |
185 | set_buffer_new(bh_result); | 197 | set_buffer_new(bh_result); |
198 | } | ||
186 | 199 | ||
187 | bail: | 200 | bail: |
188 | if (err < 0) | 201 | if (err < 0) |
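
This first hunk carries the core idea of the sparse-file work: on a sparse-capable volume a zero physical block is no longer an error, and both holes and unwritten extents are deliberately left unmapped so the generic buffered-I/O code zeroes those pages instead of reading stale data. A minimal standalone sketch of that mapping decision (the helper name is mine; OCFS2_EXT_UNWRITTEN is the extent flag used by the patch):

    /* Illustration only, not part of the patch. */
    static inline int should_map_bh(u64 p_blkno, unsigned int ext_flags)
    {
        /*
         * A hole (no physical block allocated yet) and an unwritten extent
         * both read back as zeroes, so bh_result stays unmapped for them.
         */
        return p_blkno != 0 && !(ext_flags & OCFS2_EXT_UNWRITTEN);
    }
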
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc) | |||
276 | return ret; | 289 | return ret; |
277 | } | 290 | } |
278 | 291 | ||
279 | /* This can also be called from ocfs2_write_zero_page() which has done | 292 | /* |
280 | * it's own cluster locking. */ | 293 | * This is called from ocfs2_write_zero_page() which has handled it's |
294 | * own cluster locking and has ensured allocation exists for those | ||
295 | * blocks to be written. | ||
296 | */ | ||
281 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | 297 | int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, |
282 | unsigned from, unsigned to) | 298 | unsigned from, unsigned to) |
283 | { | 299 | { |
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page, | |||
292 | return ret; | 308 | return ret; |
293 | } | 309 | } |
294 | 310 | ||
295 | /* | ||
296 | * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called | ||
297 | * from loopback. It must be able to perform its own locking around | ||
298 | * ocfs2_get_block(). | ||
299 | */ | ||
300 | static int ocfs2_prepare_write(struct file *file, struct page *page, | ||
301 | unsigned from, unsigned to) | ||
302 | { | ||
303 | struct inode *inode = page->mapping->host; | ||
304 | int ret; | ||
305 | |||
306 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
307 | |||
308 | ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page); | ||
309 | if (ret != 0) { | ||
310 | mlog_errno(ret); | ||
311 | goto out; | ||
312 | } | ||
313 | |||
314 | ret = ocfs2_prepare_write_nolock(inode, page, from, to); | ||
315 | |||
316 | ocfs2_meta_unlock(inode, 0); | ||
317 | out: | ||
318 | mlog_exit(ret); | ||
319 | return ret; | ||
320 | } | ||
321 | |||
322 | /* Taken from ext3. We don't necessarily need the full blown | 311 | /* Taken from ext3. We don't necessarily need the full blown |
323 | * functionality yet, but IMHO it's better to cut and paste the whole | 312 | * functionality yet, but IMHO it's better to cut and paste the whole |
324 | * thing so we can avoid introducing our own bugs (and easily pick up | 313 | * thing so we can avoid introducing our own bugs (and easily pick up |
325 | * their fixes when they happen) --Mark */ | 314 | * their fixes when they happen) --Mark */ |
326 | static int walk_page_buffers( handle_t *handle, | 315 | int walk_page_buffers( handle_t *handle, |
327 | struct buffer_head *head, | 316 | struct buffer_head *head, |
328 | unsigned from, | 317 | unsigned from, |
329 | unsigned to, | 318 | unsigned to, |
330 | int *partial, | 319 | int *partial, |
331 | int (*fn)( handle_t *handle, | 320 | int (*fn)( handle_t *handle, |
332 | struct buffer_head *bh)) | 321 | struct buffer_head *bh)) |
333 | { | 322 | { |
334 | struct buffer_head *bh; | 323 | struct buffer_head *bh; |
335 | unsigned block_start, block_end; | 324 | unsigned block_start, block_end; |
@@ -388,95 +377,6 @@ out: | |||
388 | return handle; | 377 | return handle; |
389 | } | 378 | } |
390 | 379 | ||
391 | static int ocfs2_commit_write(struct file *file, struct page *page, | ||
392 | unsigned from, unsigned to) | ||
393 | { | ||
394 | int ret; | ||
395 | struct buffer_head *di_bh = NULL; | ||
396 | struct inode *inode = page->mapping->host; | ||
397 | handle_t *handle = NULL; | ||
398 | struct ocfs2_dinode *di; | ||
399 | |||
400 | mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to); | ||
401 | |||
402 | /* NOTE: ocfs2_file_aio_write has ensured that it's safe for | ||
403 | * us to continue here without rechecking the I/O against | ||
404 | * changed inode values. | ||
405 | * | ||
406 | * 1) We're currently holding the inode alloc lock, so no | ||
407 | * nodes can change it underneath us. | ||
408 | * | ||
409 | * 2) We've had to take the metadata lock at least once | ||
410 | * already to check for extending writes, suid removal, etc. | ||
411 | * The meta data update code then ensures that we don't get a | ||
412 | * stale inode allocation image (i_size, i_clusters, etc). | ||
413 | */ | ||
414 | |||
415 | ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page); | ||
416 | if (ret != 0) { | ||
417 | mlog_errno(ret); | ||
418 | goto out; | ||
419 | } | ||
420 | |||
421 | ret = ocfs2_data_lock_with_page(inode, 1, page); | ||
422 | if (ret != 0) { | ||
423 | mlog_errno(ret); | ||
424 | goto out_unlock_meta; | ||
425 | } | ||
426 | |||
427 | handle = ocfs2_start_walk_page_trans(inode, page, from, to); | ||
428 | if (IS_ERR(handle)) { | ||
429 | ret = PTR_ERR(handle); | ||
430 | goto out_unlock_data; | ||
431 | } | ||
432 | |||
433 | /* Mark our buffer early. We'd rather catch this error up here | ||
434 | * as opposed to after a successful commit_write which would | ||
435 | * require us to set back inode->i_size. */ | ||
436 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
437 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
438 | if (ret < 0) { | ||
439 | mlog_errno(ret); | ||
440 | goto out_commit; | ||
441 | } | ||
442 | |||
443 | /* might update i_size */ | ||
444 | ret = generic_commit_write(file, page, from, to); | ||
445 | if (ret < 0) { | ||
446 | mlog_errno(ret); | ||
447 | goto out_commit; | ||
448 | } | ||
449 | |||
450 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
451 | |||
452 | /* ocfs2_mark_inode_dirty() is too heavy to use here. */ | ||
453 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
454 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
455 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
456 | |||
457 | inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode))); | ||
458 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
459 | |||
460 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
461 | if (ret < 0) { | ||
462 | mlog_errno(ret); | ||
463 | goto out_commit; | ||
464 | } | ||
465 | |||
466 | out_commit: | ||
467 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | ||
468 | out_unlock_data: | ||
469 | ocfs2_data_unlock(inode, 1); | ||
470 | out_unlock_meta: | ||
471 | ocfs2_meta_unlock(inode, 1); | ||
472 | out: | ||
473 | if (di_bh) | ||
474 | brelse(di_bh); | ||
475 | |||
476 | mlog_exit(ret); | ||
477 | return ret; | ||
478 | } | ||
479 | |||
480 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | 380 | static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) |
481 | { | 381 | { |
482 | sector_t status; | 382 | sector_t status; |
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) | |||
499 | down_read(&OCFS2_I(inode)->ip_alloc_sem); | 399 | down_read(&OCFS2_I(inode)->ip_alloc_sem); |
500 | } | 400 | } |
501 | 401 | ||
502 | err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno, | 402 | err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL); |
503 | NULL); | ||
504 | 403 | ||
505 | if (!INODE_JOURNAL(inode)) { | 404 | if (!INODE_JOURNAL(inode)) { |
506 | up_read(&OCFS2_I(inode)->ip_alloc_sem); | 405 | up_read(&OCFS2_I(inode)->ip_alloc_sem); |
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
540 | struct buffer_head *bh_result, int create) | 439 | struct buffer_head *bh_result, int create) |
541 | { | 440 | { |
542 | int ret; | 441 | int ret; |
543 | u64 p_blkno, inode_blocks; | 442 | u64 p_blkno, inode_blocks, contig_blocks; |
544 | int contig_blocks; | 443 | unsigned int ext_flags; |
545 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; | 444 | unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; |
546 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; | 445 | unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; |
547 | 446 | ||
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
549 | * nicely aligned and of the right size, so there's no need | 448 | * nicely aligned and of the right size, so there's no need |
550 | * for us to check any of that. */ | 449 | * for us to check any of that. */ |
551 | 450 | ||
552 | spin_lock(&OCFS2_I(inode)->ip_lock); | 451 | inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
553 | inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb, | ||
554 | OCFS2_I(inode)->ip_clusters); | ||
555 | |||
556 | /* | ||
557 | * For a read which begins past the end of file, we return a hole. | ||
558 | */ | ||
559 | if (!create && (iblock >= inode_blocks)) { | ||
560 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
561 | ret = 0; | ||
562 | goto bail; | ||
563 | } | ||
564 | 452 | ||
565 | /* | 453 | /* |
566 | * Any write past EOF is not allowed because we'd be extending. | 454 | * Any write past EOF is not allowed because we'd be extending. |
567 | */ | 455 | */ |
568 | if (create && (iblock + max_blocks) > inode_blocks) { | 456 | if (create && (iblock + max_blocks) > inode_blocks) { |
569 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
570 | ret = -EIO; | 457 | ret = -EIO; |
571 | goto bail; | 458 | goto bail; |
572 | } | 459 | } |
573 | spin_unlock(&OCFS2_I(inode)->ip_lock); | ||
574 | 460 | ||
575 | /* This figures out the size of the next contiguous block, and | 461 | /* This figures out the size of the next contiguous block, and |
576 | * our logical offset */ | 462 | * our logical offset */ |
577 | ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno, | 463 | ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, |
578 | &contig_blocks); | 464 | &contig_blocks, &ext_flags); |
579 | if (ret) { | 465 | if (ret) { |
580 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", | 466 | mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n", |
581 | (unsigned long long)iblock); | 467 | (unsigned long long)iblock); |
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, | |||
583 | goto bail; | 469 | goto bail; |
584 | } | 470 | } |
585 | 471 | ||
586 | map_bh(bh_result, inode->i_sb, p_blkno); | 472 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { |
473 | ocfs2_error(inode->i_sb, | ||
474 | "Inode %llu has a hole at block %llu\n", | ||
475 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
476 | (unsigned long long)iblock); | ||
477 | ret = -EROFS; | ||
478 | goto bail; | ||
479 | } | ||
480 | |||
481 | /* | ||
482 | * get_more_blocks() expects us to describe a hole by clearing | ||
483 | * the mapped bit on bh_result(). | ||
484 | * | ||
485 | * Consider an unwritten extent as a hole. | ||
486 | */ | ||
487 | if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN)) | ||
488 | map_bh(bh_result, inode->i_sb, p_blkno); | ||
489 | else { | ||
490 | /* | ||
491 | * ocfs2_prepare_inode_for_write() should have caught | ||
492 | * the case where we'd be filling a hole and triggered | ||
493 | * a buffered write instead. | ||
494 | */ | ||
495 | if (create) { | ||
496 | ret = -EIO; | ||
497 | mlog_errno(ret); | ||
498 | goto bail; | ||
499 | } | ||
500 | |||
501 | clear_buffer_mapped(bh_result); | ||
502 | } | ||
587 | 503 | ||
588 | /* make sure we don't map more than max_blocks blocks here as | 504 | /* make sure we don't map more than max_blocks blocks here as |
589 | that's all the kernel will handle at this point. */ | 505 | that's all the kernel will handle at this point. */ |
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, | |||
606 | void *private) | 522 | void *private) |
607 | { | 523 | { |
608 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 524 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
525 | int level; | ||
609 | 526 | ||
610 | /* this io's submitter should not have unlocked this before we could */ | 527 | /* this io's submitter should not have unlocked this before we could */ |
611 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); | 528 | BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); |
529 | |||
612 | ocfs2_iocb_clear_rw_locked(iocb); | 530 | ocfs2_iocb_clear_rw_locked(iocb); |
613 | up_read(&inode->i_alloc_sem); | 531 | |
614 | ocfs2_rw_unlock(inode, 0); | 532 | level = ocfs2_iocb_rw_locked_level(iocb); |
533 | if (!level) | ||
534 | up_read(&inode->i_alloc_sem); | ||
535 | ocfs2_rw_unlock(inode, level); | ||
615 | } | 536 | } |
616 | 537 | ||
617 | /* | 538 | /* |
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw, | |||
647 | 568 | ||
648 | mlog_entry_void(); | 569 | mlog_entry_void(); |
649 | 570 | ||
650 | /* | 571 | if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) { |
651 | * We get PR data locks even for O_DIRECT. This allows | 572 | /* |
652 | * concurrent O_DIRECT I/O but doesn't let O_DIRECT with | 573 | * We get PR data locks even for O_DIRECT. This |
653 | * extending and buffered zeroing writes race. If they did | 574 | * allows concurrent O_DIRECT I/O but doesn't let |
654 | * race then the buffered zeroing could be written back after | 575 | * O_DIRECT with extending and buffered zeroing writes |
655 | * the O_DIRECT I/O. It's one thing to tell people not to mix | 576 | * race. If they did race then the buffered zeroing |
656 | * buffered and O_DIRECT writes, but expecting them to | 577 | * could be written back after the O_DIRECT I/O. It's |
657 | * understand that file extension is also an implicit buffered | 578 | * one thing to tell people not to mix buffered and |
658 | * write is too much. By getting the PR we force writeback of | 579 | * O_DIRECT writes, but expecting them to understand |
659 | * the buffered zeroing before proceeding. | 580 | * that file extension is also an implicit buffered |
660 | */ | 581 | * write is too much. By getting the PR we force |
661 | ret = ocfs2_data_lock(inode, 0); | 582 | * writeback of the buffered zeroing before |
662 | if (ret < 0) { | 583 | * proceeding. |
663 | mlog_errno(ret); | 584 | */ |
664 | goto out; | 585 | ret = ocfs2_data_lock(inode, 0); |
586 | if (ret < 0) { | ||
587 | mlog_errno(ret); | ||
588 | goto out; | ||
589 | } | ||
590 | ocfs2_data_unlock(inode, 0); | ||
665 | } | 591 | } |
666 | ocfs2_data_unlock(inode, 0); | ||
667 | 592 | ||
668 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, | 593 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, |
669 | inode->i_sb->s_bdev, iov, offset, | 594 | inode->i_sb->s_bdev, iov, offset, |
@@ -675,11 +600,715 @@ out: | |||
675 | return ret; | 600 | return ret; |
676 | } | 601 | } |
677 | 602 | ||
603 | static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb, | ||
604 | u32 cpos, | ||
605 | unsigned int *start, | ||
606 | unsigned int *end) | ||
607 | { | ||
608 | unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE; | ||
609 | |||
610 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) { | ||
611 | unsigned int cpp; | ||
612 | |||
613 | cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits); | ||
614 | |||
615 | cluster_start = cpos % cpp; | ||
616 | cluster_start = cluster_start << osb->s_clustersize_bits; | ||
617 | |||
618 | cluster_end = cluster_start + osb->s_clustersize; | ||
619 | } | ||
620 | |||
621 | BUG_ON(cluster_start > PAGE_SIZE); | ||
622 | BUG_ON(cluster_end > PAGE_SIZE); | ||
623 | |||
624 | if (start) | ||
625 | *start = cluster_start; | ||
626 | if (end) | ||
627 | *end = cluster_end; | ||
628 | } | ||
629 | |||
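
ocfs2_figure_cluster_boundaries() only restricts the range when clusters are smaller than the page, since otherwise a page never spans a cluster boundary and the defaults of 0 and PAGE_CACHE_SIZE stand. A standalone, user-space check of the arithmetic with made-up geometry (64KB pages, 4KB clusters):

    /* Standalone check of the boundary math above; values are examples only. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int page_shift = 16;        /* 64KB pages (example) */
        unsigned int clustersize_bits = 12;  /* 4KB clusters (example) */
        unsigned int clustersize = 1u << clustersize_bits;
        unsigned int cpos = 21;              /* logical cluster offset in the file */

        unsigned int cpp = 1u << (page_shift - clustersize_bits); /* clusters per page */
        unsigned int start = (cpos % cpp) << clustersize_bits;
        unsigned int end = start + clustersize;

        /* cpos 21 is the 6th cluster of its page: bytes [20480, 24576) */
        printf("cluster %u covers page bytes [%u, %u)\n", cpos, start, end);
        return 0;
    }
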
630 | /* | ||
631 | * 'from' and 'to' are the region in the page to avoid zeroing. | ||
632 | * | ||
633 | * If pagesize > clustersize, this function will avoid zeroing outside | ||
634 | * of the cluster boundary. | ||
635 | * | ||
636 | * from == to == 0 is code for "zero the entire cluster region" | ||
637 | */ | ||
638 | static void ocfs2_clear_page_regions(struct page *page, | ||
639 | struct ocfs2_super *osb, u32 cpos, | ||
640 | unsigned from, unsigned to) | ||
641 | { | ||
642 | void *kaddr; | ||
643 | unsigned int cluster_start, cluster_end; | ||
644 | |||
645 | ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end); | ||
646 | |||
647 | kaddr = kmap_atomic(page, KM_USER0); | ||
648 | |||
649 | if (from || to) { | ||
650 | if (from > cluster_start) | ||
651 | memset(kaddr + cluster_start, 0, from - cluster_start); | ||
652 | if (to < cluster_end) | ||
653 | memset(kaddr + to, 0, cluster_end - to); | ||
654 | } else { | ||
655 | memset(kaddr + cluster_start, 0, cluster_end - cluster_start); | ||
656 | } | ||
657 | |||
658 | kunmap_atomic(kaddr, KM_USER0); | ||
659 | } | ||
660 | |||
661 | /* | ||
662 | * Some of this taken from block_prepare_write(). We already have our | ||
663 | * mapping by now though, and the entire write will be allocating or | ||
664 | * it won't, so not much need to use BH_New. | ||
665 | * | ||
666 | * This will also skip zeroing, which is handled externally. | ||
667 | */ | ||
668 | int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, | ||
669 | struct inode *inode, unsigned int from, | ||
670 | unsigned int to, int new) | ||
671 | { | ||
672 | int ret = 0; | ||
673 | struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; | ||
674 | unsigned int block_end, block_start; | ||
675 | unsigned int bsize = 1 << inode->i_blkbits; | ||
676 | |||
677 | if (!page_has_buffers(page)) | ||
678 | create_empty_buffers(page, bsize, 0); | ||
679 | |||
680 | head = page_buffers(page); | ||
681 | for (bh = head, block_start = 0; bh != head || !block_start; | ||
682 | bh = bh->b_this_page, block_start += bsize) { | ||
683 | block_end = block_start + bsize; | ||
684 | |||
685 | /* | ||
686 | * Ignore blocks outside of our i/o range - | ||
687 | * they may belong to unallocated clusters. | ||
688 | */ | ||
689 | if (block_start >= to || block_end <= from) { | ||
690 | if (PageUptodate(page)) | ||
691 | set_buffer_uptodate(bh); | ||
692 | continue; | ||
693 | } | ||
694 | |||
695 | /* | ||
696 | * For an allocating write with cluster size >= page | ||
697 | * size, we always write the entire page. | ||
698 | */ | ||
699 | |||
700 | if (buffer_new(bh)) | ||
701 | clear_buffer_new(bh); | ||
702 | |||
703 | if (!buffer_mapped(bh)) { | ||
704 | map_bh(bh, inode->i_sb, *p_blkno); | ||
705 | unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); | ||
706 | } | ||
707 | |||
708 | if (PageUptodate(page)) { | ||
709 | if (!buffer_uptodate(bh)) | ||
710 | set_buffer_uptodate(bh); | ||
711 | } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && | ||
712 | (block_start < from || block_end > to)) { | ||
713 | ll_rw_block(READ, 1, &bh); | ||
714 | *wait_bh++=bh; | ||
715 | } | ||
716 | |||
717 | *p_blkno = *p_blkno + 1; | ||
718 | } | ||
719 | |||
720 | /* | ||
721 | * If we issued read requests - let them complete. | ||
722 | */ | ||
723 | while(wait_bh > wait) { | ||
724 | wait_on_buffer(*--wait_bh); | ||
725 | if (!buffer_uptodate(*wait_bh)) | ||
726 | ret = -EIO; | ||
727 | } | ||
728 | |||
729 | if (ret == 0 || !new) | ||
730 | return ret; | ||
731 | |||
732 | /* | ||
733 | * If we get -EIO above, zero out any newly allocated blocks | ||
734 | * to avoid exposing stale data. | ||
735 | */ | ||
736 | bh = head; | ||
737 | block_start = 0; | ||
738 | do { | ||
739 | void *kaddr; | ||
740 | |||
741 | block_end = block_start + bsize; | ||
742 | if (block_end <= from) | ||
743 | goto next_bh; | ||
744 | if (block_start >= to) | ||
745 | break; | ||
746 | |||
747 | kaddr = kmap_atomic(page, KM_USER0); | ||
748 | memset(kaddr+block_start, 0, bh->b_size); | ||
749 | flush_dcache_page(page); | ||
750 | kunmap_atomic(kaddr, KM_USER0); | ||
751 | set_buffer_uptodate(bh); | ||
752 | mark_buffer_dirty(bh); | ||
753 | |||
754 | next_bh: | ||
755 | block_start = block_end; | ||
756 | bh = bh->b_this_page; | ||
757 | } while (bh != head); | ||
758 | |||
759 | return ret; | ||
760 | } | ||
761 | |||
762 | /* | ||
763 | * This will copy user data from the buffer page in the splice | ||
764 | * context. | ||
765 | * | ||
766 | * For now, we ignore SPLICE_F_MOVE as that would require some extra | ||
767 | * communication out all the way to ocfs2_write(). | ||
768 | */ | ||
769 | int ocfs2_map_and_write_splice_data(struct inode *inode, | ||
770 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
771 | unsigned int *ret_from, unsigned int *ret_to) | ||
772 | { | ||
773 | int ret; | ||
774 | unsigned int to, from, cluster_start, cluster_end; | ||
775 | char *src, *dst; | ||
776 | struct ocfs2_splice_write_priv *sp = wc->w_private; | ||
777 | struct pipe_buffer *buf = sp->s_buf; | ||
778 | unsigned long bytes, src_from; | ||
779 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
780 | |||
781 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
782 | &cluster_end); | ||
783 | |||
784 | from = sp->s_offset; | ||
785 | src_from = sp->s_buf_offset; | ||
786 | bytes = wc->w_count; | ||
787 | |||
788 | if (wc->w_large_pages) { | ||
789 | /* | ||
790 | * For cluster size < page size, we have to | ||
791 | * calculate pos within the cluster and obey | ||
792 | * the rightmost boundary. | ||
793 | */ | ||
794 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
795 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
796 | } | ||
797 | to = from + bytes; | ||
798 | |||
799 | if (wc->w_this_page_new) | ||
800 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
801 | cluster_start, cluster_end, 1); | ||
802 | else | ||
803 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
804 | from, to, 0); | ||
805 | if (ret) { | ||
806 | mlog_errno(ret); | ||
807 | goto out; | ||
808 | } | ||
809 | |||
810 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
811 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
812 | BUG_ON(from > osb->s_clustersize); | ||
813 | BUG_ON(to > osb->s_clustersize); | ||
814 | |||
815 | src = buf->ops->map(sp->s_pipe, buf, 1); | ||
816 | dst = kmap_atomic(wc->w_this_page, KM_USER1); | ||
817 | memcpy(dst + from, src + src_from, bytes); | ||
818 | kunmap_atomic(wc->w_this_page, KM_USER1); | ||
819 | buf->ops->unmap(sp->s_pipe, buf, src); | ||
820 | |||
821 | wc->w_finished_copy = 1; | ||
822 | |||
823 | *ret_from = from; | ||
824 | *ret_to = to; | ||
825 | out: | ||
826 | |||
827 | return bytes ? (unsigned int)bytes : ret; | ||
828 | } | ||
829 | |||
830 | /* | ||
831 | * This will copy user data from the iovec in the buffered write | ||
832 | * context. | ||
833 | */ | ||
834 | int ocfs2_map_and_write_user_data(struct inode *inode, | ||
835 | struct ocfs2_write_ctxt *wc, u64 *p_blkno, | ||
836 | unsigned int *ret_from, unsigned int *ret_to) | ||
837 | { | ||
838 | int ret; | ||
839 | unsigned int to, from, cluster_start, cluster_end; | ||
840 | unsigned long bytes, src_from; | ||
841 | char *dst; | ||
842 | struct ocfs2_buffered_write_priv *bp = wc->w_private; | ||
843 | const struct iovec *cur_iov = bp->b_cur_iov; | ||
844 | char __user *buf; | ||
845 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
846 | |||
847 | ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start, | ||
848 | &cluster_end); | ||
849 | |||
850 | buf = cur_iov->iov_base + bp->b_cur_off; | ||
851 | src_from = (unsigned long)buf & ~PAGE_CACHE_MASK; | ||
852 | |||
853 | from = wc->w_pos & (PAGE_CACHE_SIZE - 1); | ||
854 | |||
855 | /* | ||
856 | * This is a lot of comparisons, but it reads quite | ||
857 | * easily, which is important here. | ||
858 | */ | ||
859 | /* Stay within the src page */ | ||
860 | bytes = PAGE_SIZE - src_from; | ||
861 | /* Stay within the vector */ | ||
862 | bytes = min(bytes, | ||
863 | (unsigned long)(cur_iov->iov_len - bp->b_cur_off)); | ||
864 | /* Stay within count */ | ||
865 | bytes = min(bytes, (unsigned long)wc->w_count); | ||
866 | /* | ||
867 | * For clustersize > page size, just stay within | ||
868 | * target page, otherwise we have to calculate pos | ||
869 | * within the cluster and obey the rightmost | ||
870 | * boundary. | ||
871 | */ | ||
872 | if (wc->w_large_pages) { | ||
873 | /* | ||
874 | * For cluster size < page size, we have to | ||
875 | * calculate pos within the cluster and obey | ||
876 | * the rightmost boundary. | ||
877 | */ | ||
878 | bytes = min(bytes, (unsigned long)(osb->s_clustersize | ||
879 | - (wc->w_pos & (osb->s_clustersize - 1)))); | ||
880 | } else { | ||
881 | /* | ||
882 | * cluster size > page size is the most common | ||
883 | * case - we just stay within the target page | ||
884 | * boundary. | ||
885 | */ | ||
886 | bytes = min(bytes, PAGE_CACHE_SIZE - from); | ||
887 | } | ||
888 | |||
889 | to = from + bytes; | ||
890 | |||
891 | if (wc->w_this_page_new) | ||
892 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
893 | cluster_start, cluster_end, 1); | ||
894 | else | ||
895 | ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode, | ||
896 | from, to, 0); | ||
897 | if (ret) { | ||
898 | mlog_errno(ret); | ||
899 | goto out; | ||
900 | } | ||
901 | |||
902 | BUG_ON(from > PAGE_CACHE_SIZE); | ||
903 | BUG_ON(to > PAGE_CACHE_SIZE); | ||
904 | BUG_ON(from > osb->s_clustersize); | ||
905 | BUG_ON(to > osb->s_clustersize); | ||
906 | |||
907 | dst = kmap(wc->w_this_page); | ||
908 | memcpy(dst + from, bp->b_src_buf + src_from, bytes); | ||
909 | kunmap(wc->w_this_page); | ||
910 | |||
911 | /* | ||
912 | * XXX: This is slow, but simple. The caller of | ||
913 | * ocfs2_buffered_write_cluster() is responsible for | ||
914 | * passing through the iovecs, so it's difficult to | ||
915 | * predict what our next step is in here after our | ||
916 | * initial write. A future version should be pushing | ||
917 | * that iovec manipulation further down. | ||
918 | * | ||
919 | * By setting this, we indicate that a copy from user | ||
920 | * data was done, and subsequent calls for this | ||
921 | * cluster will skip copying more data. | ||
922 | */ | ||
923 | wc->w_finished_copy = 1; | ||
924 | |||
925 | *ret_from = from; | ||
926 | *ret_to = to; | ||
927 | out: | ||
928 | |||
929 | return bytes ? (unsigned int)bytes : ret; | ||
930 | } | ||
931 | |||
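
The chain of min() calls above clamps one copy to four limits: what is left of the source user page, what is left of the current iovec, what is left of the whole write, and (in the common clustersize >= pagesize case) what is left of the destination page. A standalone example with made-up numbers, assuming 4KB pages:

    /* Illustration of the clamping above with example values. */
    #include <stdio.h>

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    int main(void)
    {
        unsigned long page_size = 4096;
        unsigned long src_from = 3000;  /* offset into the source user page */
        unsigned long iov_left = 9000;  /* bytes left in the current iovec */
        unsigned long count = 6000;     /* bytes left in this write */
        unsigned long from = 512;       /* offset into the target page */

        unsigned long bytes = page_size - src_from;   /* 1096 */
        bytes = min_ul(bytes, iov_left);              /* 1096 */
        bytes = min_ul(bytes, count);                 /* 1096 */
        bytes = min_ul(bytes, page_size - from);      /* 1096 */

        printf("copy %lu bytes into [%lu, %lu)\n", bytes, from, from + bytes);
        return 0;
    }
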
932 | /* | ||
933 | * Map, fill and write a page to disk. | ||
934 | * | ||
935 | * The work of copying data is done via callback. Newly allocated | ||
936 | * pages which don't take user data will be zero'd (set 'new' to | ||
937 | * indicate an allocating write) | ||
938 | * | ||
939 | * Returns a negative error code or the number of bytes copied into | ||
940 | * the page. | ||
941 | */ | ||
942 | int ocfs2_write_data_page(struct inode *inode, handle_t *handle, | ||
943 | u64 *p_blkno, struct page *page, | ||
944 | struct ocfs2_write_ctxt *wc, int new) | ||
945 | { | ||
946 | int ret, copied = 0; | ||
947 | unsigned int from = 0, to = 0; | ||
948 | unsigned int cluster_start, cluster_end; | ||
949 | unsigned int zero_from = 0, zero_to = 0; | ||
950 | |||
951 | ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos, | ||
952 | &cluster_start, &cluster_end); | ||
953 | |||
954 | if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index | ||
955 | && !wc->w_finished_copy) { | ||
956 | |||
957 | wc->w_this_page = page; | ||
958 | wc->w_this_page_new = new; | ||
959 | ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to); | ||
960 | if (ret < 0) { | ||
961 | mlog_errno(ret); | ||
962 | goto out; | ||
963 | } | ||
964 | |||
965 | copied = ret; | ||
966 | |||
967 | zero_from = from; | ||
968 | zero_to = to; | ||
969 | if (new) { | ||
970 | from = cluster_start; | ||
971 | to = cluster_end; | ||
972 | } | ||
973 | } else { | ||
974 | /* | ||
975 | * If we haven't allocated the new page yet, we | ||
976 | * shouldn't be writing it out without copying user | ||
977 | * data. This is likely a math error from the caller. | ||
978 | */ | ||
979 | BUG_ON(!new); | ||
980 | |||
981 | from = cluster_start; | ||
982 | to = cluster_end; | ||
983 | |||
984 | ret = ocfs2_map_page_blocks(page, p_blkno, inode, | ||
985 | cluster_start, cluster_end, 1); | ||
986 | if (ret) { | ||
987 | mlog_errno(ret); | ||
988 | goto out; | ||
989 | } | ||
990 | } | ||
991 | |||
992 | /* | ||
993 | * Parts of newly allocated pages need to be zero'd. | ||
994 | * | ||
995 | * Above, we have also rewritten 'to' and 'from' - as far as | ||
996 | * the rest of the function is concerned, the entire cluster | ||
997 | * range inside of a page needs to be written. | ||
998 | * | ||
999 | * We can skip this if the page is up to date - it's already | ||
1000 | * been zero'd from being read in as a hole. | ||
1001 | */ | ||
1002 | if (new && !PageUptodate(page)) | ||
1003 | ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb), | ||
1004 | wc->w_cpos, zero_from, zero_to); | ||
1005 | |||
1006 | flush_dcache_page(page); | ||
1007 | |||
1008 | if (ocfs2_should_order_data(inode)) { | ||
1009 | ret = walk_page_buffers(handle, | ||
1010 | page_buffers(page), | ||
1011 | from, to, NULL, | ||
1012 | ocfs2_journal_dirty_data); | ||
1013 | if (ret < 0) | ||
1014 | mlog_errno(ret); | ||
1015 | } | ||
1016 | |||
1017 | /* | ||
1018 | * We don't use generic_commit_write() because we need to | ||
1019 | * handle our own i_size update. | ||
1020 | */ | ||
1021 | ret = block_commit_write(page, from, to); | ||
1022 | if (ret) | ||
1023 | mlog_errno(ret); | ||
1024 | out: | ||
1025 | |||
1026 | return copied ? copied : ret; | ||
1027 | } | ||
1028 | |||
1029 | /* | ||
1030 | * Do the actual write of some data into an inode. Optionally allocate | ||
1031 | * in order to fulfill the write. | ||
1032 | * | ||
1033 | * cpos is the logical cluster offset within the file to write at | ||
1034 | * | ||
1035 | * 'phys' is the physical mapping of that offset. a 'phys' value of | ||
1036 | * zero indicates that allocation is required. In this case, data_ac | ||
1037 | * and meta_ac should be valid (meta_ac can be null if metadata | ||
1038 | * allocation isn't required). | ||
1039 | */ | ||
1040 | static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle, | ||
1041 | struct buffer_head *di_bh, | ||
1042 | struct ocfs2_alloc_context *data_ac, | ||
1043 | struct ocfs2_alloc_context *meta_ac, | ||
1044 | struct ocfs2_write_ctxt *wc) | ||
1045 | { | ||
1046 | int ret, i, numpages = 1, new; | ||
1047 | unsigned int copied = 0; | ||
1048 | u32 tmp_pos; | ||
1049 | u64 v_blkno, p_blkno; | ||
1050 | struct address_space *mapping = file->f_mapping; | ||
1051 | struct inode *inode = mapping->host; | ||
1052 | unsigned long index, start; | ||
1053 | struct page **cpages; | ||
1054 | |||
1055 | new = phys == 0 ? 1 : 0; | ||
1056 | |||
1057 | /* | ||
1058 | * Figure out how many pages we'll be manipulating here. For | ||
1059 | * non allocating write, we just change the one | ||
1060 | * page. Otherwise, we'll need a whole clusters worth. | ||
1061 | */ | ||
1062 | if (new) | ||
1063 | numpages = ocfs2_pages_per_cluster(inode->i_sb); | ||
1064 | |||
1065 | cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS); | ||
1066 | if (!cpages) { | ||
1067 | ret = -ENOMEM; | ||
1068 | mlog_errno(ret); | ||
1069 | return ret; | ||
1070 | } | ||
1071 | |||
1072 | /* | ||
1073 | * Fill our page array first. That way we've grabbed enough so | ||
1074 | * that we can zero and flush if we error after adding the | ||
1075 | * extent. | ||
1076 | */ | ||
1077 | if (new) { | ||
1078 | start = ocfs2_align_clusters_to_page_index(inode->i_sb, | ||
1079 | wc->w_cpos); | ||
1080 | v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos); | ||
1081 | } else { | ||
1082 | start = wc->w_pos >> PAGE_CACHE_SHIFT; | ||
1083 | v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits; | ||
1084 | } | ||
1085 | |||
1086 | for(i = 0; i < numpages; i++) { | ||
1087 | index = start + i; | ||
1088 | |||
1089 | cpages[i] = grab_cache_page(mapping, index); | ||
1090 | if (!cpages[i]) { | ||
1091 | ret = -ENOMEM; | ||
1092 | mlog_errno(ret); | ||
1093 | goto out; | ||
1094 | } | ||
1095 | } | ||
1096 | |||
1097 | if (new) { | ||
1098 | /* | ||
1099 | * This is safe to call with the page locks - it won't take | ||
1100 | * any additional semaphores or cluster locks. | ||
1101 | */ | ||
1102 | tmp_pos = wc->w_cpos; | ||
1103 | ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, | ||
1104 | &tmp_pos, 1, di_bh, handle, | ||
1105 | data_ac, meta_ac, NULL); | ||
1106 | /* | ||
1107 | * This shouldn't happen because we must have already | ||
1108 | * calculated the correct meta data allocation required. The | ||
1109 | * internal tree allocation code should know how to increase | ||
1110 | * transaction credits itself. | ||
1111 | * | ||
1112 | * If need be, we could handle -EAGAIN for a | ||
1113 | * RESTART_TRANS here. | ||
1114 | */ | ||
1115 | mlog_bug_on_msg(ret == -EAGAIN, | ||
1116 | "Inode %llu: EAGAIN return during allocation.\n", | ||
1117 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
1118 | if (ret < 0) { | ||
1119 | mlog_errno(ret); | ||
1120 | goto out; | ||
1121 | } | ||
1122 | } | ||
1123 | |||
1124 | ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL, | ||
1125 | NULL); | ||
1126 | if (ret < 0) { | ||
1127 | |||
1128 | /* | ||
1129 | * XXX: Should we go readonly here? | ||
1130 | */ | ||
1131 | |||
1132 | mlog_errno(ret); | ||
1133 | goto out; | ||
1134 | } | ||
1135 | |||
1136 | BUG_ON(p_blkno == 0); | ||
1137 | |||
1138 | for(i = 0; i < numpages; i++) { | ||
1139 | ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i], | ||
1140 | wc, new); | ||
1141 | if (ret < 0) { | ||
1142 | mlog_errno(ret); | ||
1143 | goto out; | ||
1144 | } | ||
1145 | |||
1146 | copied += ret; | ||
1147 | } | ||
1148 | |||
1149 | out: | ||
1150 | for(i = 0; i < numpages; i++) { | ||
1151 | unlock_page(cpages[i]); | ||
1152 | mark_page_accessed(cpages[i]); | ||
1153 | page_cache_release(cpages[i]); | ||
1154 | } | ||
1155 | kfree(cpages); | ||
1156 | |||
1157 | return copied ? copied : ret; | ||
1158 | } | ||
1159 | |||
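
Because an allocating write must zero and write out every page backing the freshly allocated cluster, numpages above is one page for the non-allocating case and ocfs2_pages_per_cluster() worth of pages otherwise. A standalone illustration of that sizing with made-up geometry (4KB pages, 128KB clusters):

    /* Illustration of the numpages sizing above; geometry is an example. */
    #include <stdio.h>

    int main(void)
    {
        unsigned int page_shift = 12;        /* 4KB pages (example) */
        unsigned int clustersize_bits = 17;  /* 128KB clusters (example) */
        int new = 1;                         /* 1 for an allocating write */

        int numpages = 1;
        if (new && clustersize_bits > page_shift)
            numpages = 1 << (clustersize_bits - page_shift);

        printf("numpages = %d\n", numpages); /* 32 pages cover one 128KB cluster */
        return 0;
    }
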
1160 | static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc, | ||
1161 | struct ocfs2_super *osb, loff_t pos, | ||
1162 | size_t count, ocfs2_page_writer *cb, | ||
1163 | void *cb_priv) | ||
1164 | { | ||
1165 | wc->w_count = count; | ||
1166 | wc->w_pos = pos; | ||
1167 | wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits; | ||
1168 | wc->w_finished_copy = 0; | ||
1169 | |||
1170 | if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) | ||
1171 | wc->w_large_pages = 1; | ||
1172 | else | ||
1173 | wc->w_large_pages = 0; | ||
1174 | |||
1175 | wc->w_write_data_page = cb; | ||
1176 | wc->w_private = cb_priv; | ||
1177 | } | ||
1178 | |||
1179 | /* | ||
1180 | * Write a cluster to an inode. The cluster may not be allocated yet, | ||
1181 | * in which case it will be. This only exists for buffered writes - | ||
1182 | * O_DIRECT takes a more "traditional" path through the kernel. | ||
1183 | * | ||
1184 | * The caller is responsible for incrementing pos, written counts, etc | ||
1185 | * | ||
1186 | * For file systems that don't support sparse files, pre-allocation | ||
1187 | * and page zeroing up until cpos should be done prior to this | ||
1188 | * function call. | ||
1189 | * | ||
1190 | * Callers should be holding i_sem, and the rw cluster lock. | ||
1191 | * | ||
1192 | * Returns the number of user bytes written, or less than zero for | ||
1193 | * error. | ||
1194 | */ | ||
1195 | ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos, | ||
1196 | size_t count, ocfs2_page_writer *actor, | ||
1197 | void *priv) | ||
1198 | { | ||
1199 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | ||
1200 | ssize_t written = 0; | ||
1201 | u32 phys; | ||
1202 | struct inode *inode = file->f_mapping->host; | ||
1203 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | ||
1204 | struct buffer_head *di_bh = NULL; | ||
1205 | struct ocfs2_dinode *di; | ||
1206 | struct ocfs2_alloc_context *data_ac = NULL; | ||
1207 | struct ocfs2_alloc_context *meta_ac = NULL; | ||
1208 | handle_t *handle; | ||
1209 | struct ocfs2_write_ctxt wc; | ||
1210 | |||
1211 | ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv); | ||
1212 | |||
1213 | ret = ocfs2_meta_lock(inode, &di_bh, 1); | ||
1214 | if (ret) { | ||
1215 | mlog_errno(ret); | ||
1216 | goto out; | ||
1217 | } | ||
1218 | di = (struct ocfs2_dinode *)di_bh->b_data; | ||
1219 | |||
1220 | /* | ||
1221 | * Take alloc sem here to prevent concurrent lookups. That way | ||
1222 | * the mapping, zeroing and tree manipulation within | ||
1223 | * ocfs2_write() will be safe against ->readpage(). This | ||
1224 | * should also serve to lock out allocation from a shared | ||
1225 | * writeable region. | ||
1226 | */ | ||
1227 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1228 | |||
1229 | ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL); | ||
1230 | if (ret) { | ||
1231 | mlog_errno(ret); | ||
1232 | goto out_meta; | ||
1233 | } | ||
1234 | |||
1235 | /* phys == 0 means that allocation is required. */ | ||
1236 | if (phys == 0) { | ||
1237 | ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac); | ||
1238 | if (ret) { | ||
1239 | mlog_errno(ret); | ||
1240 | goto out_meta; | ||
1241 | } | ||
1242 | |||
1243 | credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1); | ||
1244 | } | ||
1245 | |||
1246 | ret = ocfs2_data_lock(inode, 1); | ||
1247 | if (ret) { | ||
1248 | mlog_errno(ret); | ||
1249 | goto out_meta; | ||
1250 | } | ||
1251 | |||
1252 | handle = ocfs2_start_trans(osb, credits); | ||
1253 | if (IS_ERR(handle)) { | ||
1254 | ret = PTR_ERR(handle); | ||
1255 | mlog_errno(ret); | ||
1256 | goto out_data; | ||
1257 | } | ||
1258 | |||
1259 | written = ocfs2_write(file, phys, handle, di_bh, data_ac, | ||
1260 | meta_ac, &wc); | ||
1261 | if (written < 0) { | ||
1262 | ret = written; | ||
1263 | mlog_errno(ret); | ||
1264 | goto out_commit; | ||
1265 | } | ||
1266 | |||
1267 | ret = ocfs2_journal_access(handle, inode, di_bh, | ||
1268 | OCFS2_JOURNAL_ACCESS_WRITE); | ||
1269 | if (ret) { | ||
1270 | mlog_errno(ret); | ||
1271 | goto out_commit; | ||
1272 | } | ||
1273 | |||
1274 | pos += written; | ||
1275 | if (pos > inode->i_size) { | ||
1276 | i_size_write(inode, pos); | ||
1277 | mark_inode_dirty(inode); | ||
1278 | } | ||
1279 | inode->i_blocks = ocfs2_inode_sector_count(inode); | ||
1280 | di->i_size = cpu_to_le64((u64)i_size_read(inode)); | ||
1281 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
1282 | di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); | ||
1283 | di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); | ||
1284 | |||
1285 | ret = ocfs2_journal_dirty(handle, di_bh); | ||
1286 | if (ret) | ||
1287 | mlog_errno(ret); | ||
1288 | |||
1289 | out_commit: | ||
1290 | ocfs2_commit_trans(osb, handle); | ||
1291 | |||
1292 | out_data: | ||
1293 | ocfs2_data_unlock(inode, 1); | ||
1294 | |||
1295 | out_meta: | ||
1296 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | ||
1297 | ocfs2_meta_unlock(inode, 1); | ||
1298 | |||
1299 | out: | ||
1300 | brelse(di_bh); | ||
1301 | if (data_ac) | ||
1302 | ocfs2_free_alloc_context(data_ac); | ||
1303 | if (meta_ac) | ||
1304 | ocfs2_free_alloc_context(meta_ac); | ||
1305 | |||
1306 | return written ? written : ret; | ||
1307 | } | ||
1308 | |||
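
ocfs2_buffered_write_cluster() deals with at most one cluster per call, so its caller in fs/ocfs2/file.c drives it in a loop, advancing the position and the iovec bookkeeping by however many user bytes each call reports. A much-simplified sketch of such a loop, assuming a caller whose priv bookkeeping already points at the right user data (the real caller advances iovecs through ocfs2_buffered_write_priv):

    /* Simplified caller sketch; not the actual fs/ocfs2/file.c code. */
    ssize_t write_all_clusters(struct file *file, loff_t pos, size_t count,
                               ocfs2_page_writer *actor, void *priv)
    {
        size_t done = 0;

        while (done < count) {
            ssize_t ret = ocfs2_buffered_write_cluster(file, pos + done,
                                                       count - done,
                                                       actor, priv);
            if (ret < 0)
                return done ? (ssize_t)done : ret;

            done += ret; /* number of user bytes copied this pass */
        }

        return done;
    }
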
678 | const struct address_space_operations ocfs2_aops = { | 1309 | const struct address_space_operations ocfs2_aops = { |
679 | .readpage = ocfs2_readpage, | 1310 | .readpage = ocfs2_readpage, |
680 | .writepage = ocfs2_writepage, | 1311 | .writepage = ocfs2_writepage, |
681 | .prepare_write = ocfs2_prepare_write, | ||
682 | .commit_write = ocfs2_commit_write, | ||
683 | .bmap = ocfs2_bmap, | 1312 | .bmap = ocfs2_bmap, |
684 | .sync_page = block_sync_page, | 1313 | .sync_page = block_sync_page, |
685 | .direct_IO = ocfs2_direct_IO, | 1314 | .direct_IO = ocfs2_direct_IO, |