Diffstat (limited to 'fs/xfs/linux-2.6/xfs_aops.c')
 -rw-r--r--   fs/xfs/linux-2.6/xfs_aops.c | 259
 1 file changed, 166 insertions(+), 93 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a3a4b5aaf5d9..c6c077978fe3 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -104,66 +104,114 @@ xfs_page_trace(
 #define xfs_page_trace(tag, inode, page, mask)
 #endif
 
-void
-linvfs_unwritten_done(
-        struct buffer_head *bh,
-        int uptodate)
+/*
+ * Schedule IO completion handling on a xfsdatad if this was
+ * the final hold on this ioend.
+ */
+STATIC void
+xfs_finish_ioend(
+        xfs_ioend_t *ioend)
 {
-        xfs_buf_t *pb = (xfs_buf_t *)bh->b_private;
+        if (atomic_dec_and_test(&ioend->io_remaining))
+                queue_work(xfsdatad_workqueue, &ioend->io_work);
+}
 
-        ASSERT(buffer_unwritten(bh));
-        bh->b_end_io = NULL;
-        clear_buffer_unwritten(bh);
-        if (!uptodate)
-                pagebuf_ioerror(pb, EIO);
-        if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-                pagebuf_iodone(pb, 1, 1);
-        }
-        end_buffer_async_write(bh, uptodate);
+STATIC void
+xfs_destroy_ioend(
+        xfs_ioend_t *ioend)
+{
+        vn_iowake(ioend->io_vnode);
+        mempool_free(ioend, xfs_ioend_pool);
 }
 
 /*
  * Issue transactions to convert a buffer range from unwritten
- * to written extents (buffered IO).
+ * to written extents.
  */
 STATIC void
-linvfs_unwritten_convert(
-        xfs_buf_t *bp)
+xfs_end_bio_unwritten(
+        void *data)
 {
-        vnode_t *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
-        int error;
+        xfs_ioend_t *ioend = data;
+        vnode_t *vp = ioend->io_vnode;
+        xfs_off_t offset = ioend->io_offset;
+        size_t size = ioend->io_size;
+        struct buffer_head *bh, *next;
+        int error;
+
+        if (ioend->io_uptodate)
+                VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
+
+        /* ioend->io_buffer_head is only non-NULL for buffered I/O */
+        for (bh = ioend->io_buffer_head; bh; bh = next) {
+                next = bh->b_private;
+
+                bh->b_end_io = NULL;
+                clear_buffer_unwritten(bh);
+                end_buffer_async_write(bh, ioend->io_uptodate);
+        }
 
-        BUG_ON(atomic_read(&bp->pb_hold) < 1);
-        VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
-                        BMAPI_UNWRITTEN, NULL, NULL, error);
-        XFS_BUF_SET_FSPRIVATE(bp, NULL);
-        XFS_BUF_CLR_IODONE_FUNC(bp);
-        XFS_BUF_UNDATAIO(bp);
-        iput(LINVFS_GET_IP(vp));
-        pagebuf_iodone(bp, 0, 0);
+        xfs_destroy_ioend(ioend);
 }
 
 /*
- * Issue transactions to convert a buffer range from unwritten
- * to written extents (direct IO).
+ * Allocate and initialise an IO completion structure.
+ * We need to track unwritten extent write completion here initially.
+ * We'll need to extend this for updating the ondisk inode size later
+ * (vs. incore size).
  */
-STATIC void
-linvfs_unwritten_convert_direct(
-        struct kiocb *iocb,
-        loff_t offset,
-        ssize_t size,
-        void *private)
+STATIC xfs_ioend_t *
+xfs_alloc_ioend(
+        struct inode *inode)
 {
-        struct inode *inode = iocb->ki_filp->f_dentry->d_inode;
-        ASSERT(!private || inode == (struct inode *)private);
+        xfs_ioend_t *ioend;
 
-        /* private indicates an unwritten extent lay beneath this IO */
-        if (private && size > 0) {
-                vnode_t *vp = LINVFS_GET_VP(inode);
-                int error;
+        ioend = mempool_alloc(xfs_ioend_pool, GFP_NOFS);
 
-                VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
-        }
+        /*
+         * Set the count to 1 initially, which will prevent an I/O
+         * completion callback from happening before we have started
+         * all the I/O from calling the completion routine too early.
+         */
+        atomic_set(&ioend->io_remaining, 1);
+        ioend->io_uptodate = 1; /* cleared if any I/O fails */
+        ioend->io_vnode = LINVFS_GET_VP(inode);
+        ioend->io_buffer_head = NULL;
+        atomic_inc(&ioend->io_vnode->v_iocount);
+        ioend->io_offset = 0;
+        ioend->io_size = 0;
+
+        INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
+
+        return ioend;
+}
+
+void
+linvfs_unwritten_done(
+        struct buffer_head *bh,
+        int uptodate)
+{
+        xfs_ioend_t *ioend = bh->b_private;
+        static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED;
+        unsigned long flags;
+
+        ASSERT(buffer_unwritten(bh));
+        bh->b_end_io = NULL;
+
+        if (!uptodate)
+                ioend->io_uptodate = 0;
+
+        /*
+         * Deep magic here. We reuse b_private in the buffer_heads to build
+         * a chain for completing the I/O from user context after we've issued
+         * a transaction to convert the unwritten extent.
+         */
+        spin_lock_irqsave(&unwritten_done_lock, flags);
+        bh->b_private = ioend->io_buffer_head;
+        ioend->io_buffer_head = bh;
+        spin_unlock_irqrestore(&unwritten_done_lock, flags);
+
+        xfs_finish_ioend(ioend);
 }
 
 STATIC int
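Aside: xfs_finish_ioend() and xfs_alloc_ioend() above implement the classic "bias the count by one" completion pattern: io_remaining starts at 1 so no per-buffer completion can fire the handler while submission is still in progress, and the submitter drops its own hold last. A minimal userspace C sketch of the same discipline using C11 atomics — the names here (ioend_t, drop_hold, complete_ioend) are illustrative stand-ins, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for xfs_ioend_t: one completion object shared
 * by many in-flight sub-I/Os. */
typedef struct {
        atomic_int remaining;   /* 1 hold for the submitter + 1 per sub-I/O */
} ioend_t;

/* Stand-in for the work the kernel would queue on an xfsdatad. */
static void complete_ioend(ioend_t *io)
{
        printf("last hold dropped: running completion handler\n");
        free(io);
}

/* Mirrors xfs_finish_ioend(): whoever drops the last hold completes. */
static void drop_hold(ioend_t *io)
{
        if (atomic_fetch_sub(&io->remaining, 1) == 1)
                complete_ioend(io);
}

int main(void)
{
        ioend_t *io = malloc(sizeof(*io));

        /* Bias the count to 1 so no sub-I/O completion can run the
         * handler while submission is still in progress. */
        atomic_init(&io->remaining, 1);

        for (int i = 0; i < 4; i++)
                atomic_fetch_add(&io->remaining, 1);    /* one hold per sub-I/O */
        for (int i = 0; i < 4; i++)
                drop_hold(io);          /* each completion drops its hold */

        drop_hold(io);                  /* submitter drops the initial bias last */
        return 0;
}

Because the bias keeps the count above zero throughout submission, it makes no difference whether the sub-I/Os complete before or after the submitter's final drop.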
@@ -255,7 +303,7 @@ xfs_probe_unwritten_page(
         struct address_space *mapping,
         pgoff_t index,
         xfs_iomap_t *iomapp,
-        xfs_buf_t *pb,
+        xfs_ioend_t *ioend,
         unsigned long max_offset,
         unsigned long *fsbs,
         unsigned int bbits)
@@ -283,7 +331,7 @@ xfs_probe_unwritten_page(
                         break;
                 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
                 set_buffer_unwritten_io(bh);
-                bh->b_private = pb;
+                bh->b_private = ioend;
                 p_offset += bh->b_size;
                 (*fsbs)++;
         } while ((bh = bh->b_this_page) != head);
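Aside: the b_private pointer stashed here is later reused by linvfs_unwritten_done() — the "deep magic" noted in the first hunk — to push each completed buffer_head onto an intrusive singly-linked chain headed by ioend->io_buffer_head. A small userspace C sketch of that pointer-reuse idiom under hypothetical names (struct buf, chain_buffer); the kernel version does the push under a spinlock with interrupts disabled:

#include <stdio.h>

/* Illustrative stand-in for a buffer_head: private points at the owner
 * during I/O, then doubles as the "next" link once the buffer completes. */
struct buf {
        int   id;
        void *private;
};

struct ioend {
        struct buf *buffer_head;        /* head of the completed-buffer chain */
};

/* Push a completed buffer onto the ioend's chain. */
static void chain_buffer(struct ioend *io, struct buf *bp)
{
        bp->private = io->buffer_head;
        io->buffer_head = bp;
}

int main(void)
{
        struct ioend io = { .buffer_head = NULL };
        struct buf bufs[3] = { { .id = 0 }, { .id = 1 }, { .id = 2 } };

        for (int i = 0; i < 3; i++)
                chain_buffer(&io, &bufs[i]);

        /* Walk the chain the way xfs_end_bio_unwritten() does, grabbing
         * "next" before the current buffer is handed back. */
        for (struct buf *bp = io.buffer_head, *next; bp; bp = next) {
                next = bp->private;
                printf("completing buffer %d\n", bp->id);
        }
        return 0;
}

The trick saves a dedicated list node per buffer: once a buffer has completed, its owner pointer is dead weight and can safely carry the link instead.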
@@ -434,34 +482,15 @@ xfs_map_unwritten(
 {
         struct buffer_head *bh = curr;
         xfs_iomap_t *tmp;
-        xfs_buf_t *pb;
-        loff_t offset, size;
+        xfs_ioend_t *ioend;
+        loff_t offset;
         unsigned long nblocks = 0;
 
         offset = start_page->index;
         offset <<= PAGE_CACHE_SHIFT;
         offset += p_offset;
 
-        /* get an "empty" pagebuf to manage IO completion
-         * Proper values will be set before returning */
-        pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
-        if (!pb)
-                return -EAGAIN;
-
-        /* Take a reference to the inode to prevent it from
-         * being reclaimed while we have outstanding unwritten
-         * extent IO on it.
-         */
-        if ((igrab(inode)) != inode) {
-                pagebuf_free(pb);
-                return -EAGAIN;
-        }
-
-        /* Set the count to 1 initially, this will stop an I/O
-         * completion callout which happens before we have started
-         * all the I/O from calling pagebuf_iodone too early.
-         */
-        atomic_set(&pb->pb_io_remaining, 1);
+        ioend = xfs_alloc_ioend(inode);
 
         /* First map forwards in the page consecutive buffers
          * covering this unwritten extent
@@ -474,12 +503,12 @@ xfs_map_unwritten(
                         break;
                 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
                 set_buffer_unwritten_io(bh);
-                bh->b_private = pb;
+                bh->b_private = ioend;
                 p_offset += bh->b_size;
                 nblocks++;
         } while ((bh = bh->b_this_page) != head);
 
-        atomic_add(nblocks, &pb->pb_io_remaining);
+        atomic_add(nblocks, &ioend->io_remaining);
 
         /* If we reached the end of the page, map forwards in any
          * following pages which are also covered by this extent.
@@ -496,13 +525,13 @@ xfs_map_unwritten(
                 tloff = min(tlast, tloff);
                 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
                         page = xfs_probe_unwritten_page(mapping,
-                                                tindex, iomapp, pb,
+                                                tindex, iomapp, ioend,
                                                 PAGE_CACHE_SIZE, &bs, bbits);
                         if (!page)
                                 break;
                         nblocks += bs;
-                        atomic_add(bs, &pb->pb_io_remaining);
-                        xfs_convert_page(inode, page, iomapp, wbc, pb,
+                        atomic_add(bs, &ioend->io_remaining);
+                        xfs_convert_page(inode, page, iomapp, wbc, ioend,
                                          startio, all_bh);
                         /* stop if converting the next page might add
                          * enough blocks that the corresponding byte
@@ -514,12 +543,12 @@ xfs_map_unwritten(
                 if (tindex == tlast &&
                     (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
                         page = xfs_probe_unwritten_page(mapping,
-                                                tindex, iomapp, pb,
+                                                tindex, iomapp, ioend,
                                                 pg_offset, &bs, bbits);
                         if (page) {
                                 nblocks += bs;
-                                atomic_add(bs, &pb->pb_io_remaining);
-                                xfs_convert_page(inode, page, iomapp, wbc, pb,
+                                atomic_add(bs, &ioend->io_remaining);
+                                xfs_convert_page(inode, page, iomapp, wbc, ioend,
                                                  startio, all_bh);
                                 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
                                         goto enough;
@@ -528,21 +557,9 @@ xfs_map_unwritten(
         }
 
 enough:
-        size = nblocks;                 /* NB: using 64bit number here */
-        size <<= block_bits;            /* convert fsb's to byte range */
-
-        XFS_BUF_DATAIO(pb);
-        XFS_BUF_ASYNC(pb);
-        XFS_BUF_SET_SIZE(pb, size);
-        XFS_BUF_SET_COUNT(pb, size);
-        XFS_BUF_SET_OFFSET(pb, offset);
-        XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
-        XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
-
-        if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
-                pagebuf_iodone(pb, 1, 1);
-        }
-
+        ioend->io_size = (xfs_off_t)nblocks << block_bits;
+        ioend->io_offset = offset;
+        xfs_finish_ioend(ioend);
         return 0;
 }
 
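Aside: both versions of the "enough:" epilogue are careful to widen before shifting — the old code's "NB: using 64bit number here" and the new (xfs_off_t)nblocks << block_bits serve the same purpose, since shifting a 32-bit block count into a byte count can silently wrap. A small demonstration with hypothetical values (2^20 blocks of 4KiB, i.e. 4GiB):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t nblocks = UINT32_C(1) << 20;   /* ~1M file-system blocks */
        unsigned block_bits = 12;               /* 4KiB blocks */

        /* Widen first, as ioend->io_size = (xfs_off_t)nblocks << block_bits
         * does, and the full byte count survives. */
        uint64_t bytes = (uint64_t)nblocks << block_bits;

        /* A 32-bit result type drops the high bits: 4GiB becomes 0. */
        uint32_t wrapped = (uint32_t)bytes;

        printf("64-bit: %" PRIu64 " bytes, squeezed into 32 bits: %" PRIu32 "\n",
               bytes, wrapped);
        return 0;
}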
@@ -787,7 +804,7 @@ xfs_page_state_convert(
                                 continue;
                         if (!iomp) {
                                 err = xfs_map_blocks(inode, offset, len, &iomap,
-                                                BMAPI_READ|BMAPI_IGNSTATE);
+                                                BMAPI_WRITE|BMAPI_IGNSTATE);
                                 if (err) {
                                         goto error;
                                 }
@@ -1028,6 +1045,44 @@ linvfs_get_blocks_direct(
                         create, 1, BMAPI_WRITE|BMAPI_DIRECT);
 }
 
+STATIC void
+linvfs_end_io_direct(
+        struct kiocb *iocb,
+        loff_t offset,
+        ssize_t size,
+        void *private)
+{
+        xfs_ioend_t *ioend = iocb->private;
+
+        /*
+         * Non-NULL private data means we need to issue a transaction to
+         * convert a range from unwritten to written extents.  This needs
+         * to happen from process context but aio+dio I/O completion
+         * happens from irq context so we need to defer it to a workqueue.
+         * This is not necessary for synchronous direct I/O, but we do
+         * it anyway to keep the code uniform and simpler.
+         *
+         * The core direct I/O code might be changed to always call the
+         * completion handler in the future, in which case all this can
+         * go away.
+         */
+        if (private && size > 0) {
+                ioend->io_offset = offset;
+                ioend->io_size = size;
+                xfs_finish_ioend(ioend);
+        } else {
+                ASSERT(size >= 0);
+                xfs_destroy_ioend(ioend);
+        }
+
+        /*
+         * blockdev_direct_IO can return an error even after the I/O
+         * completion handler was called.  Thus we need to protect
+         * against double-freeing.
+         */
+        iocb->private = NULL;
+}
+
 STATIC ssize_t
 linvfs_direct_IO(
         int rw,
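Aside: the final "iocb->private = NULL" above is an ownership hand-off: whichever path consumes the ioend clears the cookie, so the error path in linvfs_direct_IO() (next hunk) frees it only if completion never ran. A minimal userspace C sketch of that consume-and-clear guard — struct iocb, end_io, and submit here are illustrative stand-ins, not the kernel API:

#include <stdio.h>
#include <stdlib.h>

/* The completion cookie must be freed exactly once, by whichever side
 * still finds it non-NULL. */
struct iocb {
        void *private;
};

/* Completion handler: consumes the cookie and marks it consumed. */
static void end_io(struct iocb *iocb)
{
        free(iocb->private);
        iocb->private = NULL;
}

/* Stand-in for a submit path that can fail before the handler ran
 * (cookie still live) or report an error after it ran (cookie gone). */
static long submit(struct iocb *iocb, int fail_early)
{
        if (fail_early)
                return -1;      /* handler never ran */
        end_io(iocb);           /* handler consumed and cleared the cookie */
        return 0;               /* caller may still treat this as no progress */
}

int main(void)
{
        struct iocb iocb = { .private = malloc(16) };

        long ret = submit(&iocb, 0);

        /* Mirrors linvfs_direct_IO(): free only if completion didn't. */
        if (ret <= 0 && iocb.private) {
                free(iocb.private);
                iocb.private = NULL;
        }
        return 0;
}

Without the clear, an error return after a successful completion would free the ioend twice; without the NULL check, an early failure would leak it.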
@@ -1042,16 +1097,23 @@ linvfs_direct_IO(
         xfs_iomap_t iomap;
         int maps = 1;
         int error;
+        ssize_t ret;
 
         VOP_BMAP(vp, offset, 0, BMAPI_DEVICE, &iomap, &maps, error);
         if (error)
                 return -error;
 
-        return blockdev_direct_IO_own_locking(rw, iocb, inode,
+        iocb->private = xfs_alloc_ioend(inode);
+
+        ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
                 iomap.iomap_target->pbr_bdev,
                 iov, offset, nr_segs,
                 linvfs_get_blocks_direct,
-                linvfs_unwritten_convert_direct);
+                linvfs_end_io_direct);
+
+        if (unlikely(ret <= 0 && iocb->private))
+                xfs_destroy_ioend(iocb->private);
+        return ret;
 }
 
 
@@ -1202,6 +1264,16 @@ out_unlock:
         return error;
 }
 
+STATIC int
+linvfs_invalidate_page(
+        struct page *page,
+        unsigned long offset)
+{
+        xfs_page_trace(XFS_INVALIDPAGE_ENTER,
+                        page->mapping->host, page, offset);
+        return block_invalidatepage(page, offset);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. Possibly the page is already clean. We always
@@ -1279,6 +1351,7 @@ struct address_space_operations linvfs_aops = {
         .writepage = linvfs_writepage,
         .sync_page = block_sync_page,
         .releasepage = linvfs_release_page,
+        .invalidatepage = linvfs_invalidate_page,
         .prepare_write = linvfs_prepare_write,
         .commit_write = generic_commit_write,
         .bmap = linvfs_bmap,
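Aside: wiring the new hook into linvfs_aops shows the kernel's ops-table idiom — a struct of function pointers populated with designated initializers, where slots left out of the initializer stay NULL and callers probe before calling. A compilable sketch of the idiom with hypothetical names (struct aops, demo_aops); this is not the real address_space_operations definition:

#include <stdio.h>

struct page;    /* opaque here, as callers only pass pointers through */

struct aops {
        int (*writepage)(struct page *pg);
        int (*releasepage)(struct page *pg);
        int (*invalidatepage)(struct page *pg, unsigned long offset);
};

static int my_writepage(struct page *pg)   { (void)pg; return 0; }
static int my_releasepage(struct page *pg) { (void)pg; return 0; }
static int my_invalidatepage(struct page *pg, unsigned long off)
{
        (void)pg;
        printf("invalidate from offset %lu\n", off);
        return 0;
}

static const struct aops demo_aops = {
        .writepage      = my_writepage,
        .releasepage    = my_releasepage,
        .invalidatepage = my_invalidatepage,    /* the newly wired-up hook */
};

int main(void)
{
        /* Callers check for the hook before using it, since omitted
         * slots are NULL. */
        if (demo_aops.invalidatepage)
                demo_aops.invalidatepage(NULL, 0);
        return 0;
}

Designated initializers are what make a patch like this a one-line change: field order in the struct does not matter, and every other slot keeps its existing binding.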
