aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/linux-2.6/xfs_aops.c
diff options
context:
space:
mode:
author: Christoph Hellwig <hch@sgi.com> 2006-01-10 23:40:13 -0500
committer: Nathan Scott <nathans@sgi.com> 2006-01-10 23:40:13 -0500
commit: f6d6d4fcd180f8e47bf6b13fc6cce1e6c156d0ea (patch)
tree: 2d4e981bb61f564904f7b7ca1ab69d163c0f69dd /fs/xfs/linux-2.6/xfs_aops.c
parent: ce8e922c0e79c8093452ba9a124981332b75706b (diff)
[XFS] Initial pass at going directly-to-bio on the buffered IO path. This
allows us to submit much larger I/Os instead of sending down lots of small buffer_heads. To do this we need a rather complicated I/O submission and completion tracking infrastructure. Part of the latter was already merged a long time ago for direct I/O support. Part of the problem is that we need to track sub-pagesize regions, and for that we still need buffer_heads for the time being. Long-term I hope we can move to better data structures and/or maybe move this to fs/mpage.c instead of having it in XFS. Original patch from Nathan Scott with various updates from David Chinner and Christoph Hellwig. SGI-PV: 947118 SGI-Modid: xfs-linux-melb:xfs-kern:203822a Signed-off-by: Christoph Hellwig <hch@sgi.com> Signed-off-by: Nathan Scott <nathans@sgi.com>
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_aops.c')
-rw-r--r-- fs/xfs/linux-2.6/xfs_aops.c 776
1 files changed, 421 insertions, 355 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 3f6b9e29850c..e99d04d3fe82 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -43,8 +43,6 @@
43#include <linux/writeback.h> 43#include <linux/writeback.h>
44 44
45STATIC void xfs_count_page_state(struct page *, int *, int *, int *); 45STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
46STATIC void xfs_convert_page(struct inode *, struct page *, xfs_iomap_t *,
47 struct writeback_control *wbc, void *, int, int);
48 46
49#if defined(XFS_RW_TRACE) 47#if defined(XFS_RW_TRACE)
50void 48void
@@ -58,7 +56,7 @@ xfs_page_trace(
58 bhv_desc_t *bdp; 56 bhv_desc_t *bdp;
59 vnode_t *vp = LINVFS_GET_VP(inode); 57 vnode_t *vp = LINVFS_GET_VP(inode);
60 loff_t isize = i_size_read(inode); 58 loff_t isize = i_size_read(inode);
61 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT; 59 loff_t offset = page_offset(page);
62 int delalloc = -1, unmapped = -1, unwritten = -1; 60 int delalloc = -1, unmapped = -1, unwritten = -1;
63 61
64 if (page_has_buffers(page)) 62 if (page_has_buffers(page))
@@ -103,15 +101,56 @@ xfs_finish_ioend(
103 queue_work(xfsdatad_workqueue, &ioend->io_work); 101 queue_work(xfsdatad_workqueue, &ioend->io_work);
104} 102}
105 103
104/*
105 * We're now finished for good with this ioend structure.
106 * Update the page state via the associated buffer_heads,
107 * release holds on the inode and bio, and finally free
108 * up memory. Do not use the ioend after this.
109 */
106STATIC void 110STATIC void
107xfs_destroy_ioend( 111xfs_destroy_ioend(
108 xfs_ioend_t *ioend) 112 xfs_ioend_t *ioend)
109{ 113{
114 struct buffer_head *bh, *next;
115
116 for (bh = ioend->io_buffer_head; bh; bh = next) {
117 next = bh->b_private;
118 bh->b_end_io(bh, ioend->io_uptodate);
119 }
120
110 vn_iowake(ioend->io_vnode); 121 vn_iowake(ioend->io_vnode);
111 mempool_free(ioend, xfs_ioend_pool); 122 mempool_free(ioend, xfs_ioend_pool);
112} 123}
113 124
114/* 125/*
126 * Buffered IO write completion for delayed allocate extents.
127 * TODO: Update ondisk isize now that we know the file data
128 * has been flushed (i.e. the notorious "NULL file" problem).
129 */
130STATIC void
131xfs_end_bio_delalloc(
132 void *data)
133{
134 xfs_ioend_t *ioend = data;
135
136 xfs_destroy_ioend(ioend);
137}
138
139/*
140 * Buffered IO write completion for regular, written extents.
141 */
142STATIC void
143xfs_end_bio_written(
144 void *data)
145{
146 xfs_ioend_t *ioend = data;
147
148 xfs_destroy_ioend(ioend);
149}
150
151/*
152 * IO write completion for unwritten extents.
153 *
115 * Issue transactions to convert a buffer range from unwritten 154 * Issue transactions to convert a buffer range from unwritten
116 * to written extents. 155 * to written extents.
117 */ 156 */
@@ -123,21 +162,10 @@ xfs_end_bio_unwritten(
123 vnode_t *vp = ioend->io_vnode; 162 vnode_t *vp = ioend->io_vnode;
124 xfs_off_t offset = ioend->io_offset; 163 xfs_off_t offset = ioend->io_offset;
125 size_t size = ioend->io_size; 164 size_t size = ioend->io_size;
126 struct buffer_head *bh, *next;
127 int error; 165 int error;
128 166
129 if (ioend->io_uptodate) 167 if (ioend->io_uptodate)
130 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error); 168 VOP_BMAP(vp, offset, size, BMAPI_UNWRITTEN, NULL, NULL, error);
131
132 /* ioend->io_buffer_head is only non-NULL for buffered I/O */
133 for (bh = ioend->io_buffer_head; bh; bh = next) {
134 next = bh->b_private;
135
136 bh->b_end_io = NULL;
137 clear_buffer_unwritten(bh);
138 end_buffer_async_write(bh, ioend->io_uptodate);
139 }
140
141 xfs_destroy_ioend(ioend); 169 xfs_destroy_ioend(ioend);
142} 170}
143 171
@@ -149,7 +177,8 @@ xfs_end_bio_unwritten(
149 */ 177 */
150STATIC xfs_ioend_t * 178STATIC xfs_ioend_t *
151xfs_alloc_ioend( 179xfs_alloc_ioend(
152 struct inode *inode) 180 struct inode *inode,
181 unsigned int type)
153{ 182{
154 xfs_ioend_t *ioend; 183 xfs_ioend_t *ioend;
155 184
@@ -162,45 +191,25 @@ xfs_alloc_ioend(
162 */ 191 */
163 atomic_set(&ioend->io_remaining, 1); 192 atomic_set(&ioend->io_remaining, 1);
164 ioend->io_uptodate = 1; /* cleared if any I/O fails */ 193 ioend->io_uptodate = 1; /* cleared if any I/O fails */
194 ioend->io_list = NULL;
195 ioend->io_type = type;
165 ioend->io_vnode = LINVFS_GET_VP(inode); 196 ioend->io_vnode = LINVFS_GET_VP(inode);
166 ioend->io_buffer_head = NULL; 197 ioend->io_buffer_head = NULL;
198 ioend->io_buffer_tail = NULL;
167 atomic_inc(&ioend->io_vnode->v_iocount); 199 atomic_inc(&ioend->io_vnode->v_iocount);
168 ioend->io_offset = 0; 200 ioend->io_offset = 0;
169 ioend->io_size = 0; 201 ioend->io_size = 0;
170 202
171 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend); 203 if (type == IOMAP_UNWRITTEN)
204 INIT_WORK(&ioend->io_work, xfs_end_bio_unwritten, ioend);
205 else if (type == IOMAP_DELAY)
206 INIT_WORK(&ioend->io_work, xfs_end_bio_delalloc, ioend);
207 else
208 INIT_WORK(&ioend->io_work, xfs_end_bio_written, ioend);
172 209
173 return ioend; 210 return ioend;
174} 211}
175 212
176void
177linvfs_unwritten_done(
178 struct buffer_head *bh,
179 int uptodate)
180{
181 xfs_ioend_t *ioend = bh->b_private;
182 static spinlock_t unwritten_done_lock = SPIN_LOCK_UNLOCKED;
183 unsigned long flags;
184
185 ASSERT(buffer_unwritten(bh));
186 bh->b_end_io = NULL;
187
188 if (!uptodate)
189 ioend->io_uptodate = 0;
190
191 /*
192 * Deep magic here. We reuse b_private in the buffer_heads to build
193 * a chain for completing the I/O from user context after we've issued
194 * a transaction to convert the unwritten extent.
195 */
196 spin_lock_irqsave(&unwritten_done_lock, flags);
197 bh->b_private = ioend->io_buffer_head;
198 ioend->io_buffer_head = bh;
199 spin_unlock_irqrestore(&unwritten_done_lock, flags);
200
201 xfs_finish_ioend(ioend);
202}
203
204STATIC int 213STATIC int
205xfs_map_blocks( 214xfs_map_blocks(
206 struct inode *inode, 215 struct inode *inode,
@@ -228,7 +237,7 @@ xfs_offset_to_map(
228 xfs_iomap_t *iomapp, 237 xfs_iomap_t *iomapp,
229 unsigned long offset) 238 unsigned long offset)
230{ 239{
231 loff_t full_offset; /* offset from start of file */ 240 xfs_off_t full_offset; /* offset from start of file */
232 241
233 ASSERT(offset < PAGE_CACHE_SIZE); 242 ASSERT(offset < PAGE_CACHE_SIZE);
234 243
@@ -243,16 +252,223 @@ xfs_offset_to_map(
243 return NULL; 252 return NULL;
244} 253}
245 254
255/*
256 * BIO completion handler for buffered IO.
257 */
258STATIC int
259xfs_end_bio(
260 struct bio *bio,
261 unsigned int bytes_done,
262 int error)
263{
264 xfs_ioend_t *ioend = bio->bi_private;
265
266 if (bio->bi_size)
267 return 1;
268
269 ASSERT(ioend);
270 ASSERT(atomic_read(&bio->bi_cnt) >= 1);
271
272 /* Toss bio and pass work off to an xfsdatad thread */
273 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
274 ioend->io_uptodate = 0;
275 bio->bi_private = NULL;
276 bio->bi_end_io = NULL;
277
278 bio_put(bio);
279 xfs_finish_ioend(ioend);
280 return 0;
281}
282
283STATIC void
284xfs_submit_ioend_bio(
285 xfs_ioend_t *ioend,
286 struct bio *bio)
287{
288 atomic_inc(&ioend->io_remaining);
289
290 bio->bi_private = ioend;
291 bio->bi_end_io = xfs_end_bio;
292
293 submit_bio(WRITE, bio);
294 ASSERT(!bio_flagged(bio, BIO_EOPNOTSUPP));
295 bio_put(bio);
296}
297
298STATIC struct bio *
299xfs_alloc_ioend_bio(
300 struct buffer_head *bh)
301{
302 struct bio *bio;
303 int nvecs = bio_get_nr_vecs(bh->b_bdev);
304
305 do {
306 bio = bio_alloc(GFP_NOIO, nvecs);
307 nvecs >>= 1;
308 } while (!bio);
309
310 ASSERT(bio->bi_private == NULL);
311 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
312 bio->bi_bdev = bh->b_bdev;
313 bio_get(bio);
314 return bio;
315}
316
317STATIC void
318xfs_start_buffer_writeback(
319 struct buffer_head *bh)
320{
321 ASSERT(buffer_mapped(bh));
322 ASSERT(buffer_locked(bh));
323 ASSERT(!buffer_delay(bh));
324 ASSERT(!buffer_unwritten(bh));
325
326 mark_buffer_async_write(bh);
327 set_buffer_uptodate(bh);
328 clear_buffer_dirty(bh);
329}
330
331STATIC void
332xfs_start_page_writeback(
333 struct page *page,
334 struct writeback_control *wbc,
335 int clear_dirty,
336 int buffers)
337{
338 ASSERT(PageLocked(page));
339 ASSERT(!PageWriteback(page));
340 set_page_writeback(page);
341 if (clear_dirty)
342 clear_page_dirty(page);
343 unlock_page(page);
344 if (!buffers) {
345 end_page_writeback(page);
346 wbc->pages_skipped++; /* We didn't write this page */
347 }
348}
349
350static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
351{
352 return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
353}
354
355/*
356 * Submit all of the bios for all of the ioends we have saved up,
357 * covering the initial writepage page and also any probed pages.
358 */
359STATIC void
360xfs_submit_ioend(
361 xfs_ioend_t *ioend)
362{
363 xfs_ioend_t *next;
364 struct buffer_head *bh;
365 struct bio *bio;
366 sector_t lastblock = 0;
367
368 do {
369 next = ioend->io_list;
370 bio = NULL;
371
372 for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
373 xfs_start_buffer_writeback(bh);
374
375 if (!bio) {
376 retry:
377 bio = xfs_alloc_ioend_bio(bh);
378 } else if (bh->b_blocknr != lastblock + 1) {
379 xfs_submit_ioend_bio(ioend, bio);
380 goto retry;
381 }
382
383 if (bio_add_buffer(bio, bh) != bh->b_size) {
384 xfs_submit_ioend_bio(ioend, bio);
385 goto retry;
386 }
387
388 lastblock = bh->b_blocknr;
389 }
390 if (bio)
391 xfs_submit_ioend_bio(ioend, bio);
392 xfs_finish_ioend(ioend);
393 } while ((ioend = next) != NULL);
394}
395
396/*
397 * Cancel submission of all buffer_heads so far in this endio.
398 * Toss the endio too. Only ever called for the initial page
399 * in a writepage request, so only ever one page.
400 */
401STATIC void
402xfs_cancel_ioend(
403 xfs_ioend_t *ioend)
404{
405 xfs_ioend_t *next;
406 struct buffer_head *bh, *next_bh;
407
408 do {
409 next = ioend->io_list;
410 bh = ioend->io_buffer_head;
411 do {
412 next_bh = bh->b_private;
413 clear_buffer_async_write(bh);
414 unlock_buffer(bh);
415 } while ((bh = next_bh) != NULL);
416
417 vn_iowake(ioend->io_vnode);
418 mempool_free(ioend, xfs_ioend_pool);
419 } while ((ioend = next) != NULL);
420}
421
422/*
423 * Test to see if we've been building up a completion structure for
424 * earlier buffers -- if so, we try to append to this ioend if we
425 * can, otherwise we finish off any current ioend and start another.
426 * Return true if we've finished the given ioend.
427 */
428STATIC void
429xfs_add_to_ioend(
430 struct inode *inode,
431 struct buffer_head *bh,
432 unsigned int p_offset,
433 unsigned int type,
434 xfs_ioend_t **result,
435 int need_ioend)
436{
437 xfs_ioend_t *ioend = *result;
438
439 if (!ioend || need_ioend || type != ioend->io_type) {
440 xfs_ioend_t *previous = *result;
441 xfs_off_t offset;
442
443 offset = (xfs_off_t)bh->b_page->index << PAGE_CACHE_SHIFT;
444 offset += p_offset;
445 ioend = xfs_alloc_ioend(inode, type);
446 ioend->io_offset = offset;
447 ioend->io_buffer_head = bh;
448 ioend->io_buffer_tail = bh;
449 if (previous)
450 previous->io_list = ioend;
451 *result = ioend;
452 } else {
453 ioend->io_buffer_tail->b_private = bh;
454 ioend->io_buffer_tail = bh;
455 }
456
457 bh->b_private = NULL;
458 ioend->io_size += bh->b_size;
459}
460
246STATIC void 461STATIC void
247xfs_map_at_offset( 462xfs_map_at_offset(
248 struct page *page, 463 struct page *page,
249 struct buffer_head *bh, 464 struct buffer_head *bh,
250 unsigned long offset, 465 unsigned long offset,
251 int block_bits, 466 int block_bits,
252 xfs_iomap_t *iomapp) 467 xfs_iomap_t *iomapp,
468 xfs_ioend_t *ioend)
253{ 469{
254 xfs_daddr_t bn; 470 xfs_daddr_t bn;
255 loff_t delta; 471 xfs_off_t delta;
256 int sector_shift; 472 int sector_shift;
257 473
258 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE)); 474 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
@@ -276,60 +492,7 @@ xfs_map_at_offset(
276 bh->b_bdev = iomapp->iomap_target->bt_bdev; 492 bh->b_bdev = iomapp->iomap_target->bt_bdev;
277 set_buffer_mapped(bh); 493 set_buffer_mapped(bh);
278 clear_buffer_delay(bh); 494 clear_buffer_delay(bh);
279} 495 clear_buffer_unwritten(bh);
280
281/*
282 * Look for a page at index which is unlocked and contains our
283 * unwritten extent flagged buffers at its head. Returns page
284 * locked and with an extra reference count, and length of the
285 * unwritten extent component on this page that we can write,
286 * in units of filesystem blocks.
287 */
288STATIC struct page *
289xfs_probe_unwritten_page(
290 struct address_space *mapping,
291 pgoff_t index,
292 xfs_iomap_t *iomapp,
293 xfs_ioend_t *ioend,
294 unsigned long max_offset,
295 unsigned long *fsbs,
296 unsigned int bbits)
297{
298 struct page *page;
299
300 page = find_trylock_page(mapping, index);
301 if (!page)
302 return NULL;
303 if (PageWriteback(page))
304 goto out;
305
306 if (page->mapping && page_has_buffers(page)) {
307 struct buffer_head *bh, *head;
308 unsigned long p_offset = 0;
309
310 *fsbs = 0;
311 bh = head = page_buffers(page);
312 do {
313 if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
314 break;
315 if (!xfs_offset_to_map(page, iomapp, p_offset))
316 break;
317 if (p_offset >= max_offset)
318 break;
319 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
320 set_buffer_unwritten_io(bh);
321 bh->b_private = ioend;
322 p_offset += bh->b_size;
323 (*fsbs)++;
324 } while ((bh = bh->b_this_page) != head);
325
326 if (p_offset)
327 return page;
328 }
329
330out:
331 unlock_page(page);
332 return NULL;
333} 496}
334 497
335/* 498/*
@@ -372,15 +535,16 @@ out:
372 return ret; 535 return ret;
373} 536}
374 537
375STATIC unsigned int 538STATIC size_t
376xfs_probe_unmapped_cluster( 539xfs_probe_unmapped_cluster(
377 struct inode *inode, 540 struct inode *inode,
378 struct page *startpage, 541 struct page *startpage,
379 struct buffer_head *bh, 542 struct buffer_head *bh,
380 struct buffer_head *head) 543 struct buffer_head *head)
381{ 544{
545 size_t len, total = 0;
382 pgoff_t tindex, tlast, tloff; 546 pgoff_t tindex, tlast, tloff;
383 unsigned int pg_offset, len, total = 0; 547 unsigned int pg_offset;
384 struct address_space *mapping = inode->i_mapping; 548 struct address_space *mapping = inode->i_mapping;
385 549
386 /* First sum forwards in this page */ 550 /* First sum forwards in this page */
@@ -414,14 +578,15 @@ xfs_probe_unmapped_cluster(
414} 578}
415 579
416/* 580/*
417 * Probe for a given page (index) in the inode and test if it is delayed 581 * Probe for a given page (index) in the inode and test if it is suitable
418 * and without unwritten buffers. Returns page locked and with an extra 582 * for writing as part of an unwritten or delayed allocate extent.
419 * reference count. 583 * Returns page locked and with an extra reference count if so, else NULL.
420 */ 584 */
421STATIC struct page * 585STATIC struct page *
422xfs_probe_delalloc_page( 586xfs_probe_delayed_page(
423 struct inode *inode, 587 struct inode *inode,
424 pgoff_t index) 588 pgoff_t index,
589 unsigned int type)
425{ 590{
426 struct page *page; 591 struct page *page;
427 592
@@ -437,12 +602,12 @@ xfs_probe_delalloc_page(
437 602
438 bh = head = page_buffers(page); 603 bh = head = page_buffers(page);
439 do { 604 do {
440 if (buffer_unwritten(bh)) { 605 if (buffer_unwritten(bh))
441 acceptable = 0; 606 acceptable = (type == IOMAP_UNWRITTEN);
607 else if (buffer_delay(bh))
608 acceptable = (type == IOMAP_DELAY);
609 else
442 break; 610 break;
443 } else if (buffer_delay(bh)) {
444 acceptable = 1;
445 }
446 } while ((bh = bh->b_this_page) != head); 611 } while ((bh = bh->b_this_page) != head);
447 612
448 if (acceptable) 613 if (acceptable)
@@ -454,161 +619,30 @@ out:
454 return NULL; 619 return NULL;
455} 620}
456 621
457STATIC int
458xfs_map_unwritten(
459 struct inode *inode,
460 struct page *start_page,
461 struct buffer_head *head,
462 struct buffer_head *curr,
463 unsigned long p_offset,
464 int block_bits,
465 xfs_iomap_t *iomapp,
466 struct writeback_control *wbc,
467 int startio,
468 int all_bh)
469{
470 struct buffer_head *bh = curr;
471 xfs_iomap_t *tmp;
472 xfs_ioend_t *ioend;
473 loff_t offset;
474 unsigned long nblocks = 0;
475
476 offset = start_page->index;
477 offset <<= PAGE_CACHE_SHIFT;
478 offset += p_offset;
479
480 ioend = xfs_alloc_ioend(inode);
481
482 /* First map forwards in the page consecutive buffers
483 * covering this unwritten extent
484 */
485 do {
486 if (!buffer_unwritten(bh))
487 break;
488 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
489 if (!tmp)
490 break;
491 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
492 set_buffer_unwritten_io(bh);
493 bh->b_private = ioend;
494 p_offset += bh->b_size;
495 nblocks++;
496 } while ((bh = bh->b_this_page) != head);
497
498 atomic_add(nblocks, &ioend->io_remaining);
499
500 /* If we reached the end of the page, map forwards in any
501 * following pages which are also covered by this extent.
502 */
503 if (bh == head) {
504 struct address_space *mapping = inode->i_mapping;
505 pgoff_t tindex, tloff, tlast;
506 unsigned long bs;
507 unsigned int pg_offset, bbits = inode->i_blkbits;
508 struct page *page;
509
510 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
511 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
512 tloff = min(tlast, tloff);
513 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
514 page = xfs_probe_unwritten_page(mapping,
515 tindex, iomapp, ioend,
516 PAGE_CACHE_SIZE, &bs, bbits);
517 if (!page)
518 break;
519 nblocks += bs;
520 atomic_add(bs, &ioend->io_remaining);
521 xfs_convert_page(inode, page, iomapp, wbc, ioend,
522 startio, all_bh);
523 /* stop if converting the next page might add
524 * enough blocks that the corresponding byte
525 * count won't fit in our ulong page buf length */
526 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
527 goto enough;
528 }
529
530 if (tindex == tlast &&
531 (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
532 page = xfs_probe_unwritten_page(mapping,
533 tindex, iomapp, ioend,
534 pg_offset, &bs, bbits);
535 if (page) {
536 nblocks += bs;
537 atomic_add(bs, &ioend->io_remaining);
538 xfs_convert_page(inode, page, iomapp, wbc, ioend,
539 startio, all_bh);
540 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
541 goto enough;
542 }
543 }
544 }
545
546enough:
547 ioend->io_size = (xfs_off_t)nblocks << block_bits;
548 ioend->io_offset = offset;
549 xfs_finish_ioend(ioend);
550 return 0;
551}
552
553STATIC void
554xfs_submit_page(
555 struct page *page,
556 struct writeback_control *wbc,
557 struct buffer_head *bh_arr[],
558 int bh_count,
559 int probed_page,
560 int clear_dirty)
561{
562 struct buffer_head *bh;
563 int i;
564
565 BUG_ON(PageWriteback(page));
566 if (bh_count)
567 set_page_writeback(page);
568 if (clear_dirty)
569 clear_page_dirty(page);
570 unlock_page(page);
571
572 if (bh_count) {
573 for (i = 0; i < bh_count; i++) {
574 bh = bh_arr[i];
575 mark_buffer_async_write(bh);
576 if (buffer_unwritten(bh))
577 set_buffer_unwritten_io(bh);
578 set_buffer_uptodate(bh);
579 clear_buffer_dirty(bh);
580 }
581
582 for (i = 0; i < bh_count; i++)
583 submit_bh(WRITE, bh_arr[i]);
584
585 if (probed_page && clear_dirty)
586 wbc->nr_to_write--; /* Wrote an "extra" page */
587 }
588}
589
590/* 622/*
591 * Allocate & map buffers for page given the extent map. Write it out. 623 * Allocate & map buffers for page given the extent map. Write it out.
592 * except for the original page of a writepage, this is called on 624 * except for the original page of a writepage, this is called on
593 * delalloc/unwritten pages only, for the original page it is possible 625 * delalloc/unwritten pages only, for the original page it is possible
594 * that the page has no mapping at all. 626 * that the page has no mapping at all.
595 */ 627 */
596STATIC void 628STATIC int
597xfs_convert_page( 629xfs_convert_page(
598 struct inode *inode, 630 struct inode *inode,
599 struct page *page, 631 struct page *page,
600 xfs_iomap_t *iomapp, 632 xfs_iomap_t *iomapp,
633 xfs_ioend_t **ioendp,
601 struct writeback_control *wbc, 634 struct writeback_control *wbc,
602 void *private, 635 void *private,
603 int startio, 636 int startio,
604 int all_bh) 637 int all_bh)
605{ 638{
606 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 639 struct buffer_head *bh, *head;
607 xfs_iomap_t *mp = iomapp, *tmp; 640 xfs_iomap_t *mp = iomapp, *tmp;
608 unsigned long offset, end_offset; 641 unsigned long p_offset, end_offset;
609 int index = 0; 642 unsigned int type;
610 int bbits = inode->i_blkbits; 643 int bbits = inode->i_blkbits;
611 int len, page_dirty; 644 int len, page_dirty;
645 int count = 0, done = 0, uptodate = 1;
612 646
613 end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)); 647 end_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1));
614 648
@@ -621,59 +655,66 @@ xfs_convert_page(
621 end_offset = roundup(end_offset, len); 655 end_offset = roundup(end_offset, len);
622 page_dirty = end_offset / len; 656 page_dirty = end_offset / len;
623 657
624 offset = 0; 658 p_offset = 0;
625 bh = head = page_buffers(page); 659 bh = head = page_buffers(page);
626 do { 660 do {
627 if (offset >= end_offset) 661 if (p_offset >= end_offset)
628 break; 662 break;
629 if (!(PageUptodate(page) || buffer_uptodate(bh))) 663 if (!buffer_uptodate(bh))
664 uptodate = 0;
665 if (!(PageUptodate(page) || buffer_uptodate(bh))) {
666 done = 1;
630 continue; 667 continue;
631 if (buffer_mapped(bh) && all_bh && 668 }
632 !(buffer_unwritten(bh) || buffer_delay(bh))) { 669
633 if (startio) { 670 if (buffer_unwritten(bh))
671 type = IOMAP_UNWRITTEN;
672 else if (buffer_delay(bh))
673 type = IOMAP_DELAY;
674 else {
675 type = 0;
676 if (!(buffer_mapped(bh) && all_bh && startio)) {
677 done = 1;
678 } else if (startio) {
634 lock_buffer(bh); 679 lock_buffer(bh);
635 bh_arr[index++] = bh; 680 xfs_add_to_ioend(inode, bh, p_offset,
681 type, ioendp, done);
682 count++;
636 page_dirty--; 683 page_dirty--;
637 } 684 }
638 continue; 685 continue;
639 } 686 }
640 tmp = xfs_offset_to_map(page, mp, offset); 687 tmp = xfs_offset_to_map(page, mp, p_offset);
641 if (!tmp) 688 if (!tmp) {
689 done = 1;
642 continue; 690 continue;
691 }
643 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE)); 692 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
644 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY)); 693 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
645 694
646 /* If this is a new unwritten extent buffer (i.e. one 695 xfs_map_at_offset(page, bh, p_offset, bbits, tmp, *ioendp);
647 * that we haven't passed in private data for, we must
648 * now map this buffer too.
649 */
650 if (buffer_unwritten(bh) && !bh->b_end_io) {
651 ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
652 xfs_map_unwritten(inode, page, head, bh, offset,
653 bbits, tmp, wbc, startio, all_bh);
654 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
655 xfs_map_at_offset(page, bh, offset, bbits, tmp);
656 if (buffer_unwritten(bh)) {
657 set_buffer_unwritten_io(bh);
658 bh->b_private = private;
659 ASSERT(private);
660 }
661 }
662 if (startio) { 696 if (startio) {
663 bh_arr[index++] = bh; 697 xfs_add_to_ioend(inode, bh, p_offset,
698 type, ioendp, done);
699 count++;
664 } else { 700 } else {
665 set_buffer_dirty(bh); 701 set_buffer_dirty(bh);
666 unlock_buffer(bh); 702 unlock_buffer(bh);
667 mark_buffer_dirty(bh); 703 mark_buffer_dirty(bh);
668 } 704 }
669 page_dirty--; 705 page_dirty--;
670 } while (offset += len, (bh = bh->b_this_page) != head); 706 } while (p_offset += len, (bh = bh->b_this_page) != head);
671 707
672 if (startio && index) { 708 if (uptodate && bh == head)
673 xfs_submit_page(page, wbc, bh_arr, index, 1, !page_dirty); 709 SetPageUptodate(page);
674 } else { 710
675 unlock_page(page); 711 if (startio) {
712 if (count)
713 wbc->nr_to_write--;
714 xfs_start_page_writeback(page, wbc, !page_dirty, count);
676 } 715 }
716
717 return done;
677} 718}
678 719
679/* 720/*
@@ -685,19 +726,22 @@ xfs_cluster_write(
685 struct inode *inode, 726 struct inode *inode,
686 pgoff_t tindex, 727 pgoff_t tindex,
687 xfs_iomap_t *iomapp, 728 xfs_iomap_t *iomapp,
729 xfs_ioend_t **ioendp,
688 struct writeback_control *wbc, 730 struct writeback_control *wbc,
689 int startio, 731 int startio,
690 int all_bh, 732 int all_bh,
691 pgoff_t tlast) 733 pgoff_t tlast)
692{ 734{
693 struct page *page; 735 struct page *page;
736 unsigned int type = (*ioendp)->io_type;
737 int done;
694 738
695 for (; tindex <= tlast; tindex++) { 739 for (done = 0; tindex <= tlast && !done; tindex++) {
696 page = xfs_probe_delalloc_page(inode, tindex); 740 page = xfs_probe_delayed_page(inode, tindex, type);
697 if (!page) 741 if (!page)
698 break; 742 break;
699 xfs_convert_page(inode, page, iomapp, wbc, NULL, 743 done = xfs_convert_page(inode, page, iomapp, ioendp,
700 startio, all_bh); 744 wbc, NULL, startio, all_bh);
701 } 745 }
702} 746}
703 747
@@ -728,18 +772,21 @@ xfs_page_state_convert(
728 int startio, 772 int startio,
729 int unmapped) /* also implies page uptodate */ 773 int unmapped) /* also implies page uptodate */
730{ 774{
731 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head; 775 struct buffer_head *bh, *head;
732 xfs_iomap_t *iomp, iomap; 776 xfs_iomap_t *iomp, iomap;
777 xfs_ioend_t *ioend = NULL, *iohead = NULL;
733 loff_t offset; 778 loff_t offset;
734 unsigned long p_offset = 0; 779 unsigned long p_offset = 0;
780 unsigned int type;
735 __uint64_t end_offset; 781 __uint64_t end_offset;
736 pgoff_t end_index, last_index, tlast; 782 pgoff_t end_index, last_index, tlast;
737 int len, err, i, cnt = 0, uptodate = 1; 783 int flags, len, err, done = 1;
738 int flags; 784 int uptodate = 1;
739 int page_dirty; 785 int page_dirty, count = 0, trylock_flag = 0;
740 786
741 /* wait for other IO threads? */ 787 /* wait for other IO threads? */
742 flags = (startio && wbc->sync_mode != WB_SYNC_NONE) ? 0 : BMAPI_TRYLOCK; 788 if (startio && wbc->sync_mode != WB_SYNC_NONE)
789 trylock_flag |= BMAPI_TRYLOCK;
743 790
744 /* Is this page beyond the end of the file? */ 791 /* Is this page beyond the end of the file? */
745 offset = i_size_read(inode); 792 offset = i_size_read(inode);
@@ -754,98 +801,98 @@ xfs_page_state_convert(
754 } 801 }
755 } 802 }
756 803
757 end_offset = min_t(unsigned long long,
758 (loff_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
759 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
760
761 /* 804 /*
762 * page_dirty is initially a count of buffers on the page before 805 * page_dirty is initially a count of buffers on the page before
763 * EOF and is decrememted as we move each into a cleanable state. 806 * EOF and is decrememted as we move each into a cleanable state.
764 */ 807 *
808 * Derivation:
809 *
810 * End offset is the highest offset that this page should represent.
811 * If we are on the last page, (end_offset & (PAGE_CACHE_SIZE - 1))
812 * will evaluate non-zero and be less than PAGE_CACHE_SIZE and
813 * hence give us the correct page_dirty count. On any other page,
814 * it will be zero and in that case we need page_dirty to be the
815 * count of buffers on the page.
816 */
817 end_offset = min_t(unsigned long long,
818 (xfs_off_t)(page->index + 1) << PAGE_CACHE_SHIFT, offset);
765 len = 1 << inode->i_blkbits; 819 len = 1 << inode->i_blkbits;
766 p_offset = max(p_offset, PAGE_CACHE_SIZE); 820 p_offset = min_t(unsigned long, end_offset & (PAGE_CACHE_SIZE - 1),
767 p_offset = roundup(p_offset, len); 821 PAGE_CACHE_SIZE);
822 p_offset = p_offset ? roundup(p_offset, len) : PAGE_CACHE_SIZE;
768 page_dirty = p_offset / len; 823 page_dirty = p_offset / len;
769 824
770 iomp = NULL; 825 iomp = NULL;
771 p_offset = 0;
772 bh = head = page_buffers(page); 826 bh = head = page_buffers(page);
827 offset = page_offset(page);
828
829 /* TODO: fix up "done" variable and iomap pointer (boolean) */
830 /* TODO: cleanup count and page_dirty */
773 831
774 do { 832 do {
775 if (offset >= end_offset) 833 if (offset >= end_offset)
776 break; 834 break;
777 if (!buffer_uptodate(bh)) 835 if (!buffer_uptodate(bh))
778 uptodate = 0; 836 uptodate = 0;
779 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) 837 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio) {
838 done = 1;
780 continue; 839 continue;
840 }
781 841
782 if (iomp) { 842 if (iomp) {
783 iomp = xfs_offset_to_map(page, &iomap, p_offset); 843 iomp = xfs_offset_to_map(page, &iomap, p_offset);
844 done = (iomp == NULL);
784 } 845 }
785 846
786 /* 847 /*
787 * First case, map an unwritten extent and prepare for 848 * First case, map an unwritten extent and prepare for
788 * extent state conversion transaction on completion. 849 * extent state conversion transaction on completion.
789 */ 850 *
790 if (buffer_unwritten(bh)) {
791 if (!startio)
792 continue;
793 if (!iomp) {
794 err = xfs_map_blocks(inode, offset, len, &iomap,
795 BMAPI_WRITE|BMAPI_IGNSTATE);
796 if (err) {
797 goto error;
798 }
799 iomp = xfs_offset_to_map(page, &iomap,
800 p_offset);
801 }
802 if (iomp) {
803 if (!bh->b_end_io) {
804 err = xfs_map_unwritten(inode, page,
805 head, bh, p_offset,
806 inode->i_blkbits, iomp,
807 wbc, startio, unmapped);
808 if (err) {
809 goto error;
810 }
811 } else {
812 set_bit(BH_Lock, &bh->b_state);
813 }
814 BUG_ON(!buffer_locked(bh));
815 bh_arr[cnt++] = bh;
816 page_dirty--;
817 }
818 /*
819 * Second case, allocate space for a delalloc buffer. 851 * Second case, allocate space for a delalloc buffer.
820 * We can return EAGAIN here in the release page case. 852 * We can return EAGAIN here in the release page case.
821 */ 853 */
822 } else if (buffer_delay(bh)) { 854 if (buffer_unwritten(bh) || buffer_delay(bh)) {
855 if (buffer_unwritten(bh)) {
856 type = IOMAP_UNWRITTEN;
857 flags = BMAPI_WRITE|BMAPI_IGNSTATE;
858 } else {
859 type = IOMAP_DELAY;
860 flags = BMAPI_ALLOCATE;
861 if (!startio)
862 flags |= trylock_flag;
863 }
864
823 if (!iomp) { 865 if (!iomp) {
866 done = 1;
824 err = xfs_map_blocks(inode, offset, len, &iomap, 867 err = xfs_map_blocks(inode, offset, len, &iomap,
825 BMAPI_ALLOCATE | flags); 868 flags);
826 if (err) { 869 if (err)
827 goto error; 870 goto error;
828 }
829 iomp = xfs_offset_to_map(page, &iomap, 871 iomp = xfs_offset_to_map(page, &iomap,
830 p_offset); 872 p_offset);
873 done = (iomp == NULL);
831 } 874 }
832 if (iomp) { 875 if (iomp) {
833 xfs_map_at_offset(page, bh, p_offset, 876 xfs_map_at_offset(page, bh, p_offset,
834 inode->i_blkbits, iomp); 877 inode->i_blkbits, iomp, ioend);
835 if (startio) { 878 if (startio) {
836 bh_arr[cnt++] = bh; 879 xfs_add_to_ioend(inode, bh, p_offset,
880 type, &ioend, done);
837 } else { 881 } else {
838 set_buffer_dirty(bh); 882 set_buffer_dirty(bh);
839 unlock_buffer(bh); 883 unlock_buffer(bh);
840 mark_buffer_dirty(bh); 884 mark_buffer_dirty(bh);
841 } 885 }
842 page_dirty--; 886 page_dirty--;
887 count++;
888 } else {
889 done = 1;
843 } 890 }
844 } else if ((buffer_uptodate(bh) || PageUptodate(page)) && 891 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
845 (unmapped || startio)) { 892 (unmapped || startio)) {
846 893
894 type = 0;
847 if (!buffer_mapped(bh)) { 895 if (!buffer_mapped(bh)) {
848 int size;
849 896
850 /* 897 /*
851 * Getting here implies an unmapped buffer 898 * Getting here implies an unmapped buffer
@@ -853,6 +900,8 @@ xfs_page_state_convert(
853 * need to write the whole page out. 900 * need to write the whole page out.
854 */ 901 */
855 if (!iomp) { 902 if (!iomp) {
903 int size;
904
856 size = xfs_probe_unmapped_cluster( 905 size = xfs_probe_unmapped_cluster(
857 inode, page, bh, head); 906 inode, page, bh, head);
858 err = xfs_map_blocks(inode, offset, 907 err = xfs_map_blocks(inode, offset,
@@ -863,52 +912,70 @@ xfs_page_state_convert(
863 } 912 }
864 iomp = xfs_offset_to_map(page, &iomap, 913 iomp = xfs_offset_to_map(page, &iomap,
865 p_offset); 914 p_offset);
915 done = (iomp == NULL);
866 } 916 }
867 if (iomp) { 917 if (iomp) {
868 xfs_map_at_offset(page, 918 xfs_map_at_offset(page, bh, p_offset,
869 bh, p_offset, 919 inode->i_blkbits, iomp,
870 inode->i_blkbits, iomp); 920 ioend);
871 if (startio) { 921 if (startio) {
872 bh_arr[cnt++] = bh; 922 xfs_add_to_ioend(inode,
923 bh, p_offset, type,
924 &ioend, done);
873 } else { 925 } else {
874 set_buffer_dirty(bh); 926 set_buffer_dirty(bh);
875 unlock_buffer(bh); 927 unlock_buffer(bh);
876 mark_buffer_dirty(bh); 928 mark_buffer_dirty(bh);
877 } 929 }
878 page_dirty--; 930 page_dirty--;
931 count++;
932 } else {
933 done = 1;
879 } 934 }
880 } else if (startio) { 935 } else if (startio) {
881 if (buffer_uptodate(bh) && 936 if (buffer_uptodate(bh) &&
882 !test_and_set_bit(BH_Lock, &bh->b_state)) { 937 !test_and_set_bit(BH_Lock, &bh->b_state)) {
883 bh_arr[cnt++] = bh; 938 ASSERT(buffer_mapped(bh));
939 xfs_add_to_ioend(inode,
940 bh, p_offset, type,
941 &ioend, done);
884 page_dirty--; 942 page_dirty--;
943 count++;
944 } else {
945 done = 1;
885 } 946 }
947 } else {
948 done = 1;
886 } 949 }
887 } 950 }
888 } while (offset += len, p_offset += len, 951
889 ((bh = bh->b_this_page) != head)); 952 if (!iohead)
953 iohead = ioend;
954
955 } while (offset += len, ((bh = bh->b_this_page) != head));
890 956
891 if (uptodate && bh == head) 957 if (uptodate && bh == head)
892 SetPageUptodate(page); 958 SetPageUptodate(page);
893 959
894 if (startio) { 960 if (startio)
895 xfs_submit_page(page, wbc, bh_arr, cnt, 0, !page_dirty); 961 xfs_start_page_writeback(page, wbc, 1, count);
896 }
897 962
898 if (iomp) { 963 if (ioend && iomp && !done) {
899 offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >> 964 offset = (iomp->iomap_offset + iomp->iomap_bsize - 1) >>
900 PAGE_CACHE_SHIFT; 965 PAGE_CACHE_SHIFT;
901 tlast = min_t(pgoff_t, offset, last_index); 966 tlast = min_t(pgoff_t, offset, last_index);
902 xfs_cluster_write(inode, page->index + 1, iomp, wbc, 967 xfs_cluster_write(inode, page->index + 1, iomp, &ioend,
903 startio, unmapped, tlast); 968 wbc, startio, unmapped, tlast);
904 } 969 }
905 970
971 if (iohead)
972 xfs_submit_ioend(iohead);
973
906 return page_dirty; 974 return page_dirty;
907 975
908error: 976error:
909 for (i = 0; i < cnt; i++) { 977 if (iohead)
910 unlock_buffer(bh_arr[i]); 978 xfs_cancel_ioend(iohead);
911 }
912 979
913 /* 980 /*
914 * If it's delalloc and we have nowhere to put it, 981 * If it's delalloc and we have nowhere to put it,
@@ -916,9 +983,8 @@ error:
916 * us to try again. 983 * us to try again.
917 */ 984 */
918 if (err != -EAGAIN) { 985 if (err != -EAGAIN) {
919 if (!unmapped) { 986 if (!unmapped)
920 block_invalidatepage(page, 0); 987 block_invalidatepage(page, 0);
921 }
922 ClearPageUptodate(page); 988 ClearPageUptodate(page);
923 } 989 }
924 return err; 990 return err;
@@ -1094,7 +1160,7 @@ linvfs_direct_IO(
1094 if (error) 1160 if (error)
1095 return -error; 1161 return -error;
1096 1162
1097 iocb->private = xfs_alloc_ioend(inode); 1163 iocb->private = xfs_alloc_ioend(inode, IOMAP_UNWRITTEN);
1098 1164
1099 ret = blockdev_direct_IO_own_locking(rw, iocb, inode, 1165 ret = blockdev_direct_IO_own_locking(rw, iocb, inode,
1100 iomap.iomap_target->bt_bdev, 1166 iomap.iomap_target->bt_bdev,