author	Peng Tao <bergwolf@gmail.com>	2012-01-12 10:18:46 -0500
committer	Trond Myklebust <Trond.Myklebust@netapp.com>	2012-01-12 16:52:10 -0500
commit	7c5465d6ccd759caa959828e2add5603518dafc4 (patch)
tree	0d7a8dff18b667da2d2d1394c8e4585e7e098920
parent	c0411a94a8f318379464e29dd81db806249dbca6 (diff)
pnfsblock: alloc short extent before submit bio
As discussed earlier, it is better for the block client to allocate memory for tracking extent state before submitting a bio. This patch does so by allocating one short_extent for every INVALID extent touched by the write pagelist, and one for every zeroing page created, saving them in the layout header. In end_io we can then use them to create commit list entries and avoid memory allocation there.

Signed-off-by: Peng Tao <peng_tao@emc.com>
Signed-off-by: Benny Halevy <bhalevy@tonian.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
-rw-r--r--	fs/nfs/blocklayout/blocklayout.c	74
-rw-r--r--	fs/nfs/blocklayout/blocklayout.h	9
-rw-r--r--	fs/nfs/blocklayout/extents.c	85
3 files changed, 131 insertions(+), 37 deletions(-)
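
The idea behind the patch lends itself to a compact illustration. Below is a minimal, hypothetical userspace sketch (plain C, not kernel code) of the reserve-before-I/O pattern: tracking nodes are pushed onto a list at submission time, where allocation may fail cleanly; the completion path only pops pre-reserved nodes and so can never fail on allocation; unused reservations are released on error. The names mirror the patch, but the single-threaded list and all helpers here are simplified stand-ins, not the kernel API.

	/* Sketch of the pre-allocation pattern; compile with any C99 compiler. */
	#include <stdio.h>
	#include <stdlib.h>

	struct short_extent {
		unsigned long long offset;	/* filled in at completion time */
		unsigned long long length;
		struct short_extent *next;
	};

	static struct short_extent *pool;	/* stand-in for marks->im_extents */

	/* Before submitting I/O: may fail, and failure is easy to handle here. */
	static int push_one_short_extent(void)
	{
		struct short_extent *new = malloc(sizeof(*new));

		if (!new)
			return -1;
		new->next = pool;
		pool = new;
		return 0;
	}

	/* Completion path: consumes a reservation, never allocates. */
	static struct short_extent *pop_one_short_extent(void)
	{
		struct short_extent *rv = pool;

		if (rv)
			pool = rv->next;
		return rv;
	}

	/* Error path: drop reservations that will no longer be consumed. */
	static void free_short_extents(int num_to_free)
	{
		while (num_to_free-- > 0) {
			struct short_extent *se = pop_one_short_extent();

			if (!se)
				break;
			free(se);
		}
	}

	int main(void)
	{
		int reserved = 0;

		/* "Submission": reserve one node per INVALID extent touched. */
		for (int i = 0; i < 3; i++)
			if (push_one_short_extent() == 0)
				reserved++;

		/* "Completion": pop a node; no allocation can fail here. */
		struct short_extent *se = pop_one_short_extent();
		se->offset = 0;
		se->length = 4096;
		printf("committed extent at %llu, len %llu\n", se->offset, se->length);
		free(se);

		/* Pretend the remaining I/O failed: release unused reservations. */
		free_short_extents(reserved - 1);
		return 0;
	}
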
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 9215c6644a3..48cfac31f64 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -90,8 +90,9 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect)
  */
 struct parallel_io {
 	struct kref refcnt;
-	void (*pnfs_callback) (void *data);
+	void (*pnfs_callback) (void *data, int num_se);
 	void *data;
+	int bse_count;
 };
 
 static inline struct parallel_io *alloc_parallel(void *data)
@@ -102,6 +103,7 @@ static inline struct parallel_io *alloc_parallel(void *data)
 	if (rv) {
 		rv->data = data;
 		kref_init(&rv->refcnt);
+		rv->bse_count = 0;
 	}
 	return rv;
 }
@@ -116,7 +118,7 @@ static void destroy_parallel(struct kref *kref)
 	struct parallel_io *p = container_of(kref, struct parallel_io, refcnt);
 
 	dprintk("%s enter\n", __func__);
-	p->pnfs_callback(p->data);
+	p->pnfs_callback(p->data, p->bse_count);
 	kfree(p);
 }
 
@@ -216,7 +218,7 @@ static void bl_read_cleanup(struct work_struct *work)
 }
 
 static void
-bl_end_par_io_read(void *data)
+bl_end_par_io_read(void *data, int unused)
 {
 	struct nfs_read_data *rdata = data;
 
@@ -317,6 +319,7 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 {
 	sector_t isect, end;
 	struct pnfs_block_extent *be;
+	struct pnfs_block_short_extent *se;
 
 	dprintk("%s(%llu, %u)\n", __func__, offset, count);
 	if (count == 0)
@@ -329,8 +332,11 @@ static void mark_extents_written(struct pnfs_block_layout *bl,
 		be = bl_find_get_extent(bl, isect, NULL);
 		BUG_ON(!be); /* FIXME */
 		len = min(end, be->be_f_offset + be->be_length) - isect;
-		if (be->be_state == PNFS_BLOCK_INVALID_DATA)
-			bl_mark_for_commit(be, isect, len); /* What if fails? */
+		if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+			se = bl_pop_one_short_extent(be->be_inval);
+			BUG_ON(!se);
+			bl_mark_for_commit(be, isect, len, se);
+		}
 		isect += len;
 		bl_put_extent(be);
 	}
@@ -352,7 +358,8 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 		end_page_writeback(page);
 		page_cache_release(page);
 	} while (bvec >= bio->bi_io_vec);
-	if (!uptodate) {
+
+	if (unlikely(!uptodate)) {
 		if (!wdata->pnfs_error)
 			wdata->pnfs_error = -EIO;
 		pnfs_set_lo_fail(wdata->lseg);
@@ -361,7 +368,6 @@ static void bl_end_io_write_zero(struct bio *bio, int err)
 	put_parallel(par);
 }
 
-/* This is basically copied from mpage_end_io_read */
 static void bl_end_io_write(struct bio *bio, int err)
 {
 	struct parallel_io *par = bio->bi_private;
@@ -387,7 +393,7 @@ static void bl_write_cleanup(struct work_struct *work)
 	dprintk("%s enter\n", __func__);
 	task = container_of(work, struct rpc_task, u.tk_work);
 	wdata = container_of(task, struct nfs_write_data, task);
-	if (!wdata->pnfs_error) {
+	if (likely(!wdata->pnfs_error)) {
 		/* Marks for LAYOUTCOMMIT */
 		mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
 				     wdata->args.offset, wdata->args.count);
@@ -396,10 +402,15 @@ static void bl_write_cleanup(struct work_struct *work)
 }
 
 /* Called when last of bios associated with a bl_write_pagelist call finishes */
-static void bl_end_par_io_write(void *data)
+static void bl_end_par_io_write(void *data, int num_se)
 {
 	struct nfs_write_data *wdata = data;
 
+	if (unlikely(wdata->pnfs_error)) {
+		bl_free_short_extents(&BLK_LSEG2EXT(wdata->lseg)->bl_inval,
+				      num_se);
+	}
+
 	wdata->task.tk_status = wdata->pnfs_error;
 	wdata->verf.committed = NFS_FILE_SYNC;
 	INIT_WORK(&wdata->task.u.tk_work, bl_write_cleanup);
@@ -552,7 +563,7 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	 */
 	par = alloc_parallel(wdata);
 	if (!par)
-		return PNFS_NOT_ATTEMPTED;
+		goto out_mds;
 	par->pnfs_callback = bl_end_par_io_write;
 	/* At this point, have to be more careful with error handling */
 
@@ -560,12 +571,15 @@ bl_write_pagelist(struct nfs_write_data *wdata, int sync)
 	be = bl_find_get_extent(BLK_LSEG2EXT(wdata->lseg), isect, &cow_read);
 	if (!be || !is_writable(be, isect)) {
 		dprintk("%s no matching extents!\n", __func__);
-		wdata->pnfs_error = -EINVAL;
-		goto out;
+		goto out_mds;
 	}
 
 	/* First page inside INVALID extent */
 	if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+		if (likely(!bl_push_one_short_extent(be->be_inval)))
+			par->bse_count++;
+		else
+			goto out_mds;
 		temp = offset >> PAGE_CACHE_SHIFT;
 		npg_zero = do_div(temp, npg_per_block);
 		isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) &
@@ -603,6 +617,19 @@ fill_invalid_ext:
 					wdata->pnfs_error = ret;
 					goto out;
 				}
+				if (likely(!bl_push_one_short_extent(be->be_inval)))
+					par->bse_count++;
+				else {
+					end_page_writeback(page);
+					page_cache_release(page);
+					wdata->pnfs_error = -ENOMEM;
+					goto out;
+				}
+				/* FIXME: This should be done in bi_end_io */
+				mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
+						     page->index << PAGE_CACHE_SHIFT,
+						     PAGE_CACHE_SIZE);
+
 				bio = bl_add_page_to_bio(bio, npg_zero, WRITE,
 							 isect, page, be,
 							 bl_end_io_write_zero, par);
@@ -611,10 +638,6 @@ fill_invalid_ext:
 					bio = NULL;
 					goto out;
 				}
-				/* FIXME: This should be done in bi_end_io */
-				mark_extents_written(BLK_LSEG2EXT(wdata->lseg),
-						     page->index << PAGE_CACHE_SHIFT,
-						     PAGE_CACHE_SIZE);
 next_page:
 			isect += PAGE_CACHE_SECTORS;
 			extent_length -= PAGE_CACHE_SECTORS;
@@ -638,6 +661,15 @@ next_page:
 				wdata->pnfs_error = -EINVAL;
 				goto out;
 			}
+			if (be->be_state == PNFS_BLOCK_INVALID_DATA) {
+				if (likely(!bl_push_one_short_extent(
+						be->be_inval)))
+					par->bse_count++;
+				else {
+					wdata->pnfs_error = -ENOMEM;
+					goto out;
+				}
+			}
 			extent_length = be->be_length -
 					(isect - be->be_f_offset);
 		}
@@ -685,6 +717,10 @@ out:
 	bl_submit_bio(WRITE, bio);
 	put_parallel(par);
 	return PNFS_ATTEMPTED;
+out_mds:
+	bl_put_extent(be);
+	kfree(par);
+	return PNFS_NOT_ATTEMPTED;
 }
 
 /* FIXME - range ignored */
@@ -711,11 +747,17 @@ static void
 release_inval_marks(struct pnfs_inval_markings *marks)
 {
 	struct pnfs_inval_tracking *pos, *temp;
+	struct pnfs_block_short_extent *se, *stemp;
 
 	list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) {
 		list_del(&pos->it_link);
 		kfree(pos);
 	}
+
+	list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) {
+		list_del(&se->bse_node);
+		kfree(se);
+	}
 	return;
 }
 
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 60728acc7b9..e31a2df28e7 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -70,6 +70,7 @@ struct pnfs_inval_markings {
 	spinlock_t	im_lock;
 	struct my_tree	im_tree;	/* Sectors that need LAYOUTCOMMIT */
 	sector_t	im_block_size;	/* Server blocksize in sectors */
+	struct list_head im_extents;	/* Short extents for INVAL->RW conversion */
 };
 
 struct pnfs_inval_tracking {
@@ -105,6 +106,7 @@ BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize)
 {
 	spin_lock_init(&marks->im_lock);
 	INIT_LIST_HEAD(&marks->im_tree.mtt_stub);
+	INIT_LIST_HEAD(&marks->im_extents);
 	marks->im_block_size = blocksize;
 	marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS,
 					   blocksize);
@@ -199,6 +201,11 @@ void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 int bl_add_merge_extent(struct pnfs_block_layout *bl,
 			 struct pnfs_block_extent *new);
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-			sector_t offset, sector_t length);
+			sector_t offset, sector_t length,
+			struct pnfs_block_short_extent *new);
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks);
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks);
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free);
 
 #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */
diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c
index d0f52ed2242..1abac09f7cd 100644
--- a/fs/nfs/blocklayout/extents.c
+++ b/fs/nfs/blocklayout/extents.c
@@ -157,10 +157,10 @@ static int _preload_range(struct pnfs_inval_markings *marks,
 		goto out_cleanup;
 	}
 
-	spin_lock(&marks->im_lock);
+	spin_lock_bh(&marks->im_lock);
 	for (s = start; s < end; s += tree->mtt_step_size)
 		used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]);
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 
 	status = 0;
 
@@ -179,9 +179,9 @@ int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect)
 {
 	int rv;
 
-	spin_lock(&marks->im_lock);
+	spin_lock_bh(&marks->im_lock);
 	rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED);
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 	return rv;
 }
 
@@ -221,9 +221,9 @@ static int is_range_written(struct pnfs_inval_markings *marks,
 {
 	int rv;
 
-	spin_lock(&marks->im_lock);
+	spin_lock_bh(&marks->im_lock);
 	rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN);
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 	return rv;
 }
 
@@ -244,15 +244,15 @@ int bl_mark_sectors_init(struct pnfs_inval_markings *marks,
 	if (_preload_range(marks, start, end - start))
 		goto outerr;
 
-	spin_lock(&marks->im_lock);
+	spin_lock_bh(&marks->im_lock);
 	if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length))
 		goto out_unlock;
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 
 	return 0;
 
 out_unlock:
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 outerr:
 	return -ENOMEM;
 }
@@ -267,9 +267,9 @@ static int mark_written_sectors(struct pnfs_inval_markings *marks,
 
 	dprintk("%s(offset=%llu,len=%llu) enter\n", __func__,
 		(u64)offset, (u64)length);
-	spin_lock(&marks->im_lock);
+	spin_lock_bh(&marks->im_lock);
 	status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length);
-	spin_unlock(&marks->im_lock);
+	spin_unlock_bh(&marks->im_lock);
 	return status;
 }
 
@@ -369,20 +369,18 @@ static void add_to_commitlist(struct pnfs_block_layout *bl,
 
 /* Note the range described by offset, length is guaranteed to be contained
  * within be.
+ * new will be freed, either by this function or add_to_commitlist if they
+ * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist.
  */
 int bl_mark_for_commit(struct pnfs_block_extent *be,
-		       sector_t offset, sector_t length)
+		       sector_t offset, sector_t length,
+		       struct pnfs_block_short_extent *new)
 {
 	sector_t new_end, end = offset + length;
-	struct pnfs_block_short_extent *new;
 	struct pnfs_block_layout *bl = container_of(be->be_inval,
 						    struct pnfs_block_layout,
 						    bl_inval);
 
-	new = kmalloc(sizeof(*new), GFP_NOFS);
-	if (!new)
-		return -ENOMEM;
-
 	mark_written_sectors(be->be_inval, offset, length);
 	/* We want to add the range to commit list, but it must be
 	 * block-normalized, and verified that the normalized range has
@@ -412,9 +410,6 @@ int bl_mark_for_commit(struct pnfs_block_extent *be,
 	new->bse_mdev = be->be_mdev;
 
 	spin_lock(&bl->bl_ext_lock);
-	/* new will be freed, either by add_to_commitlist if it decides not
-	 * to use it, or after LAYOUTCOMMIT uses it in the commitlist.
-	 */
 	add_to_commitlist(bl, new);
 	spin_unlock(&bl->bl_ext_lock);
 	return 0;
@@ -862,3 +857,53 @@ clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl,
 		}
 	}
 }
+
+int bl_push_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_block_short_extent *new;
+
+	new = kmalloc(sizeof(*new), GFP_NOFS);
+	if (unlikely(!new))
+		return -ENOMEM;
+
+	spin_lock_bh(&marks->im_lock);
+	list_add(&new->bse_node, &marks->im_extents);
+	spin_unlock_bh(&marks->im_lock);
+
+	return 0;
+}
+
+struct pnfs_block_short_extent *
+bl_pop_one_short_extent(struct pnfs_inval_markings *marks)
+{
+	struct pnfs_block_short_extent *rv = NULL;
+
+	spin_lock_bh(&marks->im_lock);
+	if (!list_empty(&marks->im_extents)) {
+		rv = list_entry((&marks->im_extents)->next,
+				struct pnfs_block_short_extent, bse_node);
+		list_del_init(&rv->bse_node);
+	}
+	spin_unlock_bh(&marks->im_lock);
+
+	return rv;
+}
+
+void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free)
+{
+	struct pnfs_block_short_extent *se = NULL, *tmp;
+
+	if (num_to_free <= 0)
+		return;
+
+	spin_lock(&marks->im_lock);
+	list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) {
+		list_del(&se->bse_node);
+		kfree(se);
+		if (--num_to_free == 0)
+			break;
+	}
+	spin_unlock(&marks->im_lock);
+
+	BUG_ON(num_to_free > 0);
+}