aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBob Peterson <rpeterso@redhat.com>2012-07-19 08:12:40 -0400
committerSteven Whitehouse <swhiteho@redhat.com>2012-07-19 09:51:08 -0400
commit8e2e00473598dd5379d8408cb974dade000acafc (patch)
tree1f7bfdf0d07b6c0315bbd11ffee174742d66a459
parent294f2ad5a545eb71d397623743ddd8201131bdad (diff)
GFS2: Reduce file fragmentation
This patch reduces GFS2 file fragmentation by pre-reserving blocks. The resulting improved on disk layout greatly speeds up operations in cases which would have resulted in interlaced allocation of blocks previously. A typical example of this is 10 parallel dd processes, each writing to a file in a common directory. The implementation uses an rbtree of reservations attached to each resource group (and each inode). Signed-off-by: Bob Peterson <rpeterso@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
-rw-r--r--fs/gfs2/bmap.c3
-rw-r--r--fs/gfs2/file.c24
-rw-r--r--fs/gfs2/incore.h49
-rw-r--r--fs/gfs2/inode.c37
-rw-r--r--fs/gfs2/rgrp.c578
-rw-r--r--fs/gfs2/rgrp.h31
-rw-r--r--fs/gfs2/super.c7
-rw-r--r--fs/gfs2/trace_gfs2.h59
-rw-r--r--fs/gfs2/xattr.c12
9 files changed, 708 insertions, 92 deletions
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 6d957a86482b..49cd7dd4a9fa 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -785,6 +785,9 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
785 if (error) 785 if (error)
786 goto out_rlist; 786 goto out_rlist;
787 787
788 if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
789 gfs2_rs_deltree(ip->i_res);
790
788 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + 791 error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
789 RES_INDIRECT + RES_STATFS + RES_QUOTA, 792 RES_INDIRECT + RES_STATFS + RES_QUOTA,
790 revokes); 793 revokes);
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6fbf3cbd974d..9f94832cefec 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -383,6 +383,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
383 if (ret) 383 if (ret)
384 return ret; 384 return ret;
385 385
386 atomic_set(&ip->i_res->rs_sizehint,
387 PAGE_CACHE_SIZE / sdp->sd_sb.sb_bsize);
388
386 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); 389 gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
387 ret = gfs2_glock_nq(&gh); 390 ret = gfs2_glock_nq(&gh);
388 if (ret) 391 if (ret)
@@ -571,22 +574,15 @@ fail:
571 574
572static int gfs2_release(struct inode *inode, struct file *file) 575static int gfs2_release(struct inode *inode, struct file *file)
573{ 576{
574 struct gfs2_sbd *sdp = inode->i_sb->s_fs_info;
575 struct gfs2_file *fp;
576 struct gfs2_inode *ip = GFS2_I(inode); 577 struct gfs2_inode *ip = GFS2_I(inode);
577 578
578 fp = file->private_data; 579 kfree(file->private_data);
579 file->private_data = NULL; 580 file->private_data = NULL;
580 581
581 if ((file->f_mode & FMODE_WRITE) && ip->i_res && 582 if ((file->f_mode & FMODE_WRITE) &&
582 (atomic_read(&inode->i_writecount) == 1)) 583 (atomic_read(&inode->i_writecount) == 1))
583 gfs2_rs_delete(ip); 584 gfs2_rs_delete(ip);
584 585
585 if (gfs2_assert_warn(sdp, fp))
586 return -EIO;
587
588 kfree(fp);
589
590 return 0; 586 return 0;
591} 587}
592 588
@@ -662,14 +658,18 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
662 unsigned long nr_segs, loff_t pos) 658 unsigned long nr_segs, loff_t pos)
663{ 659{
664 struct file *file = iocb->ki_filp; 660 struct file *file = iocb->ki_filp;
661 size_t writesize = iov_length(iov, nr_segs);
665 struct dentry *dentry = file->f_dentry; 662 struct dentry *dentry = file->f_dentry;
666 struct gfs2_inode *ip = GFS2_I(dentry->d_inode); 663 struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
664 struct gfs2_sbd *sdp;
667 int ret; 665 int ret;
668 666
667 sdp = GFS2_SB(file->f_mapping->host);
669 ret = gfs2_rs_alloc(ip); 668 ret = gfs2_rs_alloc(ip);
670 if (ret) 669 if (ret)
671 return ret; 670 return ret;
672 671
672 atomic_set(&ip->i_res->rs_sizehint, writesize / sdp->sd_sb.sb_bsize);
673 if (file->f_flags & O_APPEND) { 673 if (file->f_flags & O_APPEND) {
674 struct gfs2_holder gh; 674 struct gfs2_holder gh;
675 675
@@ -795,6 +795,8 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
795 if (unlikely(error)) 795 if (unlikely(error))
796 goto out_uninit; 796 goto out_uninit;
797 797
798 atomic_set(&ip->i_res->rs_sizehint, len / sdp->sd_sb.sb_bsize);
799
798 while (len > 0) { 800 while (len > 0) {
799 if (len < bytes) 801 if (len < bytes)
800 bytes = len; 802 bytes = len;
@@ -803,10 +805,6 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset,
803 offset += bytes; 805 offset += bytes;
804 continue; 806 continue;
805 } 807 }
806 error = gfs2_rindex_update(sdp);
807 if (error)
808 goto out_unlock;
809
810 error = gfs2_quota_lock_check(ip); 808 error = gfs2_quota_lock_check(ip);
811 if (error) 809 if (error)
812 goto out_unlock; 810 goto out_unlock;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index dc730700b3b4..aaecc8085fc5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -84,6 +84,7 @@ struct gfs2_rgrpd {
84 u32 rd_data; /* num of data blocks in rgrp */ 84 u32 rd_data; /* num of data blocks in rgrp */
85 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 85 u32 rd_bitbytes; /* number of bytes in data bitmaps */
86 u32 rd_free; 86 u32 rd_free;
87 u32 rd_reserved; /* number of blocks reserved */
87 u32 rd_free_clone; 88 u32 rd_free_clone;
88 u32 rd_dinodes; 89 u32 rd_dinodes;
89 u64 rd_igeneration; 90 u64 rd_igeneration;
@@ -96,6 +97,9 @@ struct gfs2_rgrpd {
96#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ 97#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */
97#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ 98#define GFS2_RDF_ERROR 0x40000000 /* error in rg */
98#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ 99#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */
100 spinlock_t rd_rsspin; /* protects reservation related vars */
101 struct rb_root rd_rstree; /* multi-block reservation tree */
102 u32 rd_rs_cnt; /* count of current reservations */
99}; 103};
100 104
101enum gfs2_state_bits { 105enum gfs2_state_bits {
@@ -233,6 +237,38 @@ struct gfs2_holder {
233 unsigned long gh_ip; 237 unsigned long gh_ip;
234}; 238};
235 239
240/* Resource group multi-block reservation, in order of appearance:
241
242 Step 1. Function prepares to write, allocates a mb, sets the size hint.
243 Step 2. User calls inplace_reserve to target an rgrp, sets the rgrp info
244 Step 3. Function get_local_rgrp locks the rgrp, determines which bits to use
245 Step 4. Bits are assigned from the rgrp based on either the reservation
246 or wherever it can.
247*/
248
249struct gfs2_blkreserv {
250 /* components used during write (step 1): */
251 atomic_t rs_sizehint; /* hint of the write size */
252
253 /* components used during inplace_reserve (step 2): */
254 u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
255
256 /* components used during get_local_rgrp (step 3): */
257 struct gfs2_rgrpd *rs_rgd; /* pointer to the gfs2_rgrpd */
258 struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
259 struct rb_node rs_node; /* link to other block reservations */
260
261 /* components used during block searches and assignments (step 4): */
262 struct gfs2_bitmap *rs_bi; /* bitmap for the current allocation */
263 u32 rs_biblk; /* start block relative to the bi */
264 u32 rs_free; /* how many blocks are still free */
265
266 /* ancillary quota stuff */
267 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
268 struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
269 unsigned int rs_qa_qd_num;
270};
271
236enum { 272enum {
237 GLF_LOCK = 1, 273 GLF_LOCK = 1,
238 GLF_DEMOTE = 3, 274 GLF_DEMOTE = 3,
@@ -290,16 +326,6 @@ struct gfs2_glock {
290 326
291#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ 327#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
292 328
293struct gfs2_blkreserv {
294 u32 rs_requested; /* Filled in by caller of gfs2_inplace_reserve() */
295 struct gfs2_holder rs_rgd_gh; /* Filled in by gfs2_inplace_reserve() */
296
297 /* ancillary quota stuff */
298 struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
299 struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
300 unsigned int rs_qa_qd_num;
301};
302
303enum { 329enum {
304 GIF_INVALID = 0, 330 GIF_INVALID = 0,
305 GIF_QD_LOCKED = 1, 331 GIF_QD_LOCKED = 1,
@@ -307,7 +333,6 @@ enum {
307 GIF_SW_PAGED = 3, 333 GIF_SW_PAGED = 3,
308}; 334};
309 335
310
311struct gfs2_inode { 336struct gfs2_inode {
312 struct inode i_inode; 337 struct inode i_inode;
313 u64 i_no_addr; 338 u64 i_no_addr;
@@ -318,7 +343,7 @@ struct gfs2_inode {
318 struct gfs2_glock *i_gl; /* Move into i_gh? */ 343 struct gfs2_glock *i_gl; /* Move into i_gh? */
319 struct gfs2_holder i_iopen_gh; 344 struct gfs2_holder i_iopen_gh;
320 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 345 struct gfs2_holder i_gh; /* for prepare/commit_write only */
321 struct gfs2_blkreserv *i_res; /* resource group block reservation */ 346 struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
322 struct gfs2_rgrpd *i_rgd; 347 struct gfs2_rgrpd *i_rgd;
323 u64 i_goal; /* goal block for allocations */ 348 u64 i_goal; /* goal block for allocations */
324 struct rw_semaphore i_rw_mutex; 349 struct rw_semaphore i_rw_mutex;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 2b035e0959b2..c53c67e30bd4 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -521,6 +521,9 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
521 int error; 521 int error;
522 522
523 munge_mode_uid_gid(dip, &mode, &uid, &gid); 523 munge_mode_uid_gid(dip, &mode, &uid, &gid);
524 error = gfs2_rindex_update(sdp);
525 if (error)
526 return error;
524 527
525 error = gfs2_quota_lock(dip, uid, gid); 528 error = gfs2_quota_lock(dip, uid, gid);
526 if (error) 529 if (error)
@@ -551,6 +554,10 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
551 struct buffer_head *dibh; 554 struct buffer_head *dibh;
552 int error; 555 int error;
553 556
557 error = gfs2_rindex_update(sdp);
558 if (error)
559 return error;
560
554 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 561 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
555 if (error) 562 if (error)
556 goto fail; 563 goto fail;
@@ -596,7 +603,8 @@ fail_end_trans:
596 gfs2_trans_end(sdp); 603 gfs2_trans_end(sdp);
597 604
598fail_ipreserv: 605fail_ipreserv:
599 gfs2_inplace_release(dip); 606 if (alloc_required)
607 gfs2_inplace_release(dip);
600 608
601fail_quota_locks: 609fail_quota_locks:
602 gfs2_quota_unlock(dip); 610 gfs2_quota_unlock(dip);
@@ -647,7 +655,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
647 const struct qstr *name = &dentry->d_name; 655 const struct qstr *name = &dentry->d_name;
648 struct gfs2_holder ghs[2]; 656 struct gfs2_holder ghs[2];
649 struct inode *inode = NULL; 657 struct inode *inode = NULL;
650 struct gfs2_inode *dip = GFS2_I(dir); 658 struct gfs2_inode *dip = GFS2_I(dir), *ip;
651 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); 659 struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
652 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; 660 struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 };
653 int error; 661 int error;
@@ -657,6 +665,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
657 if (!name->len || name->len > GFS2_FNAMESIZE) 665 if (!name->len || name->len > GFS2_FNAMESIZE)
658 return -ENAMETOOLONG; 666 return -ENAMETOOLONG;
659 667
668 /* We need a reservation to allocate the new dinode block. The
669 directory ip temporarily points to the reservation, but this is
670 being done to get a set of contiguous blocks for the new dinode.
671 Since this is a create, we don't have a sizehint yet, so it will
672 have to use the minimum reservation size. */
660 error = gfs2_rs_alloc(dip); 673 error = gfs2_rs_alloc(dip);
661 if (error) 674 if (error)
662 return error; 675 return error;
@@ -694,24 +707,29 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
694 if (IS_ERR(inode)) 707 if (IS_ERR(inode))
695 goto fail_gunlock2; 708 goto fail_gunlock2;
696 709
697 error = gfs2_inode_refresh(GFS2_I(inode)); 710 ip = GFS2_I(inode);
711 error = gfs2_inode_refresh(ip);
698 if (error) 712 if (error)
699 goto fail_gunlock2; 713 goto fail_gunlock2;
700 714
701 /* the new inode needs a reservation so it can allocate xattrs. */ 715 /* The newly created inode needs a reservation so it can allocate
702 error = gfs2_rs_alloc(GFS2_I(inode)); 716 xattrs. At the same time, we want new blocks allocated to the new
703 if (error) 717 dinode to be as contiguous as possible. Since we allocated the
704 goto fail_gunlock2; 718 dinode block under the directory's reservation, we transfer
719 ownership of that reservation to the new inode. The directory
720 doesn't need a reservation unless it needs a new allocation. */
721 ip->i_res = dip->i_res;
722 dip->i_res = NULL;
705 723
706 error = gfs2_acl_create(dip, inode); 724 error = gfs2_acl_create(dip, inode);
707 if (error) 725 if (error)
708 goto fail_gunlock2; 726 goto fail_gunlock2;
709 727
710 error = gfs2_security_init(dip, GFS2_I(inode), name); 728 error = gfs2_security_init(dip, ip, name);
711 if (error) 729 if (error)
712 goto fail_gunlock2; 730 goto fail_gunlock2;
713 731
714 error = link_dinode(dip, name, GFS2_I(inode)); 732 error = link_dinode(dip, name, ip);
715 if (error) 733 if (error)
716 goto fail_gunlock2; 734 goto fail_gunlock2;
717 735
@@ -738,6 +756,7 @@ fail_gunlock:
738 iput(inode); 756 iput(inode);
739 } 757 }
740fail: 758fail:
759 gfs2_rs_delete(dip);
741 if (bh) 760 if (bh)
742 brelse(bh); 761 brelse(bh);
743 return error; 762 return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index fb7079263ea7..4d34887a601d 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -35,6 +35,9 @@
35#define BFITNOENT ((u32)~0) 35#define BFITNOENT ((u32)~0)
36#define NO_BLOCK ((u64)~0) 36#define NO_BLOCK ((u64)~0)
37 37
38#define RSRV_CONTENTION_FACTOR 4
39#define RGRP_RSRV_MAX_CONTENDERS 2
40
38#if BITS_PER_LONG == 32 41#if BITS_PER_LONG == 32
39#define LBITMASK (0x55555555UL) 42#define LBITMASK (0x55555555UL)
40#define LBITSKIP55 (0x55555555UL) 43#define LBITSKIP55 (0x55555555UL)
@@ -178,6 +181,57 @@ static inline u64 gfs2_bit_search(const __le64 *ptr, u64 mask, u8 state)
178} 181}
179 182
180/** 183/**
184 * rs_cmp - multi-block reservation range compare
185 * @blk: absolute file system block number of the new reservation
186 * @len: number of blocks in the new reservation
187 * @rs: existing reservation to compare against
188 *
189 * returns: 1 if the block range is beyond the reach of the reservation
190 * -1 if the block range is before the start of the reservation
191 * 0 if the block range overlaps with the reservation
192 */
193static inline int rs_cmp(u64 blk, u32 len, struct gfs2_blkreserv *rs)
194{
195 u64 startblk = gfs2_rs_startblk(rs);
196
197 if (blk >= startblk + rs->rs_free)
198 return 1;
199 if (blk + len - 1 < startblk)
200 return -1;
201 return 0;
202}
203
204/**
205 * rs_find - Find a rgrp multi-block reservation that contains a given block
206 * @rgd: The rgrp
207 * @rgblk: The block we're looking for, relative to the rgrp
208 */
209static struct gfs2_blkreserv *rs_find(struct gfs2_rgrpd *rgd, u32 rgblk)
210{
211 struct rb_node **newn;
212 int rc;
213 u64 fsblk = rgblk + rgd->rd_data0;
214
215 spin_lock(&rgd->rd_rsspin);
216 newn = &rgd->rd_rstree.rb_node;
217 while (*newn) {
218 struct gfs2_blkreserv *cur =
219 rb_entry(*newn, struct gfs2_blkreserv, rs_node);
220 rc = rs_cmp(fsblk, 1, cur);
221 if (rc < 0)
222 newn = &((*newn)->rb_left);
223 else if (rc > 0)
224 newn = &((*newn)->rb_right);
225 else {
226 spin_unlock(&rgd->rd_rsspin);
227 return cur;
228 }
229 }
230 spin_unlock(&rgd->rd_rsspin);
231 return NULL;
232}
233
234/**
181 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing 235 * gfs2_bitfit - Search an rgrp's bitmap buffer to find a bit-pair representing
182 * a block in a given allocation state. 236 * a block in a given allocation state.
183 * @buf: the buffer that holds the bitmaps 237 * @buf: the buffer that holds the bitmaps
@@ -424,19 +478,93 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
424int gfs2_rs_alloc(struct gfs2_inode *ip) 478int gfs2_rs_alloc(struct gfs2_inode *ip)
425{ 479{
426 int error = 0; 480 int error = 0;
481 struct gfs2_blkreserv *res;
482
483 if (ip->i_res)
484 return 0;
485
486 res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
487 if (!res)
488 error = -ENOMEM;
427 489
428 down_write(&ip->i_rw_mutex); 490 down_write(&ip->i_rw_mutex);
429 if (!ip->i_res) { 491 if (ip->i_res)
430 ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); 492 kmem_cache_free(gfs2_rsrv_cachep, res);
431 if (!ip->i_res) 493 else
432 error = -ENOMEM; 494 ip->i_res = res;
433 }
434 up_write(&ip->i_rw_mutex); 495 up_write(&ip->i_rw_mutex);
435 return error; 496 return error;
436} 497}
437 498
499static void dump_rs(struct seq_file *seq, struct gfs2_blkreserv *rs)
500{
501 gfs2_print_dbg(seq, " r: %llu s:%llu b:%u f:%u\n",
502 rs->rs_rgd->rd_addr, gfs2_rs_startblk(rs), rs->rs_biblk,
503 rs->rs_free);
504}
505
438/** 506/**
439 * gfs2_rs_delete - delete a reservation 507 * __rs_deltree - remove a multi-block reservation from the rgd tree
508 * @rs: The reservation to remove
509 *
510 */
511static void __rs_deltree(struct gfs2_blkreserv *rs)
512{
513 struct gfs2_rgrpd *rgd;
514
515 if (!gfs2_rs_active(rs))
516 return;
517
518 rgd = rs->rs_rgd;
519 /* We can't do this: The reason is that when the rgrp is invalidated,
520 it's in the "middle" of acquiring the glock, but the HOLDER bit
521 isn't set yet:
522 BUG_ON(!gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl));*/
523 trace_gfs2_rs(NULL, rs, TRACE_RS_TREEDEL);
524
525 if (!RB_EMPTY_ROOT(&rgd->rd_rstree))
526 rb_erase(&rs->rs_node, &rgd->rd_rstree);
527 BUG_ON(!rgd->rd_rs_cnt);
528 rgd->rd_rs_cnt--;
529
530 if (rs->rs_free) {
531 /* return reserved blocks to the rgrp and the ip */
532 BUG_ON(rs->rs_rgd->rd_reserved < rs->rs_free);
533 rs->rs_rgd->rd_reserved -= rs->rs_free;
534 rs->rs_free = 0;
535 clear_bit(GBF_FULL, &rs->rs_bi->bi_flags);
536 smp_mb__after_clear_bit();
537 }
538 /* We can't change any of the step 1 or step 2 components of the rs.
539 E.g. We can't set rs_rgd to NULL because the rgd glock is held and
540 dequeued through this pointer.
541 Can't: atomic_set(&rs->rs_sizehint, 0);
542 Can't: rs->rs_requested = 0;
543 Can't: rs->rs_rgd = NULL;*/
544 rs->rs_bi = NULL;
545 rs->rs_biblk = 0;
546}
547
548/**
549 * gfs2_rs_deltree - remove a multi-block reservation from the rgd tree
550 * @rs: The reservation to remove
551 *
552 */
553void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
554{
555 struct gfs2_rgrpd *rgd;
556
557 if (!gfs2_rs_active(rs))
558 return;
559
560 rgd = rs->rs_rgd;
561 spin_lock(&rgd->rd_rsspin);
562 __rs_deltree(rs);
563 spin_unlock(&rgd->rd_rsspin);
564}
565
566/**
567 * gfs2_rs_delete - delete a multi-block reservation
440 * @ip: The inode for this reservation 568 * @ip: The inode for this reservation
441 * 569 *
442 */ 570 */
@@ -444,12 +572,36 @@ void gfs2_rs_delete(struct gfs2_inode *ip)
444{ 572{
445 down_write(&ip->i_rw_mutex); 573 down_write(&ip->i_rw_mutex);
446 if (ip->i_res) { 574 if (ip->i_res) {
575 gfs2_rs_deltree(ip->i_res);
576 trace_gfs2_rs(ip, ip->i_res, TRACE_RS_DELETE);
577 BUG_ON(ip->i_res->rs_free);
447 kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); 578 kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
448 ip->i_res = NULL; 579 ip->i_res = NULL;
449 } 580 }
450 up_write(&ip->i_rw_mutex); 581 up_write(&ip->i_rw_mutex);
451} 582}
452 583
584/**
585 * return_all_reservations - return all reserved blocks back to the rgrp.
586 * @rgd: the rgrp that needs its space back
587 *
588 * We previously reserved a bunch of blocks for allocation. Now we need to
589 * give them back. This leave the reservation structures in tact, but removes
590 * all of their corresponding "no-fly zones".
591 */
592static void return_all_reservations(struct gfs2_rgrpd *rgd)
593{
594 struct rb_node *n;
595 struct gfs2_blkreserv *rs;
596
597 spin_lock(&rgd->rd_rsspin);
598 while ((n = rb_first(&rgd->rd_rstree))) {
599 rs = rb_entry(n, struct gfs2_blkreserv, rs_node);
600 __rs_deltree(rs);
601 }
602 spin_unlock(&rgd->rd_rsspin);
603}
604
453void gfs2_clear_rgrpd(struct gfs2_sbd *sdp) 605void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
454{ 606{
455 struct rb_node *n; 607 struct rb_node *n;
@@ -472,6 +624,7 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
472 624
473 gfs2_free_clones(rgd); 625 gfs2_free_clones(rgd);
474 kfree(rgd->rd_bits); 626 kfree(rgd->rd_bits);
627 return_all_reservations(rgd);
475 kmem_cache_free(gfs2_rgrpd_cachep, rgd); 628 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
476 } 629 }
477} 630}
@@ -649,6 +802,7 @@ static int read_rindex_entry(struct gfs2_inode *ip)
649 rgd->rd_data0 = be64_to_cpu(buf.ri_data0); 802 rgd->rd_data0 = be64_to_cpu(buf.ri_data0);
650 rgd->rd_data = be32_to_cpu(buf.ri_data); 803 rgd->rd_data = be32_to_cpu(buf.ri_data);
651 rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes); 804 rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
805 spin_lock_init(&rgd->rd_rsspin);
652 806
653 error = compute_bitstructs(rgd); 807 error = compute_bitstructs(rgd);
654 if (error) 808 if (error)
@@ -1115,29 +1269,212 @@ out:
1115} 1269}
1116 1270
1117/** 1271/**
1272 * rs_insert - insert a new multi-block reservation into the rgrp's rb_tree
1273 * @bi: the bitmap with the blocks
1274 * @ip: the inode structure
1275 * @biblk: the 32-bit block number relative to the start of the bitmap
1276 * @amount: the number of blocks to reserve
1277 *
1278 * Returns: NULL - reservation was already taken, so not inserted
1279 * pointer to the inserted reservation
1280 */
1281static struct gfs2_blkreserv *rs_insert(struct gfs2_bitmap *bi,
1282 struct gfs2_inode *ip, u32 biblk,
1283 int amount)
1284{
1285 struct rb_node **newn, *parent = NULL;
1286 int rc;
1287 struct gfs2_blkreserv *rs = ip->i_res;
1288 struct gfs2_rgrpd *rgd = rs->rs_rgd;
1289 u64 fsblock = gfs2_bi2rgd_blk(bi, biblk) + rgd->rd_data0;
1290
1291 spin_lock(&rgd->rd_rsspin);
1292 newn = &rgd->rd_rstree.rb_node;
1293 BUG_ON(!ip->i_res);
1294 BUG_ON(gfs2_rs_active(rs));
1295 /* Figure out where to put new node */
1296 /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
1297 while (*newn) {
1298 struct gfs2_blkreserv *cur =
1299 rb_entry(*newn, struct gfs2_blkreserv, rs_node);
1300
1301 parent = *newn;
1302 rc = rs_cmp(fsblock, amount, cur);
1303 if (rc > 0)
1304 newn = &((*newn)->rb_right);
1305 else if (rc < 0)
1306 newn = &((*newn)->rb_left);
1307 else {
1308 spin_unlock(&rgd->rd_rsspin);
1309 return NULL; /* reservation already in use */
1310 }
1311 }
1312
1313 /* Do our reservation work */
1314 rs = ip->i_res;
1315 rs->rs_free = amount;
1316 rs->rs_biblk = biblk;
1317 rs->rs_bi = bi;
1318 rb_link_node(&rs->rs_node, parent, newn);
1319 rb_insert_color(&rs->rs_node, &rgd->rd_rstree);
1320
1321 /* Do our inode accounting for the reservation */
1322 /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
1323
1324 /* Do our rgrp accounting for the reservation */
1325 rgd->rd_reserved += amount; /* blocks reserved */
1326 rgd->rd_rs_cnt++; /* number of in-tree reservations */
1327 spin_unlock(&rgd->rd_rsspin);
1328 trace_gfs2_rs(ip, rs, TRACE_RS_INSERT);
1329 return rs;
1330}
1331
1332/**
1333 * unclaimed_blocks - return number of blocks that aren't spoken for
1334 */
1335static u32 unclaimed_blocks(struct gfs2_rgrpd *rgd)
1336{
1337 return rgd->rd_free_clone - rgd->rd_reserved;
1338}
1339
1340/**
1341 * rg_mblk_search - find a group of multiple free blocks
1342 * @rgd: the resource group descriptor
1343 * @rs: the block reservation
1344 * @ip: pointer to the inode for which we're reserving blocks
1345 *
1346 * This is very similar to rgblk_search, except we're looking for whole
1347 * 64-bit words that represent a chunk of 32 free blocks. I'm only focusing
1348 * on aligned dwords for speed's sake.
1349 *
1350 * Returns: 0 if successful or BFITNOENT if there isn't enough free space
1351 */
1352
1353static int rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1354{
1355 struct gfs2_bitmap *bi = rgd->rd_bits;
1356 const u32 length = rgd->rd_length;
1357 u32 blk;
1358 unsigned int buf, x, search_bytes;
1359 u8 *buffer = NULL;
1360 u8 *ptr, *end, *nonzero;
1361 u32 goal, rsv_bytes;
1362 struct gfs2_blkreserv *rs;
1363 u32 best_rs_bytes, unclaimed;
1364 int best_rs_blocks;
1365
1366 /* Find bitmap block that contains bits for goal block */
1367 if (rgrp_contains_block(rgd, ip->i_goal))
1368 goal = ip->i_goal - rgd->rd_data0;
1369 else
1370 goal = rgd->rd_last_alloc;
1371 for (buf = 0; buf < length; buf++) {
1372 bi = rgd->rd_bits + buf;
1373 /* Convert scope of "goal" from rgrp-wide to within
1374 found bit block */
1375 if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) {
1376 goal -= bi->bi_start * GFS2_NBBY;
1377 goto do_search;
1378 }
1379 }
1380 buf = 0;
1381 goal = 0;
1382
1383do_search:
1384 best_rs_blocks = max_t(int, atomic_read(&ip->i_res->rs_sizehint),
1385 (RGRP_RSRV_MINBLKS * rgd->rd_length));
1386 best_rs_bytes = (best_rs_blocks *
1387 (1 + (RSRV_CONTENTION_FACTOR * rgd->rd_rs_cnt))) /
1388 GFS2_NBBY; /* 1 + is for our not-yet-created reservation */
1389 best_rs_bytes = ALIGN(best_rs_bytes, sizeof(u64));
1390 unclaimed = unclaimed_blocks(rgd);
1391 if (best_rs_bytes * GFS2_NBBY > unclaimed)
1392 best_rs_bytes = unclaimed >> GFS2_BIT_SIZE;
1393
1394 for (x = 0; x <= length; x++) {
1395 bi = rgd->rd_bits + buf;
1396
1397 if (test_bit(GBF_FULL, &bi->bi_flags))
1398 goto skip;
1399
1400 WARN_ON(!buffer_uptodate(bi->bi_bh));
1401 if (bi->bi_clone)
1402 buffer = bi->bi_clone + bi->bi_offset;
1403 else
1404 buffer = bi->bi_bh->b_data + bi->bi_offset;
1405
1406 /* We have to keep the reservations aligned on u64 boundaries
1407 otherwise we could get situations where a byte can't be
1408 used because it's after a reservation, but a free bit still
1409 is within the reservation's area. */
1410 ptr = buffer + ALIGN(goal >> GFS2_BIT_SIZE, sizeof(u64));
1411 end = (buffer + bi->bi_len);
1412 while (ptr < end) {
1413 rsv_bytes = 0;
1414 if ((ptr + best_rs_bytes) <= end)
1415 search_bytes = best_rs_bytes;
1416 else
1417 search_bytes = end - ptr;
1418 BUG_ON(!search_bytes);
1419 nonzero = memchr_inv(ptr, 0, search_bytes);
1420 /* If the lot is all zeroes, reserve the whole size. If
1421 there's enough zeroes to satisfy the request, use
1422 what we can. If there's not enough, keep looking. */
1423 if (nonzero == NULL)
1424 rsv_bytes = search_bytes;
1425 else if ((nonzero - ptr) * GFS2_NBBY >=
1426 ip->i_res->rs_requested)
1427 rsv_bytes = (nonzero - ptr);
1428
1429 if (rsv_bytes) {
1430 blk = ((ptr - buffer) * GFS2_NBBY);
1431 BUG_ON(blk >= bi->bi_len * GFS2_NBBY);
1432 rs = rs_insert(bi, ip, blk,
1433 rsv_bytes * GFS2_NBBY);
1434 if (IS_ERR(rs))
1435 return PTR_ERR(rs);
1436 if (rs)
1437 return 0;
1438 }
1439 ptr += ALIGN(search_bytes, sizeof(u64));
1440 }
1441skip:
1442 /* Try next bitmap block (wrap back to rgrp header
1443 if at end) */
1444 buf++;
1445 buf %= length;
1446 goal = 0;
1447 }
1448
1449 return BFITNOENT;
1450}
1451
1452/**
1118 * try_rgrp_fit - See if a given reservation will fit in a given RG 1453 * try_rgrp_fit - See if a given reservation will fit in a given RG
1119 * @rgd: the RG data 1454 * @rgd: the RG data
1120 * @ip: the inode 1455 * @ip: the inode
1121 * 1456 *
1122 * If there's room for the requested blocks to be allocated from the RG: 1457 * If there's room for the requested blocks to be allocated from the RG:
1458 * This will try to get a multi-block reservation first, and if that doesn't
1459 * fit, it will take what it can.
1123 * 1460 *
1124 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit) 1461 * Returns: 1 on success (it fits), 0 on failure (it doesn't fit)
1125 */ 1462 */
1126 1463
1127static int try_rgrp_fit(const struct gfs2_rgrpd *rgd, const struct gfs2_inode *ip) 1464static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
1128{ 1465{
1129 const struct gfs2_blkreserv *rs = ip->i_res; 1466 struct gfs2_blkreserv *rs = ip->i_res;
1130 1467
1131 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) 1468 if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR))
1132 return 0; 1469 return 0;
1133 if (rgd->rd_free_clone >= rs->rs_requested) 1470 /* Look for a multi-block reservation. */
1471 if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS &&
1472 rg_mblk_search(rgd, ip) != BFITNOENT)
1473 return 1;
1474 if (unclaimed_blocks(rgd) >= rs->rs_requested)
1134 return 1; 1475 return 1;
1135 return 0;
1136}
1137 1476
1138static inline u32 gfs2_bi2rgd_blk(struct gfs2_bitmap *bi, u32 blk) 1477 return 0;
1139{
1140 return (bi->bi_start * GFS2_NBBY) + blk;
1141} 1478}
1142 1479
1143/** 1480/**
@@ -1217,7 +1554,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
1217int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) 1554int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1218{ 1555{
1219 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1556 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1220 struct gfs2_rgrpd *rgd, *begin = NULL; 1557 struct gfs2_rgrpd *begin = NULL;
1221 struct gfs2_blkreserv *rs = ip->i_res; 1558 struct gfs2_blkreserv *rs = ip->i_res;
1222 int error = 0, rg_locked, flags = LM_FLAG_TRY; 1559 int error = 0, rg_locked, flags = LM_FLAG_TRY;
1223 u64 last_unlinked = NO_BLOCK; 1560 u64 last_unlinked = NO_BLOCK;
@@ -1225,32 +1562,40 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1225 1562
1226 if (sdp->sd_args.ar_rgrplvb) 1563 if (sdp->sd_args.ar_rgrplvb)
1227 flags |= GL_SKIP; 1564 flags |= GL_SKIP;
1228 rs = ip->i_res;
1229 rs->rs_requested = requested; 1565 rs->rs_requested = requested;
1230 if (gfs2_assert_warn(sdp, requested)) { 1566 if (gfs2_assert_warn(sdp, requested)) {
1231 error = -EINVAL; 1567 error = -EINVAL;
1232 goto out; 1568 goto out;
1233 } 1569 }
1234 1570 if (gfs2_rs_active(rs)) {
1235 if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) 1571 begin = rs->rs_rgd;
1236 rgd = begin = ip->i_rgd; 1572 flags = 0; /* Yoda: Do or do not. There is no try */
1237 else 1573 } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) {
1238 rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); 1574 rs->rs_rgd = begin = ip->i_rgd;
1239 1575 } else {
1240 if (rgd == NULL) 1576 rs->rs_rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1);
1577 }
1578 if (rs->rs_rgd == NULL)
1241 return -EBADSLT; 1579 return -EBADSLT;
1242 1580
1243 while (loops < 3) { 1581 while (loops < 3) {
1244 rg_locked = 0; 1582 rg_locked = 0;
1245 1583
1246 if (gfs2_glock_is_locked_by_me(rgd->rd_gl)) { 1584 if (gfs2_glock_is_locked_by_me(rs->rs_rgd->rd_gl)) {
1247 rg_locked = 1; 1585 rg_locked = 1;
1248 error = 0; 1586 error = 0;
1587 } else if (!loops && !gfs2_rs_active(rs) &&
1588 rs->rs_rgd->rd_rs_cnt > RGRP_RSRV_MAX_CONTENDERS) {
1589 /* If the rgrp already is maxed out for contenders,
1590 we can eliminate it as a "first pass" without even
1591 requesting the rgrp glock. */
1592 error = GLR_TRYFAILED;
1249 } else { 1593 } else {
1250 error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 1594 error = gfs2_glock_nq_init(rs->rs_rgd->rd_gl,
1251 flags, &rs->rs_rgd_gh); 1595 LM_ST_EXCLUSIVE, flags,
1596 &rs->rs_rgd_gh);
1252 if (!error && sdp->sd_args.ar_rgrplvb) { 1597 if (!error && sdp->sd_args.ar_rgrplvb) {
1253 error = update_rgrp_lvb(rgd); 1598 error = update_rgrp_lvb(rs->rs_rgd);
1254 if (error) { 1599 if (error) {
1255 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1600 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1256 return error; 1601 return error;
@@ -1259,25 +1604,37 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested)
1259 } 1604 }
1260 switch (error) { 1605 switch (error) {
1261 case 0: 1606 case 0:
1262 if (try_rgrp_fit(rgd, ip)) { 1607 if (gfs2_rs_active(rs)) {
1608 if (unclaimed_blocks(rs->rs_rgd) +
1609 rs->rs_free >= rs->rs_requested) {
1610 ip->i_rgd = rs->rs_rgd;
1611 return 0;
1612 }
1613 /* We have a multi-block reservation, but the
1614 rgrp doesn't have enough free blocks to
1615 satisfy the request. Free the reservation
1616 and look for a suitable rgrp. */
1617 gfs2_rs_deltree(rs);
1618 }
1619 if (try_rgrp_fit(rs->rs_rgd, ip)) {
1263 if (sdp->sd_args.ar_rgrplvb) 1620 if (sdp->sd_args.ar_rgrplvb)
1264 gfs2_rgrp_bh_get(rgd); 1621 gfs2_rgrp_bh_get(rs->rs_rgd);
1265 ip->i_rgd = rgd; 1622 ip->i_rgd = rs->rs_rgd;
1266 return 0; 1623 return 0;
1267 } 1624 }
1268 if (rgd->rd_flags & GFS2_RDF_CHECK) { 1625 if (rs->rs_rgd->rd_flags & GFS2_RDF_CHECK) {
1269 if (sdp->sd_args.ar_rgrplvb) 1626 if (sdp->sd_args.ar_rgrplvb)
1270 gfs2_rgrp_bh_get(rgd); 1627 gfs2_rgrp_bh_get(rs->rs_rgd);
1271 try_rgrp_unlink(rgd, &last_unlinked, 1628 try_rgrp_unlink(rs->rs_rgd, &last_unlinked,
1272 ip->i_no_addr); 1629 ip->i_no_addr);
1273 } 1630 }
1274 if (!rg_locked) 1631 if (!rg_locked)
1275 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1632 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1276 /* fall through */ 1633 /* fall through */
1277 case GLR_TRYFAILED: 1634 case GLR_TRYFAILED:
1278 rgd = gfs2_rgrpd_get_next(rgd); 1635 rs->rs_rgd = gfs2_rgrpd_get_next(rs->rs_rgd);
1279 rgd = rgd ? : begin; /* if NULL, wrap */ 1636 rs->rs_rgd = rs->rs_rgd ? : begin; /* if NULL, wrap */
1280 if (rgd != begin) /* If we didn't wrap */ 1637 if (rs->rs_rgd != begin) /* If we didn't wrap */
1281 break; 1638 break;
1282 1639
1283 flags &= ~LM_FLAG_TRY; 1640 flags &= ~LM_FLAG_TRY;
@@ -1315,6 +1672,12 @@ void gfs2_inplace_release(struct gfs2_inode *ip)
1315{ 1672{
1316 struct gfs2_blkreserv *rs = ip->i_res; 1673 struct gfs2_blkreserv *rs = ip->i_res;
1317 1674
1675 if (!rs)
1676 return;
1677
1678 if (!rs->rs_free)
1679 gfs2_rs_deltree(rs);
1680
1318 if (rs->rs_rgd_gh.gh_gl) 1681 if (rs->rs_rgd_gh.gh_gl)
1319 gfs2_glock_dq_uninit(&rs->rs_rgd_gh); 1682 gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
1320 rs->rs_requested = 0; 1683 rs->rs_requested = 0;
@@ -1413,7 +1776,27 @@ do_search:
1413 if (state != GFS2_BLKST_UNLINKED && bi->bi_clone) 1776 if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1414 buffer = bi->bi_clone + bi->bi_offset; 1777 buffer = bi->bi_clone + bi->bi_offset;
1415 1778
1416 biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state); 1779 while (1) {
1780 struct gfs2_blkreserv *rs;
1781 u32 rgblk;
1782
1783 biblk = gfs2_bitfit(buffer, bi->bi_len, goal, state);
1784 if (biblk == BFITNOENT)
1785 break;
1786 /* Check if this block is already reserved */
1787 rgblk = gfs2_bi2rgd_blk(bi, biblk);
1788 rs = rs_find(rgd, rgblk);
1789 if (rs == NULL)
1790 break;
1791
1792 BUG_ON(rs->rs_bi != bi);
1793 biblk = BFITNOENT;
1794 /* This should jump to the first block after the
1795 reservation. */
1796 goal = rs->rs_biblk + rs->rs_free;
1797 if (goal >= bi->bi_len * GFS2_NBBY)
1798 break;
1799 }
1417 if (biblk != BFITNOENT) 1800 if (biblk != BFITNOENT)
1418 break; 1801 break;
1419 1802
@@ -1449,8 +1832,9 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
1449 u32 blk, bool dinode, unsigned int *n) 1832 u32 blk, bool dinode, unsigned int *n)
1450{ 1833{
1451 const unsigned int elen = *n; 1834 const unsigned int elen = *n;
1452 u32 goal; 1835 u32 goal, rgblk;
1453 const u8 *buffer = NULL; 1836 const u8 *buffer = NULL;
1837 struct gfs2_blkreserv *rs;
1454 1838
1455 *n = 0; 1839 *n = 0;
1456 buffer = bi->bi_bh->b_data + bi->bi_offset; 1840 buffer = bi->bi_bh->b_data + bi->bi_offset;
@@ -1463,6 +1847,10 @@ static u64 gfs2_alloc_extent(struct gfs2_rgrpd *rgd, struct gfs2_bitmap *bi,
1463 goal++; 1847 goal++;
1464 if (goal >= (bi->bi_len * GFS2_NBBY)) 1848 if (goal >= (bi->bi_len * GFS2_NBBY))
1465 break; 1849 break;
1850 rgblk = gfs2_bi2rgd_blk(bi, goal);
1851 rs = rs_find(rgd, rgblk);
1852 if (rs) /* Oops, we bumped into someone's reservation */
1853 break;
1466 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != 1854 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1467 GFS2_BLKST_FREE) 1855 GFS2_BLKST_FREE)
1468 break; 1856 break;
@@ -1538,12 +1926,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1538 1926
1539int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) 1927int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
1540{ 1928{
1541 const struct gfs2_rgrpd *rgd = gl->gl_object; 1929 struct gfs2_rgrpd *rgd = gl->gl_object;
1930 struct gfs2_blkreserv *trs;
1931 const struct rb_node *n;
1932
1542 if (rgd == NULL) 1933 if (rgd == NULL)
1543 return 0; 1934 return 0;
1544 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", 1935 gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u r:%u\n",
1545 (unsigned long long)rgd->rd_addr, rgd->rd_flags, 1936 (unsigned long long)rgd->rd_addr, rgd->rd_flags,
1546 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); 1937 rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
1938 rgd->rd_reserved);
1939 spin_lock(&rgd->rd_rsspin);
1940 for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
1941 trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
1942 dump_rs(seq, trs);
1943 }
1944 spin_unlock(&rgd->rd_rsspin);
1547 return 0; 1945 return 0;
1548} 1946}
1549 1947
@@ -1558,10 +1956,63 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
1558} 1956}
1559 1957
1560/** 1958/**
1959 * claim_reserved_blks - Claim previously reserved blocks
1960 * @ip: the inode that's claiming the reservation
1961 * @dinode: 1 if this block is a dinode block, otherwise data block
1962 * @nblocks: desired extent length
1963 *
1964 * Lay claim to previously allocated block reservation blocks.
1965 * Returns: Starting block number of the blocks claimed.
1966 * Sets *nblocks to the actual extent length allocated.
1967 */
1968static u64 claim_reserved_blks(struct gfs2_inode *ip, bool dinode,
1969 unsigned int *nblocks)
1970{
1971 struct gfs2_blkreserv *rs = ip->i_res;
1972 struct gfs2_rgrpd *rgd = rs->rs_rgd;
1973 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1974 struct gfs2_bitmap *bi;
1975 u64 start_block = gfs2_rs_startblk(rs);
1976 const unsigned int elen = *nblocks;
1977
1978 /*BUG_ON(!gfs2_glock_is_locked_by_me(ip->i_gl));*/
1979 gfs2_assert_withdraw(sdp, rgd);
1980 /*BUG_ON(!gfs2_glock_is_locked_by_me(rgd->rd_gl));*/
1981 bi = rs->rs_bi;
1982 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1983
1984 for (*nblocks = 0; *nblocks < elen && rs->rs_free; (*nblocks)++) {
1985 /* Make sure the bitmap hasn't changed */
1986 gfs2_setbit(rgd, bi->bi_clone, bi, rs->rs_biblk,
1987 dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED);
1988 rs->rs_biblk++;
1989 rs->rs_free--;
1990
1991 BUG_ON(!rgd->rd_reserved);
1992 rgd->rd_reserved--;
1993 dinode = false;
1994 trace_gfs2_rs(ip, rs, TRACE_RS_CLAIM);
1995 }
1996
1997 if (!rs->rs_free) {
1998 struct gfs2_rgrpd *rgd = ip->i_res->rs_rgd;
1999
2000 gfs2_rs_deltree(rs);
2001 /* -nblocks because we haven't returned to do the math yet.
2002 I'm doing the math backwards to prevent negative numbers,
2003 but think of it as:
2004 if (unclaimed_blocks(rgd) - *nblocks >= RGRP_RSRV_MINBLKS) */
2005 if (unclaimed_blocks(rgd) >= RGRP_RSRV_MINBLKS + *nblocks)
2006 rg_mblk_search(rgd, ip);
2007 }
2008 return start_block;
2009}
2010
2011/**
1561 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode 2012 * gfs2_alloc_blocks - Allocate one or more blocks of data and/or a dinode
1562 * @ip: the inode to allocate the block for 2013 * @ip: the inode to allocate the block for
1563 * @bn: Used to return the starting block number 2014 * @bn: Used to return the starting block number
1564 * @ndata: requested number of blocks/extent length (value/result) 2015 * @nblocks: requested number of blocks/extent length (value/result)
1565 * @dinode: 1 if we're allocating a dinode block, else 0 2016 * @dinode: 1 if we're allocating a dinode block, else 0
1566 * @generation: the generation number of the inode 2017 * @generation: the generation number of the inode
1567 * 2018 *
@@ -1586,20 +2037,34 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
1586 if (ip->i_res->rs_requested == 0) 2037 if (ip->i_res->rs_requested == 0)
1587 return -ECANCELED; 2038 return -ECANCELED;
1588 2039
1589 rgd = ip->i_rgd; 2040 /* Check if we have a multi-block reservation, and if so, claim the
1590 2041 next free block from it. */
1591 if (!dinode && rgrp_contains_block(rgd, ip->i_goal)) 2042 if (gfs2_rs_active(ip->i_res)) {
1592 goal = ip->i_goal - rgd->rd_data0; 2043 BUG_ON(!ip->i_res->rs_free);
1593 else 2044 rgd = ip->i_res->rs_rgd;
1594 goal = rgd->rd_last_alloc; 2045 block = claim_reserved_blks(ip, dinode, nblocks);
1595 2046 } else {
1596 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi); 2047 rgd = ip->i_rgd;
1597 2048
1598 /* Since all blocks are reserved in advance, this shouldn't happen */ 2049 if (!dinode && rgrp_contains_block(rgd, ip->i_goal))
1599 if (blk == BFITNOENT) 2050 goal = ip->i_goal - rgd->rd_data0;
1600 goto rgrp_error; 2051 else
2052 goal = rgd->rd_last_alloc;
2053
2054 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, &bi);
2055
2056 /* Since all blocks are reserved in advance, this shouldn't
2057 happen */
2058 if (blk == BFITNOENT) {
2059 printk(KERN_WARNING "BFITNOENT, nblocks=%u\n",
2060 *nblocks);
2061 printk(KERN_WARNING "FULL=%d\n",
2062 test_bit(GBF_FULL, &rgd->rd_bits->bi_flags));
2063 goto rgrp_error;
2064 }
1601 2065
1602 block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks); 2066 block = gfs2_alloc_extent(rgd, bi, blk, dinode, nblocks);
2067 }
1603 ndata = *nblocks; 2068 ndata = *nblocks;
1604 if (dinode) 2069 if (dinode)
1605 ndata--; 2070 ndata--;
@@ -1616,8 +2081,10 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
1616 brelse(dibh); 2081 brelse(dibh);
1617 } 2082 }
1618 } 2083 }
1619 if (rgd->rd_free < *nblocks) 2084 if (rgd->rd_free < *nblocks) {
2085 printk(KERN_WARNING "nblocks=%u\n", *nblocks);
1620 goto rgrp_error; 2086 goto rgrp_error;
2087 }
1621 2088
1622 rgd->rd_free -= *nblocks; 2089 rgd->rd_free -= *nblocks;
1623 if (dinode) { 2090 if (dinode) {
@@ -1877,6 +2344,7 @@ void gfs2_rlist_free(struct gfs2_rgrp_list *rlist)
1877 for (x = 0; x < rlist->rl_rgrps; x++) 2344 for (x = 0; x < rlist->rl_rgrps; x++)
1878 gfs2_holder_uninit(&rlist->rl_ghs[x]); 2345 gfs2_holder_uninit(&rlist->rl_ghs[x]);
1879 kfree(rlist->rl_ghs); 2346 kfree(rlist->rl_ghs);
2347 rlist->rl_ghs = NULL;
1880 } 2348 }
1881} 2349}
1882 2350
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 5d8314dbc899..ca6e26729b86 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -13,6 +13,14 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/uaccess.h> 14#include <linux/uaccess.h>
15 15
16/* Since each block in the file system is represented by two bits in the
17 * bitmap, one 64-bit word in the bitmap will represent 32 blocks.
18 * By reserving 32 blocks at a time, we can optimize / shortcut how we search
19 * through the bitmaps by looking a word at a time.
20 */
21#define RGRP_RSRV_MINBYTES 8
22#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
23
16struct gfs2_rgrpd; 24struct gfs2_rgrpd;
17struct gfs2_sbd; 25struct gfs2_sbd;
18struct gfs2_holder; 26struct gfs2_holder;
@@ -29,6 +37,8 @@ extern void gfs2_free_clones(struct gfs2_rgrpd *rgd);
29extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); 37extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh);
30extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); 38extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh);
31 39
40extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip);
41
32extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); 42extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested);
33extern void gfs2_inplace_release(struct gfs2_inode *ip); 43extern void gfs2_inplace_release(struct gfs2_inode *ip);
34 44
@@ -36,6 +46,7 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
36 bool dinode, u64 *generation); 46 bool dinode, u64 *generation);
37 47
38extern int gfs2_rs_alloc(struct gfs2_inode *ip); 48extern int gfs2_rs_alloc(struct gfs2_inode *ip);
49extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
39extern void gfs2_rs_delete(struct gfs2_inode *ip); 50extern void gfs2_rs_delete(struct gfs2_inode *ip);
40extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); 51extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
41extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); 52extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -62,7 +73,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
62 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed); 73 const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed);
63extern int gfs2_fitrim(struct file *filp, void __user *argp); 74extern int gfs2_fitrim(struct file *filp, void __user *argp);
64 75
65/* This is how to tell if a reservation is "inplace" reserved: */ 76/* This is how to tell if a multi-block reservation is "inplace" reserved: */
66static inline int gfs2_mb_reserved(struct gfs2_inode *ip) 77static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
67{ 78{
68 if (ip->i_res && ip->i_res->rs_requested) 79 if (ip->i_res && ip->i_res->rs_requested)
@@ -70,4 +81,22 @@ static inline int gfs2_mb_reserved(struct gfs2_inode *ip)
70 return 0; 81 return 0;
71} 82}
72 83
84/* This is how to tell if a multi-block reservation is in the rgrp tree: */
85static inline int gfs2_rs_active(struct gfs2_blkreserv *rs)
86{
87 if (rs && rs->rs_bi)
88 return 1;
89 return 0;
90}
91
92static inline u32 gfs2_bi2rgd_blk(const struct gfs2_bitmap *bi, u32 blk)
93{
94 return (bi->bi_start * GFS2_NBBY) + blk;
95}
96
97static inline u64 gfs2_rs_startblk(const struct gfs2_blkreserv *rs)
98{
99 return gfs2_bi2rgd_blk(rs->rs_bi, rs->rs_biblk) + rs->rs_rgd->rd_data0;
100}
101
73#endif /* __RGRP_DOT_H__ */ 102#endif /* __RGRP_DOT_H__ */
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 788068758f3a..b1502c4bc60d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1420,6 +1420,10 @@ static int gfs2_dinode_dealloc(struct gfs2_inode *ip)
1420 return -EIO; 1420 return -EIO;
1421 } 1421 }
1422 1422
1423 error = gfs2_rindex_update(sdp);
1424 if (error)
1425 return error;
1426
1423 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1427 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1424 if (error) 1428 if (error)
1425 return error; 1429 return error;
@@ -1550,6 +1554,9 @@ out_truncate:
1550 1554
1551out_unlock: 1555out_unlock:
1552 /* Error path for case 1 */ 1556 /* Error path for case 1 */
1557 if (gfs2_rs_active(ip->i_res))
1558 gfs2_rs_deltree(ip->i_res);
1559
1553 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) 1560 if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags))
1554 gfs2_glock_dq(&ip->i_iopen_gh); 1561 gfs2_glock_dq(&ip->i_iopen_gh);
1555 gfs2_holder_uninit(&ip->i_iopen_gh); 1562 gfs2_holder_uninit(&ip->i_iopen_gh);
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h
index 1b8b81588199..a25c252fe412 100644
--- a/fs/gfs2/trace_gfs2.h
+++ b/fs/gfs2/trace_gfs2.h
@@ -14,6 +14,7 @@
14#include <linux/ktime.h> 14#include <linux/ktime.h>
15#include "incore.h" 15#include "incore.h"
16#include "glock.h" 16#include "glock.h"
17#include "rgrp.h"
17 18
18#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn } 19#define dlm_state_name(nn) { DLM_LOCK_##nn, #nn }
19#define glock_trace_name(x) __print_symbolic(x, \ 20#define glock_trace_name(x) __print_symbolic(x, \
@@ -31,6 +32,17 @@
31 { GFS2_BLKST_DINODE, "dinode" }, \ 32 { GFS2_BLKST_DINODE, "dinode" }, \
32 { GFS2_BLKST_UNLINKED, "unlinked" }) 33 { GFS2_BLKST_UNLINKED, "unlinked" })
33 34
35#define TRACE_RS_DELETE 0
36#define TRACE_RS_TREEDEL 1
37#define TRACE_RS_INSERT 2
38#define TRACE_RS_CLAIM 3
39
40#define rs_func_name(x) __print_symbolic(x, \
41 { 0, "del " }, \
42 { 1, "tdel" }, \
43 { 2, "ins " }, \
44 { 3, "clm " })
45
34#define show_glock_flags(flags) __print_flags(flags, "", \ 46#define show_glock_flags(flags) __print_flags(flags, "", \
35 {(1UL << GLF_LOCK), "l" }, \ 47 {(1UL << GLF_LOCK), "l" }, \
36 {(1UL << GLF_DEMOTE), "D" }, \ 48 {(1UL << GLF_DEMOTE), "D" }, \
@@ -470,6 +482,7 @@ TRACE_EVENT(gfs2_block_alloc,
470 __field( u8, block_state ) 482 __field( u8, block_state )
471 __field( u64, rd_addr ) 483 __field( u64, rd_addr )
472 __field( u32, rd_free_clone ) 484 __field( u32, rd_free_clone )
485 __field( u32, rd_reserved )
473 ), 486 ),
474 487
475 TP_fast_assign( 488 TP_fast_assign(
@@ -480,16 +493,58 @@ TRACE_EVENT(gfs2_block_alloc,
480 __entry->block_state = block_state; 493 __entry->block_state = block_state;
481 __entry->rd_addr = rgd->rd_addr; 494 __entry->rd_addr = rgd->rd_addr;
482 __entry->rd_free_clone = rgd->rd_free_clone; 495 __entry->rd_free_clone = rgd->rd_free_clone;
496 __entry->rd_reserved = rgd->rd_reserved;
483 ), 497 ),
484 498
485 TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u", 499 TP_printk("%u,%u bmap %llu alloc %llu/%lu %s rg:%llu rf:%u rr:%lu",
486 MAJOR(__entry->dev), MINOR(__entry->dev), 500 MAJOR(__entry->dev), MINOR(__entry->dev),
487 (unsigned long long)__entry->inum, 501 (unsigned long long)__entry->inum,
488 (unsigned long long)__entry->start, 502 (unsigned long long)__entry->start,
489 (unsigned long)__entry->len, 503 (unsigned long)__entry->len,
490 block_state_name(__entry->block_state), 504 block_state_name(__entry->block_state),
491 (unsigned long long)__entry->rd_addr, 505 (unsigned long long)__entry->rd_addr,
492 __entry->rd_free_clone) 506 __entry->rd_free_clone, (unsigned long)__entry->rd_reserved)
507);
508
509/* Keep track of multi-block reservations as they are allocated/freed */
510TRACE_EVENT(gfs2_rs,
511
512 TP_PROTO(const struct gfs2_inode *ip, const struct gfs2_blkreserv *rs,
513 u8 func),
514
515 TP_ARGS(ip, rs, func),
516
517 TP_STRUCT__entry(
518 __field( dev_t, dev )
519 __field( u64, rd_addr )
520 __field( u32, rd_free_clone )
521 __field( u32, rd_reserved )
522 __field( u64, inum )
523 __field( u64, start )
524 __field( u32, free )
525 __field( u8, func )
526 ),
527
528 TP_fast_assign(
529 __entry->dev = rs->rs_rgd ? rs->rs_rgd->rd_sbd->sd_vfs->s_dev : 0;
530 __entry->rd_addr = rs->rs_rgd ? rs->rs_rgd->rd_addr : 0;
531 __entry->rd_free_clone = rs->rs_rgd ? rs->rs_rgd->rd_free_clone : 0;
532 __entry->rd_reserved = rs->rs_rgd ? rs->rs_rgd->rd_reserved : 0;
533 __entry->inum = ip ? ip->i_no_addr : 0;
534 __entry->start = gfs2_rs_startblk(rs);
535 __entry->free = rs->rs_free;
536 __entry->func = func;
537 ),
538
539 TP_printk("%u,%u bmap %llu resrv %llu rg:%llu rf:%lu rr:%lu %s "
540 "f:%lu",
541 MAJOR(__entry->dev), MINOR(__entry->dev),
542 (unsigned long long)__entry->inum,
543 (unsigned long long)__entry->start,
544 (unsigned long long)__entry->rd_addr,
545 (unsigned long)__entry->rd_free_clone,
546 (unsigned long)__entry->rd_reserved,
547 rs_func_name(__entry->func), (unsigned long)__entry->free)
493); 548);
494 549
495#endif /* _TRACE_GFS2_H */ 550#endif /* _TRACE_GFS2_H */
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 523c0de0d805..27a0b4a901f5 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -327,6 +327,10 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
327{ 327{
328 int error; 328 int error;
329 329
330 error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
331 if (error)
332 return error;
333
330 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 334 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
331 if (error) 335 if (error)
332 goto out_alloc; 336 goto out_alloc;
@@ -710,6 +714,10 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
710 struct buffer_head *dibh; 714 struct buffer_head *dibh;
711 int error; 715 int error;
712 716
717 error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
718 if (error)
719 return error;
720
713 error = gfs2_quota_lock_check(ip); 721 error = gfs2_quota_lock_check(ip);
714 if (error) 722 if (error)
715 return error; 723 return error;
@@ -1483,6 +1491,10 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1483{ 1491{
1484 int error; 1492 int error;
1485 1493
1494 error = gfs2_rindex_update(GFS2_SB(&ip->i_inode));
1495 if (error)
1496 return error;
1497
1486 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1498 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1487 if (error) 1499 if (error)
1488 return error; 1500 return error;