summaryrefslogtreecommitdiffstats
path: root/fs/gfs2
diff options
context:
space:
mode:
authorBob Peterson <rpeterso@redhat.com>2017-02-06 08:28:32 -0500
committerBob Peterson <rpeterso@redhat.com>2017-04-19 08:25:43 -0400
commitd552a2b9b33eafdb5eb16c643e745deb564dda7f (patch)
treed3d0ce95aff18b02aec9061140647110d8566ed2 /fs/gfs2
parentd4d7fc12b642a16732adeacefdaebe684bcb2218 (diff)
GFS2: Non-recursive delete
Implement truncate/delete as a non-recursive algorithm. The older algorithm was implemented with recursion to strip off one layer at a time (going by height, starting with the maximum height). This version tries to do the same thing but without recursion, and without needing to allocate new structures or lists in memory. For example, say you want to truncate a very large file to 1 byte, and its end-of-file metapath is: 0.505.463.428. The starting metapath would be 0.0.0.0. Since it's a truncate to non-zero, it needs to preserve that byte, and all metadata pointing to it. So it would start at 0.0.0.0, look up all its metadata buffers, then free all data blocks pointed to at the highest level. After that buffer is "swept", it moves on to 0.0.0.1, then 0.0.0.2, etc., reading in buffers and sweeping them clean. When it gets to the end of the 0.0.0 metadata buffer (for 4K blocks the last valid one is 0.0.0.508), it backs up to the previous height and starts working on 0.0.1.0, then 0.0.1.1, and so forth. After it reaches the end and sweeps 0.0.1.508, it continues with 0.0.2.0, and so on. When that height is exhausted and it reaches 0.0.508.508, it backs up another level, to 0.1.0.0, then 0.1.0.1, through 0.1.0.508. So it has to keep marching backwards and forwards through the metadata until it's all swept clean. Once it has all the data blocks freed, it lowers the strip height, and begins the process all over again, but with one less height. This time it sweeps 0.0.0 through 0.505.463. When that's clean, it lowers the strip height again and works to free 0.505. Eventually it strips the lowest height, 0. For a delete or truncate to 0, all metadata for all heights of 0.0.0.0 would be freed. For a truncate to 1 byte, 0.0.0.0 would be preserved. This isn't much different from normal integer incrementing, where an integer gets incremented from 0000 (0.0.0.0) to 3021 (3.0.2.1). 
So 0000 gets incremented to 0001, 0002, up to 0009, then on to 0010, 0011 up to 0099, then 0100 and so forth. It's just that each "digit" goes from 0 to 508 (for a total of 509 pointers) rather than from 0 to 9. Note that the dinode will only have 483 pointers due to the dinode structure itself. Also note: this is just an example. These numbers (509 and 483) are based on a standard 4K block size. Smaller block sizes will yield smaller numbers of indirect pointers accordingly. The truncation process is accomplished with the help of two major functions and a few helper functions. Functions do_strip and recursive_scan are obsolete and have been removed. New function sweep_bh_for_rgrps cleans a buffer_head pointed to by the given metapath and height. By cleaning, I mean it frees all blocks starting at the offset passed in metapath. It starts at the first block in the buffer pointed to by the metapath and identifies its resource group (rgrp). From there it frees all subsequent block pointers that lie within that rgrp. If it's already inside a transaction, it stays within it as long as it can. In other words, it doesn't close a transaction until it knows it's freed what it can from the resource group. In this way, multiple buffers may be cleaned in a single transaction, as long as those blocks in the buffer all lie within the same rgrp. If it's not in a transaction, it starts one. If the buffer_head has references to blocks within multiple rgrps, it frees all the blocks inside the first rgrp it finds, then closes the transaction. Then it repeats the cycle: identifies the next unfreed block, uses it to find its rgrp, then starts a new transaction for that set. It repeats this process until the buffer_head contains no more references to any blocks past the given metapath. Function trunc_dealloc has been reworked into a finite state automaton. 
It has basically 3 active states: DEALLOC_MP_FULL, DEALLOC_MP_LOWER, and DEALLOC_FILL_MP: The DEALLOC_MP_FULL state implies the metapath has a full set of buffers out to the "shrink height", and therefore, it can call function sweep_bh_for_rgrps to free the blocks within the highest height of the metapath. If it's just swept the lowest level (or an error has occurred) the state machine is ended. Otherwise it proceeds to the DEALLOC_MP_LOWER state. The DEALLOC_MP_LOWER state implies we are finished with a given buffer_head, which may now be released, and therefore we are then missing some buffer information from the metapath. So we need to find more buffers to read in. In most cases, this is just a matter of releasing the buffer_head and moving to the next pointer from the previous height, so it may be read in and swept as well. If it can't find another non-null pointer to process, it checks whether it's reached the end of a height and needs to lower the strip height, or whether it still needs to move forward through the previous height's metadata. In this state, all zero-pointers are skipped. From this state, it can only loop around (once more backing up another height) or, once a valid metapath is found (one that has non-zero pointers), proceed to state DEALLOC_FILL_MP. The DEALLOC_FILL_MP state implies that we have a metapath but not all its buffers are read in. So we must proceed to read in buffer_heads until the metapath has a valid buffer for every height. If the previous state backed us up 3 heights, we may need to read in a buffer, increment the height, then repeat the process until buffers have been read in for all required heights. If it's successful reading a buffer, and it's at the highest height we need, it proceeds back to the DEALLOC_MP_FULL state. If it's unable to fill in a buffer (encounters a hole, etc.), it tries to find another non-zero block pointer. If they're all zero, it lowers the height and returns to the DEALLOC_MP_LOWER state. 
If it finds a good non-null pointer, it loops around and reads it in, while keeping the metapath in lock-step with the pointers it examines. The state machine runs until the truncation request is satisfied. Then any transactions are ended, the quota and statfs data are updated, and the function is complete. Helper function metaptr1 was introduced to be an easy way to determine the start of a buffer_head's indirect pointers. Helper function lookup_mp_height was introduced to find a metapath index and read in the buffer that corresponds to it. In this way, function lookup_metapath becomes a simple loop to call it for every height. Helper function fillup_metapath is similar to lookup_metapath except it can do partial lookups. If the state machine backed up multiple levels (like 2999 wrapping to 3000) it needs to find out the next starting point and start issuing metadata reads at that point. Helper function hptrs is a shortcut to determine how many pointers should be expected in a buffer. Height 0 is the dinode which has fewer pointers than the others. Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Diffstat (limited to 'fs/gfs2')
-rw-r--r--fs/gfs2/bmap.c741
-rw-r--r--fs/gfs2/rgrp.c7
-rw-r--r--fs/gfs2/rgrp.h7
3 files changed, 463 insertions, 292 deletions
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 01b97c012c6e..3814a60e0aea 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -38,11 +38,6 @@ struct metapath {
38 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 38 __u16 mp_list[GFS2_MAX_META_HEIGHT];
39}; 39};
40 40
41struct strip_mine {
42 int sm_first;
43 unsigned int sm_height;
44};
45
46/** 41/**
47 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page 42 * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
48 * @ip: the inode 43 * @ip: the inode
@@ -253,6 +248,19 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
253} 248}
254 249
255/** 250/**
251 * metaptr1 - Return the first possible metadata pointer in a metapath buffer
252 * @height: The metadata height (0 = dinode)
253 * @mp: The metapath
254 */
255static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
256{
257 struct buffer_head *bh = mp->mp_bh[height];
258 if (height == 0)
259 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
260 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
261}
262
263/**
256 * metapointer - Return pointer to start of metadata in a buffer 264 * metapointer - Return pointer to start of metadata in a buffer
257 * @height: The metadata height (0 = dinode) 265 * @height: The metadata height (0 = dinode)
258 * @mp: The metapath 266 * @mp: The metapath
@@ -264,10 +272,8 @@ static inline unsigned int metapath_branch_start(const struct metapath *mp)
264 272
265static inline __be64 *metapointer(unsigned int height, const struct metapath *mp) 273static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
266{ 274{
267 struct buffer_head *bh = mp->mp_bh[height]; 275 __be64 *p = metaptr1(height, mp);
268 unsigned int head_size = (height > 0) ? 276 return p + mp->mp_list[height];
269 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
270 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
271} 277}
272 278
273static void gfs2_metapath_ra(struct gfs2_glock *gl, 279static void gfs2_metapath_ra(struct gfs2_glock *gl,
@@ -296,6 +302,23 @@ static void gfs2_metapath_ra(struct gfs2_glock *gl,
296} 302}
297 303
298/** 304/**
305 * lookup_mp_height - helper function for lookup_metapath
306 * @ip: the inode
307 * @mp: the metapath
308 * @h: the height which needs looking up
309 */
310static int lookup_mp_height(struct gfs2_inode *ip, struct metapath *mp, int h)
311{
312 __be64 *ptr = metapointer(h, mp);
313 u64 dblock = be64_to_cpu(*ptr);
314
315 if (!dblock)
316 return h + 1;
317
318 return gfs2_meta_indirect_buffer(ip, h + 1, dblock, &mp->mp_bh[h + 1]);
319}
320
321/**
299 * lookup_metapath - Walk the metadata tree to a specific point 322 * lookup_metapath - Walk the metadata tree to a specific point
300 * @ip: The inode 323 * @ip: The inode
301 * @mp: The metapath 324 * @mp: The metapath
@@ -316,17 +339,10 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
316{ 339{
317 unsigned int end_of_metadata = ip->i_height - 1; 340 unsigned int end_of_metadata = ip->i_height - 1;
318 unsigned int x; 341 unsigned int x;
319 __be64 *ptr;
320 u64 dblock;
321 int ret; 342 int ret;
322 343
323 for (x = 0; x < end_of_metadata; x++) { 344 for (x = 0; x < end_of_metadata; x++) {
324 ptr = metapointer(x, mp); 345 ret = lookup_mp_height(ip, mp, x);
325 dblock = be64_to_cpu(*ptr);
326 if (!dblock)
327 return x + 1;
328
329 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, &mp->mp_bh[x+1]);
330 if (ret) 346 if (ret)
331 return ret; 347 return ret;
332 } 348 }
@@ -334,6 +350,35 @@ static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
334 return ip->i_height; 350 return ip->i_height;
335} 351}
336 352
353/**
354 * fillup_metapath - fill up buffers for the metadata path to a specific height
355 * @ip: The inode
356 * @mp: The metapath
357 * @h: The height to which it should be mapped
358 *
359 * Similar to lookup_metapath, but does lookups for a range of heights
360 *
361 * Returns: error or height of metadata tree
362 */
363
364static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
365{
366 unsigned int start_h = h - 1;
367 int ret;
368
369 if (h) {
370 /* find the first buffer we need to look up. */
371 while (start_h > 0 && mp->mp_bh[start_h] == NULL)
372 start_h--;
373 for (; start_h < h; start_h++) {
374 ret = lookup_mp_height(ip, mp, start_h);
375 if (ret)
376 return ret;
377 }
378 }
379 return ip->i_height;
380}
381
337static inline void release_metapath(struct metapath *mp) 382static inline void release_metapath(struct metapath *mp)
338{ 383{
339 int i; 384 int i;
@@ -422,6 +467,13 @@ enum alloc_state {
422 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */ 467 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
423}; 468};
424 469
470static inline unsigned int hptrs(struct gfs2_sbd *sdp, const unsigned int hgt)
471{
472 if (hgt)
473 return sdp->sd_inptrs;
474 return sdp->sd_diptrs;
475}
476
425/** 477/**
426 * gfs2_bmap_alloc - Build a metadata tree of the requested height 478 * gfs2_bmap_alloc - Build a metadata tree of the requested height
427 * @inode: The GFS2 inode 479 * @inode: The GFS2 inode
@@ -620,7 +672,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
620 672
621 BUG_ON(maxlen == 0); 673 BUG_ON(maxlen == 0);
622 674
623 memset(mp.mp_bh, 0, sizeof(mp.mp_bh)); 675 memset(&mp, 0, sizeof(mp));
624 bmap_lock(ip, create); 676 bmap_lock(ip, create);
625 clear_buffer_mapped(bh_map); 677 clear_buffer_mapped(bh_map);
626 clear_buffer_new(bh_map); 678 clear_buffer_new(bh_map);
@@ -702,252 +754,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
702} 754}
703 755
704/** 756/**
705 * do_strip - Look for a layer a particular layer of the file and strip it off
706 * @ip: the inode
707 * @dibh: the dinode buffer
708 * @bh: A buffer of pointers
709 * @top: The first pointer in the buffer
710 * @bottom: One more than the last pointer
711 * @height: the height this buffer is at
712 * @sm: a pointer to a struct strip_mine
713 *
714 * Returns: errno
715 */
716
717static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
718 struct buffer_head *bh, __be64 *top, __be64 *bottom,
719 unsigned int height, struct strip_mine *sm)
720{
721 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
722 struct gfs2_rgrp_list rlist;
723 struct gfs2_trans *tr;
724 u64 bn, bstart;
725 u32 blen, btotal;
726 __be64 *p;
727 unsigned int rg_blocks = 0;
728 int metadata;
729 unsigned int revokes = 0;
730 int x;
731 int error;
732 int jblocks_rqsted;
733
734 error = gfs2_rindex_update(sdp);
735 if (error)
736 return error;
737
738 if (!*top)
739 sm->sm_first = 0;
740
741 if (height != sm->sm_height)
742 return 0;
743
744 if (sm->sm_first) {
745 top++;
746 sm->sm_first = 0;
747 }
748
749 metadata = (height != ip->i_height - 1);
750 if (metadata)
751 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
752 else if (ip->i_depth)
753 revokes = sdp->sd_inptrs;
754
755 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
756 bstart = 0;
757 blen = 0;
758
759 for (p = top; p < bottom; p++) {
760 if (!*p)
761 continue;
762
763 bn = be64_to_cpu(*p);
764
765 if (bstart + blen == bn)
766 blen++;
767 else {
768 if (bstart)
769 gfs2_rlist_add(ip, &rlist, bstart);
770
771 bstart = bn;
772 blen = 1;
773 }
774 }
775
776 if (bstart)
777 gfs2_rlist_add(ip, &rlist, bstart);
778 else
779 goto out; /* Nothing to do */
780
781 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
782
783 for (x = 0; x < rlist.rl_rgrps; x++) {
784 struct gfs2_rgrpd *rgd;
785 rgd = rlist.rl_ghs[x].gh_gl->gl_object;
786 rg_blocks += rgd->rd_length;
787 }
788
789 error = gfs2_glock_nq_m(rlist.rl_rgrps, rlist.rl_ghs);
790 if (error)
791 goto out_rlist;
792
793 if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
794 gfs2_rs_deltree(&ip->i_res);
795
796restart:
797 jblocks_rqsted = rg_blocks + RES_DINODE +
798 RES_INDIRECT + RES_STATFS + RES_QUOTA +
799 gfs2_struct2blk(sdp, revokes, sizeof(u64));
800 if (jblocks_rqsted > atomic_read(&sdp->sd_log_thresh2))
801 jblocks_rqsted = atomic_read(&sdp->sd_log_thresh2);
802 error = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
803 if (error)
804 goto out_rg_gunlock;
805
806 tr = current->journal_info;
807 down_write(&ip->i_rw_mutex);
808
809 gfs2_trans_add_meta(ip->i_gl, dibh);
810 gfs2_trans_add_meta(ip->i_gl, bh);
811
812 bstart = 0;
813 blen = 0;
814 btotal = 0;
815
816 for (p = top; p < bottom; p++) {
817 if (!*p)
818 continue;
819
820 /* check for max reasonable journal transaction blocks */
821 if (tr->tr_num_buf_new + RES_STATFS +
822 RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
823 if (rg_blocks >= tr->tr_num_buf_new)
824 rg_blocks -= tr->tr_num_buf_new;
825 else
826 rg_blocks = 0;
827 break;
828 }
829
830 bn = be64_to_cpu(*p);
831
832 if (bstart + blen == bn)
833 blen++;
834 else {
835 if (bstart) {
836 __gfs2_free_blocks(ip, bstart, blen, metadata);
837 btotal += blen;
838 }
839
840 bstart = bn;
841 blen = 1;
842 }
843
844 *p = 0;
845 gfs2_add_inode_blocks(&ip->i_inode, -1);
846 }
847 if (p == bottom)
848 rg_blocks = 0;
849
850 if (bstart) {
851 __gfs2_free_blocks(ip, bstart, blen, metadata);
852 btotal += blen;
853 }
854
855 gfs2_statfs_change(sdp, 0, +btotal, 0);
856 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
857 ip->i_inode.i_gid);
858
859 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
860
861 gfs2_dinode_out(ip, dibh->b_data);
862
863 up_write(&ip->i_rw_mutex);
864
865 gfs2_trans_end(sdp);
866
867 if (rg_blocks)
868 goto restart;
869
870out_rg_gunlock:
871 gfs2_glock_dq_m(rlist.rl_rgrps, rlist.rl_ghs);
872out_rlist:
873 gfs2_rlist_free(&rlist);
874out:
875 return error;
876}
877
878/**
879 * recursive_scan - recursively scan through the end of a file
880 * @ip: the inode
881 * @dibh: the dinode buffer
882 * @mp: the path through the metadata to the point to start
883 * @height: the height the recursion is at
884 * @block: the indirect block to look at
885 * @first: 1 if this is the first block
886 * @sm: data opaque to this function to pass to @bc
887 *
888 * When this is first called @height and @block should be zero and
889 * @first should be 1.
890 *
891 * Returns: errno
892 */
893
894static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
895 struct metapath *mp, unsigned int height,
896 u64 block, int first, struct strip_mine *sm)
897{
898 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
899 struct buffer_head *bh = NULL;
900 __be64 *top, *bottom;
901 u64 bn;
902 int error;
903 int mh_size = sizeof(struct gfs2_meta_header);
904
905 if (!height) {
906 error = gfs2_meta_inode_buffer(ip, &bh);
907 if (error)
908 return error;
909 dibh = bh;
910
911 top = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + mp->mp_list[0];
912 bottom = (__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)) + sdp->sd_diptrs;
913 } else {
914 error = gfs2_meta_indirect_buffer(ip, height, block, &bh);
915 if (error)
916 return error;
917
918 top = (__be64 *)(bh->b_data + mh_size) +
919 (first ? mp->mp_list[height] : 0);
920
921 bottom = (__be64 *)(bh->b_data + mh_size) + sdp->sd_inptrs;
922 }
923
924 error = do_strip(ip, dibh, bh, top, bottom, height, sm);
925 if (error)
926 goto out;
927
928 if (height < ip->i_height - 1) {
929
930 gfs2_metapath_ra(ip->i_gl, bh, top);
931
932 for (; top < bottom; top++, first = 0) {
933 if (!*top)
934 continue;
935
936 bn = be64_to_cpu(*top);
937
938 error = recursive_scan(ip, dibh, mp, height + 1, bn,
939 first, sm);
940 if (error)
941 break;
942 }
943 }
944out:
945 brelse(bh);
946 return error;
947}
948
949
950/**
951 * gfs2_block_truncate_page - Deal with zeroing out data for truncate 757 * gfs2_block_truncate_page - Deal with zeroing out data for truncate
952 * 758 *
953 * This is partly borrowed from ext3. 759 * This is partly borrowed from ext3.
@@ -1106,41 +912,406 @@ out:
1106 return error; 912 return error;
1107} 913}
1108 914
1109static int trunc_dealloc(struct gfs2_inode *ip, u64 size) 915/**
916 * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
917 * @ip: inode
918 * @rd_gh: holder of resource group glock
919 * @mp: current metapath fully populated with buffers
920 * @btotal: place to keep count of total blocks freed
921 * @hgt: height we're processing
922 * @preserve1: true if the pointer at the starting metapath must be preserved (the sweep then starts one pointer later)
923 *
924 * We sweep a metadata buffer (provided by the metapath) for blocks we need to
925 * free, and free them all. However, we do it one rgrp at a time. If this
926 * block has references to multiple rgrps, we break it into individual
927 * transactions. This allows other processes to use the rgrps while we're
928 * focused on a single one, for better concurrency / performance.
929 * At every transaction boundary, we rewrite the inode into the journal.
930 * That way the bitmaps are kept consistent with the inode and we can recover
931 * if we're interrupted by power-outages.
932 *
933 * Returns: 0, or return code if an error occurred.
934 * *btotal has the total number of blocks freed
935 */
936static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
937 const struct metapath *mp, u32 *btotal, int hgt,
938 bool preserve1)
1110{ 939{
1111 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 940 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1112 unsigned int height = ip->i_height; 941 struct gfs2_rgrpd *rgd;
1113 u64 lblock; 942 struct gfs2_trans *tr;
1114 struct metapath mp; 943 struct buffer_head *bh = mp->mp_bh[hgt];
1115 int error; 944 __be64 *top, *bottom, *p;
945 int blks_outside_rgrp;
946 u64 bn, bstart, isize_blks;
947 s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
948 int meta = ((hgt != ip->i_height - 1) ? 1 : 0);
949 int ret = 0;
950 bool buf_in_tr = false; /* buffer was added to transaction */
951
952 if (gfs2_metatype_check(sdp, bh,
953 (hgt ? GFS2_METATYPE_IN : GFS2_METATYPE_DI)))
954 return -EIO;
955
956more_rgrps:
957 blks_outside_rgrp = 0;
958 bstart = 0;
959 blen = 0;
960 top = metapointer(hgt, mp); /* first ptr from metapath */
961 /* If we're keeping some data at the truncation point, we've got to
962 preserve the metadata tree by adding 1 to the starting metapath. */
963 if (preserve1)
964 top++;
965
966 bottom = (__be64 *)(bh->b_data + bh->b_size);
967
968 for (p = top; p < bottom; p++) {
969 if (!*p)
970 continue;
971 bn = be64_to_cpu(*p);
972 if (gfs2_holder_initialized(rd_gh)) {
973 rgd = (struct gfs2_rgrpd *)rd_gh->gh_gl->gl_object;
974 gfs2_assert_withdraw(sdp,
975 gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
976 } else {
977 rgd = gfs2_blk2rgrpd(sdp, bn, false);
978 ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
979 0, rd_gh);
980 if (ret)
981 goto out;
982
983 /* Must be done with the rgrp glock held: */
984 if (gfs2_rs_active(&ip->i_res) &&
985 rgd == ip->i_res.rs_rbm.rgd)
986 gfs2_rs_deltree(&ip->i_res);
987 }
988
989 if (!rgrp_contains_block(rgd, bn)) {
990 blks_outside_rgrp++;
991 continue;
992 }
993
994 /* The size of our transactions will be unknown until we
995 actually process all the metadata blocks that relate to
996 the rgrp. So we estimate. We know it can't be more than
997 the dinode's i_blocks and we don't want to exceed the
998 journal flush threshold, sd_log_thresh2. */
999 if (current->journal_info == NULL) {
1000 unsigned int jblocks_rqsted, revokes;
1001
1002 jblocks_rqsted = rgd->rd_length + RES_DINODE +
1003 RES_INDIRECT;
1004 isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1005 if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1006 jblocks_rqsted +=
1007 atomic_read(&sdp->sd_log_thresh2);
1008 else
1009 jblocks_rqsted += isize_blks;
1010 revokes = jblocks_rqsted;
1011 if (meta)
1012 revokes += hptrs(sdp, hgt);
1013 else if (ip->i_depth)
1014 revokes += sdp->sd_inptrs;
1015 ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1016 if (ret)
1017 goto out_unlock;
1018 down_write(&ip->i_rw_mutex);
1019 }
1020 /* check if we will exceed the transaction blocks requested */
1021 tr = current->journal_info;
1022 if (tr->tr_num_buf_new + RES_STATFS +
1023 RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1024 /* We set blks_outside_rgrp to ensure the loop will
1025 be repeated for the same rgrp, but with a new
1026 transaction. */
1027 blks_outside_rgrp++;
1028 /* This next part is tricky. If the buffer was added
1029 to the transaction, we've already set some block
1030 pointers to 0, so we better follow through and free
1031 them, or we will introduce corruption (so break).
1032 This may be impossible, or at least rare, but I
1033 decided to cover the case regardless.
1034
1035 If the buffer was not added to the transaction
1036 (this call), doing so would exceed our transaction
1037 size, so we need to end the transaction and start a
1038 new one (so goto). */
1039
1040 if (buf_in_tr)
1041 break;
1042 goto out_unlock;
1043 }
1044
1045 gfs2_trans_add_meta(ip->i_gl, bh);
1046 buf_in_tr = true;
1047 *p = 0;
1048 if (bstart + blen == bn) {
1049 blen++;
1050 continue;
1051 }
1052 if (bstart) {
1053 __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1054 (*btotal) += blen;
1055 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1056 }
1057 bstart = bn;
1058 blen = 1;
1059 }
1060 if (bstart) {
1061 __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
1062 (*btotal) += blen;
1063 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1064 }
1065out_unlock:
1066 if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1067 outside the rgrp we just processed,
1068 do it all over again. */
1069 if (current->journal_info) {
1070 struct buffer_head *dibh = mp->mp_bh[0];
1071
1072 /* Every transaction boundary, we rewrite the dinode
1073 to keep its di_blocks current in case of failure. */
1074 ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1075 CURRENT_TIME;
1076 gfs2_trans_add_meta(ip->i_gl, dibh);
1077 gfs2_dinode_out(ip, dibh->b_data);
1078 up_write(&ip->i_rw_mutex);
1079 gfs2_trans_end(sdp);
1080 }
1081 gfs2_glock_dq_uninit(rd_gh);
1082 cond_resched();
1083 goto more_rgrps;
1084 }
1085out:
1086 return ret;
1087}
1088
1089/**
1090 * find_nonnull_ptr - find a non-null pointer given a metapath and height
1091 * assumes the metapath is valid (with buffers) out to height h
1092 * @mp: starting metapath
1093 * @h: desired height to search
1094 *
1095 * Returns: true if a non-null pointer was found in the metapath buffer
1096 * false if all remaining pointers are NULL in the buffer
1097 */
1098static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1099 unsigned int h)
1100{
1101 __be64 *ptr;
1102 unsigned int ptrs = hptrs(sdp, h) - 1;
1103
1104 while (true) {
1105 ptr = metapointer(h, mp);
1106 if (*ptr) /* if we have a non-null pointer */
1107 return true;
1108
1109 if (mp->mp_list[h] < ptrs)
1110 mp->mp_list[h]++;
1111 else
1112 return false; /* no more pointers in this buffer */
1113 }
1114}
1115
1116enum dealloc_states {
1117 DEALLOC_MP_FULL = 0, /* Strip a metapath with all buffers read in */
1118 DEALLOC_MP_LOWER = 1, /* lower the metapath strip height */
1119 DEALLOC_FILL_MP = 2, /* Fill in the metapath to the given height. */
1120 DEALLOC_DONE = 3, /* process complete */
1121};
1116 1122
1117 if (!size) 1123/**
1124 * trunc_dealloc - truncate a file down to a desired size
1125 * @ip: inode to truncate
1126 * @newsize: The desired size of the file
1127 *
1128 * This function truncates a file to newsize. It works from the
1129 * bottom up, and from the right to the left. In other words, it strips off
1130 * the highest layer (data) before stripping any of the metadata. Doing it
1131 * this way is best in case the operation is interrupted by power failure, etc.
1132 * The dinode is rewritten in every transaction to guarantee integrity.
1133 */
1134static int trunc_dealloc(struct gfs2_inode *ip, u64 newsize)
1135{
1136 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1137 struct metapath mp;
1138 struct buffer_head *dibh, *bh;
1139 struct gfs2_holder rd_gh;
1140 u64 lblock;
1141 __u16 nbof[GFS2_MAX_META_HEIGHT]; /* new beginning of truncation */
1142 unsigned int strip_h = ip->i_height - 1;
1143 u32 btotal = 0;
1144 int ret, state;
1145 int mp_h; /* metapath buffers are read in to this height */
1146 sector_t last_ra = 0;
1147 u64 prev_bnr = 0;
1148 bool preserve1; /* need to preserve the first meta pointer? */
1149
1150 if (!newsize)
1118 lblock = 0; 1151 lblock = 0;
1119 else 1152 else
1120 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift; 1153 lblock = (newsize - 1) >> sdp->sd_sb.sb_bsize_shift;
1121 1154
1155 memset(&mp, 0, sizeof(mp));
1122 find_metapath(sdp, lblock, &mp, ip->i_height); 1156 find_metapath(sdp, lblock, &mp, ip->i_height);
1123 error = gfs2_rindex_update(sdp);
1124 if (error)
1125 return error;
1126 1157
1127 error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); 1158 memcpy(&nbof, &mp.mp_list, sizeof(nbof));
1128 if (error) 1159
1129 return error; 1160 ret = gfs2_meta_inode_buffer(ip, &dibh);
1161 if (ret)
1162 return ret;
1130 1163
1131 while (height--) { 1164 mp.mp_bh[0] = dibh;
1132 struct strip_mine sm; 1165 ret = lookup_metapath(ip, &mp);
1133 sm.sm_first = !!size; 1166 if (ret == ip->i_height)
1134 sm.sm_height = height; 1167 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1168 else
1169 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1135 1170
1136 error = recursive_scan(ip, NULL, &mp, 0, 0, 1, &sm); 1171 ret = gfs2_rindex_update(sdp);
1137 if (error) 1172 if (ret)
1173 goto out_metapath;
1174
1175 ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1176 if (ret)
1177 goto out_metapath;
1178 gfs2_holder_mark_uninitialized(&rd_gh);
1179
1180 mp_h = strip_h;
1181
1182 while (state != DEALLOC_DONE) {
1183 switch (state) {
1184 /* Truncate a full metapath at the given strip height.
1185 * Note that strip_h == mp_h in order to be in this state. */
1186 case DEALLOC_MP_FULL:
1187 if (mp_h > 0) { /* issue read-ahead on metadata */
1188 __be64 *top;
1189
1190 bh = mp.mp_bh[mp_h - 1];
1191 if (bh->b_blocknr != last_ra) {
1192 last_ra = bh->b_blocknr;
1193 top = metaptr1(mp_h - 1, &mp);
1194 gfs2_metapath_ra(ip->i_gl, bh, top);
1195 }
1196 }
1197 /* If we're truncating to a non-zero size and the mp is
1198 at the beginning of file for the strip height, we
1199 need to preserve the first metadata pointer. */
1200 preserve1 = (newsize &&
1201 (mp.mp_list[mp_h] == nbof[mp_h]));
1202 bh = mp.mp_bh[mp_h];
1203 gfs2_assert_withdraw(sdp, bh);
1204 if (gfs2_assert_withdraw(sdp,
1205 prev_bnr != bh->b_blocknr)) {
1206 printk(KERN_EMERG "GFS2: fsid=%s:inode %llu, "
1207 "block:%llu, i_h:%u, s_h:%u, mp_h:%u\n",
1208 sdp->sd_fsname,
1209 (unsigned long long)ip->i_no_addr,
1210 prev_bnr, ip->i_height, strip_h, mp_h);
1211 }
1212 prev_bnr = bh->b_blocknr;
1213 ret = sweep_bh_for_rgrps(ip, &rd_gh, &mp, &btotal,
1214 mp_h, preserve1);
1215 /* If we hit an error or just swept dinode buffer,
1216 just exit. */
1217 if (ret || !mp_h) {
1218 state = DEALLOC_DONE;
1219 break;
1220 }
1221 state = DEALLOC_MP_LOWER;
1222 break;
1223
1224 /* lower the metapath strip height */
1225 case DEALLOC_MP_LOWER:
1226 /* We're done with the current buffer, so release it,
1227 unless it's the dinode buffer. Then back up to the
1228 previous pointer. */
1229 if (mp_h) {
1230 brelse(mp.mp_bh[mp_h]);
1231 mp.mp_bh[mp_h] = NULL;
1232 }
1233 /* If we can't get any lower in height, we've stripped
1234 off all we can. Next step is to back up and start
1235 stripping the previous level of metadata. */
1236 if (mp_h == 0) {
1237 strip_h--;
1238 memcpy(&mp.mp_list, &nbof, sizeof(nbof));
1239 mp_h = strip_h;
1240 state = DEALLOC_FILL_MP;
1241 break;
1242 }
1243 mp.mp_list[mp_h] = 0;
1244 mp_h--; /* search one metadata height down */
1245 if (mp.mp_list[mp_h] >= hptrs(sdp, mp_h) - 1)
1246 break; /* loop around in the same state */
1247 mp.mp_list[mp_h]++;
1248 /* Here we've found a part of the metapath that is not
1249 * allocated. We need to search at that height for the
1250 * next non-null pointer. */
1251 if (find_nonnull_ptr(sdp, &mp, mp_h)) {
1252 state = DEALLOC_FILL_MP;
1253 mp_h++;
1254 }
1255 /* No more non-null pointers at this height. Back up
1256 to the previous height and try again. */
1257 break; /* loop around in the same state */
1258
1259 /* Fill the metapath with buffers to the given height. */
1260 case DEALLOC_FILL_MP:
1261 /* Fill the buffers out to the current height. */
1262 ret = fillup_metapath(ip, &mp, mp_h);
1263 if (ret < 0)
1264 goto out;
1265
1266 /* If buffers found for the entire strip height */
1267 if ((ret == ip->i_height) && (mp_h == strip_h)) {
1268 state = DEALLOC_MP_FULL;
1269 break;
1270 }
1271 if (ret < ip->i_height) /* We have a partial height */
1272 mp_h = ret - 1;
1273
1274 /* If we find a non-null block pointer, crawl a bit
1275 higher up in the metapath and try again, otherwise
1276 we need to look lower for a new starting point. */
1277 if (find_nonnull_ptr(sdp, &mp, mp_h))
1278 mp_h++;
1279 else
1280 state = DEALLOC_MP_LOWER;
1138 break; 1281 break;
1282 }
1139 } 1283 }
1140 1284
1141 gfs2_quota_unhold(ip); 1285 if (btotal) {
1286 if (current->journal_info == NULL) {
1287 ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1288 RES_QUOTA, 0);
1289 if (ret)
1290 goto out;
1291 down_write(&ip->i_rw_mutex);
1292 }
1293 gfs2_statfs_change(sdp, 0, +btotal, 0);
1294 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1295 ip->i_inode.i_gid);
1296 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
1297 gfs2_trans_add_meta(ip->i_gl, dibh);
1298 gfs2_dinode_out(ip, dibh->b_data);
1299 up_write(&ip->i_rw_mutex);
1300 gfs2_trans_end(sdp);
1301 }
1142 1302
1143 return error; 1303out:
1304 if (gfs2_holder_initialized(&rd_gh))
1305 gfs2_glock_dq_uninit(&rd_gh);
1306 if (current->journal_info) {
1307 up_write(&ip->i_rw_mutex);
1308 gfs2_trans_end(sdp);
1309 cond_resched();
1310 }
1311 gfs2_quota_unhold(ip);
1312out_metapath:
1313 release_metapath(&mp);
1314 return ret;
1144} 1315}
1145 1316
1146static int trunc_end(struct gfs2_inode *ip) 1317static int trunc_end(struct gfs2_inode *ip)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 86ccc0159393..83c9909ff14a 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -483,13 +483,6 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
483 } 483 }
484} 484}
485 485
486static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
487{
488 u64 first = rgd->rd_data0;
489 u64 last = first + rgd->rd_data;
490 return first <= block && block < last;
491}
492
493/** 486/**
494 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number 487 * gfs2_blk2rgrpd - Find resource group for a given data/meta block number
495 * @sdp: The GFS2 superblock 488 * @sdp: The GFS2 superblock
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 66b51cf66dfa..e90478e2f545 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -83,5 +83,12 @@ static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
83 return rs && !RB_EMPTY_NODE(&rs->rs_node); 83 return rs && !RB_EMPTY_NODE(&rs->rs_node);
84} 84}
85 85
86static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block)
87{
88 u64 first = rgd->rd_data0;
89 u64 last = first + rgd->rd_data;
90 return first <= block && block < last;
91}
92
86extern void check_and_update_goal(struct gfs2_inode *ip); 93extern void check_and_update_goal(struct gfs2_inode *ip);
87#endif /* __RGRP_DOT_H__ */ 94#endif /* __RGRP_DOT_H__ */