[GFS2] Make journaled data files identical to normal files on disk

This is a very large patch with a few issues still to be resolved, so you might want to check out the previous head of the tree, since this one is known to be unstable. Fixes for the various bugs will be forthcoming shortly.

This patch removes the special data format which has been used up until now for journaled data files. Directories still retain the old format so that they remain on-disk compatible with earlier releases. As a result, you can now do the following with journaled data files:

1) mmap them
2) export them over NFS
3) convert to/from normal files whenever you want to (the zero-length restriction is gone)

In addition, the level at which GFS' locking is done has changed for all files (since they all now use the page cache), such that the locking is done at the page cache level rather than at the level of the fs operations. This should mean that things like loopback mounts, and other things which touch the page cache directly, should now work.

Current known issues:

1. There is a lock mode inversion problem related to the resource group hold function which needs to be resolved.
2. Any significant amount of I/O causes an oops with an offset of hex 320 (NULL pointer dereference), which appears to be related to a journaled data buffer appearing on a list where it shouldn't be.
3. Direct I/O writes are disabled for the time being (they will reappear later).
4. There is probably a deadlock between the page lock and GFS' locks under certain combinations of mmap and fs operation I/O.
5. An issue relating to ref counting on internally used inodes causes a hang on umount (discovered before this patch, and not fixed by it).
6. One part of the directory metadata is different from GFS1 and will need to be resolved before the next release.

Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
author: Steven Whitehouse <swhiteho@redhat.com> 2006-02-08 06:50:51 -0500
committer: Steven Whitehouse <swhiteho@redhat.com> 2006-02-08 06:50:51 -0500
commit: 18ec7d5c3f434aed9661ed10a9e1f48cdeb4981d (patch)
tree: a7161a4c4b3592052e6772e1c23849de16cac649 /fs/gfs2/lops.c
parent: 257f9b4e97e9a6cceeb247cead92119a4396d37b (diff)
1 files changed, 257 insertions, 23 deletions
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a065f7667238..dd41863810d7 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -428,49 +428,188 @@ static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
        gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
 }
+/**
+ * databuf_lo_add - Add a databuf to the transaction.
+ * @sdp: the filesystem whose log counters and databuf list are updated
+ * @le: the log element embedded (as bd_le) in the gfs2_bufdata being added
+ *
+ * This is used in two distinct cases:
+ * i) In ordered write mode
+ *    We put the data buffer on a list so that we can ensure that it is
+ *    synced to disk at the right time
+ * ii) In journaled data mode
+ *    We need to journal the data block in the same way as metadata in
+ *    the functions above. The difference is that here we have a tag
+ *    which is two __be64's being the block number (as per meta data)
+ *    and a flag which says whether the data block needs escaping or
+ *    not. This means we need a new log entry for each 251 or so data
+ *    blocks, which isn't an enormous overhead but twice as much as
+ *    for normal metadata blocks.
+ *
+ * The owning inode is found through the mapping of the buffer's page,
+ * and its GFS2_DIF_JDATA flag selects between the two cases. Under the
+ * log lock every buffer is counted in sd_log_num_databuf and added to
+ * sd_log_le_databuf; buffers of GFS2_DIF_JDATA inodes are additionally
+ * counted in sd_log_num_jdata.
+ *
+ * NOTE(review): the pinning branch is only taken when bd_list_tr is
+ * already non-empty, and it then list_add()s that same node onto
+ * tr_list_buf. The list_empty() test looks inverted - confirm how
+ * bd_list_tr is initialised before relying on this path.
+ */
 static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
 {
-        get_transaction->tr_touched = 1;
+        struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
+        struct gfs2_trans *tr = get_transaction;
+        struct address_space *mapping = bd->bd_bh->b_page->mapping;
+        struct gfs2_inode *ip = get_v2ip(mapping->host);
+        tr->tr_touched = 1;
+        if (!list_empty(&bd->bd_list_tr) &&
+            (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
+                tr->tr_num_buf++;
+                gfs2_trans_add_gl(bd->bd_gl);
+                list_add(&bd->bd_list_tr, &tr->tr_list_buf);
+                gfs2_pin(sdp, bd->bd_bh);
+        } else {
+                clear_buffer_pinned(bd->bd_bh);
+        }
        gfs2_log_lock(sdp);
+        if (ip->i_di.di_flags & GFS2_DIF_JDATA)
+                sdp->sd_log_num_jdata++;
        sdp->sd_log_num_databuf++;
        list_add(&le->le_list, &sdp->sd_log_le_databuf);
        gfs2_log_unlock(sdp);
 }
+/**
+ * gfs2_check_magic - Test whether a data block starts with the GFS2 magic
+ * @bh: the buffer head of the data block to inspect
+ *
+ * Temporarily maps the buffer's page and compares the first 32-bit
+ * big-endian word at the buffer's offset within that page with GFS2_MAGIC.
+ *
+ * Returns: 1 if the block begins with GFS2_MAGIC, 0 otherwise
+ */
+static int gfs2_check_magic(struct buffer_head *bh)
+{
+        struct page *page = bh->b_page;
+        void *kaddr;
+        __be32 *ptr;
+        int rv = 0;
+        kaddr = kmap_atomic(page, KM_USER0);
+        ptr = kaddr + bh_offset(bh);
+        if (*ptr == cpu_to_be32(GFS2_MAGIC))
+                rv = 1;
+        /* Unmap by the address kmap_atomic() returned, not by the page */
+        kunmap_atomic(kaddr, KM_USER0);
+        return rv;
+}
+/**
+ * databuf_lo_before_commit - Scan the data buffers, writing as we go
+ *
+ * Here we scan through the lists of buffers and make the assumption
+ * that any buffer that's been pinned is being journaled, and that
+ * any unpinned buffer is an ordered write data buffer and therefore
+ * will be written back rather than journaled.
+ */
 static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
 {
-        struct list_head *head = &sdp->sd_log_le_databuf;
        LIST_HEAD(started);
-        struct gfs2_bufdata *bd;
+        struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
-        struct buffer_head *bh;
+        struct buffer_head *bh = NULL;
+        unsigned int offset = sizeof(struct gfs2_log_descriptor);
+        struct gfs2_log_descriptor *ld;
+        unsigned int limit;
+        unsigned int total_dbuf = sdp->sd_log_num_databuf;
+        unsigned int total_jdata = sdp->sd_log_num_jdata;
+        unsigned int num, n;
+        __be64 *ptr;
-        while (!list_empty(head)) {
+        offset += (2*sizeof(__be64) - 1);
-                bd = list_entry(head->prev, struct gfs2_bufdata, bd_le.le_list);
+        offset &= ~(2*sizeof(__be64) - 1);
-                list_move(&bd->bd_le.le_list, &started);
+        limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
-                gfs2_log_lock(sdp);
+        /* printk(KERN_INFO "totals: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
-                bh = bd->bd_bh;
+        /*
+         * Start writing ordered buffers, write journaled buffers
+         * into the log along with a header
+         */
+        bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, bd_le.le_list);
+        while(total_dbuf) {
+                num = total_jdata;
+                if (num > limit)
+                        num = limit;
+                n = 0;
+                list_for_each_entry_safe_continue(bd1, bdt, &sdp->sd_log_le_databuf, bd_le.le_list) {
+                        gfs2_log_lock(sdp);
+                        /* An ordered write buffer */
+                        if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
+                                list_move(&bd1->bd_le.le_list, &started);
+                                if (bd1 == bd2) {
+                                        bd2 = NULL;
+                                        bd2 = list_prepare_entry(bd2, &sdp->sd_log_le_databuf, bd_le.le_list);
+                                }
+                                total_dbuf--;
+                                if (bd1->bd_bh) {
+                                        get_bh(bd1->bd_bh);
+                                        gfs2_log_unlock(sdp);
+                                        if (buffer_dirty(bd1->bd_bh)) {
+                                                wait_on_buffer(bd1->bd_bh);
+                                                ll_rw_block(WRITE, 1, &bd1->bd_bh);
+                                        }
+                                        brelse(bd1->bd_bh);
+                                        continue;
+                                }
+                                gfs2_log_unlock(sdp);
+                                continue;
+                        } else if (bd1->bd_bh) { /* A journaled buffer */
+                                int magic;
+                                gfs2_log_unlock(sdp);
+                                /* printk(KERN_INFO "journaled buffer\n"); */
+                                if (!bh) {
+                                        bh = gfs2_log_get_buf(sdp);
+                                        ld = (struct gfs2_log_descriptor *)bh->b_data;
+                                        ptr = (__be64 *)(bh->b_data + offset);
+                                        ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
+                                        ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
+                                        ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
+                                        ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
+                                        ld->ld_length = cpu_to_be32(num + 1);
+                                        ld->ld_data1 = cpu_to_be32(num);
+                                        ld->ld_data2 = cpu_to_be32(0);
+                                        memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
+                                }
+                                magic = gfs2_check_magic(bd1->bd_bh);
+                                *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
+                                *ptr++ = cpu_to_be64((__u64)magic);
+                                clear_buffer_escaped(bd1->bd_bh);
+                                if (unlikely(magic != 0))
+                                        set_buffer_escaped(bd1->bd_bh);
+                                if (n++ > num)
+                                        break;
+                        }
+                }
                if (bh) {
-                        get_bh(bh);
+                        set_buffer_dirty(bh);
-                        gfs2_log_unlock(sdp);
+                        ll_rw_block(WRITE, 1, &bh);
-                        if (buffer_dirty(bh)) {
+                        bh = NULL;
-                                wait_on_buffer(bh);
+                }
-                                ll_rw_block(WRITE, 1, &bh);
+                n = 0;
+                /* printk(KERN_INFO "totals2: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
+                list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, bd_le.le_list) {
+                        if (!bd2->bd_bh)
+                                continue;
+                        /* copy buffer if it needs escaping */
+                        if (unlikely(buffer_escaped(bd2->bd_bh))) {
+                                void *kaddr;
+                                struct page *page = bd2->bd_bh->b_page;
+                                bh = gfs2_log_get_buf(sdp);
+                                kaddr = kmap_atomic(page, KM_USER0);
+                                memcpy(bh->b_data, kaddr + bh_offset(bd2->bd_bh), sdp->sd_sb.sb_bsize);
+                                kunmap_atomic(page, KM_USER0);
+                                *(__be32 *)bh->b_data = 0;
+                        } else {
+                                bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
                        }
-                        brelse(bh);
+                        set_buffer_dirty(bh);
-                } else
+                        ll_rw_block(WRITE, 1, &bh);
-                        gfs2_log_unlock(sdp);
+                        if (++n >= num)
+                                break;
+                }
+                bh = NULL;
+                total_dbuf -= num;
+                total_jdata -= num;
        }
+        /* printk(KERN_INFO "wait on ordered data buffers\n"); */
+        /* Wait on all ordered buffers */
        while (!list_empty(&started)) {
-                bd = list_entry(started.next, struct gfs2_bufdata,
+                bd1 = list_entry(started.next, struct gfs2_bufdata, bd_le.le_list);
-                                bd_le.le_list);
+                list_del(&bd1->bd_le.le_list);
-                list_del(&bd->bd_le.le_list);
                sdp->sd_log_num_databuf--;
                gfs2_log_lock(sdp);
-                bh = bd->bd_bh;
+                bh = bd1->bd_bh;
                if (bh) {
                        set_v2bd(bh, NULL);
                        gfs2_log_unlock(sdp);
@@ -479,12 +618,103 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
                } else
                        gfs2_log_unlock(sdp);
-                kfree(bd);
+                kfree(bd1);
        }
+        /* printk(KERN_INFO "sd_log_num_databuf %u sd_log_num_jdata %u\n", sdp->sd_log_num_databuf, sdp->sd_log_num_jdata); */
+        /* We've removed all the ordered write bufs here, so only jdata left */
+        gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
+}
+/**
+ * databuf_lo_scan_elements - Replay the data blocks named by a log descriptor
+ * @jd: the journal being replayed
+ * @start: log position; it is advanced by one block before the first data
+ *         block is read and once more after every entry
+ * @ld: the log descriptor; ld_data1 gives the number of entries
+ * @ptr: the entries, each a pair of __be64 values (block number, escape flag)
+ * @pass: the replay pass; only pass 1 is acted upon
+ *
+ * Does nothing unless @pass is 1 and the descriptor type is
+ * GFS2_LOG_DESC_JDATA. Each entry is counted in sd_found_blocks. Entries
+ * for which gfs2_revoke_check() returns non-zero are skipped; for the rest
+ * the log block is copied over the in-place block, the leading GFS2_MAGIC
+ * is restored if the escape flag is set, the buffer is marked dirty and
+ * sd_replayed_blocks is incremented.
+ *
+ * Returns: 0 on success, or the error from gfs2_replay_read_block()
+ */
+static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
+                                    struct gfs2_log_descriptor *ld,
+                                    __be64 *ptr, int pass)
+{
+        struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
+        struct gfs2_glock *gl = jd->jd_inode->i_gl;
+        unsigned int blks = be32_to_cpu(ld->ld_data1);
+        struct buffer_head *bh_log, *bh_ip;
+        uint64_t blkno;
+        uint64_t esc;
+        int error = 0;
+        if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
+                return 0;
+        gfs2_replay_incr_blk(sdp, &start);
+        for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
+                blkno = be64_to_cpu(*ptr++);
+                esc = be64_to_cpu(*ptr++);
+                sdp->sd_found_blocks++;
+                if (gfs2_revoke_check(sdp, blkno, start))
+                        continue;
+                error = gfs2_replay_read_block(jd, start, &bh_log);
+                if (error)
+                        return error;
+                bh_ip = gfs2_meta_new(gl, blkno);
+                memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
+                /* Unescape */
+                if (esc) {
+                        __be32 *eptr = (__be32 *)bh_ip->b_data;
+                        *eptr = cpu_to_be32(GFS2_MAGIC);
+                }
+                mark_buffer_dirty(bh_ip);
+                brelse(bh_log);
+                brelse(bh_ip);
+                /* error is known to be 0 here, so no further check is needed */
+                sdp->sd_replayed_blocks++;
+        }
+        return error;
+}
+/* FIXME: sort out accounting for log blocks etc. */
+/*
+ * databuf_lo_after_scan - finish a replay pass for journaled data
+ * @jd: the journal that was scanned
+ * @error: non-zero if the scan failed
+ * @pass: the replay pass that has just completed
+ *
+ * A failed scan, or a successful pass 1, ends with gfs2_meta_sync() on the
+ * journal inode's glock; only the successful pass 1 also reports the
+ * replayed/found block counts. A successful pass other than 1 does nothing.
+ */
+static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
+{
+        struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
+        if (!error && pass != 1)
+                return;
+        /* data sync? */
+        gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START | DIO_WAIT);
+        if (!error) {
+                fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
+                        jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
+        }
+}
+/*
+ * databuf_lo_after_commit - release the databuf entries left after a commit
+ * @sdp: the filesystem whose sd_log_le_databuf list is drained
+ * @ai: passed through to gfs2_unpin() for each buffer
+ *
+ * Every entry still on sd_log_le_databuf is unlinked, both databuf
+ * counters are decremented, its buffer is unpinned and released, and the
+ * gfs2_bufdata is freed. Both counters are expected to be zero afterwards.
+ */
+static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
+{
+        struct list_head *queue = &sdp->sd_log_le_databuf;
+        struct gfs2_bufdata *entry;
+        for (;;) {
+                if (list_empty(queue))
+                        break;
+                entry = list_entry(queue->next, struct gfs2_bufdata, bd_le.le_list);
+                list_del_init(&entry->bd_le.le_list);
+                sdp->sd_log_num_jdata--;
+                sdp->sd_log_num_databuf--;
+                gfs2_unpin(sdp, entry->bd_bh, ai);
+                brelse(entry->bd_bh);
+                kfree(entry);
+        }
        gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
+        gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
 }
 struct gfs2_log_operations gfs2_glock_lops = {
        .lo_add = glock_lo_add,
        .lo_after_commit = glock_lo_after_commit,
@@ -519,7 +749,11 @@ struct gfs2_log_operations gfs2_rg_lops = {
 struct gfs2_log_operations gfs2_databuf_lops = {
        .lo_add = databuf_lo_add,
+        .lo_incore_commit = buf_lo_incore_commit, /* buf_lo_incore_commit is defined outside this hunk */
        .lo_before_commit = databuf_lo_before_commit,
+        .lo_after_commit = databuf_lo_after_commit, /* unpins, releases and frees the entries left on sd_log_le_databuf */
+        .lo_scan_elements = databuf_lo_scan_elements, /* acts only on GFS2_LOG_DESC_JDATA descriptors during pass 1 */
+        .lo_after_scan = databuf_lo_after_scan, /* calls gfs2_meta_sync() on the journal inode's glock */
        .lo_name = "databuf"
 };
author	Steven Whitehouse <swhiteho@redhat.com>	2006-02-08 06:50:51 -0500
committer	Steven Whitehouse <swhiteho@redhat.com>	2006-02-08 06:50:51 -0500
commit	18ec7d5c3f434aed9661ed10a9e1f48cdeb4981d (patch)
tree	a7161a4c4b3592052e6772e1c23849de16cac649 /fs/gfs2/lops.c
parent	257f9b4e97e9a6cceeb247cead92119a4396d37b (diff)

diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index a065f7667238..dd41863810d7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c
@@ -428,49 +428,188 @@ static void rg_lo_after_commit(struct gfs2_sbd sdp, struct gfs2_ail ai)
428	gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);	428	gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
429	}	429	}
430		430
		431	/**
		432	* databuf_lo_add - Add a databuf to the transaction.
		433	*
		434	* This is used in two distinct cases:
		435	* i) In ordered write mode
		436	* We put the data buffer on a list so that we can ensure that its
		437	* synced to disk at the right time
		438	* ii) In journaled data mode
		439	* We need to journal the data block in the same way as metadata in
		440	* the functions above. The difference is that here we have a tag
		441	* which is two __be64's being the block number (as per meta data)
		442	* and a flag which says whether the data block needs escaping or
		443	* not. This means we need a new log entry for each 251 or so data
		444	* blocks, which isn't an enormous overhead but twice as much as
		445	* for normal metadata blocks.
		446	*/
431	static void databuf_lo_add(struct gfs2_sbd sdp, struct gfs2_log_element le)	447	static void databuf_lo_add(struct gfs2_sbd sdp, struct gfs2_log_element le)
432	{	448	{
433	get_transaction->tr_touched = 1;	449	struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
		450	struct gfs2_trans *tr = get_transaction;
		451	struct address_space *mapping = bd->bd_bh->b_page->mapping;
		452	struct gfs2_inode *ip = get_v2ip(mapping->host);
434		453
		454	tr->tr_touched = 1;
		455	if (!list_empty(&bd->bd_list_tr) &&
		456	(ip->i_di.di_flags & GFS2_DIF_JDATA)) {
		457	tr->tr_num_buf++;
		458	gfs2_trans_add_gl(bd->bd_gl);
		459	list_add(&bd->bd_list_tr, &tr->tr_list_buf);
		460	gfs2_pin(sdp, bd->bd_bh);
		461	} else {
		462	clear_buffer_pinned(bd->bd_bh);
		463	}
435	gfs2_log_lock(sdp);	464	gfs2_log_lock(sdp);
		465	if (ip->i_di.di_flags & GFS2_DIF_JDATA)
		466	sdp->sd_log_num_jdata++;
436	sdp->sd_log_num_databuf++;	467	sdp->sd_log_num_databuf++;
437	list_add(&le->le_list, &sdp->sd_log_le_databuf);	468	list_add(&le->le_list, &sdp->sd_log_le_databuf);
438	gfs2_log_unlock(sdp);	469	gfs2_log_unlock(sdp);
439	}	470	}
440		471
		472	static int gfs2_check_magic(struct buffer_head *bh)
		473	{
		474	struct page *page = bh->b_page;
		475	void *kaddr;
		476	__be32 *ptr;
		477	int rv = 0;
		478
		479	kaddr = kmap_atomic(page, KM_USER0);
		480	ptr = kaddr + bh_offset(bh);
		481	if (*ptr == cpu_to_be32(GFS2_MAGIC))
		482	rv = 1;
		483	kunmap_atomic(page, KM_USER0);
		484
		485	return rv;
		486	}
		487
		488	/**
		489	* databuf_lo_before_commit - Scan the data buffers, writing as we go
		490	*
		491	* Here we scan through the lists of buffers and make the assumption
		492	* that any buffer thats been pinned is being journaled, and that
		493	* any unpinned buffer is an ordered write data buffer and therefore
		494	* will be written back rather than journaled.
		495	*/
441	static void databuf_lo_before_commit(struct gfs2_sbd *sdp)	496	static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
442	{	497	{
443	struct list_head *head = &sdp->sd_log_le_databuf;
444	LIST_HEAD(started);	498	LIST_HEAD(started);
445	struct gfs2_bufdata *bd;	499	struct gfs2_bufdata bd1 = NULL, bd2, *bdt;
446	struct buffer_head *bh;	500	struct buffer_head *bh = NULL;
		501	unsigned int offset = sizeof(struct gfs2_log_descriptor);
		502	struct gfs2_log_descriptor *ld;
		503	unsigned int limit;
		504	unsigned int total_dbuf = sdp->sd_log_num_databuf;
		505	unsigned int total_jdata = sdp->sd_log_num_jdata;
		506	unsigned int num, n;
		507	__be64 *ptr;
447		508
448	while (!list_empty(head)) {	509	offset += (2*sizeof(__be64) - 1);
449	bd = list_entry(head->prev, struct gfs2_bufdata, bd_le.le_list);	510	offset &= ~(2*sizeof(__be64) - 1);
450	list_move(&bd->bd_le.le_list, &started);	511	limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
451		512
452	gfs2_log_lock(sdp);	513	/* printk(KERN_INFO "totals: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
453	bh = bd->bd_bh;	514	/*
		515	* Start writing ordered buffers, write journaled buffers
		516	* into the log along with a header
		517	*/
		518	bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, bd_le.le_list);
		519	while(total_dbuf) {
		520	num = total_jdata;
		521	if (num > limit)
		522	num = limit;
		523	n = 0;
		524	list_for_each_entry_safe_continue(bd1, bdt, &sdp->sd_log_le_databuf, bd_le.le_list) {
		525	gfs2_log_lock(sdp);
		526	/* An ordered write buffer */
		527	if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
		528	list_move(&bd1->bd_le.le_list, &started);
		529	if (bd1 == bd2) {
		530	bd2 = NULL;
		531	bd2 = list_prepare_entry(bd2, &sdp->sd_log_le_databuf, bd_le.le_list);
		532	}
		533	total_dbuf--;
		534	if (bd1->bd_bh) {
		535	get_bh(bd1->bd_bh);
		536	gfs2_log_unlock(sdp);
		537	if (buffer_dirty(bd1->bd_bh)) {
		538	wait_on_buffer(bd1->bd_bh);
		539	ll_rw_block(WRITE, 1, &bd1->bd_bh);
		540	}
		541	brelse(bd1->bd_bh);
		542	continue;
		543	}
		544	gfs2_log_unlock(sdp);
		545	continue;
		546	} else if (bd1->bd_bh) { /* A journaled buffer */
		547	int magic;
		548	gfs2_log_unlock(sdp);
		549	/* printk(KERN_INFO "journaled buffer\n"); */
		550	if (!bh) {
		551	bh = gfs2_log_get_buf(sdp);
		552	ld = (struct gfs2_log_descriptor *)bh->b_data;
		553	ptr = (__be64 *)(bh->b_data + offset);
		554	ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
		555	ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
		556	ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
		557	ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
		558	ld->ld_length = cpu_to_be32(num + 1);
		559	ld->ld_data1 = cpu_to_be32(num);
		560	ld->ld_data2 = cpu_to_be32(0);
		561	memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
		562	}
		563	magic = gfs2_check_magic(bd1->bd_bh);
		564	*ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
		565	*ptr++ = cpu_to_be64((__u64)magic);
		566	clear_buffer_escaped(bd1->bd_bh);
		567	if (unlikely(magic != 0))
		568	set_buffer_escaped(bd1->bd_bh);
		569	if (n++ > num)
		570	break;
		571	}
		572	}
454	if (bh) {	573	if (bh) {
455	get_bh(bh);	574	set_buffer_dirty(bh);
456	gfs2_log_unlock(sdp);	575	ll_rw_block(WRITE, 1, &bh);
457	if (buffer_dirty(bh)) {	576	bh = NULL;
458	wait_on_buffer(bh);	577	}
459	ll_rw_block(WRITE, 1, &bh);	578	n = 0;
		579	/* printk(KERN_INFO "totals2: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
		580	list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, bd_le.le_list) {
		581	if (!bd2->bd_bh)
		582	continue;
		583	/* copy buffer if it needs escaping */
		584	if (unlikely(buffer_escaped(bd2->bd_bh))) {
		585	void *kaddr;
		586	struct page *page = bd2->bd_bh->b_page;
		587	bh = gfs2_log_get_buf(sdp);
		588	kaddr = kmap_atomic(page, KM_USER0);
		589	memcpy(bh->b_data, kaddr + bh_offset(bd2->bd_bh), sdp->sd_sb.sb_bsize);
		590	kunmap_atomic(page, KM_USER0);
		591	(__be32 )bh->b_data = 0;
		592	} else {
		593	bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
460	}	594	}
461	brelse(bh);	595	set_buffer_dirty(bh);
462	} else	596	ll_rw_block(WRITE, 1, &bh);
463	gfs2_log_unlock(sdp);	597	if (++n >= num)
		598	break;
		599	}
		600	bh = NULL;
		601	total_dbuf -= num;
		602	total_jdata -= num;
464	}	603	}
465		604	/* printk(KERN_INFO "wait on ordered data buffers\n"); */
		605	/* Wait on all ordered buffers */
466	while (!list_empty(&started)) {	606	while (!list_empty(&started)) {
467	bd = list_entry(started.next, struct gfs2_bufdata,	607	bd1 = list_entry(started.next, struct gfs2_bufdata, bd_le.le_list);
468	bd_le.le_list);	608	list_del(&bd1->bd_le.le_list);
469	list_del(&bd->bd_le.le_list);
470	sdp->sd_log_num_databuf--;	609	sdp->sd_log_num_databuf--;
471		610
472	gfs2_log_lock(sdp);	611	gfs2_log_lock(sdp);
473	bh = bd->bd_bh;	612	bh = bd1->bd_bh;
474	if (bh) {	613	if (bh) {
475	set_v2bd(bh, NULL);	614	set_v2bd(bh, NULL);
476	gfs2_log_unlock(sdp);	615	gfs2_log_unlock(sdp);
@@ -479,12 +618,103 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
479	} else	618	} else
480	gfs2_log_unlock(sdp);	619	gfs2_log_unlock(sdp);
481		620
482	kfree(bd);	621	kfree(bd1);
483	}	622	}
484		623
		624	/* printk(KERN_INFO "sd_log_num_databuf %u sd_log_num_jdata %u\n", sdp->sd_log_num_databuf, sdp->sd_log_num_jdata); */
		625	/* We've removed all the ordered write bufs here, so only jdata left */
		626	gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
		627	}
		628
		629	static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
		630	struct gfs2_log_descriptor *ld,
		631	__be64 *ptr, int pass)
		632	{
		633	struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
		634	struct gfs2_glock *gl = jd->jd_inode->i_gl;
		635	unsigned int blks = be32_to_cpu(ld->ld_data1);
		636	struct buffer_head bh_log, bh_ip;
		637	uint64_t blkno;
		638	uint64_t esc;
		639	int error = 0;
		640
		641	if (pass != 1 \|\| be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
		642	return 0;
		643
		644	gfs2_replay_incr_blk(sdp, &start);
		645	for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
		646	blkno = be64_to_cpu(*ptr++);
		647	esc = be64_to_cpu(*ptr++);
		648
		649	sdp->sd_found_blocks++;
		650
		651	if (gfs2_revoke_check(sdp, blkno, start))
		652	continue;
		653
		654	error = gfs2_replay_read_block(jd, start, &bh_log);
		655	if (error)
		656	return error;
		657
		658	bh_ip = gfs2_meta_new(gl, blkno);
		659	memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
		660
		661	/* Unescape */
		662	if (esc) {
		663	__be32 eptr = (__be32 )bh_ip->b_data;
		664	*eptr = cpu_to_be32(GFS2_MAGIC);
		665	}
		666	mark_buffer_dirty(bh_ip);
		667
		668	brelse(bh_log);
		669	brelse(bh_ip);
		670	if (error)
		671	break;
		672
		673	sdp->sd_replayed_blocks++;
		674	}
		675
		676	return error;
		677	}
		678
		679	/* FIXME: sort out accounting for log blocks etc. */
		680
		681	static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
		682	{
		683	struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
		684
		685	if (error) {
		686	gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START \| DIO_WAIT);
		687	return;
		688	}
		689	if (pass != 1)
		690	return;
		691
		692	/* data sync? */
		693	gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START \| DIO_WAIT);
		694
		695	fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
		696	jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
		697	}
		698
		699	static void databuf_lo_after_commit(struct gfs2_sbd sdp, struct gfs2_ail ai)
		700	{
		701	struct list_head *head = &sdp->sd_log_le_databuf;
		702	struct gfs2_bufdata *bd;
		703
		704	while (!list_empty(head)) {
		705	bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
		706	list_del_init(&bd->bd_le.le_list);
		707	sdp->sd_log_num_databuf--;
		708	sdp->sd_log_num_jdata--;
		709	gfs2_unpin(sdp, bd->bd_bh, ai);
		710	brelse(bd->bd_bh);
		711	kfree(bd);
		712	}
485	gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);	713	gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
		714	gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
486	}	715	}
487		716
		717
488	struct gfs2_log_operations gfs2_glock_lops = {	718	struct gfs2_log_operations gfs2_glock_lops = {
489	.lo_add = glock_lo_add,	719	.lo_add = glock_lo_add,
490	.lo_after_commit = glock_lo_after_commit,	720	.lo_after_commit = glock_lo_after_commit,
@@ -519,7 +749,11 @@ struct gfs2_log_operations gfs2_rg_lops = {
519		749
520	struct gfs2_log_operations gfs2_databuf_lops = {	750	struct gfs2_log_operations gfs2_databuf_lops = {
521	.lo_add = databuf_lo_add,	751	.lo_add = databuf_lo_add,
		752	.lo_incore_commit = buf_lo_incore_commit,
522	.lo_before_commit = databuf_lo_before_commit,	753	.lo_before_commit = databuf_lo_before_commit,
		754	.lo_after_commit = databuf_lo_after_commit,
		755	.lo_scan_elements = databuf_lo_scan_elements,
		756	.lo_after_scan = databuf_lo_after_scan,
523	.lo_name = "databuf"	757	.lo_name = "databuf"
524	};	758	};
525		759