aboutsummaryrefslogtreecommitdiffstats
path: root/fs/gfs2/lops.c
diff options
context:
space:
mode:
authorSteven Whitehouse <swhiteho@redhat.com>2006-02-08 06:50:51 -0500
committerSteven Whitehouse <swhiteho@redhat.com>2006-02-08 06:50:51 -0500
commit18ec7d5c3f434aed9661ed10a9e1f48cdeb4981d (patch)
treea7161a4c4b3592052e6772e1c23849de16cac649 /fs/gfs2/lops.c
parent257f9b4e97e9a6cceeb247cead92119a4396d37b (diff)
[GFS2] Make journaled data files identical to normal files on disk
This is a very large patch, with a few still to be resolved issues so you might want to check out the previous head of the tree since this is known to be unstable. Fixes for the various bugs will be forthcoming shortly. This patch removes the special data format which has been used up till now for journaled data files. Directories still retain the old format so that they will remain on disk compatible with earlier releases. As a result you can now do the following with journaled data files: 1) mmap them 2) export them over NFS 3) convert to/from normal files whenever you want to (the zero length restriction is gone) In addition the level at which GFS' locking is done has changed for all files (since they all now use the page cache) such that the locking is done at the page cache level rather than the level of the fs operations. This should mean that things like loopback mounts and other things which touch the page cache directly should now work. Current known issues: 1. There is a lock mode inversion problem related to the resource group hold function which needs to be resolved. 2. Any significant amount of I/O causes an oops with an offset of hex 320 (NULL pointer dereference) which appears to be related to a journaled data buffer appearing on a list where it shouldn't be. 3. Direct I/O writes are disabled for the time being (will reappear later) 4. There is probably a deadlock between the page lock and GFS' locks under certain combinations of mmap and fs operation I/O. 5. Issue relating to ref counting on internally used inodes causes a hang on umount (discovered before this patch, and not fixed by it) 6. One part of the directory metadata is different from GFS1 and will need to be resolved before next release. Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
Diffstat (limited to 'fs/gfs2/lops.c')
-rw-r--r--fs/gfs2/lops.c280
1 files changed, 257 insertions, 23 deletions
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a065f7667238..dd41863810d7 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -428,49 +428,188 @@ static void rg_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
428 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg); 428 gfs2_assert_warn(sdp, !sdp->sd_log_num_rg);
429} 429}
430 430
431/**
432 * databuf_lo_add - Add a databuf to the transaction.
433 *
434 * This is used in two distinct cases:
435 * i) In ordered write mode
436 * We put the data buffer on a list so that we can ensure that its
437 * synced to disk at the right time
438 * ii) In journaled data mode
439 * We need to journal the data block in the same way as metadata in
440 * the functions above. The difference is that here we have a tag
441 * which is two __be64's being the block number (as per meta data)
442 * and a flag which says whether the data block needs escaping or
443 * not. This means we need a new log entry for each 251 or so data
444 * blocks, which isn't an enormous overhead but twice as much as
445 * for normal metadata blocks.
446 */
431static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le) 447static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
432{ 448{
433 get_transaction->tr_touched = 1; 449 struct gfs2_bufdata *bd = container_of(le, struct gfs2_bufdata, bd_le);
450 struct gfs2_trans *tr = get_transaction;
451 struct address_space *mapping = bd->bd_bh->b_page->mapping;
452 struct gfs2_inode *ip = get_v2ip(mapping->host);
434 453
454 tr->tr_touched = 1;
455 if (!list_empty(&bd->bd_list_tr) &&
456 (ip->i_di.di_flags & GFS2_DIF_JDATA)) {
457 tr->tr_num_buf++;
458 gfs2_trans_add_gl(bd->bd_gl);
459 list_add(&bd->bd_list_tr, &tr->tr_list_buf);
460 gfs2_pin(sdp, bd->bd_bh);
461 } else {
462 clear_buffer_pinned(bd->bd_bh);
463 }
435 gfs2_log_lock(sdp); 464 gfs2_log_lock(sdp);
465 if (ip->i_di.di_flags & GFS2_DIF_JDATA)
466 sdp->sd_log_num_jdata++;
436 sdp->sd_log_num_databuf++; 467 sdp->sd_log_num_databuf++;
437 list_add(&le->le_list, &sdp->sd_log_le_databuf); 468 list_add(&le->le_list, &sdp->sd_log_le_databuf);
438 gfs2_log_unlock(sdp); 469 gfs2_log_unlock(sdp);
439} 470}
440 471
472static int gfs2_check_magic(struct buffer_head *bh)
473{
474 struct page *page = bh->b_page;
475 void *kaddr;
476 __be32 *ptr;
477 int rv = 0;
478
479 kaddr = kmap_atomic(page, KM_USER0);
480 ptr = kaddr + bh_offset(bh);
481 if (*ptr == cpu_to_be32(GFS2_MAGIC))
482 rv = 1;
483 kunmap_atomic(page, KM_USER0);
484
485 return rv;
486}
487
488/**
489 * databuf_lo_before_commit - Scan the data buffers, writing as we go
490 *
491 * Here we scan through the lists of buffers and make the assumption
492 * that any buffer thats been pinned is being journaled, and that
493 * any unpinned buffer is an ordered write data buffer and therefore
494 * will be written back rather than journaled.
495 */
441static void databuf_lo_before_commit(struct gfs2_sbd *sdp) 496static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
442{ 497{
443 struct list_head *head = &sdp->sd_log_le_databuf;
444 LIST_HEAD(started); 498 LIST_HEAD(started);
445 struct gfs2_bufdata *bd; 499 struct gfs2_bufdata *bd1 = NULL, *bd2, *bdt;
446 struct buffer_head *bh; 500 struct buffer_head *bh = NULL;
501 unsigned int offset = sizeof(struct gfs2_log_descriptor);
502 struct gfs2_log_descriptor *ld;
503 unsigned int limit;
504 unsigned int total_dbuf = sdp->sd_log_num_databuf;
505 unsigned int total_jdata = sdp->sd_log_num_jdata;
506 unsigned int num, n;
507 __be64 *ptr;
447 508
448 while (!list_empty(head)) { 509 offset += (2*sizeof(__be64) - 1);
449 bd = list_entry(head->prev, struct gfs2_bufdata, bd_le.le_list); 510 offset &= ~(2*sizeof(__be64) - 1);
450 list_move(&bd->bd_le.le_list, &started); 511 limit = (sdp->sd_sb.sb_bsize - offset)/sizeof(__be64);
451 512
452 gfs2_log_lock(sdp); 513 /* printk(KERN_INFO "totals: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
453 bh = bd->bd_bh; 514 /*
515 * Start writing ordered buffers, write journaled buffers
516 * into the log along with a header
517 */
518 bd2 = bd1 = list_prepare_entry(bd1, &sdp->sd_log_le_databuf, bd_le.le_list);
519 while(total_dbuf) {
520 num = total_jdata;
521 if (num > limit)
522 num = limit;
523 n = 0;
524 list_for_each_entry_safe_continue(bd1, bdt, &sdp->sd_log_le_databuf, bd_le.le_list) {
525 gfs2_log_lock(sdp);
526 /* An ordered write buffer */
527 if (bd1->bd_bh && !buffer_pinned(bd1->bd_bh)) {
528 list_move(&bd1->bd_le.le_list, &started);
529 if (bd1 == bd2) {
530 bd2 = NULL;
531 bd2 = list_prepare_entry(bd2, &sdp->sd_log_le_databuf, bd_le.le_list);
532 }
533 total_dbuf--;
534 if (bd1->bd_bh) {
535 get_bh(bd1->bd_bh);
536 gfs2_log_unlock(sdp);
537 if (buffer_dirty(bd1->bd_bh)) {
538 wait_on_buffer(bd1->bd_bh);
539 ll_rw_block(WRITE, 1, &bd1->bd_bh);
540 }
541 brelse(bd1->bd_bh);
542 continue;
543 }
544 gfs2_log_unlock(sdp);
545 continue;
546 } else if (bd1->bd_bh) { /* A journaled buffer */
547 int magic;
548 gfs2_log_unlock(sdp);
549 /* printk(KERN_INFO "journaled buffer\n"); */
550 if (!bh) {
551 bh = gfs2_log_get_buf(sdp);
552 ld = (struct gfs2_log_descriptor *)bh->b_data;
553 ptr = (__be64 *)(bh->b_data + offset);
554 ld->ld_header.mh_magic = cpu_to_be32(GFS2_MAGIC);
555 ld->ld_header.mh_type = cpu_to_be16(GFS2_METATYPE_LD);
556 ld->ld_header.mh_format = cpu_to_be16(GFS2_FORMAT_LD);
557 ld->ld_type = cpu_to_be32(GFS2_LOG_DESC_JDATA);
558 ld->ld_length = cpu_to_be32(num + 1);
559 ld->ld_data1 = cpu_to_be32(num);
560 ld->ld_data2 = cpu_to_be32(0);
561 memset(ld->ld_reserved, 0, sizeof(ld->ld_reserved));
562 }
563 magic = gfs2_check_magic(bd1->bd_bh);
564 *ptr++ = cpu_to_be64(bd1->bd_bh->b_blocknr);
565 *ptr++ = cpu_to_be64((__u64)magic);
566 clear_buffer_escaped(bd1->bd_bh);
567 if (unlikely(magic != 0))
568 set_buffer_escaped(bd1->bd_bh);
569 if (n++ > num)
570 break;
571 }
572 }
454 if (bh) { 573 if (bh) {
455 get_bh(bh); 574 set_buffer_dirty(bh);
456 gfs2_log_unlock(sdp); 575 ll_rw_block(WRITE, 1, &bh);
457 if (buffer_dirty(bh)) { 576 bh = NULL;
458 wait_on_buffer(bh); 577 }
459 ll_rw_block(WRITE, 1, &bh); 578 n = 0;
579 /* printk(KERN_INFO "totals2: jdata=%u dbuf=%u\n", total_jdata, total_dbuf); */
580 list_for_each_entry_continue(bd2, &sdp->sd_log_le_databuf, bd_le.le_list) {
581 if (!bd2->bd_bh)
582 continue;
583 /* copy buffer if it needs escaping */
584 if (unlikely(buffer_escaped(bd2->bd_bh))) {
585 void *kaddr;
586 struct page *page = bd2->bd_bh->b_page;
587 bh = gfs2_log_get_buf(sdp);
588 kaddr = kmap_atomic(page, KM_USER0);
589 memcpy(bh->b_data, kaddr + bh_offset(bd2->bd_bh), sdp->sd_sb.sb_bsize);
590 kunmap_atomic(page, KM_USER0);
591 *(__be32 *)bh->b_data = 0;
592 } else {
593 bh = gfs2_log_fake_buf(sdp, bd2->bd_bh);
460 } 594 }
461 brelse(bh); 595 set_buffer_dirty(bh);
462 } else 596 ll_rw_block(WRITE, 1, &bh);
463 gfs2_log_unlock(sdp); 597 if (++n >= num)
598 break;
599 }
600 bh = NULL;
601 total_dbuf -= num;
602 total_jdata -= num;
464 } 603 }
465 604 /* printk(KERN_INFO "wait on ordered data buffers\n"); */
605 /* Wait on all ordered buffers */
466 while (!list_empty(&started)) { 606 while (!list_empty(&started)) {
467 bd = list_entry(started.next, struct gfs2_bufdata, 607 bd1 = list_entry(started.next, struct gfs2_bufdata, bd_le.le_list);
468 bd_le.le_list); 608 list_del(&bd1->bd_le.le_list);
469 list_del(&bd->bd_le.le_list);
470 sdp->sd_log_num_databuf--; 609 sdp->sd_log_num_databuf--;
471 610
472 gfs2_log_lock(sdp); 611 gfs2_log_lock(sdp);
473 bh = bd->bd_bh; 612 bh = bd1->bd_bh;
474 if (bh) { 613 if (bh) {
475 set_v2bd(bh, NULL); 614 set_v2bd(bh, NULL);
476 gfs2_log_unlock(sdp); 615 gfs2_log_unlock(sdp);
@@ -479,12 +618,103 @@ static void databuf_lo_before_commit(struct gfs2_sbd *sdp)
479 } else 618 } else
480 gfs2_log_unlock(sdp); 619 gfs2_log_unlock(sdp);
481 620
482 kfree(bd); 621 kfree(bd1);
483 } 622 }
484 623
624 /* printk(KERN_INFO "sd_log_num_databuf %u sd_log_num_jdata %u\n", sdp->sd_log_num_databuf, sdp->sd_log_num_jdata); */
625 /* We've removed all the ordered write bufs here, so only jdata left */
626 gfs2_assert_warn(sdp, sdp->sd_log_num_databuf == sdp->sd_log_num_jdata);
627}
628
629static int databuf_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
630 struct gfs2_log_descriptor *ld,
631 __be64 *ptr, int pass)
632{
633 struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
634 struct gfs2_glock *gl = jd->jd_inode->i_gl;
635 unsigned int blks = be32_to_cpu(ld->ld_data1);
636 struct buffer_head *bh_log, *bh_ip;
637 uint64_t blkno;
638 uint64_t esc;
639 int error = 0;
640
641 if (pass != 1 || be32_to_cpu(ld->ld_type) != GFS2_LOG_DESC_JDATA)
642 return 0;
643
644 gfs2_replay_incr_blk(sdp, &start);
645 for (; blks; gfs2_replay_incr_blk(sdp, &start), blks--) {
646 blkno = be64_to_cpu(*ptr++);
647 esc = be64_to_cpu(*ptr++);
648
649 sdp->sd_found_blocks++;
650
651 if (gfs2_revoke_check(sdp, blkno, start))
652 continue;
653
654 error = gfs2_replay_read_block(jd, start, &bh_log);
655 if (error)
656 return error;
657
658 bh_ip = gfs2_meta_new(gl, blkno);
659 memcpy(bh_ip->b_data, bh_log->b_data, bh_log->b_size);
660
661 /* Unescape */
662 if (esc) {
663 __be32 *eptr = (__be32 *)bh_ip->b_data;
664 *eptr = cpu_to_be32(GFS2_MAGIC);
665 }
666 mark_buffer_dirty(bh_ip);
667
668 brelse(bh_log);
669 brelse(bh_ip);
670 if (error)
671 break;
672
673 sdp->sd_replayed_blocks++;
674 }
675
676 return error;
677}
678
679/* FIXME: sort out accounting for log blocks etc. */
680
681static void databuf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass)
682{
683 struct gfs2_sbd *sdp = jd->jd_inode->i_sbd;
684
685 if (error) {
686 gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START | DIO_WAIT);
687 return;
688 }
689 if (pass != 1)
690 return;
691
692 /* data sync? */
693 gfs2_meta_sync(jd->jd_inode->i_gl, DIO_START | DIO_WAIT);
694
695 fs_info(sdp, "jid=%u: Replayed %u of %u data blocks\n",
696 jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks);
697}
698
699static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
700{
701 struct list_head *head = &sdp->sd_log_le_databuf;
702 struct gfs2_bufdata *bd;
703
704 while (!list_empty(head)) {
705 bd = list_entry(head->next, struct gfs2_bufdata, bd_le.le_list);
706 list_del_init(&bd->bd_le.le_list);
707 sdp->sd_log_num_databuf--;
708 sdp->sd_log_num_jdata--;
709 gfs2_unpin(sdp, bd->bd_bh, ai);
710 brelse(bd->bd_bh);
711 kfree(bd);
712 }
485 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf); 713 gfs2_assert_warn(sdp, !sdp->sd_log_num_databuf);
714 gfs2_assert_warn(sdp, !sdp->sd_log_num_jdata);
486} 715}
487 716
717
488struct gfs2_log_operations gfs2_glock_lops = { 718struct gfs2_log_operations gfs2_glock_lops = {
489 .lo_add = glock_lo_add, 719 .lo_add = glock_lo_add,
490 .lo_after_commit = glock_lo_after_commit, 720 .lo_after_commit = glock_lo_after_commit,
@@ -519,7 +749,11 @@ struct gfs2_log_operations gfs2_rg_lops = {
519 749
520struct gfs2_log_operations gfs2_databuf_lops = { 750struct gfs2_log_operations gfs2_databuf_lops = {
521 .lo_add = databuf_lo_add, 751 .lo_add = databuf_lo_add,
752 .lo_incore_commit = buf_lo_incore_commit,
522 .lo_before_commit = databuf_lo_before_commit, 753 .lo_before_commit = databuf_lo_before_commit,
754 .lo_after_commit = databuf_lo_after_commit,
755 .lo_scan_elements = databuf_lo_scan_elements,
756 .lo_after_scan = databuf_lo_after_scan,
523 .lo_name = "databuf" 757 .lo_name = "databuf"
524}; 758};
525 759