aboutsummaryrefslogtreecommitdiffstats
path: root/fs/jbd2/transaction.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/jbd2/transaction.c')
-rw-r--r--fs/jbd2/transaction.c365
1 files changed, 150 insertions, 215 deletions
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index d6e006e67804..4f7cadbb19fa 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
41 * new transaction and we can't block without protecting against other 41 * new transaction and we can't block without protecting against other
42 * processes trying to touch the journal while it is in transition. 42 * processes trying to touch the journal while it is in transition.
43 * 43 *
44 * Called under j_state_lock
45 */ 44 */
46 45
47static transaction_t * 46static transaction_t *
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction)
52 transaction->t_tid = journal->j_transaction_sequence++; 51 transaction->t_tid = journal->j_transaction_sequence++;
53 transaction->t_expires = jiffies + journal->j_commit_interval; 52 transaction->t_expires = jiffies + journal->j_commit_interval;
54 spin_lock_init(&transaction->t_handle_lock); 53 spin_lock_init(&transaction->t_handle_lock);
54 INIT_LIST_HEAD(&transaction->t_inode_list);
55 55
56 /* Set up the commit timer for the new transaction. */ 56 /* Set up the commit timer for the new transaction. */
57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); 57 journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
@@ -943,183 +943,6 @@ out:
943} 943}
944 944
945/** 945/**
946 * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which
947 * needs to be flushed before we can commit the
948 * current transaction.
949 * @handle: transaction
950 * @bh: bufferhead to mark
951 *
952 * The buffer is placed on the transaction's data list and is marked as
953 * belonging to the transaction.
954 *
955 * Returns error number or 0 on success.
956 *
957 * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage
958 * by kswapd.
959 */
960int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
961{
962 journal_t *journal = handle->h_transaction->t_journal;
963 int need_brelse = 0;
964 struct journal_head *jh;
965
966 if (is_handle_aborted(handle))
967 return 0;
968
969 jh = jbd2_journal_add_journal_head(bh);
970 JBUFFER_TRACE(jh, "entry");
971
972 /*
973 * The buffer could *already* be dirty. Writeout can start
974 * at any time.
975 */
976 jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid);
977
978 /*
979 * What if the buffer is already part of a running transaction?
980 *
981 * There are two cases:
982 * 1) It is part of the current running transaction. Refile it,
983 * just in case we have allocated it as metadata, deallocated
984 * it, then reallocated it as data.
985 * 2) It is part of the previous, still-committing transaction.
986 * If all we want to do is to guarantee that the buffer will be
987 * written to disk before this new transaction commits, then
988 * being sure that the *previous* transaction has this same
989 * property is sufficient for us! Just leave it on its old
990 * transaction.
991 *
992 * In case (2), the buffer must not already exist as metadata
993 * --- that would violate write ordering (a transaction is free
994 * to write its data at any point, even before the previous
995 * committing transaction has committed). The caller must
996 * never, ever allow this to happen: there's nothing we can do
997 * about it in this layer.
998 */
999 jbd_lock_bh_state(bh);
1000 spin_lock(&journal->j_list_lock);
1001
1002 /* Now that we have bh_state locked, are we really still mapped? */
1003 if (!buffer_mapped(bh)) {
1004 JBUFFER_TRACE(jh, "unmapped buffer, bailing out");
1005 goto no_journal;
1006 }
1007
1008 if (jh->b_transaction) {
1009 JBUFFER_TRACE(jh, "has transaction");
1010 if (jh->b_transaction != handle->h_transaction) {
1011 JBUFFER_TRACE(jh, "belongs to older transaction");
1012 J_ASSERT_JH(jh, jh->b_transaction ==
1013 journal->j_committing_transaction);
1014
1015 /* @@@ IS THIS TRUE ? */
1016 /*
1017 * Not any more. Scenario: someone does a write()
1018 * in data=journal mode. The buffer's transaction has
1019 * moved into commit. Then someone does another
1020 * write() to the file. We do the frozen data copyout
1021 * and set b_next_transaction to point to j_running_t.
1022 * And while we're in that state, someone does a
1023 * writepage() in an attempt to pageout the same area
1024 * of the file via a shared mapping. At present that
1025 * calls jbd2_journal_dirty_data(), and we get right here.
1026 * It may be too late to journal the data. Simply
1027 * falling through to the next test will suffice: the
1028 * data will be dirty and wil be checkpointed. The
1029 * ordering comments in the next comment block still
1030 * apply.
1031 */
1032 //J_ASSERT_JH(jh, jh->b_next_transaction == NULL);
1033
1034 /*
1035 * If we're journalling data, and this buffer was
1036 * subject to a write(), it could be metadata, forget
1037 * or shadow against the committing transaction. Now,
1038 * someone has dirtied the same darn page via a mapping
1039 * and it is being writepage()'d.
1040 * We *could* just steal the page from commit, with some
1041 * fancy locking there. Instead, we just skip it -
1042 * don't tie the page's buffers to the new transaction
1043 * at all.
1044 * Implication: if we crash before the writepage() data
1045 * is written into the filesystem, recovery will replay
1046 * the write() data.
1047 */
1048 if (jh->b_jlist != BJ_None &&
1049 jh->b_jlist != BJ_SyncData &&
1050 jh->b_jlist != BJ_Locked) {
1051 JBUFFER_TRACE(jh, "Not stealing");
1052 goto no_journal;
1053 }
1054
1055 /*
1056 * This buffer may be undergoing writeout in commit. We
1057 * can't return from here and let the caller dirty it
1058 * again because that can cause the write-out loop in
1059 * commit to never terminate.
1060 */
1061 if (buffer_dirty(bh)) {
1062 get_bh(bh);
1063 spin_unlock(&journal->j_list_lock);
1064 jbd_unlock_bh_state(bh);
1065 need_brelse = 1;
1066 sync_dirty_buffer(bh);
1067 jbd_lock_bh_state(bh);
1068 spin_lock(&journal->j_list_lock);
1069 /* Since we dropped the lock... */
1070 if (!buffer_mapped(bh)) {
1071 JBUFFER_TRACE(jh, "buffer got unmapped");
1072 goto no_journal;
1073 }
1074 /* The buffer may become locked again at any
1075 time if it is redirtied */
1076 }
1077
1078 /* journal_clean_data_list() may have got there first */
1079 if (jh->b_transaction != NULL) {
1080 JBUFFER_TRACE(jh, "unfile from commit");
1081 __jbd2_journal_temp_unlink_buffer(jh);
1082 /* It still points to the committing
1083 * transaction; move it to this one so
1084 * that the refile assert checks are
1085 * happy. */
1086 jh->b_transaction = handle->h_transaction;
1087 }
1088 /* The buffer will be refiled below */
1089
1090 }
1091 /*
1092 * Special case --- the buffer might actually have been
1093 * allocated and then immediately deallocated in the previous,
1094 * committing transaction, so might still be left on that
1095 * transaction's metadata lists.
1096 */
1097 if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) {
1098 JBUFFER_TRACE(jh, "not on correct data list: unfile");
1099 J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow);
1100 __jbd2_journal_temp_unlink_buffer(jh);
1101 jh->b_transaction = handle->h_transaction;
1102 JBUFFER_TRACE(jh, "file as data");
1103 __jbd2_journal_file_buffer(jh, handle->h_transaction,
1104 BJ_SyncData);
1105 }
1106 } else {
1107 JBUFFER_TRACE(jh, "not on a transaction");
1108 __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData);
1109 }
1110no_journal:
1111 spin_unlock(&journal->j_list_lock);
1112 jbd_unlock_bh_state(bh);
1113 if (need_brelse) {
1114 BUFFER_TRACE(bh, "brelse");
1115 __brelse(bh);
1116 }
1117 JBUFFER_TRACE(jh, "exit");
1118 jbd2_journal_put_journal_head(jh);
1119 return 0;
1120}
1121
1122/**
1123 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata 946 * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
1124 * @handle: transaction to add buffer to. 947 * @handle: transaction to add buffer to.
1125 * @bh: buffer to mark 948 * @bh: buffer to mark
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh)
1541 * Remove a buffer from the appropriate transaction list. 1364 * Remove a buffer from the appropriate transaction list.
1542 * 1365 *
1543 * Note that this function can *change* the value of 1366 * Note that this function can *change* the value of
1544 * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, 1367 * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list,
1545 * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller 1368 * t_log_list or t_reserved_list. If the caller is holding onto a copy of one
1546 * is holding onto a copy of one of thee pointers, it could go bad. 1369 * of these pointers, it could go bad. Generally the caller needs to re-read
1547 * Generally the caller needs to re-read the pointer from the transaction_t. 1370 * the pointer from the transaction_t.
1548 * 1371 *
1549 * Called under j_list_lock. The journal may not be locked. 1372 * Called under j_list_lock. The journal may not be locked.
1550 */ 1373 */
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1566 switch (jh->b_jlist) { 1389 switch (jh->b_jlist) {
1567 case BJ_None: 1390 case BJ_None:
1568 return; 1391 return;
1569 case BJ_SyncData:
1570 list = &transaction->t_sync_datalist;
1571 break;
1572 case BJ_Metadata: 1392 case BJ_Metadata:
1573 transaction->t_nr_buffers--; 1393 transaction->t_nr_buffers--;
1574 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); 1394 J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0);
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
1589 case BJ_Reserved: 1409 case BJ_Reserved:
1590 list = &transaction->t_reserved_list; 1410 list = &transaction->t_reserved_list;
1591 break; 1411 break;
1592 case BJ_Locked:
1593 list = &transaction->t_locked_list;
1594 break;
1595 } 1412 }
1596 1413
1597 __blist_del_buffer(list, jh); 1414 __blist_del_buffer(list, jh);
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh)
1634 goto out; 1451 goto out;
1635 1452
1636 spin_lock(&journal->j_list_lock); 1453 spin_lock(&journal->j_list_lock);
1637 if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { 1454 if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1638 if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) {
1639 /* A written-back ordered data buffer */
1640 JBUFFER_TRACE(jh, "release data");
1641 __jbd2_journal_unfile_buffer(jh);
1642 jbd2_journal_remove_journal_head(bh);
1643 __brelse(bh);
1644 }
1645 } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
1646 /* written-back checkpointed metadata buffer */ 1455 /* written-back checkpointed metadata buffer */
1647 if (jh->b_jlist == BJ_None) { 1456 if (jh->b_jlist == BJ_None) {
1648 JBUFFER_TRACE(jh, "remove from checkpoint list"); 1457 JBUFFER_TRACE(jh, "remove from checkpoint list");
@@ -1656,12 +1465,43 @@ out:
1656 return; 1465 return;
1657} 1466}
1658 1467
1468/*
1469 * jbd2_journal_try_to_free_buffers() could race with
1470 * jbd2_journal_commit_transaction(). The later might still hold the
1471 * reference count to the buffers when inspecting them on
1472 * t_syncdata_list or t_locked_list.
1473 *
1474 * jbd2_journal_try_to_free_buffers() will call this function to
1475 * wait for the current transaction to finish syncing data buffers, before
1476 * try to free that buffer.
1477 *
1478 * Called with journal->j_state_lock hold.
1479 */
1480static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal)
1481{
1482 transaction_t *transaction;
1483 tid_t tid;
1484
1485 spin_lock(&journal->j_state_lock);
1486 transaction = journal->j_committing_transaction;
1487
1488 if (!transaction) {
1489 spin_unlock(&journal->j_state_lock);
1490 return;
1491 }
1492
1493 tid = transaction->t_tid;
1494 spin_unlock(&journal->j_state_lock);
1495 jbd2_log_wait_commit(journal, tid);
1496}
1659 1497
1660/** 1498/**
1661 * int jbd2_journal_try_to_free_buffers() - try to free page buffers. 1499 * int jbd2_journal_try_to_free_buffers() - try to free page buffers.
1662 * @journal: journal for operation 1500 * @journal: journal for operation
1663 * @page: to try and free 1501 * @page: to try and free
1664 * @unused_gfp_mask: unused 1502 * @gfp_mask: we use the mask to detect how hard should we try to release
1503 * buffers. If __GFP_WAIT and __GFP_FS is set, we wait for commit code to
1504 * release the buffers.
1665 * 1505 *
1666 * 1506 *
1667 * For all the buffers on this page, 1507 * For all the buffers on this page,
@@ -1690,9 +1530,11 @@ out:
1690 * journal_try_to_free_buffer() is changing its state. But that 1530 * journal_try_to_free_buffer() is changing its state. But that
1691 * cannot happen because we never reallocate freed data as metadata 1531 * cannot happen because we never reallocate freed data as metadata
1692 * while the data is part of a transaction. Yes? 1532 * while the data is part of a transaction. Yes?
1533 *
1534 * Return 0 on failure, 1 on success
1693 */ 1535 */
1694int jbd2_journal_try_to_free_buffers(journal_t *journal, 1536int jbd2_journal_try_to_free_buffers(journal_t *journal,
1695 struct page *page, gfp_t unused_gfp_mask) 1537 struct page *page, gfp_t gfp_mask)
1696{ 1538{
1697 struct buffer_head *head; 1539 struct buffer_head *head;
1698 struct buffer_head *bh; 1540 struct buffer_head *bh;
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1708 /* 1550 /*
1709 * We take our own ref against the journal_head here to avoid 1551 * We take our own ref against the journal_head here to avoid
1710 * having to add tons of locking around each instance of 1552 * having to add tons of locking around each instance of
1711 * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). 1553 * jbd2_journal_remove_journal_head() and
1554 * jbd2_journal_put_journal_head().
1712 */ 1555 */
1713 jh = jbd2_journal_grab_journal_head(bh); 1556 jh = jbd2_journal_grab_journal_head(bh);
1714 if (!jh) 1557 if (!jh)
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal,
1721 if (buffer_jbd(bh)) 1564 if (buffer_jbd(bh))
1722 goto busy; 1565 goto busy;
1723 } while ((bh = bh->b_this_page) != head); 1566 } while ((bh = bh->b_this_page) != head);
1567
1724 ret = try_to_free_buffers(page); 1568 ret = try_to_free_buffers(page);
1569
1570 /*
1571 * There are a number of places where jbd2_journal_try_to_free_buffers()
1572 * could race with jbd2_journal_commit_transaction(), the later still
1573 * holds the reference to the buffers to free while processing them.
1574 * try_to_free_buffers() failed to free those buffers. Some of the
1575 * caller of releasepage() request page buffers to be dropped, otherwise
1576 * treat the fail-to-free as errors (such as generic_file_direct_IO())
1577 *
1578 * So, if the caller of try_to_release_page() wants the synchronous
1579 * behaviour(i.e make sure buffers are dropped upon return),
1580 * let's wait for the current transaction to finish flush of
1581 * dirty data buffers, then try to free those buffers again,
1582 * with the journal locked.
1583 */
1584 if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) {
1585 jbd2_journal_wait_for_transaction_sync_data(journal);
1586 ret = try_to_free_buffers(page);
1587 }
1588
1725busy: 1589busy:
1726 return ret; 1590 return ret;
1727} 1591}
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1823 if (!buffer_jbd(bh)) 1687 if (!buffer_jbd(bh))
1824 goto zap_buffer_unlocked; 1688 goto zap_buffer_unlocked;
1825 1689
1690 /* OK, we have data buffer in journaled mode */
1826 spin_lock(&journal->j_state_lock); 1691 spin_lock(&journal->j_state_lock);
1827 jbd_lock_bh_state(bh); 1692 jbd_lock_bh_state(bh);
1828 spin_lock(&journal->j_list_lock); 1693 spin_lock(&journal->j_list_lock);
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh)
1886 } 1751 }
1887 } else if (transaction == journal->j_committing_transaction) { 1752 } else if (transaction == journal->j_committing_transaction) {
1888 JBUFFER_TRACE(jh, "on committing transaction"); 1753 JBUFFER_TRACE(jh, "on committing transaction");
1889 if (jh->b_jlist == BJ_Locked) {
1890 /*
1891 * The buffer is on the committing transaction's locked
1892 * list. We have the buffer locked, so I/O has
1893 * completed. So we can nail the buffer now.
1894 */
1895 may_free = __dispose_buffer(jh, transaction);
1896 goto zap_buffer;
1897 }
1898 /* 1754 /*
1899 * If it is committing, we simply cannot touch it. We 1755 * If it is committing, we simply cannot touch it. We
1900 * can remove it's next_transaction pointer from the 1756 * can remove it's next_transaction pointer from the
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2027 J_ASSERT_JH(jh, !jh->b_committed_data); 1883 J_ASSERT_JH(jh, !jh->b_committed_data);
2028 J_ASSERT_JH(jh, !jh->b_frozen_data); 1884 J_ASSERT_JH(jh, !jh->b_frozen_data);
2029 return; 1885 return;
2030 case BJ_SyncData:
2031 list = &transaction->t_sync_datalist;
2032 break;
2033 case BJ_Metadata: 1886 case BJ_Metadata:
2034 transaction->t_nr_buffers++; 1887 transaction->t_nr_buffers++;
2035 list = &transaction->t_buffers; 1888 list = &transaction->t_buffers;
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh,
2049 case BJ_Reserved: 1902 case BJ_Reserved:
2050 list = &transaction->t_reserved_list; 1903 list = &transaction->t_reserved_list;
2051 break; 1904 break;
2052 case BJ_Locked:
2053 list = &transaction->t_locked_list;
2054 break;
2055 } 1905 }
2056 1906
2057 __blist_add_buffer(list, jh); 1907 __blist_add_buffer(list, jh);
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh)
2141 spin_unlock(&journal->j_list_lock); 1991 spin_unlock(&journal->j_list_lock);
2142 __brelse(bh); 1992 __brelse(bh);
2143} 1993}
1994
1995/*
1996 * File inode in the inode list of the handle's transaction
1997 */
1998int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode)
1999{
2000 transaction_t *transaction = handle->h_transaction;
2001 journal_t *journal = transaction->t_journal;
2002
2003 if (is_handle_aborted(handle))
2004 return -EIO;
2005
2006 jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino,
2007 transaction->t_tid);
2008
2009 /*
2010 * First check whether inode isn't already on the transaction's
2011 * lists without taking the lock. Note that this check is safe
2012 * without the lock as we cannot race with somebody removing inode
2013 * from the transaction. The reason is that we remove inode from the
2014 * transaction only in journal_release_jbd_inode() and when we commit
2015 * the transaction. We are guarded from the first case by holding
2016 * a reference to the inode. We are safe against the second case
2017 * because if jinode->i_transaction == transaction, commit code
2018 * cannot touch the transaction because we hold reference to it,
2019 * and if jinode->i_next_transaction == transaction, commit code
2020 * will only file the inode where we want it.
2021 */
2022 if (jinode->i_transaction == transaction ||
2023 jinode->i_next_transaction == transaction)
2024 return 0;
2025
2026 spin_lock(&journal->j_list_lock);
2027
2028 if (jinode->i_transaction == transaction ||
2029 jinode->i_next_transaction == transaction)
2030 goto done;
2031
2032 /* On some different transaction's list - should be
2033 * the committing one */
2034 if (jinode->i_transaction) {
2035 J_ASSERT(jinode->i_next_transaction == NULL);
2036 J_ASSERT(jinode->i_transaction ==
2037 journal->j_committing_transaction);
2038 jinode->i_next_transaction = transaction;
2039 goto done;
2040 }
2041 /* Not on any transaction list... */
2042 J_ASSERT(!jinode->i_next_transaction);
2043 jinode->i_transaction = transaction;
2044 list_add(&jinode->i_list, &transaction->t_inode_list);
2045done:
2046 spin_unlock(&journal->j_list_lock);
2047
2048 return 0;
2049}
2050
2051/*
2052 * This function must be called when inode is journaled in ordered mode
2053 * before truncation happens. It starts writeout of truncated part in
2054 * case it is in the committing transaction so that we stand to ordered
2055 * mode consistency guarantees.
2056 */
2057int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
2058 loff_t new_size)
2059{
2060 journal_t *journal;
2061 transaction_t *commit_trans;
2062 int ret = 0;
2063
2064 if (!inode->i_transaction && !inode->i_next_transaction)
2065 goto out;
2066 journal = inode->i_transaction->t_journal;
2067 spin_lock(&journal->j_state_lock);
2068 commit_trans = journal->j_committing_transaction;
2069 spin_unlock(&journal->j_state_lock);
2070 if (inode->i_transaction == commit_trans) {
2071 ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping,
2072 new_size, LLONG_MAX);
2073 if (ret)
2074 jbd2_journal_abort(journal, ret);
2075 }
2076out:
2077 return ret;
2078}