path: root/fs/xfs/linux-2.6/xfs_buf.c
Diffstat (limited to 'fs/xfs/linux-2.6/xfs_buf.c')
 fs/xfs/linux-2.6/xfs_buf.c | 290
 1 file changed, 245 insertions(+), 45 deletions(-)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..d50df3a8101c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -1051,22 +1052,30 @@ xfs_buf_ioerror(
 }
 
 int
-xfs_bawrite(
-	void			*mp,
+xfs_bwrite(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	trace_xfs_buf_bawrite(bp, _RET_IP_);
+	int			iowait = (bp->b_flags & XBF_ASYNC) == 0;
+	int			error = 0;
 
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	if (!iowait)
+		bp->b_flags |= _XBF_RUN_QUEUES;
 
 	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+	if (iowait) {
+		error = xfs_buf_iowait(bp);
+		if (error)
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+	}
 
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
+	return error;
 }
 
 void
@@ -1085,6 +1094,126 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t		*bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XBF_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lot's of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem. Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
@@ -1296,7 +1425,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
@@ -1378,8 +1507,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1519,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
@@ -1595,6 +1724,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
@@ -1626,6 +1760,35 @@ xfs_buf_delwri_dequeue(
 	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
 	struct workqueue_struct	*queue)
@@ -1644,6 +1807,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1694,20 +1859,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t		diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
@@ -1715,17 +1913,16 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
-
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
-		count = 0;
+		xfs_buf_delwri_split(target, &tmp, age);
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
@@ -1751,42 +1948,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;