Diffstat (limited to 'fs/xfs/linux-2.6/xfs_buf.c')

-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 290
1 file changed, 245 insertions, 45 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 77b8be81c769..d50df3a8101c 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -33,6 +33,7 @@
 #include <linux/migrate.h>
 #include <linux/backing-dev.h>
 #include <linux/freezer.h>
+#include <linux/list_sort.h>
 
 #include "xfs_sb.h"
 #include "xfs_inum.h"
@@ -1051,22 +1052,30 @@ xfs_buf_ioerror(
 }
 
 int
-xfs_bawrite(
-	void			*mp,
+xfs_bwrite(
+	struct xfs_mount	*mp,
 	struct xfs_buf		*bp)
 {
-	trace_xfs_buf_bawrite(bp, _RET_IP_);
+	int			iowait = (bp->b_flags & XBF_ASYNC) == 0;
+	int			error = 0;
 
-	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+	bp->b_strat = xfs_bdstrat_cb;
+	bp->b_mount = mp;
+	bp->b_flags |= XBF_WRITE;
+	if (!iowait)
+		bp->b_flags |= _XBF_RUN_QUEUES;
 
 	xfs_buf_delwri_dequeue(bp);
+	xfs_buf_iostrategy(bp);
 
-	bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
-	bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+	if (iowait) {
+		error = xfs_buf_iowait(bp);
+		if (error)
+			xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+		xfs_buf_relse(bp);
+	}
 
-	bp->b_mount = mp;
-	bp->b_strat = xfs_bdstrat_cb;
-	return xfs_bdstrat_cb(bp);
+	return error;
 }
 
 void
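
The rewritten helper folds the old async-only xfs_bawrite() and the synchronous write path into one xfs_bwrite() keyed off XBF_ASYNC: with the flag clear it issues the I/O, waits, forces a shutdown on error, and releases the buffer; with the flag set it only queues the I/O and returns. A caller-side sketch, assuming only what the hunk above shows (the wrapper functions themselves are illustrative, not part of the patch):

/* Synchronous write: XBF_ASYNC clear, so xfs_bwrite() waits for completion,
 * shuts the filesystem down on error, and releases the buffer for us. */
int write_buf_sync(struct xfs_mount *mp, struct xfs_buf *bp)
{
	return xfs_bwrite(mp, bp);
}

/* Asynchronous write: with XBF_ASYNC set, xfs_bwrite() only issues the I/O;
 * completion runs through the iodone callbacks, so the caller must not
 * touch bp afterwards. In this mode the return value stays 0. */
int write_buf_async(struct xfs_mount *mp, struct xfs_buf *bp)
{
	bp->b_flags |= XBF_ASYNC;
	return xfs_bwrite(mp, bp);
}
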
@@ -1085,6 +1094,126 @@ xfs_bdwrite(
 	xfs_buf_delwri_queue(bp, 1);
 }
 
+/*
+ * Called when we want to stop a buffer from getting written or read.
+ * We attach the EIO error, muck with its flags, and call biodone
+ * so that the proper iodone callbacks get called.
+ */
+STATIC int
+xfs_bioerror(
+	xfs_buf_t		*bp)
+{
+#ifdef XFSERRORDEBUG
+	ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone);
+#endif
+
+	/*
+	 * No need to wait until the buffer is unpinned, we aren't flushing it.
+	 */
+	XFS_BUF_ERROR(bp, EIO);
+
+	/*
+	 * We're calling biodone, so delete XBF_DONE flag.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_UNDONE(bp);
+	XFS_BUF_STALE(bp);
+
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	xfs_biodone(bp);
+
+	return EIO;
+}
+
+/*
+ * Same as xfs_bioerror, except that we are releasing the buffer
+ * here ourselves, and avoiding the biodone call.
+ * This is meant for userdata errors; metadata bufs come with
+ * iodone functions attached, so that we can track down errors.
+ */
+STATIC int
+xfs_bioerror_relse(
+	struct xfs_buf	*bp)
+{
+	int64_t		fl = XFS_BUF_BFLAGS(bp);
+	/*
+	 * No need to wait until the buffer is unpinned.
+	 * We aren't flushing it.
+	 *
+	 * chunkhold expects B_DONE to be set, whether
+	 * we actually finish the I/O or not. We don't want to
+	 * change that interface.
+	 */
+	XFS_BUF_UNREAD(bp);
+	XFS_BUF_UNDELAYWRITE(bp);
+	XFS_BUF_DONE(bp);
+	XFS_BUF_STALE(bp);
+	XFS_BUF_CLR_IODONE_FUNC(bp);
+	XFS_BUF_CLR_BDSTRAT_FUNC(bp);
+	if (!(fl & XBF_ASYNC)) {
+		/*
+		 * Mark b_error and B_ERROR _both_.
+		 * Lots of chunkcache code assumes that.
+		 * There's no reason to mark error for
+		 * ASYNC buffers.
+		 */
+		XFS_BUF_ERROR(bp, EIO);
+		XFS_BUF_FINISH_IOWAIT(bp);
+	} else {
+		xfs_buf_relse(bp);
+	}
+
+	return EIO;
+}
+
+
+/*
+ * All xfs metadata buffers except log state machine buffers
+ * get this attached as their b_bdstrat callback function.
+ * This is so that we can catch a buffer
+ * after prematurely unpinning it to forcibly shutdown the filesystem.
+ */
+int
+xfs_bdstrat_cb(
+	struct xfs_buf	*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		/*
+		 * Metadata write that didn't get logged but
+		 * written delayed anyway. These aren't associated
+		 * with a transaction, and can be ignored.
+		 */
+		if (!bp->b_iodone && !XFS_BUF_ISREAD(bp))
+			return xfs_bioerror_relse(bp);
+		else
+			return xfs_bioerror(bp);
+	}
+
+	xfs_buf_iorequest(bp);
+	return 0;
+}
+
+/*
+ * Wrapper around bdstrat so that we can stop data from going to disk in case
+ * we are shutting down the filesystem. Typically user data goes thru this
+ * path; one of the exceptions is the superblock.
+ */
+void
+xfsbdstrat(
+	struct xfs_mount	*mp,
+	struct xfs_buf		*bp)
+{
+	if (XFS_FORCED_SHUTDOWN(mp)) {
+		trace_xfs_bdstrat_shut(bp, _RET_IP_);
+		xfs_bioerror_relse(bp);
+		return;
+	}
+
+	xfs_buf_iorequest(bp);
+}
+
 STATIC void
 _xfs_buf_ioend(
 	xfs_buf_t		*bp,
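
The dispatch in xfs_bdstrat_cb() is the crux of the shutdown handling: a delayed metadata write with no b_iodone callback has no transaction waiting on it and can simply be released, while reads and buffers with iodone callbacks must go through biodone so waiters get notified. A standalone restatement of that decision, with illustrative names:

enum shutdown_path { TAKE_BIOERROR, TAKE_BIOERROR_RELSE };

/* Mirrors the if/else in xfs_bdstrat_cb() above; is_read and has_iodone
 * stand in for XFS_BUF_ISREAD(bp) and bp->b_iodone. */
static enum shutdown_path classify_buf(int is_read, int has_iodone)
{
	if (!has_iodone && !is_read)
		return TAKE_BIOERROR_RELSE;	/* unlogged delwri metadata */
	return TAKE_BIOERROR;			/* notify waiters via biodone */
}
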
@@ -1296,7 +1425,7 @@ xfs_buf_iomove(
 	xfs_buf_t		*bp,	/* buffer to process		*/
 	size_t			boff,	/* starting buffer offset	*/
 	size_t			bsize,	/* length to copy		*/
-	caddr_t			data,	/* data address			*/
+	void			*data,	/* data address			*/
 	xfs_buf_rw_t		mode)	/* read/write/zero flag		*/
 {
 	size_t			bend, cpoff, csize;
@@ -1378,8 +1507,8 @@ xfs_alloc_bufhash(
 
 	btp->bt_hashshift = external ? 3 : 8;	/* 8 or 256 buckets */
 	btp->bt_hashmask = (1 << btp->bt_hashshift) - 1;
-	btp->bt_hash = kmem_zalloc((1 << btp->bt_hashshift) *
-					sizeof(xfs_bufhash_t), KM_SLEEP | KM_LARGE);
+	btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) *
+					 sizeof(xfs_bufhash_t));
 	for (i = 0; i < (1 << btp->bt_hashshift); i++) {
 		spin_lock_init(&btp->bt_hash[i].bh_lock);
 		INIT_LIST_HEAD(&btp->bt_hash[i].bh_list);
@@ -1390,7 +1519,7 @@ STATIC void
 xfs_free_bufhash(
 	xfs_buftarg_t		*btp)
 {
-	kmem_free(btp->bt_hash);
+	kmem_free_large(btp->bt_hash);
 	btp->bt_hash = NULL;
 }
 
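
Note that the allocation and free sides change as a pair: kmem_zalloc(..., KM_SLEEP | KM_LARGE) becomes kmem_zalloc_large(), and the matching kmem_free() becomes kmem_free_large(). Assuming the _large variants are vmalloc-backed (which is how XFS implemented them in this era), mixing the pairs would hand a vmalloc pointer to the slab free path. A sketch of that assumed implementation, with hypothetical names:

/* Hypothetical stand-ins for the _large helpers, shown only to make the
 * pairing requirement concrete. */
static void *zalloc_large(size_t size)
{
	void *ptr = vmalloc(size);	/* virtually contiguous allocation */

	if (ptr)
		memset(ptr, 0, size);
	return ptr;
}

static void free_large(void *ptr)
{
	vfree(ptr);	/* must pair with vmalloc; kfree() would be a bug */
}
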
@@ -1595,6 +1724,11 @@ xfs_buf_delwri_queue(
 		list_del(&bp->b_list);
 	}
 
+	if (list_empty(dwq)) {
+		/* start xfsbufd as it is about to have something to do */
+		wake_up_process(bp->b_target->bt_task);
+	}
+
 	bp->b_flags |= _XBF_DELWRI_Q;
 	list_add_tail(&bp->b_list, dwq);
 	bp->b_queuetime = jiffies;
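
The wakeup fires only on the empty-to-non-empty transition of the delwri queue, which pairs with the xfsbufd change further down that sleeps indefinitely while the queue is empty. A generic sketch of the pattern (illustrative names, with the queue lock held by the caller as in the patch):

static void queue_and_kick(struct list_head *q, struct list_head *item,
			   struct task_struct *flusher)
{
	/* Only the insertion that makes the queue non-empty needs to wake
	 * the flusher; after that its own timeout keeps it running. */
	if (list_empty(q))
		wake_up_process(flusher);
	list_add_tail(item, q);
}
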
@@ -1626,6 +1760,35 @@ xfs_buf_delwri_dequeue(
 	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
 }
 
+/*
+ * If a delwri buffer needs to be pushed before it has aged out, then promote
+ * it to the head of the delwri queue so that it will be flushed on the next
+ * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
+ * than the age currently needed to flush the buffer. Hence the next time the
+ * xfsbufd sees it is guaranteed to be considered old enough to flush.
+ */
+void
+xfs_buf_delwri_promote(
+	struct xfs_buf	*bp)
+{
+	struct xfs_buftarg *btp = bp->b_target;
+	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
+
+	ASSERT(bp->b_flags & XBF_DELWRI);
+	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+
+	/*
+	 * Check the buffer age before locking the delayed write queue as we
+	 * don't need to promote buffers that are already past the flush age.
+	 */
+	if (bp->b_queuetime < jiffies - age)
+		return;
+	bp->b_queuetime = jiffies - age;
+	spin_lock(&btp->bt_delwrite_lock);
+	list_move(&bp->b_list, &btp->bt_delwrite_queue);
+	spin_unlock(&btp->bt_delwrite_lock);
+}
+
 STATIC void
 xfs_buf_runall_queues(
 	struct workqueue_struct	*queue)
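
The promotion is pure jiffies arithmetic: age is one tick more than the flush age, so rewriting b_queuetime to jiffies - age makes the buffer strictly older than the threshold on the very next xfsbufd pass. A small userspace demonstration of that arithmetic, with plain longs standing in for jiffies:

#include <assert.h>

int main(void)
{
	long jiffies = 100000;		/* pretend current tick counter */
	long flush_age = 1500;		/* ticks before a buffer is flushed */
	long queuetime = jiffies;	/* buffer queued just now */

	/* The flusher picks a buffer once its queuetime falls before
	 * (jiffies - flush_age). */
	assert(!(queuetime < jiffies - flush_age));	/* too young: skipped */

	/* Promote: backdate by flush_age + 1 ticks, as the patch does. */
	queuetime = jiffies - (flush_age + 1);
	assert(queuetime < jiffies - flush_age);	/* flushed next pass */
	return 0;
}
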
@@ -1644,6 +1807,8 @@ xfsbufd_wakeup(
 	list_for_each_entry(btp, &xfs_buftarg_list, bt_list) {
 		if (test_bit(XBT_FORCE_SLEEP, &btp->bt_flags))
 			continue;
+		if (list_empty(&btp->bt_delwrite_queue))
+			continue;
 		set_bit(XBT_FORCE_FLUSH, &btp->bt_flags);
 		wake_up_process(btp->bt_task);
 	}
@@ -1694,20 +1859,53 @@ xfs_buf_delwri_split(
 
 }
 
+/*
+ * Compare function is more complex than it needs to be because
+ * the return value is only 32 bits and we are doing comparisons
+ * on 64 bit values
+ */
+static int
+xfs_buf_cmp(
+	void		*priv,
+	struct list_head *a,
+	struct list_head *b)
+{
+	struct xfs_buf	*ap = container_of(a, struct xfs_buf, b_list);
+	struct xfs_buf	*bp = container_of(b, struct xfs_buf, b_list);
+	xfs_daddr_t	diff;
+
+	diff = ap->b_bn - bp->b_bn;
+	if (diff < 0)
+		return -1;
+	if (diff > 0)
+		return 1;
+	return 0;
+}
+
+void
+xfs_buf_delwri_sort(
+	xfs_buftarg_t	*target,
+	struct list_head *list)
+{
+	list_sort(NULL, list, xfs_buf_cmp);
+}
+
 STATIC int
 xfsbufd(
 	void		*data)
 {
-	struct list_head tmp;
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-	int		count;
-	xfs_buf_t	*bp;
+	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
 
 	current->flags |= PF_MEMALLOC;
 
 	set_freezable();
 
 	do {
+		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
+		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
+		int	count = 0;
+		struct list_head tmp;
+
 		if (unlikely(freezing(current))) {
 			set_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 			refrigerator();
@@ -1715,17 +1913,16 @@ xfsbufd(
 			clear_bit(XBT_FORCE_SLEEP, &target->bt_flags);
 		}
 
-		schedule_timeout_interruptible(
-			xfs_buf_timer_centisecs * msecs_to_jiffies(10));
-
-		xfs_buf_delwri_split(target, &tmp,
-				xfs_buf_age_centisecs * msecs_to_jiffies(10));
+		/* sleep for a long time if there is nothing to do. */
+		if (list_empty(&target->bt_delwrite_queue))
+			tout = MAX_SCHEDULE_TIMEOUT;
+		schedule_timeout_interruptible(tout);
 
-		count = 0;
+		xfs_buf_delwri_split(target, &tmp, age);
+		list_sort(NULL, &tmp, xfs_buf_cmp);
 		while (!list_empty(&tmp)) {
-			bp = list_entry(tmp.next, xfs_buf_t, b_list);
-			ASSERT(target == bp->b_target);
-
+			struct xfs_buf *bp;
+			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
 			list_del_init(&bp->b_list);
 			xfs_buf_iostrategy(bp);
 			count++;
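
The comment on xfs_buf_cmp() is worth a concrete example: the tempting one-liner return ap->b_bn - bp->b_bn; would truncate a 64-bit xfs_daddr_t difference into the 32-bit int that list_sort() expects, and the low 32 bits can be zero or carry the wrong sign even when the block numbers differ. A runnable userspace demonstration:

#include <stdint.h>
#include <stdio.h>

static int cmp_broken(int64_t a, int64_t b)
{
	return (int)(a - b);	/* keeps only the low 32 bits */
}

static int cmp_correct(int64_t a, int64_t b)
{
	int64_t diff = a - b;

	if (diff < 0)
		return -1;
	if (diff > 0)
		return 1;
	return 0;
}

int main(void)
{
	int64_t a = INT64_C(1) << 32;	/* two block numbers 2^32 apart */
	int64_t b = 0;

	/* Prints "broken: 0, correct: 1": truncation calls them equal. */
	printf("broken: %d, correct: %d\n",
	       cmp_broken(a, b), cmp_correct(a, b));
	return 0;
}
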
@@ -1751,42 +1948,45 @@ xfs_flush_buftarg(
 	xfs_buftarg_t	*target,
 	int		wait)
 {
-	struct list_head tmp;
-	xfs_buf_t	*bp, *n;
+	xfs_buf_t	*bp;
 	int		pincount = 0;
+	LIST_HEAD(tmp_list);
+	LIST_HEAD(wait_list);
 
 	xfs_buf_runall_queues(xfsconvertd_workqueue);
 	xfs_buf_runall_queues(xfsdatad_workqueue);
 	xfs_buf_runall_queues(xfslogd_workqueue);
 
 	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp, 0);
+	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
 
 	/*
-	 * Dropped the delayed write list lock, now walk the temporary list
+	 * Dropped the delayed write list lock, now walk the temporary list.
+	 * All I/O is issued async and then if we need to wait for completion
+	 * we do that after issuing all the IO.
 	 */
-	list_for_each_entry_safe(bp, n, &tmp, b_list) {
+	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	while (!list_empty(&tmp_list)) {
+		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
 		ASSERT(target == bp->b_target);
-		if (wait)
+		list_del_init(&bp->b_list);
+		if (wait) {
 			bp->b_flags &= ~XBF_ASYNC;
-		else
-			list_del_init(&bp->b_list);
-
+			list_add(&bp->b_list, &wait_list);
+		}
 		xfs_buf_iostrategy(bp);
 	}
 
-	if (wait)
+	if (wait) {
+		/* Expedite and wait for IO to complete. */
 		blk_run_address_space(target->bt_mapping);
+		while (!list_empty(&wait_list)) {
+			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
 
-	/*
-	 * Remaining list items must be flushed before returning
-	 */
-	while (!list_empty(&tmp)) {
-		bp = list_entry(tmp.next, xfs_buf_t, b_list);
-
-		list_del_init(&bp->b_list);
-		xfs_iowait(bp);
-		xfs_buf_relse(bp);
+			list_del_init(&bp->b_list);
+			xfs_iowait(bp);
+			xfs_buf_relse(bp);
+		}
 	}
 
 	return pincount;
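
The restructured xfs_flush_buftarg() takes a submit-all-then-wait shape: sort the private list, issue every buffer asynchronously while parking the ones a sync caller cares about on wait_list, kick the block device once, and only then block on completions. The elevator therefore sees the whole sorted batch before anything sleeps. A generic sketch of the shape (illustrative types; submit_async() and wait_done() are assumed helpers, not kernel APIs):

struct req { struct list_head list; };

static void flush_all(struct list_head *pending, int wait)
{
	LIST_HEAD(wait_list);
	struct req *r;

	while (!list_empty(pending)) {
		r = list_first_entry(pending, struct req, list);
		list_del_init(&r->list);
		if (wait)
			list_add(&r->list, &wait_list);	/* park for later */
		submit_async(r);	/* issue the I/O, do not block yet */
	}

	while (wait && !list_empty(&wait_list)) {
		r = list_first_entry(&wait_list, struct req, list);
		list_del_init(&r->list);
		wait_done(r);	/* block on completions, one at a time */
	}
}
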