path: root/fs/btrfs/extent_io.c
author	Chris Mason <chris.mason@oracle.com>	2008-10-29 14:49:59 -0400
committer	Chris Mason <chris.mason@oracle.com>	2008-10-29 14:49:59 -0400
commit	c8b978188c9a0fd3d535c13debd19d522b726f1f (patch)
tree	873628723fb82fe2a7c77adc65fa93eca1d61c0c /fs/btrfs/extent_io.c
parent	26ce34a9c47334ff7984769e4661b2f1883594ff (diff)
Btrfs: Add zlib compression support
This is a large change for adding compression on reading and writing, both for inline and regular extents. It does some fairly large surgery to the writeback paths.

Compression is off by default and enabled by mount -o compress. Even when the -o compress mount option is not used, it is possible to read compressed extents off the disk. If compression for a given set of pages fails to make them smaller, the file is flagged to avoid future compression attempts.

* While finding delalloc extents, the pages are locked before being sent down to the delalloc handler. This allows the delalloc handler to do complex things such as cleaning the pages, marking them writeback and starting IO on their behalf.

* Inline extents are inserted at delalloc time now. This allows us to compress the data before inserting the inline extent, and it allows us to insert an inline extent that spans multiple pages.

* All of the in-memory extent representations (extent_map.c, ordered-data.c etc) are changed to record both an in-memory size and an on disk size, as well as a flag for compression.

From a disk format point of view, the extent pointers in the file are changed to record the on disk size of a given extent and some encoding flags. Space in the disk format is allocated for compression encoding, as well as encryption and a generic 'other' field. Neither the encryption nor the 'other' field is currently used.

In order to limit the amount of data read for a single random read in the file, the size of a compressed extent is limited to 128k. This is a software only limit; the disk format supports u64 sized compressed extents.

In order to limit the ram consumed while processing extents, the uncompressed size of a compressed extent is limited to 256k. This is a software only limit and will be subject to tuning later.

Checksumming is still done on compressed extents, and it is done on the uncompressed version of the data. This way additional encodings can be layered on without having to figure out which encoding to checksum.

Compression happens at delalloc time, which is basically single threaded because it is usually done by a single pdflush thread. This makes it tricky to spread the compression load across all the cpus on the box. We'll have to look at parallel pdflush walks of dirty inodes at a later time.

Decompression is hooked into readpages and it does spread across CPUs nicely.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
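For readers of the log message, the size limits and the compression fallback described above amount to a small decision: compress at most 256k of data at a time, keep the result only if it actually shrank and fits the 128k on-disk extent limit, and otherwise flag the file so later writes skip the attempt. The sketch below is illustrative only and not part of the patch; the helper name and constants (should_keep_compressed, MAX_UNCOMPRESSED_EXTENT, MAX_COMPRESSED_EXTENT) are hypothetical, and the real policy lives in the inode and zlib changes that accompany this file.

/*
 * Illustrative sketch only -- hypothetical names, plain C.
 * Mirrors the policy stated in the commit message.
 */
#include <stdbool.h>
#include <stddef.h>

#define MAX_UNCOMPRESSED_EXTENT	(256 * 1024)	/* software limit on input per extent */
#define MAX_COMPRESSED_EXTENT	(128 * 1024)	/* software limit on on-disk extent size */

static bool should_keep_compressed(size_t uncompressed_len,
				   size_t compressed_len,
				   bool *flag_no_compress)
{
	/* ranges larger than this are split up before compression is tried */
	if (uncompressed_len > MAX_UNCOMPRESSED_EXTENT)
		return false;

	/* if compression did not shrink the data, flag the file so
	 * future writes skip the attempt entirely */
	if (compressed_len >= uncompressed_len) {
		*flag_no_compress = true;
		return false;
	}

	/* on-disk compressed extents are kept within the 128k limit */
	if (compressed_len > MAX_COMPRESSED_EXTENT)
		return false;

	return true;
}

Compression itself is turned on with the mount option quoted above (mount -o compress); reading already compressed extents works whether or not the option is set.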
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--	fs/btrfs/extent_io.c	411
1 file changed, 323 insertions, 88 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache;
 static LIST_HEAD(buffers);
 static LIST_HEAD(states);
 
+#define LEAK_DEBUG 1
 #ifdef LEAK_DEBUG
 static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED;
 #endif
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state);
  *
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
-static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
+static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
 					u64 *start, u64 *end, u64 max_bytes)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree,
 	u64 total_bytes = 0;
 
 	spin_lock_irq(&tree->lock);
+
 	/*
 	 * this search will find all the extents that end after
 	 * our range starts.
 	 */
-search_again:
 	node = tree_search(tree, cur_start);
 	if (!node) {
 		if (!found)
@@ -1100,40 +1101,6 @@ search_again:
 			*end = state->end;
 			goto out;
 		}
-		if (!found && !(state->state & EXTENT_BOUNDARY)) {
-			struct extent_state *prev_state;
-			struct rb_node *prev_node = node;
-			while(1) {
-				prev_node = rb_prev(prev_node);
-				if (!prev_node)
-					break;
-				prev_state = rb_entry(prev_node,
-						      struct extent_state,
-						      rb_node);
-				if ((prev_state->end + 1 != state->start) ||
-				    !(prev_state->state & EXTENT_DELALLOC))
-					break;
-				if ((cur_start - prev_state->start) * 2 >
-				     max_bytes)
-					break;
-				state = prev_state;
-				node = prev_node;
-			}
-		}
-		if (state->state & EXTENT_LOCKED) {
-			DEFINE_WAIT(wait);
-			atomic_inc(&state->refs);
-			prepare_to_wait(&state->wq, &wait,
-					TASK_UNINTERRUPTIBLE);
-			spin_unlock_irq(&tree->lock);
-			schedule();
-			spin_lock_irq(&tree->lock);
-			finish_wait(&state->wq, &wait);
-			free_extent_state(state);
-			goto search_again;
-		}
-		set_state_cb(tree, state, EXTENT_LOCKED);
-		state->state |= EXTENT_LOCKED;
 		if (!found)
 			*start = state->start;
 		found++;
@@ -1151,6 +1118,208 @@ out:
 	return found;
 }
 
+static noinline int __unlock_for_delalloc(struct inode *inode,
+					  struct page *locked_page,
+					  u64 start, u64 end)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+
+	if (index == locked_page->index && end_index == index)
+		return 0;
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] != locked_page)
+				unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+
+static noinline int lock_delalloc_pages(struct inode *inode,
+					struct page *locked_page,
+					u64 delalloc_start,
+					u64 delalloc_end)
+{
+	unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT;
+	unsigned long start_index = index;
+	unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT;
+	unsigned long pages_locked = 0;
+	struct page *pages[16];
+	unsigned long nrpages;
+	int ret;
+	int i;
+
+	/* the caller is responsible for locking the start index */
+	if (index == locked_page->index && index == end_index)
+		return 0;
+
+	/* skip the page at the start index */
+	nrpages = end_index - index + 1;
+	while(nrpages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nrpages, ARRAY_SIZE(pages)), pages);
+		if (ret == 0) {
+			ret = -EAGAIN;
+			goto done;
+		}
+		/* now we have an array of pages, lock them all */
+		for (i = 0; i < ret; i++) {
+			/*
+			 * the caller is taking responsibility for
+			 * locked_page
+			 */
+			if (pages[i] != locked_page)
+				lock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		pages_locked += ret;
+		nrpages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	ret = 0;
+done:
+	if (ret && pages_locked) {
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start,
+			      ((u64)(start_index + pages_locked - 1)) <<
+			      PAGE_CACHE_SHIFT);
+	}
+	return ret;
+}
+
+/*
+ * find a contiguous range of bytes in the file marked as delalloc, not
+ * more than 'max_bytes'. start and end are used to return the range,
+ *
+ * 1 is returned if we find something, 0 if nothing was in the tree
+ */
+static noinline u64 find_lock_delalloc_range(struct inode *inode,
+					     struct extent_io_tree *tree,
+					     struct page *locked_page,
+					     u64 *start, u64 *end,
+					     u64 max_bytes)
+{
+	u64 delalloc_start;
+	u64 delalloc_end;
+	u64 found;
+	int ret;
+	int loops = 0;
+
+again:
+	/* step one, find a bunch of delalloc bytes starting at start */
+	delalloc_start = *start;
+	delalloc_end = 0;
+	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
+				    max_bytes);
+	if (!found) {
+		*start = delalloc_start;
+		*end = delalloc_end;
+		return found;
+	}
+
+	/*
+	 * make sure to limit the number of pages we try to lock down
+	 * if we're looping.
+	 */
+	if (delalloc_end + 1 - delalloc_start > max_bytes && loops) {
+		delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) &
+			~((u64)PAGE_CACHE_SIZE - 1);
+	}
+	/* step two, lock all the pages after the page that has start */
+	ret = lock_delalloc_pages(inode, locked_page,
+				  delalloc_start, delalloc_end);
+	if (ret == -EAGAIN) {
+		/* some of the pages are gone, lets avoid looping by
+		 * shortening the size of the delalloc range we're searching
+		 */
+		if (!loops) {
+			unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1);
+			max_bytes = PAGE_CACHE_SIZE - offset;
+			loops = 1;
+			goto again;
+		} else {
+			found = 0;
+			goto out_failed;
+		}
+	}
+	BUG_ON(ret);
+
+	/* step three, lock the state bits for the whole range */
+	lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+
+	/* then test to make sure it is all still delalloc */
+	ret = test_range_bit(tree, delalloc_start, delalloc_end,
+			     EXTENT_DELALLOC, 1);
+	if (!ret) {
+		unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS);
+		__unlock_for_delalloc(inode, locked_page,
+			      delalloc_start, delalloc_end);
+		cond_resched();
+		goto again;
+	}
+	*start = delalloc_start;
+	*end = delalloc_end;
+out_failed:
+	return found;
+}
+
+int extent_clear_unlock_delalloc(struct inode *inode,
+				 struct extent_io_tree *tree,
+				 u64 start, u64 end, struct page *locked_page,
+				 int clear_dirty, int set_writeback,
+				 int end_writeback)
+{
+	int ret;
+	struct page *pages[16];
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = end_index - index + 1;
+	int i;
+	int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
+
+	if (clear_dirty)
+		clear_bits |= EXTENT_DIRTY;
+
+	clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS);
+
+	while(nr_pages > 0) {
+		ret = find_get_pages_contig(inode->i_mapping, index,
+				     min(nr_pages, ARRAY_SIZE(pages)), pages);
+		for (i = 0; i < ret; i++) {
+			if (pages[i] == locked_page) {
+				page_cache_release(pages[i]);
+				continue;
+			}
+			if (clear_dirty)
+				clear_page_dirty_for_io(pages[i]);
+			if (set_writeback)
+				set_page_writeback(pages[i]);
+			if (end_writeback)
+				end_page_writeback(pages[i]);
+			unlock_page(pages[i]);
+			page_cache_release(pages[i]);
+		}
+		nr_pages -= ret;
+		index += ret;
+		cond_resched();
+	}
+	return 0;
+}
+EXPORT_SYMBOL(extent_clear_unlock_delalloc);
+
 /*
  * count the number of bytes in the tree that have a given bit(s)
  * set. This can be fairly slow, except for EXTENT_DIRTY which is
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 	return bio;
 }
 
-static int submit_one_bio(int rw, struct bio *bio, int mirror_num)
+static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
+			  unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct page *page = bvec->bv_page;
 	struct extent_io_tree *tree = bio->bi_private;
-	struct rb_node *node;
-	struct extent_state *state;
 	u64 start;
 	u64 end;
 
 	start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
 	end = start + bvec->bv_len - 1;
 
-	spin_lock_irq(&tree->lock);
-	node = __etree_search(tree, start, NULL, NULL);
-	BUG_ON(!node);
-	state = rb_entry(node, struct extent_state, rb_node);
-	while(state->end < end) {
-		node = rb_next(node);
-		state = rb_entry(node, struct extent_state, rb_node);
-	}
-	BUG_ON(state->end != end);
-	spin_unlock_irq(&tree->lock);
-
 	bio->bi_private = NULL;
 
 	bio_get(bio);
 
 	if (tree->ops && tree->ops->submit_bio_hook)
 		tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-					   mirror_num);
+					   mirror_num, bio_flags);
 	else
 		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
 			      struct bio **bio_ret,
 			      unsigned long max_pages,
 			      bio_end_io_t end_io_func,
-			      int mirror_num)
+			      int mirror_num,
+			      unsigned long prev_bio_flags,
+			      unsigned long bio_flags)
 {
 	int ret = 0;
 	struct bio *bio;
 	int nr;
+	int contig = 0;
+	int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED;
+	int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED;
+	size_t page_size = min(size, PAGE_CACHE_SIZE);
 
 	if (bio_ret && *bio_ret) {
 		bio = *bio_ret;
-		if (bio->bi_sector + (bio->bi_size >> 9) != sector ||
+		if (old_compressed)
+			contig = bio->bi_sector == sector;
+		else
+			contig = bio->bi_sector + (bio->bi_size >> 9) ==
+				sector;
+
+		if (prev_bio_flags != bio_flags || !contig ||
 		    (tree->ops && tree->ops->merge_bio_hook &&
-		     tree->ops->merge_bio_hook(page, offset, size, bio)) ||
-		    bio_add_page(bio, page, size, offset) < size) {
-			ret = submit_one_bio(rw, bio, mirror_num);
+		     tree->ops->merge_bio_hook(page, offset, page_size, bio,
+					       bio_flags)) ||
+		    bio_add_page(bio, page, page_size, offset) < page_size) {
+			ret = submit_one_bio(rw, bio, mirror_num,
+					     prev_bio_flags);
 			bio = NULL;
 		} else {
 			return 0;
 		}
 	}
-	nr = bio_get_nr_vecs(bdev);
+	if (this_compressed)
+		nr = BIO_MAX_PAGES;
+	else
+		nr = bio_get_nr_vecs(bdev);
+
 	bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
 	if (!bio) {
 		printk("failed to allocate bio nr %d\n", nr);
 	}
 
-
-	bio_add_page(bio, page, size, offset);
+	bio_add_page(bio, page, page_size, offset);
 	bio->bi_end_io = end_io_func;
 	bio->bi_private = tree;
 
 	if (bio_ret) {
 		*bio_ret = bio;
 	} else {
-		ret = submit_one_bio(rw, bio, mirror_num);
+		ret = submit_one_bio(rw, bio, mirror_num, bio_flags);
 	}
 
 	return ret;
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len)
 static int __extent_read_full_page(struct extent_io_tree *tree,
 				   struct page *page,
 				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num)
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	int nr = 0;
 	size_t page_offset = 0;
 	size_t iosize;
+	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
 	end = page_end;
 	lock_extent(tree, start, end, GFP_NOFS);
 
+	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
+		char *userpage;
+		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
+
+		if (zero_offset) {
+			iosize = PAGE_CACHE_SIZE - zero_offset;
+			userpage = kmap_atomic(page, KM_USER0);
+			memset(userpage + zero_offset, 0, iosize);
+			flush_dcache_page(page);
+			kunmap_atomic(userpage, KM_USER0);
+		}
+	}
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			char *userpage;
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 		}
 		BUG_ON(end < cur);
 
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+			this_bio_flag = EXTENT_BIO_COMPRESSED;
+
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
 		cur_end = min(extent_map_end(em) - 1, end);
 		iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1);
-		sector = (em->block_start + extent_offset) >> 9;
+		if (this_bio_flag & EXTENT_BIO_COMPRESSED) {
+			disk_io_size = em->block_len;
+			sector = em->block_start >> 9;
+		} else {
+			sector = (em->block_start + extent_offset) >> 9;
+			disk_io_size = iosize;
+		}
 		bdev = em->bdev;
 		block_start = em->block_start;
 		free_extent_map(em);
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
 			unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
 			pnr -= page->index;
 			ret = submit_extent_page(READ, tree, page,
-					 sector, iosize, page_offset,
+					 sector, disk_io_size, page_offset,
 					 bdev, bio, pnr,
-					 end_bio_extent_readpage, mirror_num);
+					 end_bio_extent_readpage, mirror_num,
+					 *bio_flags,
+					 this_bio_flag);
 			nr++;
+			*bio_flags = this_bio_flag;
 		}
 		if (ret)
 			SetPageError(page);
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent)
 {
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0);
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+				      &bio_flags);
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return ret;
 }
 EXPORT_SYMBOL(extent_read_full_page);
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	unsigned long end_index = i_size >> PAGE_CACHE_SHIFT;
 	u64 nr_delalloc;
 	u64 delalloc_end;
+	int page_started;
+	int compressed;
 
 	WARN_ON(!PageLocked(page));
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	delalloc_start = start;
 	delalloc_end = 0;
+	page_started = 0;
 	while(delalloc_end < page_end) {
-		nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start,
+		nr_delalloc = find_lock_delalloc_range(inode, tree,
+						       page,
+						       &delalloc_start,
 						       &delalloc_end,
 						       128 * 1024 * 1024);
 		if (nr_delalloc == 0) {
 			delalloc_start = delalloc_end + 1;
 			continue;
 		}
-		tree->ops->fill_delalloc(inode, delalloc_start,
-					 delalloc_end);
-		clear_extent_bit(tree, delalloc_start,
-				 delalloc_end,
-				 EXTENT_LOCKED | EXTENT_DELALLOC,
-				 1, 0, GFP_NOFS);
+		tree->ops->fill_delalloc(inode, page, delalloc_start,
+					 delalloc_end, &page_started);
 		delalloc_start = delalloc_end + 1;
 	}
+
+	/* did the fill delalloc function already unlock and start the IO? */
+	if (page_started) {
+		return 0;
+	}
+
 	lock_extent(tree, start, page_end, GFP_NOFS);
 	unlock_start = start;
 
 	if (tree->ops && tree->ops->writepage_start_hook) {
-		ret = tree->ops->writepage_start_hook(page, start, page_end);
+		ret = tree->ops->writepage_start_hook(page, start,
+						      page_end);
 		if (ret == -EAGAIN) {
 			unlock_extent(tree, start, page_end, GFP_NOFS);
 			redirty_page_for_writepage(wbc, page);
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 		sector = (em->block_start + extent_offset) >> 9;
 		bdev = em->bdev;
 		block_start = em->block_start;
+		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
 		free_extent_map(em);
 		em = NULL;
 
-		if (block_start == EXTENT_MAP_HOLE ||
+		/*
+		 * compressed and inline extents are written through other
+		 * paths in the FS
+		 */
+		if (compressed || block_start == EXTENT_MAP_HOLE ||
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			unlock_extent(tree, unlock_start, cur + iosize -1,
 				      GFP_NOFS);
 
-			if (tree->ops && tree->ops->writepage_end_io_hook)
+			/*
+			 * end_io notification does not happen here for
+			 * compressed extents
+			 */
+			if (!compressed && tree->ops &&
+			    tree->ops->writepage_end_io_hook)
 				tree->ops->writepage_end_io_hook(page, cur,
 							 cur + iosize - 1,
 							 NULL, 1);
-			cur = cur + iosize;
+			else if (compressed) {
+				/* we don't want to end_page_writeback on
+				 * a compressed extent. this happens
+				 * elsewhere
+				 */
+				nr++;
+			}
+
+			cur += iosize;
 			pg_offset += iosize;
 			unlock_start = cur;
 			continue;
 		}
-
 		/* leave this out until we have a page_mkwrite call */
 		if (0 && !test_range_bit(tree, cur, cur + iosize - 1,
 					 EXTENT_DIRTY, 0)) {
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			pg_offset += iosize;
 			continue;
 		}
+
 		clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS);
 		if (tree->ops && tree->ops->writepage_io_hook) {
 			ret = tree->ops->writepage_io_hook(page, cur,
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			ret = submit_extent_page(WRITE, tree, page, sector,
 						 iosize, pg_offset, bdev,
 						 &epd->bio, max_nr,
-						 end_bio_extent_writepage, 0);
+						 end_bio_extent_writepage,
+						 0, 0, 0);
 			if (ret)
 				SetPageError(page);
 		}
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	extent_write_cache_pages(tree, mapping, &wbc_writepages,
 				 __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree,
 	ret = extent_write_cache_pages(tree, mapping, wbc,
 				       __extent_writepage, &epd);
 	if (epd.bio) {
-		submit_one_bio(WRITE, epd.bio, 0);
+		submit_one_bio(WRITE, epd.bio, 0, 0);
 	}
 	return ret;
 }
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	struct bio *bio = NULL;
 	unsigned page_idx;
 	struct pagevec pvec;
+	unsigned long bio_flags = 0;
 
 	pagevec_init(&pvec, 0);
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree,
 			if (!pagevec_add(&pvec, page))
 				__pagevec_lru_add(&pvec);
 			__extent_read_full_page(tree, page, get_extent,
-						&bio, 0);
+						&bio, 0, &bio_flags);
 		}
 		page_cache_release(page);
 	}
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree,
 		__pagevec_lru_add(&pvec);
 	BUG_ON(!list_empty(pages));
 	if (bio)
-		submit_one_bio(READ, bio, 0);
+		submit_one_bio(READ, bio, 0, bio_flags);
 	return 0;
 }
 EXPORT_SYMBOL(extent_readpages);
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree,
 			ret = submit_extent_page(READ, tree, page,
 					 sector, iosize, page_offset, em->bdev,
 					 NULL, 1,
-					 end_bio_extent_preparewrite, 0);
+					 end_bio_extent_preparewrite, 0,
+					 0, 0);
 			iocount++;
 			block_start = block_start + iosize;
 		} else {
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 		}
 		if (!test_range_bit(tree, em->start,
 				    extent_map_end(em) - 1,
-				    EXTENT_LOCKED, 0)) {
+				    EXTENT_LOCKED | EXTENT_WRITEBACK |
+				    EXTENT_ORDERED,
+				    0)) {
 			remove_extent_mapping(map, em);
 			/* once for the rb tree */
 			free_extent_map(em);
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	int inc_all_pages = 0;
 	unsigned long num_pages;
 	struct bio *bio = NULL;
+	unsigned long bio_flags = 0;
 
 	if (eb->flags & EXTENT_UPTODATE)
 		return 0;
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 			ClearPageError(page);
 			err = __extent_read_full_page(tree, page,
 						      get_extent, &bio,
-						      mirror_num);
+						      mirror_num, &bio_flags);
 			if (err) {
 				ret = err;
 				printk("err %d from __extent_read_full_page\n", ret);
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	}
 
 	if (bio)
-		submit_one_bio(READ, bio, mirror_num);
+		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
 	if (ret || !wait) {
 		if (ret)