author | Chris Mason <chris.mason@oracle.com> | 2008-10-29 14:49:59 -0400 |
---|---|---|
committer | Chris Mason <chris.mason@oracle.com> | 2008-10-29 14:49:59 -0400 |
commit | c8b978188c9a0fd3d535c13debd19d522b726f1f (patch) | |
tree | 873628723fb82fe2a7c77adc65fa93eca1d61c0c /fs/btrfs/extent_io.c | |
parent | 26ce34a9c47334ff7984769e4661b2f1883594ff (diff) |
Btrfs: Add zlib compression support
This is a large change that adds compression on reading and writing,
both for inline and regular extents. It does some fairly large
surgery to the writeback paths.
Compression is off by default and enabled by mount -o compress. Even
when the -o compress mount option is not used, it is possible to read
compressed extents off the disk.
If compression for a given set of pages fails to make them smaller, the
file is flagged to avoid future compression attempts.
* While finding delalloc extents, the pages are locked before being sent down
to the delalloc handler. This allows the delalloc handler to do complex things,
such as cleaning the pages, marking them for writeback, and starting IO on their
behalf.
* Inline extents are inserted at delalloc time now. This allows us to compress
the data before inserting the inline extent, and it allows us to insert
an inline extent that spans multiple pages.
* All of the in-memory extent representations (extent_map.c, ordered-data.c, etc.)
are changed to record both an in-memory size and an on-disk size, as well
as a flag for compression. A minimal sketch of that shape follows this list.
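The split between the in-memory size and the on-disk size is what the extent_io.c changes below lean on when deciding how much to read off disk. The following is a minimal userspace sketch of that idea, using invented names (sketch_extent_map, SKETCH_FLAG_COMPRESSED, sketch_disk_io_size) rather than the real btrfs structures:

```c
/*
 * Userspace sketch only, not the kernel code: an in-memory extent record
 * carries both the uncompressed ("ram") length and the on-disk length,
 * plus a compression flag.  Names are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define SKETCH_FLAG_COMPRESSED (1u << 0)

struct sketch_extent_map {
    uint64_t start;      /* file offset covered by this extent */
    uint64_t ram_len;    /* uncompressed length seen by readers */
    uint64_t disk_start; /* byte offset of the extent on disk */
    uint64_t disk_len;   /* on-disk (compressed) length */
    unsigned long flags; /* SKETCH_FLAG_COMPRESSED, etc. */
};

/* How many bytes must come off disk to satisfy a read in this extent? */
static uint64_t sketch_disk_io_size(const struct sketch_extent_map *em)
{
    /* compressed extents are read whole; plain extents byte for byte */
    return (em->flags & SKETCH_FLAG_COMPRESSED) ? em->disk_len : em->ram_len;
}

int main(void)
{
    struct sketch_extent_map em = {
        .start = 0, .ram_len = 256 * 1024,
        .disk_start = 1 << 20, .disk_len = 96 * 1024,
        .flags = SKETCH_FLAG_COMPRESSED,
    };
    printf("disk IO per read: %llu bytes\n",
           (unsigned long long)sketch_disk_io_size(&em));
    return 0;
}
```

The rule in sketch_disk_io_size mirrors the readpage change further down in this diff, where a compressed extent always uses the whole on-disk extent (disk_io_size = em->block_len) while a plain extent reads only the bytes it needs.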
From a disk format point of view, the extent pointers in the file are changed
to record the on-disk size of a given extent and some encoding flags.
Space in the disk format is allocated for compression encoding, as well
as encryption and a generic 'other' field. Neither the encryption nor the
'other' field is currently used.
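For reference, the real on-disk definition lives in btrfs's ctree.h (struct btrfs_file_extent_item), which is not part of this extent_io.c diff. The struct below is a hedged userspace restatement of it from memory; field names and ordering may not match the header exactly:

```c
/*
 * Sketch of the on-disk extent pointer described above, recalled from
 * btrfs's ctree.h rather than copied from this diff; treat field names
 * and layout as approximate.
 */
#include <stdint.h>

struct sketch_file_extent_item {
    uint64_t generation;     /* transaction that created the extent */
    uint64_t ram_bytes;      /* uncompressed (in-memory) size */
    uint8_t  compression;    /* 0 = none, 1 = zlib */
    uint8_t  encryption;     /* reserved, always 0 for now */
    uint16_t other_encoding; /* reserved generic field, always 0 */
    uint8_t  type;           /* regular, preallocated or inline extent */
    /* the fields below describe where the (possibly compressed) data lives */
    uint64_t disk_bytenr;    /* byte offset of the extent on disk */
    uint64_t disk_num_bytes; /* on-disk size, i.e. compressed length */
    uint64_t offset;         /* logical offset into the extent */
    uint64_t num_bytes;      /* logical (uncompressed) length referenced */
} __attribute__((packed));
```

The compression, encryption and other_encoding fields are the "space allocated for encoding" mentioned above; only compression is interpreted by this patch.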
In order to limit the amount of data read for a single random read in the
file, the size of a compressed extent is limited to 128k. This is a
software-only limit; the disk format supports u64-sized compressed extents.
In order to limit the RAM consumed while processing extents, the uncompressed
size of a compressed extent is limited to 256k. This is a software-only limit
and will be subject to tuning later.
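A tiny illustrative check of how those two caps interact; the constants come from the text above, while the function name and the enforcement style are invented for the example and are not lifted from the btrfs sources:

```c
/* Illustrative-only check of the two software limits quoted above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define MAX_COMPRESSED_EXTENT   (128 * 1024) /* on-disk size cap */
#define MAX_UNCOMPRESSED_EXTENT (256 * 1024) /* RAM (uncompressed) size cap */

static bool extent_within_limits(uint64_t uncompressed_len,
                                 uint64_t compressed_len)
{
    return uncompressed_len <= MAX_UNCOMPRESSED_EXTENT &&
           compressed_len <= MAX_COMPRESSED_EXTENT;
}

int main(void)
{
    /* 256k of input that squeezed down to 90k fits in one extent */
    printf("%d\n", extent_within_limits(256 * 1024, 90 * 1024));
    /* 512k of input must be split across at least two extents */
    printf("%d\n", extent_within_limits(512 * 1024, 120 * 1024));
    return 0;
}
```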
Checksumming is still done on compressed extents, and it is done on the
uncompressed version of the data. This way additional encodings can be
layered on without having to figure out which encoding to checksum.
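The ordering that rule implies can be shown with ordinary userspace zlib (the kernel uses its own zlib wrappers and crc32c rather than plain crc32, so treat this purely as a sketch of the ordering, not of the btrfs code):

```c
/*
 * Sketch: the checksum is computed over the uncompressed bytes, then the
 * data is compressed and stored, so a reader verifies after decompression.
 * Build with: cc sketch.c -lz
 */
#include <stdio.h>
#include <string.h>
#include <zlib.h>

int main(void)
{
    unsigned char data[4096];
    memset(data, 'a', sizeof(data));

    /* 1. checksum the uncompressed bytes (btrfs uses crc32c; plain crc32 here) */
    unsigned long csum = crc32(0L, data, sizeof(data));

    /* 2. compress afterwards; the stored checksum still describes step 1 */
    unsigned char out[8192];
    uLongf out_len = sizeof(out);
    if (compress(out, &out_len, data, sizeof(data)) != Z_OK)
        return 1;

    printf("csum over uncompressed data: %08lx, stored %lu bytes\n",
           csum, out_len);
    return 0;
}
```

Because the stored checksum always describes the uncompressed bytes, a future encoding (encryption, a different compressor) can be stacked on top without changing what gets checksummed.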
Compression happens at delalloc time, which is basically single threaded because
it is usually done by a single pdflush thread. This makes it tricky to
spread the compression load across all the CPUs on the box. We'll have to
look at parallel pdflush walks of dirty inodes at a later time.
Decompression is hooked into readpages and it does spread across CPUs nicely.
Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r-- | fs/btrfs/extent_io.c | 411 |
1 file changed, 323 insertions, 88 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 563b2d12f4f2..314041fdfa43 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -30,6 +30,7 @@ static struct kmem_cache *extent_buffer_cache; | |||
30 | static LIST_HEAD(buffers); | 30 | static LIST_HEAD(buffers); |
31 | static LIST_HEAD(states); | 31 | static LIST_HEAD(states); |
32 | 32 | ||
33 | #define LEAK_DEBUG 1 | ||
33 | #ifdef LEAK_DEBUG | 34 | #ifdef LEAK_DEBUG |
34 | static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; | 35 | static spinlock_t leak_lock = SPIN_LOCK_UNLOCKED; |
35 | #endif | 36 | #endif |
@@ -1067,8 +1068,8 @@ EXPORT_SYMBOL(find_first_extent_bit_state); | |||
1067 | * | 1068 | * |
1068 | * 1 is returned if we find something, 0 if nothing was in the tree | 1069 | * 1 is returned if we find something, 0 if nothing was in the tree |
1069 | */ | 1070 | */ |
1070 | static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, | 1071 | static noinline u64 find_delalloc_range(struct extent_io_tree *tree, |
1071 | u64 *start, u64 *end, u64 max_bytes) | 1072 | u64 *start, u64 *end, u64 max_bytes) |
1072 | { | 1073 | { |
1073 | struct rb_node *node; | 1074 | struct rb_node *node; |
1074 | struct extent_state *state; | 1075 | struct extent_state *state; |
@@ -1077,11 +1078,11 @@ static noinline u64 find_lock_delalloc_range(struct extent_io_tree *tree, | |||
1077 | u64 total_bytes = 0; | 1078 | u64 total_bytes = 0; |
1078 | 1079 | ||
1079 | spin_lock_irq(&tree->lock); | 1080 | spin_lock_irq(&tree->lock); |
1081 | |||
1080 | /* | 1082 | /* |
1081 | * this search will find all the extents that end after | 1083 | * this search will find all the extents that end after |
1082 | * our range starts. | 1084 | * our range starts. |
1083 | */ | 1085 | */ |
1084 | search_again: | ||
1085 | node = tree_search(tree, cur_start); | 1086 | node = tree_search(tree, cur_start); |
1086 | if (!node) { | 1087 | if (!node) { |
1087 | if (!found) | 1088 | if (!found) |
@@ -1100,40 +1101,6 @@ search_again: | |||
1100 | *end = state->end; | 1101 | *end = state->end; |
1101 | goto out; | 1102 | goto out; |
1102 | } | 1103 | } |
1103 | if (!found && !(state->state & EXTENT_BOUNDARY)) { | ||
1104 | struct extent_state *prev_state; | ||
1105 | struct rb_node *prev_node = node; | ||
1106 | while(1) { | ||
1107 | prev_node = rb_prev(prev_node); | ||
1108 | if (!prev_node) | ||
1109 | break; | ||
1110 | prev_state = rb_entry(prev_node, | ||
1111 | struct extent_state, | ||
1112 | rb_node); | ||
1113 | if ((prev_state->end + 1 != state->start) || | ||
1114 | !(prev_state->state & EXTENT_DELALLOC)) | ||
1115 | break; | ||
1116 | if ((cur_start - prev_state->start) * 2 > | ||
1117 | max_bytes) | ||
1118 | break; | ||
1119 | state = prev_state; | ||
1120 | node = prev_node; | ||
1121 | } | ||
1122 | } | ||
1123 | if (state->state & EXTENT_LOCKED) { | ||
1124 | DEFINE_WAIT(wait); | ||
1125 | atomic_inc(&state->refs); | ||
1126 | prepare_to_wait(&state->wq, &wait, | ||
1127 | TASK_UNINTERRUPTIBLE); | ||
1128 | spin_unlock_irq(&tree->lock); | ||
1129 | schedule(); | ||
1130 | spin_lock_irq(&tree->lock); | ||
1131 | finish_wait(&state->wq, &wait); | ||
1132 | free_extent_state(state); | ||
1133 | goto search_again; | ||
1134 | } | ||
1135 | set_state_cb(tree, state, EXTENT_LOCKED); | ||
1136 | state->state |= EXTENT_LOCKED; | ||
1137 | if (!found) | 1104 | if (!found) |
1138 | *start = state->start; | 1105 | *start = state->start; |
1139 | found++; | 1106 | found++; |
@@ -1151,6 +1118,208 @@ out: | |||
1151 | return found; | 1118 | return found; |
1152 | } | 1119 | } |
1153 | 1120 | ||
1121 | static noinline int __unlock_for_delalloc(struct inode *inode, | ||
1122 | struct page *locked_page, | ||
1123 | u64 start, u64 end) | ||
1124 | { | ||
1125 | int ret; | ||
1126 | struct page *pages[16]; | ||
1127 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1128 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1129 | unsigned long nr_pages = end_index - index + 1; | ||
1130 | int i; | ||
1131 | |||
1132 | if (index == locked_page->index && end_index == index) | ||
1133 | return 0; | ||
1134 | |||
1135 | while(nr_pages > 0) { | ||
1136 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1137 | min(nr_pages, ARRAY_SIZE(pages)), pages); | ||
1138 | for (i = 0; i < ret; i++) { | ||
1139 | if (pages[i] != locked_page) | ||
1140 | unlock_page(pages[i]); | ||
1141 | page_cache_release(pages[i]); | ||
1142 | } | ||
1143 | nr_pages -= ret; | ||
1144 | index += ret; | ||
1145 | cond_resched(); | ||
1146 | } | ||
1147 | return 0; | ||
1148 | } | ||
1149 | |||
1150 | static noinline int lock_delalloc_pages(struct inode *inode, | ||
1151 | struct page *locked_page, | ||
1152 | u64 delalloc_start, | ||
1153 | u64 delalloc_end) | ||
1154 | { | ||
1155 | unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; | ||
1156 | unsigned long start_index = index; | ||
1157 | unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; | ||
1158 | unsigned long pages_locked = 0; | ||
1159 | struct page *pages[16]; | ||
1160 | unsigned long nrpages; | ||
1161 | int ret; | ||
1162 | int i; | ||
1163 | |||
1164 | /* the caller is responsible for locking the start index */ | ||
1165 | if (index == locked_page->index && index == end_index) | ||
1166 | return 0; | ||
1167 | |||
1168 | /* skip the page at the start index */ | ||
1169 | nrpages = end_index - index + 1; | ||
1170 | while(nrpages > 0) { | ||
1171 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1172 | min(nrpages, ARRAY_SIZE(pages)), pages); | ||
1173 | if (ret == 0) { | ||
1174 | ret = -EAGAIN; | ||
1175 | goto done; | ||
1176 | } | ||
1177 | /* now we have an array of pages, lock them all */ | ||
1178 | for (i = 0; i < ret; i++) { | ||
1179 | /* | ||
1180 | * the caller is taking responsibility for | ||
1181 | * locked_page | ||
1182 | */ | ||
1183 | if (pages[i] != locked_page) | ||
1184 | lock_page(pages[i]); | ||
1185 | page_cache_release(pages[i]); | ||
1186 | } | ||
1187 | pages_locked += ret; | ||
1188 | nrpages -= ret; | ||
1189 | index += ret; | ||
1190 | cond_resched(); | ||
1191 | } | ||
1192 | ret = 0; | ||
1193 | done: | ||
1194 | if (ret && pages_locked) { | ||
1195 | __unlock_for_delalloc(inode, locked_page, | ||
1196 | delalloc_start, | ||
1197 | ((u64)(start_index + pages_locked - 1)) << | ||
1198 | PAGE_CACHE_SHIFT); | ||
1199 | } | ||
1200 | return ret; | ||
1201 | } | ||
1202 | |||
1203 | /* | ||
1204 | * find a contiguous range of bytes in the file marked as delalloc, not | ||
1205 | * more than 'max_bytes'. start and end are used to return the range, | ||
1206 | * | ||
1207 | * 1 is returned if we find something, 0 if nothing was in the tree | ||
1208 | */ | ||
1209 | static noinline u64 find_lock_delalloc_range(struct inode *inode, | ||
1210 | struct extent_io_tree *tree, | ||
1211 | struct page *locked_page, | ||
1212 | u64 *start, u64 *end, | ||
1213 | u64 max_bytes) | ||
1214 | { | ||
1215 | u64 delalloc_start; | ||
1216 | u64 delalloc_end; | ||
1217 | u64 found; | ||
1218 | int ret; | ||
1219 | int loops = 0; | ||
1220 | |||
1221 | again: | ||
1222 | /* step one, find a bunch of delalloc bytes starting at start */ | ||
1223 | delalloc_start = *start; | ||
1224 | delalloc_end = 0; | ||
1225 | found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, | ||
1226 | max_bytes); | ||
1227 | if (!found) { | ||
1228 | *start = delalloc_start; | ||
1229 | *end = delalloc_end; | ||
1230 | return found; | ||
1231 | } | ||
1232 | |||
1233 | /* | ||
1234 | * make sure to limit the number of pages we try to lock down | ||
1235 | * if we're looping. | ||
1236 | */ | ||
1237 | if (delalloc_end + 1 - delalloc_start > max_bytes && loops) { | ||
1238 | delalloc_end = (delalloc_start + PAGE_CACHE_SIZE - 1) & | ||
1239 | ~((u64)PAGE_CACHE_SIZE - 1); | ||
1240 | } | ||
1241 | /* step two, lock all the pages after the page that has start */ | ||
1242 | ret = lock_delalloc_pages(inode, locked_page, | ||
1243 | delalloc_start, delalloc_end); | ||
1244 | if (ret == -EAGAIN) { | ||
1245 | /* some of the pages are gone, lets avoid looping by | ||
1246 | * shortening the size of the delalloc range we're searching | ||
1247 | */ | ||
1248 | if (!loops) { | ||
1249 | unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); | ||
1250 | max_bytes = PAGE_CACHE_SIZE - offset; | ||
1251 | loops = 1; | ||
1252 | goto again; | ||
1253 | } else { | ||
1254 | found = 0; | ||
1255 | goto out_failed; | ||
1256 | } | ||
1257 | } | ||
1258 | BUG_ON(ret); | ||
1259 | |||
1260 | /* step three, lock the state bits for the whole range */ | ||
1261 | lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); | ||
1262 | |||
1263 | /* then test to make sure it is all still delalloc */ | ||
1264 | ret = test_range_bit(tree, delalloc_start, delalloc_end, | ||
1265 | EXTENT_DELALLOC, 1); | ||
1266 | if (!ret) { | ||
1267 | unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); | ||
1268 | __unlock_for_delalloc(inode, locked_page, | ||
1269 | delalloc_start, delalloc_end); | ||
1270 | cond_resched(); | ||
1271 | goto again; | ||
1272 | } | ||
1273 | *start = delalloc_start; | ||
1274 | *end = delalloc_end; | ||
1275 | out_failed: | ||
1276 | return found; | ||
1277 | } | ||
1278 | |||
1279 | int extent_clear_unlock_delalloc(struct inode *inode, | ||
1280 | struct extent_io_tree *tree, | ||
1281 | u64 start, u64 end, struct page *locked_page, | ||
1282 | int clear_dirty, int set_writeback, | ||
1283 | int end_writeback) | ||
1284 | { | ||
1285 | int ret; | ||
1286 | struct page *pages[16]; | ||
1287 | unsigned long index = start >> PAGE_CACHE_SHIFT; | ||
1288 | unsigned long end_index = end >> PAGE_CACHE_SHIFT; | ||
1289 | unsigned long nr_pages = end_index - index + 1; | ||
1290 | int i; | ||
1291 | int clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC; | ||
1292 | |||
1293 | if (clear_dirty) | ||
1294 | clear_bits |= EXTENT_DIRTY; | ||
1295 | |||
1296 | clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); | ||
1297 | |||
1298 | while(nr_pages > 0) { | ||
1299 | ret = find_get_pages_contig(inode->i_mapping, index, | ||
1300 | min(nr_pages, ARRAY_SIZE(pages)), pages); | ||
1301 | for (i = 0; i < ret; i++) { | ||
1302 | if (pages[i] == locked_page) { | ||
1303 | page_cache_release(pages[i]); | ||
1304 | continue; | ||
1305 | } | ||
1306 | if (clear_dirty) | ||
1307 | clear_page_dirty_for_io(pages[i]); | ||
1308 | if (set_writeback) | ||
1309 | set_page_writeback(pages[i]); | ||
1310 | if (end_writeback) | ||
1311 | end_page_writeback(pages[i]); | ||
1312 | unlock_page(pages[i]); | ||
1313 | page_cache_release(pages[i]); | ||
1314 | } | ||
1315 | nr_pages -= ret; | ||
1316 | index += ret; | ||
1317 | cond_resched(); | ||
1318 | } | ||
1319 | return 0; | ||
1320 | } | ||
1321 | EXPORT_SYMBOL(extent_clear_unlock_delalloc); | ||
1322 | |||
1154 | /* | 1323 | /* |
1155 | * count the number of bytes in the tree that have a given bit(s) | 1324 | * count the number of bytes in the tree that have a given bit(s) |
1156 | * set. This can be fairly slow, except for EXTENT_DIRTY which is | 1325 | * set. This can be fairly slow, except for EXTENT_DIRTY which is |
@@ -1631,38 +1800,26 @@ extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, | |||
1631 | return bio; | 1800 | return bio; |
1632 | } | 1801 | } |
1633 | 1802 | ||
1634 | static int submit_one_bio(int rw, struct bio *bio, int mirror_num) | 1803 | static int submit_one_bio(int rw, struct bio *bio, int mirror_num, |
1804 | unsigned long bio_flags) | ||
1635 | { | 1805 | { |
1636 | int ret = 0; | 1806 | int ret = 0; |
1637 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; | 1807 | struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; |
1638 | struct page *page = bvec->bv_page; | 1808 | struct page *page = bvec->bv_page; |
1639 | struct extent_io_tree *tree = bio->bi_private; | 1809 | struct extent_io_tree *tree = bio->bi_private; |
1640 | struct rb_node *node; | ||
1641 | struct extent_state *state; | ||
1642 | u64 start; | 1810 | u64 start; |
1643 | u64 end; | 1811 | u64 end; |
1644 | 1812 | ||
1645 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | 1813 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; |
1646 | end = start + bvec->bv_len - 1; | 1814 | end = start + bvec->bv_len - 1; |
1647 | 1815 | ||
1648 | spin_lock_irq(&tree->lock); | ||
1649 | node = __etree_search(tree, start, NULL, NULL); | ||
1650 | BUG_ON(!node); | ||
1651 | state = rb_entry(node, struct extent_state, rb_node); | ||
1652 | while(state->end < end) { | ||
1653 | node = rb_next(node); | ||
1654 | state = rb_entry(node, struct extent_state, rb_node); | ||
1655 | } | ||
1656 | BUG_ON(state->end != end); | ||
1657 | spin_unlock_irq(&tree->lock); | ||
1658 | |||
1659 | bio->bi_private = NULL; | 1816 | bio->bi_private = NULL; |
1660 | 1817 | ||
1661 | bio_get(bio); | 1818 | bio_get(bio); |
1662 | 1819 | ||
1663 | if (tree->ops && tree->ops->submit_bio_hook) | 1820 | if (tree->ops && tree->ops->submit_bio_hook) |
1664 | tree->ops->submit_bio_hook(page->mapping->host, rw, bio, | 1821 | tree->ops->submit_bio_hook(page->mapping->host, rw, bio, |
1665 | mirror_num); | 1822 | mirror_num, bio_flags); |
1666 | else | 1823 | else |
1667 | submit_bio(rw, bio); | 1824 | submit_bio(rw, bio); |
1668 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) | 1825 | if (bio_flagged(bio, BIO_EOPNOTSUPP)) |
@@ -1678,39 +1835,56 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, | |||
1678 | struct bio **bio_ret, | 1835 | struct bio **bio_ret, |
1679 | unsigned long max_pages, | 1836 | unsigned long max_pages, |
1680 | bio_end_io_t end_io_func, | 1837 | bio_end_io_t end_io_func, |
1681 | int mirror_num) | 1838 | int mirror_num, |
1839 | unsigned long prev_bio_flags, | ||
1840 | unsigned long bio_flags) | ||
1682 | { | 1841 | { |
1683 | int ret = 0; | 1842 | int ret = 0; |
1684 | struct bio *bio; | 1843 | struct bio *bio; |
1685 | int nr; | 1844 | int nr; |
1845 | int contig = 0; | ||
1846 | int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; | ||
1847 | int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; | ||
1848 | size_t page_size = min(size, PAGE_CACHE_SIZE); | ||
1686 | 1849 | ||
1687 | if (bio_ret && *bio_ret) { | 1850 | if (bio_ret && *bio_ret) { |
1688 | bio = *bio_ret; | 1851 | bio = *bio_ret; |
1689 | if (bio->bi_sector + (bio->bi_size >> 9) != sector || | 1852 | if (old_compressed) |
1853 | contig = bio->bi_sector == sector; | ||
1854 | else | ||
1855 | contig = bio->bi_sector + (bio->bi_size >> 9) == | ||
1856 | sector; | ||
1857 | |||
1858 | if (prev_bio_flags != bio_flags || !contig || | ||
1690 | (tree->ops && tree->ops->merge_bio_hook && | 1859 | (tree->ops && tree->ops->merge_bio_hook && |
1691 | tree->ops->merge_bio_hook(page, offset, size, bio)) || | 1860 | tree->ops->merge_bio_hook(page, offset, page_size, bio, |
1692 | bio_add_page(bio, page, size, offset) < size) { | 1861 | bio_flags)) || |
1693 | ret = submit_one_bio(rw, bio, mirror_num); | 1862 | bio_add_page(bio, page, page_size, offset) < page_size) { |
1863 | ret = submit_one_bio(rw, bio, mirror_num, | ||
1864 | prev_bio_flags); | ||
1694 | bio = NULL; | 1865 | bio = NULL; |
1695 | } else { | 1866 | } else { |
1696 | return 0; | 1867 | return 0; |
1697 | } | 1868 | } |
1698 | } | 1869 | } |
1699 | nr = bio_get_nr_vecs(bdev); | 1870 | if (this_compressed) |
1871 | nr = BIO_MAX_PAGES; | ||
1872 | else | ||
1873 | nr = bio_get_nr_vecs(bdev); | ||
1874 | |||
1700 | bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); | 1875 | bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); |
1701 | if (!bio) { | 1876 | if (!bio) { |
1702 | printk("failed to allocate bio nr %d\n", nr); | 1877 | printk("failed to allocate bio nr %d\n", nr); |
1703 | } | 1878 | } |
1704 | 1879 | ||
1705 | 1880 | bio_add_page(bio, page, page_size, offset); | |
1706 | bio_add_page(bio, page, size, offset); | ||
1707 | bio->bi_end_io = end_io_func; | 1881 | bio->bi_end_io = end_io_func; |
1708 | bio->bi_private = tree; | 1882 | bio->bi_private = tree; |
1709 | 1883 | ||
1710 | if (bio_ret) { | 1884 | if (bio_ret) { |
1711 | *bio_ret = bio; | 1885 | *bio_ret = bio; |
1712 | } else { | 1886 | } else { |
1713 | ret = submit_one_bio(rw, bio, mirror_num); | 1887 | ret = submit_one_bio(rw, bio, mirror_num, bio_flags); |
1714 | } | 1888 | } |
1715 | 1889 | ||
1716 | return ret; | 1890 | return ret; |
@@ -1738,7 +1912,8 @@ void set_page_extent_head(struct page *page, unsigned long len) | |||
1738 | static int __extent_read_full_page(struct extent_io_tree *tree, | 1912 | static int __extent_read_full_page(struct extent_io_tree *tree, |
1739 | struct page *page, | 1913 | struct page *page, |
1740 | get_extent_t *get_extent, | 1914 | get_extent_t *get_extent, |
1741 | struct bio **bio, int mirror_num) | 1915 | struct bio **bio, int mirror_num, |
1916 | unsigned long *bio_flags) | ||
1742 | { | 1917 | { |
1743 | struct inode *inode = page->mapping->host; | 1918 | struct inode *inode = page->mapping->host; |
1744 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1919 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; |
@@ -1756,13 +1931,27 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
1756 | int nr = 0; | 1931 | int nr = 0; |
1757 | size_t page_offset = 0; | 1932 | size_t page_offset = 0; |
1758 | size_t iosize; | 1933 | size_t iosize; |
1934 | size_t disk_io_size; | ||
1759 | size_t blocksize = inode->i_sb->s_blocksize; | 1935 | size_t blocksize = inode->i_sb->s_blocksize; |
1936 | unsigned long this_bio_flag = 0; | ||
1760 | 1937 | ||
1761 | set_page_extent_mapped(page); | 1938 | set_page_extent_mapped(page); |
1762 | 1939 | ||
1763 | end = page_end; | 1940 | end = page_end; |
1764 | lock_extent(tree, start, end, GFP_NOFS); | 1941 | lock_extent(tree, start, end, GFP_NOFS); |
1765 | 1942 | ||
1943 | if (page->index == last_byte >> PAGE_CACHE_SHIFT) { | ||
1944 | char *userpage; | ||
1945 | size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); | ||
1946 | |||
1947 | if (zero_offset) { | ||
1948 | iosize = PAGE_CACHE_SIZE - zero_offset; | ||
1949 | userpage = kmap_atomic(page, KM_USER0); | ||
1950 | memset(userpage + zero_offset, 0, iosize); | ||
1951 | flush_dcache_page(page); | ||
1952 | kunmap_atomic(userpage, KM_USER0); | ||
1953 | } | ||
1954 | } | ||
1766 | while (cur <= end) { | 1955 | while (cur <= end) { |
1767 | if (cur >= last_byte) { | 1956 | if (cur >= last_byte) { |
1768 | char *userpage; | 1957 | char *userpage; |
@@ -1793,10 +1982,19 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); | |||
1793 | } | 1982 | } |
1794 | BUG_ON(end < cur); | 1983 | BUG_ON(end < cur); |
1795 | 1984 | ||
1985 | if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) | ||
1986 | this_bio_flag = EXTENT_BIO_COMPRESSED; | ||
1987 | |||
1796 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 1988 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
1797 | cur_end = min(extent_map_end(em) - 1, end); | 1989 | cur_end = min(extent_map_end(em) - 1, end); |
1798 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 1990 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); |
1799 | sector = (em->block_start + extent_offset) >> 9; | 1991 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { |
1992 | disk_io_size = em->block_len; | ||
1993 | sector = em->block_start >> 9; | ||
1994 | } else { | ||
1995 | sector = (em->block_start + extent_offset) >> 9; | ||
1996 | disk_io_size = iosize; | ||
1997 | } | ||
1800 | bdev = em->bdev; | 1998 | bdev = em->bdev; |
1801 | block_start = em->block_start; | 1999 | block_start = em->block_start; |
1802 | free_extent_map(em); | 2000 | free_extent_map(em); |
@@ -1845,10 +2043,13 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur); | |||
1845 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | 2043 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; |
1846 | pnr -= page->index; | 2044 | pnr -= page->index; |
1847 | ret = submit_extent_page(READ, tree, page, | 2045 | ret = submit_extent_page(READ, tree, page, |
1848 | sector, iosize, page_offset, | 2046 | sector, disk_io_size, page_offset, |
1849 | bdev, bio, pnr, | 2047 | bdev, bio, pnr, |
1850 | end_bio_extent_readpage, mirror_num); | 2048 | end_bio_extent_readpage, mirror_num, |
2049 | *bio_flags, | ||
2050 | this_bio_flag); | ||
1851 | nr++; | 2051 | nr++; |
2052 | *bio_flags = this_bio_flag; | ||
1852 | } | 2053 | } |
1853 | if (ret) | 2054 | if (ret) |
1854 | SetPageError(page); | 2055 | SetPageError(page); |
@@ -1867,11 +2068,13 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, | |||
1867 | get_extent_t *get_extent) | 2068 | get_extent_t *get_extent) |
1868 | { | 2069 | { |
1869 | struct bio *bio = NULL; | 2070 | struct bio *bio = NULL; |
2071 | unsigned long bio_flags = 0; | ||
1870 | int ret; | 2072 | int ret; |
1871 | 2073 | ||
1872 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0); | 2074 | ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, |
2075 | &bio_flags); | ||
1873 | if (bio) | 2076 | if (bio) |
1874 | submit_one_bio(READ, bio, 0); | 2077 | submit_one_bio(READ, bio, 0, bio_flags); |
1875 | return ret; | 2078 | return ret; |
1876 | } | 2079 | } |
1877 | EXPORT_SYMBOL(extent_read_full_page); | 2080 | EXPORT_SYMBOL(extent_read_full_page); |
@@ -1909,6 +2112,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
1909 | unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; | 2112 | unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; |
1910 | u64 nr_delalloc; | 2113 | u64 nr_delalloc; |
1911 | u64 delalloc_end; | 2114 | u64 delalloc_end; |
2115 | int page_started; | ||
2116 | int compressed; | ||
1912 | 2117 | ||
1913 | WARN_ON(!PageLocked(page)); | 2118 | WARN_ON(!PageLocked(page)); |
1914 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); | 2119 | pg_offset = i_size & (PAGE_CACHE_SIZE - 1); |
@@ -1934,27 +2139,33 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
1934 | 2139 | ||
1935 | delalloc_start = start; | 2140 | delalloc_start = start; |
1936 | delalloc_end = 0; | 2141 | delalloc_end = 0; |
2142 | page_started = 0; | ||
1937 | while(delalloc_end < page_end) { | 2143 | while(delalloc_end < page_end) { |
1938 | nr_delalloc = find_lock_delalloc_range(tree, &delalloc_start, | 2144 | nr_delalloc = find_lock_delalloc_range(inode, tree, |
2145 | page, | ||
2146 | &delalloc_start, | ||
1939 | &delalloc_end, | 2147 | &delalloc_end, |
1940 | 128 * 1024 * 1024); | 2148 | 128 * 1024 * 1024); |
1941 | if (nr_delalloc == 0) { | 2149 | if (nr_delalloc == 0) { |
1942 | delalloc_start = delalloc_end + 1; | 2150 | delalloc_start = delalloc_end + 1; |
1943 | continue; | 2151 | continue; |
1944 | } | 2152 | } |
1945 | tree->ops->fill_delalloc(inode, delalloc_start, | 2153 | tree->ops->fill_delalloc(inode, page, delalloc_start, |
1946 | delalloc_end); | 2154 | delalloc_end, &page_started); |
1947 | clear_extent_bit(tree, delalloc_start, | ||
1948 | delalloc_end, | ||
1949 | EXTENT_LOCKED | EXTENT_DELALLOC, | ||
1950 | 1, 0, GFP_NOFS); | ||
1951 | delalloc_start = delalloc_end + 1; | 2155 | delalloc_start = delalloc_end + 1; |
1952 | } | 2156 | } |
2157 | |||
2158 | /* did the fill delalloc function already unlock and start the IO? */ | ||
2159 | if (page_started) { | ||
2160 | return 0; | ||
2161 | } | ||
2162 | |||
1953 | lock_extent(tree, start, page_end, GFP_NOFS); | 2163 | lock_extent(tree, start, page_end, GFP_NOFS); |
1954 | unlock_start = start; | 2164 | unlock_start = start; |
1955 | 2165 | ||
1956 | if (tree->ops && tree->ops->writepage_start_hook) { | 2166 | if (tree->ops && tree->ops->writepage_start_hook) { |
1957 | ret = tree->ops->writepage_start_hook(page, start, page_end); | 2167 | ret = tree->ops->writepage_start_hook(page, start, |
2168 | page_end); | ||
1958 | if (ret == -EAGAIN) { | 2169 | if (ret == -EAGAIN) { |
1959 | unlock_extent(tree, start, page_end, GFP_NOFS); | 2170 | unlock_extent(tree, start, page_end, GFP_NOFS); |
1960 | redirty_page_for_writepage(wbc, page); | 2171 | redirty_page_for_writepage(wbc, page); |
@@ -2006,10 +2217,15 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2006 | sector = (em->block_start + extent_offset) >> 9; | 2217 | sector = (em->block_start + extent_offset) >> 9; |
2007 | bdev = em->bdev; | 2218 | bdev = em->bdev; |
2008 | block_start = em->block_start; | 2219 | block_start = em->block_start; |
2220 | compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); | ||
2009 | free_extent_map(em); | 2221 | free_extent_map(em); |
2010 | em = NULL; | 2222 | em = NULL; |
2011 | 2223 | ||
2012 | if (block_start == EXTENT_MAP_HOLE || | 2224 | /* |
2225 | * compressed and inline extents are written through other | ||
2226 | * paths in the FS | ||
2227 | */ | ||
2228 | if (compressed || block_start == EXTENT_MAP_HOLE || | ||
2013 | block_start == EXTENT_MAP_INLINE) { | 2229 | block_start == EXTENT_MAP_INLINE) { |
2014 | clear_extent_dirty(tree, cur, | 2230 | clear_extent_dirty(tree, cur, |
2015 | cur + iosize - 1, GFP_NOFS); | 2231 | cur + iosize - 1, GFP_NOFS); |
@@ -2017,16 +2233,28 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2017 | unlock_extent(tree, unlock_start, cur + iosize -1, | 2233 | unlock_extent(tree, unlock_start, cur + iosize -1, |
2018 | GFP_NOFS); | 2234 | GFP_NOFS); |
2019 | 2235 | ||
2020 | if (tree->ops && tree->ops->writepage_end_io_hook) | 2236 | /* |
2237 | * end_io notification does not happen here for | ||
2238 | * compressed extents | ||
2239 | */ | ||
2240 | if (!compressed && tree->ops && | ||
2241 | tree->ops->writepage_end_io_hook) | ||
2021 | tree->ops->writepage_end_io_hook(page, cur, | 2242 | tree->ops->writepage_end_io_hook(page, cur, |
2022 | cur + iosize - 1, | 2243 | cur + iosize - 1, |
2023 | NULL, 1); | 2244 | NULL, 1); |
2024 | cur = cur + iosize; | 2245 | else if (compressed) { |
2246 | /* we don't want to end_page_writeback on | ||
2247 | * a compressed extent. this happens | ||
2248 | * elsewhere | ||
2249 | */ | ||
2250 | nr++; | ||
2251 | } | ||
2252 | |||
2253 | cur += iosize; | ||
2025 | pg_offset += iosize; | 2254 | pg_offset += iosize; |
2026 | unlock_start = cur; | 2255 | unlock_start = cur; |
2027 | continue; | 2256 | continue; |
2028 | } | 2257 | } |
2029 | |||
2030 | /* leave this out until we have a page_mkwrite call */ | 2258 | /* leave this out until we have a page_mkwrite call */ |
2031 | if (0 && !test_range_bit(tree, cur, cur + iosize - 1, | 2259 | if (0 && !test_range_bit(tree, cur, cur + iosize - 1, |
2032 | EXTENT_DIRTY, 0)) { | 2260 | EXTENT_DIRTY, 0)) { |
@@ -2034,6 +2262,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2034 | pg_offset += iosize; | 2262 | pg_offset += iosize; |
2035 | continue; | 2263 | continue; |
2036 | } | 2264 | } |
2265 | |||
2037 | clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); | 2266 | clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); |
2038 | if (tree->ops && tree->ops->writepage_io_hook) { | 2267 | if (tree->ops && tree->ops->writepage_io_hook) { |
2039 | ret = tree->ops->writepage_io_hook(page, cur, | 2268 | ret = tree->ops->writepage_io_hook(page, cur, |
@@ -2057,7 +2286,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
2057 | ret = submit_extent_page(WRITE, tree, page, sector, | 2286 | ret = submit_extent_page(WRITE, tree, page, sector, |
2058 | iosize, pg_offset, bdev, | 2287 | iosize, pg_offset, bdev, |
2059 | &epd->bio, max_nr, | 2288 | &epd->bio, max_nr, |
2060 | end_bio_extent_writepage, 0); | 2289 | end_bio_extent_writepage, |
2290 | 0, 0, 0); | ||
2061 | if (ret) | 2291 | if (ret) |
2062 | SetPageError(page); | 2292 | SetPageError(page); |
2063 | } | 2293 | } |
@@ -2226,7 +2456,7 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page, | |||
2226 | extent_write_cache_pages(tree, mapping, &wbc_writepages, | 2456 | extent_write_cache_pages(tree, mapping, &wbc_writepages, |
2227 | __extent_writepage, &epd); | 2457 | __extent_writepage, &epd); |
2228 | if (epd.bio) { | 2458 | if (epd.bio) { |
2229 | submit_one_bio(WRITE, epd.bio, 0); | 2459 | submit_one_bio(WRITE, epd.bio, 0, 0); |
2230 | } | 2460 | } |
2231 | return ret; | 2461 | return ret; |
2232 | } | 2462 | } |
@@ -2248,7 +2478,7 @@ int extent_writepages(struct extent_io_tree *tree, | |||
2248 | ret = extent_write_cache_pages(tree, mapping, wbc, | 2478 | ret = extent_write_cache_pages(tree, mapping, wbc, |
2249 | __extent_writepage, &epd); | 2479 | __extent_writepage, &epd); |
2250 | if (epd.bio) { | 2480 | if (epd.bio) { |
2251 | submit_one_bio(WRITE, epd.bio, 0); | 2481 | submit_one_bio(WRITE, epd.bio, 0, 0); |
2252 | } | 2482 | } |
2253 | return ret; | 2483 | return ret; |
2254 | } | 2484 | } |
@@ -2262,6 +2492,7 @@ int extent_readpages(struct extent_io_tree *tree, | |||
2262 | struct bio *bio = NULL; | 2492 | struct bio *bio = NULL; |
2263 | unsigned page_idx; | 2493 | unsigned page_idx; |
2264 | struct pagevec pvec; | 2494 | struct pagevec pvec; |
2495 | unsigned long bio_flags = 0; | ||
2265 | 2496 | ||
2266 | pagevec_init(&pvec, 0); | 2497 | pagevec_init(&pvec, 0); |
2267 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { | 2498 | for (page_idx = 0; page_idx < nr_pages; page_idx++) { |
@@ -2281,7 +2512,7 @@ int extent_readpages(struct extent_io_tree *tree, | |||
2281 | if (!pagevec_add(&pvec, page)) | 2512 | if (!pagevec_add(&pvec, page)) |
2282 | __pagevec_lru_add(&pvec); | 2513 | __pagevec_lru_add(&pvec); |
2283 | __extent_read_full_page(tree, page, get_extent, | 2514 | __extent_read_full_page(tree, page, get_extent, |
2284 | &bio, 0); | 2515 | &bio, 0, &bio_flags); |
2285 | } | 2516 | } |
2286 | page_cache_release(page); | 2517 | page_cache_release(page); |
2287 | } | 2518 | } |
@@ -2289,7 +2520,7 @@ int extent_readpages(struct extent_io_tree *tree, | |||
2289 | __pagevec_lru_add(&pvec); | 2520 | __pagevec_lru_add(&pvec); |
2290 | BUG_ON(!list_empty(pages)); | 2521 | BUG_ON(!list_empty(pages)); |
2291 | if (bio) | 2522 | if (bio) |
2292 | submit_one_bio(READ, bio, 0); | 2523 | submit_one_bio(READ, bio, 0, bio_flags); |
2293 | return 0; | 2524 | return 0; |
2294 | } | 2525 | } |
2295 | EXPORT_SYMBOL(extent_readpages); | 2526 | EXPORT_SYMBOL(extent_readpages); |
@@ -2414,7 +2645,8 @@ int extent_prepare_write(struct extent_io_tree *tree, | |||
2414 | ret = submit_extent_page(READ, tree, page, | 2645 | ret = submit_extent_page(READ, tree, page, |
2415 | sector, iosize, page_offset, em->bdev, | 2646 | sector, iosize, page_offset, em->bdev, |
2416 | NULL, 1, | 2647 | NULL, 1, |
2417 | end_bio_extent_preparewrite, 0); | 2648 | end_bio_extent_preparewrite, 0, |
2649 | 0, 0); | ||
2418 | iocount++; | 2650 | iocount++; |
2419 | block_start = block_start + iosize; | 2651 | block_start = block_start + iosize; |
2420 | } else { | 2652 | } else { |
@@ -2495,7 +2727,9 @@ int try_release_extent_mapping(struct extent_map_tree *map, | |||
2495 | } | 2727 | } |
2496 | if (!test_range_bit(tree, em->start, | 2728 | if (!test_range_bit(tree, em->start, |
2497 | extent_map_end(em) - 1, | 2729 | extent_map_end(em) - 1, |
2498 | EXTENT_LOCKED, 0)) { | 2730 | EXTENT_LOCKED | EXTENT_WRITEBACK | |
2731 | EXTENT_ORDERED, | ||
2732 | 0)) { | ||
2499 | remove_extent_mapping(map, em); | 2733 | remove_extent_mapping(map, em); |
2500 | /* once for the rb tree */ | 2734 | /* once for the rb tree */ |
2501 | free_extent_map(em); | 2735 | free_extent_map(em); |
@@ -2923,6 +3157,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, | |||
2923 | int inc_all_pages = 0; | 3157 | int inc_all_pages = 0; |
2924 | unsigned long num_pages; | 3158 | unsigned long num_pages; |
2925 | struct bio *bio = NULL; | 3159 | struct bio *bio = NULL; |
3160 | unsigned long bio_flags = 0; | ||
2926 | 3161 | ||
2927 | if (eb->flags & EXTENT_UPTODATE) | 3162 | if (eb->flags & EXTENT_UPTODATE) |
2928 | return 0; | 3163 | return 0; |
@@ -2973,7 +3208,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, | |||
2973 | ClearPageError(page); | 3208 | ClearPageError(page); |
2974 | err = __extent_read_full_page(tree, page, | 3209 | err = __extent_read_full_page(tree, page, |
2975 | get_extent, &bio, | 3210 | get_extent, &bio, |
2976 | mirror_num); | 3211 | mirror_num, &bio_flags); |
2977 | if (err) { | 3212 | if (err) { |
2978 | ret = err; | 3213 | ret = err; |
2979 | printk("err %d from __extent_read_full_page\n", ret); | 3214 | printk("err %d from __extent_read_full_page\n", ret); |
@@ -2984,7 +3219,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, | |||
2984 | } | 3219 | } |
2985 | 3220 | ||
2986 | if (bio) | 3221 | if (bio) |
2987 | submit_one_bio(READ, bio, mirror_num); | 3222 | submit_one_bio(READ, bio, mirror_num, bio_flags); |
2988 | 3223 | ||
2989 | if (ret || !wait) { | 3224 | if (ret || !wait) { |
2990 | if (ret) | 3225 | if (ret) |