author     Linus Torvalds <torvalds@linux-foundation.org>  2008-07-15 11:36:38 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2008-07-15 11:36:38 -0400
commit     8d2567a620ae8c24968a2bdc1c906c724fac1f6a
tree       8e228abbadbe760e3f015d30c2e1180a67eeb8f9
parent     bcf559e385ba099996c90469c817d2eb38aba418
parent     49f1487b2e41bd8439ea39a4f15b4064e823cc54
Merge branch 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
* 'for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (61 commits)
ext4: Documentation update for new ordered mode and delayed allocation
ext4: do not set extents feature from the kernel
ext4: Don't allow noextents mount option for large filesystems
ext4: Enable delalloc by default.
ext4: delayed allocation i_blocks fix for stat
ext4: fix delalloc i_disksize early update issue
ext4: Handle page without buffers in ext4_*_writepage()
ext4: Add ordered mode support for delalloc
ext4: Invert lock ordering of page_lock and transaction start in delalloc
mm: Add range_cont mode for writeback
ext4: delayed allocation ENOSPC handling
percpu_counter: new function percpu_counter_sum_and_set
ext4: Add delayed allocation support in data=writeback mode
vfs: add hooks for ext4's delayed allocation support
jbd2: Remove data=ordered mode support using jbd buffer heads
ext4: Use new framework for data=ordered mode in JBD2
jbd2: Implement data=ordered mode handling via inodes
vfs: export filemap_fdatawrite_range()
ext4: Fix lock inversion in ext4_ext_truncate()
ext4: Invert the locking order of page_lock and transaction start
...
35 files changed, 2805 insertions(+), 1042 deletions(-)
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 0c5086db8352..80e193d82e2e 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -13,72 +13,93 @@ Mailing list: linux-ext4@vger.kernel.org
 1. Quick usage instructions:
 ===========================
 
-  - Grab updated e2fsprogs from
-    ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs-interim/
-    This is a patchset on top of e2fsprogs-1.39, which can be found at
+  - Compile and install the latest version of e2fsprogs (as of this
+    writing version 1.41) from:
+
+    http://sourceforge.net/project/showfiles.php?group_id=2406
+
+    or
+
     ftp://ftp.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
 
-  - It's still mke2fs -j /dev/hda1
+    or grab the latest git repository from:
+
+    git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
+
+  - Create a new filesystem using the ext4dev filesystem type:
+
+    # mke2fs -t ext4dev /dev/hda1
+
+    Or configure an existing ext3 filesystem to support extents and set
+    the test_fs flag to indicate that it's ok for an in-development
+    filesystem to touch this filesystem:
 
-  - mount /dev/hda1 /wherever -t ext4dev
+    # tune2fs -O extents -E test_fs /dev/hda1
 
-  - To enable extents,
+    If the filesystem was created with 128 byte inodes, it can be
+    converted to use 256 byte inodes for greater efficiency via:
 
-	mount /dev/hda1 /wherever -t ext4dev -o extents
+    # tune2fs -I 256 /dev/hda1
 
-  - The filesystem is compatible with the ext3 driver until you add a file
-    which has extents (ie: `mount -o extents', then create a file).
+    (Note: we currently do not have tools to convert an ext4dev
+    filesystem back to ext3; so please do not try this on production
+    filesystems.)
 
-  NOTE: The "extents" mount flag is temporary.  It will soon go away and
-  extents will be enabled by the "-o extents" flag to mke2fs or tune2fs
+  - Mounting:
+
+    # mount -t ext4dev /dev/hda1 /wherever
 
   - When comparing performance with other filesystems, remember that
-    ext3/4 by default offers higher data integrity guarantees than most.  So
-    when comparing with a metadata-only journalling filesystem, use `mount -o
-    data=writeback'.  And you might as well use `mount -o nobh' too along
-    with it.  Making the journal larger than the mke2fs default often helps
-    performance with metadata-intensive workloads.
+    ext3/4 by default offers higher data integrity guarantees than most.
+    So when comparing with a metadata-only journalling filesystem, such
+    as ext3, use `mount -o data=writeback'.  And you might as well use
+    `mount -o nobh' too along with it.  Making the journal larger than
+    the mke2fs default often helps performance with metadata-intensive
+    workloads.
 
 2. Features
 ===========
 
 2.1 Currently available
 
-* ability to use filesystems > 16TB
+* ability to use filesystems > 16TB (e2fsprogs support not available yet)
 * extent format reduces metadata overhead (RAM, IO for access, transactions)
 * extent format more robust in face of on-disk corruption due to magics,
   internal redundancy in tree
-
-2.1 Previously available, soon to be enabled by default by "mkefs.ext4":
-
-* dir_index and resize inode will be on by default
-* large inodes will be used by default for fast EAs, nsec timestamps, etc
+* improved file allocation (multi-block alloc)
+* fix 32000 subdirectory limit
+* nsec timestamps for mtime, atime, ctime, create time
+* inode version field on disk (NFSv4, Lustre)
+* reduced e2fsck time via uninit_bg feature
+* journal checksumming for robustness, performance
+* persistent file preallocation (e.g. for streaming media, databases)
+* ability to pack bitmaps and inode tables into larger virtual groups via the
+  flex_bg feature
+* large file support
+* inode allocation using large virtual block groups via flex_bg
+* delayed allocation
+* large block (up to pagesize) support
+* efficient new ordered mode in JBD2 and ext4 (avoids using buffer heads to
+  force the ordering)
 
 2.2 Candidate features for future inclusion
 
-There are several under discussion, whether they all make it in is
-partly a function of how much time everyone has to work on them:
+* Online defrag (patches available but not well tested)
+* reduced mke2fs time via lazy itable initialization in conjunction with
+  the uninit_bg feature (the capability to do this is available in e2fsprogs
+  but a kernel thread to do lazy zeroing of unused inode table blocks
+  after the filesystem is first mounted is required for safety)
 
-* improved file allocation (multi-block alloc, delayed alloc; basically done)
-* fix 32000 subdirectory limit (patch exists, needs some e2fsck work)
-* nsec timestamps for mtime, atime, ctime, create time (patch exists,
-  needs some e2fsck work)
-* inode version field on disk (NFSv4, Lustre; prototype exists)
-* reduced mke2fs/e2fsck time via uninitialized groups (prototype exists)
-* journal checksumming for robustness, performance (prototype exists)
-* persistent file preallocation (e.g for streaming media, databases)
+There are several others under discussion; whether they all make it in is
+partly a function of how much time everyone has to work on them.  Features
+like metadata checksumming have been discussed and planned for a bit but no
+patches exist yet, so I'm not sure they're in the near-term roadmap.
 
-Features like metadata checksumming have been discussed and planned for
-a bit but no patches exist yet so I'm not sure they're in the near-term
-roadmap.
+The big performance win will come with mballoc, delalloc and flex_bg
+grouping of bitmaps and inode tables.  Some test results are available here:
 
-The big performance win will come with mballoc and delalloc.  CFS has
-been using mballoc for a few years already with Lustre, and IBM + Bull
-did a lot of benchmarking on it.  The reason it isn't in the first set of
-patches is partly a manageability issue, and partly because it doesn't
-directly affect the on-disk format (outside of much better allocation)
-so it isn't critical to get into the first round of changes.  I believe
-Alex is working on a new set of patches right now.
+- http://www.bullopensource.org/ext4/20080530/ffsb-write-2.6.26-rc2.html
+- http://www.bullopensource.org/ext4/20080530/ffsb-readwrite-2.6.26-rc2.html
 
 3. Options
 ==========
@@ -222,9 +243,11 @@ stripe=n	Number of filesystem blocks that mballoc will try
 		to use for allocation size and alignment. For RAID5/6
 		systems this should be the number of data
 		disks * RAID chunk size in file system blocks.
-
+delalloc	(*)	Deferring block allocation until write-out time.
+nodelalloc	Disable delayed allocation. Blocks are allocated
+		when data is copied from user to page cache.
 Data Mode
----------
+=========
 There are 3 different data modes:
 
 * writeback mode
@@ -236,10 +259,10 @@ typically provide the best ext4 performance.
 
 * ordered mode
 In data=ordered mode, ext4 only officially journals metadata, but it logically
-groups metadata and data blocks into a single unit called a transaction. When
-it's time to write the new metadata out to disk, the associated data blocks
-are written first. In general, this mode performs slightly slower than
-writeback but significantly faster than journal mode.
+groups metadata information related to data changes with the data blocks into a
+single unit called a transaction. When it's time to write the new metadata
+out to disk, the associated data blocks are written first. In general,
+this mode performs slightly slower than writeback but significantly faster than journal mode.
 
 * journal mode
 data=journal mode provides full data and metadata journaling. All new data is
@@ -247,7 +270,8 @@ written to the journal first, and then to its final location.
 In the event of a crash, the journal can be replayed, bringing both data and
 metadata into a consistent state. This mode is the slowest except when data
 needs to be read from and written to disk at the same time where it
-outperforms all others modes.
+outperforms all other modes. Currently ext4 does not have delayed
+allocation support if this data journalling mode is selected.
 
 References
 ==========
@@ -256,7 +280,8 @@ kernel source:	<file:fs/ext4/>
 		<file:fs/jbd2/>
 
 programs:	http://e2fsprogs.sourceforge.net/
-		http://ext2resize.sourceforge.net
 
 useful links:	http://fedoraproject.org/wiki/ext3-devel
 		http://www.bullopensource.org/ext4/
+		http://ext4.wiki.kernel.org/index.php/Main_Page
+		http://fedoraproject.org/wiki/Features/Ext4
diff --git a/fs/buffer.c b/fs/buffer.c
index 0f51c0f7c266..5fa1512cd9a2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1691,11 +1691,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 		 */
 		clear_buffer_dirty(bh);
 		set_buffer_uptodate(bh);
-	} else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
+	} else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
+		   buffer_dirty(bh)) {
 		WARN_ON(bh->b_size != blocksize);
 		err = get_block(inode, block, bh, 1);
 		if (err)
 			goto recover;
+		clear_buffer_delay(bh);
 		if (buffer_new(bh)) {
 			/* blockdev mappings never come here */
 			clear_buffer_new(bh);
@@ -1774,7 +1776,8 @@ recover:
 	bh = head;
 	/* Recovery: lock and submit the mapped buffers */
 	do {
-		if (buffer_mapped(bh) && buffer_dirty(bh)) {
+		if (buffer_mapped(bh) && buffer_dirty(bh) &&
+		    !buffer_delay(bh)) {
 			lock_buffer(bh);
 			mark_buffer_async_write(bh);
 		} else {
@@ -2061,6 +2064,7 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 			struct page *page, void *fsdata)
 {
 	struct inode *inode = mapping->host;
+	int i_size_changed = 0;
 
 	copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
 
@@ -2073,12 +2077,21 @@ int generic_write_end(struct file *file, struct address_space *mapping,
 	 */
 	if (pos+copied > inode->i_size) {
 		i_size_write(inode, pos+copied);
-		mark_inode_dirty(inode);
+		i_size_changed = 1;
 	}
 
 	unlock_page(page);
 	page_cache_release(page);
 
+	/*
+	 * Don't mark the inode dirty under page lock. First, it unnecessarily
+	 * makes the holding time of page lock longer. Second, it forces lock
+	 * ordering of page lock and transaction start for journaling
+	 * filesystems.
+	 */
+	if (i_size_changed)
+		mark_inode_dirty(inode);
+
 	return copied;
 }
 EXPORT_SYMBOL(generic_write_end);
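The generic_write_end() change above is the lock-inversion fix mentioned in the merge log: mark_inode_dirty() can start a journal transaction via the filesystem's dirty_inode hook, so calling it with the page locked imposes "page lock, then transaction start" ordering, while the delalloc writeback path wants to start the transaction first. What follows is a minimal user-space sketch of the pattern (record the state change under the lock, act on it after dropping it); it is not part of the patch, and the mutexes and helper names are stand-ins, not kernel APIs.

    /* lock_order.c -- user-space analogue of the generic_write_end() change.
     * The writer used to take page_lock and then journal_lock (inside
     * mark_inode_dirty); writeback paths want journal_lock first, then
     * page_lock, which can deadlock.  Remembering the i_size update and
     * acting on it after dropping page_lock removes the inversion. */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t journal_lock = PTHREAD_MUTEX_INITIALIZER;
    static long i_size;

    static void mark_inode_dirty_sketch(void)
    {
            pthread_mutex_lock(&journal_lock);      /* "start a transaction" */
            /* ... log the inode ... */
            pthread_mutex_unlock(&journal_lock);
    }

    static void write_end_sketch(long pos, long copied)
    {
            int i_size_changed = 0;

            pthread_mutex_lock(&page_lock);
            if (pos + copied > i_size) {
                    i_size = pos + copied;
                    i_size_changed = 1;             /* remember, don't act yet */
            }
            pthread_mutex_unlock(&page_lock);

            /* journal_lock is now always taken with page_lock dropped */
            if (i_size_changed)
                    mark_inode_dirty_sketch();
    }

    int main(void)
    {
            write_end_sketch(0, 4096);
            printf("i_size = %ld\n", i_size);
            return 0;
    }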
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 9cc80b9cc8d8..495ab21b9832 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -47,7 +47,7 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 					ext4_group_t block_group)
 {
 	ext4_group_t actual_group;
-	ext4_get_group_no_and_offset(sb, block, &actual_group, 0);
+	ext4_get_group_no_and_offset(sb, block, &actual_group, NULL);
 	if (actual_group == block_group)
 		return 1;
 	return 0;
@@ -121,12 +121,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
 				le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks);
 		}
 	} else { /* For META_BG_BLOCK_GROUPS */
-		int group_rel = (block_group -
-				 le32_to_cpu(sbi->s_es->s_first_meta_bg)) %
-				EXT4_DESC_PER_BLOCK(sb);
-		if (group_rel == 0 || group_rel == 1 ||
-		    (group_rel == EXT4_DESC_PER_BLOCK(sb) - 1))
-			bit_max += 1;
+		bit_max += ext4_bg_num_gdb(sb, block_group);
 	}
 
 	if (block_group == sbi->s_groups_count - 1) {
@@ -295,7 +290,7 @@ err_out:
 	return 0;
 }
 /**
- * read_block_bitmap()
+ * ext4_read_block_bitmap()
  * @sb:			super block
  * @block_group:	given block group
  *
@@ -305,7 +300,7 @@ err_out:
 * Return buffer_head on success or NULL in case of failure.
 */
 struct buffer_head *
-read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc * desc;
 	struct buffer_head * bh = NULL;
@@ -409,8 +404,7 @@ restart:
 		prev = rsv;
 	}
 	printk("Window map complete.\n");
-	if (bad)
-		BUG();
+	BUG_ON(bad);
 }
 #define rsv_window_dump(root, verbose) \
 	__rsv_window_dump((root), (verbose), __func__)
@@ -694,7 +688,7 @@ do_more:
 		count -= overflow;
 	}
 	brelse(bitmap_bh);
-	bitmap_bh = read_block_bitmap(sb, block_group);
+	bitmap_bh = ext4_read_block_bitmap(sb, block_group);
 	if (!bitmap_bh)
 		goto error_return;
 	desc = ext4_get_group_desc (sb, block_group, &gd_bh);
@@ -810,6 +804,13 @@ do_more:
 	spin_unlock(sb_bgl_lock(sbi, block_group));
 	percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks += count;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
+
 	/* We dirtied the bitmap block */
 	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
 	err = ext4_journal_dirty_metadata(handle, bitmap_bh);
@@ -1598,23 +1599,35 @@ out:
 
 /**
 * ext4_has_free_blocks()
 * @sbi:	in-core super block structure.
+ * @nblocks:	number of needed blocks
 *
- * Check if filesystem has at least 1 free block available for allocation.
+ * Check if filesystem has free blocks available for allocation.
+ * Return the number of blocks available for allocation for this request.
+ * On success, return nblocks.
 */
-static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
+ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks)
 {
-	ext4_fsblk_t free_blocks, root_blocks;
+	ext4_fsblk_t free_blocks;
+	ext4_fsblk_t root_blocks = 0;
 
 	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
-	root_blocks = ext4_r_blocks_count(sbi->s_es);
-	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
+
+	if (!capable(CAP_SYS_RESOURCE) &&
 	    sbi->s_resuid != current->fsuid &&
-	    (sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
-		return 0;
-	}
-	return 1;
-}
+	    (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid)))
+		root_blocks = ext4_r_blocks_count(sbi->s_es);
+#ifdef CONFIG_SMP
+	if (free_blocks - root_blocks < FBC_BATCH)
+		free_blocks =
+			percpu_counter_sum_and_set(&sbi->s_freeblocks_counter);
+#endif
+	if (free_blocks - root_blocks < nblocks)
+		return free_blocks - root_blocks;
+	return nblocks;
+}
+
 
 /**
 * ext4_should_retry_alloc()
@@ -1630,7 +1643,7 @@ static int ext4_has_free_blocks(struct ext4_sb_info *sbi)
 */
 int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 {
-	if (!ext4_has_free_blocks(EXT4_SB(sb)) || (*retries)++ > 3)
+	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3)
 		return 0;
 
 	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);
@@ -1639,20 +1652,24 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries)
 }
 
 /**
- * ext4_new_blocks_old() -- core block(s) allocation function
+ * ext4_old_new_blocks() -- core block bitmap based block allocation function
+ *
 * @handle:	handle to this transaction
 * @inode:	file inode
 * @goal:	given target block (filesystem wide)
 * @count:	target number of blocks to allocate
 * @errp:	error code
 *
- * ext4_new_blocks uses a goal block to assist allocation. It tries to
- * allocate block(s) from the block group contains the goal block first. If that
- * fails, it will try to allocate block(s) from other block groups without
- * any specific goal block.
+ * ext4_old_new_blocks uses a goal block to assist allocation and looks up
+ * the block bitmap directly to do block allocation. It tries to
+ * allocate block(s) from the block group containing the goal block first. If
+ * that fails, it will try to allocate block(s) from other block groups
+ * without any specific goal block.
+ *
+ * This function is called when the -o nomballoc mount option is enabled.
 *
 */
-ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
 	struct buffer_head *bitmap_bh = NULL;
@@ -1676,13 +1693,26 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	ext4_group_t ngroups;
 	unsigned long num = *count;
 
-	*errp = -ENOSPC;
 	sb = inode->i_sb;
 	if (!sb) {
+		*errp = -ENODEV;
 		printk("ext4_new_block: nonexistent device");
 		return 0;
 	}
 
+	sbi = EXT4_SB(sb);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag) {
+		/*
+		 * With delalloc we already reserved the blocks
+		 */
+		*count = ext4_has_free_blocks(sbi, *count);
+	}
+	if (*count == 0) {
+		*errp = -ENOSPC;
+		return 0;	/* return with ENOSPC error */
+	}
+	num = *count;
+
 	/*
 	 * Check quota for allocation of this block.
 	 */
@@ -1706,11 +1736,6 @@ ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
 	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
 		my_rsv = &block_i->rsv_window_node;
 
-	if (!ext4_has_free_blocks(sbi)) {
-		*errp = -ENOSPC;
-		goto out;
-	}
-
 	/*
 	 * First, test whether the goal block is free.
 	 */
@@ -1734,7 +1759,7 @@ retry_alloc:
 		my_rsv = NULL;
 
 	if (free_blocks > 0) {
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle,
@@ -1770,7 +1795,7 @@ retry_alloc:
 			continue;
 
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, group_no);
+		bitmap_bh = ext4_read_block_bitmap(sb, group_no);
 		if (!bitmap_bh)
 			goto io_error;
 		/*
@@ -1882,7 +1907,15 @@ allocated:
 	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
 	gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp);
 	spin_unlock(sb_bgl_lock(sbi, group_no));
-	percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+	if (!EXT4_I(inode)->i_delalloc_reserved_flag)
+		percpu_counter_sub(&sbi->s_freeblocks_counter, num);
+
+	if (sbi->s_log_groups_per_flex) {
+		ext4_group_t flex_group = ext4_flex_group(sbi, group_no);
+		spin_lock(sb_bgl_lock(sbi, flex_group));
+		sbi->s_flex_groups[flex_group].free_blocks -= num;
+		spin_unlock(sb_bgl_lock(sbi, flex_group));
+	}
 
 	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
 	err = ext4_journal_dirty_metadata(handle, gdp_bh);
@@ -1915,46 +1948,104 @@ out:
 	return 0;
 }
 
-ext4_fsblk_t ext4_new_block(handle_t *handle, struct inode *inode,
-			ext4_fsblk_t goal, int *errp)
+#define EXT4_META_BLOCK	0x1
+
+static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp, int flags)
 {
 	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
 
 	if (!test_opt(inode->i_sb, MBALLOC)) {
-		unsigned long count = 1;
-		ret = ext4_new_blocks_old(handle, inode, goal, &count, errp);
-		return ret;
+		return ext4_old_new_blocks(handle, inode, goal, count, errp);
 	}
 
 	memset(&ar, 0, sizeof(ar));
+	/* Fill with neighbour allocated blocks */
+
 	ar.inode = inode;
 	ar.goal = goal;
-	ar.len = 1;
+	ar.len = *count;
+	ar.logical = iblock;
+
+	if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK))
+		/* enable in-core preallocation for data block allocation */
+		ar.flags = EXT4_MB_HINT_DATA;
+	else
+		/* disable in-core preallocation for non-regular files */
+		ar.flags = 0;
+
 	ret = ext4_mb_new_blocks(handle, &ar, errp);
+	*count = ar.len;
 	return ret;
 }
 
-ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+/*
+ * ext4_new_meta_blocks() -- allocate blocks for meta data (indexing) blocks
+ *
+ * @handle:	handle to this transaction
+ * @inode:	file inode
+ * @goal:	given target block (filesystem wide)
+ * @count:	total number of blocks needed
+ * @errp:	error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and errors are returned via errp.
+ */
+ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
 			ext4_fsblk_t goal, unsigned long *count, int *errp)
 {
-	struct ext4_allocation_request ar;
 	ext4_fsblk_t ret;
-
-	if (!test_opt(inode->i_sb, MBALLOC)) {
-		ret = ext4_new_blocks_old(handle, inode, goal, count, errp);
-		return ret;
+	ret = do_blk_alloc(handle, inode, 0, goal,
+				count, errp, EXT4_META_BLOCK);
+	/*
+	 * Account for the allocated meta blocks
+	 */
+	if (!(*errp)) {
+		spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+		EXT4_I(inode)->i_allocated_meta_blocks += *count;
+		spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
 	}
-
-	memset(&ar, 0, sizeof(ar));
-	ar.inode = inode;
-	ar.goal = goal;
-	ar.len = *count;
-	ret = ext4_mb_new_blocks(handle, &ar, errp);
-	*count = ar.len;
 	return ret;
 }
 
+/*
+ * ext4_new_meta_block() -- allocate a block for meta data (indexing) blocks
+ *
+ * @handle:	handle to this transaction
+ * @inode:	file inode
+ * @goal:	given target block (filesystem wide)
+ * @errp:	error code
+ *
+ * Return allocated block number on success
+ */
+ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t goal, int *errp)
+{
+	unsigned long count = 1;
+	return ext4_new_meta_blocks(handle, inode, goal, &count, errp);
+}
+
+/*
+ * ext4_new_blocks() -- allocate data blocks
+ *
+ * @handle:	handle to this transaction
+ * @inode:	file inode
+ * @goal:	given target block (filesystem wide)
+ * @count:	total number of blocks needed
+ * @errp:	error code
+ *
+ * Return 1st allocated block number on success; *count stores the total
+ * number of blocks allocated, and errors are returned via errp.
+ */
+
+ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+				ext4_lblk_t iblock, ext4_fsblk_t goal,
+				unsigned long *count, int *errp)
+{
+	return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0);
+}
+
 
 /**
 * ext4_count_free_blocks() -- count filesystem free blocks
@@ -1986,7 +2077,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
 			continue;
 		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
 		brelse(bitmap_bh);
-		bitmap_bh = read_block_bitmap(sb, i);
+		bitmap_bh = ext4_read_block_bitmap(sb, i);
 		if (bitmap_bh == NULL)
 			continue;
 
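Two things are worth noting in the allocator rework above. First, ext4_has_free_blocks() no longer answers yes/no: it clamps a request of nblocks to what is actually grantable after the root reservation, falling back to an exact per-cpu counter sum (percpu_counter_sum_and_set) when the approximate count is within FBC_BATCH of the limit. Second, callers keep the existing ENOSPC retry idiom through ext4_should_retry_alloc(). A small user-space sketch of the clamp logic, not part of the patch; free and root stand in for the superblock counters:

    /* has_free_blocks.c -- the clamp logic ext4_has_free_blocks() now
     * implements, modeled in user space.  The function returns how many of
     * the nblocks requested can actually be granted, so callers such as the
     * delalloc reservation path can take a partial amount or treat a return
     * of 0 as ENOSPC.  Callers are expected to keep free >= root. */
    #include <stdio.h>

    typedef unsigned long long fsblk_t;

    static fsblk_t has_free_blocks(fsblk_t free, fsblk_t root, fsblk_t nblocks)
    {
            /* the kernel version may first resync `free` from the per-cpu
             * counters when the slack falls below FBC_BATCH */
            if (free - root < nblocks)
                    return free - root;
            return nblocks;
    }

    int main(void)
    {
            printf("%llu\n", has_free_blocks(1000, 50, 100)); /* 100: granted  */
            printf("%llu\n", has_free_blocks(120, 50, 100));  /* 70:  clamped  */
            printf("%llu\n", has_free_blocks(50, 50, 100));   /* 0:   ENOSPC   */
            return 0;
    }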
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2bf0331ea194..d3d23d73c08b 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -129,7 +129,8 @@ static int ext4_readdir(struct file * filp,
 		struct buffer_head *bh = NULL;
 
 		map_bh.b_state = 0;
-		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, 0, 0);
+		err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
+						0, 0, 0);
 		if (err > 0) {
 			pgoff_t index = map_bh.b_blocknr >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
@@ -272,7 +273,7 @@ static void free_rb_tree_fname(struct rb_root *root)
 
 	while (n) {
 		/* Do the node's children first */
-		if ((n)->rb_left) {
+		if (n->rb_left) {
 			n = n->rb_left;
 			continue;
 		}
@@ -301,24 +302,18 @@ static void free_rb_tree_fname(struct rb_root *root)
 			parent->rb_right = NULL;
 		n = parent;
 	}
-	root->rb_node = NULL;
 }
 
 
-static struct dir_private_info *create_dir_info(loff_t pos)
+static struct dir_private_info *ext4_htree_create_dir_info(loff_t pos)
 {
 	struct dir_private_info *p;
 
-	p = kmalloc(sizeof(struct dir_private_info), GFP_KERNEL);
+	p = kzalloc(sizeof(struct dir_private_info), GFP_KERNEL);
 	if (!p)
 		return NULL;
-	p->root.rb_node = NULL;
-	p->curr_node = NULL;
-	p->extra_fname = NULL;
-	p->last_pos = 0;
 	p->curr_hash = pos2maj_hash(pos);
 	p->curr_minor_hash = pos2min_hash(pos);
-	p->next_hash = 0;
 	return p;
 }
 
@@ -433,7 +428,7 @@ static int ext4_dx_readdir(struct file * filp,
 	int ret;
 
 	if (!info) {
-		info = create_dir_info(filp->f_pos);
+		info = ext4_htree_create_dir_info(filp->f_pos);
 		if (!info)
 			return -ENOMEM;
 		filp->private_data = info;
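The ext4_htree_create_dir_info() hunk swaps kmalloc() plus a string of explicit zero assignments for kzalloc(), which returns zeroed memory, so only the two hash fields derived from the file position still need initialization. A user-space sketch, not part of the patch, with calloc() playing the role of kzalloc(); the struct layout and hash computations are illustrative stand-ins, not the kernel's:

    /* dir_info.c -- user-space sketch of the kzalloc() simplification. */
    #include <stdlib.h>
    #include <stdio.h>

    struct dir_private_info {
            void *root, *curr_node, *extra_fname;
            long long last_pos;
            unsigned curr_hash, curr_minor_hash, next_hash;
    };

    static struct dir_private_info *create_dir_info(long long pos)
    {
            /* zeroed allocation: the pointer fields, last_pos and next_hash
             * all start out 0/NULL without explicit assignments */
            struct dir_private_info *p = calloc(1, sizeof(*p));
            if (!p)
                    return NULL;
            p->curr_hash = (unsigned)(pos >> 32);   /* stand-in for pos2maj_hash */
            p->curr_minor_hash = (unsigned)pos;     /* stand-in for pos2min_hash */
            return p;
    }

    int main(void)
    {
            struct dir_private_info *p = create_dir_info(0x1234567890abcdefLL);
            if (p)
                    printf("maj=%x min=%x next=%u\n",
                           p->curr_hash, p->curr_minor_hash, p->next_hash);
            free(p);
            return 0;
    }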
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8158083f7ac0..303e41cf7b14 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -22,7 +22,7 @@
 #include "ext4_i.h"
 
 /*
- * The second extended filesystem constants/structures
+ * The fourth extended filesystem constants/structures
 */
 
 /*
@@ -45,7 +45,7 @@
 #define ext4_debug(f, a...) \
 	do { \
 		printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \
-			__FILE__, __LINE__, __FUNCTION__); \
+			__FILE__, __LINE__, __func__); \
 		printk (KERN_DEBUG f, ## a); \
 	} while (0)
 #else
@@ -74,6 +74,9 @@
 #define EXT4_MB_HINT_GOAL_ONLY		256
 /* goal is meaningful */
 #define EXT4_MB_HINT_TRY_GOAL		512
+/* blocks already pre-reserved by delayed allocation */
+#define EXT4_MB_DELALLOC_RESERVED	1024
+
 
 struct ext4_allocation_request {
 	/* target inode for block we're allocating */
@@ -170,6 +173,15 @@ struct ext4_group_desc
 	__u32	bg_reserved2[3];
 };
 
+/*
+ * Structure of a flex block group info
+ */
+
+struct flex_groups {
+	__u32	free_inodes;
+	__u32	free_blocks;
+};
+
 #define EXT4_BG_INODE_UNINIT	0x0001 /* Inode table/bitmap not in use */
 #define EXT4_BG_BLOCK_UNINIT	0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED	0x0004 /* On-disk itable initialized to zero */
@@ -527,6 +539,7 @@ do { \
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
 #define EXT4_MOUNT_I_VERSION		0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBALLOC		0x4000000 /* Buddy allocation support */
+#define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
 #define clear_opt(o, opt)		o &= ~EXT4_MOUNT_##opt
@@ -647,7 +660,10 @@ struct ext4_super_block {
 	__le16	s_mmp_interval;		/* # seconds to wait in MMP checking */
 	__le64	s_mmp_block;		/* Block for multi-mount protection */
 	__le32	s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
-	__u32	s_reserved[163];	/* Padding to the end of the block */
+	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
+	__u8	s_reserved_char_pad2;
+	__le16	s_reserved_pad;
+	__u32	s_reserved[162];	/* Padding to the end of the block */
 };
 
 #ifdef __KERNEL__
@@ -958,12 +974,17 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
 extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
 extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
			ext4_group_t group);
-extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, int *errp);
-extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp);
-extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode,
+extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode,
+			ext4_lblk_t iblock, ext4_fsblk_t goal,
+			unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode,
			ext4_fsblk_t goal, unsigned long *count, int *errp);
+extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi,
+						ext4_fsblk_t nblocks);
 extern void ext4_free_blocks (handle_t *handle, struct inode *inode,
			ext4_fsblk_t block, unsigned long count, int metadata);
 extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb,
@@ -1016,9 +1037,14 @@ extern int __init init_ext4_mballoc(void);
 extern void exit_ext4_mballoc(void);
 extern void ext4_mb_free_blocks(handle_t *, struct inode *,
		unsigned long, unsigned long, int, unsigned long *);
+extern int ext4_mb_add_more_groupinfo(struct super_block *sb,
+		ext4_group_t i, struct ext4_group_desc *desc);
+extern void ext4_mb_update_group_info(struct ext4_group_info *grp,
+		ext4_grpblk_t add);
 
 
 /* inode.c */
+void ext4_da_release_space(struct inode *inode, int used, int to_free);
 int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
		struct buffer_head *bh, ext4_fsblk_t blocknr);
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
@@ -1033,19 +1059,23 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
 extern struct inode *ext4_iget(struct super_block *, unsigned long);
 extern int ext4_write_inode (struct inode *, int);
 extern int ext4_setattr (struct dentry *, struct iattr *);
+extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
+				struct kstat *stat);
 extern void ext4_delete_inode (struct inode *);
 extern int ext4_sync_inode (handle_t *, struct inode *);
 extern void ext4_discard_reservation (struct inode *);
 extern void ext4_dirty_inode(struct inode *);
 extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
+extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
-extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+extern int ext4_block_truncate_page(handle_t *handle,
		struct address_space *mapping, loff_t from);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
 
 /* ioctl.c */
 extern long ext4_ioctl(struct file *, unsigned int, unsigned long);
@@ -1159,10 +1189,21 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 }
 
 
+static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
+					ext4_group_t block_group)
+{
+	return block_group >> sbi->s_log_groups_per_flex;
+}
+
+static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
+{
+	return 1 << sbi->s_log_groups_per_flex;
+}
+
 #define ext4_std_error(sb, errno) \
 do { \
	if ((errno)) \
-		__ext4_std_error((sb), __FUNCTION__, (errno)); \
+		__ext4_std_error((sb), __func__, (errno)); \
 } while (0)
 
 /*
@@ -1191,7 +1232,7 @@ extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
			ext4_lblk_t iblock,
			unsigned long max_blocks, struct buffer_head *bh_result,
			int create, int extend_disksize);
-extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
@@ -1199,7 +1240,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
 extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
			sector_t block, unsigned long max_blocks,
			struct buffer_head *bh, int create,
-			int extend_disksize);
+			int extend_disksize, int flag);
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
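The new ext4_flex_group()/ext4_flex_bg_size() inlines define the flex_bg mapping: with s_log_groups_per_flex = N, each flex group aggregates 2^N ordinary block groups, and a block group finds its flex group by a right shift. A small self-contained rendering of the same arithmetic, separate from the patch; the value N = 4 is just an example:

    /* flex_group.c -- the flex_bg mapping the new ext4.h inlines implement. */
    #include <stdio.h>

    static unsigned flex_group(unsigned block_group, unsigned log_groups_per_flex)
    {
            return block_group >> log_groups_per_flex;  /* ext4_flex_group() */
    }

    static unsigned flex_bg_size(unsigned log_groups_per_flex)
    {
            return 1u << log_groups_per_flex;           /* ext4_flex_bg_size() */
    }

    int main(void)
    {
            unsigned log = 4;   /* example: 16 block groups per flex group */
            printf("flex group size: %u\n", flex_bg_size(log));
            printf("block group 37 -> flex group %u\n", flex_group(37, log));
            return 0;
    }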
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 75333b595fab..6c166c0a54b7 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -212,6 +212,7 @@ static inline int ext4_ext_get_actual_len(struct ext4_extent *ext)
 		(le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN));
 }
 
+extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks);
 extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *);
 extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t);
 extern int ext4_extent_tree_init(handle_t *, struct inode *);
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index 26a4ae255d79..ef7409f0e7e4 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -79,7 +79,7 @@ struct ext4_ext_cache {
 };
 
 /*
- * third extended file system inode data in memory
+ * fourth extended file system inode data in memory
 */
 struct ext4_inode_info {
 	__le32	i_data[15];	/* unconverted */
@@ -150,6 +150,7 @@ struct ext4_inode_info {
	 */
 	struct rw_semaphore i_data_sem;
 	struct inode vfs_inode;
+	struct jbd2_inode jinode;
 
 	unsigned long i_ext_generation;
 	struct ext4_ext_cache i_cached_extent;
@@ -162,6 +163,13 @@ struct ext4_inode_info {
 	/* mballoc */
 	struct list_head i_prealloc_list;
 	spinlock_t i_prealloc_lock;
+
+	/* allocation reservation info for delalloc */
+	unsigned long i_reserved_data_blocks;
+	unsigned long i_reserved_meta_blocks;
+	unsigned long i_allocated_meta_blocks;
+	unsigned short i_delalloc_reserved_flag;
+	spinlock_t i_block_reservation_lock;
 };
 
 #endif	/* _EXT4_I */
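The new ext4_inode_info fields carry the per-inode delalloc bookkeeping: blocks reserved for data, the worst-case metadata reserved for them, and the metadata actually allocated, all serialized by i_block_reservation_lock (see the ext4_new_meta_blocks() hunk in balloc.c above). A user-space sketch of that update, not part of the patch, with a pthread mutex standing in for the spinlock and illustrative field names:

    /* reservation.c -- sketch of the per-inode delalloc accounting. */
    #include <pthread.h>
    #include <stdio.h>

    struct inode_resv {
            unsigned long reserved_data_blocks;  /* reserved, not yet allocated */
            unsigned long reserved_meta_blocks;  /* worst-case metadata reserved */
            unsigned long allocated_meta_blocks; /* metadata actually allocated */
            pthread_mutex_t lock;
    };

    /* mirrors the accounting ext4_new_meta_blocks() does after success */
    static void account_meta_alloc(struct inode_resv *ei, unsigned long count)
    {
            pthread_mutex_lock(&ei->lock);
            ei->allocated_meta_blocks += count;
            pthread_mutex_unlock(&ei->lock);
    }

    int main(void)
    {
            struct inode_resv ei = { 8, 4, 0, PTHREAD_MUTEX_INITIALIZER };
            account_meta_alloc(&ei, 1);
            printf("allocated meta: %lu\n", ei.allocated_meta_blocks);
            return 0;
    }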
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 9255a7d28b24..eb8bc3afe6e9 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -142,19 +142,17 @@ int __ext4_journal_dirty_metadata(const char *where,
		handle_t *handle, struct buffer_head *bh);
 
 #define ext4_journal_get_undo_access(handle, bh) \
-	__ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_undo_access(__func__, (handle), (bh))
 #define ext4_journal_get_write_access(handle, bh) \
-	__ext4_journal_get_write_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_write_access(__func__, (handle), (bh))
 #define ext4_journal_revoke(handle, blocknr, bh) \
-	__ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh))
+	__ext4_journal_revoke(__func__, (handle), (blocknr), (bh))
 #define ext4_journal_get_create_access(handle, bh) \
-	__ext4_journal_get_create_access(__FUNCTION__, (handle), (bh))
+	__ext4_journal_get_create_access(__func__, (handle), (bh))
 #define ext4_journal_dirty_metadata(handle, bh) \
-	__ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh))
+	__ext4_journal_dirty_metadata(__func__, (handle), (bh))
 #define ext4_journal_forget(handle, bh) \
-	__ext4_journal_forget(__FUNCTION__, (handle), (bh))
-
-int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh);
+	__ext4_journal_forget(__func__, (handle), (bh))
 
 handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks);
 int __ext4_journal_stop(const char *where, handle_t *handle);
@@ -165,7 +163,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks)
 }
 
 #define ext4_journal_stop(handle) \
-	__ext4_journal_stop(__FUNCTION__, (handle))
+	__ext4_journal_stop(__func__, (handle))
 
 static inline handle_t *ext4_journal_current_handle(void)
 {
@@ -192,6 +190,11 @@ static inline int ext4_journal_force_commit(journal_t *journal)
	return jbd2_journal_force_commit(journal);
 }
 
+static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
+{
+	return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode);
+}
+
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
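ext4_jbd2_file_inode() is the entry point into the new inode-based data=ordered scheme: rather than attaching every dirty data buffer head to the running transaction (the jbd buffer-head approach this series removes), the inode is filed on the transaction's inode list and its dirty pages are written back before the metadata commits. A toy user-space model of that commit ordering, not part of the patch; the transaction and list types are invented for illustration:

    /* ordered_commit.c -- model of inode-based data=ordered mode. */
    #include <stdio.h>

    #define MAX_INODES 8

    struct transaction {
            int inodes[MAX_INODES]; /* inode numbers filed on this transaction */
            int n;
    };

    /* analogue of ext4_jbd2_file_inode(): idempotently file the inode */
    static void file_inode(struct transaction *t, int ino)
    {
            for (int i = 0; i < t->n; i++)
                    if (t->inodes[i] == ino)
                            return;
            if (t->n < MAX_INODES)
                    t->inodes[t->n++] = ino;
    }

    static void commit(struct transaction *t)
    {
            for (int i = 0; i < t->n; i++)  /* data first ... */
                    printf("writeback data of inode %d\n", t->inodes[i]);
            printf("commit metadata\n");    /* ... then metadata */
    }

    int main(void)
    {
            struct transaction t = { .n = 0 };
            file_inode(&t, 12);
            file_inode(&t, 12);     /* filing twice is harmless */
            file_inode(&t, 99);
            commit(&t);
            return 0;
    }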
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 5802e69f2191..6300226d5531 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -25,7 +25,7 @@
 #include <linux/rbtree.h>
 
 /*
- * third extended-fs super-block data in memory
+ * fourth extended-fs super-block data in memory
 */
 struct ext4_sb_info {
 	unsigned long s_desc_size;	/* Size of a group descriptor in bytes */
@@ -143,6 +143,9 @@ struct ext4_sb_info {
 
 	/* locality groups */
 	struct ext4_locality_group *s_locality_groups;
+
+	unsigned int s_log_groups_per_flex;
+	struct flex_groups *s_flex_groups;
 };
 
 #endif	/* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 47929c4e3dae..42c4c0c892ed 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -92,17 +92,16 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | |||
92 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | 92 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
93 | } | 93 | } |
94 | 94 | ||
95 | static handle_t *ext4_ext_journal_restart(handle_t *handle, int needed) | 95 | static int ext4_ext_journal_restart(handle_t *handle, int needed) |
96 | { | 96 | { |
97 | int err; | 97 | int err; |
98 | 98 | ||
99 | if (handle->h_buffer_credits > needed) | 99 | if (handle->h_buffer_credits > needed) |
100 | return handle; | 100 | return 0; |
101 | if (!ext4_journal_extend(handle, needed)) | 101 | err = ext4_journal_extend(handle, needed); |
102 | return handle; | 102 | if (err) |
103 | err = ext4_journal_restart(handle, needed); | 103 | return err; |
104 | 104 | return ext4_journal_restart(handle, needed); | |
105 | return handle; | ||
106 | } | 105 | } |
107 | 106 | ||
108 | /* | 107 | /* |
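Returning 0 or -errno here instead of a handle pointer simplifies the callers: the old version could only report failure by encoding it into the returned handle, which every call site had to unpick with IS_ERR()/PTR_ERR(). The new call sites (see the ext4_ext_rm_leaf() hunk further down) reduce to plain error propagation. A stand-alone sketch of the new convention, with the journal reduced to a stub type and a stub restart:

    #include <stdio.h>

    /* stub handle; the real handle_t lives in jbd2 */
    typedef struct { int h_buffer_credits; } handle_t;

    /* stand-in with the new convention: 0 on success, -errno on failure */
    static int journal_restart_stub(handle_t *handle, int needed)
    {
            handle->h_buffer_credits = needed;
            return 0;
    }

    static int remove_leaf_sketch(handle_t *handle, int credits)
    {
            int err;

            /* new style: plain error propagation, handle stays valid */
            err = journal_restart_stub(handle, credits);
            if (err)
                    goto out;

            printf("continuing with %d credits\n", handle->h_buffer_credits);
            return 0;
    out:
            return err;
    }

    int main(void)
    {
            handle_t h = { .h_buffer_credits = 1 };

            return remove_leaf_sketch(&h, 8);
    }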
@@ -180,15 +179,18 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode, | |||
180 | return bg_start + colour + block; | 179 | return bg_start + colour + block; |
181 | } | 180 | } |
182 | 181 | ||
182 | /* | ||
183 | * Allocation for a metadata block | ||
184 | */ | ||
183 | static ext4_fsblk_t | 185 | static ext4_fsblk_t |
184 | ext4_ext_new_block(handle_t *handle, struct inode *inode, | 186 | ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, |
185 | struct ext4_ext_path *path, | 187 | struct ext4_ext_path *path, |
186 | struct ext4_extent *ex, int *err) | 188 | struct ext4_extent *ex, int *err) |
187 | { | 189 | { |
188 | ext4_fsblk_t goal, newblock; | 190 | ext4_fsblk_t goal, newblock; |
189 | 191 | ||
190 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); | 192 | goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); |
191 | newblock = ext4_new_block(handle, inode, goal, err); | 193 | newblock = ext4_new_meta_block(handle, inode, goal, err); |
192 | return newblock; | 194 | return newblock; |
193 | } | 195 | } |
194 | 196 | ||
@@ -246,6 +248,36 @@ static int ext4_ext_space_root_idx(struct inode *inode) | |||
246 | return size; | 248 | return size; |
247 | } | 249 | } |
248 | 250 | ||
251 | /* | ||
252 | * Calculate the number of metadata blocks needed | ||
253 | * to allocate @blocks | ||
254 | * Worst case is one block per extent | ||
255 | */ | ||
256 | int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) | ||
257 | { | ||
258 | int lcap, icap, rcap, leafs, idxs, num; | ||
259 | int newextents = blocks; | ||
260 | |||
261 | rcap = ext4_ext_space_root_idx(inode); | ||
262 | lcap = ext4_ext_space_block(inode); | ||
263 | icap = ext4_ext_space_block_idx(inode); | ||
264 | |||
265 | /* number of new leaf blocks needed */ | ||
266 | num = leafs = (newextents + lcap - 1) / lcap; | ||
267 | |||
268 | /* | ||
269 | * Worst case, we need separate index block(s) | ||
270 | * to link all new leaf blocks | ||
271 | */ | ||
272 | idxs = (leafs + icap - 1) / icap; | ||
273 | do { | ||
274 | num += idxs; | ||
275 | idxs = (idxs + icap - 1) / icap; | ||
276 | } while (idxs > rcap); | ||
277 | |||
278 | return num; | ||
279 | } | ||
280 | |||
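The estimate above can be sanity-checked in user space. The capacities below are assumptions standing in for ext4_ext_space_block(), ext4_ext_space_block_idx() and ext4_ext_space_root_idx() (with 4 KiB blocks the real leaf/index capacities are in the low hundreds); the loop itself mirrors the patch:

    #include <stdio.h>

    #define LCAP 340   /* extents per leaf block (assumed) */
    #define ICAP 340   /* indexes per index block (assumed) */
    #define RCAP 4     /* indexes in the inode's root (assumed) */

    /* Mirrors the function above: one leaf per LCAP new extents, then
     * enough index blocks to link the leaves, then index blocks to link
     * those, until the remainder fits in the root. */
    static int calc_metadata_amount(int blocks)
    {
            int newextents = blocks;   /* worst case: 1 extent per block */
            int num, leafs, idxs;

            num = leafs = (newextents + LCAP - 1) / LCAP;
            idxs = (leafs + ICAP - 1) / ICAP;
            do {
                    num += idxs;
                    idxs = (idxs + ICAP - 1) / ICAP;
            } while (idxs > RCAP);

            return num;
    }

    int main(void)
    {
            /* 100000 blocks -> 295 leaves + 1 index block = 296 */
            printf("%d\n", calc_metadata_amount(100000));
            return 0;
    }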
249 | static int | 281 | static int |
250 | ext4_ext_max_entries(struct inode *inode, int depth) | 282 | ext4_ext_max_entries(struct inode *inode, int depth) |
251 | { | 283 | { |
@@ -524,6 +556,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
524 | alloc = 1; | 556 | alloc = 1; |
525 | } | 557 | } |
526 | path[0].p_hdr = eh; | 558 | path[0].p_hdr = eh; |
559 | path[0].p_bh = NULL; | ||
527 | 560 | ||
528 | i = depth; | 561 | i = depth; |
529 | /* walk through the tree */ | 562 | /* walk through the tree */ |
@@ -552,12 +585,14 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, | |||
552 | } | 585 | } |
553 | 586 | ||
554 | path[ppos].p_depth = i; | 587 | path[ppos].p_depth = i; |
555 | path[ppos].p_hdr = eh; | ||
556 | path[ppos].p_ext = NULL; | 588 | path[ppos].p_ext = NULL; |
557 | path[ppos].p_idx = NULL; | 589 | path[ppos].p_idx = NULL; |
558 | 590 | ||
559 | /* find extent */ | 591 | /* find extent */ |
560 | ext4_ext_binsearch(inode, path + ppos, block); | 592 | ext4_ext_binsearch(inode, path + ppos, block); |
593 | /* if not an empty leaf */ | ||
594 | if (path[ppos].p_ext) | ||
595 | path[ppos].p_block = ext_pblock(path[ppos].p_ext); | ||
561 | 596 | ||
562 | ext4_ext_show_path(inode, path); | 597 | ext4_ext_show_path(inode, path); |
563 | 598 | ||
@@ -688,7 +723,8 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
688 | /* allocate all needed blocks */ | 723 | /* allocate all needed blocks */ |
689 | ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); | 724 | ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); |
690 | for (a = 0; a < depth - at; a++) { | 725 | for (a = 0; a < depth - at; a++) { |
691 | newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | 726 | newblock = ext4_ext_new_meta_block(handle, inode, path, |
727 | newext, &err); | ||
692 | if (newblock == 0) | 728 | if (newblock == 0) |
693 | goto cleanup; | 729 | goto cleanup; |
694 | ablocks[a] = newblock; | 730 | ablocks[a] = newblock; |
@@ -884,7 +920,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
884 | ext4_fsblk_t newblock; | 920 | ext4_fsblk_t newblock; |
885 | int err = 0; | 921 | int err = 0; |
886 | 922 | ||
887 | newblock = ext4_ext_new_block(handle, inode, path, newext, &err); | 923 | newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err); |
888 | if (newblock == 0) | 924 | if (newblock == 0) |
889 | return err; | 925 | return err; |
890 | 926 | ||
@@ -981,6 +1017,8 @@ repeat: | |||
981 | /* if we found index with free entry, then use that | 1017 | /* if we found index with free entry, then use that |
982 | * entry: create all needed subtree and add new leaf */ | 1018 | * entry: create all needed subtree and add new leaf */ |
983 | err = ext4_ext_split(handle, inode, path, newext, i); | 1019 | err = ext4_ext_split(handle, inode, path, newext, i); |
1020 | if (err) | ||
1021 | goto out; | ||
984 | 1022 | ||
985 | /* refill path */ | 1023 | /* refill path */ |
986 | ext4_ext_drop_refs(path); | 1024 | ext4_ext_drop_refs(path); |
@@ -1883,11 +1921,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
1883 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | 1921 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
1884 | #endif | 1922 | #endif |
1885 | 1923 | ||
1886 | handle = ext4_ext_journal_restart(handle, credits); | 1924 | err = ext4_ext_journal_restart(handle, credits); |
1887 | if (IS_ERR(handle)) { | 1925 | if (err) |
1888 | err = PTR_ERR(handle); | ||
1889 | goto out; | 1926 | goto out; |
1890 | } | ||
1891 | 1927 | ||
1892 | err = ext4_ext_get_access(handle, inode, path + depth); | 1928 | err = ext4_ext_get_access(handle, inode, path + depth); |
1893 | if (err) | 1929 | if (err) |
@@ -2529,6 +2565,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2529 | int err = 0, depth, ret; | 2565 | int err = 0, depth, ret; |
2530 | unsigned long allocated = 0; | 2566 | unsigned long allocated = 0; |
2531 | struct ext4_allocation_request ar; | 2567 | struct ext4_allocation_request ar; |
2568 | loff_t disksize; | ||
2532 | 2569 | ||
2533 | __clear_bit(BH_New, &bh_result->b_state); | 2570 | __clear_bit(BH_New, &bh_result->b_state); |
2534 | ext_debug("blocks %u/%lu requested for inode %u\n", | 2571 | ext_debug("blocks %u/%lu requested for inode %u\n", |
@@ -2616,8 +2653,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2616 | */ | 2653 | */ |
2617 | if (allocated > max_blocks) | 2654 | if (allocated > max_blocks) |
2618 | allocated = max_blocks; | 2655 | allocated = max_blocks; |
2619 | /* mark the buffer unwritten */ | 2656 | set_buffer_unwritten(bh_result); |
2620 | __set_bit(BH_Unwritten, &bh_result->b_state); | ||
2621 | goto out2; | 2657 | goto out2; |
2622 | } | 2658 | } |
2623 | 2659 | ||
@@ -2716,14 +2752,19 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2716 | goto out2; | 2752 | goto out2; |
2717 | } | 2753 | } |
2718 | 2754 | ||
2719 | if (extend_disksize && inode->i_size > EXT4_I(inode)->i_disksize) | ||
2720 | EXT4_I(inode)->i_disksize = inode->i_size; | ||
2721 | |||
2722 | /* previous routine could use block we allocated */ | 2755 | /* previous routine could use block we allocated */ |
2723 | newblock = ext_pblock(&newex); | 2756 | newblock = ext_pblock(&newex); |
2724 | allocated = ext4_ext_get_actual_len(&newex); | 2757 | allocated = ext4_ext_get_actual_len(&newex); |
2725 | outnew: | 2758 | outnew: |
2726 | __set_bit(BH_New, &bh_result->b_state); | 2759 | if (extend_disksize) { |
2760 | disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; | ||
2761 | if (disksize > i_size_read(inode)) | ||
2762 | disksize = i_size_read(inode); | ||
2763 | if (disksize > EXT4_I(inode)->i_disksize) | ||
2764 | EXT4_I(inode)->i_disksize = disksize; | ||
2765 | } | ||
2766 | |||
2767 | set_buffer_new(bh_result); | ||
2727 | 2768 | ||
2728 | /* Cache only when it is _not_ an uninitialized extent */ | 2769 | /* Cache only when it is _not_ an uninitialized extent */ |
2729 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) | 2770 | if (create != EXT4_CREATE_UNINITIALIZED_EXT) |
@@ -2733,7 +2774,7 @@ out: | |||
2733 | if (allocated > max_blocks) | 2774 | if (allocated > max_blocks) |
2734 | allocated = max_blocks; | 2775 | allocated = max_blocks; |
2735 | ext4_ext_show_leaf(inode, path); | 2776 | ext4_ext_show_leaf(inode, path); |
2736 | __set_bit(BH_Mapped, &bh_result->b_state); | 2777 | set_buffer_mapped(bh_result); |
2737 | bh_result->b_bdev = inode->i_sb->s_bdev; | 2778 | bh_result->b_bdev = inode->i_sb->s_bdev; |
2738 | bh_result->b_blocknr = newblock; | 2779 | bh_result->b_blocknr = newblock; |
2739 | out2: | 2780 | out2: |
@@ -2744,7 +2785,7 @@ out2: | |||
2744 | return err ? err : allocated; | 2785 | return err ? err : allocated; |
2745 | } | 2786 | } |
2746 | 2787 | ||
2747 | void ext4_ext_truncate(struct inode * inode, struct page *page) | 2788 | void ext4_ext_truncate(struct inode *inode) |
2748 | { | 2789 | { |
2749 | struct address_space *mapping = inode->i_mapping; | 2790 | struct address_space *mapping = inode->i_mapping; |
2750 | struct super_block *sb = inode->i_sb; | 2791 | struct super_block *sb = inode->i_sb; |
@@ -2757,18 +2798,14 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2757 | */ | 2798 | */ |
2758 | err = ext4_writepage_trans_blocks(inode) + 3; | 2799 | err = ext4_writepage_trans_blocks(inode) + 3; |
2759 | handle = ext4_journal_start(inode, err); | 2800 | handle = ext4_journal_start(inode, err); |
2760 | if (IS_ERR(handle)) { | 2801 | if (IS_ERR(handle)) |
2761 | if (page) { | ||
2762 | clear_highpage(page); | ||
2763 | flush_dcache_page(page); | ||
2764 | unlock_page(page); | ||
2765 | page_cache_release(page); | ||
2766 | } | ||
2767 | return; | 2802 | return; |
2768 | } | ||
2769 | 2803 | ||
2770 | if (page) | 2804 | if (inode->i_size & (sb->s_blocksize - 1)) |
2771 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 2805 | ext4_block_truncate_page(handle, mapping, inode->i_size); |
2806 | |||
2807 | if (ext4_orphan_add(handle, inode)) | ||
2808 | goto out_stop; | ||
2772 | 2809 | ||
2773 | down_write(&EXT4_I(inode)->i_data_sem); | 2810 | down_write(&EXT4_I(inode)->i_data_sem); |
2774 | ext4_ext_invalidate_cache(inode); | 2811 | ext4_ext_invalidate_cache(inode); |
@@ -2780,8 +2817,6 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2780 | * Probably we need not scan at all, | 2817 | * Probably we need not scan at all, |
2781 | * because page truncation is enough. | 2818 | * because page truncation is enough. |
2782 | */ | 2819 | */ |
2783 | if (ext4_orphan_add(handle, inode)) | ||
2784 | goto out_stop; | ||
2785 | 2820 | ||
2786 | /* we have to know where to truncate from in crash case */ | 2821 | /* we have to know where to truncate from in crash case */ |
2787 | EXT4_I(inode)->i_disksize = inode->i_size; | 2822 | EXT4_I(inode)->i_disksize = inode->i_size; |
@@ -2798,6 +2833,7 @@ void ext4_ext_truncate(struct inode * inode, struct page *page) | |||
2798 | handle->h_sync = 1; | 2833 | handle->h_sync = 1; |
2799 | 2834 | ||
2800 | out_stop: | 2835 | out_stop: |
2836 | up_write(&EXT4_I(inode)->i_data_sem); | ||
2801 | /* | 2837 | /* |
2802 | * If this was a simple ftruncate() and the file will remain alive, | 2838 | * If this was a simple ftruncate() and the file will remain alive, |
2803 | * then we need to clear up the orphan record which we created above. | 2839 | * then we need to clear up the orphan record which we created above. |
@@ -2808,7 +2844,6 @@ out_stop: | |||
2808 | if (inode->i_nlink) | 2844 | if (inode->i_nlink) |
2809 | ext4_orphan_del(handle, inode); | 2845 | ext4_orphan_del(handle, inode); |
2810 | 2846 | ||
2811 | up_write(&EXT4_I(inode)->i_data_sem); | ||
2812 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); | 2847 | inode->i_mtime = inode->i_ctime = ext4_current_time(inode); |
2813 | ext4_mark_inode_dirty(handle, inode); | 2848 | ext4_mark_inode_dirty(handle, inode); |
2814 | ext4_journal_stop(handle); | 2849 | ext4_journal_stop(handle); |
@@ -2911,7 +2946,7 @@ retry: | |||
2911 | } | 2946 | } |
2912 | ret = ext4_get_blocks_wrap(handle, inode, block, | 2947 | ret = ext4_get_blocks_wrap(handle, inode, block, |
2913 | max_blocks, &map_bh, | 2948 | max_blocks, &map_bh, |
2914 | EXT4_CREATE_UNINITIALIZED_EXT, 0); | 2949 | EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); |
2915 | if (ret <= 0) { | 2950 | if (ret <= 0) { |
2916 | #ifdef EXT4FS_DEBUG | 2951 | #ifdef EXT4FS_DEBUG |
2917 | WARN_ON(ret <= 0); | 2952 | WARN_ON(ret <= 0); |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 4159be6366ab..430eb7978db4 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -123,6 +123,23 @@ force_commit: | |||
123 | return ret; | 123 | return ret; |
124 | } | 124 | } |
125 | 125 | ||
126 | static struct vm_operations_struct ext4_file_vm_ops = { | ||
127 | .fault = filemap_fault, | ||
128 | .page_mkwrite = ext4_page_mkwrite, | ||
129 | }; | ||
130 | |||
131 | static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) | ||
132 | { | ||
133 | struct address_space *mapping = file->f_mapping; | ||
134 | |||
135 | if (!mapping->a_ops->readpage) | ||
136 | return -ENOEXEC; | ||
137 | file_accessed(file); | ||
138 | vma->vm_ops = &ext4_file_vm_ops; | ||
139 | vma->vm_flags |= VM_CAN_NONLINEAR; | ||
140 | return 0; | ||
141 | } | ||
142 | |||
126 | const struct file_operations ext4_file_operations = { | 143 | const struct file_operations ext4_file_operations = { |
127 | .llseek = generic_file_llseek, | 144 | .llseek = generic_file_llseek, |
128 | .read = do_sync_read, | 145 | .read = do_sync_read, |
@@ -133,7 +150,7 @@ const struct file_operations ext4_file_operations = { | |||
133 | #ifdef CONFIG_COMPAT | 150 | #ifdef CONFIG_COMPAT |
134 | .compat_ioctl = ext4_compat_ioctl, | 151 | .compat_ioctl = ext4_compat_ioctl, |
135 | #endif | 152 | #endif |
136 | .mmap = generic_file_mmap, | 153 | .mmap = ext4_file_mmap, |
137 | .open = generic_file_open, | 154 | .open = generic_file_open, |
138 | .release = ext4_release_file, | 155 | .release = ext4_release_file, |
139 | .fsync = ext4_sync_file, | 156 | .fsync = ext4_sync_file, |
@@ -144,6 +161,7 @@ const struct file_operations ext4_file_operations = { | |||
144 | const struct inode_operations ext4_file_inode_operations = { | 161 | const struct inode_operations ext4_file_inode_operations = { |
145 | .truncate = ext4_truncate, | 162 | .truncate = ext4_truncate, |
146 | .setattr = ext4_setattr, | 163 | .setattr = ext4_setattr, |
164 | .getattr = ext4_getattr, | ||
147 | #ifdef CONFIG_EXT4DEV_FS_XATTR | 165 | #ifdef CONFIG_EXT4DEV_FS_XATTR |
148 | .setxattr = generic_setxattr, | 166 | .setxattr = generic_setxattr, |
149 | .getxattr = generic_getxattr, | 167 | .getxattr = generic_getxattr, |
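Replacing generic_file_mmap() with a private vm_operations_struct is what lets the filesystem see write faults: .fault still goes to filemap_fault(), while .page_mkwrite (ext4_page_mkwrite, added elsewhere in this series) runs the first time a page of a shared mapping is dirtied, which is where delayed allocation can reserve blocks instead of hitting ENOSPC at writeback time. A reduced user-space sketch of the dispatch; the struct and both callbacks are stand-ins, and the two-argument page_mkwrite shape reflects the API of this era:

    #include <stdio.h>

    /* Reduced stand-ins for the kernel types: only the two callbacks
     * that matter here. */
    struct vma_stub;
    struct page_stub;

    struct vm_ops_stub {
            int (*fault)(struct vma_stub *vma);          /* read fault  */
            int (*page_mkwrite)(struct vma_stub *vma,    /* write fault */
                                struct page_stub *page);
    };

    static int filemap_fault_stub(struct vma_stub *vma)
    {
            (void)vma;
            puts("read fault: page brought in from the page cache");
            return 0;
    }

    static int page_mkwrite_stub(struct vma_stub *vma, struct page_stub *page)
    {
            (void)vma; (void)page;
            /* this is where ext4 can reserve blocks (delalloc) or fail
             * early rather than at writeback */
            puts("write fault: block reservation happens here");
            return 0;
    }

    static const struct vm_ops_stub file_vm_ops = {
            .fault = filemap_fault_stub,
            .page_mkwrite = page_mkwrite_stub,
    };

    int main(void)
    {
            file_vm_ops.fault(NULL);
            file_vm_ops.page_mkwrite(NULL, NULL);
            return 0;
    }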
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 1c8ba48d4f8d..a45c3737ad31 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -27,6 +27,7 @@ | |||
27 | #include <linux/sched.h> | 27 | #include <linux/sched.h> |
28 | #include <linux/writeback.h> | 28 | #include <linux/writeback.h> |
29 | #include <linux/jbd2.h> | 29 | #include <linux/jbd2.h> |
30 | #include <linux/blkdev.h> | ||
30 | #include "ext4.h" | 31 | #include "ext4.h" |
31 | #include "ext4_jbd2.h" | 32 | #include "ext4_jbd2.h" |
32 | 33 | ||
@@ -45,6 +46,7 @@ | |||
45 | int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | 46 | int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) |
46 | { | 47 | { |
47 | struct inode *inode = dentry->d_inode; | 48 | struct inode *inode = dentry->d_inode; |
49 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | ||
48 | int ret = 0; | 50 | int ret = 0; |
49 | 51 | ||
50 | J_ASSERT(ext4_journal_current_handle() == NULL); | 52 | J_ASSERT(ext4_journal_current_handle() == NULL); |
@@ -85,6 +87,8 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
85 | .nr_to_write = 0, /* sys_fsync did this */ | 87 | .nr_to_write = 0, /* sys_fsync did this */ |
86 | }; | 88 | }; |
87 | ret = sync_inode(inode, &wbc); | 89 | ret = sync_inode(inode, &wbc); |
90 | if (journal && (journal->j_flags & JBD2_BARRIER)) | ||
91 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
88 | } | 92 | } |
89 | out: | 93 | out: |
90 | return ret; | 94 | return ret; |
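The new flush covers a durability gap: when fsync() finds the relevant transaction already committed, no journal barrier is issued on its behalf, so the data may still sit in the drive's volatile write cache. Issuing blkdev_issue_flush() when the journal runs with barriers makes this path mean "on stable storage" as well. A loose sketch of the added branch, with the barrier test and the flush reduced to stubs:

    #include <stdio.h>
    #include <stdbool.h>

    /* stand-ins: whether the journal mounted with barriers, and the
     * device-level cache flush that blkdev_issue_flush() performs */
    static bool journal_barrier_enabled = true;

    static void issue_device_flush(void)
    {
            puts("FLUSH CACHE sent to the disk");
    }

    /* mirrors the fsync tail above: after syncing the inode, flush the
     * drive cache iff the journal is using barriers */
    static void fsync_tail(bool nothing_left_to_commit)
    {
            if (nothing_left_to_commit && journal_barrier_enabled)
                    issue_device_flush();
            else
                    puts("journal commit path carries its own barrier");
    }

    int main(void)
    {
            fsync_tail(true);   /* already committed -> explicit flush */
            fsync_tail(false);  /* commit path -> barrier inside commit */
            return 0;
    }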
diff --git a/fs/ext4/group.h b/fs/ext4/group.h index 7eb0604e7eea..c2c0a8d06d0e 100644 --- a/fs/ext4/group.h +++ b/fs/ext4/group.h | |||
@@ -13,7 +13,7 @@ extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, | |||
13 | struct ext4_group_desc *gdp); | 13 | struct ext4_group_desc *gdp); |
14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, | 14 | extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, |
15 | struct ext4_group_desc *gdp); | 15 | struct ext4_group_desc *gdp); |
16 | struct buffer_head *read_block_bitmap(struct super_block *sb, | 16 | struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, |
17 | ext4_group_t block_group); | 17 | ext4_group_t block_group); |
18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, | 18 | extern unsigned ext4_init_block_bitmap(struct super_block *sb, |
19 | struct buffer_head *bh, | 19 | struct buffer_head *bh, |
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c6efbab0c801..a92eb305344f 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -157,6 +157,7 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
157 | struct ext4_super_block * es; | 157 | struct ext4_super_block * es; |
158 | struct ext4_sb_info *sbi; | 158 | struct ext4_sb_info *sbi; |
159 | int fatal = 0, err; | 159 | int fatal = 0, err; |
160 | ext4_group_t flex_group; | ||
160 | 161 | ||
161 | if (atomic_read(&inode->i_count) > 1) { | 162 | if (atomic_read(&inode->i_count) > 1) { |
162 | printk ("ext4_free_inode: inode has count=%d\n", | 163 | printk ("ext4_free_inode: inode has count=%d\n", |
@@ -232,6 +233,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) | |||
232 | if (is_directory) | 233 | if (is_directory) |
233 | percpu_counter_dec(&sbi->s_dirs_counter); | 234 | percpu_counter_dec(&sbi->s_dirs_counter); |
234 | 235 | ||
236 | if (sbi->s_log_groups_per_flex) { | ||
237 | flex_group = ext4_flex_group(sbi, block_group); | ||
238 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
239 | sbi->s_flex_groups[flex_group].free_inodes++; | ||
240 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
241 | } | ||
235 | } | 242 | } |
236 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); | 243 | BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); |
237 | err = ext4_journal_dirty_metadata(handle, bh2); | 244 | err = ext4_journal_dirty_metadata(handle, bh2); |
@@ -286,6 +293,80 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, | |||
286 | return ret; | 293 | return ret; |
287 | } | 294 | } |
288 | 295 | ||
296 | #define free_block_ratio 10 | ||
297 | |||
298 | static int find_group_flex(struct super_block *sb, struct inode *parent, | ||
299 | ext4_group_t *best_group) | ||
300 | { | ||
301 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
302 | struct ext4_group_desc *desc; | ||
303 | struct buffer_head *bh; | ||
304 | struct flex_groups *flex_group = sbi->s_flex_groups; | ||
305 | ext4_group_t parent_group = EXT4_I(parent)->i_block_group; | ||
306 | ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); | ||
307 | ext4_group_t ngroups = sbi->s_groups_count; | ||
308 | int flex_size = ext4_flex_bg_size(sbi); | ||
309 | ext4_group_t best_flex = parent_fbg_group; | ||
310 | int blocks_per_flex = sbi->s_blocks_per_group * flex_size; | ||
311 | int flexbg_free_blocks; | ||
312 | int flex_freeb_ratio; | ||
313 | ext4_group_t n_fbg_groups; | ||
314 | ext4_group_t i; | ||
315 | |||
316 | n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> | ||
317 | sbi->s_log_groups_per_flex; | ||
318 | |||
319 | find_close_to_parent: | ||
320 | flexbg_free_blocks = flex_group[best_flex].free_blocks; | ||
321 | flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | ||
322 | if (flex_group[best_flex].free_inodes && | ||
323 | flex_freeb_ratio > free_block_ratio) | ||
324 | goto found_flexbg; | ||
325 | |||
326 | if (best_flex && best_flex == parent_fbg_group) { | ||
327 | best_flex--; | ||
328 | goto find_close_to_parent; | ||
329 | } | ||
330 | |||
331 | for (i = 0; i < n_fbg_groups; i++) { | ||
332 | if (i == parent_fbg_group || i == parent_fbg_group - 1) | ||
333 | continue; | ||
334 | |||
335 | flexbg_free_blocks = flex_group[i].free_blocks; | ||
336 | flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex; | ||
337 | |||
338 | if (flex_freeb_ratio > free_block_ratio && | ||
339 | flex_group[i].free_inodes) { | ||
340 | best_flex = i; | ||
341 | goto found_flexbg; | ||
342 | } | ||
343 | |||
344 | if (best_flex < 0 || | ||
345 | (flex_group[i].free_blocks > | ||
346 | flex_group[best_flex].free_blocks && | ||
347 | flex_group[i].free_inodes)) | ||
348 | best_flex = i; | ||
349 | } | ||
350 | |||
351 | if (!flex_group[best_flex].free_inodes || | ||
352 | !flex_group[best_flex].free_blocks) | ||
353 | return -1; | ||
354 | |||
355 | found_flexbg: | ||
356 | for (i = best_flex * flex_size; i < ngroups && | ||
357 | i < (best_flex + 1) * flex_size; i++) { | ||
358 | desc = ext4_get_group_desc(sb, i, &bh); | ||
359 | if (le16_to_cpu(desc->bg_free_inodes_count)) { | ||
360 | *best_group = i; | ||
361 | goto out; | ||
362 | } | ||
363 | } | ||
364 | |||
365 | return -1; | ||
366 | out: | ||
367 | return 0; | ||
368 | } | ||
369 | |||
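The heuristic above prefers the flex group of the parent directory as long as it has free inodes and more than free_block_ratio (10) percent of its blocks free, then falls back to scanning all flex groups for the best candidate. A minimal user-space sketch of just the acceptance test; the struct and the sizes in main() are assumptions, not kernel values:

    #include <stdio.h>

    /* stand-in for struct flex_groups: per-flex-group free counts */
    struct flex_stub {
            int free_inodes;
            int free_blocks;
    };

    #define FREE_BLOCK_RATIO 10   /* as in the patch */

    /* Mirrors the "found_flexbg" acceptance test: a flex group is usable
     * when it has a free inode and more than 10% of its blocks free. */
    static int flex_group_usable(const struct flex_stub *fg, int blocks_per_flex)
    {
            int ratio = fg->free_blocks * 100 / blocks_per_flex;

            return fg->free_inodes && ratio > FREE_BLOCK_RATIO;
    }

    int main(void)
    {
            struct flex_stub nearly_full = { .free_inodes = 12, .free_blocks = 500 };
            struct flex_stub roomy       = { .free_inodes = 12, .free_blocks = 9000 };

            /* assuming 16 groups of 2048 blocks per flex group */
            int blocks_per_flex = 16 * 2048;

            /* 0: ~1.5% free */
            printf("nearly_full usable: %d\n",
                   flex_group_usable(&nearly_full, blocks_per_flex));
            /* 1: ~27% free */
            printf("roomy usable:       %d\n",
                   flex_group_usable(&roomy, blocks_per_flex));
            return 0;
    }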
289 | /* | 370 | /* |
290 | * Orlov's allocator for directories. | 371 | * Orlov's allocator for directories. |
291 | * | 372 | * |
@@ -501,6 +582,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
501 | struct inode *ret; | 582 | struct inode *ret; |
502 | ext4_group_t i; | 583 | ext4_group_t i; |
503 | int free = 0; | 584 | int free = 0; |
585 | ext4_group_t flex_group; | ||
504 | 586 | ||
505 | /* Cannot create files in a deleted directory */ | 587 | /* Cannot create files in a deleted directory */ |
506 | if (!dir || !dir->i_nlink) | 588 | if (!dir || !dir->i_nlink) |
@@ -514,6 +596,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
514 | 596 | ||
515 | sbi = EXT4_SB(sb); | 597 | sbi = EXT4_SB(sb); |
516 | es = sbi->s_es; | 598 | es = sbi->s_es; |
599 | |||
600 | if (sbi->s_log_groups_per_flex) { | ||
601 | ret2 = find_group_flex(sb, dir, &group); | ||
602 | goto got_group; | ||
603 | } | ||
604 | |||
517 | if (S_ISDIR(mode)) { | 605 | if (S_ISDIR(mode)) { |
518 | if (test_opt (sb, OLDALLOC)) | 606 | if (test_opt (sb, OLDALLOC)) |
519 | ret2 = find_group_dir(sb, dir, &group); | 607 | ret2 = find_group_dir(sb, dir, &group); |
@@ -522,6 +610,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) | |||
522 | } else | 610 | } else |
523 | ret2 = find_group_other(sb, dir, &group); | 611 | ret2 = find_group_other(sb, dir, &group); |
524 | 612 | ||
613 | got_group: | ||
525 | err = -ENOSPC; | 614 | err = -ENOSPC; |
526 | if (ret2 == -1) | 615 | if (ret2 == -1) |
527 | goto out; | 616 | goto out; |
@@ -600,7 +689,7 @@ got: | |||
600 | /* We may have to initialize the block bitmap if it isn't already */ | 689 | /* We may have to initialize the block bitmap if it isn't already */ |
601 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && | 690 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && |
602 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | 691 | gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { |
603 | struct buffer_head *block_bh = read_block_bitmap(sb, group); | 692 | struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); |
604 | 693 | ||
605 | BUFFER_TRACE(block_bh, "get block bitmap access"); | 694 | BUFFER_TRACE(block_bh, "get block bitmap access"); |
606 | err = ext4_journal_get_write_access(handle, block_bh); | 695 | err = ext4_journal_get_write_access(handle, block_bh); |
@@ -676,6 +765,13 @@ got: | |||
676 | percpu_counter_inc(&sbi->s_dirs_counter); | 765 | percpu_counter_inc(&sbi->s_dirs_counter); |
677 | sb->s_dirt = 1; | 766 | sb->s_dirt = 1; |
678 | 767 | ||
768 | if (sbi->s_log_groups_per_flex) { | ||
769 | flex_group = ext4_flex_group(sbi, group); | ||
770 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
771 | sbi->s_flex_groups[flex_group].free_inodes--; | ||
772 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
773 | } | ||
774 | |||
679 | inode->i_uid = current->fsuid; | 775 | inode->i_uid = current->fsuid; |
680 | if (test_opt (sb, GRPID)) | 776 | if (test_opt (sb, GRPID)) |
681 | inode->i_gid = dir->i_gid; | 777 | inode->i_gid = dir->i_gid; |
@@ -740,14 +836,10 @@ got: | |||
740 | goto fail_free_drop; | 836 | goto fail_free_drop; |
741 | 837 | ||
742 | if (test_opt(sb, EXTENTS)) { | 838 | if (test_opt(sb, EXTENTS)) { |
743 | /* set extent flag only for diretory, file and normal symlink*/ | 839 | /* set extent flag only for directory, file and normal symlink*/ |
744 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { | 840 | if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { |
745 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; | 841 | EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; |
746 | ext4_ext_tree_init(handle, inode); | 842 | ext4_ext_tree_init(handle, inode); |
747 | err = ext4_update_incompat_feature(handle, sb, | ||
748 | EXT4_FEATURE_INCOMPAT_EXTENTS); | ||
749 | if (err) | ||
750 | goto fail_free_drop; | ||
751 | } | 843 | } |
752 | } | 844 | } |
753 | 845 | ||
@@ -817,6 +909,14 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) | |||
817 | if (IS_ERR(inode)) | 909 | if (IS_ERR(inode)) |
818 | goto iget_failed; | 910 | goto iget_failed; |
819 | 911 | ||
912 | /* | ||
913 | * If the orphan has i_nlink > 0 then it should be able to be | ||
914 | * truncated, otherwise it won't be removed from the orphan list | ||
915 | * during processing and an infinite loop will result. | ||
916 | */ | ||
917 | if (inode->i_nlink && !ext4_can_truncate(inode)) | ||
918 | goto bad_orphan; | ||
919 | |||
820 | if (NEXT_ORPHAN(inode) > max_ino) | 920 | if (NEXT_ORPHAN(inode) > max_ino) |
821 | goto bad_orphan; | 921 | goto bad_orphan; |
822 | brelse(bitmap_bh); | 922 | brelse(bitmap_bh); |
@@ -838,6 +938,7 @@ bad_orphan: | |||
838 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", | 938 | printk(KERN_NOTICE "NEXT_ORPHAN(inode)=%u\n", |
839 | NEXT_ORPHAN(inode)); | 939 | NEXT_ORPHAN(inode)); |
840 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); | 940 | printk(KERN_NOTICE "max_ino=%lu\n", max_ino); |
941 | printk(KERN_NOTICE "i_nlink=%u\n", inode->i_nlink); | ||
841 | /* Avoid freeing blocks if we got a bad deleted inode */ | 942 | /* Avoid freeing blocks if we got a bad deleted inode */ |
842 | if (inode->i_nlink == 0) | 943 | if (inode->i_nlink == 0) |
843 | inode->i_blocks = 0; | 944 | inode->i_blocks = 0; |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8d9707746413..8ca2763df091 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -32,12 +32,23 @@ | |||
32 | #include <linux/string.h> | 32 | #include <linux/string.h> |
33 | #include <linux/buffer_head.h> | 33 | #include <linux/buffer_head.h> |
34 | #include <linux/writeback.h> | 34 | #include <linux/writeback.h> |
35 | #include <linux/pagevec.h> | ||
35 | #include <linux/mpage.h> | 36 | #include <linux/mpage.h> |
36 | #include <linux/uio.h> | 37 | #include <linux/uio.h> |
37 | #include <linux/bio.h> | 38 | #include <linux/bio.h> |
38 | #include "ext4_jbd2.h" | 39 | #include "ext4_jbd2.h" |
39 | #include "xattr.h" | 40 | #include "xattr.h" |
40 | #include "acl.h" | 41 | #include "acl.h" |
42 | #include "ext4_extents.h" | ||
43 | |||
44 | static inline int ext4_begin_ordered_truncate(struct inode *inode, | ||
45 | loff_t new_size) | ||
46 | { | ||
47 | return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, | ||
48 | new_size); | ||
49 | } | ||
50 | |||
51 | static void ext4_invalidatepage(struct page *page, unsigned long offset); | ||
41 | 52 | ||
42 | /* | 53 | /* |
43 | * Test whether an inode is a fast symlink. | 54 | * Test whether an inode is a fast symlink. |
@@ -181,6 +192,8 @@ void ext4_delete_inode (struct inode * inode) | |||
181 | { | 192 | { |
182 | handle_t *handle; | 193 | handle_t *handle; |
183 | 194 | ||
195 | if (ext4_should_order_data(inode)) | ||
196 | ext4_begin_ordered_truncate(inode, 0); | ||
184 | truncate_inode_pages(&inode->i_data, 0); | 197 | truncate_inode_pages(&inode->i_data, 0); |
185 | 198 | ||
186 | if (is_bad_inode(inode)) | 199 | if (is_bad_inode(inode)) |
@@ -508,11 +521,12 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, | |||
508 | * direct blocks | 521 | * direct blocks |
509 | */ | 522 | */ |
510 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | 523 | static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, |
511 | ext4_fsblk_t goal, int indirect_blks, int blks, | 524 | ext4_lblk_t iblock, ext4_fsblk_t goal, |
512 | ext4_fsblk_t new_blocks[4], int *err) | 525 | int indirect_blks, int blks, |
526 | ext4_fsblk_t new_blocks[4], int *err) | ||
513 | { | 527 | { |
514 | int target, i; | 528 | int target, i; |
515 | unsigned long count = 0; | 529 | unsigned long count = 0, blk_allocated = 0; |
516 | int index = 0; | 530 | int index = 0; |
517 | ext4_fsblk_t current_block = 0; | 531 | ext4_fsblk_t current_block = 0; |
518 | int ret = 0; | 532 | int ret = 0; |
@@ -525,12 +539,13 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
525 | * the first direct block of this branch. That's the | 539 | * the first direct block of this branch. That's the |
526 | * minimum number of blocks need to allocate(required) | 540 | * minimum number of blocks need to allocate(required) |
527 | */ | 541 | */ |
528 | target = blks + indirect_blks; | 542 | /* first we try to allocate the indirect blocks */ |
529 | 543 | target = indirect_blks; | |
530 | while (1) { | 544 | while (target > 0) { |
531 | count = target; | 545 | count = target; |
532 | /* allocating blocks for indirect blocks and direct blocks */ | 546 | /* allocating blocks for indirect blocks and direct blocks */ |
533 | current_block = ext4_new_blocks(handle,inode,goal,&count,err); | 547 | current_block = ext4_new_meta_blocks(handle, inode, |
548 | goal, &count, err); | ||
534 | if (*err) | 549 | if (*err) |
535 | goto failed_out; | 550 | goto failed_out; |
536 | 551 | ||
@@ -540,16 +555,48 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
540 | new_blocks[index++] = current_block++; | 555 | new_blocks[index++] = current_block++; |
541 | count--; | 556 | count--; |
542 | } | 557 | } |
543 | 558 | if (count > 0) { | |
544 | if (count > 0) | 559 | /* |
560 | * save the new block number | ||
561 | * for the first direct block | ||
562 | */ | ||
563 | new_blocks[index] = current_block; | ||
564 | printk(KERN_INFO "%s returned more blocks than " | ||
565 | "requested\n", __func__); | ||
566 | WARN_ON(1); | ||
545 | break; | 567 | break; |
568 | } | ||
546 | } | 569 | } |
547 | 570 | ||
548 | /* save the new block number for the first direct block */ | 571 | target = blks - count; |
549 | new_blocks[index] = current_block; | 572 | blk_allocated = count; |
550 | 573 | if (!target) | |
574 | goto allocated; | ||
575 | /* Now allocate data blocks */ | ||
576 | count = target; | ||
577 | /* allocating blocks for data blocks */ | ||
578 | current_block = ext4_new_blocks(handle, inode, iblock, | ||
579 | goal, &count, err); | ||
580 | if (*err && (target == blks)) { | ||
581 | /* | ||
582 | * if the allocation failed and we didn't allocate | ||
583 | * any blocks before | ||
584 | */ | ||
585 | goto failed_out; | ||
586 | } | ||
587 | if (!*err) { | ||
588 | if (target == blks) { | ||
589 | /* | ||
590 | * save the new block number | ||
591 | * for the first direct block | ||
592 | */ | ||
593 | new_blocks[index] = current_block; | ||
594 | } | ||
595 | blk_allocated += count; | ||
596 | } | ||
597 | allocated: | ||
551 | /* total number of blocks allocated for direct blocks */ | 598 | /* total number of blocks allocated for direct blocks */ |
552 | ret = count; | 599 | ret = blk_allocated; |
553 | *err = 0; | 600 | *err = 0; |
554 | return ret; | 601 | return ret; |
555 | failed_out: | 602 | failed_out: |
@@ -584,8 +631,9 @@ failed_out: | |||
584 | * as described above and return 0. | 631 | * as described above and return 0. |
585 | */ | 632 | */ |
586 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | 633 | static int ext4_alloc_branch(handle_t *handle, struct inode *inode, |
587 | int indirect_blks, int *blks, ext4_fsblk_t goal, | 634 | ext4_lblk_t iblock, int indirect_blks, |
588 | ext4_lblk_t *offsets, Indirect *branch) | 635 | int *blks, ext4_fsblk_t goal, |
636 | ext4_lblk_t *offsets, Indirect *branch) | ||
589 | { | 637 | { |
590 | int blocksize = inode->i_sb->s_blocksize; | 638 | int blocksize = inode->i_sb->s_blocksize; |
591 | int i, n = 0; | 639 | int i, n = 0; |
@@ -595,7 +643,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
595 | ext4_fsblk_t new_blocks[4]; | 643 | ext4_fsblk_t new_blocks[4]; |
596 | ext4_fsblk_t current_block; | 644 | ext4_fsblk_t current_block; |
597 | 645 | ||
598 | num = ext4_alloc_blocks(handle, inode, goal, indirect_blks, | 646 | num = ext4_alloc_blocks(handle, inode, iblock, goal, indirect_blks, |
599 | *blks, new_blocks, &err); | 647 | *blks, new_blocks, &err); |
600 | if (err) | 648 | if (err) |
601 | return err; | 649 | return err; |
@@ -799,6 +847,7 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
799 | struct ext4_inode_info *ei = EXT4_I(inode); | 847 | struct ext4_inode_info *ei = EXT4_I(inode); |
800 | int count = 0; | 848 | int count = 0; |
801 | ext4_fsblk_t first_block = 0; | 849 | ext4_fsblk_t first_block = 0; |
850 | loff_t disksize; | ||
802 | 851 | ||
803 | 852 | ||
804 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); | 853 | J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); |
@@ -855,8 +904,9 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
855 | /* | 904 | /* |
856 | * Block out ext4_truncate while we alter the tree | 905 | * Block out ext4_truncate while we alter the tree |
857 | */ | 906 | */ |
858 | err = ext4_alloc_branch(handle, inode, indirect_blks, &count, goal, | 907 | err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, |
859 | offsets + (partial - chain), partial); | 908 | &count, goal, |
909 | offsets + (partial - chain), partial); | ||
860 | 910 | ||
861 | /* | 911 | /* |
862 | * The ext4_splice_branch call will free and forget any buffers | 912 | * The ext4_splice_branch call will free and forget any buffers |
@@ -873,8 +923,13 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
873 | * protect it if you're about to implement concurrent | 923 | * protect it if you're about to implement concurrent |
874 | * ext4_get_block() -bzzz | 924 | * ext4_get_block() -bzzz |
875 | */ | 925 | */ |
876 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) | 926 | if (!err && extend_disksize) { |
877 | ei->i_disksize = inode->i_size; | 927 | disksize = ((loff_t) iblock + count) << inode->i_blkbits; |
928 | if (disksize > i_size_read(inode)) | ||
929 | disksize = i_size_read(inode); | ||
930 | if (disksize > ei->i_disksize) | ||
931 | ei->i_disksize = disksize; | ||
932 | } | ||
878 | if (err) | 933 | if (err) |
879 | goto cleanup; | 934 | goto cleanup; |
880 | 935 | ||
@@ -934,7 +989,7 @@ out: | |||
934 | */ | 989 | */ |
935 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | 990 | int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, |
936 | unsigned long max_blocks, struct buffer_head *bh, | 991 | unsigned long max_blocks, struct buffer_head *bh, |
937 | int create, int extend_disksize) | 992 | int create, int extend_disksize, int flag) |
938 | { | 993 | { |
939 | int retval; | 994 | int retval; |
940 | 995 | ||
@@ -975,6 +1030,15 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
975 | * with create == 1 flag. | 1030 | * with create == 1 flag. |
976 | */ | 1031 | */ |
977 | down_write((&EXT4_I(inode)->i_data_sem)); | 1032 | down_write((&EXT4_I(inode)->i_data_sem)); |
1033 | |||
1034 | /* | ||
1035 | * if the caller is from delayed allocation writeout path | ||
1036 | * we have already reserved fs blocks for allocation | ||
1037 | * let the underlying get_block() function know to | ||
1038 | * avoid double accounting | ||
1039 | */ | ||
1040 | if (flag) | ||
1041 | EXT4_I(inode)->i_delalloc_reserved_flag = 1; | ||
978 | /* | 1042 | /* |
979 | * We need to check for EXT4 here because migrate | 1043 | * We need to check for EXT4 here because migrate |
980 | * could have changed the inode type in between | 1044 | * could have changed the inode type in between |
@@ -996,6 +1060,18 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, | |||
996 | ~EXT4_EXT_MIGRATE; | 1060 | ~EXT4_EXT_MIGRATE; |
997 | } | 1061 | } |
998 | } | 1062 | } |
1063 | |||
1064 | if (flag) { | ||
1065 | EXT4_I(inode)->i_delalloc_reserved_flag = 0; | ||
1066 | /* | ||
1067 | * Update reserved blocks/metadata blocks | ||
1068 | * after successful block allocation | ||
1069 | * which were deferred till now | ||
1070 | */ | ||
1071 | if ((retval > 0) && buffer_delay(bh)) | ||
1072 | ext4_da_release_space(inode, retval, 0); | ||
1073 | } | ||
1074 | |||
999 | up_write((&EXT4_I(inode)->i_data_sem)); | 1075 | up_write((&EXT4_I(inode)->i_data_sem)); |
1000 | return retval; | 1076 | return retval; |
1001 | } | 1077 | } |
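The new flag argument exists to avoid double accounting: delalloc writes were already charged against the free-block count at write_begin time, so when writeback finally allocates the blocks the allocator must not charge them again, and the per-inode reservation is trimmed via ext4_da_release_space() once allocation succeeds, all under i_data_sem held for write. A stubbed sketch of that handshake (lock reduced to comments, allocator to a printf):

    #include <stdio.h>

    struct inode_stub {
            int delalloc_reserved;  /* i_delalloc_reserved_flag stand-in */
            int reserved_blocks;    /* charged earlier, at write_begin time */
    };

    /* A real allocator skips the free-count charge when the flag is
     * set -- that is the double accounting being avoided. */
    static int allocate_blocks(struct inode_stub *inode, int n)
    {
            printf("allocating %d block(s), charge free count: %s\n",
                   n, inode->delalloc_reserved ? "no (prepaid)" : "yes");
            return n;
    }

    static int get_blocks_for_writeback(struct inode_stub *inode, int nblocks)
    {
            int got;

            /* down_write(&i_data_sem) in the real code */
            inode->delalloc_reserved = 1;
            got = allocate_blocks(inode, nblocks);
            inode->delalloc_reserved = 0;
            if (got > 0)
                    inode->reserved_blocks -= got;  /* ext4_da_release_space() */
            /* up_write(&i_data_sem) */
            return got;
    }

    int main(void)
    {
            struct inode_stub ino = { .reserved_blocks = 8 };

            get_blocks_for_writeback(&ino, 8);
            printf("reservation left: %d\n", ino.reserved_blocks);
            return 0;
    }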
@@ -1021,7 +1097,7 @@ static int ext4_get_block(struct inode *inode, sector_t iblock, | |||
1021 | } | 1097 | } |
1022 | 1098 | ||
1023 | ret = ext4_get_blocks_wrap(handle, inode, iblock, | 1099 | ret = ext4_get_blocks_wrap(handle, inode, iblock, |
1024 | max_blocks, bh_result, create, 0); | 1100 | max_blocks, bh_result, create, 0, 0); |
1025 | if (ret > 0) { | 1101 | if (ret > 0) { |
1026 | bh_result->b_size = (ret << inode->i_blkbits); | 1102 | bh_result->b_size = (ret << inode->i_blkbits); |
1027 | ret = 0; | 1103 | ret = 0; |
@@ -1047,7 +1123,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, | |||
1047 | dummy.b_blocknr = -1000; | 1123 | dummy.b_blocknr = -1000; |
1048 | buffer_trace_init(&dummy.b_history); | 1124 | buffer_trace_init(&dummy.b_history); |
1049 | err = ext4_get_blocks_wrap(handle, inode, block, 1, | 1125 | err = ext4_get_blocks_wrap(handle, inode, block, 1, |
1050 | &dummy, create, 1); | 1126 | &dummy, create, 1, 0); |
1051 | /* | 1127 | /* |
1052 | * ext4_get_blocks_handle() returns number of blocks | 1128 | * ext4_get_blocks_handle() returns number of blocks |
1053 | * mapped. 0 in case of a HOLE. | 1129 | * mapped. 0 in case of a HOLE. |
@@ -1203,19 +1279,20 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, | |||
1203 | to = from + len; | 1279 | to = from + len; |
1204 | 1280 | ||
1205 | retry: | 1281 | retry: |
1206 | page = __grab_cache_page(mapping, index); | ||
1207 | if (!page) | ||
1208 | return -ENOMEM; | ||
1209 | *pagep = page; | ||
1210 | |||
1211 | handle = ext4_journal_start(inode, needed_blocks); | 1282 | handle = ext4_journal_start(inode, needed_blocks); |
1212 | if (IS_ERR(handle)) { | 1283 | if (IS_ERR(handle)) { |
1213 | unlock_page(page); | ||
1214 | page_cache_release(page); | ||
1215 | ret = PTR_ERR(handle); | 1284 | ret = PTR_ERR(handle); |
1216 | goto out; | 1285 | goto out; |
1217 | } | 1286 | } |
1218 | 1287 | ||
1288 | page = __grab_cache_page(mapping, index); | ||
1289 | if (!page) { | ||
1290 | ext4_journal_stop(handle); | ||
1291 | ret = -ENOMEM; | ||
1292 | goto out; | ||
1293 | } | ||
1294 | *pagep = page; | ||
1295 | |||
1219 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | 1296 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, |
1220 | ext4_get_block); | 1297 | ext4_get_block); |
1221 | 1298 | ||
@@ -1225,8 +1302,8 @@ retry: | |||
1225 | } | 1302 | } |
1226 | 1303 | ||
1227 | if (ret) { | 1304 | if (ret) { |
1228 | ext4_journal_stop(handle); | ||
1229 | unlock_page(page); | 1305 | unlock_page(page); |
1306 | ext4_journal_stop(handle); | ||
1230 | page_cache_release(page); | 1307 | page_cache_release(page); |
1231 | } | 1308 | } |
1232 | 1309 | ||
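The reordering here is the page_lock/transaction inversion fix named in the merge summary: with delayed allocation, writeback can start a transaction while holding page locks, so every path must take the journal handle first and the page lock second, and release in reverse order, or two tasks can deadlock against each other. Schematically, as a stand-alone sketch:

    #include <stdio.h>

    /* Pure ordering sketch: every path must acquire in the same order
     * (journal handle first, then page lock) and release in reverse. */
    static void journal_start(void) { puts("start handle"); }
    static void journal_stop(void)  { puts("stop handle"); }
    static void lock_page(void)     { puts("lock page"); }
    static void unlock_page(void)   { puts("unlock page"); }

    static void write_begin_sketch(void)
    {
            journal_start();   /* 1: transaction first ... */
            lock_page();       /* 2: ... page lock second  */
    }

    static void write_fail_sketch(void)
    {
            unlock_page();     /* release in reverse order */
            journal_stop();
    }

    int main(void)
    {
            write_begin_sketch();
            write_fail_sketch();
            return 0;
    }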
@@ -1236,15 +1313,6 @@ out: | |||
1236 | return ret; | 1313 | return ret; |
1237 | } | 1314 | } |
1238 | 1315 | ||
1239 | int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
1240 | { | ||
1241 | int err = jbd2_journal_dirty_data(handle, bh); | ||
1242 | if (err) | ||
1243 | ext4_journal_abort_handle(__func__, __func__, | ||
1244 | bh, handle, err); | ||
1245 | return err; | ||
1246 | } | ||
1247 | |||
1248 | /* For write_end() in data=journal mode */ | 1316 | /* For write_end() in data=journal mode */ |
1249 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) | 1317 | static int write_end_fn(handle_t *handle, struct buffer_head *bh) |
1250 | { | 1318 | { |
@@ -1255,29 +1323,6 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) | |||
1255 | } | 1323 | } |
1256 | 1324 | ||
1257 | /* | 1325 | /* |
1258 | * Generic write_end handler for ordered and writeback ext4 journal modes. | ||
1259 | * We can't use generic_write_end, because that unlocks the page and we need to | ||
1260 | * unlock the page after ext4_journal_stop, but ext4_journal_stop must run | ||
1261 | * after block_write_end. | ||
1262 | */ | ||
1263 | static int ext4_generic_write_end(struct file *file, | ||
1264 | struct address_space *mapping, | ||
1265 | loff_t pos, unsigned len, unsigned copied, | ||
1266 | struct page *page, void *fsdata) | ||
1267 | { | ||
1268 | struct inode *inode = file->f_mapping->host; | ||
1269 | |||
1270 | copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); | ||
1271 | |||
1272 | if (pos+copied > inode->i_size) { | ||
1273 | i_size_write(inode, pos+copied); | ||
1274 | mark_inode_dirty(inode); | ||
1275 | } | ||
1276 | |||
1277 | return copied; | ||
1278 | } | ||
1279 | |||
1280 | /* | ||
1281 | * We need to pick up the new inode size which generic_commit_write gave us | 1326 | * We need to pick up the new inode size which generic_commit_write gave us |
1282 | * `file' can be NULL - eg, when called from page_symlink(). | 1327 | * `file' can be NULL - eg, when called from page_symlink(). |
1283 | * | 1328 | * |
@@ -1290,15 +1335,14 @@ static int ext4_ordered_write_end(struct file *file, | |||
1290 | struct page *page, void *fsdata) | 1335 | struct page *page, void *fsdata) |
1291 | { | 1336 | { |
1292 | handle_t *handle = ext4_journal_current_handle(); | 1337 | handle_t *handle = ext4_journal_current_handle(); |
1293 | struct inode *inode = file->f_mapping->host; | 1338 | struct inode *inode = mapping->host; |
1294 | unsigned from, to; | 1339 | unsigned from, to; |
1295 | int ret = 0, ret2; | 1340 | int ret = 0, ret2; |
1296 | 1341 | ||
1297 | from = pos & (PAGE_CACHE_SIZE - 1); | 1342 | from = pos & (PAGE_CACHE_SIZE - 1); |
1298 | to = from + len; | 1343 | to = from + len; |
1299 | 1344 | ||
1300 | ret = walk_page_buffers(handle, page_buffers(page), | 1345 | ret = ext4_jbd2_file_inode(handle, inode); |
1301 | from, to, NULL, ext4_journal_dirty_data); | ||
1302 | 1346 | ||
1303 | if (ret == 0) { | 1347 | if (ret == 0) { |
1304 | /* | 1348 | /* |
@@ -1311,7 +1355,7 @@ static int ext4_ordered_write_end(struct file *file, | |||
1311 | new_i_size = pos + copied; | 1355 | new_i_size = pos + copied; |
1312 | if (new_i_size > EXT4_I(inode)->i_disksize) | 1356 | if (new_i_size > EXT4_I(inode)->i_disksize) |
1313 | EXT4_I(inode)->i_disksize = new_i_size; | 1357 | EXT4_I(inode)->i_disksize = new_i_size; |
1314 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1358 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
1315 | page, fsdata); | 1359 | page, fsdata); |
1316 | copied = ret2; | 1360 | copied = ret2; |
1317 | if (ret2 < 0) | 1361 | if (ret2 < 0) |
@@ -1320,8 +1364,6 @@ static int ext4_ordered_write_end(struct file *file, | |||
1320 | ret2 = ext4_journal_stop(handle); | 1364 | ret2 = ext4_journal_stop(handle); |
1321 | if (!ret) | 1365 | if (!ret) |
1322 | ret = ret2; | 1366 | ret = ret2; |
1323 | unlock_page(page); | ||
1324 | page_cache_release(page); | ||
1325 | 1367 | ||
1326 | return ret ? ret : copied; | 1368 | return ret ? ret : copied; |
1327 | } | 1369 | } |
@@ -1332,7 +1374,7 @@ static int ext4_writeback_write_end(struct file *file, | |||
1332 | struct page *page, void *fsdata) | 1374 | struct page *page, void *fsdata) |
1333 | { | 1375 | { |
1334 | handle_t *handle = ext4_journal_current_handle(); | 1376 | handle_t *handle = ext4_journal_current_handle(); |
1335 | struct inode *inode = file->f_mapping->host; | 1377 | struct inode *inode = mapping->host; |
1336 | int ret = 0, ret2; | 1378 | int ret = 0, ret2; |
1337 | loff_t new_i_size; | 1379 | loff_t new_i_size; |
1338 | 1380 | ||
@@ -1340,7 +1382,7 @@ static int ext4_writeback_write_end(struct file *file, | |||
1340 | if (new_i_size > EXT4_I(inode)->i_disksize) | 1382 | if (new_i_size > EXT4_I(inode)->i_disksize) |
1341 | EXT4_I(inode)->i_disksize = new_i_size; | 1383 | EXT4_I(inode)->i_disksize = new_i_size; |
1342 | 1384 | ||
1343 | ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, | 1385 | ret2 = generic_write_end(file, mapping, pos, len, copied, |
1344 | page, fsdata); | 1386 | page, fsdata); |
1345 | copied = ret2; | 1387 | copied = ret2; |
1346 | if (ret2 < 0) | 1388 | if (ret2 < 0) |
@@ -1349,8 +1391,6 @@ static int ext4_writeback_write_end(struct file *file, | |||
1349 | ret2 = ext4_journal_stop(handle); | 1391 | ret2 = ext4_journal_stop(handle); |
1350 | if (!ret) | 1392 | if (!ret) |
1351 | ret = ret2; | 1393 | ret = ret2; |
1352 | unlock_page(page); | ||
1353 | page_cache_release(page); | ||
1354 | 1394 | ||
1355 | return ret ? ret : copied; | 1395 | return ret ? ret : copied; |
1356 | } | 1396 | } |
@@ -1389,14 +1429,965 @@ static int ext4_journalled_write_end(struct file *file, | |||
1389 | ret = ret2; | 1429 | ret = ret2; |
1390 | } | 1430 | } |
1391 | 1431 | ||
1432 | unlock_page(page); | ||
1392 | ret2 = ext4_journal_stop(handle); | 1433 | ret2 = ext4_journal_stop(handle); |
1393 | if (!ret) | 1434 | if (!ret) |
1394 | ret = ret2; | 1435 | ret = ret2; |
1395 | unlock_page(page); | ||
1396 | page_cache_release(page); | 1436 | page_cache_release(page); |
1397 | 1437 | ||
1398 | return ret ? ret : copied; | 1438 | return ret ? ret : copied; |
1399 | } | 1439 | } |
1440 | /* | ||
1441 | * Calculate the number of metadata blocks needed to reserve | ||
1442 | * to allocate @blocks for a non-extent-based file | ||
1443 | */ | ||
1444 | static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) | ||
1445 | { | ||
1446 | int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); | ||
1447 | int ind_blks, dind_blks, tind_blks; | ||
1448 | |||
1449 | /* number of new indirect blocks needed */ | ||
1450 | ind_blks = (blocks + icap - 1) / icap; | ||
1451 | |||
1452 | dind_blks = (ind_blks + icap - 1) / icap; | ||
1453 | |||
1454 | tind_blks = 1; | ||
1455 | |||
1456 | return ind_blks + dind_blks + tind_blks; | ||
1457 | } | ||
1458 | |||
1459 | /* | ||
1460 | * Calculate the number of metadata blocks needed to reserve | ||
1461 | * to allocate a given number of blocks | ||
1462 | */ | ||
1463 | static int ext4_calc_metadata_amount(struct inode *inode, int blocks) | ||
1464 | { | ||
1465 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) | ||
1466 | return ext4_ext_calc_metadata_amount(inode, blocks); | ||
1467 | |||
1468 | return ext4_indirect_calc_metadata_amount(inode, blocks); | ||
1469 | } | ||
1470 | |||
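For indirect-mapped files the worst case is the classic three-level bound: one single-indirect block per EXT4_ADDR_PER_BLOCK data blocks, the same ratio again for double-indirect, plus one triple-indirect block. A worked user-space check; icap = 1024 assumes a 4 KiB block size:

    #include <stdio.h>

    #define ICAP 1024  /* EXT4_ADDR_PER_BLOCK for 4 KiB blocks (assumed) */

    /* Mirrors ext4_indirect_calc_metadata_amount(): worst-case indirect,
     * double-indirect and triple-indirect blocks for @blocks new blocks. */
    static int indirect_calc_metadata_amount(int blocks)
    {
            int ind_blks  = (blocks + ICAP - 1) / ICAP;    /* 1st level */
            int dind_blks = (ind_blks + ICAP - 1) / ICAP;  /* 2nd level */
            int tind_blks = 1;                             /* 3rd level */

            return ind_blks + dind_blks + tind_blks;
    }

    int main(void)
    {
            /* 100000 blocks -> 98 + 1 + 1 = 100 metadata blocks */
            printf("%d\n", indirect_calc_metadata_amount(100000));
            return 0;
    }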
1471 | static int ext4_da_reserve_space(struct inode *inode, int nrblocks) | ||
1472 | { | ||
1473 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1474 | unsigned long md_needed, mdblocks, total = 0; | ||
1475 | |||
1476 | /* | ||
1477 | * recalculate the number of metadata blocks to reserve | ||
1478 | * in order to allocate nrblocks | ||
1479 | * worst case is one extent per block | ||
1480 | */ | ||
1481 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1482 | total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; | ||
1483 | mdblocks = ext4_calc_metadata_amount(inode, total); | ||
1484 | BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); | ||
1485 | |||
1486 | md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; | ||
1487 | total = md_needed + nrblocks; | ||
1488 | |||
1489 | if (ext4_has_free_blocks(sbi, total) < total) { | ||
1490 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1491 | return -ENOSPC; | ||
1492 | } | ||
1493 | |||
1494 | /* reduce fs free blocks counter */ | ||
1495 | percpu_counter_sub(&sbi->s_freeblocks_counter, total); | ||
1496 | |||
1497 | EXT4_I(inode)->i_reserved_data_blocks += nrblocks; | ||
1498 | EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; | ||
1499 | |||
1500 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1501 | return 0; /* success */ | ||
1502 | } | ||
1503 | |||
1504 | void ext4_da_release_space(struct inode *inode, int used, int to_free) | ||
1505 | { | ||
1506 | struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); | ||
1507 | int total, mdb, mdb_free, release; | ||
1508 | |||
1509 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1510 | /* recalculate the number of metablocks still need to be reserved */ | ||
1511 | total = EXT4_I(inode)->i_reserved_data_blocks - used - to_free; | ||
1512 | mdb = ext4_calc_metadata_amount(inode, total); | ||
1513 | |||
1514 | /* figure out how many metablocks to release */ | ||
1515 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
1516 | mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; | ||
1517 | |||
1518 | /* Account for allocated meta_blocks */ | ||
1519 | mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; | ||
1520 | |||
1521 | release = to_free + mdb_free; | ||
1522 | |||
1523 | /* update fs free blocks counter for truncate case */ | ||
1524 | percpu_counter_add(&sbi->s_freeblocks_counter, release); | ||
1525 | |||
1526 | /* update per-inode reservations */ | ||
1527 | BUG_ON(used + to_free > EXT4_I(inode)->i_reserved_data_blocks); | ||
1528 | EXT4_I(inode)->i_reserved_data_blocks -= (used + to_free); | ||
1529 | |||
1530 | BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); | ||
1531 | EXT4_I(inode)->i_reserved_meta_blocks = mdb; | ||
1532 | EXT4_I(inode)->i_allocated_meta_blocks = 0; | ||
1533 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
1534 | } | ||
1535 | |||
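Reserve and release are two halves of the same ledger: reserving charges the new data blocks plus the increase in worst-case metadata for the enlarged reservation, while releasing recomputes the metadata still needed for what remains and returns the surplus. A simplified user-space walk-through; the metadata estimate is a made-up linear stand-in and the i_allocated_meta_blocks correction is omitted for brevity:

    #include <stdio.h>

    /* toy per-inode reservation state, standing in for i_reserved_* */
    static int reserved_data, reserved_meta;

    static int calc_metadata(int blocks)
    {
            return (blocks + 99) / 100;  /* assumed stand-in estimate */
    }

    /* Mirrors ext4_da_reserve_space(): charge nrblocks plus the
     * *increase* in worst-case metadata for the enlarged reservation. */
    static int reserve(int nrblocks)
    {
            int total = reserved_data + nrblocks;
            int mdblocks = calc_metadata(total);
            int md_needed = mdblocks - reserved_meta;

            reserved_data += nrblocks;
            reserved_meta = mdblocks;
            return md_needed + nrblocks;  /* taken from the free count */
    }

    /* Mirrors ext4_da_release_space(): recompute the metadata still
     * needed for what remains reserved and give the surplus back. */
    static int release(int used, int to_free)
    {
            int total = reserved_data - used - to_free;
            int mdb = calc_metadata(total);
            int mdb_free = reserved_meta - mdb;

            reserved_data -= used + to_free;
            reserved_meta = mdb;
            return to_free + mdb_free;  /* returned to the free count */
    }

    int main(void)
    {
            printf("reserve(300) takes %d blocks\n", reserve(300));    /* 303 */
            printf("release(100, 50) frees %d blocks\n", release(100, 50));
            printf("left: %d data + %d meta reserved\n",
                   reserved_data, reserved_meta);
            return 0;
    }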
1536 | static void ext4_da_page_release_reservation(struct page *page, | ||
1537 | unsigned long offset) | ||
1538 | { | ||
1539 | int to_release = 0; | ||
1540 | struct buffer_head *head, *bh; | ||
1541 | unsigned int curr_off = 0; | ||
1542 | |||
1543 | head = page_buffers(page); | ||
1544 | bh = head; | ||
1545 | do { | ||
1546 | unsigned int next_off = curr_off + bh->b_size; | ||
1547 | |||
1548 | if ((offset <= curr_off) && (buffer_delay(bh))) { | ||
1549 | to_release++; | ||
1550 | clear_buffer_delay(bh); | ||
1551 | } | ||
1552 | curr_off = next_off; | ||
1553 | } while ((bh = bh->b_this_page) != head); | ||
1554 | ext4_da_release_space(page->mapping->host, 0, to_release); | ||
1555 | } | ||
1556 | |||
1557 | /* | ||
1558 | * Delayed allocation stuff | ||
1559 | */ | ||
1560 | |||
1561 | struct mpage_da_data { | ||
1562 | struct inode *inode; | ||
1563 | struct buffer_head lbh; /* extent of blocks */ | ||
1564 | unsigned long first_page, next_page; /* extent of pages */ | ||
1565 | get_block_t *get_block; | ||
1566 | struct writeback_control *wbc; | ||
1567 | }; | ||
1568 | |||
1569 | /* | ||
1570 | * mpage_da_submit_io - walks through the extent of pages and tries to write | ||
1571 | * them with __mpage_writepage() | ||
1572 | * | ||
1573 | * @mpd->inode: inode | ||
1574 | * @mpd->first_page: first page of the extent | ||
1575 | * @mpd->next_page: page after the last page of the extent | ||
1576 | * @mpd->get_block: the filesystem's block mapper function | ||
1577 | * | ||
1578 | * By the time mpage_da_submit_io() is called we expect all blocks | ||
1579 | * to be allocated. This may be wrong if allocation failed. | ||
1580 | * | ||
1581 | * As pages are already locked by write_cache_pages(), we can't use it | ||
1582 | */ | ||
1583 | static int mpage_da_submit_io(struct mpage_da_data *mpd) | ||
1584 | { | ||
1585 | struct address_space *mapping = mpd->inode->i_mapping; | ||
1586 | struct mpage_data mpd_pp = { | ||
1587 | .bio = NULL, | ||
1588 | .last_block_in_bio = 0, | ||
1589 | .get_block = mpd->get_block, | ||
1590 | .use_writepage = 1, | ||
1591 | }; | ||
1592 | int ret = 0, err, nr_pages, i; | ||
1593 | unsigned long index, end; | ||
1594 | struct pagevec pvec; | ||
1595 | |||
1596 | BUG_ON(mpd->next_page <= mpd->first_page); | ||
1597 | |||
1598 | pagevec_init(&pvec, 0); | ||
1599 | index = mpd->first_page; | ||
1600 | end = mpd->next_page - 1; | ||
1601 | |||
1602 | while (index <= end) { | ||
1603 | /* XXX: optimize tail */ | ||
1604 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
1605 | if (nr_pages == 0) | ||
1606 | break; | ||
1607 | for (i = 0; i < nr_pages; i++) { | ||
1608 | struct page *page = pvec.pages[i]; | ||
1609 | |||
1610 | index = page->index; | ||
1611 | if (index > end) | ||
1612 | break; | ||
1613 | index++; | ||
1614 | |||
1615 | err = __mpage_writepage(page, mpd->wbc, &mpd_pp); | ||
1616 | |||
1617 | /* | ||
1618 | * In error case, we have to continue because | ||
1619 | * remaining pages are still locked | ||
1620 | * XXX: unlock and re-dirty them? | ||
1621 | */ | ||
1622 | if (ret == 0) | ||
1623 | ret = err; | ||
1624 | } | ||
1625 | pagevec_release(&pvec); | ||
1626 | } | ||
1627 | if (mpd_pp.bio) | ||
1628 | mpage_bio_submit(WRITE, mpd_pp.bio); | ||
1629 | |||
1630 | return ret; | ||
1631 | } | ||
1632 | |||
1633 | /* | ||
1634 | * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers | ||
1635 | * | ||
1636 | * @mpd->inode - inode to walk through | ||
1637 | * @exbh->b_blocknr - first block on a disk | ||
1638 | * @exbh->b_size - amount of space in bytes | ||
1639 | * @logical - first logical block to start assignment with | ||
1640 | * | ||
1641 | * the function goes through all the passed space and puts actual disk | ||
1642 | * block numbers into buffer heads, dropping BH_Delay | ||
1643 | */ | ||
1644 | static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, | ||
1645 | struct buffer_head *exbh) | ||
1646 | { | ||
1647 | struct inode *inode = mpd->inode; | ||
1648 | struct address_space *mapping = inode->i_mapping; | ||
1649 | int blocks = exbh->b_size >> inode->i_blkbits; | ||
1650 | sector_t pblock = exbh->b_blocknr, cur_logical; | ||
1651 | struct buffer_head *head, *bh; | ||
1652 | unsigned long index, end; | ||
1653 | struct pagevec pvec; | ||
1654 | int nr_pages, i; | ||
1655 | |||
1656 | index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1657 | end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1658 | cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1659 | |||
1660 | pagevec_init(&pvec, 0); | ||
1661 | |||
1662 | while (index <= end) { | ||
1663 | /* XXX: optimize tail */ | ||
1664 | nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); | ||
1665 | if (nr_pages == 0) | ||
1666 | break; | ||
1667 | for (i = 0; i < nr_pages; i++) { | ||
1668 | struct page *page = pvec.pages[i]; | ||
1669 | |||
1670 | index = page->index; | ||
1671 | if (index > end) | ||
1672 | break; | ||
1673 | index++; | ||
1674 | |||
1675 | BUG_ON(!PageLocked(page)); | ||
1676 | BUG_ON(PageWriteback(page)); | ||
1677 | BUG_ON(!page_has_buffers(page)); | ||
1678 | |||
1679 | bh = page_buffers(page); | ||
1680 | head = bh; | ||
1681 | |||
1682 | /* skip blocks out of the range */ | ||
1683 | do { | ||
1684 | if (cur_logical >= logical) | ||
1685 | break; | ||
1686 | cur_logical++; | ||
1687 | } while ((bh = bh->b_this_page) != head); | ||
1688 | |||
1689 | do { | ||
1690 | if (cur_logical >= logical + blocks) | ||
1691 | break; | ||
1692 | if (buffer_delay(bh)) { | ||
1693 | bh->b_blocknr = pblock; | ||
1694 | clear_buffer_delay(bh); | ||
1695 | } else if (buffer_mapped(bh)) | ||
1696 | BUG_ON(bh->b_blocknr != pblock); | ||
1697 | |||
1698 | cur_logical++; | ||
1699 | pblock++; | ||
1700 | } while ((bh = bh->b_this_page) != head); | ||
1701 | } | ||
1702 | pagevec_release(&pvec); | ||
1703 | } | ||
1704 | } | ||
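Buffer heads on a page are linked into a ring through b_this_page, which is why both loops above use a do/while that stops when the walk returns to the head. A hedged userspace sketch of the inner assignment loop (the struct and flag are illustrative, not the kernel's buffer_head):

    /* Minimal ring of per-page buffers. */
    struct buf {
            unsigned long blocknr;
            int delay;                 /* stands in for buffer_delay() */
            struct buf *b_this_page;   /* next buffer in the ring */
    };

    /* Assign physical blocks starting at pblock to the delayed buffers,
     * mirroring the second do/while in mpage_put_bnr_to_bhs(). */
    static void assign_blocks(struct buf *head, unsigned long pblock)
    {
            struct buf *bh = head;

            do {
                    if (bh->delay) {
                            bh->blocknr = pblock;  /* real block number */
                            bh->delay = 0;         /* clear_buffer_delay() */
                    }
                    pblock++;
            } while ((bh = bh->b_this_page) != head);
    }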
1705 | |||
1706 | |||
1707 | /* | ||
1708 | * __unmap_underlying_blocks - just a helper function to unmap | ||
1709 | * a set of blocks described by @bh | ||
1710 | */ | ||
1711 | static inline void __unmap_underlying_blocks(struct inode *inode, | ||
1712 | struct buffer_head *bh) | ||
1713 | { | ||
1714 | struct block_device *bdev = inode->i_sb->s_bdev; | ||
1715 | int blocks, i; | ||
1716 | |||
1717 | blocks = bh->b_size >> inode->i_blkbits; | ||
1718 | for (i = 0; i < blocks; i++) | ||
1719 | unmap_underlying_metadata(bdev, bh->b_blocknr + i); | ||
1720 | } | ||
1721 | |||
1722 | /* | ||
1723 | * mpage_da_map_blocks - go through given space | ||
1724 | * | ||
1725 | * @mpd->lbh - bh describing space | ||
1726 | * @mpd->get_block - the filesystem's block mapper function | ||
1727 | * | ||
1728 | * The function skips space we know is already mapped to disk blocks. | ||
1729 | * | ||
1730 | * The function ignores errors ->get_block() returns, thus real | ||
1731 | * error handling is postponed to __mpage_writepage() | ||
1732 | */ | ||
1733 | static void mpage_da_map_blocks(struct mpage_da_data *mpd) | ||
1734 | { | ||
1735 | struct buffer_head *lbh = &mpd->lbh; | ||
1736 | int err = 0, remain = lbh->b_size; | ||
1737 | sector_t next = lbh->b_blocknr; | ||
1738 | struct buffer_head new; | ||
1739 | |||
1740 | /* | ||
1741 | * We consider only non-mapped and non-allocated blocks | ||
1742 | */ | ||
1743 | if (buffer_mapped(lbh) && !buffer_delay(lbh)) | ||
1744 | return; | ||
1745 | |||
1746 | while (remain) { | ||
1747 | new.b_state = lbh->b_state; | ||
1748 | new.b_blocknr = 0; | ||
1749 | new.b_size = remain; | ||
1750 | err = mpd->get_block(mpd->inode, next, &new, 1); | ||
1751 | if (err) { | ||
1752 | /* | ||
1753 | * Rather than implement own error handling | ||
1754 | * here, we just leave remaining blocks | ||
1755 | * unallocated and try again with ->writepage() | ||
1756 | */ | ||
1757 | break; | ||
1758 | } | ||
1759 | BUG_ON(new.b_size == 0); | ||
1760 | |||
1761 | if (buffer_new(&new)) | ||
1762 | __unmap_underlying_blocks(mpd->inode, &new); | ||
1763 | |||
1764 | /* | ||
1765 | * If blocks are marked delayed, we need to | ||
1766 | * put the actual blocknr and drop the delayed bit | ||
1767 | */ | ||
1768 | if (buffer_delay(lbh)) | ||
1769 | mpage_put_bnr_to_bhs(mpd, next, &new); | ||
1770 | |||
1771 | /* go for the remaining blocks */ | ||
1772 | next += new.b_size >> mpd->inode->i_blkbits; | ||
1773 | remain -= new.b_size; | ||
1774 | } | ||
1775 | } | ||
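The loop above relies on the get_block contract: the caller passes in b_size as the amount it wants mapped, the callee may map less and shrinks b_size to what it actually covered, and the caller advances and retries. A minimal sketch of that chunking protocol, with map_chunk as an illustrative stand-in for mpd->get_block():

    #include <stddef.h>

    /* Map up to *size bytes at logical block 'next'; on success shrink
     * *size to what was actually mapped (here: at most 4 blocks). */
    static int map_chunk(unsigned long next, size_t *size, size_t blksize)
    {
            size_t max = 4 * blksize;

            (void)next;
            if (*size > max)
                    *size = max;
            return 0;                       /* 0 == success */
    }

    static int map_range(unsigned long next, size_t remain, size_t blksize)
    {
            while (remain) {
                    size_t size = remain;   /* ask for everything left */

                    if (map_chunk(next, &size, blksize))
                            break;          /* leave the rest unallocated */
                    next += size / blksize; /* go for the remaining blocks */
                    remain -= size;
            }
            return remain ? -1 : 0;
    }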
1776 | |||
1777 | #define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) | ||
1778 | |||
1779 | /* | ||
1780 | * mpage_add_bh_to_extent - try to add one more block to extent of blocks | ||
1781 | * | ||
1782 | * @mpd->lbh - extent of blocks | ||
1783 | * @logical - logical number of the block in the file | ||
1784 | * @bh - bh of the block (used to access block's state) | ||
1785 | * | ||
1786 | * the function is used to collect contiguous blocks in the same state | ||
1787 | */ | ||
1788 | static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, | ||
1789 | sector_t logical, struct buffer_head *bh) | ||
1790 | { | ||
1791 | struct buffer_head *lbh = &mpd->lbh; | ||
1792 | sector_t next; | ||
1793 | |||
1794 | next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); | ||
1795 | |||
1796 | /* | ||
1797 | * First block in the extent | ||
1798 | */ | ||
1799 | if (lbh->b_size == 0) { | ||
1800 | lbh->b_blocknr = logical; | ||
1801 | lbh->b_size = bh->b_size; | ||
1802 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
1803 | return; | ||
1804 | } | ||
1805 | |||
1806 | /* | ||
1807 | * Can we merge the block to our big extent? | ||
1808 | */ | ||
1809 | if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { | ||
1810 | lbh->b_size += bh->b_size; | ||
1811 | return; | ||
1812 | } | ||
1813 | |||
1814 | /* | ||
1815 | * We couldn't merge the block to our extent, so we | ||
1816 | * need to flush the current extent and start a new one | ||
1817 | */ | ||
1818 | mpage_da_map_blocks(mpd); | ||
1819 | |||
1820 | /* | ||
1821 | * Now start a new extent | ||
1822 | */ | ||
1823 | lbh->b_size = bh->b_size; | ||
1824 | lbh->b_state = bh->b_state & BH_FLAGS; | ||
1825 | lbh->b_blocknr = logical; | ||
1826 | } | ||
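The function above is a small run-length accumulator: a block joins the current extent only if it is logically contiguous and carries the same state bits; anything else flushes the extent and starts a new one. The same state machine in isolation (types illustrative):

    struct extent {
            unsigned long start;    /* first logical block */
            unsigned long nblocks;  /* 0 means the extent is empty */
            unsigned int state;
    };

    static void add_block(struct extent *ex, unsigned long logical,
                          unsigned int state,
                          void (*flush)(struct extent *))
    {
            if (ex->nblocks == 0) {                 /* first block */
                    ex->start = logical;
                    ex->nblocks = 1;
                    ex->state = state;
                    return;
            }
            if (logical == ex->start + ex->nblocks && state == ex->state) {
                    ex->nblocks++;                  /* merge */
                    return;
            }
            flush(ex);                              /* can't merge: flush */
            ex->start = logical;                    /* ... and restart */
            ex->nblocks = 1;
            ex->state = state;
    }

Masking the state with BH_FLAGS before comparing, as the kernel code does, keeps unrelated buffer bits from needlessly breaking extents apart.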
1827 | |||
1828 | /* | ||
1829 | * __mpage_da_writepage - finds extent of pages and blocks | ||
1830 | * | ||
1831 | * @page: page to consider | ||
1832 | * @wbc: not used, we just follow rules | ||
1833 | * @data: context | ||
1834 | * | ||
1835 | * The function finds extents of pages and scans them for all blocks. | ||
1836 | */ | ||
1837 | static int __mpage_da_writepage(struct page *page, | ||
1838 | struct writeback_control *wbc, void *data) | ||
1839 | { | ||
1840 | struct mpage_da_data *mpd = data; | ||
1841 | struct inode *inode = mpd->inode; | ||
1842 | struct buffer_head *bh, *head, fake; | ||
1843 | sector_t logical; | ||
1844 | |||
1845 | /* | ||
1846 | * Can we merge this page to current extent? | ||
1847 | */ | ||
1848 | if (mpd->next_page != page->index) { | ||
1849 | /* | ||
1850 | * Nope, we can't. So, we map non-allocated blocks | ||
1851 | * and start IO on them using __mpage_writepage() | ||
1852 | */ | ||
1853 | if (mpd->next_page != mpd->first_page) { | ||
1854 | mpage_da_map_blocks(mpd); | ||
1855 | mpage_da_submit_io(mpd); | ||
1856 | } | ||
1857 | |||
1858 | /* | ||
1859 | * Start next extent of pages ... | ||
1860 | */ | ||
1861 | mpd->first_page = page->index; | ||
1862 | |||
1863 | /* | ||
1864 | * ... and blocks | ||
1865 | */ | ||
1866 | mpd->lbh.b_size = 0; | ||
1867 | mpd->lbh.b_state = 0; | ||
1868 | mpd->lbh.b_blocknr = 0; | ||
1869 | } | ||
1870 | |||
1871 | mpd->next_page = page->index + 1; | ||
1872 | logical = (sector_t) page->index << | ||
1873 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | ||
1874 | |||
1875 | if (!page_has_buffers(page)) { | ||
1876 | /* | ||
1877 | * There are no attached buffer heads yet (mmap?), | ||
1878 | * so we treat the page as full of dirty blocks | ||
1879 | */ | ||
1880 | bh = &fake; | ||
1881 | bh->b_size = PAGE_CACHE_SIZE; | ||
1882 | bh->b_state = 0; | ||
1883 | set_buffer_dirty(bh); | ||
1884 | set_buffer_uptodate(bh); | ||
1885 | mpage_add_bh_to_extent(mpd, logical, bh); | ||
1886 | } else { | ||
1887 | /* | ||
1888 | * Page with regular buffer heads, just add all dirty ones | ||
1889 | */ | ||
1890 | head = page_buffers(page); | ||
1891 | bh = head; | ||
1892 | do { | ||
1893 | BUG_ON(buffer_locked(bh)); | ||
1894 | if (buffer_dirty(bh)) | ||
1895 | mpage_add_bh_to_extent(mpd, logical, bh); | ||
1896 | logical++; | ||
1897 | } while ((bh = bh->b_this_page) != head); | ||
1898 | } | ||
1899 | |||
1900 | return 0; | ||
1901 | } | ||
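The shift by (PAGE_CACHE_SHIFT - inode->i_blkbits) converts a page index into the logical block number of the page's first block. A worked example, assuming 4 KiB pages and 1 KiB filesystem blocks:

    #include <assert.h>

    int main(void)
    {
            unsigned page_shift = 12, blkbits = 10; /* 4K pages, 1K blocks */
            unsigned long index = 5;
            unsigned long logical = index << (page_shift - blkbits);

            assert(logical == 20);  /* page 5 covers blocks 20..23 */
            assert(((logical + 3) >> (page_shift - blkbits)) == index);
            return 0;
    }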
1902 | |||
1903 | /* | ||
1904 | * mpage_da_writepages - walk the list of dirty pages of the given | ||
1905 | * address space, allocate non-allocated blocks, map newly-allocated | ||
1906 | * blocks to existing bhs and issue IO on them | ||
1907 | * | ||
1908 | * @mapping: address space structure to write | ||
1909 | * @wbc: subtract the number of written pages from *@wbc->nr_to_write | ||
1910 | * @get_block: the filesystem's block mapper function. | ||
1911 | * | ||
1912 | * This is a library function, which implements the writepages() | ||
1913 | * address_space_operation. | ||
1914 | * | ||
1915 | * In order to avoid duplication of logic that deals with partial pages, | ||
1916 | * multiple bios per page, etc, we find non-allocated blocks, allocate | ||
1917 | * them with minimal calls to ->get_block() and re-use __mpage_writepage() | ||
1918 | * | ||
1919 | * It's important that we call __mpage_writepage() only once for each | ||
1920 | * involved page, otherwise we'd have to implement more complicated logic | ||
1921 | * to deal with pages w/o PG_lock or w/ PG_writeback and so on. | ||
1922 | * | ||
1923 | * See comments to mpage_writepages() | ||
1924 | */ | ||
1925 | static int mpage_da_writepages(struct address_space *mapping, | ||
1926 | struct writeback_control *wbc, | ||
1927 | get_block_t get_block) | ||
1928 | { | ||
1929 | struct mpage_da_data mpd; | ||
1930 | int ret; | ||
1931 | |||
1932 | if (!get_block) | ||
1933 | return generic_writepages(mapping, wbc); | ||
1934 | |||
1935 | mpd.wbc = wbc; | ||
1936 | mpd.inode = mapping->host; | ||
1937 | mpd.lbh.b_size = 0; | ||
1938 | mpd.lbh.b_state = 0; | ||
1939 | mpd.lbh.b_blocknr = 0; | ||
1940 | mpd.first_page = 0; | ||
1941 | mpd.next_page = 0; | ||
1942 | mpd.get_block = get_block; | ||
1943 | |||
1944 | ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); | ||
1945 | |||
1946 | /* | ||
1947 | * Handle last extent of pages | ||
1948 | */ | ||
1949 | if (mpd.next_page != mpd.first_page) { | ||
1950 | mpage_da_map_blocks(&mpd); | ||
1951 | mpage_da_submit_io(&mpd); | ||
1952 | } | ||
1953 | |||
1954 | return ret; | ||
1955 | } | ||
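As the comment says, this is written as a library-style helper: the filesystem supplies only its block mapper, exactly as ext4_da_writepages() does further down with ext4_da_get_block_write. A hedged sketch of the calling convention (myfs_writepages and myfs_get_block are illustrative names, not kernel symbols):

    static int myfs_get_block(struct inode *inode, sector_t iblock,
                              struct buffer_head *bh_result, int create);

    static int myfs_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
    {
            /* extent collection, allocation and IO submission are all
             * delegated to the helper */
            return mpage_da_writepages(mapping, wbc, myfs_get_block);
    }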
1956 | |||
1957 | /* | ||
1958 | * this is a special callback for ->write_begin() only | ||
1959 | * its intention is to return a mapped block or reserve space | ||
1960 | */ | ||
1961 | static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, | ||
1962 | struct buffer_head *bh_result, int create) | ||
1963 | { | ||
1964 | int ret = 0; | ||
1965 | |||
1966 | BUG_ON(create == 0); | ||
1967 | BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); | ||
1968 | |||
1969 | /* | ||
1970 | * first, we need to know whether the block is allocated already | ||
1971 | * preallocated blocks are unmapped but should be treated | ||
1972 | * the same as allocated blocks. | ||
1973 | */ | ||
1974 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); | ||
1975 | if ((ret == 0) && !buffer_delay(bh_result)) { | ||
1976 | /* the block isn't (pre)allocated yet, let's reserve space */ | ||
1977 | /* | ||
1978 | * XXX: __block_prepare_write() unmaps passed block, | ||
1979 | * is it OK? | ||
1980 | */ | ||
1981 | ret = ext4_da_reserve_space(inode, 1); | ||
1982 | if (ret) | ||
1983 | /* not enough space to reserve */ | ||
1984 | return ret; | ||
1985 | |||
1986 | map_bh(bh_result, inode->i_sb, 0); | ||
1987 | set_buffer_new(bh_result); | ||
1988 | set_buffer_delay(bh_result); | ||
1989 | } else if (ret > 0) { | ||
1990 | bh_result->b_size = (ret << inode->i_blkbits); | ||
1991 | ret = 0; | ||
1992 | } | ||
1993 | |||
1994 | return ret; | ||
1995 | } | ||
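A "delayed" buffer produced by this path is mapped to block 0 with BH_New and BH_Delay set, which is the marker later writeback code (mpage_put_bnr_to_bhs() above) looks for when it substitutes real block numbers. A userspace sketch of that state transition, using illustrative flag bits rather than the kernel's BH_* enum:

    #include <assert.h>

    #define F_MAPPED (1u << 0)
    #define F_NEW    (1u << 1)
    #define F_DELAY  (1u << 2)

    struct buf { unsigned state; unsigned long blocknr; };

    /* Reserve space instead of allocating: no real block yet. */
    static void mark_delayed(struct buf *bh)
    {
            bh->blocknr = 0;
            bh->state |= F_MAPPED | F_NEW | F_DELAY;
    }

    int main(void)
    {
            struct buf bh = { 0, ~0ul };

            mark_delayed(&bh);
            assert(bh.state & F_DELAY);  /* writeback will fix this up */
            return 0;
    }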
1996 | #define EXT4_DELALLOC_RSVED 1 | ||
1997 | static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, | ||
1998 | struct buffer_head *bh_result, int create) | ||
1999 | { | ||
2000 | int ret; | ||
2001 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
2002 | loff_t disksize = EXT4_I(inode)->i_disksize; | ||
2003 | handle_t *handle = NULL; | ||
2004 | |||
2005 | handle = ext4_journal_current_handle(); | ||
2006 | if (!handle) { | ||
2007 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | ||
2008 | bh_result, 0, 0, 0); | ||
2009 | BUG_ON(!ret); | ||
2010 | } else { | ||
2011 | ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, | ||
2012 | bh_result, create, 0, EXT4_DELALLOC_RSVED); | ||
2013 | } | ||
2014 | |||
2015 | if (ret > 0) { | ||
2016 | bh_result->b_size = (ret << inode->i_blkbits); | ||
2017 | |||
2018 | /* | ||
2019 | * Update on-disk size along with block allocation; | ||
2020 | * we don't use 'extend_disksize' as the size may change | ||
2021 | * within an already allocated block -bzzz | ||
2022 | */ | ||
2023 | disksize = ((loff_t) iblock + ret) << inode->i_blkbits; | ||
2024 | if (disksize > i_size_read(inode)) | ||
2025 | disksize = i_size_read(inode); | ||
2026 | if (disksize > EXT4_I(inode)->i_disksize) { | ||
2027 | /* | ||
2028 | * XXX: replace with spinlock if seen contended -bzzz | ||
2029 | */ | ||
2030 | down_write(&EXT4_I(inode)->i_data_sem); | ||
2031 | if (disksize > EXT4_I(inode)->i_disksize) | ||
2032 | EXT4_I(inode)->i_disksize = disksize; | ||
2033 | up_write(&EXT4_I(inode)->i_data_sem); | ||
2034 | |||
2035 | if (EXT4_I(inode)->i_disksize == disksize) { | ||
2036 | ret = ext4_mark_inode_dirty(handle, inode); | ||
2037 | return ret; | ||
2038 | } | ||
2039 | } | ||
2040 | ret = 0; | ||
2041 | } | ||
2042 | return ret; | ||
2043 | } | ||
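The i_disksize update above uses the classic check/lock/recheck idiom: test outside the semaphore to keep the common case cheap, then retest inside because another writer may have raced ahead. The same shape in portable C, with a pthread rwlock standing in for i_data_sem:

    #include <pthread.h>

    static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
    static long long disksize;

    static void maybe_grow(long long newsize)
    {
            if (newsize <= disksize)        /* cheap unlocked test */
                    return;
            pthread_rwlock_wrlock(&lock);
            if (newsize > disksize)         /* recheck under the lock */
                    disksize = newsize;
            pthread_rwlock_unlock(&lock);
    }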
2044 | |||
2045 | static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) | ||
2046 | { | ||
2047 | /* | ||
2048 | * An unmapped buffer is possible for holes. | ||
2049 | * A delayed buffer is possible with delayed allocation | ||
2050 | */ | ||
2051 | return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); | ||
2052 | } | ||
2053 | |||
2054 | static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, | ||
2055 | struct buffer_head *bh_result, int create) | ||
2056 | { | ||
2057 | int ret = 0; | ||
2058 | unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; | ||
2059 | |||
2060 | /* | ||
2061 | * we don't want to do block allocation in writepage | ||
2062 | * so call get_block_wrap with create = 0 | ||
2063 | */ | ||
2064 | ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, | ||
2065 | bh_result, 0, 0, 0); | ||
2066 | if (ret > 0) { | ||
2067 | bh_result->b_size = (ret << inode->i_blkbits); | ||
2068 | ret = 0; | ||
2069 | } | ||
2070 | return ret; | ||
2071 | } | ||
2072 | |||
2073 | /* | ||
2074 | * get called via ext4_da_writepages after taking page lock (have journal handle) | ||
2075 | * get called via journal_submit_inode_data_buffers (no journal handle) | ||
2076 | * get called via shrink_page_list via pdflush (no journal handle) | ||
2077 | * or grab_page_cache when doing write_begin (have journal handle) | ||
2078 | */ | ||
2079 | static int ext4_da_writepage(struct page *page, | ||
2080 | struct writeback_control *wbc) | ||
2081 | { | ||
2082 | int ret = 0; | ||
2083 | loff_t size; | ||
2084 | unsigned long len; | ||
2085 | struct buffer_head *page_bufs; | ||
2086 | struct inode *inode = page->mapping->host; | ||
2087 | |||
2088 | size = i_size_read(inode); | ||
2089 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
2090 | len = size & ~PAGE_CACHE_MASK; | ||
2091 | else | ||
2092 | len = PAGE_CACHE_SIZE; | ||
2093 | |||
2094 | if (page_has_buffers(page)) { | ||
2095 | page_bufs = page_buffers(page); | ||
2096 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
2097 | ext4_bh_unmapped_or_delay)) { | ||
2098 | /* | ||
2099 | * We don't want to do block allocation, | ||
2100 | * so redirty the page and return. | ||
2101 | * We may reach here when we do a journal commit | ||
2102 | * via journal_submit_inode_data_buffers. | ||
2103 | * If we don't have mapped blocks we just ignore | ||
2104 | * them. We can also reach here via shrink_page_list | ||
2105 | */ | ||
2106 | redirty_page_for_writepage(wbc, page); | ||
2107 | unlock_page(page); | ||
2108 | return 0; | ||
2109 | } | ||
2110 | } else { | ||
2111 | /* | ||
2112 | * The test for page_has_buffers() is subtle: | ||
2113 | * We know the page is dirty but it lost buffers. That means | ||
2114 | * that at some moment in time after write_begin()/write_end() | ||
2115 | * has been called all buffers have been clean and thus they | ||
2116 | * must have been written at least once. So they are all | ||
2117 | * mapped and we can happily proceed with mapping them | ||
2118 | * and writing the page. | ||
2119 | * | ||
2120 | * Try to initialize the buffer_heads and check whether | ||
2121 | * all are mapped and not delayed. We don't want to | ||
2122 | * do block allocation here. | ||
2123 | */ | ||
2124 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | ||
2125 | ext4_normal_get_block_write); | ||
2126 | if (!ret) { | ||
2127 | page_bufs = page_buffers(page); | ||
2128 | /* check whether all are mapped and not delayed */ | ||
2129 | if (walk_page_buffers(NULL, page_bufs, 0, len, NULL, | ||
2130 | ext4_bh_unmapped_or_delay)) { | ||
2131 | redirty_page_for_writepage(wbc, page); | ||
2132 | unlock_page(page); | ||
2133 | return 0; | ||
2134 | } | ||
2135 | } else { | ||
2136 | /* | ||
2137 | * We can't do block allocation here | ||
2138 | * so just redirty the page and unlock | ||
2139 | * and return | ||
2140 | */ | ||
2141 | redirty_page_for_writepage(wbc, page); | ||
2142 | unlock_page(page); | ||
2143 | return 0; | ||
2144 | } | ||
2145 | } | ||
2146 | |||
2147 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | ||
2148 | ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); | ||
2149 | else | ||
2150 | ret = block_write_full_page(page, | ||
2151 | ext4_normal_get_block_write, | ||
2152 | wbc); | ||
2153 | |||
2154 | return ret; | ||
2155 | } | ||
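The len computation at the top of ext4_da_writepage() recurs in several writepage variants below: only the page that contains EOF gets a partial length, every other page is written in full. A worked example with 4 KiB pages:

    #include <assert.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1ul << PAGE_SHIFT)
    #define PAGE_MASK  (~(PAGE_SIZE - 1))

    static unsigned long page_len(unsigned long index,
                                  unsigned long long size)
    {
            if (index == size >> PAGE_SHIFT)
                    return size & ~PAGE_MASK;  /* bytes up to EOF */
            return PAGE_SIZE;
    }

    int main(void)
    {
            /* file of 10000 bytes: EOF lands in page 2 */
            assert(page_len(0, 10000) == PAGE_SIZE);
            assert(page_len(2, 10000) == 10000 - 2 * PAGE_SIZE); /* 1808 */
            return 0;
    }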
2156 | |||
2157 | /* | ||
2158 | * For now just follow the DIO way to estimate the max credits | ||
2159 | * needed to write out EXT4_MAX_WRITEBACK_PAGES. | ||
2160 | * todo: need to calculate the max credits need for | ||
2161 | * extent based files, currently the DIO credits is based on | ||
2162 | * indirect-blocks mapping way. | ||
2163 | * | ||
2164 | * Probably should have a generic way to calculate credits | ||
2165 | * for DIO, writepages, and truncate | ||
2166 | */ | ||
2167 | #define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS | ||
2168 | #define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS | ||
2169 | |||
2170 | static int ext4_da_writepages(struct address_space *mapping, | ||
2171 | struct writeback_control *wbc) | ||
2172 | { | ||
2173 | struct inode *inode = mapping->host; | ||
2174 | handle_t *handle = NULL; | ||
2175 | int needed_blocks; | ||
2176 | int ret = 0; | ||
2177 | long to_write; | ||
2178 | loff_t range_start = 0; | ||
2179 | |||
2180 | /* | ||
2181 | * No pages to write? This is mainly a kludge to avoid starting | ||
2182 | * a transaction for special inodes like journal inode on last iput() | ||
2183 | * because that could violate lock ordering on umount | ||
2184 | */ | ||
2185 | if (!mapping->nrpages) | ||
2186 | return 0; | ||
2187 | |||
2188 | /* | ||
2189 | * Estimate the worst case credits needed to write out | ||
2190 | * EXT4_MAX_WRITEBACK_PAGES pages | ||
2191 | */ | ||
2192 | needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; | ||
2193 | |||
2194 | to_write = wbc->nr_to_write; | ||
2195 | if (!wbc->range_cyclic) { | ||
2196 | /* | ||
2197 | * If range_cyclic is not set force range_cont | ||
2198 | * and save the old writeback_index | ||
2199 | */ | ||
2200 | wbc->range_cont = 1; | ||
2201 | range_start = wbc->range_start; | ||
2202 | } | ||
2203 | |||
2204 | while (!ret && to_write) { | ||
2205 | /* start a new transaction*/ | ||
2206 | handle = ext4_journal_start(inode, needed_blocks); | ||
2207 | if (IS_ERR(handle)) { | ||
2208 | ret = PTR_ERR(handle); | ||
2209 | goto out_writepages; | ||
2210 | } | ||
2211 | if (ext4_should_order_data(inode)) { | ||
2212 | /* | ||
2213 | * With ordered mode we need to add | ||
2214 | * the inode to the journal handle | ||
2215 | * when we do block allocation. | ||
2216 | */ | ||
2217 | ret = ext4_jbd2_file_inode(handle, inode); | ||
2218 | if (ret) { | ||
2219 | ext4_journal_stop(handle); | ||
2220 | goto out_writepages; | ||
2221 | } | ||
2222 | |||
2223 | } | ||
2224 | /* | ||
2225 | * set the max dirty pages that can be written at a time | ||
2226 | * to fit into the reserved transaction credits | ||
2227 | */ | ||
2228 | if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) | ||
2229 | wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; | ||
2230 | |||
2231 | to_write -= wbc->nr_to_write; | ||
2232 | ret = mpage_da_writepages(mapping, wbc, | ||
2233 | ext4_da_get_block_write); | ||
2234 | ext4_journal_stop(handle); | ||
2235 | if (wbc->nr_to_write) { | ||
2236 | /* | ||
2237 | * There is no more writeout needed | ||
2238 | * or we requested a nonblocking writeout | ||
2239 | * and we found the device congested | ||
2240 | */ | ||
2241 | to_write += wbc->nr_to_write; | ||
2242 | break; | ||
2243 | } | ||
2244 | wbc->nr_to_write = to_write; | ||
2245 | } | ||
2246 | |||
2247 | out_writepages: | ||
2248 | wbc->nr_to_write = to_write; | ||
2249 | if (range_start) | ||
2250 | wbc->range_start = range_start; | ||
2251 | return ret; | ||
2252 | } | ||
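The writeout loop above caps nr_to_write per journal handle so one transaction's credits are never overrun, then keeps looping until the caller's overall budget is spent or writeback stops making progress. The control flow in isolation (write_chunk is an illustrative callback that, like the mpage_da_writepages() call above, reports how much of its quota went unused):

    static long write_in_chunks(long budget, long chunk_max,
                                long (*write_chunk)(long quota))
    {
            long to_write = budget;

            while (to_write) {
                    long quota = to_write < chunk_max ? to_write : chunk_max;
                    long left = write_chunk(quota); /* unwritten remainder */

                    to_write -= quota - left;
                    if (left)       /* congested, or nothing left dirty */
                            break;
            }
            return budget - to_write;       /* pages actually written */
    }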
2253 | |||
2254 | static int ext4_da_write_begin(struct file *file, struct address_space *mapping, | ||
2255 | loff_t pos, unsigned len, unsigned flags, | ||
2256 | struct page **pagep, void **fsdata) | ||
2257 | { | ||
2258 | int ret, retries = 0; | ||
2259 | struct page *page; | ||
2260 | pgoff_t index; | ||
2261 | unsigned from, to; | ||
2262 | struct inode *inode = mapping->host; | ||
2263 | handle_t *handle; | ||
2264 | |||
2265 | index = pos >> PAGE_CACHE_SHIFT; | ||
2266 | from = pos & (PAGE_CACHE_SIZE - 1); | ||
2267 | to = from + len; | ||
2268 | |||
2269 | retry: | ||
2270 | /* | ||
2271 | * With delayed allocation, we don't log the i_disksize update | ||
2272 | * if there is delayed block allocation. But we still need | ||
2273 | * to journal the i_disksize update if we write to the end | ||
2274 | * of a file which has an already mapped buffer. | ||
2275 | */ | ||
2276 | handle = ext4_journal_start(inode, 1); | ||
2277 | if (IS_ERR(handle)) { | ||
2278 | ret = PTR_ERR(handle); | ||
2279 | goto out; | ||
2280 | } | ||
2281 | |||
2282 | page = __grab_cache_page(mapping, index); | ||
2283 | if (!page) | ||
2284 | return -ENOMEM; | ||
2285 | *pagep = page; | ||
2286 | |||
2287 | ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, | ||
2288 | ext4_da_get_block_prep); | ||
2289 | if (ret < 0) { | ||
2290 | unlock_page(page); | ||
2291 | ext4_journal_stop(handle); | ||
2292 | page_cache_release(page); | ||
2293 | } | ||
2294 | |||
2295 | if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) | ||
2296 | goto retry; | ||
2297 | out: | ||
2298 | return ret; | ||
2299 | } | ||
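The retry label above implements the usual ENOSPC dance for journaled filesystems: a failed allocation may succeed after the journal commits and frees blocks, so the operation is retried a bounded number of times (the bound lives inside ext4_should_retry_alloc()). The bare idiom:

    #include <errno.h>

    static int with_retries(int (*op)(void), int max_retries)
    {
            int ret, retries = 0;

            do {
                    ret = op();
            } while (ret == -ENOSPC && retries++ < max_retries);
            return ret;
    }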
2300 | |||
2301 | /* | ||
2302 | * Check if we should update i_disksize | ||
2303 | * when writing to the end of file without requiring block allocation | ||
2304 | */ | ||
2305 | static int ext4_da_should_update_i_disksize(struct page *page, | ||
2306 | unsigned long offset) | ||
2307 | { | ||
2308 | struct buffer_head *bh; | ||
2309 | struct inode *inode = page->mapping->host; | ||
2310 | unsigned int idx; | ||
2311 | int i; | ||
2312 | |||
2313 | bh = page_buffers(page); | ||
2314 | idx = offset >> inode->i_blkbits; | ||
2315 | |||
2316 | for (i=0; i < idx; i++) | ||
2317 | bh = bh->b_this_page; | ||
2318 | |||
2319 | if (!buffer_mapped(bh) || (buffer_delay(bh))) | ||
2320 | return 0; | ||
2321 | return 1; | ||
2322 | } | ||
2323 | |||
2324 | static int ext4_da_write_end(struct file *file, | ||
2325 | struct address_space *mapping, | ||
2326 | loff_t pos, unsigned len, unsigned copied, | ||
2327 | struct page *page, void *fsdata) | ||
2328 | { | ||
2329 | struct inode *inode = mapping->host; | ||
2330 | int ret = 0, ret2; | ||
2331 | handle_t *handle = ext4_journal_current_handle(); | ||
2332 | loff_t new_i_size; | ||
2333 | unsigned long start, end; | ||
2334 | |||
2335 | start = pos & (PAGE_CACHE_SIZE - 1); | ||
2336 | end = start + copied -1; | ||
2337 | |||
2338 | /* | ||
2339 | * generic_write_end() will run mark_inode_dirty() if i_size | ||
2340 | * changes. So let's piggyback the i_disksize mark_inode_dirty | ||
2341 | * into that. | ||
2342 | */ | ||
2343 | |||
2344 | new_i_size = pos + copied; | ||
2345 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
2346 | if (ext4_da_should_update_i_disksize(page, end)) { | ||
2347 | down_write(&EXT4_I(inode)->i_data_sem); | ||
2348 | if (new_i_size > EXT4_I(inode)->i_disksize) { | ||
2349 | /* | ||
2350 | * Updating i_disksize when extending file | ||
2351 | * without needing block allocation | ||
2352 | */ | ||
2353 | if (ext4_should_order_data(inode)) | ||
2354 | ret = ext4_jbd2_file_inode(handle, | ||
2355 | inode); | ||
2356 | |||
2357 | EXT4_I(inode)->i_disksize = new_i_size; | ||
2358 | } | ||
2359 | up_write(&EXT4_I(inode)->i_data_sem); | ||
2360 | } | ||
2361 | } | ||
2362 | ret2 = generic_write_end(file, mapping, pos, len, copied, | ||
2363 | page, fsdata); | ||
2364 | copied = ret2; | ||
2365 | if (ret2 < 0) | ||
2366 | ret = ret2; | ||
2367 | ret2 = ext4_journal_stop(handle); | ||
2368 | if (!ret) | ||
2369 | ret = ret2; | ||
2370 | |||
2371 | return ret ? ret : copied; | ||
2372 | } | ||
2373 | |||
2374 | static void ext4_da_invalidatepage(struct page *page, unsigned long offset) | ||
2375 | { | ||
2376 | /* | ||
2377 | * Drop reserved blocks | ||
2378 | */ | ||
2379 | BUG_ON(!PageLocked(page)); | ||
2380 | if (!page_has_buffers(page)) | ||
2381 | goto out; | ||
2382 | |||
2383 | ext4_da_page_release_reservation(page, offset); | ||
2384 | |||
2385 | out: | ||
2386 | ext4_invalidatepage(page, offset); | ||
2387 | |||
2388 | return; | ||
2389 | } | ||
2390 | |||
1400 | 2391 | ||
1401 | /* | 2392 | /* |
1402 | * bmap() is special. It gets used by applications such as lilo and by | 2393 | * bmap() is special. It gets used by applications such as lilo and by |
@@ -1418,6 +2409,16 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) | |||
1418 | journal_t *journal; | 2409 | journal_t *journal; |
1419 | int err; | 2410 | int err; |
1420 | 2411 | ||
2412 | if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) && | ||
2413 | test_opt(inode->i_sb, DELALLOC)) { | ||
2414 | /* | ||
2415 | * With delalloc we want to sync the file | ||
2416 | * so that we can make sure we allocate | ||
2417 | * blocks for the file | ||
2418 | */ | ||
2419 | filemap_write_and_wait(mapping); | ||
2420 | } | ||
2421 | |||
1421 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { | 2422 | if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { |
1422 | /* | 2423 | /* |
1423 | * This is a REALLY heavyweight approach, but the use of | 2424 | * This is a REALLY heavyweight approach, but the use of |
@@ -1462,21 +2463,17 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) | |||
1462 | return 0; | 2463 | return 0; |
1463 | } | 2464 | } |
1464 | 2465 | ||
1465 | static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | ||
1466 | { | ||
1467 | if (buffer_mapped(bh)) | ||
1468 | return ext4_journal_dirty_data(handle, bh); | ||
1469 | return 0; | ||
1470 | } | ||
1471 | |||
1472 | /* | 2466 | /* |
1473 | * Note that we always start a transaction even if we're not journalling | 2467 | * Note that we don't need to start a transaction unless we're journaling data |
1474 | * data. This is to preserve ordering: any hole instantiation within | 2468 | * because we should have holes filled from ext4_page_mkwrite(). We even don't |
1475 | * __block_write_full_page -> ext4_get_block() should be journalled | 2469 | * need to file the inode to the transaction's list in ordered mode because if |
1476 | * along with the data so we don't crash and then get metadata which | 2470 | * we are writing back data added by write(), the inode is already there and if |
1477 | * refers to old data. | 2471 | * we are writing back data modified via mmap(), no one guarantees in which |
2472 | * transaction the data will hit the disk. In case we are journaling data, we | ||
2473 | * cannot start transaction directly because transaction start ranks above page | ||
2474 | * lock so we have to do some magic. | ||
1478 | * | 2475 | * |
1479 | * In all journalling modes block_write_full_page() will start the I/O. | 2476 | * In all journaling modes block_write_full_page() will start the I/O. |
1480 | * | 2477 | * |
1481 | * Problem: | 2478 | * Problem: |
1482 | * | 2479 | * |
@@ -1518,105 +2515,103 @@ static int jbd2_journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh) | |||
1518 | * disastrous. Any write() or metadata operation will sync the fs for | 2515 | * disastrous. Any write() or metadata operation will sync the fs for |
1519 | * us. | 2516 | * us. |
1520 | * | 2517 | * |
1521 | * AKPM2: if all the page's buffers are mapped to disk and !data=journal, | ||
1522 | * we don't need to open a transaction here. | ||
1523 | */ | 2518 | */ |
1524 | static int ext4_ordered_writepage(struct page *page, | 2519 | static int __ext4_normal_writepage(struct page *page, |
1525 | struct writeback_control *wbc) | 2520 | struct writeback_control *wbc) |
1526 | { | 2521 | { |
1527 | struct inode *inode = page->mapping->host; | 2522 | struct inode *inode = page->mapping->host; |
1528 | struct buffer_head *page_bufs; | ||
1529 | handle_t *handle = NULL; | ||
1530 | int ret = 0; | ||
1531 | int err; | ||
1532 | |||
1533 | J_ASSERT(PageLocked(page)); | ||
1534 | |||
1535 | /* | ||
1536 | * We give up here if we're reentered, because it might be for a | ||
1537 | * different filesystem. | ||
1538 | */ | ||
1539 | if (ext4_journal_current_handle()) | ||
1540 | goto out_fail; | ||
1541 | 2523 | ||
1542 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2524 | if (test_opt(inode->i_sb, NOBH)) |
2525 | return nobh_writepage(page, | ||
2526 | ext4_normal_get_block_write, wbc); | ||
2527 | else | ||
2528 | return block_write_full_page(page, | ||
2529 | ext4_normal_get_block_write, | ||
2530 | wbc); | ||
2531 | } | ||
1543 | 2532 | ||
1544 | if (IS_ERR(handle)) { | 2533 | static int ext4_normal_writepage(struct page *page, |
1545 | ret = PTR_ERR(handle); | 2534 | struct writeback_control *wbc) |
1546 | goto out_fail; | 2535 | { |
1547 | } | 2536 | struct inode *inode = page->mapping->host; |
2537 | loff_t size = i_size_read(inode); | ||
2538 | loff_t len; | ||
1548 | 2539 | ||
1549 | if (!page_has_buffers(page)) { | 2540 | J_ASSERT(PageLocked(page)); |
1550 | create_empty_buffers(page, inode->i_sb->s_blocksize, | 2541 | if (page->index == size >> PAGE_CACHE_SHIFT) |
1551 | (1 << BH_Dirty)|(1 << BH_Uptodate)); | 2542 | len = size & ~PAGE_CACHE_MASK; |
2543 | else | ||
2544 | len = PAGE_CACHE_SIZE; | ||
2545 | |||
2546 | if (page_has_buffers(page)) { | ||
2547 | /* if the page has buffers they should all be mapped | ||
2548 | * and allocated. If there are no buffers attached | ||
2549 | * to the page we know the page is dirty but it lost | ||
2550 | * buffers. That means that at some moment in time | ||
2551 | * after write_begin() / write_end() has been called | ||
2552 | * all buffers have been clean and thus they must have been | ||
2553 | * written at least once. So they are all mapped and we can | ||
2554 | * happily proceed with mapping them and writing the page. | ||
2555 | */ | ||
2556 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
2557 | ext4_bh_unmapped_or_delay)); | ||
1552 | } | 2558 | } |
1553 | page_bufs = page_buffers(page); | ||
1554 | walk_page_buffers(handle, page_bufs, 0, | ||
1555 | PAGE_CACHE_SIZE, NULL, bget_one); | ||
1556 | |||
1557 | ret = block_write_full_page(page, ext4_get_block, wbc); | ||
1558 | 2559 | ||
1559 | /* | 2560 | if (!ext4_journal_current_handle()) |
1560 | * The page can become unlocked at any point now, and | 2561 | return __ext4_normal_writepage(page, wbc); |
1561 | * truncate can then come in and change things. So we | ||
1562 | * can't touch *page from now on. But *page_bufs is | ||
1563 | * safe due to elevated refcount. | ||
1564 | */ | ||
1565 | 2562 | ||
1566 | /* | ||
1567 | * And attach them to the current transaction. But only if | ||
1568 | * block_write_full_page() succeeded. Otherwise they are unmapped, | ||
1569 | * and generally junk. | ||
1570 | */ | ||
1571 | if (ret == 0) { | ||
1572 | err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, | ||
1573 | NULL, jbd2_journal_dirty_data_fn); | ||
1574 | if (!ret) | ||
1575 | ret = err; | ||
1576 | } | ||
1577 | walk_page_buffers(handle, page_bufs, 0, | ||
1578 | PAGE_CACHE_SIZE, NULL, bput_one); | ||
1579 | err = ext4_journal_stop(handle); | ||
1580 | if (!ret) | ||
1581 | ret = err; | ||
1582 | return ret; | ||
1583 | |||
1584 | out_fail: | ||
1585 | redirty_page_for_writepage(wbc, page); | 2563 | redirty_page_for_writepage(wbc, page); |
1586 | unlock_page(page); | 2564 | unlock_page(page); |
1587 | return ret; | 2565 | return 0; |
1588 | } | 2566 | } |
1589 | 2567 | ||
1590 | static int ext4_writeback_writepage(struct page *page, | 2568 | static int __ext4_journalled_writepage(struct page *page, |
1591 | struct writeback_control *wbc) | 2569 | struct writeback_control *wbc) |
1592 | { | 2570 | { |
1593 | struct inode *inode = page->mapping->host; | 2571 | struct address_space *mapping = page->mapping; |
2572 | struct inode *inode = mapping->host; | ||
2573 | struct buffer_head *page_bufs; | ||
1594 | handle_t *handle = NULL; | 2574 | handle_t *handle = NULL; |
1595 | int ret = 0; | 2575 | int ret = 0; |
1596 | int err; | 2576 | int err; |
1597 | 2577 | ||
1598 | if (ext4_journal_current_handle()) | 2578 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, |
1599 | goto out_fail; | 2579 | ext4_normal_get_block_write); |
2580 | if (ret != 0) | ||
2581 | goto out_unlock; | ||
2582 | |||
2583 | page_bufs = page_buffers(page); | ||
2584 | walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE, NULL, | ||
2585 | bget_one); | ||
2586 | /* As soon as we unlock the page, it can go away, but we have | ||
2587 | * references to buffers so we are safe */ | ||
2588 | unlock_page(page); | ||
1600 | 2589 | ||
1601 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2590 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); |
1602 | if (IS_ERR(handle)) { | 2591 | if (IS_ERR(handle)) { |
1603 | ret = PTR_ERR(handle); | 2592 | ret = PTR_ERR(handle); |
1604 | goto out_fail; | 2593 | goto out; |
1605 | } | 2594 | } |
1606 | 2595 | ||
1607 | if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) | 2596 | ret = walk_page_buffers(handle, page_bufs, 0, |
1608 | ret = nobh_writepage(page, ext4_get_block, wbc); | 2597 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); |
1609 | else | ||
1610 | ret = block_write_full_page(page, ext4_get_block, wbc); | ||
1611 | 2598 | ||
2599 | err = walk_page_buffers(handle, page_bufs, 0, | ||
2600 | PAGE_CACHE_SIZE, NULL, write_end_fn); | ||
2601 | if (ret == 0) | ||
2602 | ret = err; | ||
1612 | err = ext4_journal_stop(handle); | 2603 | err = ext4_journal_stop(handle); |
1613 | if (!ret) | 2604 | if (!ret) |
1614 | ret = err; | 2605 | ret = err; |
1615 | return ret; | ||
1616 | 2606 | ||
1617 | out_fail: | 2607 | walk_page_buffers(handle, page_bufs, 0, |
1618 | redirty_page_for_writepage(wbc, page); | 2608 | PAGE_CACHE_SIZE, NULL, bput_one); |
2609 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | ||
2610 | goto out; | ||
2611 | |||
2612 | out_unlock: | ||
1619 | unlock_page(page); | 2613 | unlock_page(page); |
2614 | out: | ||
1620 | return ret; | 2615 | return ret; |
1621 | } | 2616 | } |
1622 | 2617 | ||
@@ -1624,59 +2619,53 @@ static int ext4_journalled_writepage(struct page *page, | |||
1624 | struct writeback_control *wbc) | 2619 | struct writeback_control *wbc) |
1625 | { | 2620 | { |
1626 | struct inode *inode = page->mapping->host; | 2621 | struct inode *inode = page->mapping->host; |
1627 | handle_t *handle = NULL; | 2622 | loff_t size = i_size_read(inode); |
1628 | int ret = 0; | 2623 | loff_t len; |
1629 | int err; | ||
1630 | 2624 | ||
1631 | if (ext4_journal_current_handle()) | 2625 | J_ASSERT(PageLocked(page)); |
1632 | goto no_write; | 2626 | if (page->index == size >> PAGE_CACHE_SHIFT) |
2627 | len = size & ~PAGE_CACHE_MASK; | ||
2628 | else | ||
2629 | len = PAGE_CACHE_SIZE; | ||
2630 | |||
2631 | if (page_has_buffers(page)) { | ||
2632 | /* if the page has buffers they should all be mapped | ||
2633 | * and allocated. If there are no buffers attached | ||
2634 | * to the page we know the page is dirty but it lost | ||
2635 | * buffers. That means that at some moment in time | ||
2636 | * after write_begin() / write_end() has been called | ||
2637 | * all buffers have been clean and thus they must have been | ||
2638 | * written at least once. So they are all mapped and we can | ||
2639 | * happily proceed with mapping them and writing the page. | ||
2640 | */ | ||
2641 | BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
2642 | ext4_bh_unmapped_or_delay)); | ||
2643 | } | ||
1633 | 2644 | ||
1634 | handle = ext4_journal_start(inode, ext4_writepage_trans_blocks(inode)); | 2645 | if (ext4_journal_current_handle()) |
1635 | if (IS_ERR(handle)) { | ||
1636 | ret = PTR_ERR(handle); | ||
1637 | goto no_write; | 2646 | goto no_write; |
1638 | } | ||
1639 | 2647 | ||
1640 | if (!page_has_buffers(page) || PageChecked(page)) { | 2648 | if (PageChecked(page)) { |
1641 | /* | 2649 | /* |
1642 | * It's mmapped pagecache. Add buffers and journal it. There | 2650 | * It's mmapped pagecache. Add buffers and journal it. There |
1643 | * doesn't seem much point in redirtying the page here. | 2651 | * doesn't seem much point in redirtying the page here. |
1644 | */ | 2652 | */ |
1645 | ClearPageChecked(page); | 2653 | ClearPageChecked(page); |
1646 | ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, | 2654 | return __ext4_journalled_writepage(page, wbc); |
1647 | ext4_get_block); | ||
1648 | if (ret != 0) { | ||
1649 | ext4_journal_stop(handle); | ||
1650 | goto out_unlock; | ||
1651 | } | ||
1652 | ret = walk_page_buffers(handle, page_buffers(page), 0, | ||
1653 | PAGE_CACHE_SIZE, NULL, do_journal_get_write_access); | ||
1654 | |||
1655 | err = walk_page_buffers(handle, page_buffers(page), 0, | ||
1656 | PAGE_CACHE_SIZE, NULL, write_end_fn); | ||
1657 | if (ret == 0) | ||
1658 | ret = err; | ||
1659 | EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; | ||
1660 | unlock_page(page); | ||
1661 | } else { | 2655 | } else { |
1662 | /* | 2656 | /* |
1663 | * It may be a page full of checkpoint-mode buffers. We don't | 2657 | * It may be a page full of checkpoint-mode buffers. We don't |
1664 | * really know unless we go poke around in the buffer_heads. | 2658 | * really know unless we go poke around in the buffer_heads. |
1665 | * But block_write_full_page will do the right thing. | 2659 | * But block_write_full_page will do the right thing. |
1666 | */ | 2660 | */ |
1667 | ret = block_write_full_page(page, ext4_get_block, wbc); | 2661 | return block_write_full_page(page, |
2662 | ext4_normal_get_block_write, | ||
2663 | wbc); | ||
1668 | } | 2664 | } |
1669 | err = ext4_journal_stop(handle); | ||
1670 | if (!ret) | ||
1671 | ret = err; | ||
1672 | out: | ||
1673 | return ret; | ||
1674 | |||
1675 | no_write: | 2665 | no_write: |
1676 | redirty_page_for_writepage(wbc, page); | 2666 | redirty_page_for_writepage(wbc, page); |
1677 | out_unlock: | ||
1678 | unlock_page(page); | 2667 | unlock_page(page); |
1679 | goto out; | 2668 | return 0; |
1680 | } | 2669 | } |
1681 | 2670 | ||
1682 | static int ext4_readpage(struct file *file, struct page *page) | 2671 | static int ext4_readpage(struct file *file, struct page *page) |
@@ -1819,7 +2808,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) | |||
1819 | static const struct address_space_operations ext4_ordered_aops = { | 2808 | static const struct address_space_operations ext4_ordered_aops = { |
1820 | .readpage = ext4_readpage, | 2809 | .readpage = ext4_readpage, |
1821 | .readpages = ext4_readpages, | 2810 | .readpages = ext4_readpages, |
1822 | .writepage = ext4_ordered_writepage, | 2811 | .writepage = ext4_normal_writepage, |
1823 | .sync_page = block_sync_page, | 2812 | .sync_page = block_sync_page, |
1824 | .write_begin = ext4_write_begin, | 2813 | .write_begin = ext4_write_begin, |
1825 | .write_end = ext4_ordered_write_end, | 2814 | .write_end = ext4_ordered_write_end, |
@@ -1833,7 +2822,7 @@ static const struct address_space_operations ext4_ordered_aops = { | |||
1833 | static const struct address_space_operations ext4_writeback_aops = { | 2822 | static const struct address_space_operations ext4_writeback_aops = { |
1834 | .readpage = ext4_readpage, | 2823 | .readpage = ext4_readpage, |
1835 | .readpages = ext4_readpages, | 2824 | .readpages = ext4_readpages, |
1836 | .writepage = ext4_writeback_writepage, | 2825 | .writepage = ext4_normal_writepage, |
1837 | .sync_page = block_sync_page, | 2826 | .sync_page = block_sync_page, |
1838 | .write_begin = ext4_write_begin, | 2827 | .write_begin = ext4_write_begin, |
1839 | .write_end = ext4_writeback_write_end, | 2828 | .write_end = ext4_writeback_write_end, |
@@ -1857,10 +2846,31 @@ static const struct address_space_operations ext4_journalled_aops = { | |||
1857 | .releasepage = ext4_releasepage, | 2846 | .releasepage = ext4_releasepage, |
1858 | }; | 2847 | }; |
1859 | 2848 | ||
2849 | static const struct address_space_operations ext4_da_aops = { | ||
2850 | .readpage = ext4_readpage, | ||
2851 | .readpages = ext4_readpages, | ||
2852 | .writepage = ext4_da_writepage, | ||
2853 | .writepages = ext4_da_writepages, | ||
2854 | .sync_page = block_sync_page, | ||
2855 | .write_begin = ext4_da_write_begin, | ||
2856 | .write_end = ext4_da_write_end, | ||
2857 | .bmap = ext4_bmap, | ||
2858 | .invalidatepage = ext4_da_invalidatepage, | ||
2859 | .releasepage = ext4_releasepage, | ||
2860 | .direct_IO = ext4_direct_IO, | ||
2861 | .migratepage = buffer_migrate_page, | ||
2862 | }; | ||
2863 | |||
1860 | void ext4_set_aops(struct inode *inode) | 2864 | void ext4_set_aops(struct inode *inode) |
1861 | { | 2865 | { |
1862 | if (ext4_should_order_data(inode)) | 2866 | if (ext4_should_order_data(inode) && |
2867 | test_opt(inode->i_sb, DELALLOC)) | ||
2868 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
2869 | else if (ext4_should_order_data(inode)) | ||
1863 | inode->i_mapping->a_ops = &ext4_ordered_aops; | 2870 | inode->i_mapping->a_ops = &ext4_ordered_aops; |
2871 | else if (ext4_should_writeback_data(inode) && | ||
2872 | test_opt(inode->i_sb, DELALLOC)) | ||
2873 | inode->i_mapping->a_ops = &ext4_da_aops; | ||
1864 | else if (ext4_should_writeback_data(inode)) | 2874 | else if (ext4_should_writeback_data(inode)) |
1865 | inode->i_mapping->a_ops = &ext4_writeback_aops; | 2875 | inode->i_mapping->a_ops = &ext4_writeback_aops; |
1866 | else | 2876 | else |
@@ -1873,7 +2883,7 @@ void ext4_set_aops(struct inode *inode) | |||
1873 | * This required during truncate. We need to physically zero the tail end | 2883 | * This required during truncate. We need to physically zero the tail end |
1874 | * of that block so it doesn't yield old data if the file is later grown. | 2884 | * of that block so it doesn't yield old data if the file is later grown. |
1875 | */ | 2885 | */ |
1876 | int ext4_block_truncate_page(handle_t *handle, struct page *page, | 2886 | int ext4_block_truncate_page(handle_t *handle, |
1877 | struct address_space *mapping, loff_t from) | 2887 | struct address_space *mapping, loff_t from) |
1878 | { | 2888 | { |
1879 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; | 2889 | ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; |
@@ -1882,8 +2892,13 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |||
1882 | ext4_lblk_t iblock; | 2892 | ext4_lblk_t iblock; |
1883 | struct inode *inode = mapping->host; | 2893 | struct inode *inode = mapping->host; |
1884 | struct buffer_head *bh; | 2894 | struct buffer_head *bh; |
2895 | struct page *page; | ||
1885 | int err = 0; | 2896 | int err = 0; |
1886 | 2897 | ||
2898 | page = grab_cache_page(mapping, from >> PAGE_CACHE_SHIFT); | ||
2899 | if (!page) | ||
2900 | return -EINVAL; | ||
2901 | |||
1887 | blocksize = inode->i_sb->s_blocksize; | 2902 | blocksize = inode->i_sb->s_blocksize; |
1888 | length = blocksize - (offset & (blocksize - 1)); | 2903 | length = blocksize - (offset & (blocksize - 1)); |
1889 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); | 2904 | iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); |
@@ -1956,7 +2971,7 @@ int ext4_block_truncate_page(handle_t *handle, struct page *page, | |||
1956 | err = ext4_journal_dirty_metadata(handle, bh); | 2971 | err = ext4_journal_dirty_metadata(handle, bh); |
1957 | } else { | 2972 | } else { |
1958 | if (ext4_should_order_data(inode)) | 2973 | if (ext4_should_order_data(inode)) |
1959 | err = ext4_journal_dirty_data(handle, bh); | 2974 | err = ext4_jbd2_file_inode(handle, inode); |
1960 | mark_buffer_dirty(bh); | 2975 | mark_buffer_dirty(bh); |
1961 | } | 2976 | } |
1962 | 2977 | ||
@@ -2179,7 +3194,21 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, | |||
2179 | 3194 | ||
2180 | if (this_bh) { | 3195 | if (this_bh) { |
2181 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); | 3196 | BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); |
2182 | ext4_journal_dirty_metadata(handle, this_bh); | 3197 | |
3198 | /* | ||
3199 | * The buffer head should have an attached journal head at this | ||
3200 | * point. However, if the data is corrupted and an indirect | ||
3201 | * block pointed to itself, it would have been detached when | ||
3202 | * the block was cleared. Check for this instead of OOPSing. | ||
3203 | */ | ||
3204 | if (bh2jh(this_bh)) | ||
3205 | ext4_journal_dirty_metadata(handle, this_bh); | ||
3206 | else | ||
3207 | ext4_error(inode->i_sb, __func__, | ||
3208 | "circular indirect block detected, " | ||
3209 | "inode=%lu, block=%llu", | ||
3210 | inode->i_ino, | ||
3211 | (unsigned long long) this_bh->b_blocknr); | ||
2183 | } | 3212 | } |
2184 | } | 3213 | } |
2185 | 3214 | ||
@@ -2305,6 +3334,19 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
2305 | } | 3334 | } |
2306 | } | 3335 | } |
2307 | 3336 | ||
3337 | int ext4_can_truncate(struct inode *inode) | ||
3338 | { | ||
3339 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
3340 | return 0; | ||
3341 | if (S_ISREG(inode->i_mode)) | ||
3342 | return 1; | ||
3343 | if (S_ISDIR(inode->i_mode)) | ||
3344 | return 1; | ||
3345 | if (S_ISLNK(inode->i_mode)) | ||
3346 | return !ext4_inode_is_fast_symlink(inode); | ||
3347 | return 0; | ||
3348 | } | ||
3349 | |||
2308 | /* | 3350 | /* |
2309 | * ext4_truncate() | 3351 | * ext4_truncate() |
2310 | * | 3352 | * |
@@ -2347,51 +3389,25 @@ void ext4_truncate(struct inode *inode) | |||
2347 | int n; | 3389 | int n; |
2348 | ext4_lblk_t last_block; | 3390 | ext4_lblk_t last_block; |
2349 | unsigned blocksize = inode->i_sb->s_blocksize; | 3391 | unsigned blocksize = inode->i_sb->s_blocksize; |
2350 | struct page *page; | ||
2351 | 3392 | ||
2352 | if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | 3393 | if (!ext4_can_truncate(inode)) |
2353 | S_ISLNK(inode->i_mode))) | ||
2354 | return; | ||
2355 | if (ext4_inode_is_fast_symlink(inode)) | ||
2356 | return; | ||
2357 | if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) | ||
2358 | return; | 3394 | return; |
2359 | 3395 | ||
2360 | /* | ||
2361 | * We have to lock the EOF page here, because lock_page() nests | ||
2362 | * outside jbd2_journal_start(). | ||
2363 | */ | ||
2364 | if ((inode->i_size & (blocksize - 1)) == 0) { | ||
2365 | /* Block boundary? Nothing to do */ | ||
2366 | page = NULL; | ||
2367 | } else { | ||
2368 | page = grab_cache_page(mapping, | ||
2369 | inode->i_size >> PAGE_CACHE_SHIFT); | ||
2370 | if (!page) | ||
2371 | return; | ||
2372 | } | ||
2373 | |||
2374 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 3396 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
2375 | ext4_ext_truncate(inode, page); | 3397 | ext4_ext_truncate(inode); |
2376 | return; | 3398 | return; |
2377 | } | 3399 | } |
2378 | 3400 | ||
2379 | handle = start_transaction(inode); | 3401 | handle = start_transaction(inode); |
2380 | if (IS_ERR(handle)) { | 3402 | if (IS_ERR(handle)) |
2381 | if (page) { | ||
2382 | clear_highpage(page); | ||
2383 | flush_dcache_page(page); | ||
2384 | unlock_page(page); | ||
2385 | page_cache_release(page); | ||
2386 | } | ||
2387 | return; /* AKPM: return what? */ | 3403 | return; /* AKPM: return what? */ |
2388 | } | ||
2389 | 3404 | ||
2390 | last_block = (inode->i_size + blocksize-1) | 3405 | last_block = (inode->i_size + blocksize-1) |
2391 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); | 3406 | >> EXT4_BLOCK_SIZE_BITS(inode->i_sb); |
2392 | 3407 | ||
2393 | if (page) | 3408 | if (inode->i_size & (blocksize - 1)) |
2394 | ext4_block_truncate_page(handle, page, mapping, inode->i_size); | 3409 | if (ext4_block_truncate_page(handle, mapping, inode->i_size)) |
3410 | goto out_stop; | ||
2395 | 3411 | ||
2396 | n = ext4_block_to_path(inode, last_block, offsets, NULL); | 3412 | n = ext4_block_to_path(inode, last_block, offsets, NULL); |
2397 | if (n == 0) | 3413 | if (n == 0) |
@@ -2410,6 +3426,11 @@ void ext4_truncate(struct inode *inode) | |||
2410 | goto out_stop; | 3426 | goto out_stop; |
2411 | 3427 | ||
2412 | /* | 3428 | /* |
3429 | * From here we block out all ext4_get_block() callers who want to | ||
3430 | * modify the block allocation tree. | ||
3431 | */ | ||
3432 | down_write(&ei->i_data_sem); | ||
3433 | /* | ||
2413 | * The orphan list entry will now protect us from any crash which | 3434 | * The orphan list entry will now protect us from any crash which |
2414 | * occurs before the truncate completes, so it is now safe to propagate | 3435 | * occurs before the truncate completes, so it is now safe to propagate |
2415 | * the new, shorter inode size (held for now in i_size) into the | 3436 | * the new, shorter inode size (held for now in i_size) into the |
@@ -2418,12 +3439,6 @@ void ext4_truncate(struct inode *inode) | |||
2418 | */ | 3439 | */ |
2419 | ei->i_disksize = inode->i_size; | 3440 | ei->i_disksize = inode->i_size; |
2420 | 3441 | ||
2421 | /* | ||
2422 | * From here we block out all ext4_get_block() callers who want to | ||
2423 | * modify the block allocation tree. | ||
2424 | */ | ||
2425 | down_write(&ei->i_data_sem); | ||
2426 | |||
2427 | if (n == 1) { /* direct blocks */ | 3442 | if (n == 1) { /* direct blocks */ |
2428 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], | 3443 | ext4_free_data(handle, inode, NULL, i_data+offsets[0], |
2429 | i_data + EXT4_NDIR_BLOCKS); | 3444 | i_data + EXT4_NDIR_BLOCKS); |
@@ -3107,7 +4122,14 @@ int ext4_write_inode(struct inode *inode, int wait) | |||
3107 | * be freed, so we have a strong guarantee that no future commit will | 4122 | * be freed, so we have a strong guarantee that no future commit will |
3108 | * leave these blocks visible to the user.) | 4123 | * leave these blocks visible to the user.) |
3109 | * | 4124 | * |
3110 | * Called with inode->sem down. | 4125 | * Another thing we have to assure is that if we are in ordered mode |
4126 | * and inode is still attached to the committing transaction, we must | ||
4127 | * start writeout of all the dirty pages which are being truncated. | ||
4128 | * This way we are sure that all the data written in the previous | ||
4129 | * transaction are already on disk (truncate waits for pages under | ||
4130 | * writeback). | ||
4131 | * | ||
4132 | * Called with inode->i_mutex down. | ||
3111 | */ | 4133 | */ |
3112 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) | 4134 | int ext4_setattr(struct dentry *dentry, struct iattr *attr) |
3113 | { | 4135 | { |
@@ -3173,6 +4195,22 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) | |||
3173 | if (!error) | 4195 | if (!error) |
3174 | error = rc; | 4196 | error = rc; |
3175 | ext4_journal_stop(handle); | 4197 | ext4_journal_stop(handle); |
4198 | |||
4199 | if (ext4_should_order_data(inode)) { | ||
4200 | error = ext4_begin_ordered_truncate(inode, | ||
4201 | attr->ia_size); | ||
4202 | if (error) { | ||
4203 | /* Do as much error cleanup as possible */ | ||
4204 | handle = ext4_journal_start(inode, 3); | ||
4205 | if (IS_ERR(handle)) { | ||
4206 | ext4_orphan_del(NULL, inode); | ||
4207 | goto err_out; | ||
4208 | } | ||
4209 | ext4_orphan_del(handle, inode); | ||
4210 | ext4_journal_stop(handle); | ||
4211 | goto err_out; | ||
4212 | } | ||
4213 | } | ||
3176 | } | 4214 | } |
3177 | 4215 | ||
3178 | rc = inode_setattr(inode, attr); | 4216 | rc = inode_setattr(inode, attr); |
@@ -3193,6 +4231,32 @@ err_out: | |||
3193 | return error; | 4231 | return error; |
3194 | } | 4232 | } |
3195 | 4233 | ||
4234 | int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, | ||
4235 | struct kstat *stat) | ||
4236 | { | ||
4237 | struct inode *inode; | ||
4238 | unsigned long delalloc_blocks; | ||
4239 | |||
4240 | inode = dentry->d_inode; | ||
4241 | generic_fillattr(inode, stat); | ||
4242 | |||
4243 | /* | ||
4244 | * We can't update i_blocks if the block allocation is delayed | ||
4245 | * otherwise in the case of system crash before the real block | ||
4246 | * allocation is done, we will have i_blocks inconsistent with | ||
4247 | * on-disk file blocks. | ||
4248 | * We always keep i_blocks updated together with real | ||
4249 | * allocation. But so as not to confuse userspace, stat | ||
4250 | * will return the blocks that include the delayed allocation | ||
4251 | * blocks for this file. | ||
4252 | */ | ||
4253 | spin_lock(&EXT4_I(inode)->i_block_reservation_lock); | ||
4254 | delalloc_blocks = EXT4_I(inode)->i_reserved_data_blocks; | ||
4255 | spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); | ||
4256 | |||
4257 | stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; | ||
4258 | return 0; | ||
4259 | } | ||
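The final adjustment converts reserved filesystem blocks into the 512-byte sectors st_blocks is expressed in: shift left by the block size bits to get bytes, then right by 9 to get sectors. A worked example with 4 KiB blocks:

    #include <assert.h>

    int main(void)
    {
            unsigned blkbits = 12;              /* 4 KiB blocks */
            unsigned long delalloc_blocks = 3;
            unsigned long sectors =
                    (delalloc_blocks << blkbits) >> 9;

            assert(sectors == 24);              /* 3 * 4096 / 512 */
            return 0;
    }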
3196 | 4260 | ||
3197 | /* | 4261 | /* |
3198 | * How many blocks doth make a writepage()? | 4262 | * How many blocks doth make a writepage()? |
@@ -3506,3 +4570,64 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) | |||
3506 | 4570 | ||
3507 | return err; | 4571 | return err; |
3508 | } | 4572 | } |
4573 | |||
4574 | static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh) | ||
4575 | { | ||
4576 | return !buffer_mapped(bh); | ||
4577 | } | ||
4578 | |||
4579 | int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) | ||
4580 | { | ||
4581 | loff_t size; | ||
4582 | unsigned long len; | ||
4583 | int ret = -EINVAL; | ||
4584 | struct file *file = vma->vm_file; | ||
4585 | struct inode *inode = file->f_path.dentry->d_inode; | ||
4586 | struct address_space *mapping = inode->i_mapping; | ||
4587 | |||
4588 | /* | ||
4589 | * Get i_alloc_sem to stop truncates messing with the inode. We cannot | ||
4590 | * get i_mutex because we are already holding mmap_sem. | ||
4591 | */ | ||
4592 | down_read(&inode->i_alloc_sem); | ||
4593 | size = i_size_read(inode); | ||
4594 | if (page->mapping != mapping || size <= page_offset(page) | ||
4595 | || !PageUptodate(page)) { | ||
4596 | /* page got truncated from under us? */ | ||
4597 | goto out_unlock; | ||
4598 | } | ||
4599 | ret = 0; | ||
4600 | if (PageMappedToDisk(page)) | ||
4601 | goto out_unlock; | ||
4602 | |||
4603 | if (page->index == size >> PAGE_CACHE_SHIFT) | ||
4604 | len = size & ~PAGE_CACHE_MASK; | ||
4605 | else | ||
4606 | len = PAGE_CACHE_SIZE; | ||
4607 | |||
4608 | if (page_has_buffers(page)) { | ||
4609 | /* return if we have all the buffers mapped */ | ||
4610 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | ||
4611 | ext4_bh_unmapped)) | ||
4612 | goto out_unlock; | ||
4613 | } | ||
4614 | /* | ||
4615 | * OK, we need to fill the hole... Do write_begin/write_end | ||
4616 | * to do block allocation/reservation. We are not holding | ||
4617 | * inode->i_mutex here. That allows parallel write_begin, | ||
4618 | * write_end calls. lock_page prevents this from happening | ||
4619 | * on the same page though | ||
4620 | */ | ||
4621 | ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), | ||
4622 | len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); | ||
4623 | if (ret < 0) | ||
4624 | goto out_unlock; | ||
4625 | ret = mapping->a_ops->write_end(file, mapping, page_offset(page), | ||
4626 | len, len, page, NULL); | ||
4627 | if (ret < 0) | ||
4628 | goto out_unlock; | ||
4629 | ret = 0; | ||
4630 | out_unlock: | ||
4631 | up_read(&inode->i_alloc_sem); | ||
4632 | return ret; | ||
4633 | } | ||
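The page->index test above determines how much of the faulting page needs block allocation: only the page containing EOF is partial; every other page gets a full PAGE_CACHE_SIZE. A hedged userspace sketch of that length computation (assumes 4KiB pages; names are illustrative):

        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define PAGE_SIZE  (1UL << PAGE_SHIFT)
        #define PAGE_MASK  (~(PAGE_SIZE - 1))

        static unsigned long mkwrite_len(unsigned long long size,
                                         unsigned long page_index)
        {
                if (page_index == size >> PAGE_SHIFT)
                        return size & ~PAGE_MASK;  /* bytes valid in the EOF page */
                return PAGE_SIZE;                  /* any page fully below EOF */
        }

        int main(void)
        {
                /* a 9000-byte file: page 2 holds 9000 - 8192 = 808 valid bytes */
                printf("%lu\n", mkwrite_len(9000, 2));
                printf("%lu\n", mkwrite_len(9000, 1));
                return 0;
        }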
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c9900aade150..8d141a25bbee 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -381,22 +381,28 @@ static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) | |||
381 | 381 | ||
382 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) | 382 | static inline int mb_find_next_zero_bit(void *addr, int max, int start) |
383 | { | 383 | { |
384 | int fix = 0; | 384 | int fix = 0, ret, tmpmax; |
385 | addr = mb_correct_addr_and_bit(&fix, addr); | 385 | addr = mb_correct_addr_and_bit(&fix, addr); |
386 | max += fix; | 386 | tmpmax = max + fix; |
387 | start += fix; | 387 | start += fix; |
388 | 388 | ||
389 | return ext4_find_next_zero_bit(addr, max, start) - fix; | 389 | ret = ext4_find_next_zero_bit(addr, tmpmax, start) - fix; |
390 | if (ret > max) | ||
391 | return max; | ||
392 | return ret; | ||
390 | } | 393 | } |
391 | 394 | ||
392 | static inline int mb_find_next_bit(void *addr, int max, int start) | 395 | static inline int mb_find_next_bit(void *addr, int max, int start) |
393 | { | 396 | { |
394 | int fix = 0; | 397 | int fix = 0, ret, tmpmax; |
395 | addr = mb_correct_addr_and_bit(&fix, addr); | 398 | addr = mb_correct_addr_and_bit(&fix, addr); |
396 | max += fix; | 399 | tmpmax = max + fix; |
397 | start += fix; | 400 | start += fix; |
398 | 401 | ||
399 | return ext4_find_next_bit(addr, max, start) - fix; | 402 | ret = ext4_find_next_bit(addr, tmpmax, start) - fix; |
403 | if (ret > max) | ||
404 | return max; | ||
405 | return ret; | ||
400 | } | 406 | } |
401 | 407 | ||
402 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) | 408 | static void *mb_find_buddy(struct ext4_buddy *e4b, int order, int *max) |
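Why the clamp matters: the underlying bit-search primitives scan at word granularity, so (on my reading of this fix, which the diff itself does not spell out) the raw result can land past the caller's max when max is not word-aligned; callers take a return of max to mean "not found". A standalone approximation, with a byte-granular scan standing in for ext4_find_next_zero_bit():

        #include <stdio.h>

        /* byte-granular stand-in: like the word-granular kernel bitops,
         * the hit it reports can land past 'size' when size is not
         * byte-aligned (intra-byte start offset ignored for brevity) */
        static int find_next_zero_bit_coarse(const unsigned char *addr,
                                             int size, int start)
        {
                int byte, bit;

                for (byte = start / 8; byte * 8 < size; byte++) {
                        if (addr[byte] == 0xff)
                                continue;
                        for (bit = 0; bit < 8; bit++)
                                if (!(addr[byte] & (1 << bit)))
                                        return byte * 8 + bit; /* may be >= size */
                }
                return size;
        }

        static int mb_find_next_zero_bit_sketch(const unsigned char *addr,
                                                int max, int start, int fix)
        {
                int tmpmax = max + fix;
                int ret;

                ret = find_next_zero_bit_coarse(addr, tmpmax, start + fix) - fix;
                if (ret > max)
                        return max;     /* the clamp the patch adds */
                return ret;
        }

        int main(void)
        {
                unsigned char map[2] = { 0xff, 0x7f }; /* only bit 15 is zero */

                /* an unclamped 12-bit search would report 15; we get 12 */
                printf("%d\n", mb_find_next_zero_bit_sketch(map, 12, 0, 0));
                return 0;
        }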
@@ -803,6 +809,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
803 | if (!buffer_uptodate(bh[i])) | 809 | if (!buffer_uptodate(bh[i])) |
804 | goto out; | 810 | goto out; |
805 | 811 | ||
812 | err = 0; | ||
806 | first_block = page->index * blocks_per_page; | 813 | first_block = page->index * blocks_per_page; |
807 | for (i = 0; i < blocks_per_page; i++) { | 814 | for (i = 0; i < blocks_per_page; i++) { |
808 | int group; | 815 | int group; |
@@ -883,6 +890,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
883 | int pnum; | 890 | int pnum; |
884 | int poff; | 891 | int poff; |
885 | struct page *page; | 892 | struct page *page; |
893 | int ret; | ||
886 | 894 | ||
887 | mb_debug("load group %lu\n", group); | 895 | mb_debug("load group %lu\n", group); |
888 | 896 | ||
@@ -914,15 +922,21 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
914 | if (page) { | 922 | if (page) { |
915 | BUG_ON(page->mapping != inode->i_mapping); | 923 | BUG_ON(page->mapping != inode->i_mapping); |
916 | if (!PageUptodate(page)) { | 924 | if (!PageUptodate(page)) { |
917 | ext4_mb_init_cache(page, NULL); | 925 | ret = ext4_mb_init_cache(page, NULL); |
926 | if (ret) { | ||
927 | unlock_page(page); | ||
928 | goto err; | ||
929 | } | ||
918 | mb_cmp_bitmaps(e4b, page_address(page) + | 930 | mb_cmp_bitmaps(e4b, page_address(page) + |
919 | (poff * sb->s_blocksize)); | 931 | (poff * sb->s_blocksize)); |
920 | } | 932 | } |
921 | unlock_page(page); | 933 | unlock_page(page); |
922 | } | 934 | } |
923 | } | 935 | } |
924 | if (page == NULL || !PageUptodate(page)) | 936 | if (page == NULL || !PageUptodate(page)) { |
937 | ret = -EIO; | ||
925 | goto err; | 938 | goto err; |
939 | } | ||
926 | e4b->bd_bitmap_page = page; | 940 | e4b->bd_bitmap_page = page; |
927 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); | 941 | e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); |
928 | mark_page_accessed(page); | 942 | mark_page_accessed(page); |
@@ -938,14 +952,20 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
938 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | 952 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); |
939 | if (page) { | 953 | if (page) { |
940 | BUG_ON(page->mapping != inode->i_mapping); | 954 | BUG_ON(page->mapping != inode->i_mapping); |
941 | if (!PageUptodate(page)) | 955 | if (!PageUptodate(page)) { |
942 | ext4_mb_init_cache(page, e4b->bd_bitmap); | 956 | ret = ext4_mb_init_cache(page, e4b->bd_bitmap); |
943 | 957 | if (ret) { | |
958 | unlock_page(page); | ||
959 | goto err; | ||
960 | } | ||
961 | } | ||
944 | unlock_page(page); | 962 | unlock_page(page); |
945 | } | 963 | } |
946 | } | 964 | } |
947 | if (page == NULL || !PageUptodate(page)) | 965 | if (page == NULL || !PageUptodate(page)) { |
966 | ret = -EIO; | ||
948 | goto err; | 967 | goto err; |
968 | } | ||
949 | e4b->bd_buddy_page = page; | 969 | e4b->bd_buddy_page = page; |
950 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); | 970 | e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); |
951 | mark_page_accessed(page); | 971 | mark_page_accessed(page); |
@@ -962,7 +982,7 @@ err: | |||
962 | page_cache_release(e4b->bd_buddy_page); | 982 | page_cache_release(e4b->bd_buddy_page); |
963 | e4b->bd_buddy = NULL; | 983 | e4b->bd_buddy = NULL; |
964 | e4b->bd_bitmap = NULL; | 984 | e4b->bd_bitmap = NULL; |
965 | return -EIO; | 985 | return ret; |
966 | } | 986 | } |
967 | 987 | ||
968 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) | 988 | static void ext4_mb_release_desc(struct ext4_buddy *e4b) |
@@ -1031,7 +1051,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) | |||
1031 | } | 1051 | } |
1032 | } | 1052 | } |
1033 | 1053 | ||
1034 | static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | 1054 | static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, |
1035 | int first, int count) | 1055 | int first, int count) |
1036 | { | 1056 | { |
1037 | int block = 0; | 1057 | int block = 0; |
@@ -1071,11 +1091,12 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1071 | blocknr += block; | 1091 | blocknr += block; |
1072 | blocknr += | 1092 | blocknr += |
1073 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); | 1093 | le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); |
1074 | 1094 | ext4_unlock_group(sb, e4b->bd_group); | |
1075 | ext4_error(sb, __func__, "double-free of inode" | 1095 | ext4_error(sb, __func__, "double-free of inode" |
1076 | " %lu's block %llu(bit %u in group %lu)\n", | 1096 | " %lu's block %llu(bit %u in group %lu)\n", |
1077 | inode ? inode->i_ino : 0, blocknr, block, | 1097 | inode ? inode->i_ino : 0, blocknr, block, |
1078 | e4b->bd_group); | 1098 | e4b->bd_group); |
1099 | ext4_lock_group(sb, e4b->bd_group); | ||
1079 | } | 1100 | } |
1080 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); | 1101 | mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); |
1081 | e4b->bd_info->bb_counters[order]++; | 1102 | e4b->bd_info->bb_counters[order]++; |
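The unlock/relock added around ext4_error() above follows a common kernel pattern: the group is protected by a spinlock, and ext4_error() may block (it can write the superblock or remount read-only, depending on the errors= mount option), so the lock is dropped for the call and re-taken before the loop continues. A userspace analogue with a pthread mutex (illustrative only):

        #include <pthread.h>
        #include <stdio.h>

        static pthread_mutex_t group_lock = PTHREAD_MUTEX_INITIALIZER;

        static void report_double_free(unsigned long block)
        {
                /* stands in for ext4_error(), which may block */
                fprintf(stderr, "double-free of block %lu\n", block);
        }

        static void free_range(const int *bitmap, int first, int count)
        {
                int i;

                pthread_mutex_lock(&group_lock);
                for (i = first; i < first + count; i++) {
                        if (!bitmap[i]) {
                                /* drop the lock around the potentially
                                 * blocking error report, then retake it */
                                pthread_mutex_unlock(&group_lock);
                                report_double_free(i);
                                pthread_mutex_lock(&group_lock);
                        }
                        /* ... clear the bit, update counters ... */
                }
                pthread_mutex_unlock(&group_lock);
        }

        int main(void)
        {
                int bitmap[8] = { 1, 1, 0, 1, 1, 1, 1, 1 };

                free_range(bitmap, 0, 8);
                return 0;
        }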
@@ -1113,8 +1134,6 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, | |||
1113 | } while (1); | 1134 | } while (1); |
1114 | } | 1135 | } |
1115 | mb_check_buddy(e4b); | 1136 | mb_check_buddy(e4b); |
1116 | |||
1117 | return 0; | ||
1118 | } | 1137 | } |
1119 | 1138 | ||
1120 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, | 1139 | static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, |
@@ -1730,10 +1749,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
1730 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | 1749 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; |
1731 | spin_unlock(&sbi->s_md_lock); | 1750 | spin_unlock(&sbi->s_md_lock); |
1732 | } | 1751 | } |
1733 | |||
1734 | /* searching for the right group start from the goal value specified */ | ||
1735 | group = ac->ac_g_ex.fe_group; | ||
1736 | |||
1737 | /* Let's just scan groups to find more or less suitable blocks */ | 1752 |
1738 | cr = ac->ac_2order ? 0 : 1; | 1753 | cr = ac->ac_2order ? 0 : 1; |
1739 | /* | 1754 | /* |
@@ -1743,6 +1758,12 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
1743 | repeat: | 1758 | repeat: |
1744 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { | 1759 | for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) { |
1745 | ac->ac_criteria = cr; | 1760 | ac->ac_criteria = cr; |
1761 | /* | ||
1762 | * Start searching for the right group | ||
1763 | * from the specified goal value | ||
1764 | */ | ||
1765 | group = ac->ac_g_ex.fe_group; | ||
1766 | |||
1746 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { | 1767 | for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { |
1747 | struct ext4_group_info *grp; | 1768 | struct ext4_group_info *grp; |
1748 | struct ext4_group_desc *desc; | 1769 | struct ext4_group_desc *desc; |
@@ -1963,6 +1984,8 @@ static int ext4_mb_seq_history_open(struct inode *inode, struct file *file) | |||
1963 | int rc; | 1984 | int rc; |
1964 | int size; | 1985 | int size; |
1965 | 1986 | ||
1987 | if (unlikely(sbi->s_mb_history == NULL)) | ||
1988 | return -ENOMEM; | ||
1966 | s = kmalloc(sizeof(*s), GFP_KERNEL); | 1989 | s = kmalloc(sizeof(*s), GFP_KERNEL); |
1967 | if (s == NULL) | 1990 | if (s == NULL) |
1968 | return -ENOMEM; | 1991 | return -ENOMEM; |
@@ -2165,9 +2188,7 @@ static void ext4_mb_history_init(struct super_block *sb) | |||
2165 | sbi->s_mb_history_cur = 0; | 2188 | sbi->s_mb_history_cur = 0; |
2166 | spin_lock_init(&sbi->s_mb_history_lock); | 2189 | spin_lock_init(&sbi->s_mb_history_lock); |
2167 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); | 2190 | i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); |
2168 | sbi->s_mb_history = kmalloc(i, GFP_KERNEL); | 2191 | sbi->s_mb_history = kzalloc(i, GFP_KERNEL); |
2169 | if (likely(sbi->s_mb_history != NULL)) | ||
2170 | memset(sbi->s_mb_history, 0, i); | ||
2171 | /* if we can't allocate history, then we simply won't use it */ | 2192 |
2172 | } | 2193 | } |
2173 | 2194 | ||
@@ -2215,21 +2236,192 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) | |||
2215 | #define ext4_mb_history_init(sb) | 2236 | #define ext4_mb_history_init(sb) |
2216 | #endif | 2237 | #endif |
2217 | 2238 | ||
2239 | |||
2240 | /* Create and initialize ext4_group_info data for the given group. */ | ||
2241 | int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | ||
2242 | struct ext4_group_desc *desc) | ||
2243 | { | ||
2244 | int i, len; | ||
2245 | int metalen = 0; | ||
2246 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2247 | struct ext4_group_info **meta_group_info; | ||
2248 | |||
2249 | /* | ||
2250 | * First check if this group is the first group of a descriptor block. | ||
2251 | * If so, we have to allocate a new table of pointers | ||
2252 | * to ext4_group_info structures. | ||
2253 | */ | ||
2254 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { | ||
2255 | metalen = sizeof(*meta_group_info) << | ||
2256 | EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2257 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
2258 | if (meta_group_info == NULL) { | ||
2259 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
2260 | "buddy group\n"); | ||
2261 | goto exit_meta_group_info; | ||
2262 | } | ||
2263 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)] = | ||
2264 | meta_group_info; | ||
2265 | } | ||
2266 | |||
2267 | /* | ||
2268 | * Calculate the needed size. If the bb_counters size changes, | ||
2269 | * don't forget to update ext4_mb_generate_buddy() | ||
2270 | */ | ||
2271 | len = offsetof(typeof(**meta_group_info), | ||
2272 | bb_counters[sb->s_blocksize_bits + 2]); | ||
2273 | |||
2274 | meta_group_info = | ||
2275 | sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
2276 | i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
2277 | |||
2278 | meta_group_info[i] = kzalloc(len, GFP_KERNEL); | ||
2279 | if (meta_group_info[i] == NULL) { | ||
2280 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
2281 | goto exit_group_info; | ||
2282 | } | ||
2283 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | ||
2284 | &(meta_group_info[i]->bb_state)); | ||
2285 | |||
2286 | /* | ||
2287 | * initialize bb_free to be able to skip | ||
2288 | * empty groups without initialization | ||
2289 | */ | ||
2290 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
2291 | meta_group_info[i]->bb_free = | ||
2292 | ext4_free_blocks_after_init(sb, group, desc); | ||
2293 | } else { | ||
2294 | meta_group_info[i]->bb_free = | ||
2295 | le16_to_cpu(desc->bg_free_blocks_count); | ||
2296 | } | ||
2297 | |||
2298 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | ||
2299 | |||
2300 | #ifdef DOUBLE_CHECK | ||
2301 | { | ||
2302 | struct buffer_head *bh; | ||
2303 | meta_group_info[i]->bb_bitmap = | ||
2304 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
2305 | BUG_ON(meta_group_info[i]->bb_bitmap == NULL); | ||
2306 | bh = ext4_read_block_bitmap(sb, group); | ||
2307 | BUG_ON(bh == NULL); | ||
2308 | memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, | ||
2309 | sb->s_blocksize); | ||
2310 | put_bh(bh); | ||
2311 | } | ||
2312 | #endif | ||
2313 | |||
2314 | return 0; | ||
2315 | |||
2316 | exit_group_info: | ||
2317 | /* If a meta_group_info table has been allocated, release it now */ | ||
2318 | if (group % EXT4_DESC_PER_BLOCK(sb) == 0) | ||
2319 | kfree(sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]); | ||
2320 | exit_meta_group_info: | ||
2321 | return -ENOMEM; | ||
2322 | } /* ext4_mb_add_groupinfo */ | ||
2323 | |||
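The indexing used throughout ext4_mb_add_groupinfo() is a two-level scheme: a group number selects an outer pointer table in s_group_info (one table per descriptor block) and an inner slot within it. A small sketch of the arithmetic, assuming 4KiB blocks and 32-byte descriptors so that EXT4_DESC_PER_BLOCK is 128:

        #include <stdio.h>

        #define DESC_PER_BLOCK_BITS 7                   /* assumes 4KiB blocks */
        #define DESC_PER_BLOCK (1 << DESC_PER_BLOCK_BITS)

        int main(void)
        {
                unsigned long group = 300;
                unsigned long outer = group >> DESC_PER_BLOCK_BITS;  /* 2 */
                unsigned long inner = group & (DESC_PER_BLOCK - 1);  /* 44 */

                /* group 300 lives in pointer table 2, slot 44 */
                printf("outer=%lu inner=%lu\n", outer, inner);
                return 0;
        }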
2324 | /* | ||
2325 | * Add a group to the existing groups. | ||
2326 | * This function is used for online resize | ||
2327 | */ | ||
2328 | int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, | ||
2329 | struct ext4_group_desc *desc) | ||
2330 | { | ||
2331 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
2332 | struct inode *inode = sbi->s_buddy_cache; | ||
2333 | int blocks_per_page; | ||
2334 | int block; | ||
2335 | int pnum; | ||
2336 | struct page *page; | ||
2337 | int err; | ||
2338 | |||
2339 | /* Add group based on group descriptor */ | ||
2340 | err = ext4_mb_add_groupinfo(sb, group, desc); | ||
2341 | if (err) | ||
2342 | return err; | ||
2343 | |||
2344 | /* | ||
2345 | * Mark the cache pages containing dynamic mb_alloc data (buddy | ||
2346 | * and bitmap) as not up to date so that they will be re-initialized | ||
2347 | * during the next call to ext4_mb_load_buddy | ||
2348 | */ | ||
2349 | |||
2350 | /* Set buddy page as not up to date */ | ||
2351 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
2352 | block = group * 2; | ||
2353 | pnum = block / blocks_per_page; | ||
2354 | page = find_get_page(inode->i_mapping, pnum); | ||
2355 | if (page != NULL) { | ||
2356 | ClearPageUptodate(page); | ||
2357 | page_cache_release(page); | ||
2358 | } | ||
2359 | |||
2360 | /* Set bitmap page as not up to date */ | ||
2361 | block++; | ||
2362 | pnum = block / blocks_per_page; | ||
2363 | page = find_get_page(inode->i_mapping, pnum); | ||
2364 | if (page != NULL) { | ||
2365 | ClearPageUptodate(page); | ||
2366 | page_cache_release(page); | ||
2367 | } | ||
2368 | |||
2369 | return 0; | ||
2370 | } | ||
2371 | |||
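The page invalidation above depends on the buddy-cache layout: logical block 2*g of s_buddy_cache holds group g's buddy data, block 2*g+1 holds its bitmap, and pnum maps a block to the page containing it. A sketch of the mapping (assumes 4KiB pages and 1KiB filesystem blocks for the example):

        #include <stdio.h>

        #define PAGE_SIZE 4096
        #define BLOCKSIZE 1024          /* example values only */

        int main(void)
        {
                int blocks_per_page = PAGE_SIZE / BLOCKSIZE;    /* 4 */
                unsigned long group = 5;
                int buddy_block = group * 2;                    /* 10 */
                int bitmap_block = group * 2 + 1;               /* 11 */

                /* both land in page 2 of s_buddy_cache here */
                printf("buddy page %d, bitmap page %d\n",
                       buddy_block / blocks_per_page,
                       bitmap_block / blocks_per_page);
                return 0;
        }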
2372 | /* | ||
2373 | * Update an existing group. | ||
2374 | * This function is used for online resize | ||
2375 | */ | ||
2376 | void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) | ||
2377 | { | ||
2378 | grp->bb_free += add; | ||
2379 | } | ||
2380 | |||
2218 | static int ext4_mb_init_backend(struct super_block *sb) | 2381 | static int ext4_mb_init_backend(struct super_block *sb) |
2219 | { | 2382 | { |
2220 | ext4_group_t i; | 2383 | ext4_group_t i; |
2221 | int j, len, metalen; | 2384 | int metalen; |
2222 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2385 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2223 | int num_meta_group_infos = | 2386 | struct ext4_super_block *es = sbi->s_es; |
2224 | (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) >> | 2387 | int num_meta_group_infos; |
2225 | EXT4_DESC_PER_BLOCK_BITS(sb); | 2388 | int num_meta_group_infos_max; |
2389 | int array_size; | ||
2226 | struct ext4_group_info **meta_group_info; | 2390 | struct ext4_group_info **meta_group_info; |
2391 | struct ext4_group_desc *desc; | ||
2392 | |||
2393 | /* This is the number of blocks used by GDT */ | ||
2394 | num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - | ||
2395 | 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2396 | |||
2397 | /* | ||
2398 | * This is the total number of blocks used by the GDT, including | ||
2399 | * the blocks reserved for GDT growth. | ||
2400 | * The s_group_info array is allocated with this value | ||
2401 | * to allow a clean online resize without complex | ||
2402 | * pointer manipulation. | ||
2403 | * The drawback is the unused memory when no resize | ||
2404 | * occurs, but it's very low in terms of pages | ||
2405 | * (see the comments below). | ||
2406 | * This needs to be handled properly when META_BG resizing is allowed. | ||
2407 | */ | ||
2408 | num_meta_group_infos_max = num_meta_group_infos + | ||
2409 | le16_to_cpu(es->s_reserved_gdt_blocks); | ||
2227 | 2410 | ||
2411 | /* | ||
2412 | * array_size is the size of the s_group_info array. We round it | ||
2413 | * up to the next power of two because kmalloc does this rounding | ||
2414 | * internally anyway, so we get some extra memory for free here | ||
2415 | * (e.g. it may be used for a META_BG resize). | ||
2416 | */ | ||
2417 | array_size = 1; | ||
2418 | while (array_size < sizeof(*sbi->s_group_info) * | ||
2419 | num_meta_group_infos_max) | ||
2420 | array_size = array_size << 1; | ||
2228 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte | 2421 | /* An 8TB filesystem with 64-bit pointers requires a 4096 byte |
2229 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. | 2422 | * kmalloc. A 128kb malloc should suffice for a 256TB filesystem. |
2230 | * So a two level scheme suffices for now. */ | 2423 | * So a two level scheme suffices for now. */ |
2231 | sbi->s_group_info = kmalloc(sizeof(*sbi->s_group_info) * | 2424 | sbi->s_group_info = kmalloc(array_size, GFP_KERNEL); |
2232 | num_meta_group_infos, GFP_KERNEL); | ||
2233 | if (sbi->s_group_info == NULL) { | 2425 | if (sbi->s_group_info == NULL) { |
2234 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); | 2426 | printk(KERN_ERR "EXT4-fs: can't allocate buddy meta group\n"); |
2235 | return -ENOMEM; | 2427 | return -ENOMEM; |
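On the array_size rounding above: kmalloc serves requests from power-of-two size buckets, so rounding the request up explicitly costs nothing extra while leaving slack for groups added by a later online resize. A sketch of the same loop shape in isolation:

        #include <stdio.h>

        static unsigned long round_up_pow2(unsigned long n)
        {
                unsigned long size = 1;

                while (size < n)
                        size <<= 1;     /* same loop shape as the patch */
                return size;
        }

        int main(void)
        {
                /* e.g. 9 group-info tables x 8-byte pointers = 72 -> 128 bytes */
                printf("%lu\n", round_up_pow2(72));
                return 0;
        }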
@@ -2256,63 +2448,15 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2256 | sbi->s_group_info[i] = meta_group_info; | 2448 | sbi->s_group_info[i] = meta_group_info; |
2257 | } | 2449 | } |
2258 | 2450 | ||
2259 | /* | ||
2260 | * calculate needed size. if change bb_counters size, | ||
2261 | * don't forget about ext4_mb_generate_buddy() | ||
2262 | */ | ||
2263 | len = sizeof(struct ext4_group_info); | ||
2264 | len += sizeof(unsigned short) * (sb->s_blocksize_bits + 2); | ||
2265 | for (i = 0; i < sbi->s_groups_count; i++) { | 2451 | for (i = 0; i < sbi->s_groups_count; i++) { |
2266 | struct ext4_group_desc *desc; | ||
2267 | |||
2268 | meta_group_info = | ||
2269 | sbi->s_group_info[i >> EXT4_DESC_PER_BLOCK_BITS(sb)]; | ||
2270 | j = i & (EXT4_DESC_PER_BLOCK(sb) - 1); | ||
2271 | |||
2272 | meta_group_info[j] = kzalloc(len, GFP_KERNEL); | ||
2273 | if (meta_group_info[j] == NULL) { | ||
2274 | printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); | ||
2275 | goto err_freebuddy; | ||
2276 | } | ||
2277 | desc = ext4_get_group_desc(sb, i, NULL); | 2452 | desc = ext4_get_group_desc(sb, i, NULL); |
2278 | if (desc == NULL) { | 2453 | if (desc == NULL) { |
2279 | printk(KERN_ERR | 2454 | printk(KERN_ERR |
2280 | "EXT4-fs: can't read descriptor %lu\n", i); | 2455 | "EXT4-fs: can't read descriptor %lu\n", i); |
2281 | i++; | ||
2282 | goto err_freebuddy; | 2456 | goto err_freebuddy; |
2283 | } | 2457 | } |
2284 | memset(meta_group_info[j], 0, len); | 2458 | if (ext4_mb_add_groupinfo(sb, i, desc) != 0) |
2285 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, | 2459 | goto err_freebuddy; |
2286 | &(meta_group_info[j]->bb_state)); | ||
2287 | |||
2288 | /* | ||
2289 | * initialize bb_free to be able to skip | ||
2290 | * empty groups without initialization | ||
2291 | */ | ||
2292 | if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { | ||
2293 | meta_group_info[j]->bb_free = | ||
2294 | ext4_free_blocks_after_init(sb, i, desc); | ||
2295 | } else { | ||
2296 | meta_group_info[j]->bb_free = | ||
2297 | le16_to_cpu(desc->bg_free_blocks_count); | ||
2298 | } | ||
2299 | |||
2300 | INIT_LIST_HEAD(&meta_group_info[j]->bb_prealloc_list); | ||
2301 | |||
2302 | #ifdef DOUBLE_CHECK | ||
2303 | { | ||
2304 | struct buffer_head *bh; | ||
2305 | meta_group_info[j]->bb_bitmap = | ||
2306 | kmalloc(sb->s_blocksize, GFP_KERNEL); | ||
2307 | BUG_ON(meta_group_info[j]->bb_bitmap == NULL); | ||
2308 | bh = read_block_bitmap(sb, i); | ||
2309 | BUG_ON(bh == NULL); | ||
2310 | memcpy(meta_group_info[j]->bb_bitmap, bh->b_data, | ||
2311 | sb->s_blocksize); | ||
2312 | put_bh(bh); | ||
2313 | } | ||
2314 | #endif | ||
2315 | |||
2316 | } | 2460 | } |
2317 | 2461 | ||
2318 | return 0; | 2462 | return 0; |
@@ -2336,6 +2480,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2336 | unsigned i; | 2480 | unsigned i; |
2337 | unsigned offset; | 2481 | unsigned offset; |
2338 | unsigned max; | 2482 | unsigned max; |
2483 | int ret; | ||
2339 | 2484 | ||
2340 | if (!test_opt(sb, MBALLOC)) | 2485 | if (!test_opt(sb, MBALLOC)) |
2341 | return 0; | 2486 | return 0; |
@@ -2370,12 +2515,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2370 | } while (i <= sb->s_blocksize_bits + 1); | 2515 | } while (i <= sb->s_blocksize_bits + 1); |
2371 | 2516 | ||
2372 | /* init file for buddy data */ | 2517 | /* init file for buddy data */ |
2373 | i = ext4_mb_init_backend(sb); | 2518 | ret = ext4_mb_init_backend(sb); |
2374 | if (i) { | 2519 | if (ret != 0) { |
2375 | clear_opt(sbi->s_mount_opt, MBALLOC); | 2520 | clear_opt(sbi->s_mount_opt, MBALLOC); |
2376 | kfree(sbi->s_mb_offsets); | 2521 | kfree(sbi->s_mb_offsets); |
2377 | kfree(sbi->s_mb_maxs); | 2522 | kfree(sbi->s_mb_maxs); |
2378 | return i; | 2523 | return ret; |
2379 | } | 2524 | } |
2380 | 2525 | ||
2381 | spin_lock_init(&sbi->s_md_lock); | 2526 | spin_lock_init(&sbi->s_md_lock); |
@@ -2548,8 +2693,7 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |||
2548 | ext4_lock_group(sb, md->group); | 2693 | ext4_lock_group(sb, md->group); |
2549 | for (i = 0; i < md->num; i++) { | 2694 | for (i = 0; i < md->num; i++) { |
2550 | mb_debug(" %u", md->blocks[i]); | 2695 | mb_debug(" %u", md->blocks[i]); |
2551 | err = mb_free_blocks(NULL, &e4b, md->blocks[i], 1); | 2696 | mb_free_blocks(NULL, &e4b, md->blocks[i], 1); |
2552 | BUG_ON(err != 0); | ||
2553 | } | 2697 | } |
2554 | mb_debug("\n"); | 2698 | mb_debug("\n"); |
2555 | ext4_unlock_group(sb, md->group); | 2699 | ext4_unlock_group(sb, md->group); |
@@ -2575,25 +2719,24 @@ ext4_mb_free_committed_blocks(struct super_block *sb) | |||
2575 | 2719 | ||
2576 | 2720 | ||
2577 | 2721 | ||
2578 | #define MB_PROC_VALUE_READ(name) \ | 2722 | #define MB_PROC_FOPS(name) \ |
2579 | static int ext4_mb_read_##name(char *page, char **start, \ | 2723 | static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ |
2580 | off_t off, int count, int *eof, void *data) \ | ||
2581 | { \ | 2724 | { \ |
2582 | struct ext4_sb_info *sbi = data; \ | 2725 | struct ext4_sb_info *sbi = m->private; \ |
2583 | int len; \ | 2726 | \ |
2584 | *eof = 1; \ | 2727 | seq_printf(m, "%ld\n", sbi->s_mb_##name); \ |
2585 | if (off != 0) \ | 2728 | return 0; \ |
2586 | return 0; \ | 2729 | } \ |
2587 | len = sprintf(page, "%ld\n", sbi->s_mb_##name); \ | 2730 | \ |
2588 | *start = page; \ | 2731 | static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ |
2589 | return len; \ | 2732 | { \ |
2590 | } | 2733 | return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ |
2591 | 2734 | } \ | |
2592 | #define MB_PROC_VALUE_WRITE(name) \ | 2735 | \ |
2593 | static int ext4_mb_write_##name(struct file *file, \ | 2736 | static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ |
2594 | const char __user *buf, unsigned long cnt, void *data) \ | 2737 | const char __user *buf, size_t cnt, loff_t *ppos) \ |
2595 | { \ | 2738 | { \ |
2596 | struct ext4_sb_info *sbi = data; \ | 2739 | struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ |
2597 | char str[32]; \ | 2740 | char str[32]; \ |
2598 | long value; \ | 2741 | long value; \ |
2599 | if (cnt >= sizeof(str)) \ | 2742 | if (cnt >= sizeof(str)) \ |
@@ -2605,31 +2748,32 @@ static int ext4_mb_write_##name(struct file *file, \ | |||
2605 | return -ERANGE; \ | 2748 | return -ERANGE; \ |
2606 | sbi->s_mb_##name = value; \ | 2749 | sbi->s_mb_##name = value; \ |
2607 | return cnt; \ | 2750 | return cnt; \ |
2608 | } | 2751 | } \ |
2752 | \ | ||
2753 | static const struct file_operations ext4_mb_##name##_proc_fops = { \ | ||
2754 | .owner = THIS_MODULE, \ | ||
2755 | .open = ext4_mb_##name##_proc_open, \ | ||
2756 | .read = seq_read, \ | ||
2757 | .llseek = seq_lseek, \ | ||
2758 | .release = single_release, \ | ||
2759 | .write = ext4_mb_##name##_proc_write, \ | ||
2760 | }; | ||
2609 | 2761 | ||
2610 | MB_PROC_VALUE_READ(stats); | 2762 | MB_PROC_FOPS(stats); |
2611 | MB_PROC_VALUE_WRITE(stats); | 2763 | MB_PROC_FOPS(max_to_scan); |
2612 | MB_PROC_VALUE_READ(max_to_scan); | 2764 | MB_PROC_FOPS(min_to_scan); |
2613 | MB_PROC_VALUE_WRITE(max_to_scan); | 2765 | MB_PROC_FOPS(order2_reqs); |
2614 | MB_PROC_VALUE_READ(min_to_scan); | 2766 | MB_PROC_FOPS(stream_request); |
2615 | MB_PROC_VALUE_WRITE(min_to_scan); | 2767 | MB_PROC_FOPS(group_prealloc); |
2616 | MB_PROC_VALUE_READ(order2_reqs); | ||
2617 | MB_PROC_VALUE_WRITE(order2_reqs); | ||
2618 | MB_PROC_VALUE_READ(stream_request); | ||
2619 | MB_PROC_VALUE_WRITE(stream_request); | ||
2620 | MB_PROC_VALUE_READ(group_prealloc); | ||
2621 | MB_PROC_VALUE_WRITE(group_prealloc); | ||
2622 | 2768 | ||
2623 | #define MB_PROC_HANDLER(name, var) \ | 2769 | #define MB_PROC_HANDLER(name, var) \ |
2624 | do { \ | 2770 | do { \ |
2625 | proc = create_proc_entry(name, mode, sbi->s_mb_proc); \ | 2771 | proc = proc_create_data(name, mode, sbi->s_mb_proc, \ |
2772 | &ext4_mb_##var##_proc_fops, sbi); \ | ||
2626 | if (proc == NULL) { \ | 2773 | if (proc == NULL) { \ |
2627 | printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ | 2774 |
2628 | goto err_out; \ | 2775 | goto err_out; \ |
2629 | } \ | 2776 | } \ |
2630 | proc->data = sbi; \ | ||
2631 | proc->read_proc = ext4_mb_read_##var ; \ | ||
2632 | proc->write_proc = ext4_mb_write_##var; \ | ||
2633 | } while (0) | 2777 | } while (0) |
2634 | 2778 | ||
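For readers unfamiliar with the conversion above: the removed read_proc/write_proc hooks are replaced by the seq_file single_open() pattern, with the sbi pointer carried through proc_create_data(). A hand expansion of what MB_PROC_FOPS(name) generates for one hypothetical tunable named "demo" (kernel context; needs <linux/proc_fs.h> and <linux/seq_file.h>):

        /* hand expansion of MB_PROC_FOPS(demo); s_mb_demo is hypothetical */
        static int ext4_mb_demo_proc_show(struct seq_file *m, void *v)
        {
                struct ext4_sb_info *sbi = m->private;  /* set by single_open() */

                seq_printf(m, "%ld\n", sbi->s_mb_demo);
                return 0;
        }

        static int ext4_mb_demo_proc_open(struct inode *inode, struct file *file)
        {
                /* PDE(inode)->data is the sbi passed to proc_create_data() */
                return single_open(file, ext4_mb_demo_proc_show, PDE(inode)->data);
        }

        static const struct file_operations ext4_mb_demo_proc_fops = {
                .owner   = THIS_MODULE,
                .open    = ext4_mb_demo_proc_open,
                .read    = seq_read,
                .llseek  = seq_lseek,
                .release = single_release,
        };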
2635 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) | 2779 | static int ext4_mb_init_per_dev_proc(struct super_block *sb) |
@@ -2639,6 +2783,10 @@ static int ext4_mb_init_per_dev_proc(struct super_block *sb) | |||
2639 | struct proc_dir_entry *proc; | 2783 | struct proc_dir_entry *proc; |
2640 | char devname[64]; | 2784 | char devname[64]; |
2641 | 2785 | ||
2786 | if (proc_root_ext4 == NULL) { | ||
2787 | sbi->s_mb_proc = NULL; | ||
2788 | return -EINVAL; | ||
2789 | } | ||
2642 | bdevname(sb->s_bdev, devname); | 2790 | bdevname(sb->s_bdev, devname); |
2643 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); | 2791 | sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); |
2644 | 2792 | ||
@@ -2747,7 +2895,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2747 | 2895 | ||
2748 | 2896 | ||
2749 | err = -EIO; | 2897 | err = -EIO; |
2750 | bitmap_bh = read_block_bitmap(sb, ac->ac_b_ex.fe_group); | 2898 | bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); |
2751 | if (!bitmap_bh) | 2899 | if (!bitmap_bh) |
2752 | goto out_err; | 2900 | goto out_err; |
2753 | 2901 | ||
@@ -2816,7 +2964,23 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, | |||
2816 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); | 2964 | le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); |
2817 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); | 2965 | gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); |
2818 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); | 2966 | spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); |
2819 | percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); | 2967 | |
2968 | /* | ||
2969 | * The free blocks count has already been reduced/reserved | ||
2970 | * at write_begin() time for delayed allocation; | ||
2971 | * do not account for it twice. | ||
2972 | */ | ||
2973 | if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) | ||
2974 | percpu_counter_sub(&sbi->s_freeblocks_counter, | ||
2975 | ac->ac_b_ex.fe_len); | ||
2976 | |||
2977 | if (sbi->s_log_groups_per_flex) { | ||
2978 | ext4_group_t flex_group = ext4_flex_group(sbi, | ||
2979 | ac->ac_b_ex.fe_group); | ||
2980 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
2981 | sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len; | ||
2982 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
2983 | } | ||
2820 | 2984 | ||
2821 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); | 2985 | err = ext4_journal_dirty_metadata(handle, bitmap_bh); |
2822 | if (err) | 2986 | if (err) |
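The EXT4_MB_DELALLOC_RESERVED check above encodes a single-decrement invariant: the global free-blocks counter is charged exactly once per allocated block, either at write_begin() reservation time (delalloc) or here at allocation time, never both. A toy model of the rule (not kernel code):

        #include <stdio.h>

        static void account_blocks(long *free_blocks, long len,
                                   int delalloc_reserved)
        {
                if (!delalloc_reserved)
                        *free_blocks -= len;    /* otherwise charged earlier */
        }

        int main(void)
        {
                long free_blocks = 1000;

                account_blocks(&free_blocks, 8, 1);     /* delalloc: no-op here */
                account_blocks(&free_blocks, 8, 0);     /* non-delalloc: -8 */
                printf("%ld\n", free_blocks);           /* 992 */
                return 0;
        }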
@@ -3473,8 +3637,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3473 | if (bit >= end) | 3637 | if (bit >= end) |
3474 | break; | 3638 | break; |
3475 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); | 3639 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); |
3476 | if (next > end) | ||
3477 | next = end; | ||
3478 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | 3640 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + |
3479 | le32_to_cpu(sbi->s_es->s_first_data_block); | 3641 | le32_to_cpu(sbi->s_es->s_first_data_block); |
3480 | mb_debug(" free preallocated %u/%u in group %u\n", | 3642 | mb_debug(" free preallocated %u/%u in group %u\n", |
@@ -3569,7 +3731,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
3569 | if (list_empty(&grp->bb_prealloc_list)) | 3731 | if (list_empty(&grp->bb_prealloc_list)) |
3570 | return 0; | 3732 | return 0; |
3571 | 3733 | ||
3572 | bitmap_bh = read_block_bitmap(sb, group); | 3734 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
3573 | if (bitmap_bh == NULL) { | 3735 | if (bitmap_bh == NULL) { |
3574 | /* error handling here */ | 3736 | /* error handling here */ |
3575 | ext4_mb_release_desc(&e4b); | 3737 | ext4_mb_release_desc(&e4b); |
@@ -3743,7 +3905,7 @@ repeat: | |||
3743 | err = ext4_mb_load_buddy(sb, group, &e4b); | 3905 | err = ext4_mb_load_buddy(sb, group, &e4b); |
3744 | BUG_ON(err != 0); /* error handling here */ | 3906 | BUG_ON(err != 0); /* error handling here */ |
3745 | 3907 | ||
3746 | bitmap_bh = read_block_bitmap(sb, group); | 3908 | bitmap_bh = ext4_read_block_bitmap(sb, group); |
3747 | if (bitmap_bh == NULL) { | 3909 | if (bitmap_bh == NULL) { |
3748 | /* error handling here */ | 3910 | /* error handling here */ |
3749 | ext4_mb_release_desc(&e4b); | 3911 | ext4_mb_release_desc(&e4b); |
@@ -4011,10 +4173,21 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4011 | sbi = EXT4_SB(sb); | 4173 | sbi = EXT4_SB(sb); |
4012 | 4174 | ||
4013 | if (!test_opt(sb, MBALLOC)) { | 4175 | if (!test_opt(sb, MBALLOC)) { |
4014 | block = ext4_new_blocks_old(handle, ar->inode, ar->goal, | 4176 | block = ext4_old_new_blocks(handle, ar->inode, ar->goal, |
4015 | &(ar->len), errp); | 4177 | &(ar->len), errp); |
4016 | return block; | 4178 | return block; |
4017 | } | 4179 | } |
4180 | if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { | ||
4181 | /* | ||
4182 | * With delalloc we already reserved the blocks | ||
4183 | */ | ||
4184 | ar->len = ext4_has_free_blocks(sbi, ar->len); | ||
4185 | } | ||
4186 | |||
4187 | if (ar->len == 0) { | ||
4188 | *errp = -ENOSPC; | ||
4189 | return 0; | ||
4190 | } | ||
4018 | 4191 | ||
4019 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { | 4192 | while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { |
4020 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; | 4193 | ar->flags |= EXT4_MB_HINT_NOPREALLOC; |
@@ -4026,10 +4199,14 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4026 | } | 4199 | } |
4027 | inquota = ar->len; | 4200 | inquota = ar->len; |
4028 | 4201 | ||
4202 | if (EXT4_I(ar->inode)->i_delalloc_reserved_flag) | ||
4203 | ar->flags |= EXT4_MB_DELALLOC_RESERVED; | ||
4204 | |||
4029 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | 4205 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
4030 | if (!ac) { | 4206 | if (!ac) { |
4207 | ar->len = 0; | ||
4031 | *errp = -ENOMEM; | 4208 | *errp = -ENOMEM; |
4032 | return 0; | 4209 | goto out1; |
4033 | } | 4210 | } |
4034 | 4211 | ||
4035 | ext4_mb_poll_new_transaction(sb, handle); | 4212 | ext4_mb_poll_new_transaction(sb, handle); |
@@ -4037,12 +4214,11 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, | |||
4037 | *errp = ext4_mb_initialize_context(ac, ar); | 4214 | *errp = ext4_mb_initialize_context(ac, ar); |
4038 | if (*errp) { | 4215 | if (*errp) { |
4039 | ar->len = 0; | 4216 | ar->len = 0; |
4040 | goto out; | 4217 | goto out2; |
4041 | } | 4218 | } |
4042 | 4219 | ||
4043 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; | 4220 | ac->ac_op = EXT4_MB_HISTORY_PREALLOC; |
4044 | if (!ext4_mb_use_preallocated(ac)) { | 4221 | if (!ext4_mb_use_preallocated(ac)) { |
4045 | |||
4046 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; | 4222 | ac->ac_op = EXT4_MB_HISTORY_ALLOC; |
4047 | ext4_mb_normalize_request(ac, ar); | 4223 | ext4_mb_normalize_request(ac, ar); |
4048 | repeat: | 4224 | repeat: |
@@ -4085,11 +4261,12 @@ repeat: | |||
4085 | 4261 | ||
4086 | ext4_mb_release_context(ac); | 4262 | ext4_mb_release_context(ac); |
4087 | 4263 | ||
4088 | out: | 4264 | out2: |
4265 | kmem_cache_free(ext4_ac_cachep, ac); | ||
4266 | out1: | ||
4089 | if (ar->len < inquota) | 4267 | if (ar->len < inquota) |
4090 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); | 4268 | DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); |
4091 | 4269 | ||
4092 | kmem_cache_free(ext4_ac_cachep, ac); | ||
4093 | return block; | 4270 | return block; |
4094 | } | 4271 | } |
4095 | static void ext4_mb_poll_new_transaction(struct super_block *sb, | 4272 | static void ext4_mb_poll_new_transaction(struct super_block *sb, |
@@ -4242,7 +4419,7 @@ do_more: | |||
4242 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); | 4419 | overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); |
4243 | count -= overflow; | 4420 | count -= overflow; |
4244 | } | 4421 | } |
4245 | bitmap_bh = read_block_bitmap(sb, block_group); | 4422 | bitmap_bh = ext4_read_block_bitmap(sb, block_group); |
4246 | if (!bitmap_bh) | 4423 | if (!bitmap_bh) |
4247 | goto error_return; | 4424 | goto error_return; |
4248 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); | 4425 | gdp = ext4_get_group_desc(sb, block_group, &gd_bh); |
@@ -4309,10 +4486,9 @@ do_more: | |||
4309 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); | 4486 | ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); |
4310 | } else { | 4487 | } else { |
4311 | ext4_lock_group(sb, block_group); | 4488 | ext4_lock_group(sb, block_group); |
4312 | err = mb_free_blocks(inode, &e4b, bit, count); | 4489 | mb_free_blocks(inode, &e4b, bit, count); |
4313 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); | 4490 | ext4_mb_return_to_preallocation(inode, &e4b, block, count); |
4314 | ext4_unlock_group(sb, block_group); | 4491 | ext4_unlock_group(sb, block_group); |
4315 | BUG_ON(err != 0); | ||
4316 | } | 4492 | } |
4317 | 4493 | ||
4318 | spin_lock(sb_bgl_lock(sbi, block_group)); | 4494 | spin_lock(sb_bgl_lock(sbi, block_group)); |
@@ -4321,6 +4497,13 @@ do_more: | |||
4321 | spin_unlock(sb_bgl_lock(sbi, block_group)); | 4497 | spin_unlock(sb_bgl_lock(sbi, block_group)); |
4322 | percpu_counter_add(&sbi->s_freeblocks_counter, count); | 4498 | percpu_counter_add(&sbi->s_freeblocks_counter, count); |
4323 | 4499 | ||
4500 | if (sbi->s_log_groups_per_flex) { | ||
4501 | ext4_group_t flex_group = ext4_flex_group(sbi, block_group); | ||
4502 | spin_lock(sb_bgl_lock(sbi, flex_group)); | ||
4503 | sbi->s_flex_groups[flex_group].free_blocks += count; | ||
4504 | spin_unlock(sb_bgl_lock(sbi, flex_group)); | ||
4505 | } | ||
4506 | |||
4324 | ext4_mb_release_desc(&e4b); | 4507 | ext4_mb_release_desc(&e4b); |
4325 | 4508 | ||
4326 | *freed += count; | 4509 | *freed += count; |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index ab16beaa830d..387ad98350c3 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -183,6 +183,16 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
183 | struct inode *inode); | 183 | struct inode *inode); |
184 | 184 | ||
185 | /* | 185 | /* |
186 | * p is at least 6 bytes before the end of the page | ||
187 | */ | ||
188 | static inline struct ext4_dir_entry_2 * | ||
189 | ext4_next_entry(struct ext4_dir_entry_2 *p) | ||
190 | { | ||
191 | return (struct ext4_dir_entry_2 *)((char *)p + | ||
192 | ext4_rec_len_from_disk(p->rec_len)); | ||
193 | } | ||
194 | |||
195 | /* | ||
186 | * Future: use high four bits of block for coalesce-on-delete flags | 196 | * Future: use high four bits of block for coalesce-on-delete flags |
187 | * Mask them off for now. | 197 | * Mask them off for now. |
188 | */ | 198 | */ |
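ext4_next_entry(), moved up here so that earlier functions can use it, walks a directory block by rec_len chaining: each entry records the byte distance to the next one. A simplified standalone model of the walk (hypothetical entry layout, not the on-disk ext4 format):

        #include <stdint.h>
        #include <stdio.h>
        #include <string.h>

        struct dirent_sketch {          /* simplified stand-in */
                uint32_t inode;
                uint16_t rec_len;       /* byte offset to the next entry */
                uint8_t  name_len;
                char     name[9];
        };

        static struct dirent_sketch *next_entry(struct dirent_sketch *p)
        {
                return (struct dirent_sketch *)((char *)p + p->rec_len);
        }

        int main(void)
        {
                char block[64];
                struct dirent_sketch *de = (struct dirent_sketch *)block;

                memset(block, 0, sizeof(block));
                de->rec_len = 16;
                strcpy(de->name, "a");
                next_entry(de)->rec_len = 48;   /* last entry spans to block end */
                strcpy(next_entry(de)->name, "b");

                for (de = (struct dirent_sketch *)block;
                     (char *)de < block + sizeof(block); de = next_entry(de))
                        printf("%s at offset %ld\n", de->name,
                               (long)((char *)de - block));
                return 0;
        }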
@@ -231,13 +241,13 @@ static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) | |||
231 | { | 241 | { |
232 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - | 242 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - |
233 | EXT4_DIR_REC_LEN(2) - infosize; | 243 | EXT4_DIR_REC_LEN(2) - infosize; |
234 | return 0? 20: entry_space / sizeof(struct dx_entry); | 244 | return entry_space / sizeof(struct dx_entry); |
235 | } | 245 | } |
236 | 246 | ||
237 | static inline unsigned dx_node_limit (struct inode *dir) | 247 | static inline unsigned dx_node_limit (struct inode *dir) |
238 | { | 248 | { |
239 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); | 249 | unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); |
240 | return 0? 22: entry_space / sizeof(struct dx_entry); | 250 | return entry_space / sizeof(struct dx_entry); |
241 | } | 251 | } |
242 | 252 | ||
243 | /* | 253 | /* |
@@ -554,15 +564,6 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, | |||
554 | 564 | ||
555 | 565 | ||
556 | /* | 566 | /* |
557 | * p is at least 6 bytes before the end of page | ||
558 | */ | ||
559 | static inline struct ext4_dir_entry_2 *ext4_next_entry(struct ext4_dir_entry_2 *p) | ||
560 | { | ||
561 | return (struct ext4_dir_entry_2 *)((char *)p + | ||
562 | ext4_rec_len_from_disk(p->rec_len)); | ||
563 | } | ||
564 | |||
565 | /* | ||
566 | * This function fills a red-black tree with information from a | 567 | * This function fills a red-black tree with information from a |
567 | * directory block. It returns the number directory entries loaded | 568 | * directory block. It returns the number directory entries loaded |
568 | * into the tree. If there is an error it is returned in err. | 569 | * into the tree. If there is an error it is returned in err. |
@@ -993,19 +994,21 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, | |||
993 | de = (struct ext4_dir_entry_2 *) bh->b_data; | 994 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
994 | top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - | 995 | top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize - |
995 | EXT4_DIR_REC_LEN(0)); | 996 | EXT4_DIR_REC_LEN(0)); |
996 | for (; de < top; de = ext4_next_entry(de)) | 997 | for (; de < top; de = ext4_next_entry(de)) { |
997 | if (ext4_match (namelen, name, de)) { | 998 | int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) |
998 | if (!ext4_check_dir_entry("ext4_find_entry", | 999 | + ((char *) de - bh->b_data); |
999 | dir, de, bh, | 1000 | |
1000 | (block<<EXT4_BLOCK_SIZE_BITS(sb)) | 1001 | if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { |
1001 | +((char *)de - bh->b_data))) { | 1002 | brelse(bh); |
1002 | brelse (bh); | ||
1003 | *err = ERR_BAD_DX_DIR; | 1003 | *err = ERR_BAD_DX_DIR; |
1004 | goto errout; | 1004 | goto errout; |
1005 | } | 1005 | } |
1006 | *res_dir = de; | 1006 | |
1007 | dx_release (frames); | 1007 | if (ext4_match(namelen, name, de)) { |
1008 | return bh; | 1008 | *res_dir = de; |
1009 | dx_release(frames); | ||
1010 | return bh; | ||
1011 | } | ||
1009 | } | 1012 | } |
1010 | brelse (bh); | 1013 | brelse (bh); |
1011 | /* Check to see if we should continue to search */ | 1014 | /* Check to see if we should continue to search */ |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 9ff7b1c04239..f000fbe2cd93 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -866,6 +866,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
866 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); | 866 | gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); |
867 | 867 | ||
868 | /* | 868 | /* |
869 | * We can allocate memory for mb_alloc based on the new group | ||
870 | * descriptor | ||
871 | */ | ||
872 | if (test_opt(sb, MBALLOC)) { | ||
873 | err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); | ||
874 | if (err) | ||
875 | goto exit_journal; | ||
876 | } | ||
877 | /* | ||
869 | * Make the new blocks and inodes valid next. We do this before | 878 | * Make the new blocks and inodes valid next. We do this before |
870 | * increasing the group count so that once the group is enabled, | 879 | * increasing the group count so that once the group is enabled, |
871 | * all of its blocks and inodes are already valid. | 880 | * all of its blocks and inodes are already valid. |
@@ -957,6 +966,8 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
957 | handle_t *handle; | 966 | handle_t *handle; |
958 | int err; | 967 | int err; |
959 | unsigned long freed_blocks; | 968 | unsigned long freed_blocks; |
969 | ext4_group_t group; | ||
970 | struct ext4_group_info *grp; | ||
960 | 971 | ||
961 | /* We don't need to worry about locking wrt other resizers just | 972 | /* We don't need to worry about locking wrt other resizers just |
962 | * yet: we're going to revalidate es->s_blocks_count after | 973 | * yet: we're going to revalidate es->s_blocks_count after |
@@ -988,7 +999,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
988 | } | 999 | } |
989 | 1000 | ||
990 | /* Handle the remaining blocks in the last group only. */ | 1001 | /* Handle the remaining blocks in the last group only. */ |
991 | ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); | 1002 | ext4_get_group_no_and_offset(sb, o_blocks_count, &group, &last); |
992 | 1003 | ||
993 | if (last == 0) { | 1004 | if (last == 0) { |
994 | ext4_warning(sb, __func__, | 1005 | ext4_warning(sb, __func__, |
@@ -1060,6 +1071,45 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, | |||
1060 | o_blocks_count + add); | 1071 | o_blocks_count + add); |
1061 | if ((err = ext4_journal_stop(handle))) | 1072 | if ((err = ext4_journal_stop(handle))) |
1062 | goto exit_put; | 1073 | goto exit_put; |
1074 | |||
1075 | /* | ||
1076 | * Mark mballoc pages as not up to date so that they will be updated | ||
1077 | * next time they are loaded by ext4_mb_load_buddy. | ||
1078 | */ | ||
1079 | if (test_opt(sb, MBALLOC)) { | ||
1080 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1081 | struct inode *inode = sbi->s_buddy_cache; | ||
1082 | int blocks_per_page; | ||
1083 | int block; | ||
1084 | int pnum; | ||
1085 | struct page *page; | ||
1086 | |||
1087 | /* Set buddy page as not up to date */ | ||
1088 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1089 | block = group * 2; | ||
1090 | pnum = block / blocks_per_page; | ||
1091 | page = find_get_page(inode->i_mapping, pnum); | ||
1092 | if (page != NULL) { | ||
1093 | ClearPageUptodate(page); | ||
1094 | page_cache_release(page); | ||
1095 | } | ||
1096 | |||
1097 | /* Set bitmap page as not up to date */ | ||
1098 | block++; | ||
1099 | pnum = block / blocks_per_page; | ||
1100 | page = find_get_page(inode->i_mapping, pnum); | ||
1101 | if (page != NULL) { | ||
1102 | ClearPageUptodate(page); | ||
1103 | page_cache_release(page); | ||
1104 | } | ||
1105 | |||
1106 | /* Get the info on the last group */ | ||
1107 | grp = ext4_get_group_info(sb, group); | ||
1108 | |||
1109 | /* Update free blocks in group info */ | ||
1110 | ext4_mb_update_group_info(grp, add); | ||
1111 | } | ||
1112 | |||
1063 | if (test_opt(sb, DEBUG)) | 1113 | if (test_opt(sb, DEBUG)) |
1064 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", | 1114 | printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", |
1065 | ext4_blocks_count(es)); | 1115 | ext4_blocks_count(es)); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 02bf24343979..1cb371dcd609 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -506,6 +506,7 @@ static void ext4_put_super (struct super_block * sb) | |||
506 | ext4_ext_release(sb); | 506 | ext4_ext_release(sb); |
507 | ext4_xattr_put_super(sb); | 507 | ext4_xattr_put_super(sb); |
508 | jbd2_journal_destroy(sbi->s_journal); | 508 | jbd2_journal_destroy(sbi->s_journal); |
509 | sbi->s_journal = NULL; | ||
509 | if (!(sb->s_flags & MS_RDONLY)) { | 510 | if (!(sb->s_flags & MS_RDONLY)) { |
510 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); | 511 | EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); |
511 | es->s_state = cpu_to_le16(sbi->s_mount_state); | 512 | es->s_state = cpu_to_le16(sbi->s_mount_state); |
@@ -517,6 +518,7 @@ static void ext4_put_super (struct super_block * sb) | |||
517 | for (i = 0; i < sbi->s_gdb_count; i++) | 518 | for (i = 0; i < sbi->s_gdb_count; i++) |
518 | brelse(sbi->s_group_desc[i]); | 519 | brelse(sbi->s_group_desc[i]); |
519 | kfree(sbi->s_group_desc); | 520 | kfree(sbi->s_group_desc); |
521 | kfree(sbi->s_flex_groups); | ||
520 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 522 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
521 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 523 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
522 | percpu_counter_destroy(&sbi->s_dirs_counter); | 524 | percpu_counter_destroy(&sbi->s_dirs_counter); |
@@ -571,6 +573,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) | |||
571 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); | 573 | memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); |
572 | INIT_LIST_HEAD(&ei->i_prealloc_list); | 574 | INIT_LIST_HEAD(&ei->i_prealloc_list); |
573 | spin_lock_init(&ei->i_prealloc_lock); | 575 | spin_lock_init(&ei->i_prealloc_lock); |
576 | jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); | ||
577 | ei->i_reserved_data_blocks = 0; | ||
578 | ei->i_reserved_meta_blocks = 0; | ||
579 | ei->i_allocated_meta_blocks = 0; | ||
580 | ei->i_delalloc_reserved_flag = 0; | ||
581 | spin_lock_init(&(ei->i_block_reservation_lock)); | ||
574 | return &ei->vfs_inode; | 582 | return &ei->vfs_inode; |
575 | } | 583 | } |
576 | 584 | ||
@@ -635,6 +643,8 @@ static void ext4_clear_inode(struct inode *inode) | |||
635 | EXT4_I(inode)->i_block_alloc_info = NULL; | 643 | EXT4_I(inode)->i_block_alloc_info = NULL; |
636 | if (unlikely(rsv)) | 644 | if (unlikely(rsv)) |
637 | kfree(rsv); | 645 | kfree(rsv); |
646 | jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, | ||
647 | &EXT4_I(inode)->jinode); | ||
638 | } | 648 | } |
639 | 649 | ||
640 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) | 650 | static inline void ext4_show_quota_options(struct seq_file *seq, struct super_block *sb) |
@@ -671,7 +681,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
671 | unsigned long def_mount_opts; | 681 | unsigned long def_mount_opts; |
672 | struct super_block *sb = vfs->mnt_sb; | 682 | struct super_block *sb = vfs->mnt_sb; |
673 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 683 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
674 | journal_t *journal = sbi->s_journal; | ||
675 | struct ext4_super_block *es = sbi->s_es; | 684 | struct ext4_super_block *es = sbi->s_es; |
676 | 685 | ||
677 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); | 686 | def_mount_opts = le32_to_cpu(es->s_default_mount_opts); |
@@ -747,6 +756,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
747 | seq_puts(seq, ",nomballoc"); | 756 | seq_puts(seq, ",nomballoc"); |
748 | if (test_opt(sb, I_VERSION)) | 757 | if (test_opt(sb, I_VERSION)) |
749 | seq_puts(seq, ",i_version"); | 758 | seq_puts(seq, ",i_version"); |
759 | if (!test_opt(sb, DELALLOC)) | ||
760 | seq_puts(seq, ",nodelalloc"); | ||
761 | |||
750 | 762 | ||
751 | if (sbi->s_stripe) | 763 | if (sbi->s_stripe) |
752 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | 764 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); |
@@ -894,7 +906,7 @@ enum { | |||
894 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, | 906 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, |
895 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, | 907 | Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, |
896 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, | 908 | Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, |
897 | Opt_mballoc, Opt_nomballoc, Opt_stripe, | 909 | Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, |
898 | }; | 910 | }; |
899 | 911 | ||
900 | static match_table_t tokens = { | 912 | static match_table_t tokens = { |
@@ -953,6 +965,8 @@ static match_table_t tokens = { | |||
953 | {Opt_nomballoc, "nomballoc"}, | 965 | {Opt_nomballoc, "nomballoc"}, |
954 | {Opt_stripe, "stripe=%u"}, | 966 | {Opt_stripe, "stripe=%u"}, |
955 | {Opt_resize, "resize"}, | 967 | {Opt_resize, "resize"}, |
968 | {Opt_delalloc, "delalloc"}, | ||
969 | {Opt_nodelalloc, "nodelalloc"}, | ||
956 | {Opt_err, NULL}, | 970 | {Opt_err, NULL}, |
957 | }; | 971 | }; |
958 | 972 | ||
@@ -990,6 +1004,7 @@ static int parse_options (char *options, struct super_block *sb, | |||
990 | int qtype, qfmt; | 1004 | int qtype, qfmt; |
991 | char *qname; | 1005 | char *qname; |
992 | #endif | 1006 | #endif |
1007 | ext4_fsblk_t last_block; | ||
993 | 1008 | ||
994 | if (!options) | 1009 | if (!options) |
995 | return 1; | 1010 | return 1; |
@@ -1309,15 +1324,39 @@ set_qf_format: | |||
1309 | clear_opt(sbi->s_mount_opt, NOBH); | 1324 | clear_opt(sbi->s_mount_opt, NOBH); |
1310 | break; | 1325 | break; |
1311 | case Opt_extents: | 1326 | case Opt_extents: |
1327 | if (!EXT4_HAS_INCOMPAT_FEATURE(sb, | ||
1328 | EXT4_FEATURE_INCOMPAT_EXTENTS)) { | ||
1329 | ext4_warning(sb, __func__, | ||
1330 | "extents feature not enabled " | ||
1331 | "on this filesystem, use tune2fs\n"); | ||
1332 | return 0; | ||
1333 | } | ||
1312 | set_opt (sbi->s_mount_opt, EXTENTS); | 1334 | set_opt (sbi->s_mount_opt, EXTENTS); |
1313 | break; | 1335 | break; |
1314 | case Opt_noextents: | 1336 | case Opt_noextents: |
1337 | /* | ||
1338 | * When e2fsprogs supports resizing an already existing | ||
1339 | * ext3 filesystem to greater than 2**32 blocks, we will need to | ||
1340 | * add support to the block allocator to handle growing | ||
1341 | * already existing block-mapped inodes so that the blocks | ||
1342 | * allocated for them fall within 2**32. | ||
1343 | */ | ||
1344 | last_block = ext4_blocks_count(sbi->s_es) - 1; | ||
1345 | if (last_block > 0xffffffffULL) { | ||
1346 | printk(KERN_ERR "EXT4-fs: Filesystem too " | ||
1347 | "large to mount with " | ||
1348 | "-o noextents options\n"); | ||
1349 | return 0; | ||
1350 | } | ||
1315 | clear_opt (sbi->s_mount_opt, EXTENTS); | 1351 | clear_opt (sbi->s_mount_opt, EXTENTS); |
1316 | break; | 1352 | break; |
1317 | case Opt_i_version: | 1353 | case Opt_i_version: |
1318 | set_opt(sbi->s_mount_opt, I_VERSION); | 1354 | set_opt(sbi->s_mount_opt, I_VERSION); |
1319 | sb->s_flags |= MS_I_VERSION; | 1355 | sb->s_flags |= MS_I_VERSION; |
1320 | break; | 1356 | break; |
1357 | case Opt_nodelalloc: | ||
1358 | clear_opt(sbi->s_mount_opt, DELALLOC); | ||
1359 | break; | ||
1321 | case Opt_mballoc: | 1360 | case Opt_mballoc: |
1322 | set_opt(sbi->s_mount_opt, MBALLOC); | 1361 | set_opt(sbi->s_mount_opt, MBALLOC); |
1323 | break; | 1362 | break; |
@@ -1331,6 +1370,9 @@ set_qf_format: | |||
1331 | return 0; | 1370 | return 0; |
1332 | sbi->s_stripe = option; | 1371 | sbi->s_stripe = option; |
1333 | break; | 1372 | break; |
1373 | case Opt_delalloc: | ||
1374 | set_opt(sbi->s_mount_opt, DELALLOC); | ||
1375 | break; | ||
1334 | default: | 1376 | default: |
1335 | printk (KERN_ERR | 1377 | printk (KERN_ERR |
1336 | "EXT4-fs: Unrecognized mount option \"%s\" " | 1378 | "EXT4-fs: Unrecognized mount option \"%s\" " |
@@ -1443,6 +1485,54 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, | |||
1443 | return res; | 1485 | return res; |
1444 | } | 1486 | } |
1445 | 1487 | ||
1488 | static int ext4_fill_flex_info(struct super_block *sb) | ||
1489 | { | ||
1490 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1491 | struct ext4_group_desc *gdp = NULL; | ||
1492 | struct buffer_head *bh; | ||
1493 | ext4_group_t flex_group_count; | ||
1494 | ext4_group_t flex_group; | ||
1495 | int groups_per_flex = 0; | ||
1496 | __u64 block_bitmap = 0; | ||
1497 | int i; | ||
1498 | |||
1499 | if (!sbi->s_es->s_log_groups_per_flex) { | ||
1500 | sbi->s_log_groups_per_flex = 0; | ||
1501 | return 1; | ||
1502 | } | ||
1503 | |||
1504 | sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; | ||
1505 | groups_per_flex = 1 << sbi->s_log_groups_per_flex; | ||
1506 | |||
1507 | flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / | ||
1508 | groups_per_flex; | ||
1509 | sbi->s_flex_groups = kmalloc(flex_group_count * | ||
1510 | sizeof(struct flex_groups), GFP_KERNEL); | ||
1511 | if (sbi->s_flex_groups == NULL) { | ||
1512 | printk(KERN_ERR "EXT4-fs: not enough memory\n"); | ||
1513 | goto failed; | ||
1514 | } | ||
1515 | memset(sbi->s_flex_groups, 0, flex_group_count * | ||
1516 | sizeof(struct flex_groups)); | ||
1517 | |||
1518 | gdp = ext4_get_group_desc(sb, 1, &bh); | ||
1519 | block_bitmap = ext4_block_bitmap(sb, gdp) - 1; | ||
1520 | |||
1521 | for (i = 0; i < sbi->s_groups_count; i++) { | ||
1522 | gdp = ext4_get_group_desc(sb, i, &bh); | ||
1523 | |||
1524 | flex_group = ext4_flex_group(sbi, i); | ||
1525 | sbi->s_flex_groups[flex_group].free_inodes += | ||
1526 | le16_to_cpu(gdp->bg_free_inodes_count); | ||
1527 | sbi->s_flex_groups[flex_group].free_blocks += | ||
1528 | le16_to_cpu(gdp->bg_free_blocks_count); | ||
1529 | } | ||
1530 | |||
1531 | return 1; | ||
1532 | failed: | ||
1533 | return 0; | ||
1534 | } | ||
1535 | |||
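The flex_bg bookkeeping above rests on two pieces of arithmetic: the number of flex groups is a ceiling division of the block-group count, and ext4_flex_group() (not shown in this hunk) maps block group i to flex group i >> s_log_groups_per_flex. A standalone sketch:

        #include <stdio.h>

        static unsigned long flex_group_count(unsigned long groups,
                                              unsigned int log_per_flex)
        {
                unsigned long per_flex = 1UL << log_per_flex;

                /* ceiling division: the last flex group may be partial */
                return (groups + per_flex - 1) / per_flex;
        }

        int main(void)
        {
                /* 100 block groups, 16 per flex group -> 7 flex groups */
                printf("%lu\n", flex_group_count(100, 4));
                /* block group 37 belongs to flex group 37 >> 4 = 2 */
                printf("%lu\n", 37UL >> 4);
                return 0;
        }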
1446 | __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, | 1536 | __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, |
1447 | struct ext4_group_desc *gdp) | 1537 | struct ext4_group_desc *gdp) |
1448 | { | 1538 | { |
@@ -1810,8 +1900,8 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi) | |||
1810 | } | 1900 | } |
1811 | 1901 | ||
1812 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) | 1902 | static int ext4_fill_super (struct super_block *sb, void *data, int silent) |
1813 | __releases(kernel_sem) | 1903 | __releases(kernel_lock) |
1814 | __acquires(kernel_sem) | 1904 | __acquires(kernel_lock) |
1815 | 1905 | ||
1816 | { | 1906 | { |
1817 | struct buffer_head * bh; | 1907 | struct buffer_head * bh; |
@@ -1851,11 +1941,6 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1851 | goto out_fail; | 1941 | goto out_fail; |
1852 | } | 1942 | } |
1853 | 1943 | ||
1854 | if (!sb_set_blocksize(sb, blocksize)) { | ||
1855 | printk(KERN_ERR "EXT4-fs: bad blocksize %d.\n", blocksize); | ||
1856 | goto out_fail; | ||
1857 | } | ||
1858 | |||
1859 | /* | 1944 | /* |
1860 | * The ext4 superblock will not be buffer aligned for other than 1kB | 1945 | * The ext4 superblock will not be buffer aligned for other than 1kB |
1861 | * block sizes. We need to calculate the offset from buffer start. | 1946 | * block sizes. We need to calculate the offset from buffer start. |
@@ -1919,15 +2004,28 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
1919 | 2004 | ||
1920 | /* | 2005 | /* |
1921 | * turn on extents feature by default in ext4 filesystem | 2006 | * turn on extents feature by default in ext4 filesystem |
1922 | * User -o noextents to turn it off | 2007 | * only if the feature flag is already set by mkfs or tune2fs. |
2008 | * Use -o noextents to turn it off | ||
1923 | */ | 2009 | */ |
1924 | set_opt(sbi->s_mount_opt, EXTENTS); | 2010 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) |
2011 | set_opt(sbi->s_mount_opt, EXTENTS); | ||
2012 | else | ||
2013 | ext4_warning(sb, __func__, | ||
2014 | "extents feature not enabled on this filesystem, " | ||
2015 | "use tune2fs.\n"); | ||
1925 | /* | 2016 | /* |
1926 | * turn on mballoc feature by default in ext4 filesystem | 2017 | * turn on mballoc code by default in ext4 filesystem |
1927 | * User -o nomballoc to turn it off | 2018 | * Use -o nomballoc to turn it off |
1928 | */ | 2019 | */ |
1929 | set_opt(sbi->s_mount_opt, MBALLOC); | 2020 | set_opt(sbi->s_mount_opt, MBALLOC); |
1930 | 2021 | ||
2022 | /* | ||
2023 | * enable delayed allocation by default | ||
2024 | * Use -o nodelalloc to turn it off | ||
2025 | */ | ||
2026 | set_opt(sbi->s_mount_opt, DELALLOC); | ||
2027 | |||
2028 | |||
1931 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, | 2029 | if (!parse_options ((char *) data, sb, &journal_inum, &journal_devnum, |
1932 | NULL, 0)) | 2030 | NULL, 0)) |
1933 | goto failed_mount; | 2031 | goto failed_mount; |
@@ -2138,6 +2236,14 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
2138 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); | 2236 | printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n"); |
2139 | goto failed_mount2; | 2237 | goto failed_mount2; |
2140 | } | 2238 | } |
2239 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) | ||
2240 | if (!ext4_fill_flex_info(sb)) { | ||
2241 | printk(KERN_ERR | ||
2242 | "EXT4-fs: unable to initialize " | ||
2243 | "flex_bg meta info!\n"); | ||
2244 | goto failed_mount2; | ||
2245 | } | ||
2246 | |||
2141 | sbi->s_gdb_count = db_count; | 2247 | sbi->s_gdb_count = db_count; |
2142 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); | 2248 | get_random_bytes(&sbi->s_next_generation, sizeof(u32)); |
2143 | spin_lock_init(&sbi->s_next_gen_lock); | 2249 | spin_lock_init(&sbi->s_next_gen_lock); |
@@ -2358,6 +2464,13 @@ static int ext4_fill_super (struct super_block *sb, void *data, int silent) | |||
2358 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": | 2464 | test_opt(sb,DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": |
2359 | "writeback"); | 2465 | "writeback"); |
2360 | 2466 | ||
2467 | if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { | ||
2468 | printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " | ||
2469 | "requested data journaling mode\n"); | ||
2470 | clear_opt(sbi->s_mount_opt, DELALLOC); | ||
2471 | } else if (test_opt(sb, DELALLOC)) | ||
2472 | printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); | ||
2473 | |||
2361 | ext4_ext_init(sb); | 2474 | ext4_ext_init(sb); |
2362 | ext4_mb_init(sb, needs_recovery); | 2475 | ext4_mb_init(sb, needs_recovery); |
2363 | 2476 | ||
@@ -2372,6 +2485,7 @@ cantfind_ext4: | |||
2372 | 2485 | ||
2373 | failed_mount4: | 2486 | failed_mount4: |
2374 | jbd2_journal_destroy(sbi->s_journal); | 2487 | jbd2_journal_destroy(sbi->s_journal); |
2488 | sbi->s_journal = NULL; | ||
2375 | failed_mount3: | 2489 | failed_mount3: |
2376 | percpu_counter_destroy(&sbi->s_freeblocks_counter); | 2490 | percpu_counter_destroy(&sbi->s_freeblocks_counter); |
2377 | percpu_counter_destroy(&sbi->s_freeinodes_counter); | 2491 | percpu_counter_destroy(&sbi->s_freeinodes_counter); |
@@ -3325,7 +3439,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, | |||
3325 | err = ext4_journal_dirty_metadata(handle, bh); | 3439 | err = ext4_journal_dirty_metadata(handle, bh); |
3326 | else { | 3440 | else { |
3327 | /* Always do at least ordered writes for quotas */ | 3441 | /* Always do at least ordered writes for quotas */ |
3328 | err = ext4_journal_dirty_data(handle, bh); | 3442 | err = ext4_jbd2_file_inode(handle, inode); |
3329 | mark_buffer_dirty(bh); | 3443 | mark_buffer_dirty(bh); |
3330 | } | 3444 | } |
3331 | brelse(bh); | 3445 | brelse(bh); |
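The new ext4_fill_flex_info() above aggregates the per-group free-inode and free-block counts into per-flex-group totals; a flex group is simply 2^s_log_groups_per_flex consecutive block groups, and flex_group_count rounds the division up. A minimal user-space sketch of that arithmetic, assuming ext4_flex_group() reduces to a right shift by the log (that helper's definition is not part of this hunk):

    /* Illustrative sketch of the flex_bg aggregation, not kernel code. */
    #include <stdio.h>

    struct flex_counts { unsigned free_inodes, free_blocks; };

    /* Assumption: ext4_flex_group(sbi, i) == i >> s_log_groups_per_flex. */
    static unsigned flex_group_of(unsigned group, unsigned log_gpf)
    {
            return group >> log_gpf;
    }

    int main(void)
    {
            unsigned log_gpf = 4;              /* 16 groups per flex group */
            unsigned groups = 40;              /* toy filesystem */
            unsigned flex_count = (groups + (1u << log_gpf) - 1) >> log_gpf;
            struct flex_counts fc[3] = { { 0 } };  /* flex_count == 3 here */
            unsigned i;

            for (i = 0; i < groups; i++) {
                    /* stand-ins for le16_to_cpu(gdp->bg_free_*_count) */
                    fc[flex_group_of(i, log_gpf)].free_inodes += 100;
                    fc[flex_group_of(i, log_gpf)].free_blocks += 1000;
            }
            printf("%u flex groups, fg0: %u free blocks\n",
                   flex_count, fc[0].free_blocks);
            return 0;
    }

The sketch also shows why the kmalloc+memset pair in the hunk could equally be kzalloc: the array only needs to start out zeroed before the accumulation loop.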
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index ff08633f398e..93c5fdcdad2e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -810,7 +810,7 @@ inserted: | |||
810 | /* We need to allocate a new block */ | 810 | /* We need to allocate a new block */ |
811 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, | 811 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, |
812 | EXT4_I(inode)->i_block_group); | 812 | EXT4_I(inode)->i_block_group); |
813 | ext4_fsblk_t block = ext4_new_block(handle, inode, | 813 | ext4_fsblk_t block = ext4_new_meta_block(handle, inode, |
814 | goal, &error); | 814 | goal, &error); |
815 | if (error) | 815 | if (error) |
816 | goto cleanup; | 816 | goto cleanup; |
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index fff33382cadc..ac1a52cf2a37 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c | |||
@@ -13,13 +13,11 @@ | |||
13 | #include "ext4.h" | 13 | #include "ext4.h" |
14 | #include "xattr.h" | 14 | #include "xattr.h" |
15 | 15 | ||
16 | #define XATTR_TRUSTED_PREFIX "trusted." | ||
17 | |||
18 | static size_t | 16 | static size_t |
19 | ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, | 17 | ext4_xattr_trusted_list(struct inode *inode, char *list, size_t list_size, |
20 | const char *name, size_t name_len) | 18 | const char *name, size_t name_len) |
21 | { | 19 | { |
22 | const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX)-1; | 20 | const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; |
23 | const size_t total_len = prefix_len + name_len + 1; | 21 | const size_t total_len = prefix_len + name_len + 1; |
24 | 22 | ||
25 | if (!capable(CAP_SYS_ADMIN)) | 23 | if (!capable(CAP_SYS_ADMIN)) |
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 67be723fcc4e..d91aa61b42aa 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c | |||
@@ -12,13 +12,11 @@ | |||
12 | #include "ext4.h" | 12 | #include "ext4.h" |
13 | #include "xattr.h" | 13 | #include "xattr.h" |
14 | 14 | ||
15 | #define XATTR_USER_PREFIX "user." | ||
16 | |||
17 | static size_t | 15 | static size_t |
18 | ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, | 16 | ext4_xattr_user_list(struct inode *inode, char *list, size_t list_size, |
19 | const char *name, size_t name_len) | 17 | const char *name, size_t name_len) |
20 | { | 18 | { |
21 | const size_t prefix_len = sizeof(XATTR_USER_PREFIX)-1; | 19 | const size_t prefix_len = XATTR_USER_PREFIX_LEN; |
22 | const size_t total_len = prefix_len + name_len + 1; | 20 | const size_t total_len = prefix_len + name_len + 1; |
23 | 21 | ||
24 | if (!test_opt(inode->i_sb, XATTR_USER)) | 22 | if (!test_opt(inode->i_sb, XATTR_USER)) |
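Both xattr handlers above drop their local prefix #defines in favour of shared XATTR_*_PREFIX_LEN constants, presumably now provided by a common header (the hunk adding them is not shown here). The idiom being centralised is the usual sizeof-minus-one length of a string literal:

    /* Illustrative: what the shared constants presumably expand to. */
    #define XATTR_USER_PREFIX        "user."
    #define XATTR_USER_PREFIX_LEN    (sizeof(XATTR_USER_PREFIX) - 1)   /* 5 */
    #define XATTR_TRUSTED_PREFIX     "trusted."
    #define XATTR_TRUSTED_PREFIX_LEN (sizeof(XATTR_TRUSTED_PREFIX) - 1)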
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 6914598022ce..91389c8aee8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c | |||
@@ -688,7 +688,6 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact | |||
688 | 688 | ||
689 | J_ASSERT(transaction->t_state == T_FINISHED); | 689 | J_ASSERT(transaction->t_state == T_FINISHED); |
690 | J_ASSERT(transaction->t_buffers == NULL); | 690 | J_ASSERT(transaction->t_buffers == NULL); |
691 | J_ASSERT(transaction->t_sync_datalist == NULL); | ||
692 | J_ASSERT(transaction->t_forget == NULL); | 691 | J_ASSERT(transaction->t_forget == NULL); |
693 | J_ASSERT(transaction->t_iobuf_list == NULL); | 692 | J_ASSERT(transaction->t_iobuf_list == NULL); |
694 | J_ASSERT(transaction->t_shadow_list == NULL); | 693 | J_ASSERT(transaction->t_shadow_list == NULL); |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a2ed72f7ceee..f8b3be873226 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -22,6 +22,8 @@ | |||
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/jiffies.h> | 23 | #include <linux/jiffies.h> |
24 | #include <linux/crc32.h> | 24 | #include <linux/crc32.h> |
25 | #include <linux/writeback.h> | ||
26 | #include <linux/backing-dev.h> | ||
25 | 27 | ||
26 | /* | 28 | /* |
27 | * Default IO end handler for temporary BJ_IO buffer_heads. | 29 | * Default IO end handler for temporary BJ_IO buffer_heads. |
@@ -37,8 +39,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) | |||
37 | } | 39 | } |
38 | 40 | ||
39 | /* | 41 | /* |
40 | * When an ext3-ordered file is truncated, it is possible that many pages are | 42 | * When an ext4 file is truncated, it is possible that some pages are not |
41 | * not sucessfully freed, because they are attached to a committing transaction. | 43 | * successfully freed, because they are attached to a committing transaction. |
42 | * After the transaction commits, these pages are left on the LRU, with no | 44 | * After the transaction commits, these pages are left on the LRU, with no |
43 | * ->mapping, and with attached buffers. These pages are trivially reclaimable | 45 | * ->mapping, and with attached buffers. These pages are trivially reclaimable |
44 | * by the VM, but their apparent absence upsets the VM accounting, and it makes | 46 | * by the VM, but their apparent absence upsets the VM accounting, and it makes |
@@ -80,21 +82,6 @@ nope: | |||
80 | } | 82 | } |
81 | 83 | ||
82 | /* | 84 | /* |
83 | * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is | ||
84 | * held. For ranking reasons we must trylock. If we lose, schedule away and | ||
85 | * return 0. j_list_lock is dropped in this case. | ||
86 | */ | ||
87 | static int inverted_lock(journal_t *journal, struct buffer_head *bh) | ||
88 | { | ||
89 | if (!jbd_trylock_bh_state(bh)) { | ||
90 | spin_unlock(&journal->j_list_lock); | ||
91 | schedule(); | ||
92 | return 0; | ||
93 | } | ||
94 | return 1; | ||
95 | } | ||
96 | |||
97 | /* | ||
98 | * Done it all: now submit the commit record. We should have | 85 | * Done it all: now submit the commit record. We should have |
99 | * cleaned up our previous buffers by now, so if we are in abort | 86 | * cleaned up our previous buffers by now, so if we are in abort |
100 | * mode we can now just skip the rest of the journal write | 87 | * mode we can now just skip the rest of the journal write |
@@ -112,6 +99,7 @@ static int journal_submit_commit_record(journal_t *journal, | |||
112 | struct buffer_head *bh; | 99 | struct buffer_head *bh; |
113 | int ret; | 100 | int ret; |
114 | int barrier_done = 0; | 101 | int barrier_done = 0; |
102 | struct timespec now = current_kernel_time(); | ||
115 | 103 | ||
116 | if (is_journal_aborted(journal)) | 104 | if (is_journal_aborted(journal)) |
117 | return 0; | 105 | return 0; |
@@ -126,6 +114,8 @@ static int journal_submit_commit_record(journal_t *journal, | |||
126 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); | 114 | tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); |
127 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); | 115 | tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK); |
128 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); | 116 | tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid); |
117 | tmp->h_commit_sec = cpu_to_be64(now.tv_sec); | ||
118 | tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec); | ||
129 | 119 | ||
130 | if (JBD2_HAS_COMPAT_FEATURE(journal, | 120 | if (JBD2_HAS_COMPAT_FEATURE(journal, |
131 | JBD2_FEATURE_COMPAT_CHECKSUM)) { | 121 | JBD2_FEATURE_COMPAT_CHECKSUM)) { |
@@ -197,159 +187,104 @@ static int journal_wait_on_commit_record(struct buffer_head *bh) | |||
197 | } | 187 | } |
198 | 188 | ||
199 | /* | 189 | /* |
200 | * Wait for all submitted IO to complete. | 190 | * Write the filemap data using the writepage() address_space_operation. |
191 | * We don't do block allocation here even for delalloc. We don't | ||
192 | * use writepages() because with delayed allocation we may be doing | ||
193 | * block allocation in writepages(). | ||
201 | */ | 194 | */ |
202 | static int journal_wait_on_locked_list(journal_t *journal, | 195 | static int journal_submit_inode_data_buffers(struct address_space *mapping) |
203 | transaction_t *commit_transaction) | ||
204 | { | 196 | { |
205 | int ret = 0; | 197 | int ret; |
206 | struct journal_head *jh; | 198 | struct writeback_control wbc = { |
207 | 199 | .sync_mode = WB_SYNC_ALL, | |
208 | while (commit_transaction->t_locked_list) { | 200 | .nr_to_write = mapping->nrpages * 2, |
209 | struct buffer_head *bh; | 201 | .range_start = 0, |
210 | 202 | .range_end = i_size_read(mapping->host), | |
211 | jh = commit_transaction->t_locked_list->b_tprev; | 203 | .for_writepages = 1, |
212 | bh = jh2bh(jh); | 204 | }; |
213 | get_bh(bh); | 205 | |
214 | if (buffer_locked(bh)) { | 206 | ret = generic_writepages(mapping, &wbc); |
215 | spin_unlock(&journal->j_list_lock); | ||
216 | wait_on_buffer(bh); | ||
217 | if (unlikely(!buffer_uptodate(bh))) | ||
218 | ret = -EIO; | ||
219 | spin_lock(&journal->j_list_lock); | ||
220 | } | ||
221 | if (!inverted_lock(journal, bh)) { | ||
222 | put_bh(bh); | ||
223 | spin_lock(&journal->j_list_lock); | ||
224 | continue; | ||
225 | } | ||
226 | if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) { | ||
227 | __jbd2_journal_unfile_buffer(jh); | ||
228 | jbd_unlock_bh_state(bh); | ||
229 | jbd2_journal_remove_journal_head(bh); | ||
230 | put_bh(bh); | ||
231 | } else { | ||
232 | jbd_unlock_bh_state(bh); | ||
233 | } | ||
234 | put_bh(bh); | ||
235 | cond_resched_lock(&journal->j_list_lock); | ||
236 | } | ||
237 | return ret; | 207 | return ret; |
238 | } | 208 | } |
239 | 209 | ||
240 | static void journal_do_submit_data(struct buffer_head **wbuf, int bufs) | 210 | /* |
211 | * Submit all the data buffers of inode associated with the transaction to | ||
212 | * disk. | ||
213 | * | ||
214 | * We are in a committing transaction. Therefore no new inode can be added to | ||
215 | * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we | ||
216 | * currently operate on from being released while we write out its pages. | ||
217 | */ | ||
218 | static int journal_submit_data_buffers(journal_t *journal, | ||
219 | transaction_t *commit_transaction) | ||
241 | { | 220 | { |
242 | int i; | 221 | struct jbd2_inode *jinode; |
222 | int err, ret = 0; | ||
223 | struct address_space *mapping; | ||
243 | 224 | ||
244 | for (i = 0; i < bufs; i++) { | 225 | spin_lock(&journal->j_list_lock); |
245 | wbuf[i]->b_end_io = end_buffer_write_sync; | 226 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { |
246 | /* We use-up our safety reference in submit_bh() */ | 227 | mapping = jinode->i_vfs_inode->i_mapping; |
247 | submit_bh(WRITE, wbuf[i]); | 228 | jinode->i_flags |= JI_COMMIT_RUNNING; |
229 | spin_unlock(&journal->j_list_lock); | ||
230 | /* | ||
231 | * submit the inode data buffers. We use writepage | ||
232 | * instead of writepages. Because writepages can do | ||
233 | * block allocation with delalloc. We need to write | ||
234 | * only allocated blocks here. | ||
235 | */ | ||
236 | err = journal_submit_inode_data_buffers(mapping); | ||
237 | if (!ret) | ||
238 | ret = err; | ||
239 | spin_lock(&journal->j_list_lock); | ||
240 | J_ASSERT(jinode->i_transaction == commit_transaction); | ||
241 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
242 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
248 | } | 243 | } |
244 | spin_unlock(&journal->j_list_lock); | ||
245 | return ret; | ||
249 | } | 246 | } |
250 | 247 | ||
251 | /* | 248 | /* |
252 | * Submit all the data buffers to disk | 249 | * Wait for data submitted for writeout, refile inodes to the proper |
250 | * transaction if needed. | ||
251 | * | ||
253 | */ | 252 | */ |
254 | static void journal_submit_data_buffers(journal_t *journal, | 253 | static int journal_finish_inode_data_buffers(journal_t *journal, |
255 | transaction_t *commit_transaction) | 254 | transaction_t *commit_transaction) |
256 | { | 255 | { |
257 | struct journal_head *jh; | 256 | struct jbd2_inode *jinode, *next_i; |
258 | struct buffer_head *bh; | 257 | int err, ret = 0; |
259 | int locked; | ||
260 | int bufs = 0; | ||
261 | struct buffer_head **wbuf = journal->j_wbuf; | ||
262 | 258 | ||
263 | /* | 259 | /* For locking, see the comment in journal_submit_data_buffers() */ |
264 | * Whenever we unlock the journal and sleep, things can get added | ||
265 | * onto ->t_sync_datalist, so we have to keep looping back to | ||
266 | * write_out_data until we *know* that the list is empty. | ||
267 | * | ||
268 | * Cleanup any flushed data buffers from the data list. Even in | ||
269 | * abort mode, we want to flush this out as soon as possible. | ||
270 | */ | ||
271 | write_out_data: | ||
272 | cond_resched(); | ||
273 | spin_lock(&journal->j_list_lock); | 260 | spin_lock(&journal->j_list_lock); |
261 | list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { | ||
262 | jinode->i_flags |= JI_COMMIT_RUNNING; | ||
263 | spin_unlock(&journal->j_list_lock); | ||
264 | err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); | ||
265 | if (!ret) | ||
266 | ret = err; | ||
267 | spin_lock(&journal->j_list_lock); | ||
268 | jinode->i_flags &= ~JI_COMMIT_RUNNING; | ||
269 | wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
270 | } | ||
274 | 271 | ||
275 | while (commit_transaction->t_sync_datalist) { | 272 | /* Now refile inode to proper lists */ |
276 | jh = commit_transaction->t_sync_datalist; | 273 | list_for_each_entry_safe(jinode, next_i, |
277 | bh = jh2bh(jh); | 274 | &commit_transaction->t_inode_list, i_list) { |
278 | locked = 0; | 275 | list_del(&jinode->i_list); |
279 | 276 | if (jinode->i_next_transaction) { | |
280 | /* Get reference just to make sure buffer does not disappear | 277 | jinode->i_transaction = jinode->i_next_transaction; |
281 | * when we are forced to drop various locks */ | 278 | jinode->i_next_transaction = NULL; |
282 | get_bh(bh); | 279 | list_add(&jinode->i_list, |
283 | /* If the buffer is dirty, we need to submit IO and hence | 280 | &jinode->i_transaction->t_inode_list); |
284 | * we need the buffer lock. We try to lock the buffer without | ||
285 | * blocking. If we fail, we need to drop j_list_lock and do | ||
286 | * blocking lock_buffer(). | ||
287 | */ | ||
288 | if (buffer_dirty(bh)) { | ||
289 | if (test_set_buffer_locked(bh)) { | ||
290 | BUFFER_TRACE(bh, "needs blocking lock"); | ||
291 | spin_unlock(&journal->j_list_lock); | ||
292 | /* Write out all data to prevent deadlocks */ | ||
293 | journal_do_submit_data(wbuf, bufs); | ||
294 | bufs = 0; | ||
295 | lock_buffer(bh); | ||
296 | spin_lock(&journal->j_list_lock); | ||
297 | } | ||
298 | locked = 1; | ||
299 | } | ||
300 | /* We have to get bh_state lock. Again out of order, sigh. */ | ||
301 | if (!inverted_lock(journal, bh)) { | ||
302 | jbd_lock_bh_state(bh); | ||
303 | spin_lock(&journal->j_list_lock); | ||
304 | } | ||
305 | /* Someone already cleaned up the buffer? */ | ||
306 | if (!buffer_jbd(bh) | ||
307 | || jh->b_transaction != commit_transaction | ||
308 | || jh->b_jlist != BJ_SyncData) { | ||
309 | jbd_unlock_bh_state(bh); | ||
310 | if (locked) | ||
311 | unlock_buffer(bh); | ||
312 | BUFFER_TRACE(bh, "already cleaned up"); | ||
313 | put_bh(bh); | ||
314 | continue; | ||
315 | } | ||
316 | if (locked && test_clear_buffer_dirty(bh)) { | ||
317 | BUFFER_TRACE(bh, "needs writeout, adding to array"); | ||
318 | wbuf[bufs++] = bh; | ||
319 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
320 | BJ_Locked); | ||
321 | jbd_unlock_bh_state(bh); | ||
322 | if (bufs == journal->j_wbufsize) { | ||
323 | spin_unlock(&journal->j_list_lock); | ||
324 | journal_do_submit_data(wbuf, bufs); | ||
325 | bufs = 0; | ||
326 | goto write_out_data; | ||
327 | } | ||
328 | } else if (!locked && buffer_locked(bh)) { | ||
329 | __jbd2_journal_file_buffer(jh, commit_transaction, | ||
330 | BJ_Locked); | ||
331 | jbd_unlock_bh_state(bh); | ||
332 | put_bh(bh); | ||
333 | } else { | 281 | } else { |
334 | BUFFER_TRACE(bh, "writeout complete: unfile"); | 282 | jinode->i_transaction = NULL; |
335 | __jbd2_journal_unfile_buffer(jh); | ||
336 | jbd_unlock_bh_state(bh); | ||
337 | if (locked) | ||
338 | unlock_buffer(bh); | ||
339 | jbd2_journal_remove_journal_head(bh); | ||
340 | /* Once for our safety reference, once for | ||
341 | * jbd2_journal_remove_journal_head() */ | ||
342 | put_bh(bh); | ||
343 | put_bh(bh); | ||
344 | } | ||
345 | |||
346 | if (need_resched() || spin_needbreak(&journal->j_list_lock)) { | ||
347 | spin_unlock(&journal->j_list_lock); | ||
348 | goto write_out_data; | ||
349 | } | 283 | } |
350 | } | 284 | } |
351 | spin_unlock(&journal->j_list_lock); | 285 | spin_unlock(&journal->j_list_lock); |
352 | journal_do_submit_data(wbuf, bufs); | 286 | |
287 | return ret; | ||
353 | } | 288 | } |
354 | 289 | ||
355 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) | 290 | static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh) |
@@ -524,21 +459,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
524 | * Now start flushing things to disk, in the order they appear | 459 | * Now start flushing things to disk, in the order they appear |
525 | * on the transaction lists. Data blocks go first. | 460 | * on the transaction lists. Data blocks go first. |
526 | */ | 461 | */ |
527 | err = 0; | 462 | err = journal_submit_data_buffers(journal, commit_transaction); |
528 | journal_submit_data_buffers(journal, commit_transaction); | ||
529 | |||
530 | /* | ||
531 | * Wait for all previously submitted IO to complete if commit | ||
532 | * record is to be written synchronously. | ||
533 | */ | ||
534 | spin_lock(&journal->j_list_lock); | ||
535 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | ||
536 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) | ||
537 | err = journal_wait_on_locked_list(journal, | ||
538 | commit_transaction); | ||
539 | |||
540 | spin_unlock(&journal->j_list_lock); | ||
541 | |||
542 | if (err) | 463 | if (err) |
543 | jbd2_journal_abort(journal, err); | 464 | jbd2_journal_abort(journal, err); |
544 | 465 | ||
@@ -547,16 +468,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
547 | jbd_debug(3, "JBD: commit phase 2\n"); | 468 | jbd_debug(3, "JBD: commit phase 2\n"); |
548 | 469 | ||
549 | /* | 470 | /* |
550 | * If we found any dirty or locked buffers, then we should have | ||
551 | * looped back up to the write_out_data label. If there weren't | ||
552 | * any then journal_clean_data_list should have wiped the list | ||
553 | * clean by now, so check that it is in fact empty. | ||
554 | */ | ||
555 | J_ASSERT (commit_transaction->t_sync_datalist == NULL); | ||
556 | |||
557 | jbd_debug (3, "JBD: commit phase 3\n"); | ||
558 | |||
559 | /* | ||
560 | * Way to go: we have now written out all of the data for a | 471 | * Way to go: we have now written out all of the data for a |
561 | * transaction! Now comes the tricky part: we need to write out | 472 | * transaction! Now comes the tricky part: we need to write out |
562 | * metadata. Loop over the transaction's entire buffer list: | 473 | * metadata. Loop over the transaction's entire buffer list: |
@@ -574,6 +485,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) | |||
574 | J_ASSERT(commit_transaction->t_nr_buffers <= | 485 | J_ASSERT(commit_transaction->t_nr_buffers <= |
575 | commit_transaction->t_outstanding_credits); | 486 | commit_transaction->t_outstanding_credits); |
576 | 487 | ||
488 | err = 0; | ||
577 | descriptor = NULL; | 489 | descriptor = NULL; |
578 | bufs = 0; | 490 | bufs = 0; |
579 | while (commit_transaction->t_buffers) { | 491 | while (commit_transaction->t_buffers) { |
@@ -748,15 +660,19 @@ start_journal_io: | |||
748 | &cbh, crc32_sum); | 660 | &cbh, crc32_sum); |
749 | if (err) | 661 | if (err) |
750 | __jbd2_journal_abort_hard(journal); | 662 | __jbd2_journal_abort_hard(journal); |
751 | |||
752 | spin_lock(&journal->j_list_lock); | ||
753 | err = journal_wait_on_locked_list(journal, | ||
754 | commit_transaction); | ||
755 | spin_unlock(&journal->j_list_lock); | ||
756 | if (err) | ||
757 | __jbd2_journal_abort_hard(journal); | ||
758 | } | 663 | } |
759 | 664 | ||
665 | /* | ||
666 | * This is the right place to wait for data buffers both for ASYNC | ||
667 | * and !ASYNC commit. If commit is ASYNC, we need to wait only after | ||
668 | * the commit block went to disk (which happens above). If commit is | ||
669 | * SYNC, we need to wait for data buffers before we start writing the | ||
670 | * commit block, which happens below in that case. | ||
671 | */ | ||
672 | err = journal_finish_inode_data_buffers(journal, commit_transaction); | ||
673 | if (err) | ||
674 | jbd2_journal_abort(journal, err); | ||
675 | |||
760 | /* Lo and behold: we have just managed to send a transaction to | 676 | /* Lo and behold: we have just managed to send a transaction to |
761 | the log. Before we can commit it, wait for the IO so far to | 677 | the log. Before we can commit it, wait for the IO so far to |
762 | complete. Control buffers being written are on the | 678 | complete. Control buffers being written are on the |
@@ -768,7 +684,7 @@ start_journal_io: | |||
768 | so we incur less scheduling load. | 684 | so we incur less scheduling load. |
769 | */ | 685 | */ |
770 | 686 | ||
771 | jbd_debug(3, "JBD: commit phase 4\n"); | 687 | jbd_debug(3, "JBD: commit phase 3\n"); |
772 | 688 | ||
773 | /* | 689 | /* |
774 | * akpm: these are BJ_IO, and j_list_lock is not needed. | 690 | * akpm: these are BJ_IO, and j_list_lock is not needed. |
@@ -827,7 +743,7 @@ wait_for_iobuf: | |||
827 | 743 | ||
828 | J_ASSERT (commit_transaction->t_shadow_list == NULL); | 744 | J_ASSERT (commit_transaction->t_shadow_list == NULL); |
829 | 745 | ||
830 | jbd_debug(3, "JBD: commit phase 5\n"); | 746 | jbd_debug(3, "JBD: commit phase 4\n"); |
831 | 747 | ||
832 | /* Here we wait for the revoke record and descriptor record buffers */ | 748 | /* Here we wait for the revoke record and descriptor record buffers */ |
833 | wait_for_ctlbuf: | 749 | wait_for_ctlbuf: |
@@ -854,7 +770,7 @@ wait_for_iobuf: | |||
854 | /* AKPM: bforget here */ | 770 | /* AKPM: bforget here */ |
855 | } | 771 | } |
856 | 772 | ||
857 | jbd_debug(3, "JBD: commit phase 6\n"); | 773 | jbd_debug(3, "JBD: commit phase 5\n"); |
858 | 774 | ||
859 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 775 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
860 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 776 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
@@ -874,9 +790,9 @@ wait_for_iobuf: | |||
874 | transaction can be removed from any checkpoint list it was on | 790 | transaction can be removed from any checkpoint list it was on |
875 | before. */ | 791 | before. */ |
876 | 792 | ||
877 | jbd_debug(3, "JBD: commit phase 7\n"); | 793 | jbd_debug(3, "JBD: commit phase 6\n"); |
878 | 794 | ||
879 | J_ASSERT(commit_transaction->t_sync_datalist == NULL); | 795 | J_ASSERT(list_empty(&commit_transaction->t_inode_list)); |
880 | J_ASSERT(commit_transaction->t_buffers == NULL); | 796 | J_ASSERT(commit_transaction->t_buffers == NULL); |
881 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); | 797 | J_ASSERT(commit_transaction->t_checkpoint_list == NULL); |
882 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); | 798 | J_ASSERT(commit_transaction->t_iobuf_list == NULL); |
@@ -997,7 +913,7 @@ restart_loop: | |||
997 | 913 | ||
998 | /* Done with this transaction! */ | 914 | /* Done with this transaction! */ |
999 | 915 | ||
1000 | jbd_debug(3, "JBD: commit phase 8\n"); | 916 | jbd_debug(3, "JBD: commit phase 7\n"); |
1001 | 917 | ||
1002 | J_ASSERT(commit_transaction->t_state == T_COMMIT); | 918 | J_ASSERT(commit_transaction->t_state == T_COMMIT); |
1003 | 919 | ||
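Besides the ordered-mode rework, the hunks above also stamp each commit record with the commit time: journal_submit_commit_record() now fills h_commit_sec/h_commit_nsec from current_kernel_time(), stored big-endian on disk. A hedged user-space sketch of how a debugging tool might read the timestamp back; the struct below is abbreviated (the real commit_header also carries checksum fields between h_sequence and the timestamp), so its field offsets are an assumption, not the authoritative on-disk layout:

    /* Sketch only: decode the new jbd2 commit timestamp fields. */
    #include <endian.h>     /* be64toh(), be32toh() (glibc) */
    #include <stdint.h>
    #include <stdio.h>
    #include <time.h>

    struct commit_header_sketch {
            uint32_t h_magic;       /* JBD2_MAGIC_NUMBER, big-endian */
            uint32_t h_blocktype;   /* JBD2_COMMIT_BLOCK */
            uint32_t h_sequence;    /* transaction tid */
            /* checksum fields elided -- offsets are therefore assumed */
            uint64_t h_commit_sec;  /* cpu_to_be64(now.tv_sec) on disk */
            uint32_t h_commit_nsec; /* cpu_to_be32(now.tv_nsec) */
    };

    static void print_commit_time(const struct commit_header_sketch *h)
    {
            time_t sec = (time_t)be64toh(h->h_commit_sec);

            printf("tid %u committed at %s",
                   be32toh(h->h_sequence), ctime(&sec));
    }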
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 2e24567c4a79..b26c6d9fe6ae 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -50,7 +50,6 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); | |||
50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); | 50 | EXPORT_SYMBOL(jbd2_journal_get_write_access); |
51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); | 51 | EXPORT_SYMBOL(jbd2_journal_get_create_access); |
52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); | 52 | EXPORT_SYMBOL(jbd2_journal_get_undo_access); |
53 | EXPORT_SYMBOL(jbd2_journal_dirty_data); | ||
54 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); | 53 | EXPORT_SYMBOL(jbd2_journal_dirty_metadata); |
55 | EXPORT_SYMBOL(jbd2_journal_release_buffer); | 54 | EXPORT_SYMBOL(jbd2_journal_release_buffer); |
56 | EXPORT_SYMBOL(jbd2_journal_forget); | 55 | EXPORT_SYMBOL(jbd2_journal_forget); |
@@ -82,6 +81,10 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); | |||
82 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); | 81 | EXPORT_SYMBOL(jbd2_journal_invalidatepage); |
83 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); | 82 | EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); |
84 | EXPORT_SYMBOL(jbd2_journal_force_commit); | 83 | EXPORT_SYMBOL(jbd2_journal_force_commit); |
84 | EXPORT_SYMBOL(jbd2_journal_file_inode); | ||
85 | EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); | ||
86 | EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); | ||
87 | EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); | ||
85 | 88 | ||
86 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); | 89 | static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); |
87 | static void __journal_abort_soft (journal_t *journal, int errno); | 90 | static void __journal_abort_soft (journal_t *journal, int errno); |
@@ -2195,6 +2198,54 @@ void jbd2_journal_put_journal_head(struct journal_head *jh) | |||
2195 | } | 2198 | } |
2196 | 2199 | ||
2197 | /* | 2200 | /* |
2201 | * Initialize jbd inode head | ||
2202 | */ | ||
2203 | void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) | ||
2204 | { | ||
2205 | jinode->i_transaction = NULL; | ||
2206 | jinode->i_next_transaction = NULL; | ||
2207 | jinode->i_vfs_inode = inode; | ||
2208 | jinode->i_flags = 0; | ||
2209 | INIT_LIST_HEAD(&jinode->i_list); | ||
2210 | } | ||
2211 | |||
2212 | /* | ||
2213 | * Function to be called before we start removing an inode from memory (i.e., | ||
2214 | * clear_inode() is a fine place to call it from). It removes the inode from | ||
2215 | * the transaction's lists. | ||
2216 | */ | ||
2217 | void jbd2_journal_release_jbd_inode(journal_t *journal, | ||
2218 | struct jbd2_inode *jinode) | ||
2219 | { | ||
2220 | int writeout = 0; | ||
2221 | |||
2222 | if (!journal) | ||
2223 | return; | ||
2224 | restart: | ||
2225 | spin_lock(&journal->j_list_lock); | ||
2226 | /* Is commit writing out the inode? Then we have to wait */ | ||
2227 | if (jinode->i_flags & JI_COMMIT_RUNNING) { | ||
2228 | wait_queue_head_t *wq; | ||
2229 | DEFINE_WAIT_BIT(wait, &jinode->i_flags, __JI_COMMIT_RUNNING); | ||
2230 | wq = bit_waitqueue(&jinode->i_flags, __JI_COMMIT_RUNNING); | ||
2231 | prepare_to_wait(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
2232 | spin_unlock(&journal->j_list_lock); | ||
2233 | schedule(); | ||
2234 | finish_wait(wq, &wait.wait); | ||
2235 | goto restart; | ||
2236 | } | ||
2237 | |||
2238 | /* Do we need to wait for data writeback? */ | ||
2239 | if (journal->j_committing_transaction == jinode->i_transaction) | ||
2240 | writeout = 1; | ||
2241 | if (jinode->i_transaction) { | ||
2242 | list_del(&jinode->i_list); | ||
2243 | jinode->i_transaction = NULL; | ||
2244 | } | ||
2245 | spin_unlock(&journal->j_list_lock); | ||
2246 | } | ||
2247 | |||
2248 | /* | ||
2198 | * debugfs tunables | 2249 | * debugfs tunables |
2199 | */ | 2250 | */ |
2200 | #ifdef CONFIG_JBD2_DEBUG | 2251 | #ifdef CONFIG_JBD2_DEBUG |
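The four new exports above define the whole lifecycle of a jbd2_inode. Roughly how a client filesystem is expected to drive them (the myfs_* wrappers and the embedded-struct layout are hypothetical; only the jbd2_journal_* calls come from this patch):

    /* Hypothetical client-side usage of the new jbd2_inode API. */
    #include <linux/fs.h>
    #include <linux/jbd2.h>

    struct myfs_inode {
            struct jbd2_inode jinode;  /* must live as long as the inode */
            struct inode vfs_inode;
    };

    /* Inode setup: bind the jbd2_inode to its VFS inode. */
    static void myfs_init_ordered(struct myfs_inode *mi)
    {
            jbd2_journal_init_jbd_inode(&mi->jinode, &mi->vfs_inode);
    }

    /* When a transaction dirties the inode's data (e.g. in write_end):
     * file the inode so commit flushes its pages before the commit
     * record goes to disk. */
    static int myfs_order_data(handle_t *handle, struct myfs_inode *mi)
    {
            return jbd2_journal_file_inode(handle, &mi->jinode);
    }

    /* Before truncate: write out the to-be-truncated range if the inode
     * is part of the committing transaction. */
    static int myfs_begin_truncate(struct myfs_inode *mi, loff_t new_size)
    {
            return jbd2_journal_begin_ordered_truncate(&mi->jinode, new_size);
    }

    /* From clear_inode(): detach from any transaction lists, waiting
     * for a running commit to finish with the inode first. */
    static void myfs_evict_inode(journal_t *journal, struct myfs_inode *mi)
    {
            jbd2_journal_release_jbd_inode(journal, &mi->jinode);
    }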
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index d6e006e67804..4f7cadbb19fa 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -41,7 +41,6 @@ static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); | |||
41 | * new transaction and we can't block without protecting against other | 41 | * new transaction and we can't block without protecting against other |
42 | * processes trying to touch the journal while it is in transition. | 42 | * processes trying to touch the journal while it is in transition. |
43 | * | 43 | * |
44 | * Called under j_state_lock | ||
45 | */ | 44 | */ |
46 | 45 | ||
47 | static transaction_t * | 46 | static transaction_t * |
@@ -52,6 +51,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
52 | transaction->t_tid = journal->j_transaction_sequence++; | 51 | transaction->t_tid = journal->j_transaction_sequence++; |
53 | transaction->t_expires = jiffies + journal->j_commit_interval; | 52 | transaction->t_expires = jiffies + journal->j_commit_interval; |
54 | spin_lock_init(&transaction->t_handle_lock); | 53 | spin_lock_init(&transaction->t_handle_lock); |
54 | INIT_LIST_HEAD(&transaction->t_inode_list); | ||
55 | 55 | ||
56 | /* Set up the commit timer for the new transaction. */ | 56 | /* Set up the commit timer for the new transaction. */ |
57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | 57 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); |
@@ -943,183 +943,6 @@ out: | |||
943 | } | 943 | } |
944 | 944 | ||
945 | /** | 945 | /** |
946 | * int jbd2_journal_dirty_data() - mark a buffer as containing dirty data which | ||
947 | * needs to be flushed before we can commit the | ||
948 | * current transaction. | ||
949 | * @handle: transaction | ||
950 | * @bh: bufferhead to mark | ||
951 | * | ||
952 | * The buffer is placed on the transaction's data list and is marked as | ||
953 | * belonging to the transaction. | ||
954 | * | ||
955 | * Returns error number or 0 on success. | ||
956 | * | ||
957 | * jbd2_journal_dirty_data() can be called via page_launder->ext3_writepage | ||
958 | * by kswapd. | ||
959 | */ | ||
960 | int jbd2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) | ||
961 | { | ||
962 | journal_t *journal = handle->h_transaction->t_journal; | ||
963 | int need_brelse = 0; | ||
964 | struct journal_head *jh; | ||
965 | |||
966 | if (is_handle_aborted(handle)) | ||
967 | return 0; | ||
968 | |||
969 | jh = jbd2_journal_add_journal_head(bh); | ||
970 | JBUFFER_TRACE(jh, "entry"); | ||
971 | |||
972 | /* | ||
973 | * The buffer could *already* be dirty. Writeout can start | ||
974 | * at any time. | ||
975 | */ | ||
976 | jbd_debug(4, "jh: %p, tid:%d\n", jh, handle->h_transaction->t_tid); | ||
977 | |||
978 | /* | ||
979 | * What if the buffer is already part of a running transaction? | ||
980 | * | ||
981 | * There are two cases: | ||
982 | * 1) It is part of the current running transaction. Refile it, | ||
983 | * just in case we have allocated it as metadata, deallocated | ||
984 | * it, then reallocated it as data. | ||
985 | * 2) It is part of the previous, still-committing transaction. | ||
986 | * If all we want to do is to guarantee that the buffer will be | ||
987 | * written to disk before this new transaction commits, then | ||
988 | * being sure that the *previous* transaction has this same | ||
989 | * property is sufficient for us! Just leave it on its old | ||
990 | * transaction. | ||
991 | * | ||
992 | * In case (2), the buffer must not already exist as metadata | ||
993 | * --- that would violate write ordering (a transaction is free | ||
994 | * to write its data at any point, even before the previous | ||
995 | * committing transaction has committed). The caller must | ||
996 | * never, ever allow this to happen: there's nothing we can do | ||
997 | * about it in this layer. | ||
998 | */ | ||
999 | jbd_lock_bh_state(bh); | ||
1000 | spin_lock(&journal->j_list_lock); | ||
1001 | |||
1002 | /* Now that we have bh_state locked, are we really still mapped? */ | ||
1003 | if (!buffer_mapped(bh)) { | ||
1004 | JBUFFER_TRACE(jh, "unmapped buffer, bailing out"); | ||
1005 | goto no_journal; | ||
1006 | } | ||
1007 | |||
1008 | if (jh->b_transaction) { | ||
1009 | JBUFFER_TRACE(jh, "has transaction"); | ||
1010 | if (jh->b_transaction != handle->h_transaction) { | ||
1011 | JBUFFER_TRACE(jh, "belongs to older transaction"); | ||
1012 | J_ASSERT_JH(jh, jh->b_transaction == | ||
1013 | journal->j_committing_transaction); | ||
1014 | |||
1015 | /* @@@ IS THIS TRUE ? */ | ||
1016 | /* | ||
1017 | * Not any more. Scenario: someone does a write() | ||
1018 | * in data=journal mode. The buffer's transaction has | ||
1019 | * moved into commit. Then someone does another | ||
1020 | * write() to the file. We do the frozen data copyout | ||
1021 | * and set b_next_transaction to point to j_running_t. | ||
1022 | * And while we're in that state, someone does a | ||
1023 | * writepage() in an attempt to pageout the same area | ||
1024 | * of the file via a shared mapping. At present that | ||
1025 | * calls jbd2_journal_dirty_data(), and we get right here. | ||
1026 | * It may be too late to journal the data. Simply | ||
1027 | * falling through to the next test will suffice: the | ||
1028 | * data will be dirty and wil be checkpointed. The | ||
1029 | * ordering comments in the next comment block still | ||
1030 | * apply. | ||
1031 | */ | ||
1032 | //J_ASSERT_JH(jh, jh->b_next_transaction == NULL); | ||
1033 | |||
1034 | /* | ||
1035 | * If we're journalling data, and this buffer was | ||
1036 | * subject to a write(), it could be metadata, forget | ||
1037 | * or shadow against the committing transaction. Now, | ||
1038 | * someone has dirtied the same darn page via a mapping | ||
1039 | * and it is being writepage()'d. | ||
1040 | * We *could* just steal the page from commit, with some | ||
1041 | * fancy locking there. Instead, we just skip it - | ||
1042 | * don't tie the page's buffers to the new transaction | ||
1043 | * at all. | ||
1044 | * Implication: if we crash before the writepage() data | ||
1045 | * is written into the filesystem, recovery will replay | ||
1046 | * the write() data. | ||
1047 | */ | ||
1048 | if (jh->b_jlist != BJ_None && | ||
1049 | jh->b_jlist != BJ_SyncData && | ||
1050 | jh->b_jlist != BJ_Locked) { | ||
1051 | JBUFFER_TRACE(jh, "Not stealing"); | ||
1052 | goto no_journal; | ||
1053 | } | ||
1054 | |||
1055 | /* | ||
1056 | * This buffer may be undergoing writeout in commit. We | ||
1057 | * can't return from here and let the caller dirty it | ||
1058 | * again because that can cause the write-out loop in | ||
1059 | * commit to never terminate. | ||
1060 | */ | ||
1061 | if (buffer_dirty(bh)) { | ||
1062 | get_bh(bh); | ||
1063 | spin_unlock(&journal->j_list_lock); | ||
1064 | jbd_unlock_bh_state(bh); | ||
1065 | need_brelse = 1; | ||
1066 | sync_dirty_buffer(bh); | ||
1067 | jbd_lock_bh_state(bh); | ||
1068 | spin_lock(&journal->j_list_lock); | ||
1069 | /* Since we dropped the lock... */ | ||
1070 | if (!buffer_mapped(bh)) { | ||
1071 | JBUFFER_TRACE(jh, "buffer got unmapped"); | ||
1072 | goto no_journal; | ||
1073 | } | ||
1074 | /* The buffer may become locked again at any | ||
1075 | time if it is redirtied */ | ||
1076 | } | ||
1077 | |||
1078 | /* journal_clean_data_list() may have got there first */ | ||
1079 | if (jh->b_transaction != NULL) { | ||
1080 | JBUFFER_TRACE(jh, "unfile from commit"); | ||
1081 | __jbd2_journal_temp_unlink_buffer(jh); | ||
1082 | /* It still points to the committing | ||
1083 | * transaction; move it to this one so | ||
1084 | * that the refile assert checks are | ||
1085 | * happy. */ | ||
1086 | jh->b_transaction = handle->h_transaction; | ||
1087 | } | ||
1088 | /* The buffer will be refiled below */ | ||
1089 | |||
1090 | } | ||
1091 | /* | ||
1092 | * Special case --- the buffer might actually have been | ||
1093 | * allocated and then immediately deallocated in the previous, | ||
1094 | * committing transaction, so might still be left on that | ||
1095 | * transaction's metadata lists. | ||
1096 | */ | ||
1097 | if (jh->b_jlist != BJ_SyncData && jh->b_jlist != BJ_Locked) { | ||
1098 | JBUFFER_TRACE(jh, "not on correct data list: unfile"); | ||
1099 | J_ASSERT_JH(jh, jh->b_jlist != BJ_Shadow); | ||
1100 | __jbd2_journal_temp_unlink_buffer(jh); | ||
1101 | jh->b_transaction = handle->h_transaction; | ||
1102 | JBUFFER_TRACE(jh, "file as data"); | ||
1103 | __jbd2_journal_file_buffer(jh, handle->h_transaction, | ||
1104 | BJ_SyncData); | ||
1105 | } | ||
1106 | } else { | ||
1107 | JBUFFER_TRACE(jh, "not on a transaction"); | ||
1108 | __jbd2_journal_file_buffer(jh, handle->h_transaction, BJ_SyncData); | ||
1109 | } | ||
1110 | no_journal: | ||
1111 | spin_unlock(&journal->j_list_lock); | ||
1112 | jbd_unlock_bh_state(bh); | ||
1113 | if (need_brelse) { | ||
1114 | BUFFER_TRACE(bh, "brelse"); | ||
1115 | __brelse(bh); | ||
1116 | } | ||
1117 | JBUFFER_TRACE(jh, "exit"); | ||
1118 | jbd2_journal_put_journal_head(jh); | ||
1119 | return 0; | ||
1120 | } | ||
1121 | |||
1122 | /** | ||
1123 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata | 946 | * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata |
1124 | * @handle: transaction to add buffer to. | 947 | * @handle: transaction to add buffer to. |
1125 | * @bh: buffer to mark | 948 | * @bh: buffer to mark |
@@ -1541,10 +1364,10 @@ __blist_del_buffer(struct journal_head **list, struct journal_head *jh) | |||
1541 | * Remove a buffer from the appropriate transaction list. | 1364 | * Remove a buffer from the appropriate transaction list. |
1542 | * | 1365 | * |
1543 | * Note that this function can *change* the value of | 1366 | * Note that this function can *change* the value of |
1544 | * bh->b_transaction->t_sync_datalist, t_buffers, t_forget, | 1367 | * bh->b_transaction->t_buffers, t_forget, t_iobuf_list, t_shadow_list, |
1545 | * t_iobuf_list, t_shadow_list, t_log_list or t_reserved_list. If the caller | 1368 | * t_log_list or t_reserved_list. If the caller is holding onto a copy of one |
1546 | * is holding onto a copy of one of thee pointers, it could go bad. | 1369 | * of these pointers, it could go bad. Generally the caller needs to re-read |
1547 | * Generally the caller needs to re-read the pointer from the transaction_t. | 1370 | * the pointer from the transaction_t. |
1548 | * | 1371 | * |
1549 | * Called under j_list_lock. The journal may not be locked. | 1372 | * Called under j_list_lock. The journal may not be locked. |
1550 | */ | 1373 | */ |
@@ -1566,9 +1389,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1566 | switch (jh->b_jlist) { | 1389 | switch (jh->b_jlist) { |
1567 | case BJ_None: | 1390 | case BJ_None: |
1568 | return; | 1391 | return; |
1569 | case BJ_SyncData: | ||
1570 | list = &transaction->t_sync_datalist; | ||
1571 | break; | ||
1572 | case BJ_Metadata: | 1392 | case BJ_Metadata: |
1573 | transaction->t_nr_buffers--; | 1393 | transaction->t_nr_buffers--; |
1574 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); | 1394 | J_ASSERT_JH(jh, transaction->t_nr_buffers >= 0); |
@@ -1589,9 +1409,6 @@ void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh) | |||
1589 | case BJ_Reserved: | 1409 | case BJ_Reserved: |
1590 | list = &transaction->t_reserved_list; | 1410 | list = &transaction->t_reserved_list; |
1591 | break; | 1411 | break; |
1592 | case BJ_Locked: | ||
1593 | list = &transaction->t_locked_list; | ||
1594 | break; | ||
1595 | } | 1412 | } |
1596 | 1413 | ||
1597 | __blist_del_buffer(list, jh); | 1414 | __blist_del_buffer(list, jh); |
@@ -1634,15 +1451,7 @@ __journal_try_to_free_buffer(journal_t *journal, struct buffer_head *bh) | |||
1634 | goto out; | 1451 | goto out; |
1635 | 1452 | ||
1636 | spin_lock(&journal->j_list_lock); | 1453 | spin_lock(&journal->j_list_lock); |
1637 | if (jh->b_transaction != NULL && jh->b_cp_transaction == NULL) { | 1454 | if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { |
1638 | if (jh->b_jlist == BJ_SyncData || jh->b_jlist == BJ_Locked) { | ||
1639 | /* A written-back ordered data buffer */ | ||
1640 | JBUFFER_TRACE(jh, "release data"); | ||
1641 | __jbd2_journal_unfile_buffer(jh); | ||
1642 | jbd2_journal_remove_journal_head(bh); | ||
1643 | __brelse(bh); | ||
1644 | } | ||
1645 | } else if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) { | ||
1646 | /* written-back checkpointed metadata buffer */ | 1455 | /* written-back checkpointed metadata buffer */ |
1647 | if (jh->b_jlist == BJ_None) { | 1456 | if (jh->b_jlist == BJ_None) { |
1648 | JBUFFER_TRACE(jh, "remove from checkpoint list"); | 1457 | JBUFFER_TRACE(jh, "remove from checkpoint list"); |
@@ -1656,12 +1465,43 @@ out: | |||
1656 | return; | 1465 | return; |
1657 | } | 1466 | } |
1658 | 1467 | ||
1468 | /* | ||
1469 | * jbd2_journal_try_to_free_buffers() could race with | ||
1470 | * jbd2_journal_commit_transaction(). The latter might still hold a | ||
1471 | * reference to the buffers when inspecting them on | ||
1472 | * t_syncdata_list or t_locked_list. | ||
1473 | * | ||
1474 | * jbd2_journal_try_to_free_buffers() will call this function to | ||
1475 | * wait for the current transaction to finish syncing data buffers, before | ||
1476 | * trying to free those buffers. | ||
1477 | * | ||
1478 | * Takes journal->j_state_lock internally. | ||
1479 | */ | ||
1480 | static void jbd2_journal_wait_for_transaction_sync_data(journal_t *journal) | ||
1481 | { | ||
1482 | transaction_t *transaction; | ||
1483 | tid_t tid; | ||
1484 | |||
1485 | spin_lock(&journal->j_state_lock); | ||
1486 | transaction = journal->j_committing_transaction; | ||
1487 | |||
1488 | if (!transaction) { | ||
1489 | spin_unlock(&journal->j_state_lock); | ||
1490 | return; | ||
1491 | } | ||
1492 | |||
1493 | tid = transaction->t_tid; | ||
1494 | spin_unlock(&journal->j_state_lock); | ||
1495 | jbd2_log_wait_commit(journal, tid); | ||
1496 | } | ||
1659 | 1497 | ||
1660 | /** | 1498 | /** |
1661 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. | 1499 | * int jbd2_journal_try_to_free_buffers() - try to free page buffers. |
1662 | * @journal: journal for operation | 1500 | * @journal: journal for operation |
1663 | * @page: to try and free | 1501 | * @page: to try and free |
1664 | * @unused_gfp_mask: unused | 1502 | * @gfp_mask: we use the mask to detect how hard we should try to release |
1503 | * buffers. If __GFP_WAIT and __GFP_FS are set, we wait for commit code to | ||
1504 | * release the buffers. | ||
1665 | * | 1505 | * |
1666 | * | 1506 | * |
1667 | * For all the buffers on this page, | 1507 | * For all the buffers on this page, |
@@ -1690,9 +1530,11 @@ out: | |||
1690 | * journal_try_to_free_buffer() is changing its state. But that | 1530 | * journal_try_to_free_buffer() is changing its state. But that |
1691 | * cannot happen because we never reallocate freed data as metadata | 1531 | * cannot happen because we never reallocate freed data as metadata |
1692 | * while the data is part of a transaction. Yes? | 1532 | * while the data is part of a transaction. Yes? |
1533 | * | ||
1534 | * Return 0 on failure, 1 on success | ||
1693 | */ | 1535 | */ |
1694 | int jbd2_journal_try_to_free_buffers(journal_t *journal, | 1536 | int jbd2_journal_try_to_free_buffers(journal_t *journal, |
1695 | struct page *page, gfp_t unused_gfp_mask) | 1537 | struct page *page, gfp_t gfp_mask) |
1696 | { | 1538 | { |
1697 | struct buffer_head *head; | 1539 | struct buffer_head *head; |
1698 | struct buffer_head *bh; | 1540 | struct buffer_head *bh; |
@@ -1708,7 +1550,8 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
1708 | /* | 1550 | /* |
1709 | * We take our own ref against the journal_head here to avoid | 1551 | * We take our own ref against the journal_head here to avoid |
1710 | * having to add tons of locking around each instance of | 1552 | * having to add tons of locking around each instance of |
1711 | * jbd2_journal_remove_journal_head() and jbd2_journal_put_journal_head(). | 1553 | * jbd2_journal_remove_journal_head() and |
1554 | * jbd2_journal_put_journal_head(). | ||
1712 | */ | 1555 | */ |
1713 | jh = jbd2_journal_grab_journal_head(bh); | 1556 | jh = jbd2_journal_grab_journal_head(bh); |
1714 | if (!jh) | 1557 | if (!jh) |
@@ -1721,7 +1564,28 @@ int jbd2_journal_try_to_free_buffers(journal_t *journal, | |||
1721 | if (buffer_jbd(bh)) | 1564 | if (buffer_jbd(bh)) |
1722 | goto busy; | 1565 | goto busy; |
1723 | } while ((bh = bh->b_this_page) != head); | 1566 | } while ((bh = bh->b_this_page) != head); |
1567 | |||
1724 | ret = try_to_free_buffers(page); | 1568 | ret = try_to_free_buffers(page); |
1569 | |||
1570 | /* | ||
1571 | * There are a number of places where jbd2_journal_try_to_free_buffers() | ||
1572 | * could race with jbd2_journal_commit_transaction(); the latter still | ||
1573 | * holds a reference to the buffers while processing them, so | ||
1574 | * try_to_free_buffers() fails to free those buffers. Some callers of | ||
1575 | * releasepage() require the page buffers to be dropped, and otherwise | ||
1576 | * treat a failure to free as an error (such as generic_file_direct_IO()). | ||
1577 | * | ||
1578 | * So, if the caller of try_to_release_page() wants the synchronous | ||
1579 | * behaviour (i.e. make sure buffers are dropped upon return), | ||
1580 | * wait for the current transaction to finish the flush of | ||
1581 | * dirty data buffers, then try to free those buffers again, | ||
1582 | * with the journal locked. | ||
1583 | */ | ||
1584 | if (ret == 0 && (gfp_mask & __GFP_WAIT) && (gfp_mask & __GFP_FS)) { | ||
1585 | jbd2_journal_wait_for_transaction_sync_data(journal); | ||
1586 | ret = try_to_free_buffers(page); | ||
1587 | } | ||
1588 | |||
1725 | busy: | 1589 | busy: |
1726 | return ret; | 1590 | return ret; |
1727 | } | 1591 | } |
@@ -1823,6 +1687,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
1823 | if (!buffer_jbd(bh)) | 1687 | if (!buffer_jbd(bh)) |
1824 | goto zap_buffer_unlocked; | 1688 | goto zap_buffer_unlocked; |
1825 | 1689 | ||
1690 | /* OK, we have a data buffer in journaled mode */ | ||
1826 | spin_lock(&journal->j_state_lock); | 1691 | spin_lock(&journal->j_state_lock); |
1827 | jbd_lock_bh_state(bh); | 1692 | jbd_lock_bh_state(bh); |
1828 | spin_lock(&journal->j_list_lock); | 1693 | spin_lock(&journal->j_list_lock); |
@@ -1886,15 +1751,6 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) | |||
1886 | } | 1751 | } |
1887 | } else if (transaction == journal->j_committing_transaction) { | 1752 | } else if (transaction == journal->j_committing_transaction) { |
1888 | JBUFFER_TRACE(jh, "on committing transaction"); | 1753 | JBUFFER_TRACE(jh, "on committing transaction"); |
1889 | if (jh->b_jlist == BJ_Locked) { | ||
1890 | /* | ||
1891 | * The buffer is on the committing transaction's locked | ||
1892 | * list. We have the buffer locked, so I/O has | ||
1893 | * completed. So we can nail the buffer now. | ||
1894 | */ | ||
1895 | may_free = __dispose_buffer(jh, transaction); | ||
1896 | goto zap_buffer; | ||
1897 | } | ||
1898 | /* | 1754 | /* |
1899 | * If it is committing, we simply cannot touch it. We | 1755 | * If it is committing, we simply cannot touch it. We |
1900 | * can remove its next_transaction pointer from the | 1756 | * can remove its next_transaction pointer from the |
@@ -2027,9 +1883,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
2027 | J_ASSERT_JH(jh, !jh->b_committed_data); | 1883 | J_ASSERT_JH(jh, !jh->b_committed_data); |
2028 | J_ASSERT_JH(jh, !jh->b_frozen_data); | 1884 | J_ASSERT_JH(jh, !jh->b_frozen_data); |
2029 | return; | 1885 | return; |
2030 | case BJ_SyncData: | ||
2031 | list = &transaction->t_sync_datalist; | ||
2032 | break; | ||
2033 | case BJ_Metadata: | 1886 | case BJ_Metadata: |
2034 | transaction->t_nr_buffers++; | 1887 | transaction->t_nr_buffers++; |
2035 | list = &transaction->t_buffers; | 1888 | list = &transaction->t_buffers; |
@@ -2049,9 +1902,6 @@ void __jbd2_journal_file_buffer(struct journal_head *jh, | |||
2049 | case BJ_Reserved: | 1902 | case BJ_Reserved: |
2050 | list = &transaction->t_reserved_list; | 1903 | list = &transaction->t_reserved_list; |
2051 | break; | 1904 | break; |
2052 | case BJ_Locked: | ||
2053 | list = &transaction->t_locked_list; | ||
2054 | break; | ||
2055 | } | 1905 | } |
2056 | 1906 | ||
2057 | __blist_add_buffer(list, jh); | 1907 | __blist_add_buffer(list, jh); |
@@ -2141,3 +1991,88 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) | |||
2141 | spin_unlock(&journal->j_list_lock); | 1991 | spin_unlock(&journal->j_list_lock); |
2142 | __brelse(bh); | 1992 | __brelse(bh); |
2143 | } | 1993 | } |
1994 | |||
1995 | /* | ||
1996 | * File inode in the inode list of the handle's transaction | ||
1997 | */ | ||
1998 | int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) | ||
1999 | { | ||
2000 | transaction_t *transaction = handle->h_transaction; | ||
2001 | journal_t *journal = transaction->t_journal; | ||
2002 | |||
2003 | if (is_handle_aborted(handle)) | ||
2004 | return -EIO; | ||
2005 | |||
2006 | jbd_debug(4, "Adding inode %lu, tid:%d\n", jinode->i_vfs_inode->i_ino, | ||
2007 | transaction->t_tid); | ||
2008 | |||
2009 | /* | ||
2010 | * First check whether inode isn't already on the transaction's | ||
2011 | * lists without taking the lock. Note that this check is safe | ||
2012 | * without the lock as we cannot race with somebody removing the inode | ||
2013 | * from the transaction. The reason is that we remove inode from the | ||
2014 | * transaction only in journal_release_jbd_inode() and when we commit | ||
2015 | * the transaction. We are guarded from the first case by holding | ||
2016 | * a reference to the inode. We are safe against the second case | ||
2017 | * because if jinode->i_transaction == transaction, commit code | ||
2018 | * cannot touch the transaction because we hold a reference to it, | ||
2019 | * and if jinode->i_next_transaction == transaction, commit code | ||
2020 | * will only file the inode where we want it. | ||
2021 | */ | ||
2022 | if (jinode->i_transaction == transaction || | ||
2023 | jinode->i_next_transaction == transaction) | ||
2024 | return 0; | ||
2025 | |||
2026 | spin_lock(&journal->j_list_lock); | ||
2027 | |||
2028 | if (jinode->i_transaction == transaction || | ||
2029 | jinode->i_next_transaction == transaction) | ||
2030 | goto done; | ||
2031 | |||
2032 | /* On some different transaction's list - should be | ||
2033 | * the committing one */ | ||
2034 | if (jinode->i_transaction) { | ||
2035 | J_ASSERT(jinode->i_next_transaction == NULL); | ||
2036 | J_ASSERT(jinode->i_transaction == | ||
2037 | journal->j_committing_transaction); | ||
2038 | jinode->i_next_transaction = transaction; | ||
2039 | goto done; | ||
2040 | } | ||
2041 | /* Not on any transaction list... */ | ||
2042 | J_ASSERT(!jinode->i_next_transaction); | ||
2043 | jinode->i_transaction = transaction; | ||
2044 | list_add(&jinode->i_list, &transaction->t_inode_list); | ||
2045 | done: | ||
2046 | spin_unlock(&journal->j_list_lock); | ||
2047 | |||
2048 | return 0; | ||
2049 | } | ||
2050 | |||
2051 | /* | ||
2052 | * This function must be called when an inode is journaled in ordered mode, | ||
2053 | * before truncation happens. It starts writeout of the truncated part in | ||
2054 | * case it is in the committing transaction, so that we uphold the ordered- | ||
2055 | * mode consistency guarantees. | ||
2056 | */ | ||
2057 | int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, | ||
2058 | loff_t new_size) | ||
2059 | { | ||
2060 | journal_t *journal; | ||
2061 | transaction_t *commit_trans; | ||
2062 | int ret = 0; | ||
2063 | |||
2064 | if (!inode->i_transaction && !inode->i_next_transaction) | ||
2065 | goto out; | ||
2066 | journal = inode->i_transaction->t_journal; | ||
2067 | spin_lock(&journal->j_state_lock); | ||
2068 | commit_trans = journal->j_committing_transaction; | ||
2069 | spin_unlock(&journal->j_state_lock); | ||
2070 | if (inode->i_transaction == commit_trans) { | ||
2071 | ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, | ||
2072 | new_size, LLONG_MAX); | ||
2073 | if (ret) | ||
2074 | jbd2_journal_abort(journal, ret); | ||
2075 | } | ||
2076 | out: | ||
2077 | return ret; | ||
2078 | } | ||
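A hedged sketch of the intended call site (the truncate path itself is assumed, not shown in this diff): an ordered-mode filesystem invokes the helper before shrinking i_size, so data beyond the new end of file is written out before the committing transaction can complete with a stale size.

    /* Sketch: ordered-mode truncate prologue. 'ji' would be the
     * jbd2_inode embedded in the filesystem's in-core inode. */
    static int example_truncate_prologue(struct jbd2_inode *ji,
                                         loff_t new_size)
    {
            /* Writes out [new_size, EOF) only if the inode belongs to
             * the committing transaction; otherwise returns 0 at once. */
            return jbd2_journal_begin_ordered_truncate(ji, new_size);
    }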
diff --git a/fs/mpage.c b/fs/mpage.c index 235e4d3873a8..dbcc7af76a15 100644 --- a/fs/mpage.c +++ b/fs/mpage.c | |||
@@ -82,7 +82,7 @@ static void mpage_end_io_write(struct bio *bio, int err) | |||
82 | bio_put(bio); | 82 | bio_put(bio); |
83 | } | 83 | } |
84 | 84 | ||
85 | static struct bio *mpage_bio_submit(int rw, struct bio *bio) | 85 | struct bio *mpage_bio_submit(int rw, struct bio *bio) |
86 | { | 86 | { |
87 | bio->bi_end_io = mpage_end_io_read; | 87 | bio->bi_end_io = mpage_end_io_read; |
88 | if (rw == WRITE) | 88 | if (rw == WRITE) |
@@ -90,6 +90,7 @@ static struct bio *mpage_bio_submit(int rw, struct bio *bio) | |||
90 | submit_bio(rw, bio); | 90 | submit_bio(rw, bio); |
91 | return NULL; | 91 | return NULL; |
92 | } | 92 | } |
93 | EXPORT_SYMBOL(mpage_bio_submit); | ||
93 | 94 | ||
94 | static struct bio * | 95 | static struct bio * |
95 | mpage_alloc(struct block_device *bdev, | 96 | mpage_alloc(struct block_device *bdev, |
@@ -435,15 +436,9 @@ EXPORT_SYMBOL(mpage_readpage); | |||
435 | * written, so it can intelligently allocate a suitably-sized BIO. For now, | 436 | * written, so it can intelligently allocate a suitably-sized BIO. For now, |
436 | * just allocate full-size (16-page) BIOs. | 437 | * just allocate full-size (16-page) BIOs. |
437 | */ | 438 | */ |
438 | struct mpage_data { | ||
439 | struct bio *bio; | ||
440 | sector_t last_block_in_bio; | ||
441 | get_block_t *get_block; | ||
442 | unsigned use_writepage; | ||
443 | }; | ||
444 | 439 | ||
445 | static int __mpage_writepage(struct page *page, struct writeback_control *wbc, | 440 | int __mpage_writepage(struct page *page, struct writeback_control *wbc, |
446 | void *data) | 441 | void *data) |
447 | { | 442 | { |
448 | struct mpage_data *mpd = data; | 443 | struct mpage_data *mpd = data; |
449 | struct bio *bio = mpd->bio; | 444 | struct bio *bio = mpd->bio; |
@@ -651,6 +646,7 @@ out: | |||
651 | mpd->bio = bio; | 646 | mpd->bio = bio; |
652 | return ret; | 647 | return ret; |
653 | } | 648 | } |
649 | EXPORT_SYMBOL(__mpage_writepage); | ||
654 | 650 | ||
655 | /** | 651 | /** |
656 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them | 652 | * mpage_writepages - walk the list of dirty pages of the given address space & writepage() all of them |
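Un-staticing __mpage_writepage() and mpage_bio_submit() lets a filesystem reuse the generic mpage machinery while supplying its own get_block callback. A sketch under that assumption (function and callback names are illustrative; the body mirrors what mpage_writepages() itself does):

    static int example_writepages(struct address_space *mapping,
                                  struct writeback_control *wbc,
                                  get_block_t *my_get_block)
    {
            struct mpage_data mpd = {
                    .bio = NULL,
                    .last_block_in_bio = 0,
                    .get_block = my_get_block,
                    .use_writepage = 1,
            };
            int ret = write_cache_pages(mapping, wbc,
                                        __mpage_writepage, &mpd);

            /* __mpage_writepage() keeps the current BIO open so pages
             * can be merged across calls; submit whatever is left. */
            if (mpd.bio)
                    mpage_bio_submit(WRITE, mpd.bio);
            return ret;
    }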
diff --git a/include/linux/fs.h b/include/linux/fs.h index faac13e2cc5c..52e510a0aec2 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h | |||
@@ -1740,6 +1740,8 @@ extern int wait_on_page_writeback_range(struct address_space *mapping, | |||
1740 | pgoff_t start, pgoff_t end); | 1740 | pgoff_t start, pgoff_t end); |
1741 | extern int __filemap_fdatawrite_range(struct address_space *mapping, | 1741 | extern int __filemap_fdatawrite_range(struct address_space *mapping, |
1742 | loff_t start, loff_t end, int sync_mode); | 1742 | loff_t start, loff_t end, int sync_mode); |
1743 | extern int filemap_fdatawrite_range(struct address_space *mapping, | ||
1744 | loff_t start, loff_t end); | ||
1743 | 1745 | ||
1744 | extern long do_fsync(struct file *file, int datasync); | 1746 | extern long do_fsync(struct file *file, int datasync); |
1745 | extern void sync_supers(void); | 1747 | extern void sync_supers(void); |
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index d147f0f90360..3dd209007098 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h | |||
@@ -168,6 +168,8 @@ struct commit_header { | |||
168 | unsigned char h_chksum_size; | 168 | unsigned char h_chksum_size; |
169 | unsigned char h_padding[2]; | 169 | unsigned char h_padding[2]; |
170 | __be32 h_chksum[JBD2_CHECKSUM_BYTES]; | 170 | __be32 h_chksum[JBD2_CHECKSUM_BYTES]; |
171 | __be64 h_commit_sec; /* commit time in seconds */ | ||
172 | __be32 h_commit_nsec; /* and its nanoseconds part */ | ||
171 | }; | 173 | }; |
172 | 174 | ||
173 | /* | 175 | /* |
@@ -379,6 +381,38 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) | |||
379 | bit_spin_unlock(BH_JournalHead, &bh->b_state); | 381 | bit_spin_unlock(BH_JournalHead, &bh->b_state); |
380 | } | 382 | } |
381 | 383 | ||
384 | /* Flags in jbd2_inode->i_flags */ | ||
385 | #define __JI_COMMIT_RUNNING 0 | ||
386 | /* Commit of the inode's data is in progress. We use this flag to protect | ||
387 | * against concurrent deletion of the inode. We cannot take a reference to | ||
388 | * the inode for this since we cannot afford doing the last iput() on | ||
389 | * behalf of kjournald. */ | ||
390 | #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) | ||
391 | |||
392 | /** | ||
393 | * struct jbd2_inode is the structure linking the inodes present in a | ||
394 | * transaction in ordered mode, so that we can sync them during commit. | ||
395 | */ | ||
396 | struct jbd2_inode { | ||
397 | /* Which transaction does this inode belong to? Either the running | ||
398 | * transaction or the committing one. [j_list_lock] */ | ||
399 | transaction_t *i_transaction; | ||
400 | |||
401 | /* Pointer to the running transaction modifying the inode's data in case | ||
402 | * a committing transaction is already touching it. [j_list_lock] */ | ||
403 | transaction_t *i_next_transaction; | ||
404 | |||
405 | /* List of inodes in the i_transaction [j_list_lock] */ | ||
406 | struct list_head i_list; | ||
407 | |||
408 | /* VFS inode this inode belongs to [constant during the lifetime | ||
409 | * of the structure] */ | ||
410 | struct inode *i_vfs_inode; | ||
411 | |||
412 | /* Flags of inode [j_list_lock] */ | ||
413 | unsigned int i_flags; | ||
414 | }; | ||
415 | |||
382 | struct jbd2_revoke_table_s; | 416 | struct jbd2_revoke_table_s; |
383 | 417 | ||
384 | /** | 418 | /** |
@@ -509,24 +543,12 @@ struct transaction_s | |||
509 | struct journal_head *t_reserved_list; | 543 | struct journal_head *t_reserved_list; |
510 | 544 | ||
511 | /* | 545 | /* |
512 | * Doubly-linked circular list of all buffers under writeout during | ||
513 | * commit [j_list_lock] | ||
514 | */ | ||
515 | struct journal_head *t_locked_list; | ||
516 | |||
517 | /* | ||
518 | * Doubly-linked circular list of all metadata buffers owned by this | 546 | * Doubly-linked circular list of all metadata buffers owned by this |
519 | * transaction [j_list_lock] | 547 | * transaction [j_list_lock] |
520 | */ | 548 | */ |
521 | struct journal_head *t_buffers; | 549 | struct journal_head *t_buffers; |
522 | 550 | ||
523 | /* | 551 | /* |
524 | * Doubly-linked circular list of all data buffers still to be | ||
525 | * flushed before this transaction can be committed [j_list_lock] | ||
526 | */ | ||
527 | struct journal_head *t_sync_datalist; | ||
528 | |||
529 | /* | ||
530 | * Doubly-linked circular list of all forget buffers (superseded | 552 | * Doubly-linked circular list of all forget buffers (superseded |
531 | * buffers which we can un-checkpoint once this transaction commits) | 553 | * buffers which we can un-checkpoint once this transaction commits) |
532 | * [j_list_lock] | 554 | * [j_list_lock] |
@@ -565,6 +587,12 @@ struct transaction_s | |||
565 | struct journal_head *t_log_list; | 587 | struct journal_head *t_log_list; |
566 | 588 | ||
567 | /* | 589 | /* |
590 | * List of inodes whose data we've modified in data=ordered mode. | ||
591 | * [j_list_lock] | ||
592 | */ | ||
593 | struct list_head t_inode_list; | ||
594 | |||
595 | /* | ||
568 | * Protects info related to handles | 596 | * Protects info related to handles |
569 | */ | 597 | */ |
570 | spinlock_t t_handle_lock; | 598 | spinlock_t t_handle_lock; |
@@ -1004,7 +1032,6 @@ extern int jbd2_journal_extend (handle_t *, int nblocks); | |||
1004 | extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); | 1032 | extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); |
1005 | extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); | 1033 | extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); |
1006 | extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); | 1034 | extern int jbd2_journal_get_undo_access(handle_t *, struct buffer_head *); |
1007 | extern int jbd2_journal_dirty_data (handle_t *, struct buffer_head *); | ||
1008 | extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); | 1035 | extern int jbd2_journal_dirty_metadata (handle_t *, struct buffer_head *); |
1009 | extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); | 1036 | extern void jbd2_journal_release_buffer (handle_t *, struct buffer_head *); |
1010 | extern int jbd2_journal_forget (handle_t *, struct buffer_head *); | 1037 | extern int jbd2_journal_forget (handle_t *, struct buffer_head *); |
@@ -1044,6 +1071,10 @@ extern void jbd2_journal_ack_err (journal_t *); | |||
1044 | extern int jbd2_journal_clear_err (journal_t *); | 1071 | extern int jbd2_journal_clear_err (journal_t *); |
1045 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); | 1072 | extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); |
1046 | extern int jbd2_journal_force_commit(journal_t *); | 1073 | extern int jbd2_journal_force_commit(journal_t *); |
1074 | extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); | ||
1075 | extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); | ||
1076 | extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); | ||
1077 | extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); | ||
1047 | 1078 | ||
1048 | /* | 1079 | /* |
1049 | * journal_head management | 1080 | * journal_head management |
@@ -1179,15 +1210,13 @@ static inline int jbd_space_needed(journal_t *journal) | |||
1179 | 1210 | ||
1180 | /* journaling buffer types */ | 1211 | /* journaling buffer types */ |
1181 | #define BJ_None 0 /* Not journaled */ | 1212 | #define BJ_None 0 /* Not journaled */ |
1182 | #define BJ_SyncData 1 /* Normal data: flush before commit */ | 1213 | #define BJ_Metadata 1 /* Normal journaled metadata */ |
1183 | #define BJ_Metadata 2 /* Normal journaled metadata */ | 1214 | #define BJ_Forget 2 /* Buffer superseded by this transaction */ |
1184 | #define BJ_Forget 3 /* Buffer superseded by this transaction */ | 1215 | #define BJ_IO 3 /* Buffer is for temporary IO use */ |
1185 | #define BJ_IO 4 /* Buffer is for temporary IO use */ | 1216 | #define BJ_Shadow 4 /* Buffer contents being shadowed to the log */ |
1186 | #define BJ_Shadow 5 /* Buffer contents being shadowed to the log */ | 1217 | #define BJ_LogCtl 5 /* Buffer contains log descriptors */ |
1187 | #define BJ_LogCtl 6 /* Buffer contains log descriptors */ | 1218 | #define BJ_Reserved 6 /* Buffer is reserved for access by journal */ |
1188 | #define BJ_Reserved 7 /* Buffer is reserved for access by journal */ | 1219 | #define BJ_Types 7 |
1189 | #define BJ_Locked 8 /* Locked for I/O during commit */ | ||
1190 | #define BJ_Types 9 | ||
1191 | 1220 | ||
1192 | extern int jbd_blocks_per_page(struct inode *inode); | 1221 | extern int jbd_blocks_per_page(struct inode *inode); |
1193 | 1222 | ||
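Taken together, the new jbd2_inode hooks give each ordered-mode inode a small lifecycle. A sketch of the call sequence (in practice the jbd2_inode is embedded in the filesystem's in-core inode rather than a local as here; the function is illustrative only):

    static void example_jbd2_inode_lifecycle(journal_t *journal,
                                             handle_t *handle,
                                             struct inode *vfs_inode)
    {
            struct jbd2_inode ji;

            /* At inode setup: bind the jbd2_inode to its VFS inode. */
            jbd2_journal_init_jbd_inode(&ji, vfs_inode);

            /* On each ordered-mode data write, under a running handle. */
            if (jbd2_journal_file_inode(handle, &ji))
                    return;

            /* At inode teardown: waits while JI_COMMIT_RUNNING is set,
             * so commit never touches a freed jbd2_inode. */
            jbd2_journal_release_jbd_inode(journal, &ji);
    }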
diff --git a/include/linux/mpage.h b/include/linux/mpage.h index 068a0c9946af..5c42821da2d1 100644 --- a/include/linux/mpage.h +++ b/include/linux/mpage.h | |||
@@ -11,11 +11,21 @@ | |||
11 | */ | 11 | */ |
12 | #ifdef CONFIG_BLOCK | 12 | #ifdef CONFIG_BLOCK |
13 | 13 | ||
14 | struct mpage_data { | ||
15 | struct bio *bio; | ||
16 | sector_t last_block_in_bio; | ||
17 | get_block_t *get_block; | ||
18 | unsigned use_writepage; | ||
19 | }; | ||
20 | |||
14 | struct writeback_control; | 21 | struct writeback_control; |
15 | 22 | ||
23 | struct bio *mpage_bio_submit(int rw, struct bio *bio); | ||
16 | int mpage_readpages(struct address_space *mapping, struct list_head *pages, | 24 | int mpage_readpages(struct address_space *mapping, struct list_head *pages, |
17 | unsigned nr_pages, get_block_t get_block); | 25 | unsigned nr_pages, get_block_t get_block); |
18 | int mpage_readpage(struct page *page, get_block_t get_block); | 26 | int mpage_readpage(struct page *page, get_block_t get_block); |
27 | int __mpage_writepage(struct page *page, struct writeback_control *wbc, | ||
28 | void *data); | ||
19 | int mpage_writepages(struct address_space *mapping, | 29 | int mpage_writepages(struct address_space *mapping, |
20 | struct writeback_control *wbc, get_block_t get_block); | 30 | struct writeback_control *wbc, get_block_t get_block); |
21 | int mpage_writepage(struct page *page, get_block_t *get_block, | 31 | int mpage_writepage(struct page *page, get_block_t *get_block, |
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 9007ccdfc112..208388835357 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h | |||
@@ -35,7 +35,7 @@ int percpu_counter_init_irq(struct percpu_counter *fbc, s64 amount); | |||
35 | void percpu_counter_destroy(struct percpu_counter *fbc); | 35 | void percpu_counter_destroy(struct percpu_counter *fbc); |
36 | void percpu_counter_set(struct percpu_counter *fbc, s64 amount); | 36 | void percpu_counter_set(struct percpu_counter *fbc, s64 amount); |
37 | void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); | 37 | void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch); |
38 | s64 __percpu_counter_sum(struct percpu_counter *fbc); | 38 | s64 __percpu_counter_sum(struct percpu_counter *fbc, int set); |
39 | 39 | ||
40 | static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) | 40 | static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) |
41 | { | 41 | { |
@@ -44,13 +44,19 @@ static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount) | |||
44 | 44 | ||
45 | static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) | 45 | static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc) |
46 | { | 46 | { |
47 | s64 ret = __percpu_counter_sum(fbc); | 47 | s64 ret = __percpu_counter_sum(fbc, 0); |
48 | return ret < 0 ? 0 : ret; | 48 | return ret < 0 ? 0 : ret; |
49 | } | 49 | } |
50 | 50 | ||
51 | static inline s64 percpu_counter_sum_and_set(struct percpu_counter *fbc) | ||
52 | { | ||
53 | return __percpu_counter_sum(fbc, 1); | ||
54 | } | ||
55 | |||
56 | |||
51 | static inline s64 percpu_counter_sum(struct percpu_counter *fbc) | 57 | static inline s64 percpu_counter_sum(struct percpu_counter *fbc) |
52 | { | 58 | { |
53 | return __percpu_counter_sum(fbc); | 59 | return __percpu_counter_sum(fbc, 0); |
54 | } | 60 | } |
55 | 61 | ||
56 | static inline s64 percpu_counter_read(struct percpu_counter *fbc) | 62 | static inline s64 percpu_counter_read(struct percpu_counter *fbc) |
diff --git a/include/linux/writeback.h b/include/linux/writeback.h index bd91987c065f..12b15c561a1f 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h | |||
@@ -63,6 +63,7 @@ struct writeback_control { | |||
63 | unsigned for_writepages:1; /* This is a writepages() call */ | 63 | unsigned for_writepages:1; /* This is a writepages() call */ |
64 | unsigned range_cyclic:1; /* range_start is cyclic */ | 64 | unsigned range_cyclic:1; /* range_start is cyclic */ |
65 | unsigned more_io:1; /* more io to be dispatched */ | 65 | unsigned more_io:1; /* more io to be dispatched */ |
66 | unsigned range_cont:1; /* continue writeback from range_start */ | ||
66 | }; | 67 | }; |
67 | 68 | ||
68 | /* | 69 | /* |
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 119174494cb5..4a8ba4bf5f6f 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c | |||
@@ -52,7 +52,7 @@ EXPORT_SYMBOL(__percpu_counter_add); | |||
52 | * Add up all the per-cpu counts, return the result. This is a more accurate | 52 | * Add up all the per-cpu counts, return the result. This is a more accurate |
53 | * but much slower version of percpu_counter_read_positive() | 53 | * but much slower version of percpu_counter_read_positive() |
54 | */ | 54 | */ |
55 | s64 __percpu_counter_sum(struct percpu_counter *fbc) | 55 | s64 __percpu_counter_sum(struct percpu_counter *fbc, int set) |
56 | { | 56 | { |
57 | s64 ret; | 57 | s64 ret; |
58 | int cpu; | 58 | int cpu; |
@@ -62,7 +62,12 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) | |||
62 | for_each_online_cpu(cpu) { | 62 | for_each_online_cpu(cpu) { |
63 | s32 *pcount = per_cpu_ptr(fbc->counters, cpu); | 63 | s32 *pcount = per_cpu_ptr(fbc->counters, cpu); |
64 | ret += *pcount; | 64 | ret += *pcount; |
65 | if (set) | ||
66 | *pcount = 0; | ||
65 | } | 67 | } |
68 | if (set) | ||
69 | fbc->count = ret; | ||
70 | |||
66 | spin_unlock(&fbc->lock); | 71 | spin_unlock(&fbc->lock); |
67 | return ret; | 72 | return ret; |
68 | } | 73 | } |
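The set flag turns __percpu_counter_sum() from a pure reader into a consolidation step: each per-cpu delta is folded into fbc->count and zeroed, so the cached count stays exact until new per-cpu updates accumulate. A sketch of the semantics (the values are illustrative assumptions):

    static void example_sum_and_set(void)
    {
            struct percpu_counter free_blocks;
            s64 exact;

            if (percpu_counter_init(&free_blocks, 1000))
                    return;
            percpu_counter_add(&free_blocks, -8);  /* cheap per-cpu update */

            /* Accurate total (992 here); also makes a subsequent
             * percpu_counter_read() return the same exact value. */
            exact = percpu_counter_sum_and_set(&free_blocks);

            percpu_counter_destroy(&free_blocks);
    }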
diff --git a/mm/filemap.c b/mm/filemap.c index 1e6a7d34874f..65d9d9e2b755 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
@@ -236,11 +236,12 @@ int filemap_fdatawrite(struct address_space *mapping) | |||
236 | } | 236 | } |
237 | EXPORT_SYMBOL(filemap_fdatawrite); | 237 | EXPORT_SYMBOL(filemap_fdatawrite); |
238 | 238 | ||
239 | static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, | 239 | int filemap_fdatawrite_range(struct address_space *mapping, loff_t start, |
240 | loff_t end) | 240 | loff_t end) |
241 | { | 241 | { |
242 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); | 242 | return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL); |
243 | } | 243 | } |
244 | EXPORT_SYMBOL(filemap_fdatawrite_range); | ||
244 | 245 | ||
245 | /** | 246 | /** |
246 | * filemap_flush - mostly a non-blocking flush | 247 | * filemap_flush - mostly a non-blocking flush |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b38f700825fc..94c6d8988ab3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -960,6 +960,9 @@ retry: | |||
960 | } | 960 | } |
961 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) | 961 | if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) |
962 | mapping->writeback_index = index; | 962 | mapping->writeback_index = index; |
963 | |||
964 | if (wbc->range_cont) | ||
965 | wbc->range_start = (loff_t)index << PAGE_CACHE_SHIFT; | ||
963 | return ret; | 966 | return ret; |
964 | } | 967 | } |
965 | EXPORT_SYMBOL(write_cache_pages); | 968 | EXPORT_SYMBOL(write_cache_pages); |
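The new range_cont flag pairs with this change: when set, the next writeback call resumes where the previous one stopped instead of rescanning from the original range_start. A usage sketch (the looping policy is an assumption, modeled on the delayed-allocation writeback this flag was added for):

    static void example_range_cont(struct address_space *mapping)
    {
            struct writeback_control wbc = {
                    .sync_mode   = WB_SYNC_NONE,
                    .range_start = 0,
                    .range_end   = LLONG_MAX,
                    .range_cont  = 1,
            };
            int pass;

            for (pass = 0; pass < 2; pass++) {
                    wbc.nr_to_write = 1024; /* refill the quota per pass */
                    /* On return, write_cache_pages() has advanced
                     * wbc.range_start past the last page written. */
                    do_writepages(mapping, &wbc);
            }
    }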