diff options
author | Theodore Ts'o <tytso@mit.edu> | 2010-12-14 15:27:50 -0500 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2010-12-14 15:27:50 -0500 |
commit | 1449032be17abb69116dbc393f67ceb8bd034f92 (patch) | |
tree | f62757457241c2fdc14105afc12cb2718f7a2e68 /fs | |
parent | e8a7e48bb248a1196484d3f8afa53bded2b24e71 (diff) |
ext4: Turn off multiple page-io submission by default
Jon Nelson has found a test case which causes postgresql to fail with
the error:
psql:t.sql:4: ERROR: invalid page header in block 38269 of relation base/16384/16581
Under memory pressure, it looks like part of a file can end up getting
replaced by zero's. Until we can figure out the cause, we'll roll
back the change and use block_write_full_page() instead of
ext4_bio_write_page(). The new, more efficient writing function can
be used via the mount option mblk_io_submit, so we can test and fix
the new page I/O code.
To reproduce the problem, install postgres 8.4 or 9.0, and pin enough
memory such that the system just at the end of triggering writeback
before running the following sql script:
begin;
create temporary table foo as select x as a, ARRAY[x] as b FROM
generate_series(1, 10000000 ) AS x;
create index foo_a_idx on foo (a);
create index foo_b_idx on foo USING GIN (b);
rollback;
If the temporary table is created on a hard drive partition which is
encrypted using dm_crypt, then under memory pressure, approximately
30-40% of the time, pgsql will issue the above failure.
This patch should fix this problem, and the problem will come back if
the file system is mounted with the mblk_io_submit mount option.
Reported-by: Jon Nelson <jnelson@jamponi.net>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/ext4/ext4.h | 1 | ||||
-rw-r--r-- | fs/ext4/inode.c | 5 | ||||
-rw-r--r-- | fs/ext4/super.c | 14 |
3 files changed, 17 insertions, 3 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6a5edea2d70b..94ce3d7a1c4b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -910,6 +910,7 @@ struct ext4_inode_info { | |||
910 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | 910 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ |
911 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 911 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
912 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | 912 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ |
913 | #define EXT4_MOUNT_MBLK_IO_SUBMIT 0x4000000 /* multi-block io submits */ | ||
913 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | 914 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
914 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ | 915 | #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ |
915 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ | 916 | #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bdbe69902207..e659597b690b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2125,9 +2125,12 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd, | |||
2125 | */ | 2125 | */ |
2126 | if (unlikely(journal_data && PageChecked(page))) | 2126 | if (unlikely(journal_data && PageChecked(page))) |
2127 | err = __ext4_journalled_writepage(page, len); | 2127 | err = __ext4_journalled_writepage(page, len); |
2128 | else | 2128 | else if (test_opt(inode->i_sb, MBLK_IO_SUBMIT)) |
2129 | err = ext4_bio_write_page(&io_submit, page, | 2129 | err = ext4_bio_write_page(&io_submit, page, |
2130 | len, mpd->wbc); | 2130 | len, mpd->wbc); |
2131 | else | ||
2132 | err = block_write_full_page(page, | ||
2133 | noalloc_get_block_write, mpd->wbc); | ||
2131 | 2134 | ||
2132 | if (!err) | 2135 | if (!err) |
2133 | mpd->pages_written++; | 2136 | mpd->pages_written++; |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e32195d6aac3..fb15c9c0be74 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -1026,6 +1026,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
1026 | !(def_mount_opts & EXT4_DEFM_NODELALLOC)) | 1026 | !(def_mount_opts & EXT4_DEFM_NODELALLOC)) |
1027 | seq_puts(seq, ",nodelalloc"); | 1027 | seq_puts(seq, ",nodelalloc"); |
1028 | 1028 | ||
1029 | if (test_opt(sb, MBLK_IO_SUBMIT)) | ||
1030 | seq_puts(seq, ",mblk_io_submit"); | ||
1029 | if (sbi->s_stripe) | 1031 | if (sbi->s_stripe) |
1030 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); | 1032 | seq_printf(seq, ",stripe=%lu", sbi->s_stripe); |
1031 | /* | 1033 | /* |
@@ -1239,8 +1241,8 @@ enum { | |||
1239 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, | 1241 | Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, |
1240 | Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, | 1242 | Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, |
1241 | Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, | 1243 | Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, |
1242 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, | 1244 | Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, |
1243 | Opt_block_validity, Opt_noblock_validity, | 1245 | Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, |
1244 | Opt_inode_readahead_blks, Opt_journal_ioprio, | 1246 | Opt_inode_readahead_blks, Opt_journal_ioprio, |
1245 | Opt_dioread_nolock, Opt_dioread_lock, | 1247 | Opt_dioread_nolock, Opt_dioread_lock, |
1246 | Opt_discard, Opt_nodiscard, | 1248 | Opt_discard, Opt_nodiscard, |
@@ -1304,6 +1306,8 @@ static const match_table_t tokens = { | |||
1304 | {Opt_resize, "resize"}, | 1306 | {Opt_resize, "resize"}, |
1305 | {Opt_delalloc, "delalloc"}, | 1307 | {Opt_delalloc, "delalloc"}, |
1306 | {Opt_nodelalloc, "nodelalloc"}, | 1308 | {Opt_nodelalloc, "nodelalloc"}, |
1309 | {Opt_mblk_io_submit, "mblk_io_submit"}, | ||
1310 | {Opt_nomblk_io_submit, "nomblk_io_submit"}, | ||
1307 | {Opt_block_validity, "block_validity"}, | 1311 | {Opt_block_validity, "block_validity"}, |
1308 | {Opt_noblock_validity, "noblock_validity"}, | 1312 | {Opt_noblock_validity, "noblock_validity"}, |
1309 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, | 1313 | {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, |
@@ -1725,6 +1729,12 @@ set_qf_format: | |||
1725 | case Opt_nodelalloc: | 1729 | case Opt_nodelalloc: |
1726 | clear_opt(sbi->s_mount_opt, DELALLOC); | 1730 | clear_opt(sbi->s_mount_opt, DELALLOC); |
1727 | break; | 1731 | break; |
1732 | case Opt_mblk_io_submit: | ||
1733 | set_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); | ||
1734 | break; | ||
1735 | case Opt_nomblk_io_submit: | ||
1736 | clear_opt(sbi->s_mount_opt, MBLK_IO_SUBMIT); | ||
1737 | break; | ||
1728 | case Opt_stripe: | 1738 | case Opt_stripe: |
1729 | if (match_int(&args[0], &option)) | 1739 | if (match_int(&args[0], &option)) |
1730 | return 0; | 1740 | return 0; |