Diffstat (limited to 'fs/ext4')
-rw-r--r--  fs/ext4/balloc.c          |   18
-rw-r--r--  fs/ext4/dir.c             |  158
-rw-r--r--  fs/ext4/ext4.h            |  189
-rw-r--r--  fs/ext4/ext4_jbd2.c       |   58
-rw-r--r--  fs/ext4/ext4_jbd2.h       |   29
-rw-r--r--  fs/ext4/extents.c         |  214
-rw-r--r--  fs/ext4/extents_status.c  |  144
-rw-r--r--  fs/ext4/extents_status.h  |    5
-rw-r--r--  fs/ext4/file.c            |   38
-rw-r--r--  fs/ext4/fsync.c           |   52
-rw-r--r--  fs/ext4/ialloc.c          |   13
-rw-r--r--  fs/ext4/indirect.c        |   40
-rw-r--r--  fs/ext4/inline.c          |  168
-rw-r--r--  fs/ext4/inode.c           | 1791
-rw-r--r--  fs/ext4/ioctl.c           |    6
-rw-r--r--  fs/ext4/mballoc.c         |   32
-rw-r--r--  fs/ext4/move_extent.c     |    3
-rw-r--r--  fs/ext4/namei.c           |   54
-rw-r--r--  fs/ext4/page-io.c         |  336
-rw-r--r--  fs/ext4/resize.c          |   24
-rw-r--r--  fs/ext4/super.c           |  189
21 files changed, 1839 insertions(+), 1722 deletions(-)
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d0f13eada0ed..ddd715e42a5c 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -38,8 +38,8 @@ ext4_group_t ext4_get_group_number(struct super_block *sb,
 	ext4_group_t group;
 
 	if (test_opt2(sb, STD_GROUP_SIZE))
-		group = (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
-			 block) >>
-			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
+		group = (block -
+			 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >>
+			(EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3);
 	else
 		ext4_get_group_no_and_offset(sb, block, &group, NULL);
@@ -682,11 +682,15 @@ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb)
 
 static inline int test_root(ext4_group_t a, int b)
 {
-	int num = b;
-
-	while (a > num)
-		num *= b;
-	return num == a;
+	while (1) {
+		if (a < b)
+			return 0;
+		if (a == b)
+			return 1;
+		if ((a % b) != 0)
+			return 0;
+		a = a / b;
+	}
 }
 
 static int ext4_group_sparse(ext4_group_t group)
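
The rewritten test_root() above decides whether a is an exact power of b; ext4_group_sparse() uses it to tell which block groups carry superblock backups (powers of 3, 5 and 7). The old loop multiplied num up toward a, which can overflow for very large group numbers; the new loop divides a down and bails out early on any non-zero remainder. A minimal userspace sketch of the same logic — the function name and test values are illustrative only, not kernel code:

    #include <assert.h>

    /* Mirrors the new test_root(): returns 1 iff a == b^k for some k >= 1. */
    static int is_power_of(unsigned int a, int b)
    {
            while (1) {
                    if (a < b)
                            return 0;
                    if (a == b)
                            return 1;
                    if ((a % b) != 0)
                            return 0;
                    a = a / b;
            }
    }

    int main(void)
    {
            assert(is_power_of(49, 7));     /* 7^2: a sparse (backup) group */
            assert(!is_power_of(14, 7));    /* divisible by 7, not a power */
            assert(!is_power_of(50, 7));
            return 0;
    }
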
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index f8d56e4254e0..3c7d288ae94c 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -29,8 +29,7 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static int ext4_dx_readdir(struct file *filp,
-			   void *dirent, filldir_t filldir);
+static int ext4_dx_readdir(struct file *, struct dir_context *);
 
 /**
  * Check if the given dir-inode refers to an htree-indexed directory
@@ -103,60 +102,56 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	return 1;
 }
 
-static int ext4_readdir(struct file *filp,
-			void *dirent, filldir_t filldir)
+static int ext4_readdir(struct file *file, struct dir_context *ctx)
 {
-	int error = 0;
 	unsigned int offset;
 	int i, stored;
 	struct ext4_dir_entry_2 *de;
 	int err;
-	struct inode *inode = file_inode(filp);
+	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	int ret = 0;
 	int dir_has_error = 0;
 
 	if (is_dx_dir(inode)) {
-		err = ext4_dx_readdir(filp, dirent, filldir);
+		err = ext4_dx_readdir(file, ctx);
 		if (err != ERR_BAD_DX_DIR) {
-			ret = err;
-			goto out;
+			return err;
 		}
 		/*
 		 * We don't set the inode dirty flag since it's not
 		 * critical that it get flushed back to the disk.
 		 */
-		ext4_clear_inode_flag(file_inode(filp),
+		ext4_clear_inode_flag(file_inode(file),
 				      EXT4_INODE_INDEX);
 	}
 
 	if (ext4_has_inline_data(inode)) {
 		int has_inline_data = 1;
-		ret = ext4_read_inline_dir(filp, dirent, filldir,
-					   &has_inline_data);
+		int ret = ext4_read_inline_dir(file, ctx,
+					       &has_inline_data);
 		if (has_inline_data)
 			return ret;
 	}
 
 	stored = 0;
-	offset = filp->f_pos & (sb->s_blocksize - 1);
+	offset = ctx->pos & (sb->s_blocksize - 1);
 
-	while (!error && !stored && filp->f_pos < inode->i_size) {
+	while (ctx->pos < inode->i_size) {
 		struct ext4_map_blocks map;
 		struct buffer_head *bh = NULL;
 
-		map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+		map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb);
 		map.m_len = 1;
 		err = ext4_map_blocks(NULL, inode, &map, 0);
 		if (err > 0) {
 			pgoff_t index = map.m_pblk >>
 					(PAGE_CACHE_SHIFT - inode->i_blkbits);
-			if (!ra_has_index(&filp->f_ra, index))
+			if (!ra_has_index(&file->f_ra, index))
 				page_cache_sync_readahead(
 					sb->s_bdev->bd_inode->i_mapping,
-					&filp->f_ra, filp,
+					&file->f_ra, file,
 					index, 1);
-			filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
+			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
 		}
 
@@ -166,16 +161,16 @@ static int ext4_readdir(struct file *filp,
 		 */
 		if (!bh) {
 			if (!dir_has_error) {
-				EXT4_ERROR_FILE(filp, 0,
+				EXT4_ERROR_FILE(file, 0,
 						"directory contains a "
 						"hole at offset %llu",
-					   (unsigned long long) filp->f_pos);
+					   (unsigned long long) ctx->pos);
 				dir_has_error = 1;
 			}
 			/* corrupt size?  Maybe no more blocks to read */
-			if (filp->f_pos > inode->i_blocks << 9)
+			if (ctx->pos > inode->i_blocks << 9)
 				break;
-			filp->f_pos += sb->s_blocksize - offset;
+			ctx->pos += sb->s_blocksize - offset;
 			continue;
 		}
 
@@ -183,21 +178,20 @@ static int ext4_readdir(struct file *filp,
 		if (!buffer_verified(bh) &&
 		    !ext4_dirent_csum_verify(inode,
 				(struct ext4_dir_entry *)bh->b_data)) {
-			EXT4_ERROR_FILE(filp, 0, "directory fails checksum "
+			EXT4_ERROR_FILE(file, 0, "directory fails checksum "
 					"at offset %llu",
-					(unsigned long long)filp->f_pos);
-			filp->f_pos += sb->s_blocksize - offset;
+					(unsigned long long)ctx->pos);
+			ctx->pos += sb->s_blocksize - offset;
 			brelse(bh);
 			continue;
 		}
 		set_buffer_verified(bh);
 
-revalidate:
 		/* If the dir block has changed since the last call to
 		 * readdir(2), then we might be pointing to an invalid
 		 * dirent right now.  Scan from the start of the block
 		 * to make sure. */
-		if (filp->f_version != inode->i_version) {
+		if (file->f_version != inode->i_version) {
 			for (i = 0; i < sb->s_blocksize && i < offset; ) {
 				de = (struct ext4_dir_entry_2 *)
 					(bh->b_data + i);
@@ -214,57 +208,46 @@ revalidate:
 					    sb->s_blocksize);
 			}
 			offset = i;
-			filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
+			ctx->pos = (ctx->pos & ~(sb->s_blocksize - 1))
 				| offset;
-			filp->f_version = inode->i_version;
+			file->f_version = inode->i_version;
 		}
 
-		while (!error && filp->f_pos < inode->i_size
+		while (ctx->pos < inode->i_size
 		       && offset < sb->s_blocksize) {
 			de = (struct ext4_dir_entry_2 *) (bh->b_data + offset);
-			if (ext4_check_dir_entry(inode, filp, de, bh,
+			if (ext4_check_dir_entry(inode, file, de, bh,
 						 bh->b_data, bh->b_size,
 						 offset)) {
 				/*
-				 * On error, skip the f_pos to the next block
+				 * On error, skip to the next block
 				 */
-				filp->f_pos = (filp->f_pos |
+				ctx->pos = (ctx->pos |
 						(sb->s_blocksize - 1)) + 1;
-				brelse(bh);
-				ret = stored;
-				goto out;
+				break;
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
 			if (le32_to_cpu(de->inode)) {
-				/* We might block in the next section
-				 * if the data destination is
-				 * currently swapped out.  So, use a
-				 * version stamp to detect whether or
-				 * not the directory has been modified
-				 * during the copy operation.
-				 */
-				u64 version = filp->f_version;
-
-				error = filldir(dirent, de->name,
+				if (!dir_emit(ctx, de->name,
 						de->name_len,
-						filp->f_pos,
 						le32_to_cpu(de->inode),
-						get_dtype(sb, de->file_type));
-				if (error)
-					break;
-				if (version != filp->f_version)
-					goto revalidate;
-				stored++;
+						get_dtype(sb, de->file_type))) {
+					brelse(bh);
+					return 0;
+				}
 			}
-			filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+			ctx->pos += ext4_rec_len_from_disk(de->rec_len,
 						sb->s_blocksize);
 		}
 		offset = 0;
 		brelse(bh);
+		if (ctx->pos < inode->i_size) {
+			if (!dir_relax(inode))
+				return 0;
+		}
 	}
-out:
-	return ret;
+	return 0;
 }
 
 static inline int is_32bit_api(void)
@@ -492,16 +475,12 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
  * for all entres on the fname linked list.  (Normally there is only
  * one entry on the linked list, unless there are 62 bit hash collisions.)
  */
-static int call_filldir(struct file *filp, void *dirent,
-			filldir_t filldir, struct fname *fname)
+static int call_filldir(struct file *file, struct dir_context *ctx,
+			struct fname *fname)
 {
-	struct dir_private_info *info = filp->private_data;
-	loff_t	curr_pos;
-	struct inode *inode = file_inode(filp);
-	struct super_block *sb;
-	int error;
-
-	sb = inode->i_sb;
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
+	struct super_block *sb = inode->i_sb;
 
 	if (!fname) {
 		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
@@ -509,47 +488,44 @@ static int call_filldir(struct file *filp, void *dirent,
 			 inode->i_ino, current->comm);
 		return 0;
 	}
-	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
+	ctx->pos = hash2pos(file, fname->hash, fname->minor_hash);
 	while (fname) {
-		error = filldir(dirent, fname->name,
-				fname->name_len, curr_pos,
+		if (!dir_emit(ctx, fname->name,
+				fname->name_len,
 				fname->inode,
-				get_dtype(sb, fname->file_type));
-		if (error) {
-			filp->f_pos = curr_pos;
+				get_dtype(sb, fname->file_type))) {
 			info->extra_fname = fname;
-			return error;
+			return 1;
 		}
 		fname = fname->next;
 	}
 	return 0;
 }
 
-static int ext4_dx_readdir(struct file *filp,
-			   void *dirent, filldir_t filldir)
+static int ext4_dx_readdir(struct file *file, struct dir_context *ctx)
 {
-	struct dir_private_info *info = filp->private_data;
-	struct inode *inode = file_inode(filp);
+	struct dir_private_info *info = file->private_data;
+	struct inode *inode = file_inode(file);
 	struct fname *fname;
 	int	ret;
 
 	if (!info) {
-		info = ext4_htree_create_dir_info(filp, filp->f_pos);
+		info = ext4_htree_create_dir_info(file, ctx->pos);
 		if (!info)
 			return -ENOMEM;
-		filp->private_data = info;
+		file->private_data = info;
 	}
 
-	if (filp->f_pos == ext4_get_htree_eof(filp))
+	if (ctx->pos == ext4_get_htree_eof(file))
 		return 0;	/* EOF */
 
 	/* Some one has messed with f_pos; reset the world */
-	if (info->last_pos != filp->f_pos) {
+	if (info->last_pos != ctx->pos) {
 		free_rb_tree_fname(&info->root);
 		info->curr_node = NULL;
 		info->extra_fname = NULL;
-		info->curr_hash = pos2maj_hash(filp, filp->f_pos);
-		info->curr_minor_hash = pos2min_hash(filp, filp->f_pos);
+		info->curr_hash = pos2maj_hash(file, ctx->pos);
+		info->curr_minor_hash = pos2min_hash(file, ctx->pos);
 	}
 
 	/*
@@ -557,7 +533,7 @@ static int ext4_dx_readdir(struct file *filp,
 	 * chain, return them first.
 	 */
 	if (info->extra_fname) {
-		if (call_filldir(filp, dirent, filldir, info->extra_fname))
+		if (call_filldir(file, ctx, info->extra_fname))
 			goto finished;
 		info->extra_fname = NULL;
 		goto next_node;
@@ -571,17 +547,17 @@ static int ext4_dx_readdir(struct file *filp,
 	 * cached entries.
 	 */
 	if ((!info->curr_node) ||
-	    (filp->f_version != inode->i_version)) {
+	    (file->f_version != inode->i_version)) {
 		info->curr_node = NULL;
 		free_rb_tree_fname(&info->root);
-		filp->f_version = inode->i_version;
-		ret = ext4_htree_fill_tree(filp, info->curr_hash,
+		file->f_version = inode->i_version;
+		ret = ext4_htree_fill_tree(file, info->curr_hash,
 					   info->curr_minor_hash,
 					   &info->next_hash);
 		if (ret < 0)
 			return ret;
 		if (ret == 0) {
-			filp->f_pos = ext4_get_htree_eof(filp);
+			ctx->pos = ext4_get_htree_eof(file);
 			break;
 		}
 		info->curr_node = rb_first(&info->root);
@@ -590,7 +566,7 @@ static int ext4_dx_readdir(struct file *filp,
 		fname = rb_entry(info->curr_node, struct fname, rb_hash);
 		info->curr_hash = fname->hash;
 		info->curr_minor_hash = fname->minor_hash;
-		if (call_filldir(filp, dirent, filldir, fname))
+		if (call_filldir(file, ctx, fname))
 			break;
 	next_node:
 		info->curr_node = rb_next(info->curr_node);
@@ -601,7 +577,7 @@ static int ext4_dx_readdir(struct file *filp,
 			info->curr_minor_hash = fname->minor_hash;
 		} else {
 			if (info->next_hash == ~0) {
-				filp->f_pos = ext4_get_htree_eof(filp);
+				ctx->pos = ext4_get_htree_eof(file);
 				break;
 			}
 			info->curr_hash = info->next_hash;
@@ -609,7 +585,7 @@ static int ext4_dx_readdir(struct file *filp,
 		}
 	}
 finished:
-	info->last_pos = filp->f_pos;
+	info->last_pos = ctx->pos;
 	return 0;
 }
 
@@ -624,7 +600,7 @@ static int ext4_release_dir(struct inode *inode, struct file *filp)
 const struct file_operations ext4_dir_operations = {
 	.llseek		= ext4_dir_llseek,
 	.read		= generic_read_dir,
-	.readdir	= ext4_readdir,
+	.iterate	= ext4_readdir,
 	.unlocked_ioctl = ext4_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext4_compat_ioctl,
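
The dir.c conversion above is part of the VFS-wide move from ->readdir()/filldir to ->iterate()/dir_context: the directory position now lives in ctx->pos rather than file->f_pos, dir_emit() returning false simply means the caller's buffer is full (the filesystem just stops and resumes later), and dir_relax() lets the new locking scheme drop the directory lock between blocks. A hedged sketch of the minimal ->iterate() shape this protocol expects — a toy filesystem with one hard-coded entry, not ext4 code:

    #include <linux/fs.h>

    /* Toy ->iterate(): positions 0 and 1 are "." and "..", position 2 is one file. */
    static int toy_iterate(struct file *file, struct dir_context *ctx)
    {
            if (!dir_emit_dots(file, ctx))          /* emits entries at pos 0 and 1 */
                    return 0;
            if (ctx->pos == 2) {
                    /* the name, inode number 42 and DT_REG are made up for the sketch */
                    if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
                            return 0;               /* buffer full; resume here next call */
                    ctx->pos++;
            }
            return 0;
    }
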
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5aae3d12d400..b577e45425b0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -177,38 +177,28 @@ struct ext4_map_blocks {
 };
 
 /*
- * For delayed allocation tracking
- */
-struct mpage_da_data {
-	struct inode *inode;
-	sector_t b_blocknr;		/* start block number of extent */
-	size_t b_size;			/* size of extent */
-	unsigned long b_state;		/* state of the extent */
-	unsigned long first_page, next_page;	/* extent of pages */
-	struct writeback_control *wbc;
-	int io_done;
-	int pages_written;
-	int retval;
-};
-
-/*
  * Flags for ext4_io_end->flags
  */
 #define	EXT4_IO_END_UNWRITTEN	0x0001
-#define EXT4_IO_END_ERROR	0x0002
-#define EXT4_IO_END_DIRECT	0x0004
+#define EXT4_IO_END_DIRECT	0x0002
 
 /*
- * For converting uninitialized extents on a work queue.
+ * For converting uninitialized extents on a work queue. 'handle' is used for
+ * buffered writeback.
  */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
+	handle_t		*handle;	/* handle reserved for extent
+						 * conversion */
 	struct inode		*inode;		/* file being written to */
+	struct bio		*bio;		/* Linked list of completed
+						 * bios covering the extent */
 	unsigned int		flag;		/* unwritten or not */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
+	atomic_t		count;		/* reference counter */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -581,11 +571,6 @@ enum {
 #define EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER	0x0020
 
 /*
- * Flags used by ext4_discard_partial_page_buffers
- */
-#define EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED	0x0001
-
-/*
  * ioctl commands
  */
 #define	EXT4_IOC_GETFLAGS		FS_IOC_GETFLAGS
@@ -879,6 +864,7 @@ struct ext4_inode_info {
 	rwlock_t i_es_lock;
 	struct list_head i_es_lru;
 	unsigned int i_es_lru_nr;	/* protected by i_es_lock */
+	unsigned long i_touch_when;	/* jiffies of last accessing */
 
 	/* ialloc */
 	ext4_group_t	i_last_alloc_group;
@@ -903,12 +889,22 @@ struct ext4_inode_info {
 	qsize_t i_reserved_quota;
 #endif
 
-	/* completed IOs that might need unwritten extents handling */
-	struct list_head i_completed_io_list;
+	/* Lock protecting lists below */
 	spinlock_t i_completed_io_lock;
+	/*
+	 * Completed IOs that need unwritten extents handling and have
+	 * transaction reserved
+	 */
+	struct list_head i_rsv_conversion_list;
+	/*
+	 * Completed IOs that need unwritten extents handling and don't have
+	 * transaction reserved
+	 */
+	struct list_head i_unrsv_conversion_list;
 	atomic_t i_ioend_count;	/* Number of outstanding io_end structs */
 	atomic_t i_unwritten;	/* Nr. of inflight conversions pending */
-	struct work_struct i_unwritten_work;	/* deferred extent conversion */
+	struct work_struct i_rsv_conversion_work;
+	struct work_struct i_unrsv_conversion_work;
 
 	spinlock_t i_block_reservation_lock;
 
@@ -1245,7 +1241,6 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
-	unsigned int s_max_writeback_mb_bump;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1281,8 +1276,10 @@ struct ext4_sb_info {
 	struct flex_groups *s_flex_groups;
 	ext4_group_t s_flex_groups_allocated;
 
-	/* workqueue for dio unwritten */
-	struct workqueue_struct *dio_unwritten_wq;
+	/* workqueue for unreserved extent convertions (dio) */
+	struct workqueue_struct *unrsv_conversion_wq;
+	/* workqueue for reserved extent conversions (buffered io) */
+	struct workqueue_struct *rsv_conversion_wq;
 
 	/* timer for periodic error stats printing */
 	struct timer_list s_err_report;
@@ -1307,6 +1304,7 @@ struct ext4_sb_info {
 	/* Reclaim extents from extent status tree */
 	struct shrinker s_es_shrinker;
 	struct list_head s_es_lru;
+	unsigned long s_es_last_sorted;
 	struct percpu_counter s_extent_cache_cnt;
 	spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp;
 };
@@ -1342,6 +1340,9 @@ static inline void ext4_set_io_unwritten_flag(struct inode *inode,
 					   struct ext4_io_end *io_end)
 {
 	if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
+		/* Writeback has to have coversion transaction reserved */
+		WARN_ON(EXT4_SB(inode->i_sb)->s_journal && !io_end->handle &&
+			!(io_end->flag & EXT4_IO_END_DIRECT));
 		io_end->flag |= EXT4_IO_END_UNWRITTEN;
 		atomic_inc(&EXT4_I(inode)->i_unwritten);
 	}
@@ -1999,7 +2000,6 @@ static inline unsigned char get_dtype(struct super_block *sb, int filetype)
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
-extern int ext4_flush_unwritten_io(struct inode *);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2088,7 +2088,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int);
 extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
 extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
-extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
 extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
@@ -2096,9 +2096,12 @@ extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
-extern int ext4_discard_partial_page_buffers(handle_t *handle,
-		struct address_space *mapping, loff_t from,
-		loff_t length, int flags);
+extern int ext4_block_truncate_page(handle_t *handle,
+		struct address_space *mapping, loff_t from);
+extern int ext4_block_zero_page_range(handle_t *handle,
+		struct address_space *mapping, loff_t from, loff_t length);
+extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
+			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
 extern void ext4_da_update_reserve_space(struct inode *inode,
@@ -2111,7 +2114,7 @@ extern ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
 				const struct iovec *iov, loff_t offset,
 				unsigned long nr_segs);
 extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock);
-extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk);
+extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks);
 extern void ext4_ind_truncate(handle_t *, struct inode *inode);
 extern int ext4_free_hole_blocks(handle_t *handle, struct inode *inode,
 				 ext4_lblk_t first, ext4_lblk_t stop);
@@ -2166,42 +2169,96 @@ extern int ext4_alloc_flex_bg_array(struct super_block *sb,
 				     ext4_group_t ngroup);
 extern const char *ext4_decode_error(struct super_block *sb, int errno,
 				     char nbuf[16]);
+
 extern __printf(4, 5)
 void __ext4_error(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_error(sb, message...)	__ext4_error(sb, __func__, \
-						     __LINE__, ## message)
 extern __printf(5, 6)
-void ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
-		      const char *, ...);
+void __ext4_error_inode(struct inode *, const char *, unsigned int, ext4_fsblk_t,
+		      const char *, ...);
 extern __printf(5, 6)
-void ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
-		     const char *, ...);
+void __ext4_error_file(struct file *, const char *, unsigned int, ext4_fsblk_t,
+		     const char *, ...);
 extern void __ext4_std_error(struct super_block *, const char *,
 			     unsigned int, int);
 extern __printf(4, 5)
 void __ext4_abort(struct super_block *, const char *, unsigned int,
 		  const char *, ...);
-#define ext4_abort(sb, message...)	__ext4_abort(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(4, 5)
 void __ext4_warning(struct super_block *, const char *, unsigned int,
 		    const char *, ...);
-#define ext4_warning(sb, message...)	__ext4_warning(sb, __func__, \
-						       __LINE__, ## message)
 extern __printf(3, 4)
-void ext4_msg(struct super_block *, const char *, const char *, ...);
+void __ext4_msg(struct super_block *, const char *, const char *, ...);
 extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp,
 			   const char *, unsigned int, const char *);
-#define dump_mmp_msg(sb, mmp, msg)	__dump_mmp_msg(sb, mmp, __func__, \
-						       __LINE__, msg)
 extern __printf(7, 8)
 void __ext4_grp_locked_error(const char *, unsigned int,
 			     struct super_block *, ext4_group_t,
 			     unsigned long, ext4_fsblk_t,
 			     const char *, ...);
-#define ext4_grp_locked_error(sb, grp, message...) \
-	__ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message)
+
+#ifdef CONFIG_PRINTK
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+	__ext4_error_inode(inode, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+	__ext4_error_file(file, func, line, block, fmt, ##__VA_ARGS__)
+#define ext4_error(sb, fmt, ...)					\
+	__ext4_error(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_abort(sb, fmt, ...)					\
+	__ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_warning(sb, fmt, ...)					\
+	__ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__)
+#define ext4_msg(sb, level, fmt, ...)					\
+	__ext4_msg(sb, level, fmt, ##__VA_ARGS__)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, __func__, __LINE__, msg)
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+	__ext4_grp_locked_error(__func__, __LINE__, sb, grp, ino, block, \
+				fmt, ##__VA_ARGS__)
+
+#else
+
+#define ext4_error_inode(inode, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_inode(inode, "", 0, block, " ");			\
+} while (0)
+#define ext4_error_file(file, func, line, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error_file(file, "", 0, block, " ");			\
+} while (0)
+#define ext4_error(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_error(sb, "", 0, " ");					\
+} while (0)
+#define ext4_abort(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_abort(sb, "", 0, " ");					\
+} while (0)
+#define ext4_warning(sb, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_warning(sb, "", 0, " ");					\
+} while (0)
+#define ext4_msg(sb, level, fmt, ...)					\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_msg(sb, "", " ");					\
+} while (0)
+#define dump_mmp_msg(sb, mmp, msg)					\
+	__dump_mmp_msg(sb, mmp, "", 0, "")
+#define ext4_grp_locked_error(sb, grp, ino, block, fmt, ...)		\
+do {									\
+	no_printk(fmt, ##__VA_ARGS__);					\
+	__ext4_grp_locked_error("", 0, sb, grp, ino, block, " ");	\
+} while (0)
+
+#endif
+
 extern void ext4_update_dynamic_rev(struct super_block *sb);
 extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb,
 				      __u32 compat);
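
The CONFIG_PRINTK=n variants above still evaluate no_printk() so gcc keeps type-checking the format arguments, and they still call the __ext4_* helpers (with empty strings) so the error bookkeeping — marking the filesystem with errors, remount-ro, journal abort — survives even when nothing is printed. Conceptually no_printk() is just a printf-attributed stub that the compiler discards; a hedged approximation only, since the real definition lives in include/linux/printk.h:

    /* Approximation of no_printk(): arguments are type-checked, never emitted. */
    static inline __attribute__((format(printf, 1, 2)))
    int no_printk_sketch(const char *fmt, ...)
    {
            return 0;
    }
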
@@ -2312,6 +2369,7 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
 {
 	 struct ext4_group_info ***grp_info;
 	 long indexv, indexh;
+	 BUG_ON(group >= EXT4_SB(sb)->s_groups_count);
 	 grp_info = EXT4_SB(sb)->s_group_info;
 	 indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb));
 	 indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1);
@@ -2515,7 +2573,7 @@ extern int ext4_try_create_inline_dir(handle_t *handle,
 				      struct inode *parent,
 				      struct inode *inode);
 extern int ext4_read_inline_dir(struct file *filp,
-				void *dirent, filldir_t filldir,
+				struct dir_context *ctx,
 				int *has_inline_data);
 extern int htree_inlinedir_to_tree(struct file *dir_file,
 				   struct inode *dir, ext4_lblk_t block,
@@ -2598,8 +2656,7 @@ struct ext4_extent;
 
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
-extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
-				       int chunk);
+extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents);
 extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			       struct ext4_map_blocks *map, int flags);
 extern void ext4_ext_truncate(handle_t *, struct inode *);
@@ -2609,8 +2666,8 @@ extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
 			  loff_t len);
-extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
-			  ssize_t len);
+extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
+					  loff_t offset, ssize_t len);
 extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map, int flags);
 extern int ext4_ext_calc_metadata_amount(struct inode *inode,
@@ -2650,12 +2707,15 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
-extern void ext4_add_complete_io(ext4_io_end_t *io_end);
 extern void ext4_exit_pageio(void);
-extern void ext4_ioend_shutdown(struct inode *);
-extern void ext4_free_io_end(ext4_io_end_t *io);
 extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags);
-extern void ext4_end_io_work(struct work_struct *work);
+extern ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end);
+extern int ext4_put_io_end(ext4_io_end_t *io_end);
+extern void ext4_put_io_end_defer(ext4_io_end_t *io_end);
+extern void ext4_io_submit_init(struct ext4_io_submit *io,
+				struct writeback_control *wbc);
+extern void ext4_end_io_rsv_work(struct work_struct *work);
+extern void ext4_end_io_unrsv_work(struct work_struct *work);
 extern void ext4_io_submit(struct ext4_io_submit *io);
 extern int ext4_bio_write_page(struct ext4_io_submit *io,
 			       struct page *page,
@@ -2668,20 +2728,17 @@ extern void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp);
 extern int ext4_mmp_csum_verify(struct super_block *sb,
 				struct mmp_struct *mmp);
 
-/* BH_Uninit flag: blocks are allocated but uninitialized on disk */
+/*
+ * Note that these flags will never ever appear in a buffer_head's state flag.
+ * See EXT4_MAP_... to see where this is used.
+ */
 enum ext4_state_bits {
 	BH_Uninit	/* blocks are allocated but uninitialized on disk */
 	 = BH_JBDPrivateStart,
 	BH_AllocFromCluster,	/* allocated blocks were part of already
-				 * allocated cluster. Note that this flag will
-				 * never, ever appear in a buffer_head's state
-				 * flag. See EXT4_MAP_FROM_CLUSTER to see where
-				 * this is used. */
+				 * allocated cluster. */
 };
 
-BUFFER_FNS(Uninit, uninit)
-TAS_BUFFER_FNS(Uninit, uninit)
-
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
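
The page-io.c declarations above replace the old single-owner io_end model (ext4_free_io_end()/ext4_end_io_work()) with reference counting, matching the new 'count' field in ext4_io_end_t: each bio covering the extent holds a reference, ext4_put_io_end() drops one and triggers the unwritten-extent conversion when the last reference goes, and ext4_put_io_end_defer() does the same safely from irq context by deferring to a workqueue. A hedged sketch of the intended ownership pattern — submit_one_bio() is a hypothetical helper and error handling is omitted:

    static void submit_one_bio(ext4_io_end_t *io_end);     /* hypothetical */

    static void writeback_extent_sketch(struct inode *inode)
    {
            ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS);

            /* Every bio that covers part of the extent takes its own reference. */
            submit_one_bio(ext4_get_io_end(io_end));
            submit_one_bio(ext4_get_io_end(io_end));

            /* Drop the submitter's reference; the conversion only runs once
             * the bios' references are also put on IO completion. */
            ext4_put_io_end(io_end);
    }
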
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 451eb4045330..72a3600aedbd 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -38,31 +38,43 @@ static void ext4_put_nojournal(handle_t *handle)
 /*
  * Wrappers for jbd2_journal_start/end.
  */
-handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks)
+static int ext4_journal_check_start(struct super_block *sb)
 {
 	journal_t *journal;
 
 	might_sleep();
-
-	trace_ext4_journal_start(sb, nblocks, _RET_IP_);
 	if (sb->s_flags & MS_RDONLY)
-		return ERR_PTR(-EROFS);
-
+		return -EROFS;
 	WARN_ON(sb->s_writers.frozen == SB_FREEZE_COMPLETE);
 	journal = EXT4_SB(sb)->s_journal;
-	if (!journal)
-		return ext4_get_nojournal();
 	/*
 	 * Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly.
 	 */
-	if (is_journal_aborted(journal)) {
+	if (journal && is_journal_aborted(journal)) {
 		ext4_abort(sb, "Detected aborted journal");
-		return ERR_PTR(-EROFS);
+		return -EROFS;
 	}
-	return jbd2__journal_start(journal, nblocks, GFP_NOFS, type, line);
+	return 0;
+}
+
+handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
+				  int type, int blocks, int rsv_blocks)
+{
+	journal_t *journal;
+	int err;
+
+	trace_ext4_journal_start(sb, blocks, rsv_blocks, _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0)
+		return ERR_PTR(err);
+
+	journal = EXT4_SB(sb)->s_journal;
+	if (!journal)
+		return ext4_get_nojournal();
+	return jbd2__journal_start(journal, blocks, rsv_blocks, GFP_NOFS,
+				   type, line);
 }
 
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
@@ -86,6 +98,30 @@ int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle)
 	return err;
 }
 
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type)
+{
+	struct super_block *sb;
+	int err;
+
+	if (!ext4_handle_valid(handle))
+		return ext4_get_nojournal();
+
+	sb = handle->h_journal->j_private;
+	trace_ext4_journal_start_reserved(sb, handle->h_buffer_credits,
+					  _RET_IP_);
+	err = ext4_journal_check_start(sb);
+	if (err < 0) {
+		jbd2_journal_free_reserved(handle);
+		return ERR_PTR(err);
+	}
+
+	err = jbd2_journal_start_reserved(handle, type, line);
+	if (err < 0)
+		return ERR_PTR(err);
+	return handle;
+}
+
 void ext4_journal_abort_handle(const char *caller, unsigned int line,
 			       const char *err_fn, struct buffer_head *bh,
 			       handle_t *handle, int err)
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index c8c6885406db..2877258d9497 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -134,7 +134,8 @@ static inline int ext4_jbd2_credits_xattr(struct inode *inode)
 #define EXT4_HT_MIGRATE          8
 #define EXT4_HT_MOVE_EXTENTS     9
 #define EXT4_HT_XATTR           10
-#define EXT4_HT_MAX             11
+#define EXT4_HT_EXT_CONVERT     11
+#define EXT4_HT_MAX             12
 
 /**
  * struct ext4_journal_cb_entry - Base structure for callback information.
@@ -265,7 +266,7 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line,
 	__ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb))
 
 handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line,
-				  int type, int nblocks);
+				  int type, int blocks, int rsv_blocks);
 int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle);
 
 #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096)
@@ -300,21 +301,37 @@ static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed)
 }
 
 #define ext4_journal_start_sb(sb, type, nblocks)			\
-	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks))
+	__ext4_journal_start_sb((sb), __LINE__, (type), (nblocks), 0)
 
 #define ext4_journal_start(inode, type, nblocks)			\
-	__ext4_journal_start((inode), __LINE__, (type), (nblocks))
+	__ext4_journal_start((inode), __LINE__, (type), (nblocks), 0)
+
+#define ext4_journal_start_with_reserve(inode, type, blocks, rsv_blocks) \
+	__ext4_journal_start((inode), __LINE__, (type), (blocks), (rsv_blocks))
 
 static inline handle_t *__ext4_journal_start(struct inode *inode,
 					     unsigned int line, int type,
-					     int nblocks)
+					     int blocks, int rsv_blocks)
 {
-	return __ext4_journal_start_sb(inode->i_sb, line, type, nblocks);
+	return __ext4_journal_start_sb(inode->i_sb, line, type, blocks,
+				       rsv_blocks);
 }
 
 #define ext4_journal_stop(handle) \
 	__ext4_journal_stop(__func__, __LINE__, (handle))
 
+#define ext4_journal_start_reserved(handle, type) \
+	__ext4_journal_start_reserved((handle), __LINE__, (type))
+
+handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line,
+					int type);
+
+static inline void ext4_journal_free_reserved(handle_t *handle)
+{
+	if (ext4_handle_valid(handle))
+		jbd2_journal_free_reserved(handle);
+}
+
 static inline handle_t *ext4_journal_current_handle(void)
 {
 	return journal_current_handle();
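
Taken together, the ext4_jbd2.[ch] changes above introduce two-phase journal credits: ext4_journal_start_with_reserve() opens a handle with 'blocks' credits usable now plus 'rsv_blocks' parked for later, and ext4_journal_start_reserved() turns that parked reservation into a running handle from the end_io worker without a blocking journal start there. A hedged sketch of the pairing — the credit numbers and the io_end plumbing are illustrative, not ext4 policy:

    static int reserved_conversion_sketch(struct inode *inode,
                                          ext4_io_end_t *io_end)
    {
            handle_t *handle;

            /* 10 credits usable now, 2 more reserved for the later conversion. */
            handle = ext4_journal_start_with_reserve(inode, EXT4_HT_WRITE_PAGE,
                                                     10, 2);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            /* ... submit writeback, hand the reserved part to the io_end ... */
            ext4_journal_stop(handle);

            /* Later, from the conversion worker: bind the reservation. */
            handle = ext4_journal_start_reserved(io_end->handle,
                                                 EXT4_HT_EXT_CONVERT);
            if (IS_ERR(handle))
                    return PTR_ERR(handle);
            return ext4_journal_stop(handle);
    }
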
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index bc0f1910b9cf..72ba4705d4fa 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -2125,7 +2125,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 		next_del = ext4_find_delayed_extent(inode, &es);
 		if (!exists && next_del) {
 			exists = 1;
-			flags |= FIEMAP_EXTENT_DELALLOC;
+			flags |= (FIEMAP_EXTENT_DELALLOC |
+				  FIEMAP_EXTENT_UNKNOWN);
 		}
 		up_read(&EXT4_I(inode)->i_data_sem);
 
@@ -2328,17 +2329,15 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
 }
 
 /*
- * How many index/leaf blocks need to change/allocate to modify nrblocks?
+ * How many index/leaf blocks need to change/allocate to add @extents extents?
  *
- * if nrblocks are fit in a single extent (chunk flag is 1), then
- * in the worse case, each tree level index/leaf need to be changed
- * if the tree split due to insert a new extent, then the old tree
- * index/leaf need to be updated too
+ * If we add a single extent, then in the worse case, each tree level
+ * index/leaf need to be changed in case of the tree split.
  *
- * If the nrblocks are discontiguous, they could cause
- * the whole tree split more than once, but this is really rare.
+ * If more extents are inserted, they could cause the whole tree split more
+ * than once, but this is really rare.
  */
-int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
+int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
 {
 	int index;
 	int depth;
@@ -2349,7 +2348,7 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 
 	depth = ext_depth(inode);
 
-	if (chunk)
+	if (extents <= 1)
 		index = depth * 2;
 	else
 		index = depth * 3;
@@ -2357,20 +2356,24 @@ int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 	return index;
 }
 
+static inline int get_default_free_blocks_flags(struct inode *inode)
+{
+	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
+		return EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
+	else if (ext4_should_journal_data(inode))
+		return EXT4_FREE_BLOCKS_FORGET;
+	return 0;
+}
+
 static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 			      struct ext4_extent *ex,
-			      ext4_fsblk_t *partial_cluster,
+			      long long *partial_cluster,
 			      ext4_lblk_t from, ext4_lblk_t to)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	unsigned short ee_len =  ext4_ext_get_actual_len(ex);
 	ext4_fsblk_t pblk;
-	int flags = 0;
-
-	if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
-		flags |= EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET;
-	else if (ext4_should_journal_data(inode))
-		flags |= EXT4_FREE_BLOCKS_FORGET;
+	int flags = get_default_free_blocks_flags(inode);
 
 	/*
 	 * For bigalloc file systems, we never free a partial cluster
@@ -2388,7 +2391,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	 * partial cluster here.
 	 */
 	pblk = ext4_ext_pblock(ex) + ee_len - 1;
-	if (*partial_cluster && (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
+	if ((*partial_cluster > 0) &&
+	    (EXT4_B2C(sbi, pblk) != *partial_cluster)) {
 		ext4_free_blocks(handle, inode, NULL,
 				 EXT4_C2B(sbi, *partial_cluster),
 				 sbi->s_cluster_ratio, flags);
@@ -2414,41 +2418,46 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
 	    && to == le32_to_cpu(ex->ee_block) + ee_len - 1) {
 		/* tail removal */
 		ext4_lblk_t num;
+		unsigned int unaligned;
 
 		num = le32_to_cpu(ex->ee_block) + ee_len - from;
 		pblk = ext4_ext_pblock(ex) + ee_len - num;
-		ext_debug("free last %u blocks starting %llu\n", num, pblk);
+		/*
+		 * Usually we want to free partial cluster at the end of the
+		 * extent, except for the situation when the cluster is still
+		 * used by any other extent (partial_cluster is negative).
+		 */
+		if (*partial_cluster < 0 &&
+		    -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1))
+			flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER;
+
+		ext_debug("free last %u blocks starting %llu partial %lld\n",
+			  num, pblk, *partial_cluster);
 		ext4_free_blocks(handle, inode, NULL, pblk, num, flags);
 		/*
 		 * If the block range to be freed didn't start at the
 		 * beginning of a cluster, and we removed the entire
-		 * extent, save the partial cluster here, since we
-		 * might need to delete if we determine that the
-		 * truncate operation has removed all of the blocks in
-		 * the cluster.
+		 * extent and the cluster is not used by any other extent,
+		 * save the partial cluster here, since we might need to
+		 * delete if we determine that the truncate operation has
+		 * removed all of the blocks in the cluster.
+		 *
+		 * On the other hand, if we did not manage to free the whole
+		 * extent, we have to mark the cluster as used (store negative
+		 * cluster number in partial_cluster).
 		 */
-		if (pblk & (sbi->s_cluster_ratio - 1) &&
-		    (ee_len == num))
+		unaligned = pblk & (sbi->s_cluster_ratio - 1);
+		if (unaligned && (ee_len == num) &&
+		    (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk))))
 			*partial_cluster = EXT4_B2C(sbi, pblk);
-		else
+		else if (unaligned)
+			*partial_cluster = -((long long)EXT4_B2C(sbi, pblk));
+		else if (*partial_cluster > 0)
 			*partial_cluster = 0;
-	} else if (from == le32_to_cpu(ex->ee_block)
-		   && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
-		/* head removal */
-		ext4_lblk_t num;
-		ext4_fsblk_t start;
-
-		num = to - from;
-		start = ext4_ext_pblock(ex);
-
-		ext_debug("free first %u blocks starting %llu\n", num, start);
-		ext4_free_blocks(handle, inode, NULL, start, num, flags);
-
-	} else {
-		printk(KERN_INFO "strange request: removal(2) "
-		       "%u-%u from %u:%u\n",
-		       from, to, le32_to_cpu(ex->ee_block), ee_len);
-	}
+	} else
+		ext4_error(sbi->s_sb, "strange request: removal(2) "
+			   "%u-%u from %u:%u\n",
+			   from, to, le32_to_cpu(ex->ee_block), ee_len);
 	return 0;
 }
 
2461 * @handle: The journal handle 2470 * @handle: The journal handle
2462 * @inode: The files inode 2471 * @inode: The files inode
2463 * @path: The path to the leaf 2472 * @path: The path to the leaf
2473 * @partial_cluster: The cluster which we'll have to free if all extents
2474 * has been released from it. It gets negative in case
2475 * that the cluster is still used.
2464 * @start: The first block to remove 2476 * @start: The first block to remove
2465 * @end: The last block to remove 2477 * @end: The last block to remove
2466 */ 2478 */
2467static int 2479static int
2468ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, 2480ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2469 struct ext4_ext_path *path, ext4_fsblk_t *partial_cluster, 2481 struct ext4_ext_path *path,
2482 long long *partial_cluster,
2470 ext4_lblk_t start, ext4_lblk_t end) 2483 ext4_lblk_t start, ext4_lblk_t end)
2471{ 2484{
2472 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2485 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2479,6 +2492,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 	unsigned short ex_ee_len;
 	unsigned uninitialized = 0;
 	struct ext4_extent *ex;
+	ext4_fsblk_t pblk;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
 	ext_debug("truncate since %u in leaf to %u\n", start, end);
@@ -2490,7 +2504,9 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 		return -EIO;
 	}
 	/* find where to start removing */
-	ex = EXT_LAST_EXTENT(eh);
+	ex = path[depth].p_ext;
+	if (!ex)
+		ex = EXT_LAST_EXTENT(eh);
 
 	ex_ee_block = le32_to_cpu(ex->ee_block);
 	ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2517,6 +2533,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
 
 	/* If this extent is beyond the end of the hole, skip it */
 	if (end < ex_ee_block) {
+		/*
+		 * We're going to skip this extent and move to another,
+		 * so if this extent is not cluster aligned we have
+		 * to mark the current cluster as used to avoid
+		 * accidentally freeing it later on
+		 */
+		pblk = ext4_ext_pblock(ex);
+		if (pblk & (sbi->s_cluster_ratio - 1))
+			*partial_cluster =
+				-((long long)EXT4_B2C(sbi, pblk));
 		ex--;
 		ex_ee_block = le32_to_cpu(ex->ee_block);
 		ex_ee_len = ext4_ext_get_actual_len(ex);
@@ -2592,7 +2618,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2592 sizeof(struct ext4_extent)); 2618 sizeof(struct ext4_extent));
2593 } 2619 }
2594 le16_add_cpu(&eh->eh_entries, -1); 2620 le16_add_cpu(&eh->eh_entries, -1);
2595 } else 2621 } else if (*partial_cluster > 0)
2596 *partial_cluster = 0; 2622 *partial_cluster = 0;
2597 2623
2598 err = ext4_ext_dirty(handle, inode, path + depth); 2624 err = ext4_ext_dirty(handle, inode, path + depth);
@@ -2610,17 +2636,13 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2610 err = ext4_ext_correct_indexes(handle, inode, path); 2636 err = ext4_ext_correct_indexes(handle, inode, path);
2611 2637
2612 /* 2638 /*
2613 * If there is still an entry in the leaf node, check to see if 2639 * Free the partial cluster only if the current extent does not
2614 * it references the partial cluster. This is the only place 2640 * reference it. Otherwise we might free a used cluster.
2615 * where it could; if it doesn't, we can free the cluster.
2616 */ 2641 */
2617 if (*partial_cluster && ex >= EXT_FIRST_EXTENT(eh) && 2642 if (*partial_cluster > 0 &&
2618 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != 2643 (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) !=
2619 *partial_cluster)) { 2644 *partial_cluster)) {
2620 int flags = EXT4_FREE_BLOCKS_FORGET; 2645 int flags = get_default_free_blocks_flags(inode);
2621
2622 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2623 flags |= EXT4_FREE_BLOCKS_METADATA;
2624 2646
2625 ext4_free_blocks(handle, inode, NULL, 2647 ext4_free_blocks(handle, inode, NULL,
2626 EXT4_C2B(sbi, *partial_cluster), 2648 EXT4_C2B(sbi, *partial_cluster),
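The widened partial_cluster above uses its sign to carry state: a positive value names a cluster that may still be freed, while a negated cluster number records that the cluster is still referenced and must be kept. A minimal userspace sketch of this sign-encoding idiom, with illustrative names rather than the kernel's:

#include <stdio.h>

/* Illustrative only: a negative value means "cluster still in use",
 * mirroring how ext4_ext_rm_leaf stores -EXT4_B2C(sbi, pblk). */
static void mark_cluster_in_use(long long *partial_cluster,
                                unsigned long long cluster)
{
        *partial_cluster = -(long long)cluster;
}

static int cluster_may_be_freed(long long partial_cluster)
{
        /* Only a positive value names a cluster we may still free. */
        return partial_cluster > 0;
}

int main(void)
{
        long long partial_cluster = 42;         /* candidate for freeing */

        printf("may free? %d\n", cluster_may_be_freed(partial_cluster));
        mark_cluster_in_use(&partial_cluster, 42);
        printf("may free? %d\n", cluster_may_be_freed(partial_cluster));
        return 0;
}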
@@ -2664,7 +2686,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2664 struct super_block *sb = inode->i_sb; 2686 struct super_block *sb = inode->i_sb;
2665 int depth = ext_depth(inode); 2687 int depth = ext_depth(inode);
2666 struct ext4_ext_path *path = NULL; 2688 struct ext4_ext_path *path = NULL;
2667 ext4_fsblk_t partial_cluster = 0; 2689 long long partial_cluster = 0;
2668 handle_t *handle; 2690 handle_t *handle;
2669 int i = 0, err = 0; 2691 int i = 0, err = 0;
2670 2692
@@ -2676,7 +2698,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
2676 return PTR_ERR(handle); 2698 return PTR_ERR(handle);
2677 2699
2678again: 2700again:
2679 trace_ext4_ext_remove_space(inode, start, depth); 2701 trace_ext4_ext_remove_space(inode, start, end, depth);
2680 2702
2681 /* 2703 /*
2682 * Check if we are removing extents inside the extent tree. If that 2704 * Check if we are removing extents inside the extent tree. If that
@@ -2813,6 +2835,9 @@ again:
2813 err = -EIO; 2835 err = -EIO;
2814 break; 2836 break;
2815 } 2837 }
2838 /* Yield here to deal with large extent trees.
2839 * Should be a no-op if we did IO above. */
2840 cond_resched();
2816 if (WARN_ON(i + 1 > depth)) { 2841 if (WARN_ON(i + 1 > depth)) {
2817 err = -EIO; 2842 err = -EIO;
2818 break; 2843 break;
@@ -2844,17 +2869,14 @@ again:
2844 } 2869 }
2845 } 2870 }
2846 2871
2847 trace_ext4_ext_remove_space_done(inode, start, depth, partial_cluster, 2872 trace_ext4_ext_remove_space_done(inode, start, end, depth,
2848 path->p_hdr->eh_entries); 2873 partial_cluster, path->p_hdr->eh_entries);
2849 2874
2850 /* If we still have something in the partial cluster and we have removed 2875 /* If we still have something in the partial cluster and we have removed
2851 * even the first extent, then we should free the blocks in the partial 2876 * even the first extent, then we should free the blocks in the partial
2852 * cluster as well. */ 2877 * cluster as well. */
2853 if (partial_cluster && path->p_hdr->eh_entries == 0) { 2878 if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) {
2854 int flags = EXT4_FREE_BLOCKS_FORGET; 2879 int flags = get_default_free_blocks_flags(inode);
2855
2856 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2857 flags |= EXT4_FREE_BLOCKS_METADATA;
2858 2880
2859 ext4_free_blocks(handle, inode, NULL, 2881 ext4_free_blocks(handle, inode, NULL,
2860 EXT4_C2B(EXT4_SB(sb), partial_cluster), 2882 EXT4_C2B(EXT4_SB(sb), partial_cluster),
@@ -4242,8 +4264,8 @@ got_allocated_blocks:
4242 /* not a good idea to call discard here directly, 4264 /* not a good idea to call discard here directly,
4243 * but otherwise we'd need to call it every free() */ 4265 * but otherwise we'd need to call it every free() */
4244 ext4_discard_preallocations(inode); 4266 ext4_discard_preallocations(inode);
4245 ext4_free_blocks(handle, inode, NULL, ext4_ext_pblock(&newex), 4267 ext4_free_blocks(handle, inode, NULL, newblock,
4246 ext4_ext_get_actual_len(&newex), fb_flags); 4268 EXT4_C2B(sbi, allocated_clusters), fb_flags);
4247 goto out2; 4269 goto out2;
4248 } 4270 }
4249 4271
@@ -4363,8 +4385,9 @@ out2:
4363 } 4385 }
4364 4386
4365out3: 4387out3:
4366 trace_ext4_ext_map_blocks_exit(inode, map, err ? err : allocated); 4388 trace_ext4_ext_map_blocks_exit(inode, flags, map,
4367 4389 err ? err : allocated);
4390 ext4_es_lru_add(inode);
4368 return err ? err : allocated; 4391 return err ? err : allocated;
4369} 4392}
4370 4393
@@ -4386,9 +4409,20 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode)
4386 4409
4387 last_block = (inode->i_size + sb->s_blocksize - 1) 4410 last_block = (inode->i_size + sb->s_blocksize - 1)
4388 >> EXT4_BLOCK_SIZE_BITS(sb); 4411 >> EXT4_BLOCK_SIZE_BITS(sb);
4412retry:
4389 err = ext4_es_remove_extent(inode, last_block, 4413 err = ext4_es_remove_extent(inode, last_block,
4390 EXT_MAX_BLOCKS - last_block); 4414 EXT_MAX_BLOCKS - last_block);
4415 if (err == -ENOMEM) {
4416 cond_resched();
4417 congestion_wait(BLK_RW_ASYNC, HZ/50);
4418 goto retry;
4419 }
4420 if (err) {
4421 ext4_std_error(inode->i_sb, err);
4422 return;
4423 }
4391 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); 4424 err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
4425 ext4_std_error(inode->i_sb, err);
4392} 4426}
4393 4427
4394static void ext4_falloc_update_inode(struct inode *inode, 4428static void ext4_falloc_update_inode(struct inode *inode,
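The retry loop added to ext4_ext_truncate() treats -ENOMEM from ext4_es_remove_extent() as transient: yield the CPU, wait briefly for writeback congestion to ease, and try again. A userspace rendering of the same retry-with-backoff shape; nanosleep() stands in for cond_resched()/congestion_wait(), and the failing operation is a stub:

#include <errno.h>
#include <stdio.h>
#include <time.h>

/* Stub for an operation that can transiently fail with -ENOMEM. */
static int remove_extent_range(int *attempts_left)
{
        if (--(*attempts_left) > 0)
                return -ENOMEM;         /* transient allocation failure */
        return 0;                       /* success */
}

int main(void)
{
        struct timespec pause = { 0, 20 * 1000 * 1000 }; /* ~HZ/50 at HZ=1000 */
        int attempts = 3;
        int err;

retry:
        err = remove_extent_range(&attempts);
        if (err == -ENOMEM) {
                /* Kernel equivalent: cond_resched(); congestion_wait(); */
                nanosleep(&pause, NULL);
                goto retry;
        }
        printf("finished with err=%d\n", err);
        return 0;
}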
@@ -4446,7 +4480,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
4446 return -EOPNOTSUPP; 4480 return -EOPNOTSUPP;
4447 4481
4448 if (mode & FALLOC_FL_PUNCH_HOLE) 4482 if (mode & FALLOC_FL_PUNCH_HOLE)
4449 return ext4_punch_hole(file, offset, len); 4483 return ext4_punch_hole(inode, offset, len);
4450 4484
4451 ret = ext4_convert_inline_data(inode); 4485 ret = ext4_convert_inline_data(inode);
4452 if (ret) 4486 if (ret)
@@ -4548,10 +4582,9 @@ retry:
4548 * function, to convert the fallocated extents after IO is completed. 4582 * function, to convert the fallocated extents after IO is completed.
4549 * Returns 0 on success. 4583 * Returns 0 on success.
4550 */ 4584 */
4551int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 4585int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
4552 ssize_t len) 4586 loff_t offset, ssize_t len)
4553{ 4587{
4554 handle_t *handle;
4555 unsigned int max_blocks; 4588 unsigned int max_blocks;
4556 int ret = 0; 4589 int ret = 0;
4557 int ret2 = 0; 4590 int ret2 = 0;
@@ -4566,16 +4599,32 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4566 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - 4599 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
4567 map.m_lblk); 4600 map.m_lblk);
4568 /* 4601 /*
4569 * credits to insert 1 extent into extent tree 4602 * This is somewhat ugly but the idea is clear: When a transaction is
4603 * reserved, everything goes into it. Otherwise we start several
4604 * smaller transactions, converting each extent separately.
4570 */ 4605 */
4571 credits = ext4_chunk_trans_blocks(inode, max_blocks); 4606 if (handle) {
4607 handle = ext4_journal_start_reserved(handle,
4608 EXT4_HT_EXT_CONVERT);
4609 if (IS_ERR(handle))
4610 return PTR_ERR(handle);
4611 credits = 0;
4612 } else {
4613 /*
4614 * credits to insert 1 extent into extent tree
4615 */
4616 credits = ext4_chunk_trans_blocks(inode, max_blocks);
4617 }
4572 while (ret >= 0 && ret < max_blocks) { 4618 while (ret >= 0 && ret < max_blocks) {
4573 map.m_lblk += ret; 4619 map.m_lblk += ret;
4574 map.m_len = (max_blocks -= ret); 4620 map.m_len = (max_blocks -= ret);
4575 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); 4621 if (credits) {
4576 if (IS_ERR(handle)) { 4622 handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS,
4577 ret = PTR_ERR(handle); 4623 credits);
4578 break; 4624 if (IS_ERR(handle)) {
4625 ret = PTR_ERR(handle);
4626 break;
4627 }
4579 } 4628 }
4580 ret = ext4_map_blocks(handle, inode, &map, 4629 ret = ext4_map_blocks(handle, inode, &map,
4581 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 4630 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
@@ -4586,10 +4635,13 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
4586 inode->i_ino, map.m_lblk, 4635 inode->i_ino, map.m_lblk,
4587 map.m_len, ret); 4636 map.m_len, ret);
4588 ext4_mark_inode_dirty(handle, inode); 4637 ext4_mark_inode_dirty(handle, inode);
4589 ret2 = ext4_journal_stop(handle); 4638 if (credits)
4590 if (ret <= 0 || ret2 ) 4639 ret2 = ext4_journal_stop(handle);
4640 if (ret <= 0 || ret2)
4591 break; 4641 break;
4592 } 4642 }
4643 if (!credits)
4644 ret2 = ext4_journal_stop(handle);
4593 return ret > 0 ? ret2 : ret; 4645 return ret > 0 ? ret2 : ret;
4594} 4646}
4595 4647
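In the reworked conversion loop, credits doubles as the mode switch: a caller-supplied reserved handle means credits == 0 and the whole loop runs inside one transaction, while credits > 0 makes each extent conversion start and stop its own small transaction. A compile-and-run sketch of that control flow; struct handle and journal_start()/journal_stop() are stubs here, not the jbd2 API:

#include <stdio.h>

struct handle { int unused; };

static struct handle global_handle;

/* Stubs standing in for ext4_journal_start()/ext4_journal_stop(). */
static struct handle *journal_start(int credits)
{
        printf("start transaction (credits=%d)\n", credits);
        return &global_handle;
}

static void journal_stop(struct handle *h)
{
        (void)h;
        printf("stop transaction\n");
}

static void convert_extents(struct handle *handle, int nr_extents)
{
        /* credits == 0 means "caller reserved a transaction for us". */
        int credits = handle ? 0 : 4;
        int i;

        for (i = 0; i < nr_extents; i++) {
                if (credits)
                        handle = journal_start(credits);
                printf("convert extent %d\n", i);
                if (credits)
                        journal_stop(handle);
        }
        if (!credits)
                journal_stop(handle);
}

int main(void)
{
        puts("-- with a reserved handle --");
        convert_extents(&global_handle, 2);
        puts("-- without a reserved handle --");
        convert_extents(NULL, 2);
        return 0;
}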
@@ -4659,7 +4711,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
4659 error = ext4_get_inode_loc(inode, &iloc); 4711 error = ext4_get_inode_loc(inode, &iloc);
4660 if (error) 4712 if (error)
4661 return error; 4713 return error;
4662 physical = iloc.bh->b_blocknr << blockbits; 4714 physical = (__u64)iloc.bh->b_blocknr << blockbits;
4663 offset = EXT4_GOOD_OLD_INODE_SIZE + 4715 offset = EXT4_GOOD_OLD_INODE_SIZE +
4664 EXT4_I(inode)->i_extra_isize; 4716 EXT4_I(inode)->i_extra_isize;
4665 physical += offset; 4717 physical += offset;
@@ -4667,7 +4719,7 @@ static int ext4_xattr_fiemap(struct inode *inode,
4667 flags |= FIEMAP_EXTENT_DATA_INLINE; 4719 flags |= FIEMAP_EXTENT_DATA_INLINE;
4668 brelse(iloc.bh); 4720 brelse(iloc.bh);
4669 } else { /* external block */ 4721 } else { /* external block */
4670 physical = EXT4_I(inode)->i_file_acl << blockbits; 4722 physical = (__u64)EXT4_I(inode)->i_file_acl << blockbits;
4671 length = inode->i_sb->s_blocksize; 4723 length = inode->i_sb->s_blocksize;
4672 } 4724 }
4673 4725
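Both fiemap fixes above cast the 32-bit block number to __u64 before shifting by the block-size bits; without the cast the shift happens in 32-bit arithmetic and the high bits of the byte offset are lost on large filesystems. A small demonstration, kept in unsigned arithmetic so the truncation wraps instead of being undefined:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int blocknr = 0x00300000;      /* 12 GiB offset at 4 KiB blocks */
        unsigned int blockbits = 12;

        /* Shift done in 32 bits, then widened: high bits already gone. */
        uint64_t truncated = blocknr << blockbits;
        /* Widen first, then shift: the full byte offset survives. */
        uint64_t correct = (uint64_t)blocknr << blockbits;

        printf("truncated = 0x%" PRIx64 "\n", truncated);  /* 0x0 */
        printf("correct   = 0x%" PRIx64 "\n", correct);    /* 0x300000000 */
        return 0;
}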
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index e6941e622d31..91cb110da1b4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -10,6 +10,7 @@
10 * Ext4 extents status tree core functions. 10 * Ext4 extents status tree core functions.
11 */ 11 */
12#include <linux/rbtree.h> 12#include <linux/rbtree.h>
13#include <linux/list_sort.h>
13#include "ext4.h" 14#include "ext4.h"
14#include "extents_status.h" 15#include "extents_status.h"
15#include "ext4_extents.h" 16#include "ext4_extents.h"
@@ -147,6 +148,8 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
147 ext4_lblk_t end); 148 ext4_lblk_t end);
148static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, 149static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
149 int nr_to_scan); 150 int nr_to_scan);
151static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
152 struct ext4_inode_info *locked_ei);
150 153
151int __init ext4_init_es(void) 154int __init ext4_init_es(void)
152{ 155{
@@ -291,7 +294,6 @@ out:
291 294
292 read_unlock(&EXT4_I(inode)->i_es_lock); 295 read_unlock(&EXT4_I(inode)->i_es_lock);
293 296
294 ext4_es_lru_add(inode);
295 trace_ext4_es_find_delayed_extent_range_exit(inode, es); 297 trace_ext4_es_find_delayed_extent_range_exit(inode, es);
296} 298}
297 299
@@ -439,7 +441,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
439 */ 441 */
440 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) { 442 if (!ext4_es_is_written(es) && !ext4_es_is_unwritten(es)) {
441 if (in_range(es->es_lblk, ee_block, ee_len)) { 443 if (in_range(es->es_lblk, ee_block, ee_len)) {
442 pr_warn("ES insert assertation failed for " 444 pr_warn("ES insert assertion failed for "
443 "inode: %lu we can find an extent " 445 "inode: %lu we can find an extent "
444 "at block [%d/%d/%llu/%c], but we " 446 "at block [%d/%d/%llu/%c], but we "
445 "want to add an delayed/hole extent " 447 "want to add an delayed/hole extent "
@@ -458,7 +460,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
458 */ 460 */
459 if (es->es_lblk < ee_block || 461 if (es->es_lblk < ee_block ||
460 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) { 462 ext4_es_pblock(es) != ee_start + es->es_lblk - ee_block) {
461 pr_warn("ES insert assertation failed for inode: %lu " 463 pr_warn("ES insert assertion failed for inode: %lu "
462 "ex_status [%d/%d/%llu/%c] != " 464 "ex_status [%d/%d/%llu/%c] != "
463 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 465 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
464 ee_block, ee_len, ee_start, 466 ee_block, ee_len, ee_start,
@@ -468,7 +470,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
468 } 470 }
469 471
470 if (ee_status ^ es_status) { 472 if (ee_status ^ es_status) {
471 pr_warn("ES insert assertation failed for inode: %lu " 473 pr_warn("ES insert assertion failed for inode: %lu "
472 "ex_status [%d/%d/%llu/%c] != " 474 "ex_status [%d/%d/%llu/%c] != "
473 "es_status [%d/%d/%llu/%c]\n", inode->i_ino, 475 "es_status [%d/%d/%llu/%c]\n", inode->i_ino,
474 ee_block, ee_len, ee_start, 476 ee_block, ee_len, ee_start,
@@ -481,7 +483,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode,
481 * that we don't want to add a written/unwritten extent. 483
482 */ 484 */
483 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) { 485 if (!ext4_es_is_delayed(es) && !ext4_es_is_hole(es)) {
484 pr_warn("ES insert assertation failed for inode: %lu " 486 pr_warn("ES insert assertion failed for inode: %lu "
485 "can't find an extent at block %d but we want " 487 "can't find an extent at block %d but we want "
486 "to add an written/unwritten extent " 488 "to add an written/unwritten extent "
487 "[%d/%d/%llu/%llx]\n", inode->i_ino, 489 "[%d/%d/%llu/%llx]\n", inode->i_ino,
@@ -519,7 +521,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
519 * We want to add a delayed/hole extent but this 521 * We want to add a delayed/hole extent but this
520 * block has been allocated. 522 * block has been allocated.
521 */ 523 */
522 pr_warn("ES insert assertation failed for inode: %lu " 524 pr_warn("ES insert assertion failed for inode: %lu "
523 "We can find blocks but we want to add a " 525 "We can find blocks but we want to add a "
524 "delayed/hole extent [%d/%d/%llu/%llx]\n", 526 "delayed/hole extent [%d/%d/%llu/%llx]\n",
525 inode->i_ino, es->es_lblk, es->es_len, 527 inode->i_ino, es->es_lblk, es->es_len,
@@ -527,13 +529,13 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
527 return; 529 return;
528 } else if (ext4_es_is_written(es)) { 530 } else if (ext4_es_is_written(es)) {
529 if (retval != es->es_len) { 531 if (retval != es->es_len) {
530 pr_warn("ES insert assertation failed for " 532 pr_warn("ES insert assertion failed for "
531 "inode: %lu retval %d != es_len %d\n", 533 "inode: %lu retval %d != es_len %d\n",
532 inode->i_ino, retval, es->es_len); 534 inode->i_ino, retval, es->es_len);
533 return; 535 return;
534 } 536 }
535 if (map.m_pblk != ext4_es_pblock(es)) { 537 if (map.m_pblk != ext4_es_pblock(es)) {
536 pr_warn("ES insert assertation failed for " 538 pr_warn("ES insert assertion failed for "
537 "inode: %lu m_pblk %llu != " 539 "inode: %lu m_pblk %llu != "
538 "es_pblk %llu\n", 540 "es_pblk %llu\n",
539 inode->i_ino, map.m_pblk, 541 inode->i_ino, map.m_pblk,
@@ -549,7 +551,7 @@ static void ext4_es_insert_extent_ind_check(struct inode *inode,
549 } 551 }
550 } else if (retval == 0) { 552 } else if (retval == 0) {
551 if (ext4_es_is_written(es)) { 553 if (ext4_es_is_written(es)) {
552 pr_warn("ES insert assertation failed for inode: %lu " 554 pr_warn("ES insert assertion failed for inode: %lu "
553 "We can't find the block but we want to add " 555 "We can't find the block but we want to add "
554 "an written extent [%d/%d/%llu/%llx]\n", 556 "an written extent [%d/%d/%llu/%llx]\n",
555 inode->i_ino, es->es_lblk, es->es_len, 557 inode->i_ino, es->es_lblk, es->es_len,
@@ -632,10 +634,8 @@ out:
632} 634}
633 635
634/* 636/*
635 * ext4_es_insert_extent() adds a space to a extent status tree. 637 * ext4_es_insert_extent() adds information to an inode's extent
636 * 638 * status tree.
637 * ext4_es_insert_extent is called by ext4_da_write_begin and
638 * ext4_es_remove_extent.
639 * 639 *
640 * Return 0 on success, error code on failure. 640 * Return 0 on success, error code on failure.
641 */ 641 */
@@ -667,12 +667,17 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
667 err = __es_remove_extent(inode, lblk, end); 667 err = __es_remove_extent(inode, lblk, end);
668 if (err != 0) 668 if (err != 0)
669 goto error; 669 goto error;
670retry:
670 err = __es_insert_extent(inode, &newes); 671 err = __es_insert_extent(inode, &newes);
672 if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
673 EXT4_I(inode)))
674 goto retry;
675 if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
676 err = 0;
671 677
672error: 678error:
673 write_unlock(&EXT4_I(inode)->i_es_lock); 679 write_unlock(&EXT4_I(inode)->i_es_lock);
674 680
675 ext4_es_lru_add(inode);
676 ext4_es_print_tree(inode); 681 ext4_es_print_tree(inode);
677 682
678 return err; 683 return err;
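The insert path above degrades gracefully under memory pressure: it first tries to make room by shrinking this filesystem's own extent-status cache, and if that fails it swallows -ENOMEM for written/unwritten entries, which only mirror on-disk state and can be re-read later, while the error stays fatal for delayed entries, which exist nowhere else. A hedged sketch of that policy with stand-in types and functions:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct extent { bool delayed; };

/* Stubs for __es_insert_extent() and __ext4_es_shrink(). */
static int try_insert(const struct extent *e, bool low_memory)
{
        (void)e;
        return low_memory ? -ENOMEM : 0;
}

static bool shrink_cache(void)
{
        return false;           /* pretend nothing could be reclaimed */
}

static int insert_extent(const struct extent *e, bool low_memory)
{
        int err;

retry:
        err = try_insert(e, low_memory);
        if (err == -ENOMEM && shrink_cache())
                goto retry;
        /* An entry that mirrors on-disk state may simply be dropped. */
        if (err == -ENOMEM && !e->delayed)
                err = 0;
        return err;
}

int main(void)
{
        struct extent written = { .delayed = false };
        struct extent delayed = { .delayed = true };

        printf("written: %d\n", insert_extent(&written, true));  /* 0 */
        printf("delayed: %d\n", insert_extent(&delayed, true));  /* -12 */
        return 0;
}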
@@ -734,7 +739,6 @@ out:
734 739
735 read_unlock(&EXT4_I(inode)->i_es_lock); 740 read_unlock(&EXT4_I(inode)->i_es_lock);
736 741
737 ext4_es_lru_add(inode);
738 trace_ext4_es_lookup_extent_exit(inode, es, found); 742 trace_ext4_es_lookup_extent_exit(inode, es, found);
739 return found; 743 return found;
740} 744}
@@ -748,8 +752,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
748 struct extent_status orig_es; 752 struct extent_status orig_es;
749 ext4_lblk_t len1, len2; 753 ext4_lblk_t len1, len2;
750 ext4_fsblk_t block; 754 ext4_fsblk_t block;
751 int err = 0; 755 int err;
752 756
757retry:
758 err = 0;
753 es = __es_tree_search(&tree->root, lblk); 759 es = __es_tree_search(&tree->root, lblk);
754 if (!es) 760 if (!es)
755 goto out; 761 goto out;
@@ -784,6 +790,10 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
784 if (err) { 790 if (err) {
785 es->es_lblk = orig_es.es_lblk; 791 es->es_lblk = orig_es.es_lblk;
786 es->es_len = orig_es.es_len; 792 es->es_len = orig_es.es_len;
793 if ((err == -ENOMEM) &&
794 __ext4_es_shrink(EXT4_SB(inode->i_sb), 1,
795 EXT4_I(inode)))
796 goto retry;
787 goto out; 797 goto out;
788 } 798 }
789 } else { 799 } else {
@@ -878,38 +888,64 @@ int ext4_es_zeroout(struct inode *inode, struct ext4_extent *ex)
878 EXTENT_STATUS_WRITTEN); 888 EXTENT_STATUS_WRITTEN);
879} 889}
880 890
881static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc) 891static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a,
892 struct list_head *b)
893{
894 struct ext4_inode_info *eia, *eib;
895 eia = list_entry(a, struct ext4_inode_info, i_es_lru);
896 eib = list_entry(b, struct ext4_inode_info, i_es_lru);
897
898 if (eia->i_touch_when == eib->i_touch_when)
899 return 0;
900 if (time_after(eia->i_touch_when, eib->i_touch_when))
901 return 1;
902 else
903 return -1;
904}
905
906static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
907 struct ext4_inode_info *locked_ei)
882{ 908{
883 struct ext4_sb_info *sbi = container_of(shrink,
884 struct ext4_sb_info, s_es_shrinker);
885 struct ext4_inode_info *ei; 909 struct ext4_inode_info *ei;
886 struct list_head *cur, *tmp, scanned; 910 struct list_head *cur, *tmp;
887 int nr_to_scan = sc->nr_to_scan; 911 LIST_HEAD(skipped);
888 int ret, nr_shrunk = 0; 912 int ret, nr_shrunk = 0;
889 913
890 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 914 spin_lock(&sbi->s_es_lru_lock);
891 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
892
893 if (!nr_to_scan)
894 return ret;
895 915
896 INIT_LIST_HEAD(&scanned); 916 /*
917 * If the inode at the head of the LRU list is newer than the
918 * last_sorted time, the list needs to be re-sorted.
919 */
920 ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru);
921 if (sbi->s_es_last_sorted < ei->i_touch_when) {
922 list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp);
923 sbi->s_es_last_sorted = jiffies;
924 }
897 925
898 spin_lock(&sbi->s_es_lru_lock);
899 list_for_each_safe(cur, tmp, &sbi->s_es_lru) { 926 list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
900 list_move_tail(cur, &scanned); 927 /*
928 * If we have already reclaimed all extents from the extent
929 * status tree, just stop the loop immediately.
930 */
931 if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0)
932 break;
901 933
902 ei = list_entry(cur, struct ext4_inode_info, i_es_lru); 934 ei = list_entry(cur, struct ext4_inode_info, i_es_lru);
903 935
904 read_lock(&ei->i_es_lock); 936 /* Skip the inode that is newer than the last_sorted time */
905 if (ei->i_es_lru_nr == 0) { 937 if (sbi->s_es_last_sorted < ei->i_touch_when) {
906 read_unlock(&ei->i_es_lock); 938 list_move_tail(cur, &skipped);
907 continue; 939 continue;
908 } 940 }
909 read_unlock(&ei->i_es_lock); 941
942 if (ei->i_es_lru_nr == 0 || ei == locked_ei)
943 continue;
910 944
911 write_lock(&ei->i_es_lock); 945 write_lock(&ei->i_es_lock);
912 ret = __es_try_to_reclaim_extents(ei, nr_to_scan); 946 ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
947 if (ei->i_es_lru_nr == 0)
948 list_del_init(&ei->i_es_lru);
913 write_unlock(&ei->i_es_lock); 949 write_unlock(&ei->i_es_lock);
914 950
915 nr_shrunk += ret; 951 nr_shrunk += ret;
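The comparator above orders inodes by i_touch_when with time_after(), which compares jiffies through a signed difference so that counter wraparound does not invert the ordering the way a plain less-than would. A userspace rendering of the idea; time_after here is a local macro mirroring the kernel's definition:

#include <stdio.h>

/* Wraparound-safe "a is later than b", the kernel time_after() trick. */
#define time_after(a, b)  ((long)((b) - (a)) < 0)

static int touch_time_cmp(unsigned long a, unsigned long b)
{
        if (a == b)
                return 0;
        return time_after(a, b) ? 1 : -1;
}

int main(void)
{
        unsigned long before_wrap = (unsigned long)-10;
        unsigned long after_wrap = 5;

        /* A plain '<' would call after_wrap older; the signed difference
         * correctly treats it as ~15 ticks later. */
        printf("cmp = %d\n", touch_time_cmp(after_wrap, before_wrap));
        return 0;
}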
@@ -917,29 +953,50 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
917 if (nr_to_scan == 0) 953 if (nr_to_scan == 0)
918 break; 954 break;
919 } 955 }
920 list_splice_tail(&scanned, &sbi->s_es_lru); 956
957 /* Move the newer inodes into the tail of the LRU list. */
958 list_splice_tail(&skipped, &sbi->s_es_lru);
921 spin_unlock(&sbi->s_es_lru_lock); 959 spin_unlock(&sbi->s_es_lru_lock);
922 960
961 if (locked_ei && nr_shrunk == 0)
962 nr_shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
963
964 return nr_shrunk;
965}
966
967static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
968{
969 struct ext4_sb_info *sbi = container_of(shrink,
970 struct ext4_sb_info, s_es_shrinker);
971 int nr_to_scan = sc->nr_to_scan;
972 int ret, nr_shrunk;
973
974 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
975 trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret);
976
977 if (!nr_to_scan)
978 return ret;
979
980 nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
981
923 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); 982 ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
924 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); 983 trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
925 return ret; 984 return ret;
926} 985}
927 986
928void ext4_es_register_shrinker(struct super_block *sb) 987void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
929{ 988{
930 struct ext4_sb_info *sbi;
931
932 sbi = EXT4_SB(sb);
933 INIT_LIST_HEAD(&sbi->s_es_lru); 989 INIT_LIST_HEAD(&sbi->s_es_lru);
934 spin_lock_init(&sbi->s_es_lru_lock); 990 spin_lock_init(&sbi->s_es_lru_lock);
991 sbi->s_es_last_sorted = 0;
935 sbi->s_es_shrinker.shrink = ext4_es_shrink; 992 sbi->s_es_shrinker.shrink = ext4_es_shrink;
936 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; 993 sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
937 register_shrinker(&sbi->s_es_shrinker); 994 register_shrinker(&sbi->s_es_shrinker);
938} 995}
939 996
940void ext4_es_unregister_shrinker(struct super_block *sb) 997void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi)
941{ 998{
942 unregister_shrinker(&EXT4_SB(sb)->s_es_shrinker); 999 unregister_shrinker(&sbi->s_es_shrinker);
943} 1000}
944 1001
945void ext4_es_lru_add(struct inode *inode) 1002void ext4_es_lru_add(struct inode *inode)
@@ -947,11 +1004,14 @@ void ext4_es_lru_add(struct inode *inode)
947 struct ext4_inode_info *ei = EXT4_I(inode); 1004 struct ext4_inode_info *ei = EXT4_I(inode);
948 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1005 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
949 1006
1007 ei->i_touch_when = jiffies;
1008
1009 if (!list_empty(&ei->i_es_lru))
1010 return;
1011
950 spin_lock(&sbi->s_es_lru_lock); 1012 spin_lock(&sbi->s_es_lru_lock);
951 if (list_empty(&ei->i_es_lru)) 1013 if (list_empty(&ei->i_es_lru))
952 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); 1014 list_add_tail(&ei->i_es_lru, &sbi->s_es_lru);
953 else
954 list_move_tail(&ei->i_es_lru, &sbi->s_es_lru);
955 spin_unlock(&sbi->s_es_lru_lock); 1015 spin_unlock(&sbi->s_es_lru_lock);
956} 1016}
957 1017
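ext4_es_lru_add() now takes an unlocked fast path: if the inode already sits on the LRU list nothing is done, and s_es_lru_lock is only taken, with the emptiness check repeated underneath it, when an insertion looks necessary. A sketch of this check-then-recheck pattern using pthreads; the boolean stands in for the list membership test, and this is not the kernel list API (link with -lpthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
static bool on_lru;

static void lru_add(void)
{
        /* Unlocked fast path: already queued, nothing to do. */
        if (on_lru)
                return;

        pthread_mutex_lock(&lru_lock);
        /* Recheck under the lock: another thread may have raced us. */
        if (!on_lru) {
                on_lru = true;
                puts("added to LRU");
        }
        pthread_mutex_unlock(&lru_lock);
}

int main(void)
{
        lru_add();      /* slow path: takes the lock and adds */
        lru_add();      /* fast path: no lock taken */
        return 0;
}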
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index f740eb03b707..e936730cc5b0 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -39,6 +39,7 @@
39 EXTENT_STATUS_DELAYED | \ 39 EXTENT_STATUS_DELAYED | \
40 EXTENT_STATUS_HOLE) 40 EXTENT_STATUS_HOLE)
41 41
42struct ext4_sb_info;
42struct ext4_extent; 43struct ext4_extent;
43 44
44struct extent_status { 45struct extent_status {
@@ -119,8 +120,8 @@ static inline void ext4_es_store_status(struct extent_status *es,
119 es->es_pblk = block; 120 es->es_pblk = block;
120} 121}
121 122
122extern void ext4_es_register_shrinker(struct super_block *sb); 123extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi);
123extern void ext4_es_unregister_shrinker(struct super_block *sb); 124extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi);
124extern void ext4_es_lru_add(struct inode *inode); 125extern void ext4_es_lru_add(struct inode *inode);
125extern void ext4_es_lru_del(struct inode *inode); 126extern void ext4_es_lru_del(struct inode *inode);
126 127
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index b1b4d51b5d86..6f4cc567c382 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -312,7 +312,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
312 blkbits = inode->i_sb->s_blocksize_bits; 312 blkbits = inode->i_sb->s_blocksize_bits;
313 startoff = *offset; 313 startoff = *offset;
314 lastoff = startoff; 314 lastoff = startoff;
315 endoff = (map->m_lblk + map->m_len) << blkbits; 315 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
316 316
317 index = startoff >> PAGE_CACHE_SHIFT; 317 index = startoff >> PAGE_CACHE_SHIFT;
318 end = endoff >> PAGE_CACHE_SHIFT; 318 end = endoff >> PAGE_CACHE_SHIFT;
@@ -457,7 +457,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
457 ret = ext4_map_blocks(NULL, inode, &map, 0); 457 ret = ext4_map_blocks(NULL, inode, &map, 0);
458 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 458 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
459 if (last != start) 459 if (last != start)
460 dataoff = last << blkbits; 460 dataoff = (loff_t)last << blkbits;
461 break; 461 break;
462 } 462 }
463 463
@@ -468,7 +468,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
468 ext4_es_find_delayed_extent_range(inode, last, last, &es); 468 ext4_es_find_delayed_extent_range(inode, last, last, &es);
469 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { 469 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
470 if (last != start) 470 if (last != start)
471 dataoff = last << blkbits; 471 dataoff = (loff_t)last << blkbits;
472 break; 472 break;
473 } 473 }
474 474
@@ -486,7 +486,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
486 } 486 }
487 487
488 last++; 488 last++;
489 dataoff = last << blkbits; 489 dataoff = (loff_t)last << blkbits;
490 } while (last <= end); 490 } while (last <= end);
491 491
492 mutex_unlock(&inode->i_mutex); 492 mutex_unlock(&inode->i_mutex);
@@ -494,17 +494,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
494 if (dataoff > isize) 494 if (dataoff > isize)
495 return -ENXIO; 495 return -ENXIO;
496 496
497 if (dataoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 497 return vfs_setpos(file, dataoff, maxsize);
498 return -EINVAL;
499 if (dataoff > maxsize)
500 return -EINVAL;
501
502 if (dataoff != file->f_pos) {
503 file->f_pos = dataoff;
504 file->f_version = 0;
505 }
506
507 return dataoff;
508} 498}
509 499
510/* 500/*
@@ -540,7 +530,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
540 ret = ext4_map_blocks(NULL, inode, &map, 0); 530 ret = ext4_map_blocks(NULL, inode, &map, 0);
541 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) { 531 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
542 last += ret; 532 last += ret;
543 holeoff = last << blkbits; 533 holeoff = (loff_t)last << blkbits;
544 continue; 534 continue;
545 } 535 }
546 536
@@ -551,7 +541,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
551 ext4_es_find_delayed_extent_range(inode, last, last, &es); 541 ext4_es_find_delayed_extent_range(inode, last, last, &es);
552 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { 542 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
553 last = es.es_lblk + es.es_len; 543 last = es.es_lblk + es.es_len;
554 holeoff = last << blkbits; 544 holeoff = (loff_t)last << blkbits;
555 continue; 545 continue;
556 } 546 }
557 547
@@ -566,7 +556,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
566 &map, &holeoff); 556 &map, &holeoff);
567 if (!unwritten) { 557 if (!unwritten) {
568 last += ret; 558 last += ret;
569 holeoff = last << blkbits; 559 holeoff = (loff_t)last << blkbits;
570 continue; 560 continue;
571 } 561 }
572 } 562 }
@@ -580,17 +570,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
580 if (holeoff > isize) 570 if (holeoff > isize)
581 holeoff = isize; 571 holeoff = isize;
582 572
583 if (holeoff < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET)) 573 return vfs_setpos(file, holeoff, maxsize);
584 return -EINVAL;
585 if (holeoff > maxsize)
586 return -EINVAL;
587
588 if (holeoff != file->f_pos) {
589 file->f_pos = holeoff;
590 file->f_version = 0;
591 }
592
593 return holeoff;
594} 574}
595 575
596/* 576/*
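Both seek helpers now finish through vfs_setpos(), which centralizes the offset validation and the f_pos/f_version update they previously open-coded. The feature they implement is reachable from userspace via lseek(2); a quick probe of the first data and hole offsets in a file (SEEK_DATA and SEEK_HOLE are Linux-specific, and the default path below is just an example):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        const char *path = argc > 1 ? argv[1] : "/etc/hostname";
        int fd = open(path, O_RDONLY);
        off_t data, hole;

        if (fd < 0) {
                perror("open");
                return 1;
        }
        data = lseek(fd, 0, SEEK_DATA);   /* first data at/after offset 0 */
        hole = lseek(fd, 0, SEEK_HOLE);   /* first hole at/after offset 0 */
        printf("first data: %lld, first hole: %lld\n",
               (long long)data, (long long)hole);
        close(fd);
        return 0;
}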
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index e0ba8a408def..a8bc47f75fa0 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -73,32 +73,6 @@ static int ext4_sync_parent(struct inode *inode)
73 return ret; 73 return ret;
74} 74}
75 75
76/**
77 * __sync_file - generic_file_fsync without the locking and filemap_write
78 * @inode: inode to sync
79 * @datasync: only sync essential metadata if true
80 *
81 * This is just generic_file_fsync without the locking. This is needed for
82 * nojournal mode to make sure this inodes data/metadata makes it to disk
83 * properly. The i_mutex should be held already.
84 */
85static int __sync_inode(struct inode *inode, int datasync)
86{
87 int err;
88 int ret;
89
90 ret = sync_mapping_buffers(inode->i_mapping);
91 if (!(inode->i_state & I_DIRTY))
92 return ret;
93 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
94 return ret;
95
96 err = sync_inode_metadata(inode, 1);
97 if (ret == 0)
98 ret = err;
99 return ret;
100}
101
102/* 76/*
103 * akpm: A new design for ext4_sync_file(). 77 * akpm: A new design for ext4_sync_file().
104 * 78 *
@@ -116,7 +90,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
116 struct inode *inode = file->f_mapping->host; 90 struct inode *inode = file->f_mapping->host;
117 struct ext4_inode_info *ei = EXT4_I(inode); 91 struct ext4_inode_info *ei = EXT4_I(inode);
118 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 92 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
119 int ret, err; 93 int ret = 0, err;
120 tid_t commit_tid; 94 tid_t commit_tid;
121 bool needs_barrier = false; 95 bool needs_barrier = false;
122 96
@@ -124,25 +98,24 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
124 98
125 trace_ext4_sync_file_enter(file, datasync); 99 trace_ext4_sync_file_enter(file, datasync);
126 100
127 ret = filemap_write_and_wait_range(inode->i_mapping, start, end); 101 if (inode->i_sb->s_flags & MS_RDONLY) {
128 if (ret) 102 /* Make sure that we read the updated s_mount_flags value */
129 return ret; 103 smp_rmb();
130 mutex_lock(&inode->i_mutex); 104 if (EXT4_SB(inode->i_sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
131 105 ret = -EROFS;
132 if (inode->i_sb->s_flags & MS_RDONLY)
133 goto out;
134
135 ret = ext4_flush_unwritten_io(inode);
136 if (ret < 0)
137 goto out; 106 goto out;
107 }
138 108
139 if (!journal) { 109 if (!journal) {
140 ret = __sync_inode(inode, datasync); 110 ret = generic_file_fsync(file, start, end, datasync);
141 if (!ret && !hlist_empty(&inode->i_dentry)) 111 if (!ret && !hlist_empty(&inode->i_dentry))
142 ret = ext4_sync_parent(inode); 112 ret = ext4_sync_parent(inode);
143 goto out; 113 goto out;
144 } 114 }
145 115
116 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
117 if (ret)
118 return ret;
146 /* 119 /*
147 * data=writeback,ordered: 120 * data=writeback,ordered:
148 * The caller's filemap_fdatawrite()/wait will sync the data. 121 * The caller's filemap_fdatawrite()/wait will sync the data.
@@ -172,8 +145,7 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
172 if (!ret) 145 if (!ret)
173 ret = err; 146 ret = err;
174 } 147 }
175 out: 148out:
176 mutex_unlock(&inode->i_mutex);
177 trace_ext4_sync_file_exit(inode, ret); 149 trace_ext4_sync_file_exit(inode, ret);
178 return ret; 150 return ret;
179} 151}
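The no-journal path now reuses generic_file_fsync() and still calls ext4_sync_parent() so that a freshly created file's directory entry becomes durable along with the file itself. Userspace code that needs the same guarantee follows the analogous recipe: fsync the file, then fsync its parent directory. A minimal example with illustrative paths:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/tmp/newfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);
        int dirfd = open("/tmp", O_RDONLY | O_DIRECTORY);

        if (fd < 0 || dirfd < 0) {
                perror("open");
                return 1;
        }
        if (write(fd, "x", 1) != 1 || fsync(fd) != 0) {
                perror("write/fsync");
                return 1;
        }
        /* Make the directory entry itself durable as well. */
        if (fsync(dirfd) != 0) {
                perror("fsync dir");
                return 1;
        }
        close(fd);
        close(dirfd);
        return 0;
}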
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 00a818d67b54..8bf5999875ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -734,11 +734,8 @@ repeat_in_this_group:
734 ino = ext4_find_next_zero_bit((unsigned long *) 734 ino = ext4_find_next_zero_bit((unsigned long *)
735 inode_bitmap_bh->b_data, 735 inode_bitmap_bh->b_data,
736 EXT4_INODES_PER_GROUP(sb), ino); 736 EXT4_INODES_PER_GROUP(sb), ino);
737 if (ino >= EXT4_INODES_PER_GROUP(sb)) { 737 if (ino >= EXT4_INODES_PER_GROUP(sb))
738 if (++group == ngroups) 738 goto next_group;
739 group = 0;
740 continue;
741 }
742 if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) { 739 if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
743 ext4_error(sb, "reserved inode found cleared - " 740 ext4_error(sb, "reserved inode found cleared - "
744 "inode=%lu", ino + 1); 741 "inode=%lu", ino + 1);
@@ -747,7 +744,8 @@ repeat_in_this_group:
747 if (!handle) { 744 if (!handle) {
748 BUG_ON(nblocks <= 0); 745 BUG_ON(nblocks <= 0);
749 handle = __ext4_journal_start_sb(dir->i_sb, line_no, 746 handle = __ext4_journal_start_sb(dir->i_sb, line_no,
750 handle_type, nblocks); 747 handle_type, nblocks,
748 0);
751 if (IS_ERR(handle)) { 749 if (IS_ERR(handle)) {
752 err = PTR_ERR(handle); 750 err = PTR_ERR(handle);
753 ext4_std_error(sb, err); 751 ext4_std_error(sb, err);
@@ -768,6 +766,9 @@ repeat_in_this_group:
768 goto got; /* we grabbed the inode! */ 766 goto got; /* we grabbed the inode! */
769 if (ino < EXT4_INODES_PER_GROUP(sb)) 767 if (ino < EXT4_INODES_PER_GROUP(sb))
770 goto repeat_in_this_group; 768 goto repeat_in_this_group;
769next_group:
770 if (++group == ngroups)
771 group = 0;
771 } 772 }
772 err = -ENOSPC; 773 err = -ENOSPC;
773 goto out; 774 goto out;
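The ialloc change above routes both "bitmap exhausted" paths through a single next_group label that advances and wraps the group counter. A toy sketch of the same search shape; find_free() and is_reserved() are stand-ins for the bitmap scan and the reserved-inode check, and the skip policy is deliberately simplified:

#include <stdbool.h>
#include <stdio.h>

#define NGROUPS   4
#define PER_GROUP 8

/* Stand-in for ext4_find_next_zero_bit(): slot 29 is the only free one. */
static int find_free(int group)
{
        int ino;

        for (ino = 0; ino < PER_GROUP; ino++)
                if (group * PER_GROUP + ino == 29)
                        return ino;
        return PER_GROUP;               /* nothing free in this group */
}

static bool is_reserved(int group, int ino)
{
        return group == 0 && ino < 2;   /* pretend early slots are reserved */
}

int main(void)
{
        int group = 0, scanned, ino;

        for (scanned = 0; scanned < NGROUPS; scanned++) {
                ino = find_free(group);
                if (ino >= PER_GROUP)
                        goto next_group;        /* bitmap exhausted */
                if (is_reserved(group, ino))
                        goto next_group;        /* skip reserved slots */
                printf("got %d:%d\n", group, ino);
                return 0;
next_group:
                if (++group == NGROUPS)
                        group = 0;              /* wrap to the first group */
        }
        puts("ENOSPC");
        return 1;
}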
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index b8d5d351e24f..87b30cd357e7 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -624,7 +624,7 @@ cleanup:
624 partial--; 624 partial--;
625 } 625 }
626out: 626out:
627 trace_ext4_ind_map_blocks_exit(inode, map, err); 627 trace_ext4_ind_map_blocks_exit(inode, flags, map, err);
628 return err; 628 return err;
629} 629}
630 630
@@ -675,11 +675,6 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb,
675 675
676retry: 676retry:
677 if (rw == READ && ext4_should_dioread_nolock(inode)) { 677 if (rw == READ && ext4_should_dioread_nolock(inode)) {
678 if (unlikely(atomic_read(&EXT4_I(inode)->i_unwritten))) {
679 mutex_lock(&inode->i_mutex);
680 ext4_flush_unwritten_io(inode);
681 mutex_unlock(&inode->i_mutex);
682 }
683 /* 678 /*
684 * Nolock dioread optimization may be dynamically disabled 679 * Nolock dioread optimization may be dynamically disabled
685 * via ext4_inode_block_unlocked_dio(). Check inode's state 680 * via ext4_inode_block_unlocked_dio(). Check inode's state
@@ -779,27 +774,18 @@ int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock)
779 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; 774 return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1;
780} 775}
781 776
782int ext4_ind_trans_blocks(struct inode *inode, int nrblocks, int chunk) 777/*
778 * Calculate number of indirect blocks touched by mapping @nrblocks logically
779 * contiguous blocks
780 */
781int ext4_ind_trans_blocks(struct inode *inode, int nrblocks)
783{ 782{
784 int indirects;
785
786 /* if nrblocks are contiguous */
787 if (chunk) {
788 /*
789 * With N contiguous data blocks, we need at most
790 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
791 * 2 dindirect blocks, and 1 tindirect block
792 */
793 return DIV_ROUND_UP(nrblocks,
794 EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
795 }
796 /* 783 /*
797 * if nrblocks are not contiguous, worse case, each block touch 784 * With N contiguous data blocks, we need at most
798 * a indirect block, and each indirect block touch a double indirect 785 * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
799 * block, plus a triple indirect block 786 * 2 dindirect blocks, and 1 tindirect block
800 */ 787 */
801 indirects = nrblocks * 2 + 1; 788 return DIV_ROUND_UP(nrblocks, EXT4_ADDR_PER_BLOCK(inode->i_sb)) + 4;
802 return indirects;
803} 789}
804 790
805/* 791/*
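The simplified ext4_ind_trans_blocks() keeps only the contiguous case: N contiguous data blocks touch at most N/addrs-per-block + 1 indirect blocks, plus 2 double-indirect and 1 triple-indirect block, which the code bounds as DIV_ROUND_UP(N, addrs) + 4. A quick check of the arithmetic for 4 KiB blocks (1024 four-byte entries per indirect block):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        int addrs_per_block = 1024;     /* 4096-byte block / 4-byte entry */
        int nrblocks;

        for (nrblocks = 1; nrblocks <= 4096; nrblocks *= 16)
                printf("%4d contiguous blocks -> at most %d metadata blocks\n",
                       nrblocks,
                       DIV_ROUND_UP(nrblocks, addrs_per_block) + 4);
        return 0;
}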
@@ -940,11 +926,13 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
940 __le32 *last) 926 __le32 *last)
941{ 927{
942 __le32 *p; 928 __le32 *p;
943 int flags = EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_VALIDATED; 929 int flags = EXT4_FREE_BLOCKS_VALIDATED;
944 int err; 930 int err;
945 931
946 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 932 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
947 flags |= EXT4_FREE_BLOCKS_METADATA; 933 flags |= EXT4_FREE_BLOCKS_FORGET | EXT4_FREE_BLOCKS_METADATA;
934 else if (ext4_should_journal_data(inode))
935 flags |= EXT4_FREE_BLOCKS_FORGET;
948 936
949 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 937 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
950 count)) { 938 count)) {
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3e2bf873e8a8..d9ecbf1113a7 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -72,7 +72,7 @@ static int get_max_inline_xattr_value_size(struct inode *inode,
72 entry = (struct ext4_xattr_entry *) 72 entry = (struct ext4_xattr_entry *)
73 ((void *)raw_inode + EXT4_I(inode)->i_inline_off); 73 ((void *)raw_inode + EXT4_I(inode)->i_inline_off);
74 74
75 free += le32_to_cpu(entry->e_value_size); 75 free += EXT4_XATTR_SIZE(le32_to_cpu(entry->e_value_size));
76 goto out; 76 goto out;
77 } 77 }
78 78
@@ -1404,16 +1404,15 @@ out:
1404 * offset as if '.' and '..' really take place. 1404 * offset as if '.' and '..' really take place.
1405 * 1405 *
1406 */ 1406 */
1407int ext4_read_inline_dir(struct file *filp, 1407int ext4_read_inline_dir(struct file *file,
1408 void *dirent, filldir_t filldir, 1408 struct dir_context *ctx,
1409 int *has_inline_data) 1409 int *has_inline_data)
1410{ 1410{
1411 int error = 0;
1412 unsigned int offset, parent_ino; 1411 unsigned int offset, parent_ino;
1413 int i, stored; 1412 int i;
1414 struct ext4_dir_entry_2 *de; 1413 struct ext4_dir_entry_2 *de;
1415 struct super_block *sb; 1414 struct super_block *sb;
1416 struct inode *inode = file_inode(filp); 1415 struct inode *inode = file_inode(file);
1417 int ret, inline_size = 0; 1416 int ret, inline_size = 0;
1418 struct ext4_iloc iloc; 1417 struct ext4_iloc iloc;
1419 void *dir_buf = NULL; 1418 void *dir_buf = NULL;
@@ -1444,9 +1443,8 @@ int ext4_read_inline_dir(struct file *filp,
1444 goto out; 1443 goto out;
1445 1444
1446 sb = inode->i_sb; 1445 sb = inode->i_sb;
1447 stored = 0;
1448 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode); 1446 parent_ino = le32_to_cpu(((struct ext4_dir_entry_2 *)dir_buf)->inode);
1449 offset = filp->f_pos; 1447 offset = ctx->pos;
1450 1448
1451 /* 1449 /*
1452 * dotdot_offset and dotdot_size are the real offset and 1450
@@ -1460,104 +1458,74 @@ int ext4_read_inline_dir(struct file *filp,
1460 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; 1458 extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE;
1461 extra_size = extra_offset + inline_size; 1459 extra_size = extra_offset + inline_size;
1462 1460
1463 while (!error && !stored && filp->f_pos < extra_size) { 1461 /*
1464revalidate: 1462 * If the version has changed since the last call to
1465 /* 1463 * readdir(2), then we might be pointing to an invalid
1466 * If the version has changed since the last call to 1464 * dirent right now. Scan from the start of the inline
1467 * readdir(2), then we might be pointing to an invalid 1465 * dir to make sure.
1468 * dirent right now. Scan from the start of the inline 1466 */
1469 * dir to make sure. 1467 if (file->f_version != inode->i_version) {
1470 */ 1468 for (i = 0; i < extra_size && i < offset;) {
1471 if (filp->f_version != inode->i_version) { 1469 /*
1472 for (i = 0; i < extra_size && i < offset;) { 1470 * "." is with offset 0 and
1473 /* 1471 * ".." is dotdot_offset.
1474 * "." is with offset 0 and 1472 */
1475 * ".." is dotdot_offset. 1473 if (!i) {
1476 */ 1474 i = dotdot_offset;
1477 if (!i) { 1475 continue;
1478 i = dotdot_offset; 1476 } else if (i == dotdot_offset) {
1479 continue; 1477 i = dotdot_size;
1480 } else if (i == dotdot_offset) {
1481 i = dotdot_size;
1482 continue;
1483 }
1484 /* for other entry, the real offset in
1485 * the buf has to be tuned accordingly.
1486 */
1487 de = (struct ext4_dir_entry_2 *)
1488 (dir_buf + i - extra_offset);
1489 /* It's too expensive to do a full
1490 * dirent test each time round this
1491 * loop, but we do have to test at
1492 * least that it is non-zero. A
1493 * failure will be detected in the
1494 * dirent test below. */
1495 if (ext4_rec_len_from_disk(de->rec_len,
1496 extra_size) < EXT4_DIR_REC_LEN(1))
1497 break;
1498 i += ext4_rec_len_from_disk(de->rec_len,
1499 extra_size);
1500 }
1501 offset = i;
1502 filp->f_pos = offset;
1503 filp->f_version = inode->i_version;
1504 }
1505
1506 while (!error && filp->f_pos < extra_size) {
1507 if (filp->f_pos == 0) {
1508 error = filldir(dirent, ".", 1, 0, inode->i_ino,
1509 DT_DIR);
1510 if (error)
1511 break;
1512 stored++;
1513 filp->f_pos = dotdot_offset;
1514 continue; 1478 continue;
1515 } 1479 }
1480 /* for other entry, the real offset in
1481 * the buf has to be tuned accordingly.
1482 */
1483 de = (struct ext4_dir_entry_2 *)
1484 (dir_buf + i - extra_offset);
1485 /* It's too expensive to do a full
1486 * dirent test each time round this
1487 * loop, but we do have to test at
1488 * least that it is non-zero. A
1489 * failure will be detected in the
1490 * dirent test below. */
1491 if (ext4_rec_len_from_disk(de->rec_len, extra_size)
1492 < EXT4_DIR_REC_LEN(1))
1493 break;
1494 i += ext4_rec_len_from_disk(de->rec_len,
1495 extra_size);
1496 }
1497 offset = i;
1498 ctx->pos = offset;
1499 file->f_version = inode->i_version;
1500 }
1516 1501
1517 if (filp->f_pos == dotdot_offset) { 1502 while (ctx->pos < extra_size) {
1518 error = filldir(dirent, "..", 2, 1503 if (ctx->pos == 0) {
1519 dotdot_offset, 1504 if (!dir_emit(ctx, ".", 1, inode->i_ino, DT_DIR))
1520 parent_ino, DT_DIR); 1505 goto out;
1521 if (error) 1506 ctx->pos = dotdot_offset;
1522 break; 1507 continue;
1523 stored++; 1508 }
1524 1509
1525 filp->f_pos = dotdot_size; 1510 if (ctx->pos == dotdot_offset) {
1526 continue; 1511 if (!dir_emit(ctx, "..", 2, parent_ino, DT_DIR))
1527 } 1512 goto out;
1513 ctx->pos = dotdot_size;
1514 continue;
1515 }
1528 1516
1529 de = (struct ext4_dir_entry_2 *) 1517 de = (struct ext4_dir_entry_2 *)
1530 (dir_buf + filp->f_pos - extra_offset); 1518 (dir_buf + ctx->pos - extra_offset);
1531 if (ext4_check_dir_entry(inode, filp, de, 1519 if (ext4_check_dir_entry(inode, file, de, iloc.bh, dir_buf,
1532 iloc.bh, dir_buf, 1520 extra_size, ctx->pos))
1533 extra_size, filp->f_pos)) { 1521 goto out;
1534 ret = stored; 1522 if (le32_to_cpu(de->inode)) {
1523 if (!dir_emit(ctx, de->name, de->name_len,
1524 le32_to_cpu(de->inode),
1525 get_dtype(sb, de->file_type)))
1535 goto out; 1526 goto out;
1536 }
1537 if (le32_to_cpu(de->inode)) {
1538 /* We might block in the next section
1539 * if the data destination is
1540 * currently swapped out. So, use a
1541 * version stamp to detect whether or
1542 * not the directory has been modified
1543 * during the copy operation.
1544 */
1545 u64 version = filp->f_version;
1546
1547 error = filldir(dirent, de->name,
1548 de->name_len,
1549 filp->f_pos,
1550 le32_to_cpu(de->inode),
1551 get_dtype(sb, de->file_type));
1552 if (error)
1553 break;
1554 if (version != filp->f_version)
1555 goto revalidate;
1556 stored++;
1557 }
1558 filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
1559 extra_size);
1560 } 1527 }
1528 ctx->pos += ext4_rec_len_from_disk(de->rec_len, extra_size);
1561 } 1529 }
1562out: 1530out:
1563 kfree(dir_buf); 1531 kfree(dir_buf);
@@ -1842,7 +1810,7 @@ int ext4_inline_data_fiemap(struct inode *inode,
1842 if (error) 1810 if (error)
1843 goto out; 1811 goto out;
1844 1812
1845 physical = iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; 1813 physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits;
1846 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; 1814 physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data;
1847 physical += offsetof(struct ext4_inode, i_block); 1815 physical += offsetof(struct ext4_inode, i_block);
1848 length = i_size_read(inode); 1816 length = i_size_read(inode);
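The inline-dir rewrite above follows the VFS-wide move from filldir callbacks to the dir_context cursor: the position lives in ctx->pos, and dir_emit() returns false once the caller's buffer is full, so the iterator simply stops and later resumes from the saved position. A userspace sketch of that cursor shape; struct dir_context and dir_emit() are local stand-ins, not the VFS definitions:

#include <stdbool.h>
#include <stdio.h>

struct dir_context {
        long pos;       /* cursor the iterator advances */
        int room;       /* toy stand-in for the caller's buffer space */
};

/* Toy dir_emit(): report one name, fail once the buffer is full. */
static bool dir_emit(struct dir_context *ctx, const char *name)
{
        if (ctx->room-- <= 0)
                return false;
        printf("pos %ld: %s\n", ctx->pos, name);
        return true;
}

static void iterate_dir(struct dir_context *ctx)
{
        static const char *names[] = { ".", "..", "a", "b", "c" };
        long n = sizeof(names) / sizeof(names[0]);

        while (ctx->pos < n) {
                if (!dir_emit(ctx, names[ctx->pos]))
                        return;         /* resume later from ctx->pos */
                ctx->pos++;
        }
}

int main(void)
{
        struct dir_context ctx = { .pos = 0, .room = 3 };

        iterate_dir(&ctx);              /* emits ".", "..", "a" */
        ctx.room = 3;
        iterate_dir(&ctx);              /* resumes at "b" and "c" */
        return 0;
}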
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d6382b89ecbd..dd32a2eacd0d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -132,12 +132,12 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
132 new_size); 132 new_size);
133} 133}
134 134
135static void ext4_invalidatepage(struct page *page, unsigned long offset); 135static void ext4_invalidatepage(struct page *page, unsigned int offset,
136 unsigned int length);
136static int __ext4_journalled_writepage(struct page *page, unsigned int len); 137static int __ext4_journalled_writepage(struct page *page, unsigned int len);
137static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh); 138static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh);
138static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 139static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
139 struct inode *inode, struct page *page, loff_t from, 140 int pextents);
140 loff_t length, int flags);
141 141
142/* 142/*
143 * Test whether an inode is a fast symlink. 143 * Test whether an inode is a fast symlink.
@@ -215,7 +215,8 @@ void ext4_evict_inode(struct inode *inode)
215 filemap_write_and_wait(&inode->i_data); 215 filemap_write_and_wait(&inode->i_data);
216 } 216 }
217 truncate_inode_pages(&inode->i_data, 0); 217 truncate_inode_pages(&inode->i_data, 0);
218 ext4_ioend_shutdown(inode); 218
219 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
219 goto no_delete; 220 goto no_delete;
220 } 221 }
221 222
@@ -225,8 +226,8 @@ void ext4_evict_inode(struct inode *inode)
225 if (ext4_should_order_data(inode)) 226 if (ext4_should_order_data(inode))
226 ext4_begin_ordered_truncate(inode, 0); 227 ext4_begin_ordered_truncate(inode, 0);
227 truncate_inode_pages(&inode->i_data, 0); 228 truncate_inode_pages(&inode->i_data, 0);
228 ext4_ioend_shutdown(inode);
229 229
230 WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count));
230 if (is_bad_inode(inode)) 231 if (is_bad_inode(inode))
231 goto no_delete; 232 goto no_delete;
232 233
@@ -423,66 +424,6 @@ static int __check_block_validity(struct inode *inode, const char *func,
423#define check_block_validity(inode, map) \ 424#define check_block_validity(inode, map) \
424 __check_block_validity((inode), __func__, __LINE__, (map)) 425 __check_block_validity((inode), __func__, __LINE__, (map))
425 426
426/*
427 * Return the number of contiguous dirty pages in a given inode
428 * starting at page frame idx.
429 */
430static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
431 unsigned int max_pages)
432{
433 struct address_space *mapping = inode->i_mapping;
434 pgoff_t index;
435 struct pagevec pvec;
436 pgoff_t num = 0;
437 int i, nr_pages, done = 0;
438
439 if (max_pages == 0)
440 return 0;
441 pagevec_init(&pvec, 0);
442 while (!done) {
443 index = idx;
444 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
445 PAGECACHE_TAG_DIRTY,
446 (pgoff_t)PAGEVEC_SIZE);
447 if (nr_pages == 0)
448 break;
449 for (i = 0; i < nr_pages; i++) {
450 struct page *page = pvec.pages[i];
451 struct buffer_head *bh, *head;
452
453 lock_page(page);
454 if (unlikely(page->mapping != mapping) ||
455 !PageDirty(page) ||
456 PageWriteback(page) ||
457 page->index != idx) {
458 done = 1;
459 unlock_page(page);
460 break;
461 }
462 if (page_has_buffers(page)) {
463 bh = head = page_buffers(page);
464 do {
465 if (!buffer_delay(bh) &&
466 !buffer_unwritten(bh))
467 done = 1;
468 bh = bh->b_this_page;
469 } while (!done && (bh != head));
470 }
471 unlock_page(page);
472 if (done)
473 break;
474 idx++;
475 num++;
476 if (num >= max_pages) {
477 done = 1;
478 break;
479 }
480 }
481 pagevec_release(&pvec);
482 }
483 return num;
484}
485
486#ifdef ES_AGGRESSIVE_TEST 427#ifdef ES_AGGRESSIVE_TEST
487static void ext4_map_blocks_es_recheck(handle_t *handle, 428static void ext4_map_blocks_es_recheck(handle_t *handle,
488 struct inode *inode, 429 struct inode *inode,
@@ -524,7 +465,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
524 if (es_map->m_lblk != map->m_lblk || 465 if (es_map->m_lblk != map->m_lblk ||
525 es_map->m_flags != map->m_flags || 466 es_map->m_flags != map->m_flags ||
526 es_map->m_pblk != map->m_pblk) { 467 es_map->m_pblk != map->m_pblk) {
527 printk("ES cache assertation failed for inode: %lu " 468 printk("ES cache assertion failed for inode: %lu "
528 "es_cached ex [%d/%d/%llu/%x] != " 469 "es_cached ex [%d/%d/%llu/%x] != "
529 "found ex [%d/%d/%llu/%x] retval %d flags %x\n", 470 "found ex [%d/%d/%llu/%x] retval %d flags %x\n",
530 inode->i_ino, es_map->m_lblk, es_map->m_len, 471 inode->i_ino, es_map->m_lblk, es_map->m_len,
@@ -575,6 +516,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
575 516
576 /* Lookup extent status tree firstly */ 517 /* Lookup extent status tree firstly */
577 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { 518 if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
519 ext4_es_lru_add(inode);
578 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { 520 if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) {
579 map->m_pblk = ext4_es_pblock(&es) + 521 map->m_pblk = ext4_es_pblock(&es) +
580 map->m_lblk - es.es_lblk; 522 map->m_lblk - es.es_lblk;
@@ -613,14 +555,13 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
613 int ret; 555 int ret;
614 unsigned long long status; 556 unsigned long long status;
615 557
616#ifdef ES_AGGRESSIVE_TEST 558 if (unlikely(retval != map->m_len)) {
617 if (retval != map->m_len) { 559 ext4_warning(inode->i_sb,
618 printk("ES len assertation failed for inode: %lu " 560 "ES len assertion failed for inode "
619 "retval %d != map->m_len %d " 561 "%lu: retval %d != map->m_len %d",
620 "in %s (lookup)\n", inode->i_ino, retval, 562 inode->i_ino, retval, map->m_len);
621 map->m_len, __func__); 563 WARN_ON(1);
622 } 564 }
623#endif
624 565
625 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 566 status = map->m_flags & EXT4_MAP_UNWRITTEN ?
626 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 567 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -714,14 +655,13 @@ found:
714 int ret; 655 int ret;
715 unsigned long long status; 656 unsigned long long status;
716 657
717#ifdef ES_AGGRESSIVE_TEST 658 if (unlikely(retval != map->m_len)) {
718 if (retval != map->m_len) { 659 ext4_warning(inode->i_sb,
719 printk("ES len assertation failed for inode: %lu " 660 "ES len assertion failed for inode "
720 "retval %d != map->m_len %d " 661 "%lu: retval %d != map->m_len %d",
721 "in %s (allocation)\n", inode->i_ino, retval, 662 inode->i_ino, retval, map->m_len);
722 map->m_len, __func__); 663 WARN_ON(1);
723 } 664 }
724#endif
725 665
726 /* 666 /*
727 * If the extent has been zeroed out, we don't need to update 667 * If the extent has been zeroed out, we don't need to update
@@ -1118,10 +1058,13 @@ static int ext4_write_end(struct file *file,
1118 } 1058 }
1119 } 1059 }
1120 1060
1121 if (ext4_has_inline_data(inode)) 1061 if (ext4_has_inline_data(inode)) {
1122 copied = ext4_write_inline_data_end(inode, pos, len, 1062 ret = ext4_write_inline_data_end(inode, pos, len,
1123 copied, page); 1063 copied, page);
1124 else 1064 if (ret < 0)
1065 goto errout;
1066 copied = ret;
1067 } else
1125 copied = block_write_end(file, mapping, pos, 1068 copied = block_write_end(file, mapping, pos,
1126 len, copied, page, fsdata); 1069 len, copied, page, fsdata);
1127 1070
@@ -1157,8 +1100,6 @@ static int ext4_write_end(struct file *file,
1157 if (i_size_changed) 1100 if (i_size_changed)
1158 ext4_mark_inode_dirty(handle, inode); 1101 ext4_mark_inode_dirty(handle, inode);
1159 1102
1160 if (copied < 0)
1161 ret = copied;
1162 if (pos + len > inode->i_size && ext4_can_truncate(inode)) 1103 if (pos + len > inode->i_size && ext4_can_truncate(inode))
1163 /* if we have allocated more blocks and copied 1104 /* if we have allocated more blocks and copied
1164 * less. We will have blocks allocated outside 1105 * less. We will have blocks allocated outside
@@ -1415,21 +1356,28 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1415} 1356}
1416 1357
1417static void ext4_da_page_release_reservation(struct page *page, 1358static void ext4_da_page_release_reservation(struct page *page,
1418 unsigned long offset) 1359 unsigned int offset,
1360 unsigned int length)
1419{ 1361{
1420 int to_release = 0; 1362 int to_release = 0;
1421 struct buffer_head *head, *bh; 1363 struct buffer_head *head, *bh;
1422 unsigned int curr_off = 0; 1364 unsigned int curr_off = 0;
1423 struct inode *inode = page->mapping->host; 1365 struct inode *inode = page->mapping->host;
1424 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1366 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1367 unsigned int stop = offset + length;
1425 int num_clusters; 1368 int num_clusters;
1426 ext4_fsblk_t lblk; 1369 ext4_fsblk_t lblk;
1427 1370
1371 BUG_ON(stop > PAGE_CACHE_SIZE || stop < length);
1372
1428 head = page_buffers(page); 1373 head = page_buffers(page);
1429 bh = head; 1374 bh = head;
1430 do { 1375 do {
1431 unsigned int next_off = curr_off + bh->b_size; 1376 unsigned int next_off = curr_off + bh->b_size;
1432 1377
1378 if (next_off > stop)
1379 break;
1380
1433 if ((offset <= curr_off) && (buffer_delay(bh))) { 1381 if ((offset <= curr_off) && (buffer_delay(bh))) {
1434 to_release++; 1382 to_release++;
1435 clear_buffer_delay(bh); 1383 clear_buffer_delay(bh);
@@ -1460,140 +1408,43 @@ static void ext4_da_page_release_reservation(struct page *page,
1460 * Delayed allocation stuff 1408 * Delayed allocation stuff
1461 */ 1409 */
1462 1410
1463/* 1411struct mpage_da_data {
1464 * mpage_da_submit_io - walks through extent of pages and try to write 1412 struct inode *inode;
1465 * them with writepage() call back 1413 struct writeback_control *wbc;
1466 *
1467 * @mpd->inode: inode
1468 * @mpd->first_page: first page of the extent
1469 * @mpd->next_page: page after the last page of the extent
1470 *
1471 * By the time mpage_da_submit_io() is called we expect all blocks
1472 * to be allocated. this may be wrong if allocation failed.
1473 *
1474 * As pages are already locked by write_cache_pages(), we can't use it
1475 */
1476static int mpage_da_submit_io(struct mpage_da_data *mpd,
1477 struct ext4_map_blocks *map)
1478{
1479 struct pagevec pvec;
1480 unsigned long index, end;
1481 int ret = 0, err, nr_pages, i;
1482 struct inode *inode = mpd->inode;
1483 struct address_space *mapping = inode->i_mapping;
1484 loff_t size = i_size_read(inode);
1485 unsigned int len, block_start;
1486 struct buffer_head *bh, *page_bufs = NULL;
1487 sector_t pblock = 0, cur_logical = 0;
1488 struct ext4_io_submit io_submit;
1489 1414
1490 BUG_ON(mpd->next_page <= mpd->first_page); 1415 pgoff_t first_page; /* The first page to write */
1491 memset(&io_submit, 0, sizeof(io_submit)); 1416 pgoff_t next_page; /* Current page to examine */
1417 pgoff_t last_page; /* Last page to examine */
1492 /* 1418 /*
1493 * We need to start from the first_page to the next_page - 1 1419 * Extent to map - this can be after first_page because that can be
1494 * to make sure we also write the mapped dirty buffer_heads. 1420 * fully mapped. We somewhat abuse m_flags to store whether the extent
1495 * If we look at mpd->b_blocknr we would only be looking 1421 * is delalloc or unwritten.
1496 * at the currently mapped buffer_heads.
1497 */ 1422 */
1498 index = mpd->first_page; 1423 struct ext4_map_blocks map;
1499 end = mpd->next_page - 1; 1424 struct ext4_io_submit io_submit; /* IO submission data */
1500 1425};
1501 pagevec_init(&pvec, 0);
1502 while (index <= end) {
1503 nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE);
1504 if (nr_pages == 0)
1505 break;
1506 for (i = 0; i < nr_pages; i++) {
1507 int skip_page = 0;
1508 struct page *page = pvec.pages[i];
1509
1510 index = page->index;
1511 if (index > end)
1512 break;
1513
1514 if (index == size >> PAGE_CACHE_SHIFT)
1515 len = size & ~PAGE_CACHE_MASK;
1516 else
1517 len = PAGE_CACHE_SIZE;
1518 if (map) {
1519 cur_logical = index << (PAGE_CACHE_SHIFT -
1520 inode->i_blkbits);
1521 pblock = map->m_pblk + (cur_logical -
1522 map->m_lblk);
1523 }
1524 index++;
1525
1526 BUG_ON(!PageLocked(page));
1527 BUG_ON(PageWriteback(page));
1528
1529 bh = page_bufs = page_buffers(page);
1530 block_start = 0;
1531 do {
1532 if (map && (cur_logical >= map->m_lblk) &&
1533 (cur_logical <= (map->m_lblk +
1534 (map->m_len - 1)))) {
1535 if (buffer_delay(bh)) {
1536 clear_buffer_delay(bh);
1537 bh->b_blocknr = pblock;
1538 }
1539 if (buffer_unwritten(bh) ||
1540 buffer_mapped(bh))
1541 BUG_ON(bh->b_blocknr != pblock);
1542 if (map->m_flags & EXT4_MAP_UNINIT)
1543 set_buffer_uninit(bh);
1544 clear_buffer_unwritten(bh);
1545 }
1546
1547 /*
1548 * skip page if block allocation undone and
1549 * block is dirty
1550 */
1551 if (ext4_bh_delay_or_unwritten(NULL, bh))
1552 skip_page = 1;
1553 bh = bh->b_this_page;
1554 block_start += bh->b_size;
1555 cur_logical++;
1556 pblock++;
1557 } while (bh != page_bufs);
1558
1559 if (skip_page) {
1560 unlock_page(page);
1561 continue;
1562 }
1563
1564 clear_page_dirty_for_io(page);
1565 err = ext4_bio_write_page(&io_submit, page, len,
1566 mpd->wbc);
1567 if (!err)
1568 mpd->pages_written++;
1569 /*
1570 * In error case, we have to continue because
1571 * remaining pages are still locked
1572 */
1573 if (ret == 0)
1574 ret = err;
1575 }
1576 pagevec_release(&pvec);
1577 }
1578 ext4_io_submit(&io_submit);
1579 return ret;
1580}
1581 1426
1582static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd) 1427static void mpage_release_unused_pages(struct mpage_da_data *mpd,
1428 bool invalidate)
1583{ 1429{
1584 int nr_pages, i; 1430 int nr_pages, i;
1585 pgoff_t index, end; 1431 pgoff_t index, end;
1586 struct pagevec pvec; 1432 struct pagevec pvec;
1587 struct inode *inode = mpd->inode; 1433 struct inode *inode = mpd->inode;
1588 struct address_space *mapping = inode->i_mapping; 1434 struct address_space *mapping = inode->i_mapping;
1589 ext4_lblk_t start, last; 1435
1436 /* This is necessary when next_page == 0. */
1437 if (mpd->first_page >= mpd->next_page)
1438 return;
1590 1439
1591 index = mpd->first_page; 1440 index = mpd->first_page;
1592 end = mpd->next_page - 1; 1441 end = mpd->next_page - 1;
1593 1442 if (invalidate) {
1594 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1443 ext4_lblk_t start, last;
1595 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits); 1444 start = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1596 ext4_es_remove_extent(inode, start, last - start + 1); 1445 last = end << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1446 ext4_es_remove_extent(inode, start, last - start + 1);
1447 }
1597 1448
1598 pagevec_init(&pvec, 0); 1449 pagevec_init(&pvec, 0);
1599 while (index <= end) { 1450 while (index <= end) {
@@ -1606,14 +1457,15 @@ static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd)
1606 break; 1457 break;
1607 BUG_ON(!PageLocked(page)); 1458 BUG_ON(!PageLocked(page));
1608 BUG_ON(PageWriteback(page)); 1459 BUG_ON(PageWriteback(page));
1609 block_invalidatepage(page, 0); 1460 if (invalidate) {
1610 ClearPageUptodate(page); 1461 block_invalidatepage(page, 0, PAGE_CACHE_SIZE);
1462 ClearPageUptodate(page);
1463 }
1611 unlock_page(page); 1464 unlock_page(page);
1612 } 1465 }
1613 index = pvec.pages[nr_pages - 1]->index + 1; 1466 index = pvec.pages[nr_pages - 1]->index + 1;
1614 pagevec_release(&pvec); 1467 pagevec_release(&pvec);
1615 } 1468 }
1616 return;
1617} 1469}
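When the unused pages are invalidated, the corresponding range is also dropped from the extent status tree. The page-index to logical-block conversion works as in this illustrative userspace model (the shift values are assumptions, not kernel state):

/* Userspace model of the page-index to logical-block conversion used
 * for the extent status removal; shift values are assumptions. */
#include <stdio.h>

int main(void)
{
    unsigned page_shift = 12, blkbits = 10;      /* 4K pages, 1K blocks */
    unsigned long first_page = 3, next_page = 6; /* pages 3..5 unused */
    unsigned long start = first_page << (page_shift - blkbits);
    unsigned long last = (next_page - 1) << (page_shift - blkbits);

    /* mirrors ext4_es_remove_extent(inode, start, last - start + 1) */
    printf("drop extent status for blocks %lu..%lu (%lu blocks)\n",
           start, last, last - start + 1);
    return 0;
}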
1618 1470
1619static void ext4_print_free_blocks(struct inode *inode) 1471static void ext4_print_free_blocks(struct inode *inode)
@@ -1642,215 +1494,6 @@ static void ext4_print_free_blocks(struct inode *inode)
1642 return; 1494 return;
1643} 1495}
1644 1496
1645/*
1646 * mpage_da_map_and_submit - go through given space, map them
1647 * if necessary, and then submit them for I/O
1648 *
1649 * @mpd - bh describing space
1650 *
1651 * The function skips space we know is already mapped to disk blocks.
1652 *
1653 */
1654static void mpage_da_map_and_submit(struct mpage_da_data *mpd)
1655{
1656 int err, blks, get_blocks_flags;
1657 struct ext4_map_blocks map, *mapp = NULL;
1658 sector_t next = mpd->b_blocknr;
1659 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
1660 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
1661 handle_t *handle = NULL;
1662
1663 /*
1664 * If the blocks are mapped already, or we couldn't accumulate
1665 * any blocks, then proceed immediately to the submission stage.
1666 */
1667 if ((mpd->b_size == 0) ||
1668 ((mpd->b_state & (1 << BH_Mapped)) &&
1669 !(mpd->b_state & (1 << BH_Delay)) &&
1670 !(mpd->b_state & (1 << BH_Unwritten))))
1671 goto submit_io;
1672
1673 handle = ext4_journal_current_handle();
1674 BUG_ON(!handle);
1675
1676 /*
1677 * Call ext4_map_blocks() to allocate any delayed allocation
1678 * blocks, or to convert an uninitialized extent to be
1679 * initialized (in the case where we have written into
1680 * one or more preallocated blocks).
1681 *
1682 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
1683 * indicate that we are on the delayed allocation path. This
1684 * affects functions in many different parts of the allocation
1685 * call path. This flag exists primarily because we don't
1686 * want to change *many* call functions, so ext4_map_blocks()
1687 * will set the EXT4_STATE_DELALLOC_RESERVED flag once the
1688 * inode's allocation semaphore is taken.
1689 *
1690 * If the blocks in questions were delalloc blocks, set
1691 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
1692 * variables are updated after the blocks have been allocated.
1693 */
1694 map.m_lblk = next;
1695 map.m_len = max_blocks;
1696 /*
1697 * We're in delalloc path and it is possible that we're going to
1698 * need more metadata blocks than previously reserved. However
1699 * we must not fail because we're in writeback and there is
1700 * nothing we can do about it so it might result in data loss.
1701 * So use reserved blocks to allocate metadata if possible.
1702 */
1703 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
1704 EXT4_GET_BLOCKS_METADATA_NOFAIL;
1705 if (ext4_should_dioread_nolock(mpd->inode))
1706 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
1707 if (mpd->b_state & (1 << BH_Delay))
1708 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
1709
1710
1711 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
1712 if (blks < 0) {
1713 struct super_block *sb = mpd->inode->i_sb;
1714
1715 err = blks;
1716 /*
1717 * If get block returns EAGAIN or ENOSPC and there
1718 * appears to be free blocks we will just let
1719 * mpage_da_submit_io() unlock all of the pages.
1720 */
1721 if (err == -EAGAIN)
1722 goto submit_io;
1723
1724 if (err == -ENOSPC && ext4_count_free_clusters(sb)) {
1725 mpd->retval = err;
1726 goto submit_io;
1727 }
1728
1729 /*
1730 * get block failure will cause us to loop in
1731 * writepages, because a_ops->writepage won't be able
1732 * to make progress. The page will be redirtied by
1733 * writepage and writepages will again try to write
1734 * the same.
1735 */
1736 if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) {
1737 ext4_msg(sb, KERN_CRIT,
1738 "delayed block allocation failed for inode %lu "
1739 "at logical offset %llu with max blocks %zd "
1740 "with error %d", mpd->inode->i_ino,
1741 (unsigned long long) next,
1742 mpd->b_size >> mpd->inode->i_blkbits, err);
1743 ext4_msg(sb, KERN_CRIT,
1744 "This should not happen!! Data will be lost");
1745 if (err == -ENOSPC)
1746 ext4_print_free_blocks(mpd->inode);
1747 }
1748 /* invalidate all the pages */
1749 ext4_da_block_invalidatepages(mpd);
1750
1751 /* Mark this page range as having been completed */
1752 mpd->io_done = 1;
1753 return;
1754 }
1755 BUG_ON(blks == 0);
1756
1757 mapp = &map;
1758 if (map.m_flags & EXT4_MAP_NEW) {
1759 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
1760 int i;
1761
1762 for (i = 0; i < map.m_len; i++)
1763 unmap_underlying_metadata(bdev, map.m_pblk + i);
1764 }
1765
1766 /*
1767 * Update on-disk size along with block allocation.
1768 */
1769 disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
1770 if (disksize > i_size_read(mpd->inode))
1771 disksize = i_size_read(mpd->inode);
1772 if (disksize > EXT4_I(mpd->inode)->i_disksize) {
1773 ext4_update_i_disksize(mpd->inode, disksize);
1774 err = ext4_mark_inode_dirty(handle, mpd->inode);
1775 if (err)
1776 ext4_error(mpd->inode->i_sb,
1777 "Failed to mark inode %lu dirty",
1778 mpd->inode->i_ino);
1779 }
1780
1781submit_io:
1782 mpage_da_submit_io(mpd, mapp);
1783 mpd->io_done = 1;
1784}
1785
1786#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \
1787 (1 << BH_Delay) | (1 << BH_Unwritten))
1788
1789/*
1790 * mpage_add_bh_to_extent - try to add one more block to extent of blocks
1791 *
1792 * @mpd->lbh - extent of blocks
1793 * @logical - logical number of the block in the file
1794 * @b_state - b_state of the buffer head added
1795 *
1796 * the function is used to collect contig. blocks in same state
1797 */
1798static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical,
1799 unsigned long b_state)
1800{
1801 sector_t next;
1802 int blkbits = mpd->inode->i_blkbits;
1803 int nrblocks = mpd->b_size >> blkbits;
1804
1805 /*
1806 * XXX Don't go larger than mballoc is willing to allocate
1807 * This is a stopgap solution. We eventually need to fold
1808 * mpage_da_submit_io() into this function and then call
1809 * ext4_map_blocks() multiple times in a loop
1810 */
1811 if (nrblocks >= (8*1024*1024 >> blkbits))
1812 goto flush_it;
1813
1814 /* check if the reserved journal credits might overflow */
1815 if (!ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS)) {
1816 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
1817 /*
1818 * With non-extent format we are limited by the journal
1819 * credit available. Total credit needed to insert
1820 * nrblocks contiguous blocks is dependent on the
1821 * nrblocks. So limit nrblocks.
1822 */
1823 goto flush_it;
1824 }
1825 }
1826 /*
1827 * First block in the extent
1828 */
1829 if (mpd->b_size == 0) {
1830 mpd->b_blocknr = logical;
1831 mpd->b_size = 1 << blkbits;
1832 mpd->b_state = b_state & BH_FLAGS;
1833 return;
1834 }
1835
1836 next = mpd->b_blocknr + nrblocks;
1837 /*
1838 * Can we merge the block to our big extent?
1839 */
1840 if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
1841 mpd->b_size += 1 << blkbits;
1842 return;
1843 }
1844
1845flush_it:
1846 /*
1847 * We couldn't merge the block to our extent, so we
1848 * need to flush current extent and start new one
1849 */
1850 mpage_da_map_and_submit(mpd);
1851 return;
1852}
1853
1854static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh) 1497static int ext4_bh_delay_or_unwritten(handle_t *handle, struct buffer_head *bh)
1855{ 1498{
1856 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh); 1499 return (buffer_delay(bh) || buffer_unwritten(bh)) && buffer_dirty(bh);
@@ -1885,7 +1528,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock,
1885 1528
1886 /* Lookup extent status tree firstly */ 1529 /* Lookup extent status tree firstly */
1887 if (ext4_es_lookup_extent(inode, iblock, &es)) { 1530 if (ext4_es_lookup_extent(inode, iblock, &es)) {
1888 1531 ext4_es_lru_add(inode);
1889 if (ext4_es_is_hole(&es)) { 1532 if (ext4_es_is_hole(&es)) {
1890 retval = 0; 1533 retval = 0;
1891 down_read((&EXT4_I(inode)->i_data_sem)); 1534 down_read((&EXT4_I(inode)->i_data_sem));
@@ -1992,14 +1635,13 @@ add_delayed:
1992 int ret; 1635 int ret;
1993 unsigned long long status; 1636 unsigned long long status;
1994 1637
1995#ifdef ES_AGGRESSIVE_TEST 1638 if (unlikely(retval != map->m_len)) {
1996 if (retval != map->m_len) { 1639 ext4_warning(inode->i_sb,
1997 printk("ES len assertation failed for inode: %lu " 1640 "ES len assertion failed for inode "
1998 "retval %d != map->m_len %d " 1641 "%lu: retval %d != map->m_len %d",
1999 "in %s (lookup)\n", inode->i_ino, retval, 1642 inode->i_ino, retval, map->m_len);
2000 map->m_len, __func__); 1643 WARN_ON(1);
2001 } 1644 }
2002#endif
2003 1645
2004 status = map->m_flags & EXT4_MAP_UNWRITTEN ? 1646 status = map->m_flags & EXT4_MAP_UNWRITTEN ?
2005 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; 1647 EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -2156,7 +1798,7 @@ out:
2156 * lock so we have to do some magic. 1798 * lock so we have to do some magic.
2157 * 1799 *
2158 * This function can get called via... 1800 * This function can get called via...
2159 * - ext4_da_writepages after taking page lock (have journal handle) 1801 * - ext4_writepages after taking page lock (have journal handle)
2160 * - journal_submit_inode_data_buffers (no journal handle) 1802 * - journal_submit_inode_data_buffers (no journal handle)
2161 * - shrink_page_list via the kswapd/direct reclaim (no journal handle) 1803 * - shrink_page_list via the kswapd/direct reclaim (no journal handle)
2162 * - grab_page_cache when doing write_begin (have journal handle) 1804 * - grab_page_cache when doing write_begin (have journal handle)
@@ -2234,76 +1876,405 @@ static int ext4_writepage(struct page *page,
2234 */ 1876 */
2235 return __ext4_journalled_writepage(page, len); 1877 return __ext4_journalled_writepage(page, len);
2236 1878
2237 memset(&io_submit, 0, sizeof(io_submit)); 1879 ext4_io_submit_init(&io_submit, wbc);
1880 io_submit.io_end = ext4_init_io_end(inode, GFP_NOFS);
1881 if (!io_submit.io_end) {
1882 redirty_page_for_writepage(wbc, page);
1883 unlock_page(page);
1884 return -ENOMEM;
1885 }
2238 ret = ext4_bio_write_page(&io_submit, page, len, wbc); 1886 ret = ext4_bio_write_page(&io_submit, page, len, wbc);
2239 ext4_io_submit(&io_submit); 1887 ext4_io_submit(&io_submit);
1888 /* Drop io_end reference we got from init */
1889 ext4_put_io_end_defer(io_submit.io_end);
2240 return ret; 1890 return ret;
2241} 1891}
2242 1892
1893#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay))
1894
2243/* 1895/*
2244 * This is called via ext4_da_writepages() to 1896 * mballoc gives us at most this number of blocks...
2245 * calculate the total number of credits to reserve to fit 1897 * XXX: That seems to be only a limitation of ext4_mb_normalize_request().
2246 * a single extent allocation into a single transaction, 1898 * The rest of mballoc seems to handle chunks up to full group size.
2247 * ext4_da_writpeages() will loop calling this before
2248 * the block allocation.
2249 */ 1899 */
1900#define MAX_WRITEPAGES_EXTENT_LEN 2048
2250 1901
2251static int ext4_da_writepages_trans_blocks(struct inode *inode) 1902/*
1903 * mpage_add_bh_to_extent - try to add bh to extent of blocks to map
1904 *
1905 * @mpd - extent of blocks
1906 * @lblk - logical number of the block in the file
1907 * @b_state - b_state of the buffer head added
1908 *
1909 * The function is used to collect contiguous blocks in the same state.
1910 */
1911static int mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk,
1912 unsigned long b_state)
2252{ 1913{
2253 int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; 1914 struct ext4_map_blocks *map = &mpd->map;
1915
1916 /* Don't go larger than mballoc is willing to allocate */
1917 if (map->m_len >= MAX_WRITEPAGES_EXTENT_LEN)
1918 return 0;
1919
1920 /* First block in the extent? */
1921 if (map->m_len == 0) {
1922 map->m_lblk = lblk;
1923 map->m_len = 1;
1924 map->m_flags = b_state & BH_FLAGS;
1925 return 1;
1926 }
1927
1928 /* Can we merge the block to our big extent? */
1929 if (lblk == map->m_lblk + map->m_len &&
1930 (b_state & BH_FLAGS) == map->m_flags) {
1931 map->m_len++;
1932 return 1;
1933 }
1934 return 0;
1935}
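The accumulation rule is: open an extent on the first block, then extend it only for the next contiguous block whose state flags match, up to mballoc's limit. A self-contained userspace model of that rule (simplified types, not the kernel's):

/* Self-contained userspace model of the accumulation rule; the types
 * are simplified stand-ins, not the kernel's. */
#include <stdio.h>

#define MAX_EXTENT_LEN 2048u    /* mirrors MAX_WRITEPAGES_EXTENT_LEN */

struct extent { unsigned lblk, len, flags; };

/* returns 1 if the block joined the extent, 0 if it must be mapped first */
static int add_block(struct extent *e, unsigned lblk, unsigned flags)
{
    if (e->len >= MAX_EXTENT_LEN)
        return 0;
    if (e->len == 0) {                  /* first block opens the extent */
        e->lblk = lblk;
        e->len = 1;
        e->flags = flags;
        return 1;
    }
    if (lblk == e->lblk + e->len && flags == e->flags) {
        e->len++;                       /* contiguous, same state: merge */
        return 1;
    }
    return 0;                           /* discontiguous or state change */
}

int main(void)
{
    struct extent e = { 0, 0, 0 };
    unsigned blocks[] = { 10, 11, 12, 14 };     /* 14 breaks contiguity */

    for (unsigned i = 0; i < 4; i++) {
        if (!add_block(&e, blocks[i], 0x1)) {
            printf("map extent [%u, +%u), then retry block %u\n",
                   e.lblk, e.len, blocks[i]);
            e.len = 0;                  /* extent mapped: start over */
            add_block(&e, blocks[i], 0x1);
        }
    }
    printf("pending extent [%u, +%u)\n", e.lblk, e.len);
    return 0;
}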
2254 1936
1937static bool add_page_bufs_to_extent(struct mpage_da_data *mpd,
1938 struct buffer_head *head,
1939 struct buffer_head *bh,
1940 ext4_lblk_t lblk)
1941{
1942 struct inode *inode = mpd->inode;
1943 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
1944 >> inode->i_blkbits;
1945
1946 do {
1947 BUG_ON(buffer_locked(bh));
1948
1949 if (!buffer_dirty(bh) || !buffer_mapped(bh) ||
1950 (!buffer_delay(bh) && !buffer_unwritten(bh)) ||
1951 lblk >= blocks) {
1952 /* Found extent to map? */
1953 if (mpd->map.m_len)
1954 return false;
1955 if (lblk >= blocks)
1956 return true;
1957 continue;
1958 }
1959 if (!mpage_add_bh_to_extent(mpd, lblk, bh->b_state))
1960 return false;
1961 } while (lblk++, (bh = bh->b_this_page) != head);
1962 return true;
1963}
1964
1965static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page)
1966{
1967 int len;
1968 loff_t size = i_size_read(mpd->inode);
1969 int err;
1970
1971 BUG_ON(page->index != mpd->first_page);
1972 if (page->index == size >> PAGE_CACHE_SHIFT)
1973 len = size & ~PAGE_CACHE_MASK;
1974 else
1975 len = PAGE_CACHE_SIZE;
1976 clear_page_dirty_for_io(page);
1977 err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc);
1978 if (!err)
1979 mpd->wbc->nr_to_write--;
1980 mpd->first_page++;
1981
1982 return err;
1983}
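Only the page containing EOF is written partially; every other page is written in full. A small userspace model of the length computation (the PAGE_SHIFT value is an assumption):

/* Userspace model of the per-page write length: only the page holding
 * EOF is written partially. PAGE_SHIFT is an assumed value. */
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1u << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

static unsigned write_len(unsigned long index, unsigned long long size)
{
    if (index == size >> PAGE_SHIFT)    /* page containing EOF */
        return size & ~PAGE_MASK;
    return PAGE_SIZE;
}

int main(void)
{
    /* i_size = 10000: page 1 is full, page 2 holds the last 1808 bytes */
    printf("page 1: %u bytes, page 2: %u bytes\n",
           write_len(1, 10000), write_len(2, 10000));
    return 0;
}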
1984
1985/*
1986 * mpage_map_and_submit_buffers - update buffers corresponding to changed extent and
1987 * submit fully mapped pages for IO
1988 *
1989 * @mpd - description of extent to map, on return next extent to map
1990 *
1991 * Scan buffers corresponding to changed extent (we expect corresponding pages
1992 * to be already locked) and update buffer state according to new extent state.
1993 * We map delalloc buffers to their physical location, clear unwritten bits,
1994 * and mark buffers as uninit when we perform writes to uninitialized extents
1995 * and do extent conversion after IO is finished. If the last page is not fully
1996 * mapped, we update @map to the next extent in the last page that needs
1997 * mapping. Otherwise we submit the page for IO.
1998 */
1999static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
2000{
2001 struct pagevec pvec;
2002 int nr_pages, i;
2003 struct inode *inode = mpd->inode;
2004 struct buffer_head *head, *bh;
2005 int bpp_bits = PAGE_CACHE_SHIFT - inode->i_blkbits;
2006 ext4_lblk_t blocks = (i_size_read(inode) + (1 << inode->i_blkbits) - 1)
2007 >> inode->i_blkbits;
2008 pgoff_t start, end;
2009 ext4_lblk_t lblk;
2010 sector_t pblock;
2011 int err;
2012
2013 start = mpd->map.m_lblk >> bpp_bits;
2014 end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
2015 lblk = start << bpp_bits;
2016 pblock = mpd->map.m_pblk;
2017
2018 pagevec_init(&pvec, 0);
2019 while (start <= end) {
2020 nr_pages = pagevec_lookup(&pvec, inode->i_mapping, start,
2021 PAGEVEC_SIZE);
2022 if (nr_pages == 0)
2023 break;
2024 for (i = 0; i < nr_pages; i++) {
2025 struct page *page = pvec.pages[i];
2026
2027 if (page->index > end)
2028 break;
2029 /* Up to 'end' pages must be contiguous */
2030 BUG_ON(page->index != start);
2031 bh = head = page_buffers(page);
2032 do {
2033 if (lblk < mpd->map.m_lblk)
2034 continue;
2035 if (lblk >= mpd->map.m_lblk + mpd->map.m_len) {
2036 /*
2037 * Buffer after end of mapped extent.
2038 * Find next buffer in the page to map.
2039 */
2040 mpd->map.m_len = 0;
2041 mpd->map.m_flags = 0;
2042 add_page_bufs_to_extent(mpd, head, bh,
2043 lblk);
2044 pagevec_release(&pvec);
2045 return 0;
2046 }
2047 if (buffer_delay(bh)) {
2048 clear_buffer_delay(bh);
2049 bh->b_blocknr = pblock++;
2050 }
2051 clear_buffer_unwritten(bh);
2052 } while (++lblk < blocks &&
2053 (bh = bh->b_this_page) != head);
2054
2055 /*
2056 * FIXME: This is going to break if dioread_nolock
2057 * supports blocksize < pagesize as we will try to
2058 * convert potentially unmapped parts of inode.
2059 */
2060 mpd->io_submit.io_end->size += PAGE_CACHE_SIZE;
2061 /* Page fully mapped - let IO run! */
2062 err = mpage_submit_page(mpd, page);
2063 if (err < 0) {
2064 pagevec_release(&pvec);
2065 return err;
2066 }
2067 start++;
2068 }
2069 pagevec_release(&pvec);
2070 }
2071 /* Extent fully mapped and matches with page boundary. We are done. */
2072 mpd->map.m_len = 0;
2073 mpd->map.m_flags = 0;
2074 return 0;
2075}
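Because a mapped extent rarely starts on a page boundary, the buffer walk begins at the first block of the extent's first page and skips blocks below m_lblk. The index arithmetic, modeled in userspace with assumed shift values:

/* Userspace model of the bpp_bits index arithmetic; shift values are
 * assumptions (4K pages, 1K blocks), not kernel state. */
#include <stdio.h>

int main(void)
{
    unsigned page_shift = 12, blkbits = 10;
    unsigned bpp_bits = page_shift - blkbits;       /* log2(blocks/page) */
    unsigned long m_lblk = 5, m_len = 10;           /* mapped extent */
    unsigned long start = m_lblk >> bpp_bits;                 /* page 1 */
    unsigned long end = (m_lblk + m_len - 1) >> bpp_bits;     /* page 3 */
    unsigned long lblk = start << bpp_bits;         /* walk starts at 4 */

    printf("extent [%lu, +%lu) spans pages %lu..%lu, walk from block %lu\n",
           m_lblk, m_len, start, end, lblk);
    return 0;
}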
2076
2077static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
2078{
2079 struct inode *inode = mpd->inode;
2080 struct ext4_map_blocks *map = &mpd->map;
2081 int get_blocks_flags;
2082 int err;
2083
2084 trace_ext4_da_write_pages_extent(inode, map);
2255 /* 2085 /*
2256 * With non-extent format the journal credit needed to 2086 * Call ext4_map_blocks() to allocate any delayed allocation blocks, or
2257 * insert nrblocks contiguous block is dependent on 2087 * to convert an uninitialized extent to be initialized (in the case
2258 * number of contiguous block. So we will limit 2088 * where we have written into one or more preallocated blocks). It is
2259 * number of contiguous block to a sane value 2089 * possible that we're going to need more metadata blocks than
2090 * previously reserved. However we must not fail because we're in
2091 * writeback and there is nothing we can do about it so it might result
2092 * in data loss. So use reserved blocks to allocate metadata if
2093 * possible.
2094 *
2095 * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks
2096 * in question are delalloc blocks. This affects functions in many
2097 * different parts of the allocation call path. This flag exists
2098 * primarily because we don't want to change *many* call functions, so
2099 * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag
2100 * once the inode's allocation semaphore is taken.
2260 */ 2101 */
2261 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) && 2102 get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
2262 (max_blocks > EXT4_MAX_TRANS_DATA)) 2103 EXT4_GET_BLOCKS_METADATA_NOFAIL;
2263 max_blocks = EXT4_MAX_TRANS_DATA; 2104 if (ext4_should_dioread_nolock(inode))
2105 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2106 if (map->m_flags & (1 << BH_Delay))
2107 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2264 2108
2265 return ext4_chunk_trans_blocks(inode, max_blocks); 2109 err = ext4_map_blocks(handle, inode, map, get_blocks_flags);
2110 if (err < 0)
2111 return err;
2112 if (map->m_flags & EXT4_MAP_UNINIT) {
2113 if (!mpd->io_submit.io_end->handle &&
2114 ext4_handle_valid(handle)) {
2115 mpd->io_submit.io_end->handle = handle->h_rsv_handle;
2116 handle->h_rsv_handle = NULL;
2117 }
2118 ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end);
2119 }
2120
2121 BUG_ON(map->m_len == 0);
2122 if (map->m_flags & EXT4_MAP_NEW) {
2123 struct block_device *bdev = inode->i_sb->s_bdev;
2124 int i;
2125
2126 for (i = 0; i < map->m_len; i++)
2127 unmap_underlying_metadata(bdev, map->m_pblk + i);
2128 }
2129 return 0;
2266} 2130}
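The allocation flags are composed from the writeback context. A hedged userspace sketch of just that composition, using hypothetical stand-in values for the EXT4_GET_BLOCKS_* constants:

/* Hypothetical stand-ins for the EXT4_GET_BLOCKS_* flag values; only
 * the composition logic mirrors the hunk above. */
#include <stdio.h>

#define GB_CREATE          0x01
#define GB_METADATA_NOFAIL 0x02
#define GB_IO_CREATE_EXT   0x04
#define GB_DELALLOC_RSV    0x08

static int writeback_get_blocks_flags(int dioread_nolock, int delayed)
{
    /* always allocate, and never fail on metadata: we are in writeback
     * and cannot back out without losing data */
    int flags = GB_CREATE | GB_METADATA_NOFAIL;

    if (dioread_nolock)
        flags |= GB_IO_CREATE_EXT;      /* allocate as unwritten extent */
    if (delayed)
        flags |= GB_DELALLOC_RSV;       /* consume delalloc reservation */
    return flags;
}

int main(void)
{
    printf("flags = %#x\n", writeback_get_blocks_flags(1, 1));
    return 0;
}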
2267 2131
2268/* 2132/*
2269 * write_cache_pages_da - walk the list of dirty pages of the given 2133 * mpage_map_and_submit_extent - map extent starting at mpd->lblk of length
2270 * address space and accumulate pages that need writing, and call 2134 * mpd->len and submit pages underlying it for IO
2271 * mpage_da_map_and_submit to map a single contiguous memory region 2135 *
2272 * and then write them. 2136 * @handle - handle for journal operations
2137 * @mpd - extent to map
2138 *
2139 * The function maps extent starting at mpd->lblk of length mpd->len. If it is
2140 * delayed, blocks are allocated, if it is unwritten, we may need to convert
2141 * them to initialized or split the described range from larger unwritten
2142 * extent. Note that we need not map all the described range since allocation
2143 * can return less blocks or the range is covered by more unwritten extents. We
2144 * cannot map more because we are limited by reserved transaction credits. On
2145 * the other hand we always make sure that the last touched page is fully
2146 * mapped so that it can be written out (and thus forward progress is
2147 * guaranteed). After mapping we submit all mapped pages for IO.
2273 */ 2148 */
2274static int write_cache_pages_da(handle_t *handle, 2149static int mpage_map_and_submit_extent(handle_t *handle,
2275 struct address_space *mapping, 2150 struct mpage_da_data *mpd,
2276 struct writeback_control *wbc, 2151 bool *give_up_on_write)
2277 struct mpage_da_data *mpd,
2278 pgoff_t *done_index)
2279{ 2152{
2280 struct buffer_head *bh, *head; 2153 struct inode *inode = mpd->inode;
2281 struct inode *inode = mapping->host; 2154 struct ext4_map_blocks *map = &mpd->map;
2282 struct pagevec pvec; 2155 int err;
2283 unsigned int nr_pages; 2156 loff_t disksize;
2284 sector_t logical;
2285 pgoff_t index, end;
2286 long nr_to_write = wbc->nr_to_write;
2287 int i, tag, ret = 0;
2288
2289 memset(mpd, 0, sizeof(struct mpage_da_data));
2290 mpd->wbc = wbc;
2291 mpd->inode = inode;
2292 pagevec_init(&pvec, 0);
2293 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2294 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2295 2157
2296 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2158 mpd->io_submit.io_end->offset =
2159 ((loff_t)map->m_lblk) << inode->i_blkbits;
2160 do {
2161 err = mpage_map_one_extent(handle, mpd);
2162 if (err < 0) {
2163 struct super_block *sb = inode->i_sb;
2164
2165 if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
2166 goto invalidate_dirty_pages;
2167 /*
2168 * Let the upper layers retry transient errors.
2169 * In the case of ENOSPC, if ext4_count_free_clusters()
2170 * is non-zero, a commit should free up blocks.
2171 */
2172 if ((err == -ENOMEM) ||
2173 (err == -ENOSPC && ext4_count_free_clusters(sb)))
2174 return err;
2175 ext4_msg(sb, KERN_CRIT,
2176 "Delayed block allocation failed for "
2177 "inode %lu at logical offset %llu with"
2178 " max blocks %u with error %d",
2179 inode->i_ino,
2180 (unsigned long long)map->m_lblk,
2181 (unsigned)map->m_len, -err);
2182 ext4_msg(sb, KERN_CRIT,
2183 "This should not happen!! Data will "
2184 "be lost\n");
2185 if (err == -ENOSPC)
2186 ext4_print_free_blocks(inode);
2187 invalidate_dirty_pages:
2188 *give_up_on_write = true;
2189 return err;
2190 }
2191 /*
2192 * Update buffer state, submit mapped pages, and get us new
2193 * extent to map
2194 */
2195 err = mpage_map_and_submit_buffers(mpd);
2196 if (err < 0)
2197 return err;
2198 } while (map->m_len);
2199
2200 /* Update on-disk size after IO is submitted */
2201 disksize = ((loff_t)mpd->first_page) << PAGE_CACHE_SHIFT;
2202 if (disksize > i_size_read(inode))
2203 disksize = i_size_read(inode);
2204 if (disksize > EXT4_I(inode)->i_disksize) {
2205 int err2;
2206
2207 ext4_update_i_disksize(inode, disksize);
2208 err2 = ext4_mark_inode_dirty(handle, inode);
2209 if (err2)
2210 ext4_error(inode->i_sb,
2211 "Failed to mark inode %lu dirty",
2212 inode->i_ino);
2213 if (!err)
2214 err = err2;
2215 }
2216 return err;
2217}
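The on-disk size is advanced to cover everything below the first not-yet-written page, clamped to i_size so it never runs past EOF. Modeled in userspace with illustrative values:

/* Userspace model of the on-disk size update after IO submission.
 * The values are illustrative, not kernel state. */
#include <stdio.h>

int main(void)
{
    unsigned page_shift = 12;
    unsigned long long first_page = 3;      /* first page not written yet */
    unsigned long long i_size = 10000, old_disksize = 4096;
    unsigned long long disksize = first_page << page_shift;  /* 12288 */

    if (disksize > i_size)
        disksize = i_size;                  /* clamp to EOF: 10000 */
    if (disksize > old_disksize)
        printf("i_disksize: %llu -> %llu\n", old_disksize, disksize);
    return 0;
}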
2218
2219/*
2220 * Calculate the total number of credits to reserve for one writepages
2221 * iteration. This is called from ext4_writepages(). We map an extent of
2222 * up to MAX_WRITEPAGES_EXTENT_LEN blocks and then we go on and finish mapping
2223 * the last partial page. So in total we can map MAX_WRITEPAGES_EXTENT_LEN +
2224 * bpp - 1 blocks in bpp different extents.
2225 */
2226static int ext4_da_writepages_trans_blocks(struct inode *inode)
2227{
2228 int bpp = ext4_journal_blocks_per_page(inode);
2229
2230 return ext4_meta_trans_blocks(inode,
2231 MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
2232}
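Worked example of the block budget behind the credit reservation: with 4K pages and 1K blocks, bpp = 4, so one iteration may need credits for MAX_WRITEPAGES_EXTENT_LEN + 3 = 2051 blocks. A sketch of just the block count fed into ext4_meta_trans_blocks() (the credit conversion itself is internal to ext4 and not modeled here):

/* Sketch of the block count fed into ext4_meta_trans_blocks(); the
 * credit conversion itself is internal to ext4 and not modeled here. */
#include <stdio.h>

#define MAX_WRITEPAGES_EXTENT_LEN 2048

static int writepages_block_budget(int bpp)
{
    /* one extent of up to 2048 blocks, plus up to bpp - 1 blocks to
     * finish the last partial page */
    return MAX_WRITEPAGES_EXTENT_LEN + bpp - 1;
}

int main(void)
{
    /* 4K pages with 1K blocks: bpp = 4, budget = 2051 blocks */
    printf("block budget: %d\n", writepages_block_budget(4));
    return 0;
}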
2233
2234/*
2235 * mpage_prepare_extent_to_map - find & lock contiguous range of dirty pages
2236 * and underlying extent to map
2237 *
2238 * @mpd - where to look for pages
2239 *
2240 * Walk dirty pages in the mapping. If they are fully mapped, submit them for
2241 * IO immediately. When we find a page which isn't mapped we start accumulating
2242 * extent of buffers underlying these pages that needs mapping (formed by
2243 * either delayed or unwritten buffers). We also lock the pages containing
2244 * these buffers. The extent found is returned in @mpd structure (starting at
2245 * mpd->lblk with length mpd->len blocks).
2246 *
2247 * Note that this function can attach bios to one io_end structure which are
2248 * neither logically nor physically contiguous. Although it may seem an
2249 * unnecessary complication, it is actually inevitable in the blocksize < pagesize
2250 * case, as we need to track IO to all buffers underlying a page in one io_end.
2251 */
2252static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
2253{
2254 struct address_space *mapping = mpd->inode->i_mapping;
2255 struct pagevec pvec;
2256 unsigned int nr_pages;
2257 pgoff_t index = mpd->first_page;
2258 pgoff_t end = mpd->last_page;
2259 int tag;
2260 int i, err = 0;
2261 int blkbits = mpd->inode->i_blkbits;
2262 ext4_lblk_t lblk;
2263 struct buffer_head *head;
2264
2265 if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
2297 tag = PAGECACHE_TAG_TOWRITE; 2266 tag = PAGECACHE_TAG_TOWRITE;
2298 else 2267 else
2299 tag = PAGECACHE_TAG_DIRTY; 2268 tag = PAGECACHE_TAG_DIRTY;
2300 2269
2301 *done_index = index; 2270 pagevec_init(&pvec, 0);
2271 mpd->map.m_len = 0;
2272 mpd->next_page = index;
2302 while (index <= end) { 2273 while (index <= end) {
2303 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, 2274 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag,
2304 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); 2275 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2305 if (nr_pages == 0) 2276 if (nr_pages == 0)
2306 return 0; 2277 goto out;
2307 2278
2308 for (i = 0; i < nr_pages; i++) { 2279 for (i = 0; i < nr_pages; i++) {
2309 struct page *page = pvec.pages[i]; 2280 struct page *page = pvec.pages[i];
@@ -2318,31 +2289,21 @@ static int write_cache_pages_da(handle_t *handle,
2318 if (page->index > end) 2289 if (page->index > end)
2319 goto out; 2290 goto out;
2320 2291
2321 *done_index = page->index + 1; 2292 /* If we can't merge this page, we are done. */
2322 2293 if (mpd->map.m_len > 0 && mpd->next_page != page->index)
2323 /* 2294 goto out;
2324 * If we can't merge this page, and we have
2325 * accumulated an contiguous region, write it
2326 */
2327 if ((mpd->next_page != page->index) &&
2328 (mpd->next_page != mpd->first_page)) {
2329 mpage_da_map_and_submit(mpd);
2330 goto ret_extent_tail;
2331 }
2332 2295
2333 lock_page(page); 2296 lock_page(page);
2334
2335 /* 2297 /*
2336 * If the page is no longer dirty, or its 2298 * If the page is no longer dirty, or its mapping no
2337 * mapping no longer corresponds to inode we 2299 * longer corresponds to inode we are writing (which
2338 * are writing (which means it has been 2300 * means it has been truncated or invalidated), or the
2339 * truncated or invalidated), or the page is 2301 * page is already under writeback and we are not doing
2340 * already under writeback and we are not 2302 * a data integrity writeback, skip the page
2341 * doing a data integrity writeback, skip the page
2342 */ 2303 */
2343 if (!PageDirty(page) || 2304 if (!PageDirty(page) ||
2344 (PageWriteback(page) && 2305 (PageWriteback(page) &&
2345 (wbc->sync_mode == WB_SYNC_NONE)) || 2306 (mpd->wbc->sync_mode == WB_SYNC_NONE)) ||
2346 unlikely(page->mapping != mapping)) { 2307 unlikely(page->mapping != mapping)) {
2347 unlock_page(page); 2308 unlock_page(page);
2348 continue; 2309 continue;
@@ -2351,106 +2312,70 @@ static int write_cache_pages_da(handle_t *handle,
2351 wait_on_page_writeback(page); 2312 wait_on_page_writeback(page);
2352 BUG_ON(PageWriteback(page)); 2313 BUG_ON(PageWriteback(page));
2353 2314
2354 /* 2315 if (mpd->map.m_len == 0)
2355 * If we have inline data and arrive here, it means that
2356 * we will soon create the block for the 1st page, so
2357 * we'd better clear the inline data here.
2358 */
2359 if (ext4_has_inline_data(inode)) {
2360 BUG_ON(ext4_test_inode_state(inode,
2361 EXT4_STATE_MAY_INLINE_DATA));
2362 ext4_destroy_inline_data(handle, inode);
2363 }
2364
2365 if (mpd->next_page != page->index)
2366 mpd->first_page = page->index; 2316 mpd->first_page = page->index;
2367 mpd->next_page = page->index + 1; 2317 mpd->next_page = page->index + 1;
2368 logical = (sector_t) page->index <<
2369 (PAGE_CACHE_SHIFT - inode->i_blkbits);
2370
2371 /* Add all dirty buffers to mpd */ 2318 /* Add all dirty buffers to mpd */
2319 lblk = ((ext4_lblk_t)page->index) <<
2320 (PAGE_CACHE_SHIFT - blkbits);
2372 head = page_buffers(page); 2321 head = page_buffers(page);
2373 bh = head; 2322 if (!add_page_bufs_to_extent(mpd, head, head, lblk))
2374 do { 2323 goto out;
2375 BUG_ON(buffer_locked(bh)); 2324 /* So far everything mapped? Submit the page for IO. */
2376 /* 2325 if (mpd->map.m_len == 0) {
2377 * We need to try to allocate unmapped blocks 2326 err = mpage_submit_page(mpd, page);
2378 * in the same page. Otherwise we won't make 2327 if (err < 0)
2379 * progress with the page in ext4_writepage
2380 */
2381 if (ext4_bh_delay_or_unwritten(NULL, bh)) {
2382 mpage_add_bh_to_extent(mpd, logical,
2383 bh->b_state);
2384 if (mpd->io_done)
2385 goto ret_extent_tail;
2386 } else if (buffer_dirty(bh) &&
2387 buffer_mapped(bh)) {
2388 /*
2389 * mapped dirty buffer. We need to
2390 * update the b_state because we look
2391 * at b_state in mpage_da_map_blocks.
2392 * We don't update b_size because if we
2393 * find an unmapped buffer_head later
2394 * we need to use the b_state flag of
2395 * that buffer_head.
2396 */
2397 if (mpd->b_size == 0)
2398 mpd->b_state =
2399 bh->b_state & BH_FLAGS;
2400 }
2401 logical++;
2402 } while ((bh = bh->b_this_page) != head);
2403
2404 if (nr_to_write > 0) {
2405 nr_to_write--;
2406 if (nr_to_write == 0 &&
2407 wbc->sync_mode == WB_SYNC_NONE)
2408 /*
2409 * We stop writing back only if we are
2410 * not doing integrity sync. In case of
2411 * integrity sync we have to keep going
2412 * because someone may be concurrently
2413 * dirtying pages, and we might have
2414 * synced a lot of newly appeared dirty
2415 * pages, but have not synced all of the
2416 * old dirty pages.
2417 */
2418 goto out; 2328 goto out;
2419 } 2329 }
2330
2331 /*
2332 * Accumulated enough dirty pages? This doesn't apply
2333 * to WB_SYNC_ALL mode. For integrity sync we have to
2334 * keep going because someone may be concurrently
2335 * dirtying pages, and we might have synced a lot of
2336 * newly appeared dirty pages, but have not synced all
2337 * of the old dirty pages.
2338 */
2339 if (mpd->wbc->sync_mode == WB_SYNC_NONE &&
2340 mpd->next_page - mpd->first_page >=
2341 mpd->wbc->nr_to_write)
2342 goto out;
2420 } 2343 }
2421 pagevec_release(&pvec); 2344 pagevec_release(&pvec);
2422 cond_resched(); 2345 cond_resched();
2423 } 2346 }
2424 return 0; 2347 return 0;
2425ret_extent_tail:
2426 ret = MPAGE_DA_EXTENT_TAIL;
2427out: 2348out:
2428 pagevec_release(&pvec); 2349 pagevec_release(&pvec);
2429 cond_resched(); 2350 return err;
2430 return ret;
2431} 2351}
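The scan stops early only for non-integrity writeback, once enough pages have been grabbed; WB_SYNC_ALL must keep going because pages may be dirtied concurrently. A userspace model of the stop check:

/* Userspace model of the WB_SYNC_NONE early-stop check: once the scan
 * has grabbed nr_to_write pages it stops, unless this is an integrity
 * sync, which must keep going. */
#include <stdio.h>
#include <stdbool.h>

static bool should_stop(long first_page, long next_page,
                        long nr_to_write, bool integrity_sync)
{
    if (integrity_sync)     /* WB_SYNC_ALL: never stop early */
        return false;
    return next_page - first_page >= nr_to_write;
}

int main(void)
{
    printf("%d %d\n",
           should_stop(0, 16, 16, false),   /* 1: budget exhausted */
           should_stop(0, 16, 16, true));   /* 0: integrity sync */
    return 0;
}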
2432 2352
2353static int __writepage(struct page *page, struct writeback_control *wbc,
2354 void *data)
2355{
2356 struct address_space *mapping = data;
2357 int ret = ext4_writepage(page, wbc);
2358 mapping_set_error(mapping, ret);
2359 return ret;
2360}
2433 2361
2434static int ext4_da_writepages(struct address_space *mapping, 2362static int ext4_writepages(struct address_space *mapping,
2435 struct writeback_control *wbc) 2363 struct writeback_control *wbc)
2436{ 2364{
2437 pgoff_t index; 2365 pgoff_t writeback_index = 0;
2366 long nr_to_write = wbc->nr_to_write;
2438 int range_whole = 0; 2367 int range_whole = 0;
2368 int cycled = 1;
2439 handle_t *handle = NULL; 2369 handle_t *handle = NULL;
2440 struct mpage_da_data mpd; 2370 struct mpage_da_data mpd;
2441 struct inode *inode = mapping->host; 2371 struct inode *inode = mapping->host;
2442 int pages_written = 0; 2372 int needed_blocks, rsv_blocks = 0, ret = 0;
2443 unsigned int max_pages;
2444 int range_cyclic, cycled = 1, io_done = 0;
2445 int needed_blocks, ret = 0;
2446 long desired_nr_to_write, nr_to_writebump = 0;
2447 loff_t range_start = wbc->range_start;
2448 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); 2373 struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
2449 pgoff_t done_index = 0; 2374 bool done;
2450 pgoff_t end;
2451 struct blk_plug plug; 2375 struct blk_plug plug;
2376 bool give_up_on_write = false;
2452 2377
2453 trace_ext4_da_writepages(inode, wbc); 2378 trace_ext4_writepages(inode, wbc);
2454 2379
2455 /* 2380 /*
2456 * No pages to write? This is mainly a kludge to avoid starting 2381 * No pages to write? This is mainly a kludge to avoid starting
@@ -2460,164 +2385,165 @@ static int ext4_da_writepages(struct address_space *mapping,
2460 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) 2385 if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
2461 return 0; 2386 return 0;
2462 2387
2388 if (ext4_should_journal_data(inode)) {
2389 struct blk_plug plug;
2390 int ret;
2391
2392 blk_start_plug(&plug);
2393 ret = write_cache_pages(mapping, wbc, __writepage, mapping);
2394 blk_finish_plug(&plug);
2395 return ret;
2396 }
2397
2463 /* 2398 /*
2464 * If the filesystem has aborted, it is read-only, so return 2399 * If the filesystem has aborted, it is read-only, so return
2465 * right away instead of dumping stack traces later on that 2400 * right away instead of dumping stack traces later on that
2466 * will obscure the real source of the problem. We test 2401 * will obscure the real source of the problem. We test
2467 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because 2402 * EXT4_MF_FS_ABORTED instead of sb->s_flag's MS_RDONLY because
2468 * the latter could be true if the filesystem is mounted 2403 * the latter could be true if the filesystem is mounted
2469 * read-only, and in that case, ext4_da_writepages should 2404 * read-only, and in that case, ext4_writepages should
2470 * *never* be called, so if that ever happens, we would want 2405 * *never* be called, so if that ever happens, we would want
2471 * the stack trace. 2406 * the stack trace.
2472 */ 2407 */
2473 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED)) 2408 if (unlikely(sbi->s_mount_flags & EXT4_MF_FS_ABORTED))
2474 return -EROFS; 2409 return -EROFS;
2475 2410
2411 if (ext4_should_dioread_nolock(inode)) {
2412 /*
2413 * We may need to convert up to one extent per block in
2414 * the page and we may dirty the inode.
2415 */
2416 rsv_blocks = 1 + (PAGE_CACHE_SIZE >> inode->i_blkbits);
2417 }
2418
2419 /*
2420 * If we have inline data and arrive here, it means that
2421 * we will soon create the block for the 1st page, so
2422 * we'd better clear the inline data here.
2423 */
2424 if (ext4_has_inline_data(inode)) {
2425 /* Just inode will be modified... */
2426 handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
2427 if (IS_ERR(handle)) {
2428 ret = PTR_ERR(handle);
2429 goto out_writepages;
2430 }
2431 BUG_ON(ext4_test_inode_state(inode,
2432 EXT4_STATE_MAY_INLINE_DATA));
2433 ext4_destroy_inline_data(handle, inode);
2434 ext4_journal_stop(handle);
2435 }
2436
2476 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 2437 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2477 range_whole = 1; 2438 range_whole = 1;
2478 2439
2479 range_cyclic = wbc->range_cyclic;
2480 if (wbc->range_cyclic) { 2440 if (wbc->range_cyclic) {
2481 index = mapping->writeback_index; 2441 writeback_index = mapping->writeback_index;
2482 if (index) 2442 if (writeback_index)
2483 cycled = 0; 2443 cycled = 0;
2484 wbc->range_start = index << PAGE_CACHE_SHIFT; 2444 mpd.first_page = writeback_index;
2485 wbc->range_end = LLONG_MAX; 2445 mpd.last_page = -1;
2486 wbc->range_cyclic = 0;
2487 end = -1;
2488 } else { 2446 } else {
2489 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2447 mpd.first_page = wbc->range_start >> PAGE_CACHE_SHIFT;
2490 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2448 mpd.last_page = wbc->range_end >> PAGE_CACHE_SHIFT;
2491 }
2492
2493 /*
2494 * This works around two forms of stupidity. The first is in
2495 * the writeback code, which caps the maximum number of pages
2496 * written to be 1024 pages. This is wrong on multiple
2497 * levels; different architectues have a different page size,
2498 * which changes the maximum amount of data which gets
2499 * written. Secondly, 4 megabytes is way too small. XFS
2500 * forces this value to be 16 megabytes by multiplying
2501 * nr_to_write parameter by four, and then relies on its
2502 * allocator to allocate larger extents to make them
2503 * contiguous. Unfortunately this brings us to the second
2504 * stupidity, which is that ext4's mballoc code only allocates
2505 * at most 2048 blocks. So we force contiguous writes up to
2506 * the number of dirty blocks in the inode, or
2507 * sbi->max_writeback_mb_bump whichever is smaller.
2508 */
2509 max_pages = sbi->s_max_writeback_mb_bump << (20 - PAGE_CACHE_SHIFT);
2510 if (!range_cyclic && range_whole) {
2511 if (wbc->nr_to_write == LONG_MAX)
2512 desired_nr_to_write = wbc->nr_to_write;
2513 else
2514 desired_nr_to_write = wbc->nr_to_write * 8;
2515 } else
2516 desired_nr_to_write = ext4_num_dirty_pages(inode, index,
2517 max_pages);
2518 if (desired_nr_to_write > max_pages)
2519 desired_nr_to_write = max_pages;
2520
2521 if (wbc->nr_to_write < desired_nr_to_write) {
2522 nr_to_writebump = desired_nr_to_write - wbc->nr_to_write;
2523 wbc->nr_to_write = desired_nr_to_write;
2524 } 2449 }
2525 2450
2451 mpd.inode = inode;
2452 mpd.wbc = wbc;
2453 ext4_io_submit_init(&mpd.io_submit, wbc);
2526retry: 2454retry:
2527 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) 2455 if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
2528 tag_pages_for_writeback(mapping, index, end); 2456 tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page);
2529 2457 done = false;
2530 blk_start_plug(&plug); 2458 blk_start_plug(&plug);
2531 while (!ret && wbc->nr_to_write > 0) { 2459 while (!done && mpd.first_page <= mpd.last_page) {
2460 /* For each extent of pages we use new io_end */
2461 mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL);
2462 if (!mpd.io_submit.io_end) {
2463 ret = -ENOMEM;
2464 break;
2465 }
2532 2466
2533 /* 2467 /*
2534 * we insert one extent at a time. So we need 2468 * We have two constraints: We find one extent to map and we
2535 * credit needed for single extent allocation. 2469 * must always write out the whole page (makes a difference when
2536 * journalled mode is currently not supported 2470 * blocksize < pagesize) so that we don't block on IO when we
2537 * by delalloc 2471 * try to write out the rest of the page. Journalled mode is
2472 * not supported by delalloc.
2538 */ 2473 */
2539 BUG_ON(ext4_should_journal_data(inode)); 2474 BUG_ON(ext4_should_journal_data(inode));
2540 needed_blocks = ext4_da_writepages_trans_blocks(inode); 2475 needed_blocks = ext4_da_writepages_trans_blocks(inode);
2541 2476
2542 /* start a new transaction*/ 2477 /* start a new transaction */
2543 handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 2478 handle = ext4_journal_start_with_reserve(inode,
2544 needed_blocks); 2479 EXT4_HT_WRITE_PAGE, needed_blocks, rsv_blocks);
2545 if (IS_ERR(handle)) { 2480 if (IS_ERR(handle)) {
2546 ret = PTR_ERR(handle); 2481 ret = PTR_ERR(handle);
2547 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 2482 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2548 "%ld pages, ino %lu; err %d", __func__, 2483 "%ld pages, ino %lu; err %d", __func__,
2549 wbc->nr_to_write, inode->i_ino, ret); 2484 wbc->nr_to_write, inode->i_ino, ret);
2550 blk_finish_plug(&plug); 2485 /* Release allocated io_end */
2551 goto out_writepages; 2486 ext4_put_io_end(mpd.io_submit.io_end);
2487 break;
2552 } 2488 }
2553 2489
2554 /* 2490 trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc);
2555 * Now call write_cache_pages_da() to find the next 2491 ret = mpage_prepare_extent_to_map(&mpd);
2556 * contiguous region of logical blocks that need 2492 if (!ret) {
2557 * blocks to be allocated by ext4 and submit them. 2493 if (mpd.map.m_len)
2558 */ 2494 ret = mpage_map_and_submit_extent(handle, &mpd,
2559 ret = write_cache_pages_da(handle, mapping, 2495 &give_up_on_write);
2560 wbc, &mpd, &done_index); 2496 else {
2561 /* 2497 /*
2562 * If we have a contiguous extent of pages and we 2498 * We scanned the whole range (or exhausted
2563 * haven't done the I/O yet, map the blocks and submit 2499 * nr_to_write), submitted what was mapped and
2564 * them for I/O. 2500 * didn't find anything needing mapping. We are
2565 */ 2501 * done.
2566 if (!mpd.io_done && mpd.next_page != mpd.first_page) { 2502 */
2567 mpage_da_map_and_submit(&mpd); 2503 done = true;
2568 ret = MPAGE_DA_EXTENT_TAIL; 2504 }
2569 } 2505 }
2570 trace_ext4_da_write_pages(inode, &mpd);
2571 wbc->nr_to_write -= mpd.pages_written;
2572
2573 ext4_journal_stop(handle); 2506 ext4_journal_stop(handle);
2574 2507 /* Submit prepared bio */
2575 if ((mpd.retval == -ENOSPC) && sbi->s_journal) { 2508 ext4_io_submit(&mpd.io_submit);
2576 /* commit the transaction which would 2509 /* Unlock pages we didn't use */
2510 mpage_release_unused_pages(&mpd, give_up_on_write);
2511 /* Drop our io_end reference we got from init */
2512 ext4_put_io_end(mpd.io_submit.io_end);
2513
2514 if (ret == -ENOSPC && sbi->s_journal) {
2515 /*
2516 * Commit the transaction which would
2577 * free blocks released in the transaction 2517 * free blocks released in the transaction
2578 * and try again 2518 * and try again
2579 */ 2519 */
2580 jbd2_journal_force_commit_nested(sbi->s_journal); 2520 jbd2_journal_force_commit_nested(sbi->s_journal);
2581 ret = 0; 2521 ret = 0;
2582 } else if (ret == MPAGE_DA_EXTENT_TAIL) { 2522 continue;
2583 /* 2523 }
2584 * Got one extent now try with rest of the pages. 2524 /* Fatal error - ENOMEM, EIO... */
2585 * If mpd.retval is set -EIO, journal is aborted. 2525 if (ret)
2586 * So we don't need to write any more.
2587 */
2588 pages_written += mpd.pages_written;
2589 ret = mpd.retval;
2590 io_done = 1;
2591 } else if (wbc->nr_to_write)
2592 /*
2593 * There is no more writeout needed
2594 * or we requested for a noblocking writeout
2595 * and we found the device congested
2596 */
2597 break; 2526 break;
2598 } 2527 }
2599 blk_finish_plug(&plug); 2528 blk_finish_plug(&plug);
2600 if (!io_done && !cycled) { 2529 if (!ret && !cycled) {
2601 cycled = 1; 2530 cycled = 1;
2602 index = 0; 2531 mpd.last_page = writeback_index - 1;
2603 wbc->range_start = index << PAGE_CACHE_SHIFT; 2532 mpd.first_page = 0;
2604 wbc->range_end = mapping->writeback_index - 1;
2605 goto retry; 2533 goto retry;
2606 } 2534 }
2607 2535
2608 /* Update index */ 2536 /* Update index */
2609 wbc->range_cyclic = range_cyclic;
2610 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 2537 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
2611 /* 2538 /*
2612 * set the writeback_index so that range_cyclic 2539 * Set the writeback_index so that range_cyclic
2613 * mode will write it back later 2540 * mode will write it back later
2614 */ 2541 */
2615 mapping->writeback_index = done_index; 2542 mapping->writeback_index = mpd.first_page;
2616 2543
2617out_writepages: 2544out_writepages:
2618 wbc->nr_to_write -= nr_to_writebump; 2545 trace_ext4_writepages_result(inode, wbc, ret,
2619 wbc->range_start = range_start; 2546 nr_to_write - wbc->nr_to_write);
2620 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
2621 return ret; 2547 return ret;
2622} 2548}
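For range_cyclic writeback the function now scans in at most two passes: writeback_index to the end of the range, then, if nothing fatal happened, 0 to writeback_index - 1. A userspace model of the two-pass control flow (-1 stands in for "no upper bound"):

/* Userspace model of the cyclic writeback retry: scan from
 * writeback_index to the end of the range, then wrap around and scan
 * pages 0 .. writeback_index - 1. -1 stands in for "no upper bound". */
#include <stdio.h>

int main(void)
{
    long writeback_index = 42;
    int cycled = (writeback_index == 0);
    long first_page = writeback_index, last_page = -1;

    printf("pass 1: pages %ld..%ld\n", first_page, last_page);
    if (!cycled) {
        cycled = 1;                    /* only wrap around once */
        first_page = 0;
        last_page = writeback_index - 1;
        printf("pass 2: pages %ld..%ld\n", first_page, last_page);
    }
    return 0;
}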
2623 2549
@@ -2829,7 +2755,8 @@ static int ext4_da_write_end(struct file *file,
2829 return ret ? ret : copied; 2755 return ret ? ret : copied;
2830} 2756}
2831 2757
2832static void ext4_da_invalidatepage(struct page *page, unsigned long offset) 2758static void ext4_da_invalidatepage(struct page *page, unsigned int offset,
2759 unsigned int length)
2833{ 2760{
2834 /* 2761 /*
2835 * Drop reserved blocks 2762 * Drop reserved blocks
@@ -2838,10 +2765,10 @@ static void ext4_da_invalidatepage(struct page *page, unsigned long offset)
2838 if (!page_has_buffers(page)) 2765 if (!page_has_buffers(page))
2839 goto out; 2766 goto out;
2840 2767
2841 ext4_da_page_release_reservation(page, offset); 2768 ext4_da_page_release_reservation(page, offset, length);
2842 2769
2843out: 2770out:
2844 ext4_invalidatepage(page, offset); 2771 ext4_invalidatepage(page, offset, length);
2845 2772
2846 return; 2773 return;
2847} 2774}
@@ -2864,7 +2791,7 @@ int ext4_alloc_da_blocks(struct inode *inode)
2864 * laptop_mode, not even desirable). However, to do otherwise 2791 * laptop_mode, not even desirable). However, to do otherwise
2865 * would require replicating code paths in: 2792 * would require replicating code paths in:
2866 * 2793 *
2867 * ext4_da_writepages() -> 2794 * ext4_writepages() ->
2868 * write_cache_pages() ---> (via passed in callback function) 2795 * write_cache_pages() ---> (via passed in callback function)
2869 * __mpage_da_writepage() --> 2796 * __mpage_da_writepage() -->
2870 * mpage_add_bh_to_extent() 2797 * mpage_add_bh_to_extent()
@@ -2989,37 +2916,40 @@ ext4_readpages(struct file *file, struct address_space *mapping,
2989 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block); 2916 return mpage_readpages(mapping, pages, nr_pages, ext4_get_block);
2990} 2917}
2991 2918
2992static void ext4_invalidatepage(struct page *page, unsigned long offset) 2919static void ext4_invalidatepage(struct page *page, unsigned int offset,
2920 unsigned int length)
2993{ 2921{
2994 trace_ext4_invalidatepage(page, offset); 2922 trace_ext4_invalidatepage(page, offset, length);
2995 2923
2996 /* No journalling happens on data buffers when this function is used */ 2924 /* No journalling happens on data buffers when this function is used */
2997 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page))); 2925 WARN_ON(page_has_buffers(page) && buffer_jbd(page_buffers(page)));
2998 2926
2999 block_invalidatepage(page, offset); 2927 block_invalidatepage(page, offset, length);
3000} 2928}
3001 2929
3002static int __ext4_journalled_invalidatepage(struct page *page, 2930static int __ext4_journalled_invalidatepage(struct page *page,
3003 unsigned long offset) 2931 unsigned int offset,
2932 unsigned int length)
3004{ 2933{
3005 journal_t *journal = EXT4_JOURNAL(page->mapping->host); 2934 journal_t *journal = EXT4_JOURNAL(page->mapping->host);
3006 2935
3007 trace_ext4_journalled_invalidatepage(page, offset); 2936 trace_ext4_journalled_invalidatepage(page, offset, length);
3008 2937
3009 /* 2938 /*
3010 * If it's a full truncate we just forget about the pending dirtying 2939 * If it's a full truncate we just forget about the pending dirtying
3011 */ 2940 */
3012 if (offset == 0) 2941 if (offset == 0 && length == PAGE_CACHE_SIZE)
3013 ClearPageChecked(page); 2942 ClearPageChecked(page);
3014 2943
3015 return jbd2_journal_invalidatepage(journal, page, offset); 2944 return jbd2_journal_invalidatepage(journal, page, offset, length);
3016} 2945}
3017 2946
3018/* Wrapper for aops... */ 2947/* Wrapper for aops... */
3019static void ext4_journalled_invalidatepage(struct page *page, 2948static void ext4_journalled_invalidatepage(struct page *page,
3020 unsigned long offset) 2949 unsigned int offset,
2950 unsigned int length)
3021{ 2951{
3022 WARN_ON(__ext4_journalled_invalidatepage(page, offset) < 0); 2952 WARN_ON(__ext4_journalled_invalidatepage(page, offset, length) < 0);
3023} 2953}
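With a length argument, only a genuinely full-page invalidation may clear the page's checked state; a partial punch must keep it. A userspace model of the decision (the page size is an assumed value):

/* Userspace model of the full-vs-partial invalidation check; page size
 * is an assumed value. */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 4096u

static bool full_invalidate(unsigned offset, unsigned length)
{
    return offset == 0 && length == PAGE_SIZE;
}

int main(void)
{
    printf("full page: %d, tail punch: %d\n",
           full_invalidate(0, 4096),     /* 1: may ClearPageChecked */
           full_invalidate(1024, 3072)); /* 0: keep the checked state */
    return 0;
}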
3024 2954
3025static int ext4_releasepage(struct page *page, gfp_t wait) 2955static int ext4_releasepage(struct page *page, gfp_t wait)
@@ -3067,9 +2997,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3067 struct inode *inode = file_inode(iocb->ki_filp); 2997 struct inode *inode = file_inode(iocb->ki_filp);
3068 ext4_io_end_t *io_end = iocb->private; 2998 ext4_io_end_t *io_end = iocb->private;
3069 2999
3070 /* if not async direct IO or dio with 0 bytes write, just return */ 3000 /* if not async direct IO just return */
3071 if (!io_end || !size) 3001 if (!io_end) {
3072 goto out; 3002 inode_dio_done(inode);
3003 if (is_async)
3004 aio_complete(iocb, ret, 0);
3005 return;
3006 }
3073 3007
3074 ext_debug("ext4_end_io_dio(): io_end 0x%p " 3008 ext_debug("ext4_end_io_dio(): io_end 0x%p "
3075 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n", 3009 "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
@@ -3077,25 +3011,13 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
3077 size); 3011 size);
3078 3012
3079 iocb->private = NULL; 3013 iocb->private = NULL;
3080
3081 /* if not aio dio with unwritten extents, just free io and return */
3082 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) {
3083 ext4_free_io_end(io_end);
3084out:
3085 inode_dio_done(inode);
3086 if (is_async)
3087 aio_complete(iocb, ret, 0);
3088 return;
3089 }
3090
3091 io_end->offset = offset; 3014 io_end->offset = offset;
3092 io_end->size = size; 3015 io_end->size = size;
3093 if (is_async) { 3016 if (is_async) {
3094 io_end->iocb = iocb; 3017 io_end->iocb = iocb;
3095 io_end->result = ret; 3018 io_end->result = ret;
3096 } 3019 }
3097 3020 ext4_put_io_end_defer(io_end);
3098 ext4_add_complete_io(io_end);
3099} 3021}
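The DIO completion path now relies on io_end reference counting: init takes one reference, the submitter stores a second in iocb->private, and whoever drops the last reference triggers the deferred completion work. A single-threaded userspace sketch of the counting, not the kernel implementation:

/* Single-threaded userspace sketch of the io_end reference counting
 * (not the kernel implementation): init holds one reference, the
 * submitter stores a second one in iocb->private, and whoever drops
 * the last reference triggers the deferred completion work. */
#include <stdio.h>
#include <stdlib.h>

struct io_end { int count; };

static struct io_end *io_end_get(struct io_end *io)
{
    io->count++;
    return io;
}

static void io_end_put(struct io_end *io)
{
    if (--io->count == 0) {
        printf("last reference dropped: free io_end, run conversion\n");
        free(io);
    }
}

int main(void)
{
    struct io_end *io = calloc(1, sizeof(*io));

    if (!io)
        return 1;
    io->count = 1;          /* reference taken at init time */
    io_end_get(io);         /* second reference kept in iocb->private */
    io_end_put(io);         /* completion path drops its reference */
    io_end_put(io);         /* submitter drops the init reference */
    return 0;
}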
3100 3022
3101/* 3023/*
@@ -3129,6 +3051,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3129 get_block_t *get_block_func = NULL; 3051 get_block_t *get_block_func = NULL;
3130 int dio_flags = 0; 3052 int dio_flags = 0;
3131 loff_t final_size = offset + count; 3053 loff_t final_size = offset + count;
3054 ext4_io_end_t *io_end = NULL;
3132 3055
3133 /* Use the old path for reads and writes beyond i_size. */ 3056 /* Use the old path for reads and writes beyond i_size. */
3134 if (rw != WRITE || final_size > inode->i_size) 3057 if (rw != WRITE || final_size > inode->i_size)
@@ -3136,11 +3059,18 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3136 3059
3137 BUG_ON(iocb->private == NULL); 3060 BUG_ON(iocb->private == NULL);
3138 3061
3062 /*
3063 * Make all waiters for direct IO properly wait also for extent
3064 * conversion. This also prevents a race between truncate() and
3065 * overwrite DIO as i_dio_count needs to be incremented under i_mutex.
3066 */
3067 if (rw == WRITE)
3068 atomic_inc(&inode->i_dio_count);
3069
3139 /* If we do an overwrite dio, i_mutex locking can be released */ 3070 /* If we do an overwrite dio, i_mutex locking can be released */
3140 overwrite = *((int *)iocb->private); 3071 overwrite = *((int *)iocb->private);
3141 3072
3142 if (overwrite) { 3073 if (overwrite) {
3143 atomic_inc(&inode->i_dio_count);
3144 down_read(&EXT4_I(inode)->i_data_sem); 3074 down_read(&EXT4_I(inode)->i_data_sem);
3145 mutex_unlock(&inode->i_mutex); 3075 mutex_unlock(&inode->i_mutex);
3146 } 3076 }
@@ -3167,13 +3097,16 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3167 iocb->private = NULL; 3097 iocb->private = NULL;
3168 ext4_inode_aio_set(inode, NULL); 3098 ext4_inode_aio_set(inode, NULL);
3169 if (!is_sync_kiocb(iocb)) { 3099 if (!is_sync_kiocb(iocb)) {
3170 ext4_io_end_t *io_end = ext4_init_io_end(inode, GFP_NOFS); 3100 io_end = ext4_init_io_end(inode, GFP_NOFS);
3171 if (!io_end) { 3101 if (!io_end) {
3172 ret = -ENOMEM; 3102 ret = -ENOMEM;
3173 goto retake_lock; 3103 goto retake_lock;
3174 } 3104 }
3175 io_end->flag |= EXT4_IO_END_DIRECT; 3105 io_end->flag |= EXT4_IO_END_DIRECT;
3176 iocb->private = io_end; 3106 /*
3107 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
3108 */
3109 iocb->private = ext4_get_io_end(io_end);
3177 /* 3110 /*
3178 * we save the io structure for current async direct 3111 * we save the io structure for current async direct
3179 * IO, so that later ext4_map_blocks() could flag the 3112 * IO, so that later ext4_map_blocks() could flag the
@@ -3197,33 +3130,42 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3197 NULL, 3130 NULL,
3198 dio_flags); 3131 dio_flags);
3199 3132
3200 if (iocb->private)
3201 ext4_inode_aio_set(inode, NULL);
3202 /* 3133 /*
3203 * The io_end structure takes a reference to the inode, that 3134 * Put our reference to io_end. This can free the io_end structure e.g.
3204 * structure needs to be destroyed and the reference to the 3135 * in the sync IO case or in case of error. It can even perform extent
3205 * inode need to be dropped, when IO is complete, even with 0 3136 * conversion if all bios we submitted finished before we got here.
3206 * byte write, or failed. 3137 * Note that in that case iocb->private can be already set to NULL
3207 * 3138 * here.
3208 * In the successful AIO DIO case, the io_end structure will
3209 * be destroyed and the reference to the inode will be dropped
3210 * after the end_io call back function is called.
3211 *
3212 * In the case there is 0 byte write, or error case, since VFS
3213 * direct IO won't invoke the end_io call back function, we
3214 * need to free the end_io structure here.
3215 */ 3139 */
3216 if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { 3140 if (io_end) {
3217 ext4_free_io_end(iocb->private); 3141 ext4_inode_aio_set(inode, NULL);
3218 iocb->private = NULL; 3142 ext4_put_io_end(io_end);
3219 } else if (ret > 0 && !overwrite && ext4_test_inode_state(inode, 3143 /*
 3144 * When no IO was submitted, ext4_end_io_dio() was not
 3145 * called, so we have to put iocb's reference.
3146 */
3147 if (ret <= 0 && ret != -EIOCBQUEUED && iocb->private) {
3148 WARN_ON(iocb->private != io_end);
3149 WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
3150 WARN_ON(io_end->iocb);
3151 /*
3152 * Generic code already did inode_dio_done() so we
3153 * have to clear EXT4_IO_END_DIRECT to not do it for
3154 * the second time.
3155 */
3156 io_end->flag = 0;
3157 ext4_put_io_end(io_end);
3158 iocb->private = NULL;
3159 }
3160 }
3161 if (ret > 0 && !overwrite && ext4_test_inode_state(inode,
3220 EXT4_STATE_DIO_UNWRITTEN)) { 3162 EXT4_STATE_DIO_UNWRITTEN)) {
3221 int err; 3163 int err;
3222 /* 3164 /*
3223 * for non AIO case, since the IO is already 3165 * for non AIO case, since the IO is already
3224 * completed, we could do the conversion right here 3166 * completed, we could do the conversion right here
3225 */ 3167 */
3226 err = ext4_convert_unwritten_extents(inode, 3168 err = ext4_convert_unwritten_extents(NULL, inode,
3227 offset, ret); 3169 offset, ret);
3228 if (err < 0) 3170 if (err < 0)
3229 ret = err; 3171 ret = err;
@@ -3231,9 +3173,10 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3231 } 3173 }
3232 3174
3233retake_lock: 3175retake_lock:
3176 if (rw == WRITE)
3177 inode_dio_done(inode);
3234 /* take i_mutex locking again if we do an overwrite dio */ 3178 /* take i_mutex locking again if we do an overwrite dio */
3235 if (overwrite) { 3179 if (overwrite) {
3236 inode_dio_done(inode);
3237 up_read(&EXT4_I(inode)->i_data_sem); 3180 up_read(&EXT4_I(inode)->i_data_sem);
3238 mutex_lock(&inode->i_mutex); 3181 mutex_lock(&inode->i_mutex);
3239 } 3182 }
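
The write path now takes the inode's DIO count before any locks are dropped and releases it on the common retake_lock path, so inode_dio_wait() on the truncate side reliably covers overwrite DIO as well. A rough pthread-based stand-in for that counter-and-barrier protocol (not the kernel's implementation, which uses an atomic counter and a wait queue):

    #include <pthread.h>

    struct dio_count {
            pthread_mutex_t lock;
            pthread_cond_t  idle;
            int             in_flight;
    };

    void dio_begin(struct dio_count *d)     /* atomic_inc(&inode->i_dio_count) */
    {
            pthread_mutex_lock(&d->lock);
            d->in_flight++;
            pthread_mutex_unlock(&d->lock);
    }

    void dio_done(struct dio_count *d)      /* inode_dio_done(inode) */
    {
            pthread_mutex_lock(&d->lock);
            if (--d->in_flight == 0)
                    pthread_cond_broadcast(&d->idle);
            pthread_mutex_unlock(&d->lock);
    }

    void dio_wait(struct dio_count *d)      /* inode_dio_wait(inode), truncate side */
    {
            pthread_mutex_lock(&d->lock);
            while (d->in_flight)
                    pthread_cond_wait(&d->idle, &d->lock);
            pthread_mutex_unlock(&d->lock);
    }
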
@@ -3292,6 +3235,7 @@ static const struct address_space_operations ext4_aops = {
3292 .readpage = ext4_readpage, 3235 .readpage = ext4_readpage,
3293 .readpages = ext4_readpages, 3236 .readpages = ext4_readpages,
3294 .writepage = ext4_writepage, 3237 .writepage = ext4_writepage,
3238 .writepages = ext4_writepages,
3295 .write_begin = ext4_write_begin, 3239 .write_begin = ext4_write_begin,
3296 .write_end = ext4_write_end, 3240 .write_end = ext4_write_end,
3297 .bmap = ext4_bmap, 3241 .bmap = ext4_bmap,
@@ -3307,6 +3251,7 @@ static const struct address_space_operations ext4_journalled_aops = {
3307 .readpage = ext4_readpage, 3251 .readpage = ext4_readpage,
3308 .readpages = ext4_readpages, 3252 .readpages = ext4_readpages,
3309 .writepage = ext4_writepage, 3253 .writepage = ext4_writepage,
3254 .writepages = ext4_writepages,
3310 .write_begin = ext4_write_begin, 3255 .write_begin = ext4_write_begin,
3311 .write_end = ext4_journalled_write_end, 3256 .write_end = ext4_journalled_write_end,
3312 .set_page_dirty = ext4_journalled_set_page_dirty, 3257 .set_page_dirty = ext4_journalled_set_page_dirty,
@@ -3322,7 +3267,7 @@ static const struct address_space_operations ext4_da_aops = {
3322 .readpage = ext4_readpage, 3267 .readpage = ext4_readpage,
3323 .readpages = ext4_readpages, 3268 .readpages = ext4_readpages,
3324 .writepage = ext4_writepage, 3269 .writepage = ext4_writepage,
3325 .writepages = ext4_da_writepages, 3270 .writepages = ext4_writepages,
3326 .write_begin = ext4_da_write_begin, 3271 .write_begin = ext4_da_write_begin,
3327 .write_end = ext4_da_write_end, 3272 .write_end = ext4_da_write_end,
3328 .bmap = ext4_bmap, 3273 .bmap = ext4_bmap,
@@ -3355,89 +3300,56 @@ void ext4_set_aops(struct inode *inode)
3355 inode->i_mapping->a_ops = &ext4_aops; 3300 inode->i_mapping->a_ops = &ext4_aops;
3356} 3301}
3357 3302
3358
3359/* 3303/*
3360 * ext4_discard_partial_page_buffers() 3304 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3361 * Wrapper function for ext4_discard_partial_page_buffers_no_lock. 3305 * up to the end of the block which corresponds to `from'.
3362 * This function finds and locks the page containing the offset 3306 * This required during truncate. We need to physically zero the tail end
3363 * "from" and passes it to ext4_discard_partial_page_buffers_no_lock. 3307 * of that block so it doesn't yield old data if the file is later grown.
3364 * Calling functions that already have the page locked should call
3365 * ext4_discard_partial_page_buffers_no_lock directly.
3366 */ 3308 */
3367int ext4_discard_partial_page_buffers(handle_t *handle, 3309int ext4_block_truncate_page(handle_t *handle,
3368 struct address_space *mapping, loff_t from, 3310 struct address_space *mapping, loff_t from)
3369 loff_t length, int flags)
3370{ 3311{
3312 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3313 unsigned length;
3314 unsigned blocksize;
3371 struct inode *inode = mapping->host; 3315 struct inode *inode = mapping->host;
3372 struct page *page;
3373 int err = 0;
3374 3316
3375 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT, 3317 blocksize = inode->i_sb->s_blocksize;
3376 mapping_gfp_mask(mapping) & ~__GFP_FS); 3318 length = blocksize - (offset & (blocksize - 1));
3377 if (!page)
3378 return -ENOMEM;
3379
3380 err = ext4_discard_partial_page_buffers_no_lock(handle, inode, page,
3381 from, length, flags);
3382 3319
3383 unlock_page(page); 3320 return ext4_block_zero_page_range(handle, mapping, from, length);
3384 page_cache_release(page);
3385 return err;
3386} 3321}
3387 3322
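
The tail computation above is plain power-of-two arithmetic: the in-block offset is the low bits of the file offset and the zeroed length runs to the end of that block. A standalone sketch, folding the page and block masking into one step (equivalent as long as the blocksize divides the page size):

    #include <stdio.h>

    /* Range to zero from 'from' to the end of its containing block. */
    static void tail_range(unsigned long from, unsigned blocksize,
                           unsigned *offset, unsigned *length)
    {
            *offset = from & (blocksize - 1);       /* blocksize is a power of two */
            *length = blocksize - *offset;
    }

    int main(void)
    {
            unsigned off, len;

            /* i_size = 10000 with 4K blocks: zero 2288 bytes at in-block offset 1808 */
            tail_range(10000, 4096, &off, &len);
            printf("offset=%u length=%u\n", off, len);
            return 0;
    }
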
3388/* 3323/*
3389 * ext4_discard_partial_page_buffers_no_lock() 3324 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3390 * Zeros a page range of length 'length' starting from offset 'from'. 3325 * starting from file offset 'from'. The range to be zeroed must
3391 * Buffer heads that correspond to the block aligned regions of the 3326 * be contained within one block. If the specified range exceeds
3392 * zeroed range will be unmapped. Unblock aligned regions 3327 * the end of the block, it will be shortened to the end of the block
3393 * will have the corresponding buffer head mapped if needed so that 3328 * that corresponds to 'from'.
3394 * that region of the page can be updated with the partial zero out.
3395 *
3396 * This function assumes that the page has already been locked.
3397 * The range to be discarded must be contained within the given page.
3398 * If the specified range exceeds the end of the page it will be shortened
3399 * to the end of the page that corresponds to 'from'. This function is
3400 * appropriate for updating a page and its buffer heads to be unmapped and
3401 * zeroed for blocks that have been either released, or are going to be
3402 * released.
3403 *
3404 * handle: The journal handle
3405 * inode: The files inode
3406 * page: A locked page that contains the offset "from"
3407 * from: The starting byte offset (from the beginning of the file)
3408 * to begin discarding
3409 * len: The length of bytes to discard
3410 * flags: Optional flags that may be used:
3411 *
3412 * EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED
3413 * Only zero the regions of the page whose buffer heads
3414 * have already been unmapped. This flag is appropriate
3415 * for updating the contents of a page whose blocks may
3416 * have already been released, and we only want to zero
3417 * out the regions that correspond to those released blocks.
3418 *
3419 * Returns zero on success or negative on failure.
3420 */ 3329 */
3421static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle, 3330int ext4_block_zero_page_range(handle_t *handle,
3422 struct inode *inode, struct page *page, loff_t from, 3331 struct address_space *mapping, loff_t from, loff_t length)
3423 loff_t length, int flags)
3424{ 3332{
3425 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3333 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3426 unsigned int offset = from & (PAGE_CACHE_SIZE-1); 3334 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3427 unsigned int blocksize, max, pos; 3335 unsigned blocksize, max, pos;
3428 ext4_lblk_t iblock; 3336 ext4_lblk_t iblock;
3337 struct inode *inode = mapping->host;
3429 struct buffer_head *bh; 3338 struct buffer_head *bh;
3339 struct page *page;
3430 int err = 0; 3340 int err = 0;
3431 3341
3432 blocksize = inode->i_sb->s_blocksize; 3342 page = find_or_create_page(mapping, from >> PAGE_CACHE_SHIFT,
3433 max = PAGE_CACHE_SIZE - offset; 3343 mapping_gfp_mask(mapping) & ~__GFP_FS);
3344 if (!page)
3345 return -ENOMEM;
3434 3346
3435 if (index != page->index) 3347 blocksize = inode->i_sb->s_blocksize;
3436 return -EINVAL; 3348 max = blocksize - (offset & (blocksize - 1));
3437 3349
3438 /* 3350 /*
3439 * correct length if it does not fall between 3351 * correct length if it does not fall between
3440 * 'from' and the end of the page 3352 * 'from' and the end of the block
3441 */ 3353 */
3442 if (length > max || length < 0) 3354 if (length > max || length < 0)
3443 length = max; 3355 length = max;
@@ -3455,106 +3367,91 @@ static int ext4_discard_partial_page_buffers_no_lock(handle_t *handle,
3455 iblock++; 3367 iblock++;
3456 pos += blocksize; 3368 pos += blocksize;
3457 } 3369 }
3458 3370 if (buffer_freed(bh)) {
3459 pos = offset; 3371 BUFFER_TRACE(bh, "freed: skip");
3460 while (pos < offset + length) { 3372 goto unlock;
3461 unsigned int end_of_block, range_to_discard; 3373 }
3462 3374 if (!buffer_mapped(bh)) {
3463 err = 0; 3375 BUFFER_TRACE(bh, "unmapped");
3464 3376 ext4_get_block(inode, iblock, bh, 0);
3465 /* The length of space left to zero and unmap */ 3377 /* unmapped? It's a hole - nothing to do */
3466 range_to_discard = offset + length - pos;
3467
3468 /* The length of space until the end of the block */
3469 end_of_block = blocksize - (pos & (blocksize-1));
3470
3471 /*
3472 * Do not unmap or zero past end of block
3473 * for this buffer head
3474 */
3475 if (range_to_discard > end_of_block)
3476 range_to_discard = end_of_block;
3477
3478
3479 /*
3480 * Skip this buffer head if we are only zeroing unmapped
3481 * regions of the page
3482 */
3483 if (flags & EXT4_DISCARD_PARTIAL_PG_ZERO_UNMAPPED &&
3484 buffer_mapped(bh))
3485 goto next;
3486
3487 /* If the range is block aligned, unmap */
3488 if (range_to_discard == blocksize) {
3489 clear_buffer_dirty(bh);
3490 bh->b_bdev = NULL;
3491 clear_buffer_mapped(bh);
3492 clear_buffer_req(bh);
3493 clear_buffer_new(bh);
3494 clear_buffer_delay(bh);
3495 clear_buffer_unwritten(bh);
3496 clear_buffer_uptodate(bh);
3497 zero_user(page, pos, range_to_discard);
3498 BUFFER_TRACE(bh, "Buffer discarded");
3499 goto next;
3500 }
3501
3502 /*
3503 * If this block is not completely contained in the range
3504 * to be discarded, then it is not going to be released. Because
3505 * we need to keep this block, we need to make sure this part
3506 * of the page is uptodate before we modify it by writing
3507 * partial zeros on it.
3508 */
3509 if (!buffer_mapped(bh)) { 3378 if (!buffer_mapped(bh)) {
3510 /* 3379 BUFFER_TRACE(bh, "still unmapped");
3511 * Buffer head must be mapped before we can read 3380 goto unlock;
3512 * from the block
3513 */
3514 BUFFER_TRACE(bh, "unmapped");
3515 ext4_get_block(inode, iblock, bh, 0);
3516 /* unmapped? It's a hole - nothing to do */
3517 if (!buffer_mapped(bh)) {
3518 BUFFER_TRACE(bh, "still unmapped");
3519 goto next;
3520 }
3521 } 3381 }
3382 }
3522 3383
3523 /* Ok, it's mapped. Make sure it's up-to-date */ 3384 /* Ok, it's mapped. Make sure it's up-to-date */
3524 if (PageUptodate(page)) 3385 if (PageUptodate(page))
3525 set_buffer_uptodate(bh); 3386 set_buffer_uptodate(bh);
3526 3387
3527 if (!buffer_uptodate(bh)) { 3388 if (!buffer_uptodate(bh)) {
3528 err = -EIO; 3389 err = -EIO;
3529 ll_rw_block(READ, 1, &bh); 3390 ll_rw_block(READ, 1, &bh);
3530 wait_on_buffer(bh); 3391 wait_on_buffer(bh);
3531 /* Uhhuh. Read error. Complain and punt.*/ 3392 /* Uhhuh. Read error. Complain and punt. */
3532 if (!buffer_uptodate(bh)) 3393 if (!buffer_uptodate(bh))
3533 goto next; 3394 goto unlock;
3534 } 3395 }
3396 if (ext4_should_journal_data(inode)) {
3397 BUFFER_TRACE(bh, "get write access");
3398 err = ext4_journal_get_write_access(handle, bh);
3399 if (err)
3400 goto unlock;
3401 }
3402 zero_user(page, offset, length);
3403 BUFFER_TRACE(bh, "zeroed end of block");
3535 3404
3536 if (ext4_should_journal_data(inode)) { 3405 if (ext4_should_journal_data(inode)) {
3537 BUFFER_TRACE(bh, "get write access"); 3406 err = ext4_handle_dirty_metadata(handle, inode, bh);
3538 err = ext4_journal_get_write_access(handle, bh); 3407 } else {
3539 if (err) 3408 err = 0;
3540 goto next; 3409 mark_buffer_dirty(bh);
3541 } 3410 if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE))
3411 err = ext4_jbd2_file_inode(handle, inode);
3412 }
3542 3413
3543 zero_user(page, pos, range_to_discard); 3414unlock:
3415 unlock_page(page);
3416 page_cache_release(page);
3417 return err;
3418}
3544 3419
3545 err = 0; 3420int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
3546 if (ext4_should_journal_data(inode)) { 3421 loff_t lstart, loff_t length)
3547 err = ext4_handle_dirty_metadata(handle, inode, bh); 3422{
3548 } else 3423 struct super_block *sb = inode->i_sb;
3549 mark_buffer_dirty(bh); 3424 struct address_space *mapping = inode->i_mapping;
3425 unsigned partial_start, partial_end;
3426 ext4_fsblk_t start, end;
3427 loff_t byte_end = (lstart + length - 1);
3428 int err = 0;
3550 3429
3551 BUFFER_TRACE(bh, "Partial buffer zeroed"); 3430 partial_start = lstart & (sb->s_blocksize - 1);
3552next: 3431 partial_end = byte_end & (sb->s_blocksize - 1);
3553 bh = bh->b_this_page;
3554 iblock++;
3555 pos += range_to_discard;
3556 }
3557 3432
3433 start = lstart >> sb->s_blocksize_bits;
3434 end = byte_end >> sb->s_blocksize_bits;
3435
3436 /* Handle partial zero within the single block */
3437 if (start == end &&
3438 (partial_start || (partial_end != sb->s_blocksize - 1))) {
3439 err = ext4_block_zero_page_range(handle, mapping,
3440 lstart, length);
3441 return err;
3442 }
3443 /* Handle partial zero out on the start of the range */
3444 if (partial_start) {
3445 err = ext4_block_zero_page_range(handle, mapping,
3446 lstart, sb->s_blocksize);
3447 if (err)
3448 return err;
3449 }
3450 /* Handle partial zero out on the end of the range */
3451 if (partial_end != sb->s_blocksize - 1)
3452 err = ext4_block_zero_page_range(handle, mapping,
3453 byte_end - partial_end,
3454 partial_end + 1);
3558 return err; 3455 return err;
3559} 3456}
3560 3457
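
ext4_zero_partial_blocks() thus issues at most two zeroing calls, one for an unaligned head and one for an unaligned tail, with the range-within-one-block case handled first; fully covered blocks are left for the extent-removal code. A compact model of that case analysis, assuming a power-of-two blocksize:

    #include <stdio.h>

    static void zero_partial(unsigned long long lstart, unsigned long long length,
                             unsigned blocksize)
    {
            unsigned long long byte_end = lstart + length - 1;
            unsigned partial_start = lstart & (blocksize - 1);
            unsigned partial_end = byte_end & (blocksize - 1);
            unsigned long long start = lstart / blocksize;
            unsigned long long end = byte_end / blocksize;

            if (start == end) {             /* range within a single block */
                    if (partial_start || partial_end != blocksize - 1)
                            printf("zero [%llu, %llu]\n", lstart, byte_end);
                    return;
            }
            if (partial_start)              /* unaligned head */
                    printf("zero [%llu, %llu]\n", lstart,
                           lstart + (blocksize - partial_start) - 1);
            if (partial_end != blocksize - 1)       /* unaligned tail */
                    printf("zero [%llu, %llu]\n", byte_end - partial_end, byte_end);
    }

    int main(void)
    {
            /* prints "zero [1000, 4095]" and "zero [8192, 10999]" */
            zero_partial(1000, 10000, 4096);
            return 0;
    }
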
@@ -3580,14 +3477,12 @@ int ext4_can_truncate(struct inode *inode)
3580 * Returns: 0 on success or negative on failure 3477 * Returns: 0 on success or negative on failure
3581 */ 3478 */
3582 3479
3583int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) 3480int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
3584{ 3481{
3585 struct inode *inode = file_inode(file);
3586 struct super_block *sb = inode->i_sb; 3482 struct super_block *sb = inode->i_sb;
3587 ext4_lblk_t first_block, stop_block; 3483 ext4_lblk_t first_block, stop_block;
3588 struct address_space *mapping = inode->i_mapping; 3484 struct address_space *mapping = inode->i_mapping;
3589 loff_t first_page, last_page, page_len; 3485 loff_t first_block_offset, last_block_offset;
3590 loff_t first_page_offset, last_page_offset;
3591 handle_t *handle; 3486 handle_t *handle;
3592 unsigned int credits; 3487 unsigned int credits;
3593 int ret = 0; 3488 int ret = 0;
@@ -3638,23 +3533,16 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3638 offset; 3533 offset;
3639 } 3534 }
3640 3535
3641 first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 3536 first_block_offset = round_up(offset, sb->s_blocksize);
3642 last_page = (offset + length) >> PAGE_CACHE_SHIFT; 3537 last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
3643 3538
3644 first_page_offset = first_page << PAGE_CACHE_SHIFT; 3539 /* Now release the pages and zero the block-aligned part of them */
3645 last_page_offset = last_page << PAGE_CACHE_SHIFT; 3540 if (last_block_offset > first_block_offset)
3646 3541 truncate_pagecache_range(inode, first_block_offset,
3647 /* Now release the pages */ 3542 last_block_offset);
3648 if (last_page_offset > first_page_offset) {
3649 truncate_pagecache_range(inode, first_page_offset,
3650 last_page_offset - 1);
3651 }
3652 3543
3653 /* Wait for all existing dio workers; newcomers will block on i_mutex */ 3544 /* Wait for all existing dio workers; newcomers will block on i_mutex */
3654 ext4_inode_block_unlocked_dio(inode); 3545 ext4_inode_block_unlocked_dio(inode);
3655 ret = ext4_flush_unwritten_io(inode);
3656 if (ret)
3657 goto out_dio;
3658 inode_dio_wait(inode); 3546 inode_dio_wait(inode);
3659 3547
3660 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3548 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -3668,66 +3556,10 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
3668 goto out_dio; 3556 goto out_dio;
3669 } 3557 }
3670 3558
3671 /* 3559 ret = ext4_zero_partial_blocks(handle, inode, offset,
3672 * Now we need to zero out the non-page-aligned data in the 3560 length);
3673 * pages at the start and tail of the hole, and unmap the 3561 if (ret)
3674 * buffer heads for the block aligned regions of the page that 3562 goto out_stop;
3675 * were completely zeroed.
3676 */
3677 if (first_page > last_page) {
3678 /*
3679 * If the file space being truncated is contained
3680 * within a page just zero out and unmap the middle of
3681 * that page
3682 */
3683 ret = ext4_discard_partial_page_buffers(handle,
3684 mapping, offset, length, 0);
3685
3686 if (ret)
3687 goto out_stop;
3688 } else {
3689 /*
3690 * zero out and unmap the partial page that contains
3691 * the start of the hole
3692 */
3693 page_len = first_page_offset - offset;
3694 if (page_len > 0) {
3695 ret = ext4_discard_partial_page_buffers(handle, mapping,
3696 offset, page_len, 0);
3697 if (ret)
3698 goto out_stop;
3699 }
3700
3701 /*
3702 * zero out and unmap the partial page that contains
3703 * the end of the hole
3704 */
3705 page_len = offset + length - last_page_offset;
3706 if (page_len > 0) {
3707 ret = ext4_discard_partial_page_buffers(handle, mapping,
3708 last_page_offset, page_len, 0);
3709 if (ret)
3710 goto out_stop;
3711 }
3712 }
3713
3714 /*
3715 * If i_size is contained in the last page, we need to
3716 * unmap and zero the partial page after i_size
3717 */
3718 if (inode->i_size >> PAGE_CACHE_SHIFT == last_page &&
3719 inode->i_size % PAGE_CACHE_SIZE != 0) {
3720 page_len = PAGE_CACHE_SIZE -
3721 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3722
3723 if (page_len > 0) {
3724 ret = ext4_discard_partial_page_buffers(handle,
3725 mapping, inode->i_size, page_len, 0);
3726
3727 if (ret)
3728 goto out_stop;
3729 }
3730 }
3731 3563
3732 first_block = (offset + sb->s_blocksize - 1) >> 3564 first_block = (offset + sb->s_blocksize - 1) >>
3733 EXT4_BLOCK_SIZE_BITS(sb); 3565 EXT4_BLOCK_SIZE_BITS(sb);
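
Note the rounding direction in the hunk above: page cache is dropped only for bytes the hole fully covers, so the first offset rounds up to a block boundary and the last rounds down. A worked example of the arithmetic:

    #include <stdio.h>

    #define ROUND_UP(x, a)   ((((x) + (a) - 1) / (a)) * (a))
    #define ROUND_DOWN(x, a) (((x) / (a)) * (a))

    int main(void)
    {
            unsigned long long offset = 5000, length = 20000, bs = 4096;
            unsigned long long first = ROUND_UP(offset, bs);                /* 8192  */
            unsigned long long last = ROUND_DOWN(offset + length, bs) - 1;  /* 24575 */

            /* only whole blocks inside [offset, offset + length) are dropped */
            if (last > first)
                    printf("truncate_pagecache_range(%llu, %llu)\n", first, last);
            return 0;
    }
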
@@ -3803,7 +3635,6 @@ void ext4_truncate(struct inode *inode)
3803 unsigned int credits; 3635 unsigned int credits;
3804 handle_t *handle; 3636 handle_t *handle;
3805 struct address_space *mapping = inode->i_mapping; 3637 struct address_space *mapping = inode->i_mapping;
3806 loff_t page_len;
3807 3638
3808 /* 3639 /*
3809 * There is a possibility that we're either freeing the inode 3640 * There is a possibility that we're either freeing the inode
@@ -3830,12 +3661,6 @@ void ext4_truncate(struct inode *inode)
3830 return; 3661 return;
3831 } 3662 }
3832 3663
3833 /*
3834 * finish any pending end_io work so we won't run the risk of
3835 * converting any truncated blocks to initialized later
3836 */
3837 ext4_flush_unwritten_io(inode);
3838
3839 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) 3664 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3840 credits = ext4_writepage_trans_blocks(inode); 3665 credits = ext4_writepage_trans_blocks(inode);
3841 else 3666 else
@@ -3847,14 +3672,8 @@ void ext4_truncate(struct inode *inode)
3847 return; 3672 return;
3848 } 3673 }
3849 3674
3850 if (inode->i_size % PAGE_CACHE_SIZE != 0) { 3675 if (inode->i_size & (inode->i_sb->s_blocksize - 1))
3851 page_len = PAGE_CACHE_SIZE - 3676 ext4_block_truncate_page(handle, mapping, inode->i_size);
3852 (inode->i_size & (PAGE_CACHE_SIZE - 1));
3853
3854 if (ext4_discard_partial_page_buffers(handle,
3855 mapping, inode->i_size, page_len, 0))
3856 goto out_stop;
3857 }
3858 3677
3859 /* 3678 /*
3860 * We add the inode to the orphan list, so that if this 3679 * We add the inode to the orphan list, so that if this
@@ -4623,7 +4442,8 @@ static void ext4_wait_for_tail_page_commit(struct inode *inode)
4623 inode->i_size >> PAGE_CACHE_SHIFT); 4442 inode->i_size >> PAGE_CACHE_SHIFT);
4624 if (!page) 4443 if (!page)
4625 return; 4444 return;
4626 ret = __ext4_journalled_invalidatepage(page, offset); 4445 ret = __ext4_journalled_invalidatepage(page, offset,
4446 PAGE_CACHE_SIZE - offset);
4627 unlock_page(page); 4447 unlock_page(page);
4628 page_cache_release(page); 4448 page_cache_release(page);
4629 if (ret != -EBUSY) 4449 if (ret != -EBUSY)
@@ -4805,7 +4625,7 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4805 struct kstat *stat) 4625 struct kstat *stat)
4806{ 4626{
4807 struct inode *inode; 4627 struct inode *inode;
4808 unsigned long delalloc_blocks; 4628 unsigned long long delalloc_blocks;
4809 4629
4810 inode = dentry->d_inode; 4630 inode = dentry->d_inode;
4811 generic_fillattr(inode, stat); 4631 generic_fillattr(inode, stat);
@@ -4823,15 +4643,16 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry,
4823 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb), 4643 delalloc_blocks = EXT4_C2B(EXT4_SB(inode->i_sb),
4824 EXT4_I(inode)->i_reserved_data_blocks); 4644 EXT4_I(inode)->i_reserved_data_blocks);
4825 4645
4826 stat->blocks += (delalloc_blocks << inode->i_sb->s_blocksize_bits)>>9; 4646 stat->blocks += delalloc_blocks << (inode->i_sb->s_blocksize_bits-9);
4827 return 0; 4647 return 0;
4828} 4648}
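
Widening delalloc_blocks matters here: with 32-bit arithmetic, shifting the block count left by the blocksize bits can overflow before the >> 9 ever runs, while a single << (blocksize_bits - 9) on a 64-bit value cannot. A small demonstration of the failure mode, using unsigned int to model the 32-bit case:

    #include <stdio.h>

    int main(void)
    {
            unsigned long blocks = 0x300000;        /* 3M delalloc blocks */
            unsigned bits = 12;                     /* 4K blocksize       */

            /* 32-bit math: 0x300000 << 12 wraps to 0 before the shift right */
            unsigned int bad = ((unsigned int)blocks << bits) >> 9;
            unsigned long long good = (unsigned long long)blocks << (bits - 9);

            printf("bad=%u good=%llu\n", bad, good);        /* bad=0 good=25165824 */
            return 0;
    }
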
4829 4649
4830static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4650static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
4651 int pextents)
4831{ 4652{
4832 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 4653 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
4833 return ext4_ind_trans_blocks(inode, nrblocks, chunk); 4654 return ext4_ind_trans_blocks(inode, lblocks);
4834 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 4655 return ext4_ext_index_trans_blocks(inode, pextents);
4835} 4656}
4836 4657
4837/* 4658/*
@@ -4845,7 +4666,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4845 * 4666 *
4846 * Also account for superblock, inode, quota and xattr blocks 4667 * Also account for superblock, inode, quota and xattr blocks
4847 */ 4668 */
4848static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) 4669static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
4670 int pextents)
4849{ 4671{
4850 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); 4672 ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
4851 int gdpblocks; 4673 int gdpblocks;
@@ -4853,14 +4675,10 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4853 int ret = 0; 4675 int ret = 0;
4854 4676
4855 /* 4677 /*
4856 * How many index blocks need to touch to modify nrblocks? 4678 * How many index blocks need to touch to map @lblocks logical blocks
4857 * The "Chunk" flag indicating whether the nrblocks is 4679 * to @pextents physical extents?
4858 * physically contiguous on disk
4859 *
4860 * For Direct IO and fallocate, they calls get_block to allocate
4861 * one single extent at a time, so they could set the "Chunk" flag
4862 */ 4680 */
4863 idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); 4681 idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
4864 4682
4865 ret = idxblocks; 4683 ret = idxblocks;
4866 4684
@@ -4868,12 +4686,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
4868 * Now let's see how many group bitmaps and group descriptors need 4686 * Now let's see how many group bitmaps and group descriptors need
4869 * to account 4687 * to account
4870 */ 4688 */
4871 groups = idxblocks; 4689 groups = idxblocks + pextents;
4872 if (chunk)
4873 groups += 1;
4874 else
4875 groups += nrblocks;
4876
4877 gdpblocks = groups; 4690 gdpblocks = groups;
4878 if (groups > ngroups) 4691 if (groups > ngroups)
4879 groups = ngroups; 4692 groups = ngroups;
@@ -4904,7 +4717,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
4904 int bpp = ext4_journal_blocks_per_page(inode); 4717 int bpp = ext4_journal_blocks_per_page(inode);
4905 int ret; 4718 int ret;
4906 4719
4907 ret = ext4_meta_trans_blocks(inode, bpp, 0); 4720 ret = ext4_meta_trans_blocks(inode, bpp, bpp);
4908 4721
4909 /* Account for data blocks for journalled mode */ 4722 /* Account for data blocks for journalled mode */
4910 if (ext4_should_journal_data(inode)) 4723 if (ext4_should_journal_data(inode))
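
With the chunk flag gone, the estimate scales index blocks with the logical range and bitmap/descriptor updates with the extent count. A rough model of the new accounting; the idxblocks input is a placeholder for the indirect/extent-specific math, and the superblock, inode and quota constants are left out:

    #include <stdio.h>

    int meta_trans_blocks(int idxblocks, int pextents, int ngroups, int gdb_count)
    {
            int groups = idxblocks + pextents;      /* block bitmaps we may dirty */
            int gdpblocks = groups;                 /* group descriptors likewise */

            if (groups > ngroups)
                    groups = ngroups;
            if (gdpblocks > gdb_count)
                    gdpblocks = gdb_count;

            return idxblocks + groups + gdpblocks;
    }

    int main(void)
    {
            /* one index block, one extent: prints credits=5 */
            printf("credits=%d\n", meta_trans_blocks(1, 1, 1024, 8));
            return 0;
    }
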
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 9491ac0590f7..c0427e2f6648 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -77,8 +77,10 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2)
77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data)); 77 memswap(ei1->i_data, ei2->i_data, sizeof(ei1->i_data));
78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags)); 78 memswap(&ei1->i_flags, &ei2->i_flags, sizeof(ei1->i_flags));
79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); 79 memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize));
80 memswap(&ei1->i_es_tree, &ei2->i_es_tree, sizeof(ei1->i_es_tree)); 80 ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS);
81 memswap(&ei1->i_es_lru_nr, &ei2->i_es_lru_nr, sizeof(ei1->i_es_lru_nr)); 81 ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS);
82 ext4_es_lru_del(inode1);
83 ext4_es_lru_del(inode2);
82 84
83 isize = i_size_read(inode1); 85 isize = i_size_read(inode1);
84 i_size_write(inode1, i_size_read(inode2)); 86 i_size_write(inode1, i_size_read(inode2));
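
The swap helper is built on a field-wise memswap(); the extent-status trees are evicted rather than swapped, presumably because their nodes and LRU linkage are tied to the owning inode and a raw swap would leave them attached to the wrong one. For reference, a generic byte-wise memswap along these lines:

    #include <stddef.h>

    void memswap(void *a, void *b, size_t len)
    {
            unsigned char *pa = a, *pb = b;

            while (len--) {
                    unsigned char tmp = *pa;

                    *pa++ = *pb;
                    *pb++ = tmp;
            }
    }
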
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index def84082a9a9..4bbbf13bd743 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2105,6 +2105,7 @@ repeat:
2105 group = ac->ac_g_ex.fe_group; 2105 group = ac->ac_g_ex.fe_group;
2106 2106
2107 for (i = 0; i < ngroups; group++, i++) { 2107 for (i = 0; i < ngroups; group++, i++) {
2108 cond_resched();
2108 /* 2109 /*
2109 * Artificially restricted ngroups for non-extent 2110 * Artificially restricted ngroups for non-extent
2110 * files makes group > ngroups possible on first loop. 2111 * files makes group > ngroups possible on first loop.
@@ -4405,17 +4406,20 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
4405repeat: 4406repeat:
4406 /* allocate space in core */ 4407 /* allocate space in core */
4407 *errp = ext4_mb_regular_allocator(ac); 4408 *errp = ext4_mb_regular_allocator(ac);
4408 if (*errp) { 4409 if (*errp)
4409 ext4_discard_allocated_blocks(ac); 4410 goto discard_and_exit;
4410 goto errout;
4411 }
4412 4411
4413 /* as we've just preallocated more space than 4412 /* as we've just preallocated more space than
4414 * user requested orinally, we store allocated 4413 * user requested originally, we store allocated
4415 * space in a special descriptor */ 4414 * space in a special descriptor */
4416 if (ac->ac_status == AC_STATUS_FOUND && 4415 if (ac->ac_status == AC_STATUS_FOUND &&
4417 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) 4416 ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len)
4418 ext4_mb_new_preallocation(ac); 4417 *errp = ext4_mb_new_preallocation(ac);
4418 if (*errp) {
4419 discard_and_exit:
4420 ext4_discard_allocated_blocks(ac);
4421 goto errout;
4422 }
4419 } 4423 }
4420 if (likely(ac->ac_status == AC_STATUS_FOUND)) { 4424 if (likely(ac->ac_status == AC_STATUS_FOUND)) {
4421 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); 4425 *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
@@ -4612,10 +4616,11 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
4612 BUG_ON(bh && (count > 1)); 4616 BUG_ON(bh && (count > 1));
4613 4617
4614 for (i = 0; i < count; i++) { 4618 for (i = 0; i < count; i++) {
4619 cond_resched();
4615 if (!bh) 4620 if (!bh)
4616 tbh = sb_find_get_block(inode->i_sb, 4621 tbh = sb_find_get_block(inode->i_sb,
4617 block + i); 4622 block + i);
4618 if (unlikely(!tbh)) 4623 if (!tbh)
4619 continue; 4624 continue;
4620 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 4625 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4621 inode, tbh, block + i); 4626 inode, tbh, block + i);
@@ -4735,11 +4740,16 @@ do_more:
4735 * blocks being freed are metadata. These blocks shouldn't 4740 * blocks being freed are metadata. These blocks shouldn't
4736 * be used until this transaction is committed 4741 * be used until this transaction is committed
4737 */ 4742 */
4743 retry:
4738 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS); 4744 new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
4739 if (!new_entry) { 4745 if (!new_entry) {
4740 ext4_mb_unload_buddy(&e4b); 4746 /*
4741 err = -ENOMEM; 4747 * We use a retry loop because
4742 goto error_return; 4748 * ext4_free_blocks() is not allowed to fail.
4749 */
4750 cond_resched();
4751 congestion_wait(BLK_RW_ASYNC, HZ/50);
4752 goto retry;
4743 } 4753 }
4744 new_entry->efd_start_cluster = bit; 4754 new_entry->efd_start_cluster = bit;
4745 new_entry->efd_group = block_group; 4755 new_entry->efd_group = block_group;
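
Because ext4_free_blocks() has no way to report failure to its callers, the metadata path now loops on the allocation with a back-off instead of bailing out. The shape of that must-not-fail pattern in plain C, with nanosleep() standing in for congestion_wait():

    #include <stdlib.h>
    #include <time.h>

    static void *alloc_nofail(size_t size)
    {
            void *p;

            while (!(p = malloc(size))) {
                    /* back off and retry: the caller cannot tolerate failure */
                    struct timespec ts = { 0, 20 * 1000 * 1000 };   /* ~HZ/50 */

                    nanosleep(&ts, NULL);
            }
            return p;
    }

    int main(void)
    {
            free(alloc_nofail(16));
            return 0;
    }
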
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 3dcbf364022f..e86dddbd8296 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -912,7 +912,6 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
912 struct page *pagep[2] = {NULL, NULL}; 912 struct page *pagep[2] = {NULL, NULL};
913 handle_t *handle; 913 handle_t *handle;
914 ext4_lblk_t orig_blk_offset; 914 ext4_lblk_t orig_blk_offset;
915 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
916 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 915 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
917 unsigned int w_flags = 0; 916 unsigned int w_flags = 0;
918 unsigned int tmp_data_size, data_size, replaced_size; 917 unsigned int tmp_data_size, data_size, replaced_size;
@@ -940,8 +939,6 @@ again:
940 orig_blk_offset = orig_page_offset * blocks_per_page + 939 orig_blk_offset = orig_page_offset * blocks_per_page +
941 data_offset_in_page; 940 data_offset_in_page;
942 941
943 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
944
945 /* Calculate data_size */ 942 /* Calculate data_size */
946 if ((orig_blk_offset + block_len_in_page - 1) == 943 if ((orig_blk_offset + block_len_in_page - 1) ==
947 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 944 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6653fc35ecb7..35f55a0dbc4b 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -918,11 +918,8 @@ static int htree_dirblock_to_tree(struct file *dir_file,
918 bh->b_data, bh->b_size, 918 bh->b_data, bh->b_size,
919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) 919 (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
920 + ((char *)de - bh->b_data))) { 920 + ((char *)de - bh->b_data))) {
921 /* On error, skip the f_pos to the next block. */ 921 /* silently ignore the rest of the block */
922 dir_file->f_pos = (dir_file->f_pos | 922 break;
923 (dir->i_sb->s_blocksize - 1)) + 1;
924 brelse(bh);
925 return count;
926 } 923 }
927 ext4fs_dirhash(de->name, de->name_len, hinfo); 924 ext4fs_dirhash(de->name, de->name_len, hinfo);
928 if ((hinfo->hash < start_hash) || 925 if ((hinfo->hash < start_hash) ||
@@ -2299,6 +2296,45 @@ retry:
2299 return err; 2296 return err;
2300} 2297}
2301 2298
2299static int ext4_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
2300{
2301 handle_t *handle;
2302 struct inode *inode;
2303 int err, retries = 0;
2304
2305 dquot_initialize(dir);
2306
2307retry:
2308 inode = ext4_new_inode_start_handle(dir, mode,
2309 NULL, 0, NULL,
2310 EXT4_HT_DIR,
2311 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb) +
2312 4 + EXT4_XATTR_TRANS_BLOCKS);
2313 handle = ext4_journal_current_handle();
2314 err = PTR_ERR(inode);
2315 if (!IS_ERR(inode)) {
2316 inode->i_op = &ext4_file_inode_operations;
2317 inode->i_fop = &ext4_file_operations;
2318 ext4_set_aops(inode);
2319 d_tmpfile(dentry, inode);
2320 err = ext4_orphan_add(handle, inode);
2321 if (err)
2322 goto err_drop_inode;
2323 mark_inode_dirty(inode);
2324 unlock_new_inode(inode);
2325 }
2326 if (handle)
2327 ext4_journal_stop(handle);
2328 if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
2329 goto retry;
2330 return err;
2331err_drop_inode:
2332 ext4_journal_stop(handle);
2333 unlock_new_inode(inode);
2334 iput(inode);
2335 return err;
2336}
2337
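
This ->tmpfile hook is what backs O_TMPFILE: the inode starts life on the orphan list, so it simply disappears if never linked, and the ext4_link() change below removes it from the list on its first link. From userspace the flow looks like this (Linux-specific; the /proc/self/fd indirection avoids the privilege AT_EMPTY_PATH would need):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char path[64];
            int fd = open("/tmp", O_TMPFILE | O_RDWR, 0600);

            if (fd < 0)
                    return 1;

            /* write while the file is invisible; a crash leaves nothing behind */
            write(fd, "data", 4);

            /* ...then give it a name; fails if /tmp/result already exists */
            snprintf(path, sizeof(path), "/proc/self/fd/%d", fd);
            linkat(AT_FDCWD, path, AT_FDCWD, "/tmp/result", AT_SYMLINK_FOLLOW);
            close(fd);
            return 0;
    }
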
2302struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, 2338struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode,
2303 struct ext4_dir_entry_2 *de, 2339 struct ext4_dir_entry_2 *de,
2304 int blocksize, int csum_size, 2340 int blocksize, int csum_size,
@@ -2906,7 +2942,7 @@ static int ext4_link(struct dentry *old_dentry,
2906retry: 2942retry:
2907 handle = ext4_journal_start(dir, EXT4_HT_DIR, 2943 handle = ext4_journal_start(dir, EXT4_HT_DIR,
2908 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2944 (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2909 EXT4_INDEX_EXTRA_TRANS_BLOCKS)); 2945 EXT4_INDEX_EXTRA_TRANS_BLOCKS) + 1);
2910 if (IS_ERR(handle)) 2946 if (IS_ERR(handle))
2911 return PTR_ERR(handle); 2947 return PTR_ERR(handle);
2912 2948
@@ -2920,6 +2956,11 @@ retry:
2920 err = ext4_add_entry(handle, dentry, inode); 2956 err = ext4_add_entry(handle, dentry, inode);
2921 if (!err) { 2957 if (!err) {
2922 ext4_mark_inode_dirty(handle, inode); 2958 ext4_mark_inode_dirty(handle, inode);
2959 /* this can happen only for a tmpfile being
2960 * linked for the first time
2961 */
2962 if (inode->i_nlink == 1)
2963 ext4_orphan_del(handle, inode);
2923 d_instantiate(dentry, inode); 2964 d_instantiate(dentry, inode);
2924 } else { 2965 } else {
2925 drop_nlink(inode); 2966 drop_nlink(inode);
@@ -3172,6 +3213,7 @@ const struct inode_operations ext4_dir_inode_operations = {
3172 .mkdir = ext4_mkdir, 3213 .mkdir = ext4_mkdir,
3173 .rmdir = ext4_rmdir, 3214 .rmdir = ext4_rmdir,
3174 .mknod = ext4_mknod, 3215 .mknod = ext4_mknod,
3216 .tmpfile = ext4_tmpfile,
3175 .rename = ext4_rename, 3217 .rename = ext4_rename,
3176 .setattr = ext4_setattr, 3218 .setattr = ext4_setattr,
3177 .setxattr = generic_setxattr, 3219 .setxattr = generic_setxattr,
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4acf1f78881b..6625d210fb45 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -25,6 +25,7 @@
25#include <linux/kernel.h> 25#include <linux/kernel.h>
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/ratelimit.h>
28 29
29#include "ext4_jbd2.h" 30#include "ext4_jbd2.h"
30#include "xattr.h" 31#include "xattr.h"
@@ -46,46 +47,121 @@ void ext4_exit_pageio(void)
46} 47}
47 48
48/* 49/*
49 * This function is called by ext4_evict_inode() to make sure there is 50 * Print a buffer I/O error message compatible with fs/buffer.c. This
50 * no more pending I/O completion work left to do. 51 * provides compatibility with dmesg scrapers that look for a specific
52 * buffer I/O error message. We really need a unified error reporting
53 * structure to userspace ala Digital Unix's uerf system, but it's
54 * probably not going to happen in my lifetime, due to LKML politics...
51 */ 55 */
52void ext4_ioend_shutdown(struct inode *inode) 56static void buffer_io_error(struct buffer_head *bh)
57{
58 char b[BDEVNAME_SIZE];
59 printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
60 bdevname(bh->b_bdev, b),
61 (unsigned long long)bh->b_blocknr);
62}
63
64static void ext4_finish_bio(struct bio *bio)
53{ 65{
54 wait_queue_head_t *wq = ext4_ioend_wq(inode); 66 int i;
67 int error = !test_bit(BIO_UPTODATE, &bio->bi_flags);
55 68
56 wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_ioend_count) == 0)); 69 for (i = 0; i < bio->bi_vcnt; i++) {
57 /* 70 struct bio_vec *bvec = &bio->bi_io_vec[i];
58 * We need to make sure the work structure is finished being 71 struct page *page = bvec->bv_page;
59 * used before we let the inode get destroyed. 72 struct buffer_head *bh, *head;
60 */ 73 unsigned bio_start = bvec->bv_offset;
61 if (work_pending(&EXT4_I(inode)->i_unwritten_work)) 74 unsigned bio_end = bio_start + bvec->bv_len;
62 cancel_work_sync(&EXT4_I(inode)->i_unwritten_work); 75 unsigned under_io = 0;
76 unsigned long flags;
77
78 if (!page)
79 continue;
80
81 if (error) {
82 SetPageError(page);
83 set_bit(AS_EIO, &page->mapping->flags);
84 }
85 bh = head = page_buffers(page);
86 /*
87 * We check all buffers in the page under BH_Uptodate_Lock
88 * to avoid races with other end io clearing async_write flags
89 */
90 local_irq_save(flags);
91 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
92 do {
93 if (bh_offset(bh) < bio_start ||
94 bh_offset(bh) + bh->b_size > bio_end) {
95 if (buffer_async_write(bh))
96 under_io++;
97 continue;
98 }
99 clear_buffer_async_write(bh);
100 if (error)
101 buffer_io_error(bh);
102 } while ((bh = bh->b_this_page) != head);
103 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
104 local_irq_restore(flags);
105 if (!under_io)
106 end_page_writeback(page);
107 }
108}
109
110static void ext4_release_io_end(ext4_io_end_t *io_end)
111{
112 struct bio *bio, *next_bio;
113
114 BUG_ON(!list_empty(&io_end->list));
115 BUG_ON(io_end->flag & EXT4_IO_END_UNWRITTEN);
116 WARN_ON(io_end->handle);
117
118 if (atomic_dec_and_test(&EXT4_I(io_end->inode)->i_ioend_count))
119 wake_up_all(ext4_ioend_wq(io_end->inode));
120
121 for (bio = io_end->bio; bio; bio = next_bio) {
122 next_bio = bio->bi_private;
123 ext4_finish_bio(bio);
124 bio_put(bio);
125 }
126 if (io_end->flag & EXT4_IO_END_DIRECT)
127 inode_dio_done(io_end->inode);
128 if (io_end->iocb)
129 aio_complete(io_end->iocb, io_end->result, 0);
130 kmem_cache_free(io_end_cachep, io_end);
63} 131}
64 132
65void ext4_free_io_end(ext4_io_end_t *io) 133static void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end)
66{ 134{
67 BUG_ON(!io); 135 struct inode *inode = io_end->inode;
68 BUG_ON(!list_empty(&io->list));
69 BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN);
70 136
71 if (atomic_dec_and_test(&EXT4_I(io->inode)->i_ioend_count)) 137 io_end->flag &= ~EXT4_IO_END_UNWRITTEN;
72 wake_up_all(ext4_ioend_wq(io->inode)); 138 /* Wake up anyone waiting on unwritten extent conversion */
73 kmem_cache_free(io_end_cachep, io); 139 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten))
140 wake_up_all(ext4_ioend_wq(inode));
74} 141}
75 142
76/* check a range of space and convert unwritten extents to written. */ 143/*
144 * Check a range of space and convert unwritten extents to written. Note that
145 * we are protected from truncate touching the same part of the extent tree by the
146 * fact that truncate code waits for all DIO to finish (thus exclusion from
147 * direct IO is achieved) and also waits for PageWriteback bits. Thus we
148 * cannot get to ext4_ext_truncate() before all IOs overlapping that range are
149 * completed (happens from ext4_free_ioend()).
150 */
77static int ext4_end_io(ext4_io_end_t *io) 151static int ext4_end_io(ext4_io_end_t *io)
78{ 152{
79 struct inode *inode = io->inode; 153 struct inode *inode = io->inode;
80 loff_t offset = io->offset; 154 loff_t offset = io->offset;
81 ssize_t size = io->size; 155 ssize_t size = io->size;
156 handle_t *handle = io->handle;
82 int ret = 0; 157 int ret = 0;
83 158
84 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," 159 ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p,"
85 "list->prev 0x%p\n", 160 "list->prev 0x%p\n",
86 io, inode->i_ino, io->list.next, io->list.prev); 161 io, inode->i_ino, io->list.next, io->list.prev);
87 162
88 ret = ext4_convert_unwritten_extents(inode, offset, size); 163 io->handle = NULL; /* Following call will use up the handle */
164 ret = ext4_convert_unwritten_extents(handle, inode, offset, size);
89 if (ret < 0) { 165 if (ret < 0) {
90 ext4_msg(inode->i_sb, KERN_EMERG, 166 ext4_msg(inode->i_sb, KERN_EMERG,
91 "failed to convert unwritten extents to written " 167 "failed to convert unwritten extents to written "
@@ -93,30 +169,22 @@ static int ext4_end_io(ext4_io_end_t *io)
93 "(inode %lu, offset %llu, size %zd, error %d)", 169 "(inode %lu, offset %llu, size %zd, error %d)",
94 inode->i_ino, offset, size, ret); 170 inode->i_ino, offset, size, ret);
95 } 171 }
96 /* Wake up anyone waiting on unwritten extent conversion */ 172 ext4_clear_io_unwritten_flag(io);
97 if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) 173 ext4_release_io_end(io);
98 wake_up_all(ext4_ioend_wq(inode));
99 if (io->flag & EXT4_IO_END_DIRECT)
100 inode_dio_done(inode);
101 if (io->iocb)
102 aio_complete(io->iocb, io->result, 0);
103 return ret; 174 return ret;
104} 175}
105 176
106static void dump_completed_IO(struct inode *inode) 177static void dump_completed_IO(struct inode *inode, struct list_head *head)
107{ 178{
108#ifdef EXT4FS_DEBUG 179#ifdef EXT4FS_DEBUG
109 struct list_head *cur, *before, *after; 180 struct list_head *cur, *before, *after;
110 ext4_io_end_t *io, *io0, *io1; 181 ext4_io_end_t *io, *io0, *io1;
111 182
112 if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { 183 if (list_empty(head))
113 ext4_debug("inode %lu completed_io list is empty\n",
114 inode->i_ino);
115 return; 184 return;
116 }
117 185
118 ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); 186 ext4_debug("Dump inode %lu completed io list\n", inode->i_ino);
119 list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { 187 list_for_each_entry(io, head, list) {
120 cur = &io->list; 188 cur = &io->list;
121 before = cur->prev; 189 before = cur->prev;
122 io0 = container_of(before, ext4_io_end_t, list); 190 io0 = container_of(before, ext4_io_end_t, list);
@@ -130,23 +198,30 @@ static void dump_completed_IO(struct inode *inode)
130} 198}
131 199
132/* Add the io_end to per-inode completed end_io list. */ 200/* Add the io_end to per-inode completed end_io list. */
133void ext4_add_complete_io(ext4_io_end_t *io_end) 201static void ext4_add_complete_io(ext4_io_end_t *io_end)
134{ 202{
135 struct ext4_inode_info *ei = EXT4_I(io_end->inode); 203 struct ext4_inode_info *ei = EXT4_I(io_end->inode);
136 struct workqueue_struct *wq; 204 struct workqueue_struct *wq;
137 unsigned long flags; 205 unsigned long flags;
138 206
139 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); 207 BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN));
140 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
141
142 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 208 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
143 if (list_empty(&ei->i_completed_io_list)) 209 if (io_end->handle) {
144 queue_work(wq, &ei->i_unwritten_work); 210 wq = EXT4_SB(io_end->inode->i_sb)->rsv_conversion_wq;
145 list_add_tail(&io_end->list, &ei->i_completed_io_list); 211 if (list_empty(&ei->i_rsv_conversion_list))
212 queue_work(wq, &ei->i_rsv_conversion_work);
213 list_add_tail(&io_end->list, &ei->i_rsv_conversion_list);
214 } else {
215 wq = EXT4_SB(io_end->inode->i_sb)->unrsv_conversion_wq;
216 if (list_empty(&ei->i_unrsv_conversion_list))
217 queue_work(wq, &ei->i_unrsv_conversion_work);
218 list_add_tail(&io_end->list, &ei->i_unrsv_conversion_list);
219 }
146 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 220 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
147} 221}
148 222
149static int ext4_do_flush_completed_IO(struct inode *inode) 223static int ext4_do_flush_completed_IO(struct inode *inode,
224 struct list_head *head)
150{ 225{
151 ext4_io_end_t *io; 226 ext4_io_end_t *io;
152 struct list_head unwritten; 227 struct list_head unwritten;
@@ -155,8 +230,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
155 int err, ret = 0; 230 int err, ret = 0;
156 231
157 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 232 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
158 dump_completed_IO(inode); 233 dump_completed_IO(inode, head);
159 list_replace_init(&ei->i_completed_io_list, &unwritten); 234 list_replace_init(head, &unwritten);
160 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 235 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
161 236
162 while (!list_empty(&unwritten)) { 237 while (!list_empty(&unwritten)) {
@@ -167,30 +242,25 @@ static int ext4_do_flush_completed_IO(struct inode *inode)
167 err = ext4_end_io(io); 242 err = ext4_end_io(io);
168 if (unlikely(!ret && err)) 243 if (unlikely(!ret && err))
169 ret = err; 244 ret = err;
170 io->flag &= ~EXT4_IO_END_UNWRITTEN;
171 ext4_free_io_end(io);
172 } 245 }
173 return ret; 246 return ret;
174} 247}
175 248
176/* 249/*
177 * work on completed aio dio IO, to convert unwritten extents to extents 250 * work on completed IO, to convert unwritten extents to extents
178 */ 251 */
179void ext4_end_io_work(struct work_struct *work) 252void ext4_end_io_rsv_work(struct work_struct *work)
180{ 253{
181 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info, 254 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
182 i_unwritten_work); 255 i_rsv_conversion_work);
183 ext4_do_flush_completed_IO(&ei->vfs_inode); 256 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_rsv_conversion_list);
184} 257}
185 258
186int ext4_flush_unwritten_io(struct inode *inode) 259void ext4_end_io_unrsv_work(struct work_struct *work)
187{ 260{
188 int ret; 261 struct ext4_inode_info *ei = container_of(work, struct ext4_inode_info,
189 WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex) && 262 i_unrsv_conversion_work);
190 !(inode->i_state & I_FREEING)); 263 ext4_do_flush_completed_IO(&ei->vfs_inode, &ei->i_unrsv_conversion_list);
191 ret = ext4_do_flush_completed_IO(inode);
192 ext4_unwritten_wait(inode);
193 return ret;
194} 264}
195 265
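
Both conversion lists rely on the same queue-once idiom: only the empty-to-non-empty transition schedules the work item, since the worker drains whatever has accumulated by the time it runs. A userspace model of the idiom, where the queued flag stands in for the list_empty() check and schedule_work() for queue_work():

    #include <pthread.h>

    struct io_item {
            struct io_item *next;
    };

    struct conv_list {
            pthread_mutex_t lock;
            struct io_item *head;
            int queued;             /* "a work item is already scheduled" */
    };

    static void schedule_work(void)
    {
            /* stand-in for queue_work(); the worker would clear .queued */
    }

    void add_complete_io(struct conv_list *l, struct io_item *it)
    {
            pthread_mutex_lock(&l->lock);
            it->next = l->head;
            l->head = it;
            /* only the first completion after an idle period schedules work */
            if (!l->queued) {
                    l->queued = 1;
                    schedule_work();
            }
            pthread_mutex_unlock(&l->lock);
    }
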
196ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) 266ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
@@ -200,83 +270,59 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags)
200 atomic_inc(&EXT4_I(inode)->i_ioend_count); 270 atomic_inc(&EXT4_I(inode)->i_ioend_count);
201 io->inode = inode; 271 io->inode = inode;
202 INIT_LIST_HEAD(&io->list); 272 INIT_LIST_HEAD(&io->list);
273 atomic_set(&io->count, 1);
203 } 274 }
204 return io; 275 return io;
205} 276}
206 277
207/* 278void ext4_put_io_end_defer(ext4_io_end_t *io_end)
208 * Print a buffer I/O error message compatible with fs/buffer.c. This
209 * provides compatibility with dmesg scrapers that look for a specific
210 * buffer I/O error message. We really need a unified error reporting
211 * structure to userspace ala Digital Unix's uerf system, but it's
212 * probably not going to happen in my lifetime, due to LKML politics...
213 */
214static void buffer_io_error(struct buffer_head *bh)
215{ 279{
216 char b[BDEVNAME_SIZE]; 280 if (atomic_dec_and_test(&io_end->count)) {
217 printk(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", 281 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || !io_end->size) {
218 bdevname(bh->b_bdev, b), 282 ext4_release_io_end(io_end);
219 (unsigned long long)bh->b_blocknr); 283 return;
284 }
285 ext4_add_complete_io(io_end);
286 }
220} 287}
221 288
289int ext4_put_io_end(ext4_io_end_t *io_end)
290{
291 int err = 0;
292
293 if (atomic_dec_and_test(&io_end->count)) {
294 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
295 err = ext4_convert_unwritten_extents(io_end->handle,
296 io_end->inode, io_end->offset,
297 io_end->size);
298 io_end->handle = NULL;
299 ext4_clear_io_unwritten_flag(io_end);
300 }
301 ext4_release_io_end(io_end);
302 }
303 return err;
304}
305
306ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end)
307{
308 atomic_inc(&io_end->count);
309 return io_end;
310}
311
312/* BIO completion function for page writeback */
222static void ext4_end_bio(struct bio *bio, int error) 313static void ext4_end_bio(struct bio *bio, int error)
223{ 314{
224 ext4_io_end_t *io_end = bio->bi_private; 315 ext4_io_end_t *io_end = bio->bi_private;
225 struct inode *inode;
226 int i;
227 int blocksize;
228 sector_t bi_sector = bio->bi_sector; 316 sector_t bi_sector = bio->bi_sector;
229 317
230 BUG_ON(!io_end); 318 BUG_ON(!io_end);
231 inode = io_end->inode;
232 blocksize = 1 << inode->i_blkbits;
233 bio->bi_private = NULL;
234 bio->bi_end_io = NULL; 319 bio->bi_end_io = NULL;
235 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 320 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
236 error = 0; 321 error = 0;
237 for (i = 0; i < bio->bi_vcnt; i++) {
238 struct bio_vec *bvec = &bio->bi_io_vec[i];
239 struct page *page = bvec->bv_page;
240 struct buffer_head *bh, *head;
241 unsigned bio_start = bvec->bv_offset;
242 unsigned bio_end = bio_start + bvec->bv_len;
243 unsigned under_io = 0;
244 unsigned long flags;
245
246 if (!page)
247 continue;
248
249 if (error) {
250 SetPageError(page);
251 set_bit(AS_EIO, &page->mapping->flags);
252 }
253 bh = head = page_buffers(page);
254 /*
255 * We check all buffers in the page under BH_Uptodate_Lock
256 * to avoid races with other end io clearing async_write flags
257 */
258 local_irq_save(flags);
259 bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
260 do {
261 if (bh_offset(bh) < bio_start ||
262 bh_offset(bh) + blocksize > bio_end) {
263 if (buffer_async_write(bh))
264 under_io++;
265 continue;
266 }
267 clear_buffer_async_write(bh);
268 if (error)
269 buffer_io_error(bh);
270 } while ((bh = bh->b_this_page) != head);
271 bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
272 local_irq_restore(flags);
273 if (!under_io)
274 end_page_writeback(page);
275 }
276 bio_put(bio);
277 322
278 if (error) { 323 if (error) {
279 io_end->flag |= EXT4_IO_END_ERROR; 324 struct inode *inode = io_end->inode;
325
280 ext4_warning(inode->i_sb, "I/O error writing to inode %lu " 326 ext4_warning(inode->i_sb, "I/O error writing to inode %lu "
281 "(offset %llu size %ld starting block %llu)", 327 "(offset %llu size %ld starting block %llu)",
282 inode->i_ino, 328 inode->i_ino,
@@ -286,12 +332,23 @@ static void ext4_end_bio(struct bio *bio, int error)
286 bi_sector >> (inode->i_blkbits - 9)); 332 bi_sector >> (inode->i_blkbits - 9));
287 } 333 }
288 334
289 if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { 335 if (io_end->flag & EXT4_IO_END_UNWRITTEN) {
290 ext4_free_io_end(io_end); 336 /*
291 return; 337 * Link bio into list hanging from io_end. We have to do it
338 * atomically as bio completions can be racing against each
339 * other.
340 */
341 bio->bi_private = xchg(&io_end->bio, bio);
342 ext4_put_io_end_defer(io_end);
343 } else {
344 /*
345 * Drop io_end reference early. Inode can get freed once
346 * we finish the bio.
347 */
348 ext4_put_io_end_defer(io_end);
349 ext4_finish_bio(bio);
350 bio_put(bio);
292 } 351 }
293
294 ext4_add_complete_io(io_end);
295} 352}
296 353
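
Chaining completed bios onto the io_end is a lock-free push: the exchange publishes the new head and hands back the old one, which becomes the new link. The brief window where the head is visible before its link is written is harmless here, because the chain is only walked once the last io_end reference is gone. The same push in C11 atomics:

    #include <stdatomic.h>

    struct bio {
            struct bio *next;       /* plays the role of bi_private */
    };

    /* Push b onto the list rooted at head; safe against concurrent pushers. */
    void bio_list_push(_Atomic(struct bio *) *head, struct bio *b)
    {
            b->next = atomic_exchange(head, b);
    }
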
297void ext4_io_submit(struct ext4_io_submit *io) 354void ext4_io_submit(struct ext4_io_submit *io)
@@ -305,43 +362,38 @@ void ext4_io_submit(struct ext4_io_submit *io)
305 bio_put(io->io_bio); 362 bio_put(io->io_bio);
306 } 363 }
307 io->io_bio = NULL; 364 io->io_bio = NULL;
308 io->io_op = 0; 365}
366
367void ext4_io_submit_init(struct ext4_io_submit *io,
368 struct writeback_control *wbc)
369{
370 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
371 io->io_bio = NULL;
309 io->io_end = NULL; 372 io->io_end = NULL;
310} 373}
311 374
312static int io_submit_init(struct ext4_io_submit *io, 375static int io_submit_init_bio(struct ext4_io_submit *io,
313 struct inode *inode, 376 struct buffer_head *bh)
314 struct writeback_control *wbc,
315 struct buffer_head *bh)
316{ 377{
317 ext4_io_end_t *io_end;
318 struct page *page = bh->b_page;
319 int nvecs = bio_get_nr_vecs(bh->b_bdev); 378 int nvecs = bio_get_nr_vecs(bh->b_bdev);
320 struct bio *bio; 379 struct bio *bio;
321 380
322 io_end = ext4_init_io_end(inode, GFP_NOFS);
323 if (!io_end)
324 return -ENOMEM;
325 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES)); 381 bio = bio_alloc(GFP_NOIO, min(nvecs, BIO_MAX_PAGES));
382 if (!bio)
383 return -ENOMEM;
326 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9); 384 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
327 bio->bi_bdev = bh->b_bdev; 385 bio->bi_bdev = bh->b_bdev;
328 bio->bi_private = io->io_end = io_end;
329 bio->bi_end_io = ext4_end_bio; 386 bio->bi_end_io = ext4_end_bio;
330 387 bio->bi_private = ext4_get_io_end(io->io_end);
331 io_end->offset = (page->index << PAGE_CACHE_SHIFT) + bh_offset(bh);
332
333 io->io_bio = bio; 388 io->io_bio = bio;
334 io->io_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
335 io->io_next_block = bh->b_blocknr; 389 io->io_next_block = bh->b_blocknr;
336 return 0; 390 return 0;
337} 391}
338 392
339static int io_submit_add_bh(struct ext4_io_submit *io, 393static int io_submit_add_bh(struct ext4_io_submit *io,
340 struct inode *inode, 394 struct inode *inode,
341 struct writeback_control *wbc,
342 struct buffer_head *bh) 395 struct buffer_head *bh)
343{ 396{
344 ext4_io_end_t *io_end;
345 int ret; 397 int ret;
346 398
347 if (io->io_bio && bh->b_blocknr != io->io_next_block) { 399 if (io->io_bio && bh->b_blocknr != io->io_next_block) {
@@ -349,18 +401,14 @@ submit_and_retry:
349 ext4_io_submit(io); 401 ext4_io_submit(io);
350 } 402 }
351 if (io->io_bio == NULL) { 403 if (io->io_bio == NULL) {
352 ret = io_submit_init(io, inode, wbc, bh); 404 ret = io_submit_init_bio(io, bh);
353 if (ret) 405 if (ret)
354 return ret; 406 return ret;
355 } 407 }
356 io_end = io->io_end;
357 if (test_clear_buffer_uninit(bh))
358 ext4_set_io_unwritten_flag(inode, io_end);
359 io->io_end->size += bh->b_size;
360 io->io_next_block++;
361 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh)); 408 ret = bio_add_page(io->io_bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (ret != bh->b_size) 409 if (ret != bh->b_size)
363 goto submit_and_retry; 410 goto submit_and_retry;
411 io->io_next_block++;
364 return 0; 412 return 0;
365} 413}
366 414
@@ -432,7 +480,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
432 do { 480 do {
433 if (!buffer_async_write(bh)) 481 if (!buffer_async_write(bh))
434 continue; 482 continue;
435 ret = io_submit_add_bh(io, inode, wbc, bh); 483 ret = io_submit_add_bh(io, inode, bh);
436 if (ret) { 484 if (ret) {
437 /* 485 /*
438 * We only get here on ENOMEM. Not much else 486 * We only get here on ENOMEM. Not much else
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b27c96d01965..c5adbb318a90 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -79,12 +79,20 @@ static int verify_group_input(struct super_block *sb,
79 ext4_fsblk_t end = start + input->blocks_count; 79 ext4_fsblk_t end = start + input->blocks_count;
80 ext4_group_t group = input->group; 80 ext4_group_t group = input->group;
81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group; 81 ext4_fsblk_t itend = input->inode_table + sbi->s_itb_per_group;
82 unsigned overhead = ext4_group_overhead_blocks(sb, group); 82 unsigned overhead;
83 ext4_fsblk_t metaend = start + overhead; 83 ext4_fsblk_t metaend;
84 struct buffer_head *bh = NULL; 84 struct buffer_head *bh = NULL;
85 ext4_grpblk_t free_blocks_count, offset; 85 ext4_grpblk_t free_blocks_count, offset;
86 int err = -EINVAL; 86 int err = -EINVAL;
87 87
88 if (group != sbi->s_groups_count) {
89 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
90 input->group, sbi->s_groups_count);
91 return -EINVAL;
92 }
93
94 overhead = ext4_group_overhead_blocks(sb, group);
95 metaend = start + overhead;
88 input->free_blocks_count = free_blocks_count = 96 input->free_blocks_count = free_blocks_count =
89 input->blocks_count - 2 - overhead - sbi->s_itb_per_group; 97 input->blocks_count - 2 - overhead - sbi->s_itb_per_group;
90 98
@@ -96,10 +104,7 @@ static int verify_group_input(struct super_block *sb,
96 free_blocks_count, input->reserved_blocks); 104 free_blocks_count, input->reserved_blocks);
97 105
98 ext4_get_group_no_and_offset(sb, start, NULL, &offset); 106 ext4_get_group_no_and_offset(sb, start, NULL, &offset);
99 if (group != sbi->s_groups_count) 107 if (offset != 0)
100 ext4_warning(sb, "Cannot add at group %u (only %u groups)",
101 input->group, sbi->s_groups_count);
102 else if (offset != 0)
103 ext4_warning(sb, "Last group not full"); 108 ext4_warning(sb, "Last group not full");
104 else if (input->reserved_blocks > input->blocks_count / 5) 109 else if (input->reserved_blocks > input->blocks_count / 5)
105 ext4_warning(sb, "Reserved blocks too high (%u)", 110 ext4_warning(sb, "Reserved blocks too high (%u)",
@@ -1551,11 +1556,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
1551 int reserved_gdb = ext4_bg_has_super(sb, input->group) ? 1556 int reserved_gdb = ext4_bg_has_super(sb, input->group) ?
1552 le16_to_cpu(es->s_reserved_gdt_blocks) : 0; 1557 le16_to_cpu(es->s_reserved_gdt_blocks) : 0;
1553 struct inode *inode = NULL; 1558 struct inode *inode = NULL;
1554 int gdb_off, gdb_num; 1559 int gdb_off;
1555 int err; 1560 int err;
1556 __u16 bg_flags = 0; 1561 __u16 bg_flags = 0;
1557 1562
1558 gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb);
1559 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb); 1563 gdb_off = input->group % EXT4_DESC_PER_BLOCK(sb);
1560 1564
1561 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, 1565 if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb,
@@ -1656,12 +1660,10 @@ errout:
1656 err = err2; 1660 err = err2;
1657 1661
1658 if (!err) { 1662 if (!err) {
1659 ext4_fsblk_t first_block;
1660 first_block = ext4_group_first_block_no(sb, 0);
1661 if (test_opt(sb, DEBUG)) 1663 if (test_opt(sb, DEBUG))
1662 printk(KERN_DEBUG "EXT4-fs: extended group to %llu " 1664 printk(KERN_DEBUG "EXT4-fs: extended group to %llu "
1663 "blocks\n", ext4_blocks_count(es)); 1665 "blocks\n", ext4_blocks_count(es));
1664 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr - first_block, 1666 update_backups(sb, EXT4_SB(sb)->s_sbh->b_blocknr,
1665 (char *)es, sizeof(struct ext4_super_block), 0); 1667 (char *)es, sizeof(struct ext4_super_block), 0);
1666 } 1668 }
1667 return err; 1669 return err;
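In verify_group_input() above, the out-of-range-group check now runs before ext4_group_overhead_blocks(), so overhead and metaend are never computed from an invalid group number. A toy check-then-derive sketch (overhead_of() is a placeholder, not the ext4 calculation):

#include <errno.h>
#include <stdio.h>

/* placeholder for ext4_group_overhead_blocks(); the real value depends
 * on superblock/GDT backups present in the group */
static unsigned overhead_of(unsigned group)
{
        return (group % 3) ? 0 : 2;
}

static int verify_group(unsigned group, unsigned ngroups, unsigned blocks)
{
        if (group != ngroups) {         /* reject bad input first ...  */
                fprintf(stderr, "cannot add at group %u (only %u groups)\n",
                        group, ngroups);
                return -EINVAL;
        }
        /* ... and only then derive values from it */
        return blocks > overhead_of(group) ? 0 : -EINVAL;
}

int main(void)
{
        return verify_group(8, 8, 1024) ? 1 : 0;
}
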
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 94cc84db7c9a..b59373b625e9 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -69,6 +69,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
69static void ext4_clear_journal_err(struct super_block *sb, 69static void ext4_clear_journal_err(struct super_block *sb,
70 struct ext4_super_block *es); 70 struct ext4_super_block *es);
71static int ext4_sync_fs(struct super_block *sb, int wait); 71static int ext4_sync_fs(struct super_block *sb, int wait);
72static int ext4_sync_fs_nojournal(struct super_block *sb, int wait);
72static int ext4_remount(struct super_block *sb, int *flags, char *data); 73static int ext4_remount(struct super_block *sb, int *flags, char *data);
73static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); 74static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf);
74static int ext4_unfreeze(struct super_block *sb); 75static int ext4_unfreeze(struct super_block *sb);
@@ -398,6 +399,11 @@ static void ext4_handle_error(struct super_block *sb)
398 } 399 }
399 if (test_opt(sb, ERRORS_RO)) { 400 if (test_opt(sb, ERRORS_RO)) {
400 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 401 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
402 /*
403 * Make sure updated value of ->s_mount_flags will be visible
404 * before ->s_flags update
405 */
406 smp_wmb();
401 sb->s_flags |= MS_RDONLY; 407 sb->s_flags |= MS_RDONLY;
402 } 408 }
403 if (test_opt(sb, ERRORS_PANIC)) 409 if (test_opt(sb, ERRORS_PANIC))
@@ -422,9 +428,9 @@ void __ext4_error(struct super_block *sb, const char *function,
422 ext4_handle_error(sb); 428 ext4_handle_error(sb);
423} 429}
424 430
425void ext4_error_inode(struct inode *inode, const char *function, 431void __ext4_error_inode(struct inode *inode, const char *function,
426 unsigned int line, ext4_fsblk_t block, 432 unsigned int line, ext4_fsblk_t block,
427 const char *fmt, ...) 433 const char *fmt, ...)
428{ 434{
429 va_list args; 435 va_list args;
430 struct va_format vaf; 436 struct va_format vaf;
@@ -451,9 +457,9 @@ void ext4_error_inode(struct inode *inode, const char *function,
451 ext4_handle_error(inode->i_sb); 457 ext4_handle_error(inode->i_sb);
452} 458}
453 459
454void ext4_error_file(struct file *file, const char *function, 460void __ext4_error_file(struct file *file, const char *function,
455 unsigned int line, ext4_fsblk_t block, 461 unsigned int line, ext4_fsblk_t block,
456 const char *fmt, ...) 462 const char *fmt, ...)
457{ 463{
458 va_list args; 464 va_list args;
459 struct va_format vaf; 465 struct va_format vaf;
@@ -570,8 +576,13 @@ void __ext4_abort(struct super_block *sb, const char *function,
570 576
571 if ((sb->s_flags & MS_RDONLY) == 0) { 577 if ((sb->s_flags & MS_RDONLY) == 0) {
572 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); 578 ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
573 sb->s_flags |= MS_RDONLY;
574 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; 579 EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED;
580 /*
581 * Make sure updated value of ->s_mount_flags will be visible
582 * before ->s_flags update
583 */
584 smp_wmb();
585 sb->s_flags |= MS_RDONLY;
575 if (EXT4_SB(sb)->s_journal) 586 if (EXT4_SB(sb)->s_journal)
576 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); 587 jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
577 save_error_info(sb, function, line); 588 save_error_info(sb, function, line);
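Setting EXT4_MF_FS_ABORTED before the smp_wmb(), and MS_RDONLY after it, guarantees that any CPU which observes the filesystem as read-only also observes the abort flag. Roughly the same publish ordering can be modelled in userspace with C11 release/acquire atomics (a sketch of the barrier's effect, not the kernel primitives):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define MF_FS_ABORTED   0x0002          /* stand-in for EXT4_MF_FS_ABORTED */
#define MS_RDONLY       0x0001

static atomic_int mount_flags;          /* models sbi->s_mount_flags */
static atomic_int s_flags;              /* models sb->s_flags        */

static void abort_fs(void)
{
        atomic_fetch_or_explicit(&mount_flags, MF_FS_ABORTED,
                                 memory_order_relaxed);
        /* release ordering plays the role of smp_wmb(): the abort flag
         * is published before MS_RDONLY becomes visible */
        atomic_fetch_or_explicit(&s_flags, MS_RDONLY, memory_order_release);
}

static bool saw_abort(void)
{
        /* acquire pairs with the release above: a reader that sees
         * MS_RDONLY is guaranteed to see MF_FS_ABORTED too */
        if (atomic_load_explicit(&s_flags, memory_order_acquire) & MS_RDONLY)
                return atomic_load_explicit(&mount_flags,
                                            memory_order_relaxed) & MF_FS_ABORTED;
        return false;
}

int main(void)
{
        abort_fs();
        printf("abort visible: %d\n", saw_abort());
        return 0;
}
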
@@ -580,7 +591,8 @@ void __ext4_abort(struct super_block *sb, const char *function,
580 panic("EXT4-fs panic from previous error\n"); 591 panic("EXT4-fs panic from previous error\n");
581} 592}
582 593
583void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) 594void __ext4_msg(struct super_block *sb,
595 const char *prefix, const char *fmt, ...)
584{ 596{
585 struct va_format vaf; 597 struct va_format vaf;
586 va_list args; 598 va_list args;
@@ -750,8 +762,10 @@ static void ext4_put_super(struct super_block *sb)
750 ext4_unregister_li_request(sb); 762 ext4_unregister_li_request(sb);
751 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); 763 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
752 764
753 flush_workqueue(sbi->dio_unwritten_wq); 765 flush_workqueue(sbi->unrsv_conversion_wq);
754 destroy_workqueue(sbi->dio_unwritten_wq); 766 flush_workqueue(sbi->rsv_conversion_wq);
767 destroy_workqueue(sbi->unrsv_conversion_wq);
768 destroy_workqueue(sbi->rsv_conversion_wq);
755 769
756 if (sbi->s_journal) { 770 if (sbi->s_journal) {
757 err = jbd2_journal_destroy(sbi->s_journal); 771 err = jbd2_journal_destroy(sbi->s_journal);
@@ -760,7 +774,7 @@ static void ext4_put_super(struct super_block *sb)
760 ext4_abort(sb, "Couldn't clean up the journal"); 774 ext4_abort(sb, "Couldn't clean up the journal");
761 } 775 }
762 776
763 ext4_es_unregister_shrinker(sb); 777 ext4_es_unregister_shrinker(sbi);
764 del_timer(&sbi->s_err_report); 778 del_timer(&sbi->s_err_report);
765 ext4_release_system_zone(sb); 779 ext4_release_system_zone(sb);
766 ext4_mb_release(sb); 780 ext4_mb_release(sb);
@@ -849,6 +863,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
849 rwlock_init(&ei->i_es_lock); 863 rwlock_init(&ei->i_es_lock);
850 INIT_LIST_HEAD(&ei->i_es_lru); 864 INIT_LIST_HEAD(&ei->i_es_lru);
851 ei->i_es_lru_nr = 0; 865 ei->i_es_lru_nr = 0;
866 ei->i_touch_when = 0;
852 ei->i_reserved_data_blocks = 0; 867 ei->i_reserved_data_blocks = 0;
853 ei->i_reserved_meta_blocks = 0; 868 ei->i_reserved_meta_blocks = 0;
854 ei->i_allocated_meta_blocks = 0; 869 ei->i_allocated_meta_blocks = 0;
@@ -859,13 +874,15 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
859 ei->i_reserved_quota = 0; 874 ei->i_reserved_quota = 0;
860#endif 875#endif
861 ei->jinode = NULL; 876 ei->jinode = NULL;
862 INIT_LIST_HEAD(&ei->i_completed_io_list); 877 INIT_LIST_HEAD(&ei->i_rsv_conversion_list);
878 INIT_LIST_HEAD(&ei->i_unrsv_conversion_list);
863 spin_lock_init(&ei->i_completed_io_lock); 879 spin_lock_init(&ei->i_completed_io_lock);
864 ei->i_sync_tid = 0; 880 ei->i_sync_tid = 0;
865 ei->i_datasync_tid = 0; 881 ei->i_datasync_tid = 0;
866 atomic_set(&ei->i_ioend_count, 0); 882 atomic_set(&ei->i_ioend_count, 0);
867 atomic_set(&ei->i_unwritten, 0); 883 atomic_set(&ei->i_unwritten, 0);
868 INIT_WORK(&ei->i_unwritten_work, ext4_end_io_work); 884 INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
885 INIT_WORK(&ei->i_unrsv_conversion_work, ext4_end_io_unrsv_work);
869 886
870 return &ei->vfs_inode; 887 return &ei->vfs_inode;
871} 888}
@@ -1093,6 +1110,7 @@ static const struct super_operations ext4_nojournal_sops = {
1093 .dirty_inode = ext4_dirty_inode, 1110 .dirty_inode = ext4_dirty_inode,
1094 .drop_inode = ext4_drop_inode, 1111 .drop_inode = ext4_drop_inode,
1095 .evict_inode = ext4_evict_inode, 1112 .evict_inode = ext4_evict_inode,
1113 .sync_fs = ext4_sync_fs_nojournal,
1096 .put_super = ext4_put_super, 1114 .put_super = ext4_put_super,
1097 .statfs = ext4_statfs, 1115 .statfs = ext4_statfs,
1098 .remount_fs = ext4_remount, 1116 .remount_fs = ext4_remount,
@@ -1341,7 +1359,7 @@ static const struct mount_opts {
1341 {Opt_delalloc, EXT4_MOUNT_DELALLOC, 1359 {Opt_delalloc, EXT4_MOUNT_DELALLOC,
1342 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, 1360 MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT},
1343 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, 1361 {Opt_nodelalloc, EXT4_MOUNT_DELALLOC,
1344 MOPT_EXT4_ONLY | MOPT_CLEAR | MOPT_EXPLICIT}, 1362 MOPT_EXT4_ONLY | MOPT_CLEAR},
1345 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, 1363 {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
1346 MOPT_EXT4_ONLY | MOPT_SET}, 1364 MOPT_EXT4_ONLY | MOPT_SET},
1347 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | 1365 {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
@@ -1684,12 +1702,6 @@ static inline void ext4_show_quota_options(struct seq_file *seq,
1684 1702
1685 if (sbi->s_qf_names[GRPQUOTA]) 1703 if (sbi->s_qf_names[GRPQUOTA])
1686 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]); 1704 seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
1687
1688 if (test_opt(sb, USRQUOTA))
1689 seq_puts(seq, ",usrquota");
1690
1691 if (test_opt(sb, GRPQUOTA))
1692 seq_puts(seq, ",grpquota");
1693#endif 1705#endif
1694} 1706}
1695 1707
@@ -1908,7 +1920,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1908 struct ext4_sb_info *sbi = EXT4_SB(sb); 1920 struct ext4_sb_info *sbi = EXT4_SB(sb);
1909 struct ext4_group_desc *gdp = NULL; 1921 struct ext4_group_desc *gdp = NULL;
1910 ext4_group_t flex_group; 1922 ext4_group_t flex_group;
1911 unsigned int groups_per_flex = 0;
1912 int i, err; 1923 int i, err;
1913 1924
1914 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; 1925 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
@@ -1916,7 +1927,6 @@ static int ext4_fill_flex_info(struct super_block *sb)
1916 sbi->s_log_groups_per_flex = 0; 1927 sbi->s_log_groups_per_flex = 0;
1917 return 1; 1928 return 1;
1918 } 1929 }
1919 groups_per_flex = 1U << sbi->s_log_groups_per_flex;
1920 1930
1921 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count); 1931 err = ext4_alloc_flex_bg_array(sb, sbi->s_groups_count);
1922 if (err) 1932 if (err)
@@ -2164,19 +2174,22 @@ static void ext4_orphan_cleanup(struct super_block *sb,
2164 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan); 2174 list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
2165 dquot_initialize(inode); 2175 dquot_initialize(inode);
2166 if (inode->i_nlink) { 2176 if (inode->i_nlink) {
2167 ext4_msg(sb, KERN_DEBUG, 2177 if (test_opt(sb, DEBUG))
2168 "%s: truncating inode %lu to %lld bytes", 2178 ext4_msg(sb, KERN_DEBUG,
2169 __func__, inode->i_ino, inode->i_size); 2179 "%s: truncating inode %lu to %lld bytes",
2180 __func__, inode->i_ino, inode->i_size);
2170 jbd_debug(2, "truncating inode %lu to %lld bytes\n", 2181 jbd_debug(2, "truncating inode %lu to %lld bytes\n",
2171 inode->i_ino, inode->i_size); 2182 inode->i_ino, inode->i_size);
2172 mutex_lock(&inode->i_mutex); 2183 mutex_lock(&inode->i_mutex);
2184 truncate_inode_pages(inode->i_mapping, inode->i_size);
2173 ext4_truncate(inode); 2185 ext4_truncate(inode);
2174 mutex_unlock(&inode->i_mutex); 2186 mutex_unlock(&inode->i_mutex);
2175 nr_truncates++; 2187 nr_truncates++;
2176 } else { 2188 } else {
2177 ext4_msg(sb, KERN_DEBUG, 2189 if (test_opt(sb, DEBUG))
2178 "%s: deleting unreferenced inode %lu", 2190 ext4_msg(sb, KERN_DEBUG,
2179 __func__, inode->i_ino); 2191 "%s: deleting unreferenced inode %lu",
2192 __func__, inode->i_ino);
2180 jbd_debug(2, "deleting unreferenced inode %lu\n", 2193 jbd_debug(2, "deleting unreferenced inode %lu\n",
2181 inode->i_ino); 2194 inode->i_ino);
2182 nr_orphans++; 2195 nr_orphans++;
@@ -2377,7 +2390,10 @@ struct ext4_attr {
2377 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *); 2390 ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
2378 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 2391 ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
2379 const char *, size_t); 2392 const char *, size_t);
2380 int offset; 2393 union {
2394 int offset;
2395 int deprecated_val;
2396 } u;
2381}; 2397};
2382 2398
2383static int parse_strtoull(const char *buf, 2399static int parse_strtoull(const char *buf,
@@ -2446,7 +2462,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
2446static ssize_t sbi_ui_show(struct ext4_attr *a, 2462static ssize_t sbi_ui_show(struct ext4_attr *a,
2447 struct ext4_sb_info *sbi, char *buf) 2463 struct ext4_sb_info *sbi, char *buf)
2448{ 2464{
2449 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2465 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2450 2466
2451 return snprintf(buf, PAGE_SIZE, "%u\n", *ui); 2467 return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
2452} 2468}
@@ -2455,7 +2471,7 @@ static ssize_t sbi_ui_store(struct ext4_attr *a,
2455 struct ext4_sb_info *sbi, 2471 struct ext4_sb_info *sbi,
2456 const char *buf, size_t count) 2472 const char *buf, size_t count)
2457{ 2473{
2458 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset); 2474 unsigned int *ui = (unsigned int *) (((char *) sbi) + a->u.offset);
2459 unsigned long t; 2475 unsigned long t;
2460 int ret; 2476 int ret;
2461 2477
@@ -2504,12 +2520,20 @@ static ssize_t trigger_test_error(struct ext4_attr *a,
2504 return count; 2520 return count;
2505} 2521}
2506 2522
2523static ssize_t sbi_deprecated_show(struct ext4_attr *a,
2524 struct ext4_sb_info *sbi, char *buf)
2525{
2526 return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
2527}
2528
2507#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \ 2529#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
2508static struct ext4_attr ext4_attr_##_name = { \ 2530static struct ext4_attr ext4_attr_##_name = { \
2509 .attr = {.name = __stringify(_name), .mode = _mode }, \ 2531 .attr = {.name = __stringify(_name), .mode = _mode }, \
2510 .show = _show, \ 2532 .show = _show, \
2511 .store = _store, \ 2533 .store = _store, \
2512 .offset = offsetof(struct ext4_sb_info, _elname), \ 2534 .u = { \
2535 .offset = offsetof(struct ext4_sb_info, _elname),\
2536 }, \
2513} 2537}
2514#define EXT4_ATTR(name, mode, show, store) \ 2538#define EXT4_ATTR(name, mode, show, store) \
2515static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) 2539static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
@@ -2520,6 +2544,14 @@ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
2520#define EXT4_RW_ATTR_SBI_UI(name, elname) \ 2544#define EXT4_RW_ATTR_SBI_UI(name, elname) \
2521 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) 2545 EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
2522#define ATTR_LIST(name) &ext4_attr_##name.attr 2546#define ATTR_LIST(name) &ext4_attr_##name.attr
2547#define EXT4_DEPRECATED_ATTR(_name, _val) \
2548static struct ext4_attr ext4_attr_##_name = { \
2549 .attr = {.name = __stringify(_name), .mode = 0444 }, \
2550 .show = sbi_deprecated_show, \
2551 .u = { \
2552 .deprecated_val = _val, \
2553 }, \
2554}
2523 2555
2524EXT4_RO_ATTR(delayed_allocation_blocks); 2556EXT4_RO_ATTR(delayed_allocation_blocks);
2525EXT4_RO_ATTR(session_write_kbytes); 2557EXT4_RO_ATTR(session_write_kbytes);
@@ -2534,7 +2566,7 @@ EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
2534EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); 2566EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
2535EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); 2567EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
2536EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); 2568EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
2537EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump); 2569EXT4_DEPRECATED_ATTR(max_writeback_mb_bump, 128);
2538EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); 2570EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
2539EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error); 2571EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
2540 2572
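With the union above, one struct ext4_attr can describe either a live tunable (an offset into ext4_sb_info) or a deprecated knob that only reports a fixed value, as max_writeback_mb_bump now does. A self-contained userspace sketch of the same trick, with invented names:

#include <stddef.h>
#include <stdio.h>

struct sb_info {
        unsigned int extent_max_zeroout_kb;
};

struct attr {
        const char *name;
        union {
                int offset;             /* live: offset into sb_info   */
                int deprecated_val;     /* dead: fixed value to report */
        } u;
        int is_deprecated;
};

static unsigned int attr_show(const struct attr *a, struct sb_info *sbi)
{
        if (a->is_deprecated)
                return a->u.deprecated_val;
        return *(unsigned int *)((char *)sbi + a->u.offset);
}

int main(void)
{
        struct sb_info sbi = { .extent_max_zeroout_kb = 32 };
        struct attr live = {
                "extent_max_zeroout_kb",
                { .offset = offsetof(struct sb_info, extent_max_zeroout_kb) },
                0
        };
        struct attr dead = { "max_writeback_mb_bump", { .deprecated_val = 128 }, 1 };

        printf("%s=%u\n", live.name, attr_show(&live, &sbi));
        printf("%s=%u\n", dead.name, attr_show(&dead, &sbi));
        return 0;
}
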
@@ -3451,7 +3483,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3451 } 3483 }
3452 if (test_opt(sb, DIOREAD_NOLOCK)) { 3484 if (test_opt(sb, DIOREAD_NOLOCK)) {
3453 ext4_msg(sb, KERN_ERR, "can't mount with " 3485 ext4_msg(sb, KERN_ERR, "can't mount with "
3454 "both data=journal and delalloc"); 3486 "both data=journal and dioread_nolock");
3455 goto failed_mount; 3487 goto failed_mount;
3456 } 3488 }
3457 if (test_opt(sb, DELALLOC)) 3489 if (test_opt(sb, DELALLOC))
@@ -3586,10 +3618,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3586 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb)); 3618 sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
3587 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb)); 3619 sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
3588 3620
3589 /* Do we have standard group size of blocksize * 8 blocks ? */
3590 if (sbi->s_blocks_per_group == blocksize << 3)
3591 set_opt2(sb, STD_GROUP_SIZE);
3592
3593 for (i = 0; i < 4; i++) 3621 for (i = 0; i < 4; i++)
3594 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); 3622 sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
3595 sbi->s_def_hash_version = es->s_def_hash_version; 3623 sbi->s_def_hash_version = es->s_def_hash_version;
@@ -3659,6 +3687,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3659 goto failed_mount; 3687 goto failed_mount;
3660 } 3688 }
3661 3689
3690 /* Do we have standard group size of clustersize * 8 blocks ? */
3691 if (sbi->s_blocks_per_group == clustersize << 3)
3692 set_opt2(sb, STD_GROUP_SIZE);
3693
3662 /* 3694 /*
3663 * Test whether we have more sectors than will fit in sector_t, 3695 * Test whether we have more sectors than will fit in sector_t,
3664 * and whether the max offset is addressable by the page cache. 3696 * and whether the max offset is addressable by the page cache.
@@ -3763,7 +3795,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3763 sbi->s_err_report.data = (unsigned long) sb; 3795 sbi->s_err_report.data = (unsigned long) sb;
3764 3796
3765 /* Register extent status tree shrinker */ 3797 /* Register extent status tree shrinker */
3766 ext4_es_register_shrinker(sb); 3798 ext4_es_register_shrinker(sbi);
3767 3799
3768 err = percpu_counter_init(&sbi->s_freeclusters_counter, 3800 err = percpu_counter_init(&sbi->s_freeclusters_counter,
3769 ext4_count_free_clusters(sb)); 3801 ext4_count_free_clusters(sb));
@@ -3787,7 +3819,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3787 } 3819 }
3788 3820
3789 sbi->s_stripe = ext4_get_stripe_size(sbi); 3821 sbi->s_stripe = ext4_get_stripe_size(sbi);
3790 sbi->s_max_writeback_mb_bump = 128;
3791 sbi->s_extent_max_zeroout_kb = 32; 3822 sbi->s_extent_max_zeroout_kb = 32;
3792 3823
3793 /* 3824 /*
@@ -3915,12 +3946,20 @@ no_journal:
3915 * The maximum number of concurrent works can be high and 3946 * The maximum number of concurrent works can be high and
3916 * concurrency isn't really necessary. Limit it to 1. 3947 * concurrency isn't really necessary. Limit it to 1.
3917 */ 3948 */
3918 EXT4_SB(sb)->dio_unwritten_wq = 3949 EXT4_SB(sb)->rsv_conversion_wq =
3919 alloc_workqueue("ext4-dio-unwritten", WQ_MEM_RECLAIM | WQ_UNBOUND, 1); 3950 alloc_workqueue("ext4-rsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3920 if (!EXT4_SB(sb)->dio_unwritten_wq) { 3951 if (!EXT4_SB(sb)->rsv_conversion_wq) {
3921 printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); 3952 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3922 ret = -ENOMEM; 3953 ret = -ENOMEM;
3923 goto failed_mount_wq; 3954 goto failed_mount4;
3955 }
3956
3957 EXT4_SB(sb)->unrsv_conversion_wq =
3958 alloc_workqueue("ext4-unrsv-conversion", WQ_MEM_RECLAIM | WQ_UNBOUND, 1);
3959 if (!EXT4_SB(sb)->unrsv_conversion_wq) {
3960 printk(KERN_ERR "EXT4-fs: failed to create workqueue\n");
3961 ret = -ENOMEM;
3962 goto failed_mount4;
3924 } 3963 }
3925 3964
3926 /* 3965 /*
@@ -4074,14 +4113,17 @@ failed_mount4a:
4074 sb->s_root = NULL; 4113 sb->s_root = NULL;
4075failed_mount4: 4114failed_mount4:
4076 ext4_msg(sb, KERN_ERR, "mount failed"); 4115 ext4_msg(sb, KERN_ERR, "mount failed");
4077 destroy_workqueue(EXT4_SB(sb)->dio_unwritten_wq); 4116 if (EXT4_SB(sb)->rsv_conversion_wq)
4117 destroy_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4118 if (EXT4_SB(sb)->unrsv_conversion_wq)
4119 destroy_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4078failed_mount_wq: 4120failed_mount_wq:
4079 if (sbi->s_journal) { 4121 if (sbi->s_journal) {
4080 jbd2_journal_destroy(sbi->s_journal); 4122 jbd2_journal_destroy(sbi->s_journal);
4081 sbi->s_journal = NULL; 4123 sbi->s_journal = NULL;
4082 } 4124 }
4083failed_mount3: 4125failed_mount3:
4084 ext4_es_unregister_shrinker(sb); 4126 ext4_es_unregister_shrinker(sbi);
4085 del_timer(&sbi->s_err_report); 4127 del_timer(&sbi->s_err_report);
4086 if (sbi->s_flex_groups) 4128 if (sbi->s_flex_groups)
4087 ext4_kvfree(sbi->s_flex_groups); 4129 ext4_kvfree(sbi->s_flex_groups);
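Because the second workqueue allocation can fail after the first has succeeded, and both failures land on failed_mount4, the unwind path above checks each pointer before destroying it. A toy sketch of that shared-error-label pattern (wq_alloc/wq_destroy stand in for alloc_workqueue/destroy_workqueue):

#include <stdio.h>
#include <stdlib.h>

struct wq { const char *name; };

static struct wq *wq_alloc(const char *name, int fail)
{
        struct wq *w;

        if (fail)
                return NULL;
        w = malloc(sizeof(*w));
        if (w)
                w->name = name;
        return w;
}

static void wq_destroy(struct wq *w)
{
        printf("destroy %s\n", w->name);
        free(w);
}

static int mount_wqs(int fail_second)
{
        struct wq *rsv = NULL, *unrsv = NULL;

        rsv = wq_alloc("rsv-conversion", 0);
        if (!rsv)
                goto failed;
        unrsv = wq_alloc("unrsv-conversion", fail_second);
        if (!unrsv)
                goto failed;
        return 0;

failed:
        /* one label serves both failures, so each queue must be
         * NULL-checked before teardown */
        if (rsv)
                wq_destroy(rsv);
        if (unrsv)
                wq_destroy(unrsv);
        return -1;
}

int main(void)
{
        return mount_wqs(1) ? 0 : 1;    /* exercise the failure path */
}
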
@@ -4517,19 +4559,52 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
4517{ 4559{
4518 int ret = 0; 4560 int ret = 0;
4519 tid_t target; 4561 tid_t target;
4562 bool needs_barrier = false;
4520 struct ext4_sb_info *sbi = EXT4_SB(sb); 4563 struct ext4_sb_info *sbi = EXT4_SB(sb);
4521 4564
4522 trace_ext4_sync_fs(sb, wait); 4565 trace_ext4_sync_fs(sb, wait);
4523 flush_workqueue(sbi->dio_unwritten_wq); 4566 flush_workqueue(sbi->rsv_conversion_wq);
4567 flush_workqueue(sbi->unrsv_conversion_wq);
4524 /* 4568 /*
4525 * Writeback quota in non-journalled quota case - journalled quota has 4569 * Writeback quota in non-journalled quota case - journalled quota has
4526 * no dirty dquots 4570 * no dirty dquots
4527 */ 4571 */
4528 dquot_writeback_dquots(sb, -1); 4572 dquot_writeback_dquots(sb, -1);
4573 /*
4574 * Data writeback is possible w/o journal transaction, so barrier must
4575 * be sent at the end of the function. But we can skip it if
4576 * transaction_commit will do it for us.
4577 */
4578 target = jbd2_get_latest_transaction(sbi->s_journal);
4579 if (wait && sbi->s_journal->j_flags & JBD2_BARRIER &&
4580 !jbd2_trans_will_send_data_barrier(sbi->s_journal, target))
4581 needs_barrier = true;
4582
4529 if (jbd2_journal_start_commit(sbi->s_journal, &target)) { 4583 if (jbd2_journal_start_commit(sbi->s_journal, &target)) {
4530 if (wait) 4584 if (wait)
4531 jbd2_log_wait_commit(sbi->s_journal, target); 4585 ret = jbd2_log_wait_commit(sbi->s_journal, target);
4532 } 4586 }
4587 if (needs_barrier) {
4588 int err;
4589 err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4590 if (!ret)
4591 ret = err;
4592 }
4593
4594 return ret;
4595}
4596
4597static int ext4_sync_fs_nojournal(struct super_block *sb, int wait)
4598{
4599 int ret = 0;
4600
4601 trace_ext4_sync_fs(sb, wait);
4602 flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq);
4603 flush_workqueue(EXT4_SB(sb)->unrsv_conversion_wq);
4604 dquot_writeback_dquots(sb, -1);
4605 if (wait && test_opt(sb, BARRIER))
4606 ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
4607
4533 return ret; 4608 return ret;
4534} 4609}
4535 4610
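The journalled ext4_sync_fs() above issues its own cache flush only when the commit it waits for will not already send a barrier, while the nojournal variant flushes whenever barriers are enabled and a wait was requested. A userspace sketch of the journalled decision (every helper here is a stand-in):

#include <stdbool.h>
#include <stdio.h>

static bool journal_barrier = true;     /* models JBD2_BARRIER */

/* models jbd2_trans_will_send_data_barrier(): fake answer keyed on tid */
static bool commit_sends_barrier(int tid)
{
        return (tid & 1) == 0;
}

static void issue_flush(void)
{
        puts("blkdev_issue_flush");     /* models the block-device flush */
}

static void sync_fs(int latest_tid, bool wait)
{
        bool needs_barrier = wait && journal_barrier &&
                             !commit_sends_barrier(latest_tid);

        /* ... start the commit and, if wait, wait for it here ... */
        if (needs_barrier)
                issue_flush();
}

int main(void)
{
        sync_fs(1, true);       /* commit won't flush: send our own */
        sync_fs(2, true);       /* commit flushes: skip ours        */
        return 0;
}
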
@@ -4652,6 +4727,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4652 goto restore_opts; 4727 goto restore_opts;
4653 } 4728 }
4654 4729
4730 if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
4731 if (test_opt2(sb, EXPLICIT_DELALLOC)) {
4732 ext4_msg(sb, KERN_ERR, "can't mount with "
4733 "both data=journal and delalloc");
4734 err = -EINVAL;
4735 goto restore_opts;
4736 }
4737 if (test_opt(sb, DIOREAD_NOLOCK)) {
4738 ext4_msg(sb, KERN_ERR, "can't mount with "
4739 "both data=journal and dioread_nolock");
4740 err = -EINVAL;
4741 goto restore_opts;
4742 }
4743 }
4744
4655 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4745 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
4656 ext4_abort(sb, "Abort forced by user"); 4746 ext4_abort(sb, "Abort forced by user");
4657 4747
@@ -5406,6 +5496,7 @@ static void __exit ext4_exit_fs(void)
5406 kset_unregister(ext4_kset); 5496 kset_unregister(ext4_kset);
5407 ext4_exit_system_zone(); 5497 ext4_exit_system_zone();
5408 ext4_exit_pageio(); 5498 ext4_exit_pageio();
5499 ext4_exit_es();
5409} 5500}
5410 5501
5411MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); 5502MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");