about | summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
author David S. Miller <davem@davemloft.net> 2009-12-11 04:26:12 -0500
committer David S. Miller <davem@davemloft.net> 2009-12-11 04:26:12 -0500
commit e3f4e1cbc341bc2020241d8119bd078db3ec3b85 (patch)
tree 8b23624cc792f60d0bf86d787c3514cf0fb04418 /fs
parent adfe67ddffbea51322b118896178bd71aaa4b4d8 (diff)
parent d71cb81af3817193bc605de061da0499934263a6 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_elf.c2
-rw-r--r--fs/bio.c2
-rw-r--r--fs/btrfs/extent_map.c2
-rw-r--r--fs/cifs/README2
-rw-r--r--fs/cifs/cifsglob.h2
-rw-r--r--fs/cifs/cifspdu.h2
-rw-r--r--fs/cifs/inode.c4
-rw-r--r--fs/cifs/smbdes.c2
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/debugfs/inode.c6
-rw-r--r--fs/dlm/config.c24
-rw-r--r--fs/dlm/debug_fs.c2
-rw-r--r--fs/dlm/dir.c7
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c6
-rw-r--r--fs/dlm/lockspace.c15
-rw-r--r--fs/dlm/lowcomms.c6
-rw-r--r--fs/dlm/member.c8
-rw-r--r--fs/dlm/memory.c6
-rw-r--r--fs/dlm/netlink.c2
-rw-r--r--fs/dlm/plock.c8
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/dlm/requestqueue.c2
-rw-r--r--fs/dlm/user.c12
-rw-r--r--fs/exofs/Kbuild2
-rw-r--r--fs/exofs/common.h81
-rw-r--r--fs/exofs/exofs.h97
-rw-r--r--fs/exofs/inode.c409
-rw-r--r--fs/exofs/ios.c421
-rw-r--r--fs/exofs/osd.c125
-rw-r--r--fs/exofs/pnfs.h51
-rw-r--r--fs/exofs/super.c353
-rw-r--r--fs/ext3/inode.c2
-rw-r--r--fs/ext4/Kconfig10
-rw-r--r--fs/ext4/balloc.c46
-rw-r--r--fs/ext4/block_validity.c3
-rw-r--r--fs/ext4/ext4.h23
-rw-r--r--fs/ext4/ext4_jbd2.c82
-rw-r--r--fs/ext4/ext4_jbd2.h44
-rw-r--r--fs/ext4/extents.c44
-rw-r--r--fs/ext4/fsync.c54
-rw-r--r--fs/ext4/inode.c201
-rw-r--r--fs/ext4/ioctl.c29
-rw-r--r--fs/ext4/mballoc.c103
-rw-r--r--fs/ext4/migrate.c27
-rw-r--r--fs/ext4/move_extent.c282
-rw-r--r--fs/ext4/namei.c38
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c118
-rw-r--r--fs/ext4/xattr.c15
-rw-r--r--fs/jbd2/commit.c4
-rw-r--r--fs/jbd2/journal.c12
-rw-r--r--fs/jffs2/compr.c2
-rw-r--r--fs/jffs2/readinode.c2
-rw-r--r--fs/jffs2/xattr.c2
-rw-r--r--fs/jfs/jfs_dmap.c4
-rw-r--r--fs/ncpfs/ioctl.c2
-rw-r--r--fs/notify/inotify/inotify_user.c4
-rw-r--r--fs/ntfs/compress.c2
-rw-r--r--fs/ntfs/file.c4
-rw-r--r--fs/ntfs/logfile.c2
-rw-r--r--fs/ocfs2/alloc.c2
-rw-r--r--fs/ocfs2/blockcheck.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c2
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/journal.c2
-rw-r--r--fs/ocfs2/refcounttree.c2
-rw-r--r--fs/omfs/bitmap.c2
-rw-r--r--fs/qnx4/bitmap.c2
-rw-r--r--fs/qnx4/dir.c6
-rw-r--r--fs/qnx4/inode.c26
-rw-r--r--fs/qnx4/namei.c6
-rw-r--r--fs/reiserfs/Makefile2
-rw-r--r--fs/reiserfs/bitmap.c4
-rw-r--r--fs/reiserfs/dir.c10
-rw-r--r--fs/reiserfs/do_balan.c17
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/fix_node.c21
-rw-r--r--fs/reiserfs/inode.c97
-rw-r--r--fs/reiserfs/ioctl.c77
-rw-r--r--fs/reiserfs/journal.c130
-rw-r--r--fs/reiserfs/lock.c88
-rw-r--r--fs/reiserfs/namei.c20
-rw-r--r--fs/reiserfs/prints.c4
-rw-r--r--fs/reiserfs/resize.c2
-rw-r--r--fs/reiserfs/stree.c53
-rw-r--r--fs/reiserfs/super.c52
-rw-r--r--fs/reiserfs/xattr.c6
-rw-r--r--fs/ubifs/debug.c2
-rw-r--r--fs/ubifs/file.c13
-rw-r--r--fs/ubifs/recovery.c2
-rw-r--r--fs/ubifs/super.c20
-rw-r--r--fs/xfs/quota/xfs_dquot.h2
93 files changed, 2265 insertions, 1242 deletions
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b9b3bb51b1e4..d15ea1790bfb 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -767,7 +767,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
767 767
768 current->mm->start_stack = bprm->p; 768 current->mm->start_stack = bprm->p;
769 769
770 /* Now we do a little grungy work by mmaping the ELF image into 770 /* Now we do a little grungy work by mmapping the ELF image into
771 the correct location in memory. */ 771 the correct location in memory. */
772 for(i = 0, elf_ppnt = elf_phdata; 772 for(i = 0, elf_ppnt = elf_phdata;
773 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { 773 i < loc->elf_ex.e_phnum; i++, elf_ppnt++) {
diff --git a/fs/bio.c b/fs/bio.c
index e23a63f4f7de..76e6713abf94 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -272,7 +272,7 @@ EXPORT_SYMBOL(bio_init);
272 * for a &struct bio to become free. If a %NULL @bs is passed in, we will 272 * for a &struct bio to become free. If a %NULL @bs is passed in, we will
273 * fall back to just using @kmalloc to allocate the required memory. 273 * fall back to just using @kmalloc to allocate the required memory.
274 * 274 *
275 * Note that the caller must set ->bi_destructor on succesful return 275 * Note that the caller must set ->bi_destructor on successful return
276 * of a bio, to do the appropriate freeing of the bio once the reference 276 * of a bio, to do the appropriate freeing of the bio once the reference
277 * count drops to zero. 277 * count drops to zero.
278 **/ 278 **/
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index ccbdcb54ec5d..46bea0f4dc7b 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -256,7 +256,7 @@ out:
256 * Insert @em into @tree or perform a simple forward/backward merge with 256 * Insert @em into @tree or perform a simple forward/backward merge with
257 * existing mappings. The extent_map struct passed in will be inserted 257 * existing mappings. The extent_map struct passed in will be inserted
258 * into the tree directly, with an additional reference taken, or a 258 * into the tree directly, with an additional reference taken, or a
259 * reference dropped if the merge attempt was sucessfull. 259 * reference dropped if the merge attempt was successfull.
260 */ 260 */
261int add_extent_mapping(struct extent_map_tree *tree, 261int add_extent_mapping(struct extent_map_tree *tree,
262 struct extent_map *em) 262 struct extent_map *em)
diff --git a/fs/cifs/README b/fs/cifs/README
index 79c1a93400be..a727b7cb075f 100644
--- a/fs/cifs/README
+++ b/fs/cifs/README
@@ -423,7 +423,7 @@ A partial list of the supported mount options follows:
423 source name to use to represent the client netbios machine 423 source name to use to represent the client netbios machine
424 name when doing the RFC1001 netbios session initialize. 424 name when doing the RFC1001 netbios session initialize.
425 direct Do not do inode data caching on files opened on this mount. 425 direct Do not do inode data caching on files opened on this mount.
426 This precludes mmaping files on this mount. In some cases 426 This precludes mmapping files on this mount. In some cases
427 with fast networks and little or no caching benefits on the 427 with fast networks and little or no caching benefits on the
428 client (e.g. when the application is doing large sequential 428 client (e.g. when the application is doing large sequential
429 reads bigger than page size without rereading the same data) 429 reads bigger than page size without rereading the same data)
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 5d0fde18039c..4b35f7ec0583 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -39,7 +39,7 @@
39 39
40/* 40/*
41 * MAX_REQ is the maximum number of requests that WE will send 41 * MAX_REQ is the maximum number of requests that WE will send
42 * on one socket concurently. It also matches the most common 42 * on one socket concurrently. It also matches the most common
43 * value of max multiplex returned by servers. We may 43 * value of max multiplex returned by servers. We may
44 * eventually want to use the negotiated value (in case 44 * eventually want to use the negotiated value (in case
45 * future servers can handle more) when we are more confident that 45 * future servers can handle more) when we are more confident that
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 2d07f890a842..3877737f96a6 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1227,7 +1227,7 @@ typedef struct smb_com_setattr_rsp {
1227/* empty wct response to setattr */ 1227/* empty wct response to setattr */
1228 1228
1229/*******************************************************/ 1229/*******************************************************/
1230/* NT Transact structure defintions follow */ 1230/* NT Transact structure definitions follow */
1231/* Currently only ioctl, acl (get security descriptor) */ 1231/* Currently only ioctl, acl (get security descriptor) */
1232/* and notify are implemented */ 1232/* and notify are implemented */
1233/*******************************************************/ 1233/*******************************************************/
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index cababd8a52df..cf18ee765590 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -914,8 +914,8 @@ undo_setattr:
914/* 914/*
915 * If dentry->d_inode is null (usually meaning the cached dentry 915 * If dentry->d_inode is null (usually meaning the cached dentry
916 * is a negative dentry) then we would attempt a standard SMB delete, but 916 * is a negative dentry) then we would attempt a standard SMB delete, but
917 * if that fails we can not attempt the fall back mechanisms on EACESS 917 * if that fails we can not attempt the fall back mechanisms on EACCESS
918 * but will return the EACESS to the caller. Note that the VFS does not call 918 * but will return the EACCESS to the caller. Note that the VFS does not call
919 * unlink on negative dentries currently. 919 * unlink on negative dentries currently.
920 */ 920 */
921int cifs_unlink(struct inode *dir, struct dentry *dentry) 921int cifs_unlink(struct inode *dir, struct dentry *dentry)
diff --git a/fs/cifs/smbdes.c b/fs/cifs/smbdes.c
index 224a1f478966..b6b6dcb500bf 100644
--- a/fs/cifs/smbdes.c
+++ b/fs/cifs/smbdes.c
@@ -371,7 +371,7 @@ E_P24(unsigned char *p21, const unsigned char *c8, unsigned char *p24)
371 smbhash(p24 + 16, c8, p21 + 14, 1); 371 smbhash(p24 + 16, c8, p21 + 14, 1);
372} 372}
373 373
374#if 0 /* currently unsued */ 374#if 0 /* currently unused */
375static void 375static void
376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out) 376D_P16(unsigned char *p14, unsigned char *in, unsigned char *out)
377{ 377{
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 229e72218165..2346895b3a77 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1920,7 +1920,7 @@ COMPATIBLE_IOCTL(TIOCSLTC)
1920#endif 1920#endif
1921#ifdef TIOCSTART 1921#ifdef TIOCSTART
1922/* 1922/*
1923 * For these two we have defintions in ioctls.h and/or termios.h on 1923 * For these two we have definitions in ioctls.h and/or termios.h on
1924 * some architectures but no actual implemention. Some applications 1924 * some architectures but no actual implemention. Some applications
1925 * like bash call them if they are defined in the headers, so we provide 1925 * like bash call them if they are defined in the headers, so we provide
1926 * entries here to avoid syslog message spew. 1926 * entries here to avoid syslog message spew.
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index d22438ef7674..0d23b52dd22c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -184,7 +184,7 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
184/** 184/**
185 * debugfs_create_file - create a file in the debugfs filesystem 185 * debugfs_create_file - create a file in the debugfs filesystem
186 * @name: a pointer to a string containing the name of the file to create. 186 * @name: a pointer to a string containing the name of the file to create.
187 * @mode: the permission that the file should have 187 * @mode: the permission that the file should have.
188 * @parent: a pointer to the parent dentry for this file. This should be a 188 * @parent: a pointer to the parent dentry for this file. This should be a
189 * directory dentry if set. If this paramater is NULL, then the 189 * directory dentry if set. If this paramater is NULL, then the
190 * file will be created in the root of the debugfs filesystem. 190 * file will be created in the root of the debugfs filesystem.
@@ -195,8 +195,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode,
195 * this file. 195 * this file.
196 * 196 *
197 * This is the basic "create a file" function for debugfs. It allows for a 197 * This is the basic "create a file" function for debugfs. It allows for a
198 * wide range of flexibility in createing a file, or a directory (if you 198 * wide range of flexibility in creating a file, or a directory (if you want
199 * want to create a directory, the debugfs_create_dir() function is 199 * to create a directory, the debugfs_create_dir() function is
200 * recommended to be used instead.) 200 * recommended to be used instead.)
201 * 201 *
202 * This function will return a pointer to a dentry if it succeeds. This 202 * This function will return a pointer to a dentry if it succeeds. This
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index fd9859f92fad..0df243850818 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -410,10 +410,10 @@ static struct config_group *make_cluster(struct config_group *g,
410 struct dlm_comms *cms = NULL; 410 struct dlm_comms *cms = NULL;
411 void *gps = NULL; 411 void *gps = NULL;
412 412
413 cl = kzalloc(sizeof(struct dlm_cluster), GFP_KERNEL); 413 cl = kzalloc(sizeof(struct dlm_cluster), GFP_NOFS);
414 gps = kcalloc(3, sizeof(struct config_group *), GFP_KERNEL); 414 gps = kcalloc(3, sizeof(struct config_group *), GFP_NOFS);
415 sps = kzalloc(sizeof(struct dlm_spaces), GFP_KERNEL); 415 sps = kzalloc(sizeof(struct dlm_spaces), GFP_NOFS);
416 cms = kzalloc(sizeof(struct dlm_comms), GFP_KERNEL); 416 cms = kzalloc(sizeof(struct dlm_comms), GFP_NOFS);
417 417
418 if (!cl || !gps || !sps || !cms) 418 if (!cl || !gps || !sps || !cms)
419 goto fail; 419 goto fail;
@@ -482,9 +482,9 @@ static struct config_group *make_space(struct config_group *g, const char *name)
482 struct dlm_nodes *nds = NULL; 482 struct dlm_nodes *nds = NULL;
483 void *gps = NULL; 483 void *gps = NULL;
484 484
485 sp = kzalloc(sizeof(struct dlm_space), GFP_KERNEL); 485 sp = kzalloc(sizeof(struct dlm_space), GFP_NOFS);
486 gps = kcalloc(2, sizeof(struct config_group *), GFP_KERNEL); 486 gps = kcalloc(2, sizeof(struct config_group *), GFP_NOFS);
487 nds = kzalloc(sizeof(struct dlm_nodes), GFP_KERNEL); 487 nds = kzalloc(sizeof(struct dlm_nodes), GFP_NOFS);
488 488
489 if (!sp || !gps || !nds) 489 if (!sp || !gps || !nds)
490 goto fail; 490 goto fail;
@@ -536,7 +536,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name)
536{ 536{
537 struct dlm_comm *cm; 537 struct dlm_comm *cm;
538 538
539 cm = kzalloc(sizeof(struct dlm_comm), GFP_KERNEL); 539 cm = kzalloc(sizeof(struct dlm_comm), GFP_NOFS);
540 if (!cm) 540 if (!cm)
541 return ERR_PTR(-ENOMEM); 541 return ERR_PTR(-ENOMEM);
542 542
@@ -569,7 +569,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); 569 struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent);
570 struct dlm_node *nd; 570 struct dlm_node *nd;
571 571
572 nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); 572 nd = kzalloc(sizeof(struct dlm_node), GFP_NOFS);
573 if (!nd) 573 if (!nd)
574 return ERR_PTR(-ENOMEM); 574 return ERR_PTR(-ENOMEM);
575 575
@@ -705,7 +705,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len)
705 if (cm->addr_count >= DLM_MAX_ADDR_COUNT) 705 if (cm->addr_count >= DLM_MAX_ADDR_COUNT)
706 return -ENOSPC; 706 return -ENOSPC;
707 707
708 addr = kzalloc(sizeof(*addr), GFP_KERNEL); 708 addr = kzalloc(sizeof(*addr), GFP_NOFS);
709 if (!addr) 709 if (!addr)
710 return -ENOMEM; 710 return -ENOMEM;
711 711
@@ -868,7 +868,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
868 868
869 ids_count = sp->members_count; 869 ids_count = sp->members_count;
870 870
871 ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL); 871 ids = kcalloc(ids_count, sizeof(int), GFP_NOFS);
872 if (!ids) { 872 if (!ids) {
873 rv = -ENOMEM; 873 rv = -ENOMEM;
874 goto out; 874 goto out;
@@ -886,7 +886,7 @@ int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
886 if (!new_count) 886 if (!new_count)
887 goto out_ids; 887 goto out_ids;
888 888
889 new = kcalloc(new_count, sizeof(int), GFP_KERNEL); 889 new = kcalloc(new_count, sizeof(int), GFP_NOFS);
890 if (!new) { 890 if (!new) {
891 kfree(ids); 891 kfree(ids);
892 rv = -ENOMEM; 892 rv = -ENOMEM;
diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c
index 1c8bb8c3a82e..375a2359b3bf 100644
--- a/fs/dlm/debug_fs.c
+++ b/fs/dlm/debug_fs.c
@@ -404,7 +404,7 @@ static void *table_seq_start(struct seq_file *seq, loff_t *pos)
404 if (bucket >= ls->ls_rsbtbl_size) 404 if (bucket >= ls->ls_rsbtbl_size)
405 return NULL; 405 return NULL;
406 406
407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_KERNEL); 407 ri = kzalloc(sizeof(struct rsbtbl_iter), GFP_NOFS);
408 if (!ri) 408 if (!ri)
409 return NULL; 409 return NULL;
410 if (n == 0) 410 if (n == 0)
diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c
index c4dfa1dcc86f..7b84c1dbc82e 100644
--- a/fs/dlm/dir.c
+++ b/fs/dlm/dir.c
@@ -49,8 +49,7 @@ static struct dlm_direntry *get_free_de(struct dlm_ls *ls, int len)
49 spin_unlock(&ls->ls_recover_list_lock); 49 spin_unlock(&ls->ls_recover_list_lock);
50 50
51 if (!found) 51 if (!found)
52 de = kzalloc(sizeof(struct dlm_direntry) + len, 52 de = kzalloc(sizeof(struct dlm_direntry) + len, GFP_NOFS);
53 ls->ls_allocation);
54 return de; 53 return de;
55} 54}
56 55
@@ -212,7 +211,7 @@ int dlm_recover_directory(struct dlm_ls *ls)
212 211
213 dlm_dir_clear(ls); 212 dlm_dir_clear(ls);
214 213
215 last_name = kmalloc(DLM_RESNAME_MAXLEN, ls->ls_allocation); 214 last_name = kmalloc(DLM_RESNAME_MAXLEN, GFP_NOFS);
216 if (!last_name) 215 if (!last_name)
217 goto out; 216 goto out;
218 217
@@ -323,7 +322,7 @@ static int get_entry(struct dlm_ls *ls, int nodeid, char *name,
323 if (namelen > DLM_RESNAME_MAXLEN) 322 if (namelen > DLM_RESNAME_MAXLEN)
324 return -EINVAL; 323 return -EINVAL;
325 324
326 de = kzalloc(sizeof(struct dlm_direntry) + namelen, ls->ls_allocation); 325 de = kzalloc(sizeof(struct dlm_direntry) + namelen, GFP_NOFS);
327 if (!de) 326 if (!de)
328 return -ENOMEM; 327 return -ENOMEM;
329 328
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d01ca0a711db..826d3dc6e0ab 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -473,7 +473,6 @@ struct dlm_ls {
473 int ls_low_nodeid; 473 int ls_low_nodeid;
474 int ls_total_weight; 474 int ls_total_weight;
475 int *ls_node_array; 475 int *ls_node_array;
476 gfp_t ls_allocation;
477 476
478 struct dlm_rsb ls_stub_rsb; /* for returning errors */ 477 struct dlm_rsb ls_stub_rsb; /* for returning errors */
479 struct dlm_lkb ls_stub_lkb; /* for returning errors */ 478 struct dlm_lkb ls_stub_lkb; /* for returning errors */
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index eb507c453c5f..9c0c1db1e105 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -2689,7 +2689,7 @@ static int _create_message(struct dlm_ls *ls, int mb_len,
2689 pass into lowcomms_commit and a message buffer (mb) that we 2689 pass into lowcomms_commit and a message buffer (mb) that we
2690 write our data into */ 2690 write our data into */
2691 2691
2692 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 2692 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
2693 if (!mh) 2693 if (!mh)
2694 return -ENOBUFS; 2694 return -ENOBUFS;
2695 2695
@@ -4512,7 +4512,7 @@ int dlm_user_request(struct dlm_ls *ls, struct dlm_user_args *ua,
4512 } 4512 }
4513 4513
4514 if (flags & DLM_LKF_VALBLK) { 4514 if (flags & DLM_LKF_VALBLK) {
4515 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4515 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4516 if (!ua->lksb.sb_lvbptr) { 4516 if (!ua->lksb.sb_lvbptr) {
4517 kfree(ua); 4517 kfree(ua);
4518 __put_lkb(ls, lkb); 4518 __put_lkb(ls, lkb);
@@ -4582,7 +4582,7 @@ int dlm_user_convert(struct dlm_ls *ls, struct dlm_user_args *ua_tmp,
4582 ua = lkb->lkb_ua; 4582 ua = lkb->lkb_ua;
4583 4583
4584 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) { 4584 if (flags & DLM_LKF_VALBLK && !ua->lksb.sb_lvbptr) {
4585 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_KERNEL); 4585 ua->lksb.sb_lvbptr = kzalloc(DLM_USER_LVB_LEN, GFP_NOFS);
4586 if (!ua->lksb.sb_lvbptr) { 4586 if (!ua->lksb.sb_lvbptr) {
4587 error = -ENOMEM; 4587 error = -ENOMEM;
4588 goto out_put; 4588 goto out_put;
diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c
index d489fcc86713..c010ecfc0d29 100644
--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -430,7 +430,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
430 430
431 error = -ENOMEM; 431 error = -ENOMEM;
432 432
433 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); 433 ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_NOFS);
434 if (!ls) 434 if (!ls)
435 goto out; 435 goto out;
436 memcpy(ls->ls_name, name, namelen); 436 memcpy(ls->ls_name, name, namelen);
@@ -443,11 +443,6 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
443 if (flags & DLM_LSFL_TIMEWARN) 443 if (flags & DLM_LSFL_TIMEWARN)
444 set_bit(LSFL_TIMEWARN, &ls->ls_flags); 444 set_bit(LSFL_TIMEWARN, &ls->ls_flags);
445 445
446 if (flags & DLM_LSFL_FS)
447 ls->ls_allocation = GFP_NOFS;
448 else
449 ls->ls_allocation = GFP_KERNEL;
450
451 /* ls_exflags are forced to match among nodes, and we don't 446 /* ls_exflags are forced to match among nodes, and we don't
452 need to require all nodes to have some flags set */ 447 need to require all nodes to have some flags set */
453 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | 448 ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS |
@@ -456,7 +451,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
456 size = dlm_config.ci_rsbtbl_size; 451 size = dlm_config.ci_rsbtbl_size;
457 ls->ls_rsbtbl_size = size; 452 ls->ls_rsbtbl_size = size;
458 453
459 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_KERNEL); 454 ls->ls_rsbtbl = kmalloc(sizeof(struct dlm_rsbtable) * size, GFP_NOFS);
460 if (!ls->ls_rsbtbl) 455 if (!ls->ls_rsbtbl)
461 goto out_lsfree; 456 goto out_lsfree;
462 for (i = 0; i < size; i++) { 457 for (i = 0; i < size; i++) {
@@ -468,7 +463,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
468 size = dlm_config.ci_lkbtbl_size; 463 size = dlm_config.ci_lkbtbl_size;
469 ls->ls_lkbtbl_size = size; 464 ls->ls_lkbtbl_size = size;
470 465
471 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_KERNEL); 466 ls->ls_lkbtbl = kmalloc(sizeof(struct dlm_lkbtable) * size, GFP_NOFS);
472 if (!ls->ls_lkbtbl) 467 if (!ls->ls_lkbtbl)
473 goto out_rsbfree; 468 goto out_rsbfree;
474 for (i = 0; i < size; i++) { 469 for (i = 0; i < size; i++) {
@@ -480,7 +475,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
480 size = dlm_config.ci_dirtbl_size; 475 size = dlm_config.ci_dirtbl_size;
481 ls->ls_dirtbl_size = size; 476 ls->ls_dirtbl_size = size;
482 477
483 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_KERNEL); 478 ls->ls_dirtbl = kmalloc(sizeof(struct dlm_dirtable) * size, GFP_NOFS);
484 if (!ls->ls_dirtbl) 479 if (!ls->ls_dirtbl)
485 goto out_lkbfree; 480 goto out_lkbfree;
486 for (i = 0; i < size; i++) { 481 for (i = 0; i < size; i++) {
@@ -527,7 +522,7 @@ static int new_lockspace(const char *name, int namelen, void **lockspace,
527 mutex_init(&ls->ls_requestqueue_mutex); 522 mutex_init(&ls->ls_requestqueue_mutex);
528 mutex_init(&ls->ls_clear_proc_locks); 523 mutex_init(&ls->ls_clear_proc_locks);
529 524
530 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); 525 ls->ls_recover_buf = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS);
531 if (!ls->ls_recover_buf) 526 if (!ls->ls_recover_buf)
532 goto out_dirfree; 527 goto out_dirfree;
533 528
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70736eb4b516..52cab160893c 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1060,7 +1060,7 @@ static void init_local(void)
1060 if (dlm_our_addr(&sas, i)) 1060 if (dlm_our_addr(&sas, i))
1061 break; 1061 break;
1062 1062
1063 addr = kmalloc(sizeof(*addr), GFP_KERNEL); 1063 addr = kmalloc(sizeof(*addr), GFP_NOFS);
1064 if (!addr) 1064 if (!addr)
1065 break; 1065 break;
1066 memcpy(addr, &sas, sizeof(*addr)); 1066 memcpy(addr, &sas, sizeof(*addr));
@@ -1099,7 +1099,7 @@ static int sctp_listen_for_all(void)
1099 struct sockaddr_storage localaddr; 1099 struct sockaddr_storage localaddr;
1100 struct sctp_event_subscribe subscribe; 1100 struct sctp_event_subscribe subscribe;
1101 int result = -EINVAL, num = 1, i, addr_len; 1101 int result = -EINVAL, num = 1, i, addr_len;
1102 struct connection *con = nodeid2con(0, GFP_KERNEL); 1102 struct connection *con = nodeid2con(0, GFP_NOFS);
1103 int bufsize = NEEDED_RMEM; 1103 int bufsize = NEEDED_RMEM;
1104 1104
1105 if (!con) 1105 if (!con)
@@ -1171,7 +1171,7 @@ out:
1171static int tcp_listen_for_all(void) 1171static int tcp_listen_for_all(void)
1172{ 1172{
1173 struct socket *sock = NULL; 1173 struct socket *sock = NULL;
1174 struct connection *con = nodeid2con(0, GFP_KERNEL); 1174 struct connection *con = nodeid2con(0, GFP_NOFS);
1175 int result = -EINVAL; 1175 int result = -EINVAL;
1176 1176
1177 if (!con) 1177 if (!con)
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index b128775913b2..84f70bfb0baf 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -48,7 +48,7 @@ static int dlm_add_member(struct dlm_ls *ls, int nodeid)
48 struct dlm_member *memb; 48 struct dlm_member *memb;
49 int w, error; 49 int w, error;
50 50
51 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 51 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
52 if (!memb) 52 if (!memb)
53 return -ENOMEM; 53 return -ENOMEM;
54 54
@@ -143,7 +143,7 @@ static void make_member_array(struct dlm_ls *ls)
143 143
144 ls->ls_total_weight = total; 144 ls->ls_total_weight = total;
145 145
146 array = kmalloc(sizeof(int) * total, ls->ls_allocation); 146 array = kmalloc(sizeof(int) * total, GFP_NOFS);
147 if (!array) 147 if (!array)
148 return; 148 return;
149 149
@@ -226,7 +226,7 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
226 continue; 226 continue;
227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]); 227 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
228 228
229 memb = kzalloc(sizeof(struct dlm_member), ls->ls_allocation); 229 memb = kzalloc(sizeof(struct dlm_member), GFP_NOFS);
230 if (!memb) 230 if (!memb)
231 return -ENOMEM; 231 return -ENOMEM;
232 memb->nodeid = rv->new[i]; 232 memb->nodeid = rv->new[i];
@@ -341,7 +341,7 @@ int dlm_ls_start(struct dlm_ls *ls)
341 int *ids = NULL, *new = NULL; 341 int *ids = NULL, *new = NULL;
342 int error, ids_count = 0, new_count = 0; 342 int error, ids_count = 0, new_count = 0;
343 343
344 rv = kzalloc(sizeof(struct dlm_recover), ls->ls_allocation); 344 rv = kzalloc(sizeof(struct dlm_recover), GFP_NOFS);
345 if (!rv) 345 if (!rv)
346 return -ENOMEM; 346 return -ENOMEM;
347 347
diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c
index c1775b84ebab..8e0d00db004f 100644
--- a/fs/dlm/memory.c
+++ b/fs/dlm/memory.c
@@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls)
39{ 39{
40 char *p; 40 char *p;
41 41
42 p = kzalloc(ls->ls_lvblen, ls->ls_allocation); 42 p = kzalloc(ls->ls_lvblen, GFP_NOFS);
43 return p; 43 return p;
44} 44}
45 45
@@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen)
57 57
58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); 58 DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,);
59 59
60 r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); 60 r = kzalloc(sizeof(*r) + namelen, GFP_NOFS);
61 return r; 61 return r;
62} 62}
63 63
@@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls)
72{ 72{
73 struct dlm_lkb *lkb; 73 struct dlm_lkb *lkb;
74 74
75 lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); 75 lkb = kmem_cache_zalloc(lkb_cache, GFP_NOFS);
76 return lkb; 76 return lkb;
77} 77}
78 78
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index 55ea369f43a9..052095cd592f 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -26,7 +26,7 @@ static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size)
26 struct sk_buff *skb; 26 struct sk_buff *skb;
27 void *data; 27 void *data;
28 28
29 skb = genlmsg_new(size, GFP_KERNEL); 29 skb = genlmsg_new(size, GFP_NOFS);
30 if (!skb) 30 if (!skb)
31 return -ENOMEM; 31 return -ENOMEM;
32 32
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index 16f682e26c07..b5f89aef3b29 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -82,7 +82,7 @@ int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
82 if (!ls) 82 if (!ls)
83 return -EINVAL; 83 return -EINVAL;
84 84
85 xop = kzalloc(sizeof(*xop), GFP_KERNEL); 85 xop = kzalloc(sizeof(*xop), GFP_NOFS);
86 if (!xop) { 86 if (!xop) {
87 rv = -ENOMEM; 87 rv = -ENOMEM;
88 goto out; 88 goto out;
@@ -143,7 +143,7 @@ out:
143} 143}
144EXPORT_SYMBOL_GPL(dlm_posix_lock); 144EXPORT_SYMBOL_GPL(dlm_posix_lock);
145 145
146/* Returns failure iff a succesful lock operation should be canceled */ 146/* Returns failure iff a successful lock operation should be canceled */
147static int dlm_plock_callback(struct plock_op *op) 147static int dlm_plock_callback(struct plock_op *op)
148{ 148{
149 struct file *file; 149 struct file *file;
@@ -211,7 +211,7 @@ int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
211 if (!ls) 211 if (!ls)
212 return -EINVAL; 212 return -EINVAL;
213 213
214 op = kzalloc(sizeof(*op), GFP_KERNEL); 214 op = kzalloc(sizeof(*op), GFP_NOFS);
215 if (!op) { 215 if (!op) {
216 rv = -ENOMEM; 216 rv = -ENOMEM;
217 goto out; 217 goto out;
@@ -266,7 +266,7 @@ int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
266 if (!ls) 266 if (!ls)
267 return -EINVAL; 267 return -EINVAL;
268 268
269 op = kzalloc(sizeof(*op), GFP_KERNEL); 269 op = kzalloc(sizeof(*op), GFP_NOFS);
270 if (!op) { 270 if (!op) {
271 rv = -ENOMEM; 271 rv = -ENOMEM;
272 goto out; 272 goto out;
diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c
index 67522c268c14..3c83a49a48a3 100644
--- a/fs/dlm/rcom.c
+++ b/fs/dlm/rcom.c
@@ -38,7 +38,7 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len,
38 char *mb; 38 char *mb;
39 int mb_len = sizeof(struct dlm_rcom) + len; 39 int mb_len = sizeof(struct dlm_rcom) + len;
40 40
41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, ls->ls_allocation, &mb); 41 mh = dlm_lowcomms_get_buffer(to_nodeid, mb_len, GFP_NOFS, &mb);
42 if (!mh) { 42 if (!mh) {
43 log_print("create_rcom to %d type %d len %d ENOBUFS", 43 log_print("create_rcom to %d type %d len %d ENOBUFS",
44 to_nodeid, type, len); 44 to_nodeid, type, len);
diff --git a/fs/dlm/requestqueue.c b/fs/dlm/requestqueue.c
index 7a2307c08911..a44fa22890e1 100644
--- a/fs/dlm/requestqueue.c
+++ b/fs/dlm/requestqueue.c
@@ -35,7 +35,7 @@ void dlm_add_requestqueue(struct dlm_ls *ls, int nodeid, struct dlm_message *ms)
35 struct rq_entry *e; 35 struct rq_entry *e;
36 int length = ms->m_header.h_length - sizeof(struct dlm_message); 36 int length = ms->m_header.h_length - sizeof(struct dlm_message);
37 37
38 e = kmalloc(sizeof(struct rq_entry) + length, ls->ls_allocation); 38 e = kmalloc(sizeof(struct rq_entry) + length, GFP_NOFS);
39 if (!e) { 39 if (!e) {
40 log_print("dlm_add_requestqueue: out of memory len %d", length); 40 log_print("dlm_add_requestqueue: out of memory len %d", length);
41 return; 41 return;
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index ebce994ab0b7..e73a4bb572aa 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -267,7 +267,7 @@ static int device_user_lock(struct dlm_user_proc *proc,
267 goto out; 267 goto out;
268 } 268 }
269 269
270 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 270 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
271 if (!ua) 271 if (!ua)
272 goto out; 272 goto out;
273 ua->proc = proc; 273 ua->proc = proc;
@@ -307,7 +307,7 @@ static int device_user_unlock(struct dlm_user_proc *proc,
307 if (!ls) 307 if (!ls)
308 return -ENOENT; 308 return -ENOENT;
309 309
310 ua = kzalloc(sizeof(struct dlm_user_args), GFP_KERNEL); 310 ua = kzalloc(sizeof(struct dlm_user_args), GFP_NOFS);
311 if (!ua) 311 if (!ua)
312 goto out; 312 goto out;
313 ua->proc = proc; 313 ua->proc = proc;
@@ -352,7 +352,7 @@ static int dlm_device_register(struct dlm_ls *ls, char *name)
352 352
353 error = -ENOMEM; 353 error = -ENOMEM;
354 len = strlen(name) + strlen(name_prefix) + 2; 354 len = strlen(name) + strlen(name_prefix) + 2;
355 ls->ls_device.name = kzalloc(len, GFP_KERNEL); 355 ls->ls_device.name = kzalloc(len, GFP_NOFS);
356 if (!ls->ls_device.name) 356 if (!ls->ls_device.name)
357 goto fail; 357 goto fail;
358 358
@@ -520,7 +520,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
520#endif 520#endif
521 return -EINVAL; 521 return -EINVAL;
522 522
523 kbuf = kzalloc(count + 1, GFP_KERNEL); 523 kbuf = kzalloc(count + 1, GFP_NOFS);
524 if (!kbuf) 524 if (!kbuf)
525 return -ENOMEM; 525 return -ENOMEM;
526 526
@@ -546,7 +546,7 @@ static ssize_t device_write(struct file *file, const char __user *buf,
546 546
547 /* add 1 after namelen so that the name string is terminated */ 547 /* add 1 after namelen so that the name string is terminated */
548 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1, 548 kbuf = kzalloc(sizeof(struct dlm_write_request) + namelen + 1,
549 GFP_KERNEL); 549 GFP_NOFS);
550 if (!kbuf) { 550 if (!kbuf) {
551 kfree(k32buf); 551 kfree(k32buf);
552 return -ENOMEM; 552 return -ENOMEM;
@@ -648,7 +648,7 @@ static int device_open(struct inode *inode, struct file *file)
648 if (!ls) 648 if (!ls)
649 return -ENOENT; 649 return -ENOENT;
650 650
651 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); 651 proc = kzalloc(sizeof(struct dlm_user_proc), GFP_NOFS);
652 if (!proc) { 652 if (!proc) {
653 dlm_put_lockspace(ls); 653 dlm_put_lockspace(ls);
654 return -ENOMEM; 654 return -ENOMEM;
diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index cc2d22db119c..2d0f757fda3e 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
12# Kbuild - Gets included from the Kernels Makefile and build system 12# Kbuild - Gets included from the Kernels Makefile and build system
13# 13#
14 14
15exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o 15exofs-y := ios.o inode.o file.o symlink.o namei.o dir.o super.o
16obj-$(CONFIG_EXOFS_FS) += exofs.o 16obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
index c6718e4817fe..b1b178e61718 100644
--- a/fs/exofs/common.h
+++ b/fs/exofs/common.h
@@ -49,6 +49,7 @@
49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */ 49#define EXOFS_MIN_PID 0x10000 /* Smallest partition ID */
50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */ 50#define EXOFS_OBJ_OFF 0x10000 /* offset for objects */
51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */ 51#define EXOFS_SUPER_ID 0x10000 /* object ID for on-disk superblock */
52#define EXOFS_DEVTABLE_ID 0x10001 /* object ID for on-disk device table */
52#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */ 53#define EXOFS_ROOT_ID 0x10002 /* object ID for root directory */
53 54
54/* exofs Application specific page/attribute */ 55/* exofs Application specific page/attribute */
@@ -78,17 +79,67 @@ enum {
78#define EXOFS_SUPER_MAGIC 0x5DF5 79#define EXOFS_SUPER_MAGIC 0x5DF5
79 80
80/* 81/*
81 * The file system control block - stored in an object's data (mainly, the one 82 * The file system control block - stored in object EXOFS_SUPER_ID's data.
82 * with ID EXOFS_SUPER_ID). This is where the in-memory superblock is stored 83 * This is where the in-memory superblock is stored on disk.
83 * on disk. Right now it just has a magic value, which is basically a sanity
84 * check on our ability to communicate with the object store.
85 */ 84 */
85enum {EXOFS_FSCB_VER = 1, EXOFS_DT_VER = 1};
86struct exofs_fscb { 86struct exofs_fscb {
87 __le64 s_nextid; /* Highest object ID used */ 87 __le64 s_nextid; /* Highest object ID used */
88 __le32 s_numfiles; /* Number of files on fs */ 88 __le64 s_numfiles; /* Number of files on fs */
89 __le32 s_version; /* == EXOFS_FSCB_VER */
89 __le16 s_magic; /* Magic signature */ 90 __le16 s_magic; /* Magic signature */
90 __le16 s_newfs; /* Non-zero if this is a new fs */ 91 __le16 s_newfs; /* Non-zero if this is a new fs */
91}; 92
93 /* From here on it's a static part, only written by mkexofs */
94 __le64 s_dev_table_oid; /* Reserved, not used */
95 __le64 s_dev_table_count; /* == 0 means no dev_table */
96} __packed;
97
98/*
99 * Describes the raid used in the FS. It is part of the device table.
100 * This here is taken from the pNFS-objects definition. In exofs we
101 * use one raid policy throughout the filesystem. (NOTE: the funny
102 * alignment at the beginning. We take care of it in exofs_device_table.)
103 */
104struct exofs_dt_data_map {
105 __le32 cb_num_comps;
106 __le64 cb_stripe_unit;
107 __le32 cb_group_width;
108 __le32 cb_group_depth;
109 __le32 cb_mirror_cnt;
110 __le32 cb_raid_algorithm;
111} __packed;
112
113/*
114 * This is an osd device information descriptor. It is a single entry in
115 * the exofs device table. It describes an osd target lun which
116 * contains data belonging to this FS. (Same partition_id on all devices)
117 */
118struct exofs_dt_device_info {
119 __le32 systemid_len;
120 u8 systemid[OSD_SYSTEMID_LEN];
121 __le64 long_name_offset; /* If !0 then offset-in-file */
122 __le32 osdname_len; /* */
123 u8 osdname[44]; /* Embedded, usually an ASCII UUID */
124} __packed;
125
126/*
127 * The EXOFS device table - stored in object EXOFS_DEVTABLE_ID's data.
128 * It contains the raid used for this multi-device FS and an array of
129 * participating devices.
130 */
131struct exofs_device_table {
132 __le32 dt_version; /* == EXOFS_DT_VER */
133 struct exofs_dt_data_map dt_data_map; /* Raid policy to use */
134
135 /* Reserved space for future use. Total including this:
136 * (8 * sizeof(le64))
137 */
138 __le64 __Resurved[4];
139
140 __le64 dt_num_devices; /* Array size */
141 struct exofs_dt_device_info dt_dev_table[]; /* Array of devices */
142} __packed;
92 143
93/**************************************************************************** 144/****************************************************************************
94 * inode-related things 145 * inode-related things
@@ -155,22 +206,4 @@ enum {
155 (((name_len) + offsetof(struct exofs_dir_entry, name) + \ 206 (((name_len) + offsetof(struct exofs_dir_entry, name) + \
156 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND) 207 EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
157 208
158/*************************
159 * function declarations *
160 *************************/
161/* osd.c */
162void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
163 const struct osd_obj_id *obj);
164
165int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
166static inline int exofs_check_ok(struct osd_request *or)
167{
168 return exofs_check_ok_resid(or, NULL, NULL);
169}
170int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
171int exofs_async_op(struct osd_request *or,
172 osd_req_done_fn *async_done, void *caller_context, u8 *cred);
173
174int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
175
176#endif /*ifndef __EXOFS_COM_H__*/ 209#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 5ec72e020b22..c35fd4623986 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -30,13 +30,17 @@
30 * along with exofs; if not, write to the Free Software 30 * along with exofs; if not, write to the Free Software
31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 31 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
32 */ 32 */
33#ifndef __EXOFS_H__
34#define __EXOFS_H__
33 35
34#include <linux/fs.h> 36#include <linux/fs.h>
35#include <linux/time.h> 37#include <linux/time.h>
36#include "common.h" 38#include "common.h"
37 39
38#ifndef __EXOFS_H__ 40/* FIXME: Remove once pnfs hits mainline
39#define __EXOFS_H__ 41 * #include <linux/exportfs/pnfs_osd_xdr.h>
42 */
43#include "pnfs.h"
40 44
41#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a) 45#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
42 46
@@ -55,7 +59,7 @@
55 * our extension to the in-memory superblock 59 * our extension to the in-memory superblock
56 */ 60 */
57struct exofs_sb_info { 61struct exofs_sb_info {
58 struct osd_dev *s_dev; /* returned by get_osd_dev */ 62 struct exofs_fscb s_fscb; /* Written often, pre-allocate*/
59 osd_id s_pid; /* partition ID of file system*/ 63 osd_id s_pid; /* partition ID of file system*/
60 int s_timeout; /* timeout for OSD operations */ 64 int s_timeout; /* timeout for OSD operations */
61 uint64_t s_nextid; /* highest object ID used */ 65 uint64_t s_nextid; /* highest object ID used */
@@ -63,7 +67,11 @@ struct exofs_sb_info {
63 spinlock_t s_next_gen_lock; /* spinlock for gen # update */ 67 spinlock_t s_next_gen_lock; /* spinlock for gen # update */
64 u32 s_next_generation; /* next gen # to use */ 68 u32 s_next_generation; /* next gen # to use */
65 atomic_t s_curr_pending; /* number of pending commands */ 69 atomic_t s_curr_pending; /* number of pending commands */
66 uint8_t s_cred[OSD_CAP_LEN]; /* all-powerful credential */ 70 uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */
71
72 struct pnfs_osd_data_map data_map; /* Default raid to use */
73 unsigned s_numdevs; /* Num of devices in array */
74 struct osd_dev *s_ods[1]; /* Variable length, minimum 1 */
67}; 75};
68 76
69/* 77/*
@@ -79,6 +87,50 @@ struct exofs_i_info {
79 struct inode vfs_inode; /* normal in-memory inode */ 87 struct inode vfs_inode; /* normal in-memory inode */
80}; 88};
81 89
90static inline osd_id exofs_oi_objno(struct exofs_i_info *oi)
91{
92 return oi->vfs_inode.i_ino + EXOFS_OBJ_OFF;
93}
94
95struct exofs_io_state;
96typedef void (*exofs_io_done_fn)(struct exofs_io_state *or, void *private);
97
98struct exofs_io_state {
99 struct kref kref;
100
101 void *private;
102 exofs_io_done_fn done;
103
104 struct exofs_sb_info *sbi;
105 struct osd_obj_id obj;
106 u8 *cred;
107
108 /* Global read/write IO*/
109 loff_t offset;
110 unsigned long length;
111 void *kern_buff;
112 struct bio *bio;
113
114 /* Attributes */
115 unsigned in_attr_len;
116 struct osd_attr *in_attr;
117 unsigned out_attr_len;
118 struct osd_attr *out_attr;
119
120 /* Variable array of size numdevs */
121 unsigned numdevs;
122 struct exofs_per_dev_state {
123 struct osd_request *or;
124 struct bio *bio;
125 } per_dev[];
126};
127
128static inline unsigned exofs_io_state_size(unsigned numdevs)
129{
130 return sizeof(struct exofs_io_state) +
131 sizeof(struct exofs_per_dev_state) * numdevs;
132}
133
82/* 134/*
83 * our inode flags 135 * our inode flags
84 */ 136 */
@@ -130,6 +182,42 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
130/************************* 182/*************************
131 * function declarations * 183 * function declarations *
132 *************************/ 184 *************************/
185
186/* ios.c */
187void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
188 const struct osd_obj_id *obj);
189int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
190 u64 offset, void *p, unsigned length);
191
192int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** ios);
193void exofs_put_io_state(struct exofs_io_state *ios);
194
195int exofs_check_io(struct exofs_io_state *ios, u64 *resid);
196
197int exofs_sbi_create(struct exofs_io_state *ios);
198int exofs_sbi_remove(struct exofs_io_state *ios);
199int exofs_sbi_write(struct exofs_io_state *ios);
200int exofs_sbi_read(struct exofs_io_state *ios);
201
202int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr);
203
204int exofs_oi_truncate(struct exofs_i_info *oi, u64 new_len);
205static inline int exofs_oi_write(struct exofs_i_info *oi,
206 struct exofs_io_state *ios)
207{
208 ios->obj.id = exofs_oi_objno(oi);
209 ios->cred = oi->i_cred;
210 return exofs_sbi_write(ios);
211}
212
213static inline int exofs_oi_read(struct exofs_i_info *oi,
214 struct exofs_io_state *ios)
215{
216 ios->obj.id = exofs_oi_objno(oi);
217 ios->cred = oi->i_cred;
218 return exofs_sbi_read(ios);
219}
220
133/* inode.c */ 221/* inode.c */
134void exofs_truncate(struct inode *inode); 222void exofs_truncate(struct inode *inode);
135int exofs_setattr(struct dentry *, struct iattr *); 223int exofs_setattr(struct dentry *, struct iattr *);
@@ -169,6 +257,7 @@ extern const struct file_operations exofs_file_operations;
169 257
170/* inode.c */ 258/* inode.c */
171extern const struct address_space_operations exofs_aops; 259extern const struct address_space_operations exofs_aops;
260extern const struct osd_attr g_attr_logical_length;
172 261
173/* namei.c */ 262/* namei.c */
174extern const struct inode_operations exofs_dir_inode_operations; 263extern const struct inode_operations exofs_dir_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 6c10f7476699..698a8636d39c 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -37,15 +37,18 @@
37 37
38#include "exofs.h" 38#include "exofs.h"
39 39
40#ifdef CONFIG_EXOFS_DEBUG 40#define EXOFS_DBGMSG2(M...) do {} while (0)
41# define EXOFS_DEBUG_OBJ_ISIZE 1 41
42#endif 42enum { BIO_MAX_PAGES_KMALLOC =
43 (PAGE_SIZE - sizeof(struct bio)) / sizeof(struct bio_vec),
44};
43 45
44struct page_collect { 46struct page_collect {
45 struct exofs_sb_info *sbi; 47 struct exofs_sb_info *sbi;
46 struct request_queue *req_q; 48 struct request_queue *req_q;
47 struct inode *inode; 49 struct inode *inode;
48 unsigned expected_pages; 50 unsigned expected_pages;
51 struct exofs_io_state *ios;
49 52
50 struct bio *bio; 53 struct bio *bio;
51 unsigned nr_pages; 54 unsigned nr_pages;
@@ -54,22 +57,23 @@ struct page_collect {
54}; 57};
55 58
56static void _pcol_init(struct page_collect *pcol, unsigned expected_pages, 59static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
57 struct inode *inode) 60 struct inode *inode)
58{ 61{
59 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 62 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
60 63
61 pcol->sbi = sbi; 64 pcol->sbi = sbi;
62 pcol->req_q = osd_request_queue(sbi->s_dev); 65 /* Create master bios on first Q, later on cloning, each clone will be
66 * allocated on its destination Q
67 */
68 pcol->req_q = osd_request_queue(sbi->s_ods[0]);
63 pcol->inode = inode; 69 pcol->inode = inode;
64 pcol->expected_pages = expected_pages; 70 pcol->expected_pages = expected_pages;
65 71
72 pcol->ios = NULL;
66 pcol->bio = NULL; 73 pcol->bio = NULL;
67 pcol->nr_pages = 0; 74 pcol->nr_pages = 0;
68 pcol->length = 0; 75 pcol->length = 0;
69 pcol->pg_first = -1; 76 pcol->pg_first = -1;
70
71 EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
72 expected_pages);
73} 77}
74 78
75static void _pcol_reset(struct page_collect *pcol) 79static void _pcol_reset(struct page_collect *pcol)
@@ -80,35 +84,49 @@ static void _pcol_reset(struct page_collect *pcol)
80 pcol->nr_pages = 0; 84 pcol->nr_pages = 0;
81 pcol->length = 0; 85 pcol->length = 0;
82 pcol->pg_first = -1; 86 pcol->pg_first = -1;
83 EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n", 87 pcol->ios = NULL;
84 pcol->inode->i_ino, pcol->expected_pages);
85 88
86 /* this is probably the end of the loop but in writes 89 /* this is probably the end of the loop but in writes
87 * it might not end here. don't be left with nothing 90 * it might not end here. don't be left with nothing
88 */ 91 */
89 if (!pcol->expected_pages) 92 if (!pcol->expected_pages)
90 pcol->expected_pages = 128; 93 pcol->expected_pages = BIO_MAX_PAGES_KMALLOC;
91} 94}
92 95
93static int pcol_try_alloc(struct page_collect *pcol) 96static int pcol_try_alloc(struct page_collect *pcol)
94{ 97{
95 int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES); 98 int pages = min_t(unsigned, pcol->expected_pages,
99 BIO_MAX_PAGES_KMALLOC);
100
101 if (!pcol->ios) { /* First time allocate io_state */
102 int ret = exofs_get_io_state(pcol->sbi, &pcol->ios);
103
104 if (ret)
105 return ret;
106 }
96 107
97 for (; pages; pages >>= 1) { 108 for (; pages; pages >>= 1) {
98 pcol->bio = bio_alloc(GFP_KERNEL, pages); 109 pcol->bio = bio_kmalloc(GFP_KERNEL, pages);
99 if (likely(pcol->bio)) 110 if (likely(pcol->bio))
100 return 0; 111 return 0;
101 } 112 }
102 113
103 EXOFS_ERR("Failed to kcalloc expected_pages=%u\n", 114 EXOFS_ERR("Failed to bio_kmalloc expected_pages=%u\n",
104 pcol->expected_pages); 115 pcol->expected_pages);
105 return -ENOMEM; 116 return -ENOMEM;
106} 117}
107 118
108static void pcol_free(struct page_collect *pcol) 119static void pcol_free(struct page_collect *pcol)
109{ 120{
110 bio_put(pcol->bio); 121 if (pcol->bio) {
111 pcol->bio = NULL; 122 bio_put(pcol->bio);
123 pcol->bio = NULL;
124 }
125
126 if (pcol->ios) {
127 exofs_put_io_state(pcol->ios);
128 pcol->ios = NULL;
129 }
112} 130}
113 131
114static int pcol_add_page(struct page_collect *pcol, struct page *page, 132static int pcol_add_page(struct page_collect *pcol, struct page *page,
@@ -161,22 +179,17 @@ static void update_write_page(struct page *page, int ret)
161/* Called at the end of reads, to optionally unlock pages and update their 179/* Called at the end of reads, to optionally unlock pages and update their
162 * status. 180 * status.
163 */ 181 */
164static int __readpages_done(struct osd_request *or, struct page_collect *pcol, 182static int __readpages_done(struct page_collect *pcol, bool do_unlock)
165 bool do_unlock)
166{ 183{
167 struct bio_vec *bvec; 184 struct bio_vec *bvec;
168 int i; 185 int i;
169 u64 resid; 186 u64 resid;
170 u64 good_bytes; 187 u64 good_bytes;
171 u64 length = 0; 188 u64 length = 0;
172 int ret = exofs_check_ok_resid(or, &resid, NULL); 189 int ret = exofs_check_io(pcol->ios, &resid);
173
174 osd_end_request(or);
175 190
176 if (likely(!ret)) 191 if (likely(!ret))
177 good_bytes = pcol->length; 192 good_bytes = pcol->length;
178 else if (!resid)
179 good_bytes = 0;
180 else 193 else
181 good_bytes = pcol->length - resid; 194 good_bytes = pcol->length - resid;
182 195
@@ -198,7 +211,7 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
198 else 211 else
199 page_stat = ret; 212 page_stat = ret;
200 213
201 EXOFS_DBGMSG(" readpages_done(0x%lx, 0x%lx) %s\n", 214 EXOFS_DBGMSG2(" readpages_done(0x%lx, 0x%lx) %s\n",
202 inode->i_ino, page->index, 215 inode->i_ino, page->index,
203 page_stat ? "bad_bytes" : "good_bytes"); 216 page_stat ? "bad_bytes" : "good_bytes");
204 217
@@ -214,13 +227,13 @@ static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
214} 227}
215 228
216/* callback of async reads */ 229/* callback of async reads */
217static void readpages_done(struct osd_request *or, void *p) 230static void readpages_done(struct exofs_io_state *ios, void *p)
218{ 231{
219 struct page_collect *pcol = p; 232 struct page_collect *pcol = p;
220 233
221 __readpages_done(or, pcol, true); 234 __readpages_done(pcol, true);
222 atomic_dec(&pcol->sbi->s_curr_pending); 235 atomic_dec(&pcol->sbi->s_curr_pending);
223 kfree(p); 236 kfree(pcol);
224} 237}
225 238
226static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw) 239static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
@@ -238,17 +251,13 @@ static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
238 251
239 unlock_page(page); 252 unlock_page(page);
240 } 253 }
241 pcol_free(pcol);
242} 254}
243 255
244static int read_exec(struct page_collect *pcol, bool is_sync) 256static int read_exec(struct page_collect *pcol, bool is_sync)
245{ 257{
246 struct exofs_i_info *oi = exofs_i(pcol->inode); 258 struct exofs_i_info *oi = exofs_i(pcol->inode);
247 struct osd_obj_id obj = {pcol->sbi->s_pid, 259 struct exofs_io_state *ios = pcol->ios;
248 pcol->inode->i_ino + EXOFS_OBJ_OFF};
249 struct osd_request *or = NULL;
250 struct page_collect *pcol_copy = NULL; 260 struct page_collect *pcol_copy = NULL;
251 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
252 int ret; 261 int ret;
253 262
254 if (!pcol->bio) 263 if (!pcol->bio)
@@ -257,17 +266,13 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
257 /* see comment in _readpage() about sync reads */ 266 /* see comment in _readpage() about sync reads */
258 WARN_ON(is_sync && (pcol->nr_pages != 1)); 267 WARN_ON(is_sync && (pcol->nr_pages != 1));
259 268
260 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL); 269 ios->bio = pcol->bio;
261 if (unlikely(!or)) { 270 ios->length = pcol->length;
262 ret = -ENOMEM; 271 ios->offset = pcol->pg_first << PAGE_CACHE_SHIFT;
263 goto err;
264 }
265
266 osd_req_read(or, &obj, i_start, pcol->bio, pcol->length);
267 272
268 if (is_sync) { 273 if (is_sync) {
269 exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred); 274 exofs_oi_read(oi, pcol->ios);
270 return __readpages_done(or, pcol, false); 275 return __readpages_done(pcol, false);
271 } 276 }
272 277
273 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 278 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
@@ -277,14 +282,16 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
277 } 282 }
278 283
279 *pcol_copy = *pcol; 284 *pcol_copy = *pcol;
280 ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred); 285 ios->done = readpages_done;
286 ios->private = pcol_copy;
287 ret = exofs_oi_read(oi, ios);
281 if (unlikely(ret)) 288 if (unlikely(ret))
282 goto err; 289 goto err;
283 290
284 atomic_inc(&pcol->sbi->s_curr_pending); 291 atomic_inc(&pcol->sbi->s_curr_pending);
285 292
286 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n", 293 EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
287 obj.id, _LLU(i_start), pcol->length); 294 ios->obj.id, _LLU(ios->offset), pcol->length);
288 295
289 /* pages ownership was passed to pcol_copy */ 296 /* pages ownership was passed to pcol_copy */
290 _pcol_reset(pcol); 297 _pcol_reset(pcol);
@@ -293,12 +300,10 @@ static int read_exec(struct page_collect *pcol, bool is_sync)
293err: 300err:
294 if (!is_sync) 301 if (!is_sync)
295 _unlock_pcol_pages(pcol, ret, READ); 302 _unlock_pcol_pages(pcol, ret, READ);
296 else /* Pages unlocked by caller in sync mode only free bio */ 303
297 pcol_free(pcol); 304 pcol_free(pcol);
298 305
299 kfree(pcol_copy); 306 kfree(pcol_copy);
300 if (or)
301 osd_end_request(or);
302 return ret; 307 return ret;
303} 308}
304 309
@@ -370,12 +375,12 @@ try_again:
370 if (len != PAGE_CACHE_SIZE) 375 if (len != PAGE_CACHE_SIZE)
371 zero_user(page, len, PAGE_CACHE_SIZE - len); 376 zero_user(page, len, PAGE_CACHE_SIZE - len);
372 377
373 EXOFS_DBGMSG(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n", 378 EXOFS_DBGMSG2(" readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
374 inode->i_ino, page->index, len); 379 inode->i_ino, page->index, len);
375 380
376 ret = pcol_add_page(pcol, page, len); 381 ret = pcol_add_page(pcol, page, len);
377 if (ret) { 382 if (ret) {
378 EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p " 383 EXOFS_DBGMSG2("Failed pcol_add_page pages[i]=%p "
379 "this_len=0x%zx nr_pages=%u length=0x%lx\n", 384 "this_len=0x%zx nr_pages=%u length=0x%lx\n",
380 page, len, pcol->nr_pages, pcol->length); 385 page, len, pcol->nr_pages, pcol->length);
381 386
@@ -419,9 +424,8 @@ static int _readpage(struct page *page, bool is_sync)
419 424
420 _pcol_init(&pcol, 1, page->mapping->host); 425 _pcol_init(&pcol, 1, page->mapping->host);
421 426
422 /* readpage_strip might call read_exec(,async) inside at several places 427 /* readpage_strip might call read_exec(,is_sync==false) at several
423 * but this is safe for is_async=0 since read_exec will not do anything 428 * places but not if we have a single page.
424 * when we have a single page.
425 */ 429 */
426 ret = readpage_strip(&pcol, page); 430 ret = readpage_strip(&pcol, page);
427 if (ret) { 431 if (ret) {
@@ -440,8 +444,8 @@ static int exofs_readpage(struct file *file, struct page *page)
440 return _readpage(page, false); 444 return _readpage(page, false);
441} 445}
442 446
443/* Callback for osd_write. All writes are asynchronouse */ 447/* Callback for osd_write. All writes are asynchronous */
444static void writepages_done(struct osd_request *or, void *p) 448static void writepages_done(struct exofs_io_state *ios, void *p)
445{ 449{
446 struct page_collect *pcol = p; 450 struct page_collect *pcol = p;
447 struct bio_vec *bvec; 451 struct bio_vec *bvec;
@@ -449,16 +453,12 @@ static void writepages_done(struct osd_request *or, void *p)
449 u64 resid; 453 u64 resid;
450 u64 good_bytes; 454 u64 good_bytes;
451 u64 length = 0; 455 u64 length = 0;
456 int ret = exofs_check_io(ios, &resid);
452 457
453 int ret = exofs_check_ok_resid(or, NULL, &resid);
454
455 osd_end_request(or);
456 atomic_dec(&pcol->sbi->s_curr_pending); 458 atomic_dec(&pcol->sbi->s_curr_pending);
457 459
458 if (likely(!ret)) 460 if (likely(!ret))
459 good_bytes = pcol->length; 461 good_bytes = pcol->length;
460 else if (!resid)
461 good_bytes = 0;
462 else 462 else
463 good_bytes = pcol->length - resid; 463 good_bytes = pcol->length - resid;
464 464
@@ -482,7 +482,7 @@ static void writepages_done(struct osd_request *or, void *p)
482 482
483 update_write_page(page, page_stat); 483 update_write_page(page, page_stat);
484 unlock_page(page); 484 unlock_page(page);
485 EXOFS_DBGMSG(" writepages_done(0x%lx, 0x%lx) status=%d\n", 485 EXOFS_DBGMSG2(" writepages_done(0x%lx, 0x%lx) status=%d\n",
486 inode->i_ino, page->index, page_stat); 486 inode->i_ino, page->index, page_stat);
487 487
488 length += bvec->bv_len; 488 length += bvec->bv_len;
@@ -496,23 +496,13 @@ static void writepages_done(struct osd_request *or, void *p)
496static int write_exec(struct page_collect *pcol) 496static int write_exec(struct page_collect *pcol)
497{ 497{
498 struct exofs_i_info *oi = exofs_i(pcol->inode); 498 struct exofs_i_info *oi = exofs_i(pcol->inode);
499 struct osd_obj_id obj = {pcol->sbi->s_pid, 499 struct exofs_io_state *ios = pcol->ios;
500 pcol->inode->i_ino + EXOFS_OBJ_OFF};
501 struct osd_request *or = NULL;
502 struct page_collect *pcol_copy = NULL; 500 struct page_collect *pcol_copy = NULL;
503 loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
504 int ret; 501 int ret;
505 502
506 if (!pcol->bio) 503 if (!pcol->bio)
507 return 0; 504 return 0;
508 505
509 or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
510 if (unlikely(!or)) {
511 EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
512 ret = -ENOMEM;
513 goto err;
514 }
515
516 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL); 506 pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
517 if (!pcol_copy) { 507 if (!pcol_copy) {
518 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n"); 508 EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
@@ -523,16 +513,22 @@ static int write_exec(struct page_collect *pcol)
523 *pcol_copy = *pcol; 513 *pcol_copy = *pcol;
524 514
525 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */ 515 pcol_copy->bio->bi_rw |= (1 << BIO_RW); /* FIXME: bio_set_dir() */
526 osd_req_write(or, &obj, i_start, pcol_copy->bio, pcol_copy->length); 516
527 ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred); 517 ios->bio = pcol_copy->bio;
518 ios->offset = pcol_copy->pg_first << PAGE_CACHE_SHIFT;
519 ios->length = pcol_copy->length;
520 ios->done = writepages_done;
521 ios->private = pcol_copy;
522
523 ret = exofs_oi_write(oi, ios);
528 if (unlikely(ret)) { 524 if (unlikely(ret)) {
529 EXOFS_ERR("write_exec: exofs_async_op() Faild\n"); 525 EXOFS_ERR("write_exec: exofs_oi_write() Faild\n");
530 goto err; 526 goto err;
531 } 527 }
532 528
533 atomic_inc(&pcol->sbi->s_curr_pending); 529 atomic_inc(&pcol->sbi->s_curr_pending);
534 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n", 530 EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
535 pcol->inode->i_ino, pcol->pg_first, _LLU(i_start), 531 pcol->inode->i_ino, pcol->pg_first, _LLU(ios->offset),
536 pcol->length); 532 pcol->length);
537 /* pages ownership was passed to pcol_copy */ 533 /* pages ownership was passed to pcol_copy */
538 _pcol_reset(pcol); 534 _pcol_reset(pcol);
@@ -540,9 +536,9 @@ static int write_exec(struct page_collect *pcol)
540 536
541err: 537err:
542 _unlock_pcol_pages(pcol, ret, WRITE); 538 _unlock_pcol_pages(pcol, ret, WRITE);
539 pcol_free(pcol);
543 kfree(pcol_copy); 540 kfree(pcol_copy);
544 if (or) 541
545 osd_end_request(or);
546 return ret; 542 return ret;
547} 543}
548 544
@@ -586,6 +582,9 @@ static int writepage_strip(struct page *page,
586 if (PageError(page)) 582 if (PageError(page))
587 ClearPageError(page); 583 ClearPageError(page);
588 unlock_page(page); 584 unlock_page(page);
585 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) "
586 "outside the limits\n",
587 inode->i_ino, page->index);
589 return 0; 588 return 0;
590 } 589 }
591 } 590 }
@@ -600,6 +599,9 @@ try_again:
600 ret = write_exec(pcol); 599 ret = write_exec(pcol);
601 if (unlikely(ret)) 600 if (unlikely(ret))
602 goto fail; 601 goto fail;
602
603 EXOFS_DBGMSG("writepage_strip(0x%lx, 0x%lx) Discontinuity\n",
604 inode->i_ino, page->index);
603 goto try_again; 605 goto try_again;
604 } 606 }
605 607
@@ -609,7 +611,7 @@ try_again:
609 goto fail; 611 goto fail;
610 } 612 }
611 613
612 EXOFS_DBGMSG(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n", 614 EXOFS_DBGMSG2(" writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
613 inode->i_ino, page->index, len); 615 inode->i_ino, page->index, len);
614 616
615 ret = pcol_add_page(pcol, page, len); 617 ret = pcol_add_page(pcol, page, len);
@@ -634,6 +636,8 @@ try_again:
634 return 0; 636 return 0;
635 637
636fail: 638fail:
639 EXOFS_DBGMSG("Error: writepage_strip(0x%lx, 0x%lx)=>%d\n",
640 inode->i_ino, page->index, ret);
637 set_bit(AS_EIO, &page->mapping->flags); 641 set_bit(AS_EIO, &page->mapping->flags);
638 unlock_page(page); 642 unlock_page(page);
639 return ret; 643 return ret;
@@ -652,14 +656,17 @@ static int exofs_writepages(struct address_space *mapping,
652 wbc->range_end >> PAGE_CACHE_SHIFT; 656 wbc->range_end >> PAGE_CACHE_SHIFT;
653 657
654 if (start || end) 658 if (start || end)
655 expected_pages = min(end - start + 1, 32L); 659 expected_pages = end - start + 1;
656 else 660 else
657 expected_pages = mapping->nrpages; 661 expected_pages = mapping->nrpages;
658 662
659 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx" 663 if (expected_pages < 32L)
660 " m->nrpages=%lu start=0x%lx end=0x%lx\n", 664 expected_pages = 32L;
665
666 EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx "
667 "nrpages=%lu start=0x%lx end=0x%lx expected_pages=%ld\n",
661 mapping->host->i_ino, wbc->range_start, wbc->range_end, 668 mapping->host->i_ino, wbc->range_start, wbc->range_end,
662 mapping->nrpages, start, end); 669 mapping->nrpages, start, end, expected_pages);
663 670
664 _pcol_init(&pcol, expected_pages, mapping->host); 671 _pcol_init(&pcol, expected_pages, mapping->host);
665 672
@@ -771,19 +778,28 @@ static int exofs_get_block(struct inode *inode, sector_t iblock,
771const struct osd_attr g_attr_logical_length = ATTR_DEF( 778const struct osd_attr g_attr_logical_length = ATTR_DEF(
772 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 779 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
773 780
781static int _do_truncate(struct inode *inode)
782{
783 struct exofs_i_info *oi = exofs_i(inode);
784 loff_t isize = i_size_read(inode);
785 int ret;
786
787 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
788
789 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
790
791 ret = exofs_oi_truncate(oi, (u64)isize);
792 EXOFS_DBGMSG("(0x%lx) size=0x%llx\n", inode->i_ino, isize);
793 return ret;
794}
795
774/* 796/*
775 * Truncate a file to the specified size - all we have to do is set the size 797 * Truncate a file to the specified size - all we have to do is set the size
776 * attribute. We make sure the object exists first. 798 * attribute. We make sure the object exists first.
777 */ 799 */
778void exofs_truncate(struct inode *inode) 800void exofs_truncate(struct inode *inode)
779{ 801{
780 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
781 struct exofs_i_info *oi = exofs_i(inode); 802 struct exofs_i_info *oi = exofs_i(inode);
782 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
783 struct osd_request *or;
784 struct osd_attr attr;
785 loff_t isize = i_size_read(inode);
786 __be64 newsize;
787 int ret; 803 int ret;
788 804
789 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) 805 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
@@ -793,22 +809,6 @@ void exofs_truncate(struct inode *inode)
793 return; 809 return;
794 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 810 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
795 return; 811 return;
796 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
797
798 nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
799
800 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
801 if (unlikely(!or)) {
802 EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
803 goto fail;
804 }
805
806 osd_req_set_attributes(or, &obj);
807
808 newsize = cpu_to_be64((u64)isize);
809 attr = g_attr_logical_length;
810 attr.val_ptr = &newsize;
811 osd_req_add_set_attr_list(or, &attr, 1);
812 812
813 /* if we are about to truncate an object, and it hasn't been 813 /* if we are about to truncate an object, and it hasn't been
814 * created yet, wait 814 * created yet, wait
@@ -816,8 +816,7 @@ void exofs_truncate(struct inode *inode)
816 if (unlikely(wait_obj_created(oi))) 816 if (unlikely(wait_obj_created(oi)))
817 goto fail; 817 goto fail;
818 818
819 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 819 ret = _do_truncate(inode);
820 osd_end_request(or);
821 if (ret) 820 if (ret)
822 goto fail; 821 goto fail;
823 822
@@ -847,65 +846,62 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
847 846
848/* 847/*
849 * Read an inode from the OSD, and return it as is. We also return the size 848 * Read an inode from the OSD, and return it as is. We also return the size
850 * attribute in the 'sanity' argument if we got compiled with debugging turned 849 * attribute in the 'obj_size' argument.
851 * on.
852 */ 850 */
853static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi, 851static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
854 struct exofs_fcb *inode, uint64_t *sanity) 852 struct exofs_fcb *inode, uint64_t *obj_size)
855{ 853{
856 struct exofs_sb_info *sbi = sb->s_fs_info; 854 struct exofs_sb_info *sbi = sb->s_fs_info;
857 struct osd_request *or; 855 struct osd_attr attrs[2];
858 struct osd_attr attr; 856 struct exofs_io_state *ios;
859 struct osd_obj_id obj = {sbi->s_pid,
860 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
861 int ret; 857 int ret;
862 858
863 exofs_make_credential(oi->i_cred, &obj); 859 *obj_size = ~0;
864 860 ret = exofs_get_io_state(sbi, &ios);
865 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 861 if (unlikely(ret)) {
866 if (unlikely(!or)) { 862 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
867 EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n"); 863 return ret;
868 return -ENOMEM;
869 } 864 }
870 osd_req_get_attributes(or, &obj);
871 865
872 /* we need the inode attribute */ 866 ios->obj.id = exofs_oi_objno(oi);
873 osd_req_add_get_attr_list(or, &g_attr_inode_data, 1); 867 exofs_make_credential(oi->i_cred, &ios->obj);
868 ios->cred = oi->i_cred;
874 869
875#ifdef EXOFS_DEBUG_OBJ_ISIZE 870 attrs[0] = g_attr_inode_data;
876 /* we get the size attributes to do a sanity check */ 871 attrs[1] = g_attr_logical_length;
877 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); 872 ios->in_attr = attrs;
878#endif 873 ios->in_attr_len = ARRAY_SIZE(attrs);
879 874
880 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred); 875 ret = exofs_sbi_read(ios);
881 if (ret) 876 if (ret)
882 goto out; 877 goto out;
883 878
884 attr = g_attr_inode_data; 879 ret = extract_attr_from_ios(ios, &attrs[0]);
885 ret = extract_attr_from_req(or, &attr);
886 if (ret) { 880 if (ret) {
887 EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n"); 881 EXOFS_ERR("%s: extract_attr of inode_data failed\n", __func__);
888 goto out; 882 goto out;
889 } 883 }
884 WARN_ON(attrs[0].len != EXOFS_INO_ATTR_SIZE);
885 memcpy(inode, attrs[0].val_ptr, EXOFS_INO_ATTR_SIZE);
890 886
891 WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE); 887 ret = extract_attr_from_ios(ios, &attrs[1]);
892 memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
893
894#ifdef EXOFS_DEBUG_OBJ_ISIZE
895 attr = g_attr_logical_length;
896 ret = extract_attr_from_req(or, &attr);
897 if (ret) { 888 if (ret) {
898 EXOFS_ERR("ERROR: extract attr from or failed\n"); 889 EXOFS_ERR("%s: extract_attr of logical_length failed\n",
890 __func__);
899 goto out; 891 goto out;
900 } 892 }
901 *sanity = get_unaligned_be64(attr.val_ptr); 893 *obj_size = get_unaligned_be64(attrs[1].val_ptr);
902#endif
903 894
904out: 895out:
905 osd_end_request(or); 896 exofs_put_io_state(ios);
906 return ret; 897 return ret;
907} 898}
908 899
900static void __oi_init(struct exofs_i_info *oi)
901{
902 init_waitqueue_head(&oi->i_wq);
903 oi->i_flags = 0;
904}
909/* 905/*
910 * Fill in an inode read from the OSD and set it up for use 906 * Fill in an inode read from the OSD and set it up for use
911 */ 907 */
@@ -914,7 +910,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
914 struct exofs_i_info *oi; 910 struct exofs_i_info *oi;
915 struct exofs_fcb fcb; 911 struct exofs_fcb fcb;
916 struct inode *inode; 912 struct inode *inode;
917 uint64_t uninitialized_var(sanity); 913 uint64_t obj_size;
918 int ret; 914 int ret;
919 915
920 inode = iget_locked(sb, ino); 916 inode = iget_locked(sb, ino);
@@ -923,13 +919,13 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
923 if (!(inode->i_state & I_NEW)) 919 if (!(inode->i_state & I_NEW))
924 return inode; 920 return inode;
925 oi = exofs_i(inode); 921 oi = exofs_i(inode);
922 __oi_init(oi);
926 923
927 /* read the inode from the osd */ 924 /* read the inode from the osd */
928 ret = exofs_get_inode(sb, oi, &fcb, &sanity); 925 ret = exofs_get_inode(sb, oi, &fcb, &obj_size);
929 if (ret) 926 if (ret)
930 goto bad_inode; 927 goto bad_inode;
931 928
932 init_waitqueue_head(&oi->i_wq);
933 set_obj_created(oi); 929 set_obj_created(oi);
934 930
935 /* copy stuff from on-disk struct to in-memory struct */ 931 /* copy stuff from on-disk struct to in-memory struct */
@@ -947,14 +943,12 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
947 inode->i_blkbits = EXOFS_BLKSHIFT; 943 inode->i_blkbits = EXOFS_BLKSHIFT;
948 inode->i_generation = le32_to_cpu(fcb.i_generation); 944 inode->i_generation = le32_to_cpu(fcb.i_generation);
949 945
950#ifdef EXOFS_DEBUG_OBJ_ISIZE 946 if ((inode->i_size != obj_size) &&
951 if ((inode->i_size != sanity) &&
952 (!exofs_inode_is_fast_symlink(inode))) { 947 (!exofs_inode_is_fast_symlink(inode))) {
953 EXOFS_ERR("WARNING: Size of object from inode and " 948 EXOFS_ERR("WARNING: Size of inode=%llu != object=%llu\n",
954 "attributes differ (%lld != %llu)\n", 949 inode->i_size, _LLU(obj_size));
955 inode->i_size, _LLU(sanity)); 950 /* FIXME: call exofs_inode_recovery() */
956 } 951 }
957#endif
958 952
959 oi->i_dir_start_lookup = 0; 953 oi->i_dir_start_lookup = 0;
960 954
@@ -1020,23 +1014,30 @@ int __exofs_wait_obj_created(struct exofs_i_info *oi)
1020 * set the obj_created flag so that other methods know that the object exists on 1014 * set the obj_created flag so that other methods know that the object exists on
1021 * the OSD. 1015 * the OSD.
1022 */ 1016 */
1023static void create_done(struct osd_request *or, void *p) 1017static void create_done(struct exofs_io_state *ios, void *p)
1024{ 1018{
1025 struct inode *inode = p; 1019 struct inode *inode = p;
1026 struct exofs_i_info *oi = exofs_i(inode); 1020 struct exofs_i_info *oi = exofs_i(inode);
1027 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info; 1021 struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
1028 int ret; 1022 int ret;
1029 1023
1030 ret = exofs_check_ok(or); 1024 ret = exofs_check_io(ios, NULL);
1031 osd_end_request(or); 1025 exofs_put_io_state(ios);
1026
1032 atomic_dec(&sbi->s_curr_pending); 1027 atomic_dec(&sbi->s_curr_pending);
1033 1028
1034 if (unlikely(ret)) { 1029 if (unlikely(ret)) {
1035 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx", 1030 EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
1036 _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF)); 1031 _LLU(exofs_oi_objno(oi)), _LLU(sbi->s_pid));
1037 make_bad_inode(inode); 1032 /*TODO: When FS is corrupted creation can fail, object already
1038 } else 1033 * exist. Get rid of this asynchronous creation, if exist
1039 set_obj_created(oi); 1034 * increment the obj counter and try the next object. Until we
1035 * succeed. All these dangling objects will be made into lost
1036 * files by chkfs.exofs
1037 */
1038 }
1039
1040 set_obj_created(oi);
1040 1041
1041 atomic_dec(&inode->i_count); 1042 atomic_dec(&inode->i_count);
1042 wake_up(&oi->i_wq); 1043 wake_up(&oi->i_wq);
@@ -1051,8 +1052,7 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1051 struct inode *inode; 1052 struct inode *inode;
1052 struct exofs_i_info *oi; 1053 struct exofs_i_info *oi;
1053 struct exofs_sb_info *sbi; 1054 struct exofs_sb_info *sbi;
1054 struct osd_request *or; 1055 struct exofs_io_state *ios;
1055 struct osd_obj_id obj;
1056 int ret; 1056 int ret;
1057 1057
1058 sb = dir->i_sb; 1058 sb = dir->i_sb;
@@ -1061,8 +1061,8 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1061 return ERR_PTR(-ENOMEM); 1061 return ERR_PTR(-ENOMEM);
1062 1062
1063 oi = exofs_i(inode); 1063 oi = exofs_i(inode);
1064 __oi_init(oi);
1064 1065
1065 init_waitqueue_head(&oi->i_wq);
1066 set_obj_2bcreated(oi); 1066 set_obj_2bcreated(oi);
1067 1067
1068 sbi = sb->s_fs_info; 1068 sbi = sb->s_fs_info;
@@ -1089,28 +1089,28 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
1089 1089
1090 mark_inode_dirty(inode); 1090 mark_inode_dirty(inode);
1091 1091
1092 obj.partition = sbi->s_pid; 1092 ret = exofs_get_io_state(sbi, &ios);
1093 obj.id = inode->i_ino + EXOFS_OBJ_OFF; 1093 if (unlikely(ret)) {
1094 exofs_make_credential(oi->i_cred, &obj); 1094 EXOFS_ERR("exofs_new_inode: exofs_get_io_state failed\n");
1095 1095 return ERR_PTR(ret);
1096 or = osd_start_request(sbi->s_dev, GFP_KERNEL);
1097 if (unlikely(!or)) {
1098 EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
1099 return ERR_PTR(-ENOMEM);
1100 } 1096 }
1101 1097
1102 osd_req_create_object(or, &obj); 1098 ios->obj.id = exofs_oi_objno(oi);
1099 exofs_make_credential(oi->i_cred, &ios->obj);
1103 1100
1104 /* increment the refcount so that the inode will still be around when we 1101 /* increment the refcount so that the inode will still be around when we
1105 * reach the callback 1102 * reach the callback
1106 */ 1103 */
1107 atomic_inc(&inode->i_count); 1104 atomic_inc(&inode->i_count);
1108 1105
1109 ret = exofs_async_op(or, create_done, inode, oi->i_cred); 1106 ios->done = create_done;
1107 ios->private = inode;
1108 ios->cred = oi->i_cred;
1109 ret = exofs_sbi_create(ios);
1110 if (ret) { 1110 if (ret) {
1111 atomic_dec(&inode->i_count); 1111 atomic_dec(&inode->i_count);
1112 osd_end_request(or); 1112 exofs_put_io_state(ios);
1113 return ERR_PTR(-EIO); 1113 return ERR_PTR(ret);
1114 } 1114 }
1115 atomic_inc(&sbi->s_curr_pending); 1115 atomic_inc(&sbi->s_curr_pending);
1116 1116
@@ -1128,11 +1128,11 @@ struct updatei_args {
1128/* 1128/*
1129 * Callback function from exofs_update_inode(). 1129 * Callback function from exofs_update_inode().
1130 */ 1130 */
1131static void updatei_done(struct osd_request *or, void *p) 1131static void updatei_done(struct exofs_io_state *ios, void *p)
1132{ 1132{
1133 struct updatei_args *args = p; 1133 struct updatei_args *args = p;
1134 1134
1135 osd_end_request(or); 1135 exofs_put_io_state(ios);
1136 1136
1137 atomic_dec(&args->sbi->s_curr_pending); 1137 atomic_dec(&args->sbi->s_curr_pending);
1138 1138
@@ -1148,8 +1148,7 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1148 struct exofs_i_info *oi = exofs_i(inode); 1148 struct exofs_i_info *oi = exofs_i(inode);
1149 struct super_block *sb = inode->i_sb; 1149 struct super_block *sb = inode->i_sb;
1150 struct exofs_sb_info *sbi = sb->s_fs_info; 1150 struct exofs_sb_info *sbi = sb->s_fs_info;
1151 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; 1151 struct exofs_io_state *ios;
1152 struct osd_request *or;
1153 struct osd_attr attr; 1152 struct osd_attr attr;
1154 struct exofs_fcb *fcb; 1153 struct exofs_fcb *fcb;
1155 struct updatei_args *args; 1154 struct updatei_args *args;
@@ -1186,18 +1185,16 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1186 } else 1185 } else
1187 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data)); 1186 memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
1188 1187
1189 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 1188 ret = exofs_get_io_state(sbi, &ios);
1190 if (unlikely(!or)) { 1189 if (unlikely(ret)) {
1191 EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n"); 1190 EXOFS_ERR("%s: exofs_get_io_state failed.\n", __func__);
1192 ret = -ENOMEM;
1193 goto free_args; 1191 goto free_args;
1194 } 1192 }
1195 1193
1196 osd_req_set_attributes(or, &obj);
1197
1198 attr = g_attr_inode_data; 1194 attr = g_attr_inode_data;
1199 attr.val_ptr = fcb; 1195 attr.val_ptr = fcb;
1200 osd_req_add_set_attr_list(or, &attr, 1); 1196 ios->out_attr_len = 1;
1197 ios->out_attr = &attr;
1201 1198
1202 if (!obj_created(oi)) { 1199 if (!obj_created(oi)) {
1203 EXOFS_DBGMSG("!obj_created\n"); 1200 EXOFS_DBGMSG("!obj_created\n");
@@ -1206,22 +1203,19 @@ static int exofs_update_inode(struct inode *inode, int do_sync)
1206 EXOFS_DBGMSG("wait_event done\n"); 1203 EXOFS_DBGMSG("wait_event done\n");
1207 } 1204 }
1208 1205
1209 if (do_sync) { 1206 if (!do_sync) {
1210 ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
1211 osd_end_request(or);
1212 goto free_args;
1213 } else {
1214 args->sbi = sbi; 1207 args->sbi = sbi;
1208 ios->done = updatei_done;
1209 ios->private = args;
1210 }
1215 1211
1216 ret = exofs_async_op(or, updatei_done, args, oi->i_cred); 1212 ret = exofs_oi_write(oi, ios);
1217 if (ret) { 1213 if (!do_sync && !ret) {
1218 osd_end_request(or);
1219 goto free_args;
1220 }
1221 atomic_inc(&sbi->s_curr_pending); 1214 atomic_inc(&sbi->s_curr_pending);
1222 goto out; /* deallocation in updatei_done */ 1215 goto out; /* deallocation in updatei_done */
1223 } 1216 }
1224 1217
1218 exofs_put_io_state(ios);
1225free_args: 1219free_args:
1226 kfree(args); 1220 kfree(args);
1227out: 1221out:
@@ -1238,11 +1232,12 @@ int exofs_write_inode(struct inode *inode, int wait)
1238 * Callback function from exofs_delete_inode() - don't have much cleaning up to 1232 * Callback function from exofs_delete_inode() - don't have much cleaning up to
1239 * do. 1233 * do.
1240 */ 1234 */
1241static void delete_done(struct osd_request *or, void *p) 1235static void delete_done(struct exofs_io_state *ios, void *p)
1242{ 1236{
1243 struct exofs_sb_info *sbi; 1237 struct exofs_sb_info *sbi = p;
1244 osd_end_request(or); 1238
1245 sbi = p; 1239 exofs_put_io_state(ios);
1240
1246 atomic_dec(&sbi->s_curr_pending); 1241 atomic_dec(&sbi->s_curr_pending);
1247} 1242}
1248 1243
@@ -1256,8 +1251,7 @@ void exofs_delete_inode(struct inode *inode)
1256 struct exofs_i_info *oi = exofs_i(inode); 1251 struct exofs_i_info *oi = exofs_i(inode);
1257 struct super_block *sb = inode->i_sb; 1252 struct super_block *sb = inode->i_sb;
1258 struct exofs_sb_info *sbi = sb->s_fs_info; 1253 struct exofs_sb_info *sbi = sb->s_fs_info;
1259 struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF}; 1254 struct exofs_io_state *ios;
1260 struct osd_request *or;
1261 int ret; 1255 int ret;
1262 1256
1263 truncate_inode_pages(&inode->i_data, 0); 1257 truncate_inode_pages(&inode->i_data, 0);
@@ -1274,25 +1268,26 @@ void exofs_delete_inode(struct inode *inode)
1274 1268
1275 clear_inode(inode); 1269 clear_inode(inode);
1276 1270
1277 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 1271 ret = exofs_get_io_state(sbi, &ios);
1278 if (unlikely(!or)) { 1272 if (unlikely(ret)) {
1279 EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n"); 1273 EXOFS_ERR("%s: exofs_get_io_state failed\n", __func__);
1280 return; 1274 return;
1281 } 1275 }
1282 1276
1283 osd_req_remove_object(or, &obj);
1284
1285 /* if we are deleting an obj that hasn't been created yet, wait */ 1277 /* if we are deleting an obj that hasn't been created yet, wait */
1286 if (!obj_created(oi)) { 1278 if (!obj_created(oi)) {
1287 BUG_ON(!obj_2bcreated(oi)); 1279 BUG_ON(!obj_2bcreated(oi));
1288 wait_event(oi->i_wq, obj_created(oi)); 1280 wait_event(oi->i_wq, obj_created(oi));
1289 } 1281 }
1290 1282
1291 ret = exofs_async_op(or, delete_done, sbi, oi->i_cred); 1283 ios->obj.id = exofs_oi_objno(oi);
1284 ios->done = delete_done;
1285 ios->private = sbi;
1286 ios->cred = oi->i_cred;
1287 ret = exofs_sbi_remove(ios);
1292 if (ret) { 1288 if (ret) {
1293 EXOFS_ERR( 1289 EXOFS_ERR("%s: exofs_sbi_remove failed\n", __func__);
1294 "ERROR: @exofs_delete_inode exofs_async_op failed\n"); 1290 exofs_put_io_state(ios);
1295 osd_end_request(or);
1296 return; 1291 return;
1297 } 1292 }
1298 atomic_inc(&sbi->s_curr_pending); 1293 atomic_inc(&sbi->s_curr_pending);
diff --git a/fs/exofs/ios.c b/fs/exofs/ios.c
new file mode 100644
index 000000000000..5bad01fa1f9f
--- /dev/null
+++ b/fs/exofs/ios.c
@@ -0,0 +1,421 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <scsi/scsi_device.h>
26
27#include "exofs.h"
28
29void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
30{
31 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
32}
33
34int exofs_read_kern(struct osd_dev *od, u8 *cred, struct osd_obj_id *obj,
35 u64 offset, void *p, unsigned length)
36{
37 struct osd_request *or = osd_start_request(od, GFP_KERNEL);
38/* struct osd_sense_info osi = {.key = 0};*/
39 int ret;
40
41 if (unlikely(!or)) {
42 EXOFS_DBGMSG("%s: osd_start_request failed.\n", __func__);
43 return -ENOMEM;
44 }
45 ret = osd_req_read_kern(or, obj, offset, p, length);
46 if (unlikely(ret)) {
47 EXOFS_DBGMSG("%s: osd_req_read_kern failed.\n", __func__);
48 goto out;
49 }
50
51 ret = osd_finalize_request(or, 0, cred, NULL);
52 if (unlikely(ret)) {
53 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
54 goto out;
55 }
56
57 ret = osd_execute_request(or);
58 if (unlikely(ret))
59 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
60 /* osd_req_decode_sense(or, ret); */
61
62out:
63 osd_end_request(or);
64 return ret;
65}
66
67int exofs_get_io_state(struct exofs_sb_info *sbi, struct exofs_io_state** pios)
68{
69 struct exofs_io_state *ios;
70
71 /*TODO: Maybe use kmem_cach per sbi of size
72 * exofs_io_state_size(sbi->s_numdevs)
73 */
74 ios = kzalloc(exofs_io_state_size(sbi->s_numdevs), GFP_KERNEL);
75 if (unlikely(!ios)) {
76 *pios = NULL;
77 return -ENOMEM;
78 }
79
80 ios->sbi = sbi;
81 ios->obj.partition = sbi->s_pid;
82 *pios = ios;
83 return 0;
84}
85
86void exofs_put_io_state(struct exofs_io_state *ios)
87{
88 if (ios) {
89 unsigned i;
90
91 for (i = 0; i < ios->numdevs; i++) {
92 struct exofs_per_dev_state *per_dev = &ios->per_dev[i];
93
94 if (per_dev->or)
95 osd_end_request(per_dev->or);
96 if (per_dev->bio)
97 bio_put(per_dev->bio);
98 }
99
100 kfree(ios);
101 }
102}
103
/*
 * ->done callback installed by exofs_io_execute() for synchronous callers:
 * @p is the completion the submitter is sleeping on.
 */
static void _sync_done(struct exofs_io_state *ios, void *p)
{
	struct completion *sleeper = p;

	complete(sleeper);
}
110
111static void _last_io(struct kref *kref)
112{
113 struct exofs_io_state *ios = container_of(
114 kref, struct exofs_io_state, kref);
115
116 ios->done(ios, ios->private);
117}
118
119static void _done_io(struct osd_request *or, void *p)
120{
121 struct exofs_io_state *ios = p;
122
123 kref_put(&ios->kref, _last_io);
124}
125
126static int exofs_io_execute(struct exofs_io_state *ios)
127{
128 DECLARE_COMPLETION_ONSTACK(wait);
129 bool sync = (ios->done == NULL);
130 int i, ret;
131
132 if (sync) {
133 ios->done = _sync_done;
134 ios->private = &wait;
135 }
136
137 for (i = 0; i < ios->numdevs; i++) {
138 struct osd_request *or = ios->per_dev[i].or;
139 if (unlikely(!or))
140 continue;
141
142 ret = osd_finalize_request(or, 0, ios->cred, NULL);
143 if (unlikely(ret)) {
144 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n",
145 ret);
146 return ret;
147 }
148 }
149
150 kref_init(&ios->kref);
151
152 for (i = 0; i < ios->numdevs; i++) {
153 struct osd_request *or = ios->per_dev[i].or;
154 if (unlikely(!or))
155 continue;
156
157 kref_get(&ios->kref);
158 osd_execute_request_async(or, _done_io, ios);
159 }
160
161 kref_put(&ios->kref, _last_io);
162 ret = 0;
163
164 if (sync) {
165 wait_for_completion(&wait);
166 ret = exofs_check_io(ios, NULL);
167 }
168 return ret;
169}
170
171int exofs_check_io(struct exofs_io_state *ios, u64 *resid)
172{
173 enum osd_err_priority acumulated_osd_err = 0;
174 int acumulated_lin_err = 0;
175 int i;
176
177 for (i = 0; i < ios->numdevs; i++) {
178 struct osd_sense_info osi;
179 int ret = osd_req_decode_sense(ios->per_dev[i].or, &osi);
180
181 if (likely(!ret))
182 continue;
183
184 if (unlikely(ret == -EFAULT)) {
185 EXOFS_DBGMSG("%s: EFAULT Need page clear\n", __func__);
186 /*FIXME: All the pages in this device range should:
187 * clear_highpage(page);
188 */
189 }
190
191 if (osi.osd_err_pri >= acumulated_osd_err) {
192 acumulated_osd_err = osi.osd_err_pri;
193 acumulated_lin_err = ret;
194 }
195 }
196
197 /* TODO: raid specific residual calculations */
198 if (resid) {
199 if (likely(!acumulated_lin_err))
200 *resid = 0;
201 else
202 *resid = ios->length;
203 }
204
205 return acumulated_lin_err;
206}
207
208int exofs_sbi_create(struct exofs_io_state *ios)
209{
210 int i, ret;
211
212 for (i = 0; i < ios->sbi->s_numdevs; i++) {
213 struct osd_request *or;
214
215 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
216 if (unlikely(!or)) {
217 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
218 ret = -ENOMEM;
219 goto out;
220 }
221 ios->per_dev[i].or = or;
222 ios->numdevs++;
223
224 osd_req_create_object(or, &ios->obj);
225 }
226 ret = exofs_io_execute(ios);
227
228out:
229 return ret;
230}
231
232int exofs_sbi_remove(struct exofs_io_state *ios)
233{
234 int i, ret;
235
236 for (i = 0; i < ios->sbi->s_numdevs; i++) {
237 struct osd_request *or;
238
239 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
240 if (unlikely(!or)) {
241 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
242 ret = -ENOMEM;
243 goto out;
244 }
245 ios->per_dev[i].or = or;
246 ios->numdevs++;
247
248 osd_req_remove_object(or, &ios->obj);
249 }
250 ret = exofs_io_execute(ios);
251
252out:
253 return ret;
254}
255
256int exofs_sbi_write(struct exofs_io_state *ios)
257{
258 int i, ret;
259
260 for (i = 0; i < ios->sbi->s_numdevs; i++) {
261 struct osd_request *or;
262
263 or = osd_start_request(ios->sbi->s_ods[i], GFP_KERNEL);
264 if (unlikely(!or)) {
265 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
266 ret = -ENOMEM;
267 goto out;
268 }
269 ios->per_dev[i].or = or;
270 ios->numdevs++;
271
272 if (ios->bio) {
273 struct bio *bio;
274
275 if (i != 0) {
276 bio = bio_kmalloc(GFP_KERNEL,
277 ios->bio->bi_max_vecs);
278 if (unlikely(!bio)) {
279 ret = -ENOMEM;
280 goto out;
281 }
282
283 __bio_clone(bio, ios->bio);
284 bio->bi_bdev = NULL;
285 bio->bi_next = NULL;
286 ios->per_dev[i].bio = bio;
287 } else {
288 bio = ios->bio;
289 }
290
291 osd_req_write(or, &ios->obj, ios->offset, bio,
292 ios->length);
293/* EXOFS_DBGMSG("write sync=%d\n", sync);*/
294 } else if (ios->kern_buff) {
295 osd_req_write_kern(or, &ios->obj, ios->offset,
296 ios->kern_buff, ios->length);
297/* EXOFS_DBGMSG("write_kern sync=%d\n", sync);*/
298 } else {
299 osd_req_set_attributes(or, &ios->obj);
300/* EXOFS_DBGMSG("set_attributes sync=%d\n", sync);*/
301 }
302
303 if (ios->out_attr)
304 osd_req_add_set_attr_list(or, ios->out_attr,
305 ios->out_attr_len);
306
307 if (ios->in_attr)
308 osd_req_add_get_attr_list(or, ios->in_attr,
309 ios->in_attr_len);
310 }
311 ret = exofs_io_execute(ios);
312
313out:
314 return ret;
315}
316
317int exofs_sbi_read(struct exofs_io_state *ios)
318{
319 int i, ret;
320
321 for (i = 0; i < 1; i++) {
322 struct osd_request *or;
323 unsigned first_dev = (unsigned)ios->obj.id;
324
325 first_dev %= ios->sbi->s_numdevs;
326 or = osd_start_request(ios->sbi->s_ods[first_dev], GFP_KERNEL);
327 if (unlikely(!or)) {
328 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
329 ret = -ENOMEM;
330 goto out;
331 }
332 ios->per_dev[i].or = or;
333 ios->numdevs++;
334
335 if (ios->bio) {
336 osd_req_read(or, &ios->obj, ios->offset, ios->bio,
337 ios->length);
338/* EXOFS_DBGMSG("read sync=%d\n", sync);*/
339 } else if (ios->kern_buff) {
340 osd_req_read_kern(or, &ios->obj, ios->offset,
341 ios->kern_buff, ios->length);
342/* EXOFS_DBGMSG("read_kern sync=%d\n", sync);*/
343 } else {
344 osd_req_get_attributes(or, &ios->obj);
345/* EXOFS_DBGMSG("get_attributes sync=%d\n", sync);*/
346 }
347
348 if (ios->out_attr)
349 osd_req_add_set_attr_list(or, ios->out_attr,
350 ios->out_attr_len);
351
352 if (ios->in_attr)
353 osd_req_add_get_attr_list(or, ios->in_attr,
354 ios->in_attr_len);
355 }
356 ret = exofs_io_execute(ios);
357
358out:
359 return ret;
360}
361
362int extract_attr_from_ios(struct exofs_io_state *ios, struct osd_attr *attr)
363{
364 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
365 void *iter = NULL;
366 int nelem;
367
368 do {
369 nelem = 1;
370 osd_req_decode_get_attr_list(ios->per_dev[0].or,
371 &cur_attr, &nelem, &iter);
372 if ((cur_attr.attr_page == attr->attr_page) &&
373 (cur_attr.attr_id == attr->attr_id)) {
374 attr->len = cur_attr.len;
375 attr->val_ptr = cur_attr.val_ptr;
376 return 0;
377 }
378 } while (iter);
379
380 return -EIO;
381}
382
383int exofs_oi_truncate(struct exofs_i_info *oi, u64 size)
384{
385 struct exofs_sb_info *sbi = oi->vfs_inode.i_sb->s_fs_info;
386 struct exofs_io_state *ios;
387 struct osd_attr attr;
388 __be64 newsize;
389 int i, ret;
390
391 if (exofs_get_io_state(sbi, &ios))
392 return -ENOMEM;
393
394 ios->obj.id = exofs_oi_objno(oi);
395 ios->cred = oi->i_cred;
396
397 newsize = cpu_to_be64(size);
398 attr = g_attr_logical_length;
399 attr.val_ptr = &newsize;
400
401 for (i = 0; i < sbi->s_numdevs; i++) {
402 struct osd_request *or;
403
404 or = osd_start_request(sbi->s_ods[i], GFP_KERNEL);
405 if (unlikely(!or)) {
406 EXOFS_ERR("%s: osd_start_request failed\n", __func__);
407 ret = -ENOMEM;
408 goto out;
409 }
410 ios->per_dev[i].or = or;
411 ios->numdevs++;
412
413 osd_req_set_attributes(or, &ios->obj);
414 osd_req_add_set_attr_list(or, &attr, 1);
415 }
416 ret = exofs_io_execute(ios);
417
418out:
419 exofs_put_io_state(ios);
420 return ret;
421}
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
deleted file mode 100644
index 4372542df284..000000000000
--- a/fs/exofs/osd.c
+++ /dev/null
@@ -1,125 +0,0 @@
1/*
2 * Copyright (C) 2005, 2006
3 * Avishay Traeger (avishay@gmail.com)
4 * Copyright (C) 2008, 2009
5 * Boaz Harrosh <bharrosh@panasas.com>
6 *
7 * This file is part of exofs.
8 *
9 * exofs is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation. Since it is based on ext2, and the only
12 * valid version of GPL for the Linux kernel is version 2, the only valid
13 * version of GPL for exofs is version 2.
14 *
15 * exofs is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 * GNU General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with exofs; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
23 */
24
25#include <scsi/scsi_device.h>
26#include <scsi/osd_sense.h>
27
28#include "exofs.h"
29
30int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
31{
32 struct osd_sense_info osi;
33 int ret = osd_req_decode_sense(or, &osi);
34
35 if (ret) { /* translate to Linux codes */
36 if (osi.additional_code == scsi_invalid_field_in_cdb) {
37 if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
38 ret = -EFAULT;
39 if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
40 ret = -ENOENT;
41 else
42 ret = -EINVAL;
43 } else if (osi.additional_code == osd_quota_error)
44 ret = -ENOSPC;
45 else
46 ret = -EIO;
47 }
48
49 /* FIXME: should be include in osd_sense_info */
50 if (in_resid)
51 *in_resid = or->in.req ? or->in.req->resid_len : 0;
52
53 if (out_resid)
54 *out_resid = or->out.req ? or->out.req->resid_len : 0;
55
56 return ret;
57}
58
59void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
60{
61 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
62}
63
64/*
65 * Perform a synchronous OSD operation.
66 */
67int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
68{
69 int ret;
70
71 or->timeout = timeout;
72 ret = osd_finalize_request(or, 0, credential, NULL);
73 if (ret) {
74 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
75 return ret;
76 }
77
78 ret = osd_execute_request(or);
79
80 if (ret)
81 EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
82 /* osd_req_decode_sense(or, ret); */
83 return ret;
84}
85
86/*
87 * Perform an asynchronous OSD operation.
88 */
89int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
90 void *caller_context, u8 *cred)
91{
92 int ret;
93
94 ret = osd_finalize_request(or, 0, cred, NULL);
95 if (ret) {
96 EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
97 return ret;
98 }
99
100 ret = osd_execute_request_async(or, async_done, caller_context);
101
102 if (ret)
103 EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
104 return ret;
105}
106
107int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
108{
109 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
110 void *iter = NULL;
111 int nelem;
112
113 do {
114 nelem = 1;
115 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
116 if ((cur_attr.attr_page == attr->attr_page) &&
117 (cur_attr.attr_id == attr->attr_id)) {
118 attr->len = cur_attr.len;
119 attr->val_ptr = cur_attr.val_ptr;
120 return 0;
121 }
122 } while (iter);
123
124 return -EIO;
125}
diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h
new file mode 100644
index 000000000000..423033addd1f
--- /dev/null
+++ b/fs/exofs/pnfs.h
@@ -0,0 +1,51 @@
1/*
2 * Copyright (C) 2008, 2009
3 * Boaz Harrosh <bharrosh@panasas.com>
4 *
5 * This file is part of exofs.
6 *
7 * exofs is free software; you can redistribute it and/or modify it under the
8 * terms of the GNU General Public License version 2 as published by the Free
9 * Software Foundation.
10 *
11 */
12
13/* FIXME: Remove this file once pnfs hits mainline */
14
15#ifndef __EXOFS_PNFS_H__
16#define __EXOFS_PNFS_H__
17
18#if defined(CONFIG_PNFS)
19
20
21/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */
22#include "../nfs/objlayout/pnfs_osd_xdr.h"
23
24#else /* defined(CONFIG_PNFS) */
25
/*
 * Local stand-in, compiled only when CONFIG_PNFS is not defined (otherwise
 * pnfs_osd_xdr.h is included instead).
 * NOTE(review): presumably mirrors the pNFS layout I/O modes - confirm
 * against the pNFS headers before relying on the numeric values.
 */
26enum pnfs_iomode {
27 IOMODE_READ = 1,
28 IOMODE_RW = 2,
29 IOMODE_ANY = 3,
30};
31
32/* Layout Structure */
/*
 * RAID algorithm selectors; local stand-in compiled only when CONFIG_PNFS
 * is not defined.  Values start at 1, 0 is not assigned.
 */
33enum pnfs_osd_raid_algorithm4 {
34 PNFS_OSD_RAID_0 = 1,
35 PNFS_OSD_RAID_4 = 2,
36 PNFS_OSD_RAID_5 = 3,
37 PNFS_OSD_RAID_PQ = 4 /* Reed-Solomon P+Q */
38};
39
/*
 * Data-map description; local stand-in compiled only when CONFIG_PNFS is
 * not defined.
 * NOTE(review): field meanings are not visible here - presumably
 * odm_raid_algorithm holds an enum pnfs_osd_raid_algorithm4 value and
 * odm_stripe_unit is a byte count; confirm against pnfs_osd_xdr.h.
 */
40struct pnfs_osd_data_map {
41 u32 odm_num_comps;
42 u64 odm_stripe_unit;
43 u32 odm_group_width;
44 u32 odm_group_depth;
45 u32 odm_mirror_cnt;
46 u32 odm_raid_algorithm;
47};
48
49#endif /* else defined(CONFIG_PNFS) */
50
51#endif /* __EXOFS_PNFS_H__ */
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 9f500dec3b59..a1d1e77b12eb 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -203,49 +203,45 @@ int exofs_sync_fs(struct super_block *sb, int wait)
203{ 203{
204 struct exofs_sb_info *sbi; 204 struct exofs_sb_info *sbi;
205 struct exofs_fscb *fscb; 205 struct exofs_fscb *fscb;
206 struct osd_request *or; 206 struct exofs_io_state *ios;
207 struct osd_obj_id obj;
208 int ret = -ENOMEM; 207 int ret = -ENOMEM;
209 208
210 fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
211 if (!fscb) {
212 EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
213 return -ENOMEM;
214 }
215
216 lock_super(sb); 209 lock_super(sb);
217 sbi = sb->s_fs_info; 210 sbi = sb->s_fs_info;
211 fscb = &sbi->s_fscb;
212
213 ret = exofs_get_io_state(sbi, &ios);
214 if (ret)
215 goto out;
216
217 /* Note: We only write the changing part of the fscb, i.e. up to the
218 * fscb->s_dev_table_oid member. There is no read-modify-write
219 * here.
220 */
221 ios->length = offsetof(struct exofs_fscb, s_dev_table_oid);
222 memset(fscb, 0, ios->length);
218 fscb->s_nextid = cpu_to_le64(sbi->s_nextid); 223 fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
219 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles); 224 fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
220 fscb->s_magic = cpu_to_le16(sb->s_magic); 225 fscb->s_magic = cpu_to_le16(sb->s_magic);
221 fscb->s_newfs = 0; 226 fscb->s_newfs = 0;
227 fscb->s_version = EXOFS_FSCB_VER;
222 228
223 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 229 ios->obj.id = EXOFS_SUPER_ID;
224 if (unlikely(!or)) { 230 ios->offset = 0;
225 EXOFS_ERR("exofs_write_super: osd_start_request failed.\n"); 231 ios->kern_buff = fscb;
226 goto out; 232 ios->cred = sbi->s_cred;
227 }
228 233
229 obj.partition = sbi->s_pid; 234 ret = exofs_sbi_write(ios);
230 obj.id = EXOFS_SUPER_ID;
231 ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
232 if (unlikely(ret)) { 235 if (unlikely(ret)) {
233 EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n"); 236 EXOFS_ERR("%s: exofs_sbi_write failed.\n", __func__);
234 goto out;
235 }
236
237 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
238 if (unlikely(ret)) {
239 EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
240 goto out; 237 goto out;
241 } 238 }
242 sb->s_dirt = 0; 239 sb->s_dirt = 0;
243 240
244out: 241out:
245 if (or) 242 EXOFS_DBGMSG("s_nextid=0x%llx ret=%d\n", _LLU(sbi->s_nextid), ret);
246 osd_end_request(or); 243 exofs_put_io_state(ios);
247 unlock_super(sb); 244 unlock_super(sb);
248 kfree(fscb);
249 return ret; 245 return ret;
250} 246}
251 247
@@ -257,6 +253,29 @@ static void exofs_write_super(struct super_block *sb)
257 sb->s_dirt = 0; 253 sb->s_dirt = 0;
258} 254}
259 255
256static void _exofs_print_device(const char *msg, const char *dev_path,
257 struct osd_dev *od, u64 pid)
258{
259 const struct osd_dev_info *odi = osduld_device_info(od);
260
261 printk(KERN_NOTICE "exofs: %s %s osd_name-%s pid-0x%llx\n",
262 msg, dev_path ?: "", odi->osdname, _LLU(pid));
263}
264
265void exofs_free_sbi(struct exofs_sb_info *sbi)
266{
267 while (sbi->s_numdevs) {
268 int i = --sbi->s_numdevs;
269 struct osd_dev *od = sbi->s_ods[i];
270
271 if (od) {
272 sbi->s_ods[i] = NULL;
273 osduld_put_device(od);
274 }
275 }
276 kfree(sbi);
277}
278
260/* 279/*
261 * This function is called when the vfs is freeing the superblock. We just 280 * This function is called when the vfs is freeing the superblock. We just
262 * need to free our own part. 281 * need to free our own part.
@@ -279,11 +298,182 @@ static void exofs_put_super(struct super_block *sb)
279 msecs_to_jiffies(100)); 298 msecs_to_jiffies(100));
280 } 299 }
281 300
282 osduld_put_device(sbi->s_dev); 301 _exofs_print_device("Unmounting", NULL, sbi->s_ods[0], sbi->s_pid);
283 kfree(sb->s_fs_info); 302
303 exofs_free_sbi(sbi);
284 sb->s_fs_info = NULL; 304 sb->s_fs_info = NULL;
285} 305}
286 306
307static int _read_and_match_data_map(struct exofs_sb_info *sbi, unsigned numdevs,
308 struct exofs_device_table *dt)
309{
310 sbi->data_map.odm_num_comps =
311 le32_to_cpu(dt->dt_data_map.cb_num_comps);
312 sbi->data_map.odm_stripe_unit =
313 le64_to_cpu(dt->dt_data_map.cb_stripe_unit);
314 sbi->data_map.odm_group_width =
315 le32_to_cpu(dt->dt_data_map.cb_group_width);
316 sbi->data_map.odm_group_depth =
317 le32_to_cpu(dt->dt_data_map.cb_group_depth);
318 sbi->data_map.odm_mirror_cnt =
319 le32_to_cpu(dt->dt_data_map.cb_mirror_cnt);
320 sbi->data_map.odm_raid_algorithm =
321 le32_to_cpu(dt->dt_data_map.cb_raid_algorithm);
322
323/* FIXME: Hard-coded mirror only for now. If the layout differs, do not mount */
324 if ((sbi->data_map.odm_num_comps != numdevs) ||
325 (sbi->data_map.odm_stripe_unit != EXOFS_BLKSIZE) ||
326 (sbi->data_map.odm_raid_algorithm != PNFS_OSD_RAID_0) ||
327 (sbi->data_map.odm_mirror_cnt != (numdevs - 1)))
328 return -EINVAL;
329 else
330 return 0;
331}
332
333/* @odi is valid only as long as @dt_dev is valid */
334static int exofs_devs_2_odi(struct exofs_dt_device_info *dt_dev,
335 struct osd_dev_info *odi)
336{
337 odi->systemid_len = le32_to_cpu(dt_dev->systemid_len);
338 memcpy(odi->systemid, dt_dev->systemid, odi->systemid_len);
339
340 odi->osdname_len = le32_to_cpu(dt_dev->osdname_len);
341 odi->osdname = dt_dev->osdname;
342
343 /* FIXME support long names. Will need a _put function */
344 if (dt_dev->long_name_offset)
345 return -EINVAL;
346
347 /* Make sure osdname is printable!
348 * mkexofs should give us space for a null-terminator else the
349 * device-table is invalid.
350 */
351 if (unlikely(odi->osdname_len >= sizeof(dt_dev->osdname)))
352 odi->osdname_len = sizeof(dt_dev->osdname) - 1;
353 dt_dev->osdname[odi->osdname_len] = 0;
354
355 /* If it's all zeros, something is bad: we read past end-of-obj */
356 return !(odi->systemid_len || odi->osdname_len);
357}
358
359static int exofs_read_lookup_dev_table(struct exofs_sb_info **psbi,
360 unsigned table_count)
361{
362 struct exofs_sb_info *sbi = *psbi;
363 struct osd_dev *fscb_od;
364 struct osd_obj_id obj = {.partition = sbi->s_pid,
365 .id = EXOFS_DEVTABLE_ID};
366 struct exofs_device_table *dt;
367 unsigned table_bytes = table_count * sizeof(dt->dt_dev_table[0]) +
368 sizeof(*dt);
369 unsigned numdevs, i;
370 int ret;
371
372 dt = kmalloc(table_bytes, GFP_KERNEL);
373 if (unlikely(!dt)) {
374 EXOFS_ERR("ERROR: allocating %x bytes for device table\n",
375 table_bytes);
376 return -ENOMEM;
377 }
378
379 fscb_od = sbi->s_ods[0];
380 sbi->s_ods[0] = NULL;
381 sbi->s_numdevs = 0;
382 ret = exofs_read_kern(fscb_od, sbi->s_cred, &obj, 0, dt, table_bytes);
383 if (unlikely(ret)) {
384 EXOFS_ERR("ERROR: reading device table\n");
385 goto out;
386 }
387
388 numdevs = le64_to_cpu(dt->dt_num_devices);
389 if (unlikely(!numdevs)) {
390 ret = -EINVAL;
391 goto out;
392 }
393 WARN_ON(table_count != numdevs);
394
395 ret = _read_and_match_data_map(sbi, numdevs, dt);
396 if (unlikely(ret))
397 goto out;
398
399 if (likely(numdevs > 1)) {
400 unsigned size = numdevs * sizeof(sbi->s_ods[0]);
401
402 sbi = krealloc(sbi, sizeof(*sbi) + size, GFP_KERNEL);
403 if (unlikely(!sbi)) {
404 ret = -ENOMEM;
405 goto out;
406 }
407 memset(&sbi->s_ods[1], 0, size - sizeof(sbi->s_ods[0]));
408 *psbi = sbi;
409 }
410
411 for (i = 0; i < numdevs; i++) {
412 struct exofs_fscb fscb;
413 struct osd_dev_info odi;
414 struct osd_dev *od;
415
416 if (exofs_devs_2_odi(&dt->dt_dev_table[i], &odi)) {
417 EXOFS_ERR("ERROR: Read all-zeros device entry\n");
418 ret = -EINVAL;
419 goto out;
420 }
421
422 printk(KERN_NOTICE "Add device[%d]: osd_name-%s\n",
423 i, odi.osdname);
424
425 /* On all devices the device table is identical. The user can
426 * specify any one of the participating devices on the command
427 * line. We always keep them in device-table order.
428 */
429 if (fscb_od && osduld_device_same(fscb_od, &odi)) {
430 sbi->s_ods[i] = fscb_od;
431 ++sbi->s_numdevs;
432 fscb_od = NULL;
433 continue;
434 }
435
436 od = osduld_info_lookup(&odi);
437 if (unlikely(IS_ERR(od))) {
438 ret = PTR_ERR(od);
439 EXOFS_ERR("ERROR: device requested is not found "
440 "osd_name-%s =>%d\n", odi.osdname, ret);
441 goto out;
442 }
443
444 sbi->s_ods[i] = od;
445 ++sbi->s_numdevs;
446
447 /* Read the fscb of the other devices to make sure the FS
448 * partition is there.
449 */
450 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb,
451 sizeof(fscb));
452 if (unlikely(ret)) {
453 EXOFS_ERR("ERROR: Malformed participating device "
454 "error reading fscb osd_name-%s\n",
455 odi.osdname);
456 goto out;
457 }
458
459 /* TODO: verify other information is correct and FS-uuid
460 * matches. Benny what did you say about device table
461 * generation and old devices?
462 */
463 }
464
465out:
466 kfree(dt);
467 if (unlikely(!ret && fscb_od)) {
468 EXOFS_ERR(
469 "ERROR: Bad device-table container device not present\n");
470 osduld_put_device(fscb_od);
471 ret = -EINVAL;
472 }
473
474 return ret;
475}
476
287/* 477/*
288 * Read the superblock from the OSD and fill in the fields 478 * Read the superblock from the OSD and fill in the fields
289 */ 479 */
@@ -292,24 +482,25 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
292 struct inode *root; 482 struct inode *root;
293 struct exofs_mountopt *opts = data; 483 struct exofs_mountopt *opts = data;
294 struct exofs_sb_info *sbi; /*extended info */ 484 struct exofs_sb_info *sbi; /*extended info */
485 struct osd_dev *od; /* Master device */
295 struct exofs_fscb fscb; /*on-disk superblock info */ 486 struct exofs_fscb fscb; /*on-disk superblock info */
296 struct osd_request *or = NULL;
297 struct osd_obj_id obj; 487 struct osd_obj_id obj;
488 unsigned table_count;
298 int ret; 489 int ret;
299 490
300 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); 491 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
301 if (!sbi) 492 if (!sbi)
302 return -ENOMEM; 493 return -ENOMEM;
303 sb->s_fs_info = sbi;
304 494
305 /* use mount options to fill superblock */ 495 /* use mount options to fill superblock */
306 sbi->s_dev = osduld_path_lookup(opts->dev_name); 496 od = osduld_path_lookup(opts->dev_name);
307 if (IS_ERR(sbi->s_dev)) { 497 if (IS_ERR(od)) {
308 ret = PTR_ERR(sbi->s_dev); 498 ret = PTR_ERR(od);
309 sbi->s_dev = NULL;
310 goto free_sbi; 499 goto free_sbi;
311 } 500 }
312 501
502 sbi->s_ods[0] = od;
503 sbi->s_numdevs = 1;
313 sbi->s_pid = opts->pid; 504 sbi->s_pid = opts->pid;
314 sbi->s_timeout = opts->timeout; 505 sbi->s_timeout = opts->timeout;
315 506
@@ -323,35 +514,13 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
323 sb->s_bdev = NULL; 514 sb->s_bdev = NULL;
324 sb->s_dev = 0; 515 sb->s_dev = 0;
325 516
326 /* read data from on-disk superblock object */
327 obj.partition = sbi->s_pid; 517 obj.partition = sbi->s_pid;
328 obj.id = EXOFS_SUPER_ID; 518 obj.id = EXOFS_SUPER_ID;
329 exofs_make_credential(sbi->s_cred, &obj); 519 exofs_make_credential(sbi->s_cred, &obj);
330 520
331 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 521 ret = exofs_read_kern(od, sbi->s_cred, &obj, 0, &fscb, sizeof(fscb));
332 if (unlikely(!or)) { 522 if (unlikely(ret))
333 if (!silent)
334 EXOFS_ERR(
335 "exofs_fill_super: osd_start_request failed.\n");
336 ret = -ENOMEM;
337 goto free_sbi;
338 }
339 ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
340 if (unlikely(ret)) {
341 if (!silent)
342 EXOFS_ERR(
343 "exofs_fill_super: osd_req_read_kern failed.\n");
344 ret = -ENOMEM;
345 goto free_sbi;
346 }
347
348 ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
349 if (unlikely(ret)) {
350 if (!silent)
351 EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
352 ret = -EIO;
353 goto free_sbi; 523 goto free_sbi;
354 }
355 524
356 sb->s_magic = le16_to_cpu(fscb.s_magic); 525 sb->s_magic = le16_to_cpu(fscb.s_magic);
357 sbi->s_nextid = le64_to_cpu(fscb.s_nextid); 526 sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
@@ -364,12 +533,26 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
364 ret = -EINVAL; 533 ret = -EINVAL;
365 goto free_sbi; 534 goto free_sbi;
366 } 535 }
536 if (le32_to_cpu(fscb.s_version) != EXOFS_FSCB_VER) {
537 EXOFS_ERR("ERROR: Bad FSCB version expected-%d got-%d\n",
538 EXOFS_FSCB_VER, le32_to_cpu(fscb.s_version));
539 ret = -EINVAL;
540 goto free_sbi;
541 }
367 542
368 /* start generation numbers from a random point */ 543 /* start generation numbers from a random point */
369 get_random_bytes(&sbi->s_next_generation, sizeof(u32)); 544 get_random_bytes(&sbi->s_next_generation, sizeof(u32));
370 spin_lock_init(&sbi->s_next_gen_lock); 545 spin_lock_init(&sbi->s_next_gen_lock);
371 546
547 table_count = le64_to_cpu(fscb.s_dev_table_count);
548 if (table_count) {
549 ret = exofs_read_lookup_dev_table(&sbi, table_count);
550 if (unlikely(ret))
551 goto free_sbi;
552 }
553
372 /* set up operation vectors */ 554 /* set up operation vectors */
555 sb->s_fs_info = sbi;
373 sb->s_op = &exofs_sops; 556 sb->s_op = &exofs_sops;
374 sb->s_export_op = &exofs_export_ops; 557 sb->s_export_op = &exofs_export_ops;
375 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF); 558 root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
@@ -395,16 +578,15 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
395 goto free_sbi; 578 goto free_sbi;
396 } 579 }
397 580
398 ret = 0; 581 _exofs_print_device("Mounting", opts->dev_name, sbi->s_ods[0],
399out: 582 sbi->s_pid);
400 if (or) 583 return 0;
401 osd_end_request(or);
402 return ret;
403 584
404free_sbi: 585free_sbi:
405 osduld_put_device(sbi->s_dev); /* NULL safe */ 586 EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n",
406 kfree(sbi); 587 opts->dev_name, sbi->s_pid, ret);
407 goto out; 588 exofs_free_sbi(sbi);
589 return ret;
408} 590}
409 591
410/* 592/*
@@ -433,7 +615,7 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
433{ 615{
434 struct super_block *sb = dentry->d_sb; 616 struct super_block *sb = dentry->d_sb;
435 struct exofs_sb_info *sbi = sb->s_fs_info; 617 struct exofs_sb_info *sbi = sb->s_fs_info;
436 struct osd_obj_id obj = {sbi->s_pid, 0}; 618 struct exofs_io_state *ios;
437 struct osd_attr attrs[] = { 619 struct osd_attr attrs[] = {
438 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS, 620 ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
439 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)), 621 OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
@@ -442,32 +624,33 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
442 }; 624 };
443 uint64_t capacity = ULLONG_MAX; 625 uint64_t capacity = ULLONG_MAX;
444 uint64_t used = ULLONG_MAX; 626 uint64_t used = ULLONG_MAX;
445 struct osd_request *or;
446 uint8_t cred_a[OSD_CAP_LEN]; 627 uint8_t cred_a[OSD_CAP_LEN];
447 int ret; 628 int ret;
448 629
449 /* get used/capacity attributes */ 630 ret = exofs_get_io_state(sbi, &ios);
450 exofs_make_credential(cred_a, &obj); 631 if (ret) {
451 632 EXOFS_DBGMSG("exofs_get_io_state failed.\n");
452 or = osd_start_request(sbi->s_dev, GFP_KERNEL); 633 return ret;
453 if (unlikely(!or)) {
454 EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
455 return -ENOMEM;
456 } 634 }
457 635
458 osd_req_get_attributes(or, &obj); 636 exofs_make_credential(cred_a, &ios->obj);
459 osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs)); 637 ios->cred = sbi->s_cred;
460 ret = exofs_sync_op(or, sbi->s_timeout, cred_a); 638 ios->in_attr = attrs;
639 ios->in_attr_len = ARRAY_SIZE(attrs);
640
641 ret = exofs_sbi_read(ios);
461 if (unlikely(ret)) 642 if (unlikely(ret))
462 goto out; 643 goto out;
463 644
464 ret = extract_attr_from_req(or, &attrs[0]); 645 ret = extract_attr_from_ios(ios, &attrs[0]);
465 if (likely(!ret)) 646 if (likely(!ret)) {
466 capacity = get_unaligned_be64(attrs[0].val_ptr); 647 capacity = get_unaligned_be64(attrs[0].val_ptr);
467 else 648 if (unlikely(!capacity))
649 capacity = ULLONG_MAX;
650 } else
468 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n"); 651 EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
469 652
470 ret = extract_attr_from_req(or, &attrs[1]); 653 ret = extract_attr_from_ios(ios, &attrs[1]);
471 if (likely(!ret)) 654 if (likely(!ret))
472 used = get_unaligned_be64(attrs[1].val_ptr); 655 used = get_unaligned_be64(attrs[1].val_ptr);
473 else 656 else
@@ -476,15 +659,15 @@ static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
476 /* fill in the stats buffer */ 659 /* fill in the stats buffer */
477 buf->f_type = EXOFS_SUPER_MAGIC; 660 buf->f_type = EXOFS_SUPER_MAGIC;
478 buf->f_bsize = EXOFS_BLKSIZE; 661 buf->f_bsize = EXOFS_BLKSIZE;
479 buf->f_blocks = (capacity >> EXOFS_BLKSHIFT); 662 buf->f_blocks = capacity >> 9;
480 buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT); 663 buf->f_bfree = (capacity - used) >> 9;
481 buf->f_bavail = buf->f_bfree; 664 buf->f_bavail = buf->f_bfree;
482 buf->f_files = sbi->s_numfiles; 665 buf->f_files = sbi->s_numfiles;
483 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles; 666 buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
484 buf->f_namelen = EXOFS_NAME_LEN; 667 buf->f_namelen = EXOFS_NAME_LEN;
485 668
486out: 669out:
487 osd_end_request(or); 670 exofs_put_io_state(ios);
488 return ret; 671 return ret;
489} 672}
490 673
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 354ed3b47b30..2db957778903 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -2033,7 +2033,7 @@ static Indirect *ext3_find_shared(struct inode *inode, int depth,
2033 int k, err; 2033 int k, err;
2034 2034
2035 *top = 0; 2035 *top = 0;
2036 /* Make k index the deepest non-null offest + 1 */ 2036 /* Make k index the deepest non-null offset + 1 */
2037 for (k = depth; k > 1 && !offsets[k-1]; k--) 2037 for (k = depth; k > 1 && !offsets[k-1]; k--)
2038 ; 2038 ;
2039 partial = ext3_get_branch(inode, k, offsets, chain, &err); 2039 partial = ext3_get_branch(inode, k, offsets, chain, &err);
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 9f2d45d75b1a..9acf7e808139 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -26,6 +26,16 @@ config EXT4_FS
26 26
27 If unsure, say N. 27 If unsure, say N.
28 28
29config EXT4_USE_FOR_EXT23
30 bool "Use ext4 for ext2/ext3 file systems"
31 depends on EXT3_FS=n || EXT2_FS=n
32 default y
33 help
34 Allow the ext4 file system driver code to be used for ext2 or
35 ext3 file system mounts. This allows users to reduce their
36 compiled kernel size by using one file system driver for
37 ext2, ext3, and ext4 file systems.
38
29config EXT4_FS_XATTR 39config EXT4_FS_XATTR
30 bool "Ext4 extended attributes" 40 bool "Ext4 extended attributes"
31 depends on EXT4_FS 41 depends on EXT4_FS
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 1d0418980f8d..22bc7435d913 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -499,44 +499,6 @@ error_return:
499} 499}
500 500
501/** 501/**
502 * ext4_free_blocks() -- Free given blocks and update quota
503 * @handle: handle for this transaction
504 * @inode: inode
505 * @block: start physical block to free
506 * @count: number of blocks to count
507 * @metadata: Are these metadata blocks
508 */
509void ext4_free_blocks(handle_t *handle, struct inode *inode,
510 ext4_fsblk_t block, unsigned long count,
511 int metadata)
512{
513 struct super_block *sb;
514 unsigned long dquot_freed_blocks;
515
516 /* this isn't the right place to decide whether block is metadata
517 * inode.c/extents.c knows better, but for safety ... */
518 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
519 metadata = 1;
520
521 /* We need to make sure we don't reuse
522 * block released untill the transaction commit.
523 * writeback mode have weak data consistency so
524 * don't force data as metadata when freeing block
525 * for writeback mode.
526 */
527 if (metadata == 0 && !ext4_should_writeback_data(inode))
528 metadata = 1;
529
530 sb = inode->i_sb;
531
532 ext4_mb_free_blocks(handle, inode, block, count,
533 metadata, &dquot_freed_blocks);
534 if (dquot_freed_blocks)
535 vfs_dq_free_block(inode, dquot_freed_blocks);
536 return;
537}
538
539/**
540 * ext4_has_free_blocks() 502 * ext4_has_free_blocks()
541 * @sbi: in-core super block structure. 503 * @sbi: in-core super block structure.
542 * @nblocks: number of needed blocks 504 * @nblocks: number of needed blocks
@@ -761,7 +723,13 @@ static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb,
761static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, 723static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb,
762 ext4_group_t group) 724 ext4_group_t group)
763{ 725{
764 return ext4_bg_has_super(sb, group) ? EXT4_SB(sb)->s_gdb_count : 0; 726 if (!ext4_bg_has_super(sb, group))
727 return 0;
728
729 if (EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG))
730 return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg);
731 else
732 return EXT4_SB(sb)->s_gdb_count;
765} 733}
766 734
767/** 735/**
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 50784ef07563..4df8621ec31c 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -160,7 +160,7 @@ int ext4_setup_system_zone(struct super_block *sb)
160 if (ext4_bg_has_super(sb, i) && 160 if (ext4_bg_has_super(sb, i) &&
161 ((i < 5) || ((i % flex_size) == 0))) 161 ((i < 5) || ((i % flex_size) == 0)))
162 add_system_zone(sbi, ext4_group_first_block_no(sb, i), 162 add_system_zone(sbi, ext4_group_first_block_no(sb, i),
163 sbi->s_gdb_count + 1); 163 ext4_bg_num_gdb(sb, i) + 1);
164 gdp = ext4_get_group_desc(sb, i, NULL); 164 gdp = ext4_get_group_desc(sb, i, NULL);
165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); 165 ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
166 if (ret) 166 if (ret)
@@ -228,6 +228,7 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
228 struct rb_node *n = sbi->system_blks.rb_node; 228 struct rb_node *n = sbi->system_blks.rb_node;
229 229
230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || 230 if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
231 (start_blk + count < start_blk) ||
231 (start_blk + count > ext4_blocks_count(sbi->s_es))) 232 (start_blk + count > ext4_blocks_count(sbi->s_es)))
232 return 0; 233 return 0;
233 while (n) { 234 while (n) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 8825515eeddd..ab31e65d46d0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -376,6 +376,12 @@ struct ext4_new_group_data {
376 EXT4_GET_BLOCKS_DIO_CREATE_EXT) 376 EXT4_GET_BLOCKS_DIO_CREATE_EXT)
377 377
378/* 378/*
379 * Flags used by ext4_free_blocks
380 */
381#define EXT4_FREE_BLOCKS_METADATA 0x0001
382#define EXT4_FREE_BLOCKS_FORGET 0x0002
383
384/*
379 * ioctl commands 385 * ioctl commands
380 */ 386 */
381#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS 387#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS
@@ -703,6 +709,13 @@ struct ext4_inode_info {
703 struct list_head i_aio_dio_complete_list; 709 struct list_head i_aio_dio_complete_list;
704 /* current io_end structure for async DIO write*/ 710 /* current io_end structure for async DIO write*/
705 ext4_io_end_t *cur_aio_dio; 711 ext4_io_end_t *cur_aio_dio;
712
713 /*
714 * Transactions that contain inode's metadata needed to complete
715 * fsync and fdatasync, respectively.
716 */
717 tid_t i_sync_tid;
718 tid_t i_datasync_tid;
706}; 719};
707 720
708/* 721/*
@@ -750,6 +763,7 @@ struct ext4_inode_info {
750#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ 763#define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */
751#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ 764#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */
752#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ 765#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */
766#define EXT4_MOUNT_DISCARD 0x40000000 /* Issue DISCARD requests */
753 767
754#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt 768#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt
755#define set_opt(o, opt) o |= EXT4_MOUNT_##opt 769#define set_opt(o, opt) o |= EXT4_MOUNT_##opt
@@ -1324,8 +1338,6 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
1324 ext4_fsblk_t goal, unsigned long *count, int *errp); 1338 ext4_fsblk_t goal, unsigned long *count, int *errp);
1325extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1339extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1326extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); 1340extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks);
1327extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1328 ext4_fsblk_t block, unsigned long count, int metadata);
1329extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, 1341extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
1330 ext4_fsblk_t block, unsigned long count); 1342 ext4_fsblk_t block, unsigned long count);
1331extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); 1343extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *);
@@ -1384,16 +1396,15 @@ extern int ext4_mb_reserve_blocks(struct super_block *, int);
1384extern void ext4_discard_preallocations(struct inode *); 1396extern void ext4_discard_preallocations(struct inode *);
1385extern int __init init_ext4_mballoc(void); 1397extern int __init init_ext4_mballoc(void);
1386extern void exit_ext4_mballoc(void); 1398extern void exit_ext4_mballoc(void);
1387extern void ext4_mb_free_blocks(handle_t *, struct inode *, 1399extern void ext4_free_blocks(handle_t *handle, struct inode *inode,
1388 ext4_fsblk_t, unsigned long, int, unsigned long *); 1400 struct buffer_head *bh, ext4_fsblk_t block,
1401 unsigned long count, int flags);
1389extern int ext4_mb_add_groupinfo(struct super_block *sb, 1402extern int ext4_mb_add_groupinfo(struct super_block *sb,
1390 ext4_group_t i, struct ext4_group_desc *desc); 1403 ext4_group_t i, struct ext4_group_desc *desc);
1391extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); 1404extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
1392extern void ext4_mb_put_buddy_cache_lock(struct super_block *, 1405extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
1393 ext4_group_t, int); 1406 ext4_group_t, int);
1394/* inode.c */ 1407/* inode.c */
1395int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
1396 struct buffer_head *bh, ext4_fsblk_t blocknr);
1397struct buffer_head *ext4_getblk(handle_t *, struct inode *, 1408struct buffer_head *ext4_getblk(handle_t *, struct inode *,
1398 ext4_lblk_t, int, int *); 1409 ext4_lblk_t, int, int *);
1399struct buffer_head *ext4_bread(handle_t *, struct inode *, 1410struct buffer_head *ext4_bread(handle_t *, struct inode *,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 6a9409920dee..b57e5c711b6d 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -4,6 +4,8 @@
4 4
5#include "ext4_jbd2.h" 5#include "ext4_jbd2.h"
6 6
7#include <trace/events/ext4.h>
8
7int __ext4_journal_get_undo_access(const char *where, handle_t *handle, 9int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
8 struct buffer_head *bh) 10 struct buffer_head *bh)
9{ 11{
@@ -32,35 +34,69 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle,
32 return err; 34 return err;
33} 35}
34 36
35int __ext4_journal_forget(const char *where, handle_t *handle, 37/*
36 struct buffer_head *bh) 38 * The ext4 forget function must perform a revoke if we are freeing data
39 * which has been journaled. Metadata (eg. indirect blocks) must be
40 * revoked in all cases.
41 *
42 * "bh" may be NULL: a metadata block may have been freed from memory
43 * but there may still be a record of it in the journal, and that record
44 * still needs to be revoked.
45 *
46 * If the handle isn't valid we're not journaling, but we still need to
47 * call into ext4_journal_revoke() to put the buffer head.
48 */
49int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
50 struct inode *inode, struct buffer_head *bh,
51 ext4_fsblk_t blocknr)
37{ 52{
38 int err = 0; 53 int err;
39 54
40 if (ext4_handle_valid(handle)) { 55 might_sleep();
41 err = jbd2_journal_forget(handle, bh); 56
42 if (err) 57 trace_ext4_forget(inode, is_metadata, blocknr);
43 ext4_journal_abort_handle(where, __func__, bh, 58 BUFFER_TRACE(bh, "enter");
44 handle, err); 59
45 } 60 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
46 else 61 "data mode %x\n",
62 bh, is_metadata, inode->i_mode,
63 test_opt(inode->i_sb, DATA_FLAGS));
64
65 /* In the no journal case, we can just do a bforget and return */
66 if (!ext4_handle_valid(handle)) {
47 bforget(bh); 67 bforget(bh);
48 return err; 68 return 0;
49} 69 }
50 70
51int __ext4_journal_revoke(const char *where, handle_t *handle, 71 /* Never use the revoke function if we are doing full data
52 ext4_fsblk_t blocknr, struct buffer_head *bh) 72 * journaling: there is no need to, and a V1 superblock won't
53{ 73 * support it. Otherwise, only skip the revoke on un-journaled
54 int err = 0; 74 * data blocks. */
55 75
56 if (ext4_handle_valid(handle)) { 76 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
57 err = jbd2_journal_revoke(handle, blocknr, bh); 77 (!is_metadata && !ext4_should_journal_data(inode))) {
58 if (err) 78 if (bh) {
59 ext4_journal_abort_handle(where, __func__, bh, 79 BUFFER_TRACE(bh, "call jbd2_journal_forget");
60 handle, err); 80 err = jbd2_journal_forget(handle, bh);
81 if (err)
82 ext4_journal_abort_handle(where, __func__, bh,
83 handle, err);
84 return err;
85 }
86 return 0;
61 } 87 }
62 else 88
63 bforget(bh); 89 /*
90 * data!=journal && (is_metadata || should_journal_data(inode))
91 */
92 BUFFER_TRACE(bh, "call jbd2_journal_revoke");
93 err = jbd2_journal_revoke(handle, blocknr, bh);
94 if (err) {
95 ext4_journal_abort_handle(where, __func__, bh, handle, err);
96 ext4_abort(inode->i_sb, __func__,
97 "error %d when attempting revoke", err);
98 }
99 BUFFER_TRACE(bh, "exit");
64 return err; 100 return err;
65} 101}
66 102
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index a2865980342f..05eca817d704 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -49,7 +49,7 @@
49 49
50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ 50#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
51 EXT4_XATTR_TRANS_BLOCKS - 2 + \ 51 EXT4_XATTR_TRANS_BLOCKS - 2 + \
52 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 52 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
53 53
54/* 54/*
55 * Define the number of metadata blocks we need to account to modify data. 55 * Define the number of metadata blocks we need to account to modify data.
@@ -57,7 +57,7 @@
57 * This include super block, inode block, quota blocks and xattr blocks 57 * This include super block, inode block, quota blocks and xattr blocks
58 */ 58 */
59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ 59#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \
60 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) 60 EXT4_MAXQUOTAS_TRANS_BLOCKS(sb))
61 61
62/* Delete operations potentially hit one directory's namespace plus an 62/* Delete operations potentially hit one directory's namespace plus an
63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be 63 * entire inode, plus arbitrary amounts of bitmap/indirection data. Be
@@ -92,6 +92,7 @@
92 * but inode, sb and group updates are done only once */ 92 * but inode, sb and group updates are done only once */
93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ 93#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) 94 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
95
95#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ 96#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
96 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) 97 (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
97#else 98#else
@@ -99,6 +100,9 @@
99#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 100#define EXT4_QUOTA_INIT_BLOCKS(sb) 0
100#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 101#define EXT4_QUOTA_DEL_BLOCKS(sb) 0
101#endif 102#endif
103#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb))
104#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
105#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
102 106
103int 107int
104ext4_mark_iloc_dirty(handle_t *handle, 108ext4_mark_iloc_dirty(handle_t *handle,
@@ -116,12 +120,8 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
116int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); 120int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode);
117 121
118/* 122/*
119 * Wrapper functions with which ext4 calls into JBD. The intent here is 123 * Wrapper functions with which ext4 calls into JBD.
120 * to allow these to be turned into appropriate stubs so ext4 can control
121 * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't
122 * been done yet.
123 */ 124 */
124
125void ext4_journal_abort_handle(const char *caller, const char *err_fn, 125void ext4_journal_abort_handle(const char *caller, const char *err_fn,
126 struct buffer_head *bh, handle_t *handle, int err); 126 struct buffer_head *bh, handle_t *handle, int err);
127 127
@@ -131,13 +131,9 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle,
131int __ext4_journal_get_write_access(const char *where, handle_t *handle, 131int __ext4_journal_get_write_access(const char *where, handle_t *handle,
132 struct buffer_head *bh); 132 struct buffer_head *bh);
133 133
134/* When called with an invalid handle, this will still do a put on the BH */ 134int __ext4_forget(const char *where, handle_t *handle, int is_metadata,
135int __ext4_journal_forget(const char *where, handle_t *handle, 135 struct inode *inode, struct buffer_head *bh,
136 struct buffer_head *bh); 136 ext4_fsblk_t blocknr);
137
138/* When called with an invalid handle, this will still do a put on the BH */
139int __ext4_journal_revoke(const char *where, handle_t *handle,
140 ext4_fsblk_t blocknr, struct buffer_head *bh);
141 137
142int __ext4_journal_get_create_access(const char *where, 138int __ext4_journal_get_create_access(const char *where,
143 handle_t *handle, struct buffer_head *bh); 139 handle_t *handle, struct buffer_head *bh);
@@ -149,12 +145,11 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle,
149 __ext4_journal_get_undo_access(__func__, (handle), (bh)) 145 __ext4_journal_get_undo_access(__func__, (handle), (bh))
150#define ext4_journal_get_write_access(handle, bh) \ 146#define ext4_journal_get_write_access(handle, bh) \
151 __ext4_journal_get_write_access(__func__, (handle), (bh)) 147 __ext4_journal_get_write_access(__func__, (handle), (bh))
152#define ext4_journal_revoke(handle, blocknr, bh) \ 148#define ext4_forget(handle, is_metadata, inode, bh, block_nr) \
153 __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) 149 __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\
150 (block_nr))
154#define ext4_journal_get_create_access(handle, bh) \ 151#define ext4_journal_get_create_access(handle, bh) \
155 __ext4_journal_get_create_access(__func__, (handle), (bh)) 152 __ext4_journal_get_create_access(__func__, (handle), (bh))
156#define ext4_journal_forget(handle, bh) \
157 __ext4_journal_forget(__func__, (handle), (bh))
158#define ext4_handle_dirty_metadata(handle, inode, bh) \ 153#define ext4_handle_dirty_metadata(handle, inode, bh) \
159 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) 154 __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh))
160 155
@@ -254,6 +249,19 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode)
254 return 0; 249 return 0;
255} 250}
256 251
252static inline void ext4_update_inode_fsync_trans(handle_t *handle,
253 struct inode *inode,
254 int datasync)
255{
256 struct ext4_inode_info *ei = EXT4_I(inode);
257
258 if (ext4_handle_valid(handle)) {
259 ei->i_sync_tid = handle->h_transaction->t_tid;
260 if (datasync)
261 ei->i_datasync_tid = handle->h_transaction->t_tid;
262 }
263}
264
257/* super.c */ 265/* super.c */
258int ext4_force_commit(struct super_block *sb); 266int ext4_force_commit(struct super_block *sb);
259 267
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 715264b4bae4..3a7928f825e4 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -1007,7 +1007,8 @@ cleanup:
1007 for (i = 0; i < depth; i++) { 1007 for (i = 0; i < depth; i++) {
1008 if (!ablocks[i]) 1008 if (!ablocks[i])
1009 continue; 1009 continue;
1010 ext4_free_blocks(handle, inode, ablocks[i], 1, 1); 1010 ext4_free_blocks(handle, inode, 0, ablocks[i], 1,
1011 EXT4_FREE_BLOCKS_METADATA);
1011 } 1012 }
1012 } 1013 }
1013 kfree(ablocks); 1014 kfree(ablocks);
@@ -1761,7 +1762,9 @@ int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block,
1761 while (block < last && block != EXT_MAX_BLOCK) { 1762 while (block < last && block != EXT_MAX_BLOCK) {
1762 num = last - block; 1763 num = last - block;
1763 /* find extent for this block */ 1764 /* find extent for this block */
1765 down_read(&EXT4_I(inode)->i_data_sem);
1764 path = ext4_ext_find_extent(inode, block, path); 1766 path = ext4_ext_find_extent(inode, block, path);
1767 up_read(&EXT4_I(inode)->i_data_sem);
1765 if (IS_ERR(path)) { 1768 if (IS_ERR(path)) {
1766 err = PTR_ERR(path); 1769 err = PTR_ERR(path);
1767 path = NULL; 1770 path = NULL;
@@ -1957,7 +1960,6 @@ errout:
1957static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, 1960static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1958 struct ext4_ext_path *path) 1961 struct ext4_ext_path *path)
1959{ 1962{
1960 struct buffer_head *bh;
1961 int err; 1963 int err;
1962 ext4_fsblk_t leaf; 1964 ext4_fsblk_t leaf;
1963 1965
@@ -1973,9 +1975,8 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode,
1973 if (err) 1975 if (err)
1974 return err; 1976 return err;
1975 ext_debug("index is empty, remove it, free block %llu\n", leaf); 1977 ext_debug("index is empty, remove it, free block %llu\n", leaf);
1976 bh = sb_find_get_block(inode->i_sb, leaf); 1978 ext4_free_blocks(handle, inode, 0, leaf, 1,
1977 ext4_forget(handle, 1, inode, bh, leaf); 1979 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
1978 ext4_free_blocks(handle, inode, leaf, 1, 1);
1979 return err; 1980 return err;
1980} 1981}
1981 1982
@@ -2042,12 +2043,11 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2042 struct ext4_extent *ex, 2043 struct ext4_extent *ex,
2043 ext4_lblk_t from, ext4_lblk_t to) 2044 ext4_lblk_t from, ext4_lblk_t to)
2044{ 2045{
2045 struct buffer_head *bh;
2046 unsigned short ee_len = ext4_ext_get_actual_len(ex); 2046 unsigned short ee_len = ext4_ext_get_actual_len(ex);
2047 int i, metadata = 0; 2047 int flags = EXT4_FREE_BLOCKS_FORGET;
2048 2048
2049 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) 2049 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
2050 metadata = 1; 2050 flags |= EXT4_FREE_BLOCKS_METADATA;
2051#ifdef EXTENTS_STATS 2051#ifdef EXTENTS_STATS
2052 { 2052 {
2053 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 2053 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -2072,11 +2072,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode,
2072 num = le32_to_cpu(ex->ee_block) + ee_len - from; 2072 num = le32_to_cpu(ex->ee_block) + ee_len - from;
2073 start = ext_pblock(ex) + ee_len - num; 2073 start = ext_pblock(ex) + ee_len - num;
2074 ext_debug("free last %u blocks starting %llu\n", num, start); 2074 ext_debug("free last %u blocks starting %llu\n", num, start);
2075 for (i = 0; i < num; i++) { 2075 ext4_free_blocks(handle, inode, 0, start, num, flags);
2076 bh = sb_find_get_block(inode->i_sb, start + i);
2077 ext4_forget(handle, 0, inode, bh, start + i);
2078 }
2079 ext4_free_blocks(handle, inode, start, num, metadata);
2080 } else if (from == le32_to_cpu(ex->ee_block) 2076 } else if (from == le32_to_cpu(ex->ee_block)
2081 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) { 2077 && to <= le32_to_cpu(ex->ee_block) + ee_len - 1) {
2082 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n", 2078 printk(KERN_INFO "strange request: removal %u-%u from %u:%u\n",
@@ -2167,7 +2163,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
2167 correct_index = 1; 2163 correct_index = 1;
2168 credits += (ext_depth(inode)) + 1; 2164 credits += (ext_depth(inode)) + 1;
2169 } 2165 }
2170 credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 2166 credits += EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
2171 2167
2172 err = ext4_ext_truncate_extend_restart(handle, inode, credits); 2168 err = ext4_ext_truncate_extend_restart(handle, inode, credits);
2173 if (err) 2169 if (err)
@@ -3064,6 +3060,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3064 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { 3060 if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) {
3065 ret = ext4_convert_unwritten_extents_dio(handle, inode, 3061 ret = ext4_convert_unwritten_extents_dio(handle, inode,
3066 path); 3062 path);
3063 if (ret >= 0)
3064 ext4_update_inode_fsync_trans(handle, inode, 1);
3067 goto out2; 3065 goto out2;
3068 } 3066 }
3069 /* buffered IO case */ 3067 /* buffered IO case */
@@ -3091,6 +3089,8 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3091 ret = ext4_ext_convert_to_initialized(handle, inode, 3089 ret = ext4_ext_convert_to_initialized(handle, inode,
3092 path, iblock, 3090 path, iblock,
3093 max_blocks); 3091 max_blocks);
3092 if (ret >= 0)
3093 ext4_update_inode_fsync_trans(handle, inode, 1);
3094out: 3094out:
3095 if (ret <= 0) { 3095 if (ret <= 0) {
3096 err = ret; 3096 err = ret;
@@ -3319,8 +3319,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3319 /* not a good idea to call discard here directly, 3319 /* not a good idea to call discard here directly,
3320 * but otherwise we'd need to call it every free() */ 3320 * but otherwise we'd need to call it every free() */
3321 ext4_discard_preallocations(inode); 3321 ext4_discard_preallocations(inode);
3322 ext4_free_blocks(handle, inode, ext_pblock(&newex), 3322 ext4_free_blocks(handle, inode, 0, ext_pblock(&newex),
3323 ext4_ext_get_actual_len(&newex), 0); 3323 ext4_ext_get_actual_len(&newex), 0);
3324 goto out2; 3324 goto out2;
3325 } 3325 }
3326 3326
@@ -3329,10 +3329,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 allocated = ext4_ext_get_actual_len(&newex); 3329 allocated = ext4_ext_get_actual_len(&newex);
3330 set_buffer_new(bh_result); 3330 set_buffer_new(bh_result);
3331 3331
3332 /* Cache only when it is _not_ an uninitialized extent */ 3332 /*
3333 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) 3333 * Cache the extent and update transaction to commit on fdatasync only
3334 * when it is _not_ an uninitialized extent.
3335 */
3336 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3334 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3337 ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
3335 EXT4_EXT_CACHE_EXTENT); 3338 EXT4_EXT_CACHE_EXTENT);
3339 ext4_update_inode_fsync_trans(handle, inode, 1);
3340 } else
3341 ext4_update_inode_fsync_trans(handle, inode, 0);
3336out: 3342out:
3337 if (allocated > max_blocks) 3343 if (allocated > max_blocks)
3338 allocated = max_blocks; 3344 allocated = max_blocks;
@@ -3720,10 +3726,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3720 * Walk the extent tree gathering extent information. 3726 * Walk the extent tree gathering extent information.
3721 * ext4_ext_fiemap_cb will push extents back to user. 3727 * ext4_ext_fiemap_cb will push extents back to user.
3722 */ 3728 */
3723 down_read(&EXT4_I(inode)->i_data_sem);
3724 error = ext4_ext_walk_space(inode, start_blk, len_blks, 3729 error = ext4_ext_walk_space(inode, start_blk, len_blks,
3725 ext4_ext_fiemap_cb, fieinfo); 3730 ext4_ext_fiemap_cb, fieinfo);
3726 up_read(&EXT4_I(inode)->i_data_sem);
3727 } 3731 }
3728 3732
3729 return error; 3733 return error;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 2b1531266ee2..0b22497d92e1 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -51,25 +51,30 @@
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
52{ 52{
53 struct inode *inode = dentry->d_inode; 53 struct inode *inode = dentry->d_inode;
54 struct ext4_inode_info *ei = EXT4_I(inode);
54 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
55 int err, ret = 0; 56 int ret;
57 tid_t commit_tid;
56 58
57 J_ASSERT(ext4_journal_current_handle() == NULL); 59 J_ASSERT(ext4_journal_current_handle() == NULL);
58 60
59 trace_ext4_sync_file(file, dentry, datasync); 61 trace_ext4_sync_file(file, dentry, datasync);
60 62
63 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0;
65
61 ret = flush_aio_dio_completed_IO(inode); 66 ret = flush_aio_dio_completed_IO(inode);
62 if (ret < 0) 67 if (ret < 0)
63 goto out; 68 return ret;
69
70 if (!journal)
71 return simple_fsync(file, dentry, datasync);
72
64 /* 73 /*
65 * data=writeback: 74 * data=writeback,ordered:
66 * The caller's filemap_fdatawrite()/wait will sync the data. 75 * The caller's filemap_fdatawrite()/wait will sync the data.
67 * sync_inode() will sync the metadata 76 * Metadata is in the journal, we wait for proper transaction to
68 * 77 * commit here.
69 * data=ordered:
70 * The caller's filemap_fdatawrite() will write the data and
71 * sync_inode() will write the inode if it is dirty. Then the caller's
72 * filemap_fdatawait() will wait on the pages.
73 * 78 *
74 * data=journal: 79 * data=journal:
75 * filemap_fdatawrite won't do anything (the buffers are clean). 80 * filemap_fdatawrite won't do anything (the buffers are clean).
@@ -79,32 +84,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
79 * (they were dirtied by commit). But that's OK - the blocks are 84 * (they were dirtied by commit). But that's OK - the blocks are
80 * safe in-journal, which is all fsync() needs to ensure. 85 * safe in-journal, which is all fsync() needs to ensure.
81 */ 86 */
82 if (ext4_should_journal_data(inode)) { 87 if (ext4_should_journal_data(inode))
83 ret = ext4_force_commit(inode->i_sb); 88 return ext4_force_commit(inode->i_sb);
84 goto out;
85 }
86 89
87 if (!journal) 90 commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
88 ret = sync_mapping_buffers(inode->i_mapping); 91 if (jbd2_log_start_commit(journal, commit_tid))
89 92 jbd2_log_wait_commit(journal, commit_tid);
90 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 93 else if (journal->j_flags & JBD2_BARRIER)
91 goto out;
92
93 /*
94 * The VFS has written the file data. If the inode is unaltered
95 * then we need not start a commit.
96 */
97 if (inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC)) {
98 struct writeback_control wbc = {
99 .sync_mode = WB_SYNC_ALL,
100 .nr_to_write = 0, /* sys_fsync did this */
101 };
102 err = sync_inode(inode, &wbc);
103 if (ret == 0)
104 ret = err;
105 }
106out:
107 if (journal && (journal->j_flags & JBD2_BARRIER))
108 blkdev_issue_flush(inode->i_sb->s_bdev, NULL); 94 blkdev_issue_flush(inode->i_sb->s_bdev, NULL);
109 return ret; 95 return ret;
110} 96}
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 2c8caa51addb..5352db1a3086 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -71,58 +71,6 @@ static int ext4_inode_is_fast_symlink(struct inode *inode)
71} 71}
72 72
73/* 73/*
74 * The ext4 forget function must perform a revoke if we are freeing data
75 * which has been journaled. Metadata (eg. indirect blocks) must be
76 * revoked in all cases.
77 *
78 * "bh" may be NULL: a metadata block may have been freed from memory
79 * but there may still be a record of it in the journal, and that record
80 * still needs to be revoked.
81 *
82 * If the handle isn't valid we're not journaling, but we still need to
83 * call into ext4_journal_revoke() to put the buffer head.
84 */
85int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode,
86 struct buffer_head *bh, ext4_fsblk_t blocknr)
87{
88 int err;
89
90 might_sleep();
91
92 BUFFER_TRACE(bh, "enter");
93
94 jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
95 "data mode %x\n",
96 bh, is_metadata, inode->i_mode,
97 test_opt(inode->i_sb, DATA_FLAGS));
98
99 /* Never use the revoke function if we are doing full data
100 * journaling: there is no need to, and a V1 superblock won't
101 * support it. Otherwise, only skip the revoke on un-journaled
102 * data blocks. */
103
104 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
105 (!is_metadata && !ext4_should_journal_data(inode))) {
106 if (bh) {
107 BUFFER_TRACE(bh, "call jbd2_journal_forget");
108 return ext4_journal_forget(handle, bh);
109 }
110 return 0;
111 }
112
113 /*
114 * data!=journal && (is_metadata || should_journal_data(inode))
115 */
116 BUFFER_TRACE(bh, "call ext4_journal_revoke");
117 err = ext4_journal_revoke(handle, blocknr, bh);
118 if (err)
119 ext4_abort(inode->i_sb, __func__,
120 "error %d when attempting revoke", err);
121 BUFFER_TRACE(bh, "exit");
122 return err;
123}
124
125/*
126 * Work out how many blocks we need to proceed with the next chunk of a 74 * Work out how many blocks we need to proceed with the next chunk of a
127 * truncate transaction. 75 * truncate transaction.
128 */ 76 */
@@ -721,7 +669,7 @@ allocated:
721 return ret; 669 return ret;
722failed_out: 670failed_out:
723 for (i = 0; i < index; i++) 671 for (i = 0; i < index; i++)
724 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 672 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
725 return ret; 673 return ret;
726} 674}
727 675
@@ -817,14 +765,20 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode,
817 return err; 765 return err;
818failed: 766failed:
819 /* Allocation failed, free what we already allocated */ 767 /* Allocation failed, free what we already allocated */
768 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
820 for (i = 1; i <= n ; i++) { 769 for (i = 1; i <= n ; i++) {
821 BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); 770 /*
822 ext4_journal_forget(handle, branch[i].bh); 771 * branch[i].bh is newly allocated, so there is no
772 * need to revoke the block, which is why we don't
773 * need to set EXT4_FREE_BLOCKS_METADATA.
774 */
775 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1,
776 EXT4_FREE_BLOCKS_FORGET);
823 } 777 }
824 for (i = 0; i < indirect_blks; i++) 778 for (i = n+1; i < indirect_blks; i++)
825 ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); 779 ext4_free_blocks(handle, inode, 0, new_blocks[i], 1, 0);
826 780
827 ext4_free_blocks(handle, inode, new_blocks[i], num, 0); 781 ext4_free_blocks(handle, inode, 0, new_blocks[i], num, 0);
828 782
829 return err; 783 return err;
830} 784}
@@ -903,12 +857,16 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
903 857
904err_out: 858err_out:
905 for (i = 1; i <= num; i++) { 859 for (i = 1; i <= num; i++) {
906 BUFFER_TRACE(where[i].bh, "call jbd2_journal_forget"); 860 /*
907 ext4_journal_forget(handle, where[i].bh); 861 * branch[i].bh is newly allocated, so there is no
908 ext4_free_blocks(handle, inode, 862 * need to revoke the block, which is why we don't
909 le32_to_cpu(where[i-1].key), 1, 0); 863 * need to set EXT4_FREE_BLOCKS_METADATA.
864 */
865 ext4_free_blocks(handle, inode, where[i].bh, 0, 1,
866 EXT4_FREE_BLOCKS_FORGET);
910 } 867 }
911 ext4_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks, 0); 868 ext4_free_blocks(handle, inode, 0, le32_to_cpu(where[num].key),
869 blks, 0);
912 870
913 return err; 871 return err;
914} 872}
@@ -1021,10 +979,12 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
1021 if (!err) 979 if (!err)
1022 err = ext4_splice_branch(handle, inode, iblock, 980 err = ext4_splice_branch(handle, inode, iblock,
1023 partial, indirect_blks, count); 981 partial, indirect_blks, count);
1024 else 982 if (err)
1025 goto cleanup; 983 goto cleanup;
1026 984
1027 set_buffer_new(bh_result); 985 set_buffer_new(bh_result);
986
987 ext4_update_inode_fsync_trans(handle, inode, 1);
1028got_it: 988got_it:
1029 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 989 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
1030 if (count > blocks_to_boundary) 990 if (count > blocks_to_boundary)
@@ -1052,7 +1012,7 @@ qsize_t ext4_get_reserved_space(struct inode *inode)
1052 EXT4_I(inode)->i_reserved_meta_blocks; 1012 EXT4_I(inode)->i_reserved_meta_blocks;
1053 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1013 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1054 1014
1055 return total; 1015 return (total << inode->i_blkbits);
1056} 1016}
1057/* 1017/*
1058 * Calculate the number of metadata blocks need to reserve 1018 * Calculate the number of metadata blocks need to reserve
@@ -1534,6 +1494,16 @@ static int do_journal_get_write_access(handle_t *handle,
1534 return ext4_journal_get_write_access(handle, bh); 1494 return ext4_journal_get_write_access(handle, bh);
1535} 1495}
1536 1496
1497/*
1498 * Truncate blocks that were not used by write. We have to truncate the
1499 * pagecache as well so that corresponding buffers get properly unmapped.
1500 */
1501static void ext4_truncate_failed_write(struct inode *inode)
1502{
1503 truncate_inode_pages(inode->i_mapping, inode->i_size);
1504 ext4_truncate(inode);
1505}
1506
1537static int ext4_write_begin(struct file *file, struct address_space *mapping, 1507static int ext4_write_begin(struct file *file, struct address_space *mapping,
1538 loff_t pos, unsigned len, unsigned flags, 1508 loff_t pos, unsigned len, unsigned flags,
1539 struct page **pagep, void **fsdata) 1509 struct page **pagep, void **fsdata)
@@ -1599,7 +1569,7 @@ retry:
1599 1569
1600 ext4_journal_stop(handle); 1570 ext4_journal_stop(handle);
1601 if (pos + len > inode->i_size) { 1571 if (pos + len > inode->i_size) {
1602 ext4_truncate(inode); 1572 ext4_truncate_failed_write(inode);
1603 /* 1573 /*
1604 * If truncate failed early the inode might 1574 * If truncate failed early the inode might
1605 * still be on the orphan list; we need to 1575 * still be on the orphan list; we need to
@@ -1709,7 +1679,7 @@ static int ext4_ordered_write_end(struct file *file,
1709 ret = ret2; 1679 ret = ret2;
1710 1680
1711 if (pos + len > inode->i_size) { 1681 if (pos + len > inode->i_size) {
1712 ext4_truncate(inode); 1682 ext4_truncate_failed_write(inode);
1713 /* 1683 /*
1714 * If truncate failed early the inode might still be 1684 * If truncate failed early the inode might still be
1715 * on the orphan list; we need to make sure the inode 1685 * on the orphan list; we need to make sure the inode
@@ -1751,7 +1721,7 @@ static int ext4_writeback_write_end(struct file *file,
1751 ret = ret2; 1721 ret = ret2;
1752 1722
1753 if (pos + len > inode->i_size) { 1723 if (pos + len > inode->i_size) {
1754 ext4_truncate(inode); 1724 ext4_truncate_failed_write(inode);
1755 /* 1725 /*
1756 * If truncate failed early the inode might still be 1726 * If truncate failed early the inode might still be
1757 * on the orphan list; we need to make sure the inode 1727 * on the orphan list; we need to make sure the inode
@@ -1814,7 +1784,7 @@ static int ext4_journalled_write_end(struct file *file,
1814 if (!ret) 1784 if (!ret)
1815 ret = ret2; 1785 ret = ret2;
1816 if (pos + len > inode->i_size) { 1786 if (pos + len > inode->i_size) {
1817 ext4_truncate(inode); 1787 ext4_truncate_failed_write(inode);
1818 /* 1788 /*
1819 * If truncate failed early the inode might still be 1789 * If truncate failed early the inode might still be
1820 * on the orphan list; we need to make sure the inode 1790 * on the orphan list; we need to make sure the inode
@@ -2600,7 +2570,6 @@ static int bput_one(handle_t *handle, struct buffer_head *bh)
2600} 2570}
2601 2571
2602static int __ext4_journalled_writepage(struct page *page, 2572static int __ext4_journalled_writepage(struct page *page,
2603 struct writeback_control *wbc,
2604 unsigned int len) 2573 unsigned int len)
2605{ 2574{
2606 struct address_space *mapping = page->mapping; 2575 struct address_space *mapping = page->mapping;
@@ -2758,7 +2727,7 @@ static int ext4_writepage(struct page *page,
2758 * doesn't seem much point in redirtying the page here. 2727 * doesn't seem much point in redirtying the page here.
2759 */ 2728 */
2760 ClearPageChecked(page); 2729 ClearPageChecked(page);
2761 return __ext4_journalled_writepage(page, wbc, len); 2730 return __ext4_journalled_writepage(page, len);
2762 } 2731 }
2763 2732
2764 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) 2733 if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
@@ -2788,7 +2757,7 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2788 * number of contiguous block. So we will limit 2757 * number of contiguous block. So we will limit
2789 * number of contiguous block to a sane value 2758 * number of contiguous block to a sane value
2790 */ 2759 */
2791 if (!(inode->i_flags & EXT4_EXTENTS_FL) && 2760 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
2792 (max_blocks > EXT4_MAX_TRANS_DATA)) 2761 (max_blocks > EXT4_MAX_TRANS_DATA))
2793 max_blocks = EXT4_MAX_TRANS_DATA; 2762 max_blocks = EXT4_MAX_TRANS_DATA;
2794 2763
@@ -2933,7 +2902,7 @@ retry:
2933 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 2902 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
2934 &mpd); 2903 &mpd);
2935 /* 2904 /*
2936 * If we have a contigous extent of pages and we 2905 * If we have a contiguous extent of pages and we
2937 * haven't done the I/O yet, map the blocks and submit 2906 * haven't done the I/O yet, map the blocks and submit
2938 * them for I/O. 2907 * them for I/O.
2939 */ 2908 */
@@ -3091,7 +3060,7 @@ retry:
3091 * i_size_read because we hold i_mutex. 3060 * i_size_read because we hold i_mutex.
3092 */ 3061 */
3093 if (pos + len > inode->i_size) 3062 if (pos + len > inode->i_size)
3094 ext4_truncate(inode); 3063 ext4_truncate_failed_write(inode);
3095 } 3064 }
3096 3065
3097 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3066 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -4064,7 +4033,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth,
4064 int k, err; 4033 int k, err;
4065 4034
4066 *top = 0; 4035 *top = 0;
4067 /* Make k index the deepest non-null offest + 1 */ 4036 /* Make k index the deepest non-null offset + 1 */
4068 for (k = depth; k > 1 && !offsets[k-1]; k--) 4037 for (k = depth; k > 1 && !offsets[k-1]; k--)
4069 ; 4038 ;
4070 partial = ext4_get_branch(inode, k, offsets, chain, &err); 4039 partial = ext4_get_branch(inode, k, offsets, chain, &err);
@@ -4120,6 +4089,11 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4120 __le32 *last) 4089 __le32 *last)
4121{ 4090{
4122 __le32 *p; 4091 __le32 *p;
4092 int flags = EXT4_FREE_BLOCKS_FORGET;
4093
4094 if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))
4095 flags |= EXT4_FREE_BLOCKS_METADATA;
4096
4123 if (try_to_extend_transaction(handle, inode)) { 4097 if (try_to_extend_transaction(handle, inode)) {
4124 if (bh) { 4098 if (bh) {
4125 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4099 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
@@ -4134,27 +4108,10 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode,
4134 } 4108 }
4135 } 4109 }
4136 4110
4137 /* 4111 for (p = first; p < last; p++)
4138 * Any buffers which are on the journal will be in memory. We 4112 *p = 0;
4139 * find them on the hash table so jbd2_journal_revoke() will
4140 * run jbd2_journal_forget() on them. We've already detached
4141 * each block from the file, so bforget() in
4142 * jbd2_journal_forget() should be safe.
4143 *
4144 * AKPM: turn on bforget in jbd2_journal_forget()!!!
4145 */
4146 for (p = first; p < last; p++) {
4147 u32 nr = le32_to_cpu(*p);
4148 if (nr) {
4149 struct buffer_head *tbh;
4150
4151 *p = 0;
4152 tbh = sb_find_get_block(inode->i_sb, nr);
4153 ext4_forget(handle, 0, inode, tbh, nr);
4154 }
4155 }
4156 4113
4157 ext4_free_blocks(handle, inode, block_to_free, count, 0); 4114 ext4_free_blocks(handle, inode, 0, block_to_free, count, flags);
4158} 4115}
4159 4116
4160/** 4117/**
@@ -4342,7 +4299,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4342 blocks_for_truncate(inode)); 4299 blocks_for_truncate(inode));
4343 } 4300 }
4344 4301
4345 ext4_free_blocks(handle, inode, nr, 1, 1); 4302 ext4_free_blocks(handle, inode, 0, nr, 1,
4303 EXT4_FREE_BLOCKS_METADATA);
4346 4304
4347 if (parent_bh) { 4305 if (parent_bh) {
4348 /* 4306 /*
@@ -4781,8 +4739,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4781 struct ext4_iloc iloc; 4739 struct ext4_iloc iloc;
4782 struct ext4_inode *raw_inode; 4740 struct ext4_inode *raw_inode;
4783 struct ext4_inode_info *ei; 4741 struct ext4_inode_info *ei;
4784 struct buffer_head *bh;
4785 struct inode *inode; 4742 struct inode *inode;
4743 journal_t *journal = EXT4_SB(sb)->s_journal;
4786 long ret; 4744 long ret;
4787 int block; 4745 int block;
4788 4746
@@ -4793,11 +4751,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4793 return inode; 4751 return inode;
4794 4752
4795 ei = EXT4_I(inode); 4753 ei = EXT4_I(inode);
4754 iloc.bh = 0;
4796 4755
4797 ret = __ext4_get_inode_loc(inode, &iloc, 0); 4756 ret = __ext4_get_inode_loc(inode, &iloc, 0);
4798 if (ret < 0) 4757 if (ret < 0)
4799 goto bad_inode; 4758 goto bad_inode;
4800 bh = iloc.bh;
4801 raw_inode = ext4_raw_inode(&iloc); 4759 raw_inode = ext4_raw_inode(&iloc);
4802 inode->i_mode = le16_to_cpu(raw_inode->i_mode); 4760 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
4803 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); 4761 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
@@ -4820,7 +4778,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4820 if (inode->i_mode == 0 || 4778 if (inode->i_mode == 0 ||
4821 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { 4779 !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) {
4822 /* this inode is deleted */ 4780 /* this inode is deleted */
4823 brelse(bh);
4824 ret = -ESTALE; 4781 ret = -ESTALE;
4825 goto bad_inode; 4782 goto bad_inode;
4826 } 4783 }
@@ -4848,11 +4805,35 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4848 ei->i_data[block] = raw_inode->i_block[block]; 4805 ei->i_data[block] = raw_inode->i_block[block];
4849 INIT_LIST_HEAD(&ei->i_orphan); 4806 INIT_LIST_HEAD(&ei->i_orphan);
4850 4807
4808 /*
4809 * Set transaction id's of transactions that have to be committed
4810 * to finish f[data]sync. We set them to currently running transaction
4811 * as we cannot be sure that the inode or some of its metadata isn't
4812 * part of the transaction - the inode could have been reclaimed and
4813 * now it is reread from disk.
4814 */
4815 if (journal) {
4816 transaction_t *transaction;
4817 tid_t tid;
4818
4819 spin_lock(&journal->j_state_lock);
4820 if (journal->j_running_transaction)
4821 transaction = journal->j_running_transaction;
4822 else
4823 transaction = journal->j_committing_transaction;
4824 if (transaction)
4825 tid = transaction->t_tid;
4826 else
4827 tid = journal->j_commit_sequence;
4828 spin_unlock(&journal->j_state_lock);
4829 ei->i_sync_tid = tid;
4830 ei->i_datasync_tid = tid;
4831 }
4832
4851 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { 4833 if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
4852 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); 4834 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
4853 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > 4835 if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
4854 EXT4_INODE_SIZE(inode->i_sb)) { 4836 EXT4_INODE_SIZE(inode->i_sb)) {
4855 brelse(bh);
4856 ret = -EIO; 4837 ret = -EIO;
4857 goto bad_inode; 4838 goto bad_inode;
4858 } 4839 }
@@ -4884,10 +4865,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4884 4865
4885 ret = 0; 4866 ret = 0;
4886 if (ei->i_file_acl && 4867 if (ei->i_file_acl &&
4887 ((ei->i_file_acl < 4868 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
4888 (le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block) +
4889 EXT4_SB(sb)->s_gdb_count)) ||
4890 (ei->i_file_acl >= ext4_blocks_count(EXT4_SB(sb)->s_es)))) {
4891 ext4_error(sb, __func__, 4869 ext4_error(sb, __func__,
4892 "bad extended attribute block %llu in inode #%lu", 4870 "bad extended attribute block %llu in inode #%lu",
4893 ei->i_file_acl, inode->i_ino); 4871 ei->i_file_acl, inode->i_ino);
@@ -4905,10 +4883,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4905 /* Validate block references which are part of inode */ 4883 /* Validate block references which are part of inode */
4906 ret = ext4_check_inode_blockref(inode); 4884 ret = ext4_check_inode_blockref(inode);
4907 } 4885 }
4908 if (ret) { 4886 if (ret)
4909 brelse(bh);
4910 goto bad_inode; 4887 goto bad_inode;
4911 }
4912 4888
4913 if (S_ISREG(inode->i_mode)) { 4889 if (S_ISREG(inode->i_mode)) {
4914 inode->i_op = &ext4_file_inode_operations; 4890 inode->i_op = &ext4_file_inode_operations;
@@ -4936,7 +4912,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4936 init_special_inode(inode, inode->i_mode, 4912 init_special_inode(inode, inode->i_mode,
4937 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 4913 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
4938 } else { 4914 } else {
4939 brelse(bh);
4940 ret = -EIO; 4915 ret = -EIO;
4941 ext4_error(inode->i_sb, __func__, 4916 ext4_error(inode->i_sb, __func__,
4942 "bogus i_mode (%o) for inode=%lu", 4917 "bogus i_mode (%o) for inode=%lu",
@@ -4949,6 +4924,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4949 return inode; 4924 return inode;
4950 4925
4951bad_inode: 4926bad_inode:
4927 brelse(iloc.bh);
4952 iget_failed(inode); 4928 iget_failed(inode);
4953 return ERR_PTR(ret); 4929 return ERR_PTR(ret);
4954} 4930}
@@ -5108,6 +5084,7 @@ static int ext4_do_update_inode(handle_t *handle,
5108 err = rc; 5084 err = rc;
5109 ei->i_state &= ~EXT4_STATE_NEW; 5085 ei->i_state &= ~EXT4_STATE_NEW;
5110 5086
5087 ext4_update_inode_fsync_trans(handle, inode, 0);
5111out_brelse: 5088out_brelse:
5112 brelse(bh); 5089 brelse(bh);
5113 ext4_std_error(inode->i_sb, err); 5090 ext4_std_error(inode->i_sb, err);
@@ -5227,8 +5204,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5227 5204
5228 /* (user+group)*(old+new) structure, inode write (sb, 5205 /* (user+group)*(old+new) structure, inode write (sb,
5229 * inode block, ? - but truncate inode update has it) */ 5206 * inode block, ? - but truncate inode update has it) */
5230 handle = ext4_journal_start(inode, 2*(EXT4_QUOTA_INIT_BLOCKS(inode->i_sb)+ 5207 handle = ext4_journal_start(inode, (EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+
5231 EXT4_QUOTA_DEL_BLOCKS(inode->i_sb))+3); 5208 EXT4_MAXQUOTAS_DEL_BLOCKS(inode->i_sb))+3);
5232 if (IS_ERR(handle)) { 5209 if (IS_ERR(handle)) {
5233 error = PTR_ERR(handle); 5210 error = PTR_ERR(handle);
5234 goto err_out; 5211 goto err_out;
@@ -5376,7 +5353,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5376 * worst case, the index blocks spread over different block groups 5353 * worst case, the index blocks spread over different block groups
5377 * 5354 *
5378 * If datablocks are discontiguous, they are possible to spread over 5355 * If datablocks are discontiguous, they are possible to spread over
5379 * different block groups too. If they are contiugous, with flexbg, 5356 * different block groups too. If they are contiguous, with flexbg,
5380 * they could still cross a block group boundary. 5357 * they could still cross a block group boundary.
5381 * 5358 *
5382 * Also account for superblock, inode, quota and xattr blocks 5359 * Also account for superblock, inode, quota and xattr blocks
@@ -5452,7 +5429,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
5452 * Calculate the journal credits for a chunk of data modification. 5429 * Calculate the journal credits for a chunk of data modification.
5453 * 5430 *
5454 * This is called from DIO, fallocate or whoever calling 5431 * This is called from DIO, fallocate or whoever calling
5455 * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. 5432 * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
5456 * 5433 *
5457 * journal buffers for data blocks are not included here, as DIO 5434 * journal buffers for data blocks are not included here, as DIO
5458 * and fallocate do not need to journal data buffers. 5435 * and fallocate do not need to journal data buffers.
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index c1cdf613e725..b63d193126db 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -221,31 +221,38 @@ setversion_out:
221 struct file *donor_filp; 221 struct file *donor_filp;
222 int err; 222 int err;
223 223
224 if (!(filp->f_mode & FMODE_READ) ||
225 !(filp->f_mode & FMODE_WRITE))
226 return -EBADF;
227
224 if (copy_from_user(&me, 228 if (copy_from_user(&me,
225 (struct move_extent __user *)arg, sizeof(me))) 229 (struct move_extent __user *)arg, sizeof(me)))
226 return -EFAULT; 230 return -EFAULT;
231 me.moved_len = 0;
227 232
228 donor_filp = fget(me.donor_fd); 233 donor_filp = fget(me.donor_fd);
229 if (!donor_filp) 234 if (!donor_filp)
230 return -EBADF; 235 return -EBADF;
231 236
232 if (!capable(CAP_DAC_OVERRIDE)) { 237 if (!(donor_filp->f_mode & FMODE_WRITE)) {
233 if ((current->real_cred->fsuid != inode->i_uid) || 238 err = -EBADF;
234 !(inode->i_mode & S_IRUSR) || 239 goto mext_out;
235 !(donor_filp->f_dentry->d_inode->i_mode &
236 S_IRUSR)) {
237 fput(donor_filp);
238 return -EACCES;
239 }
240 } 240 }
241 241
242 err = mnt_want_write(filp->f_path.mnt);
243 if (err)
244 goto mext_out;
245
242 err = ext4_move_extents(filp, donor_filp, me.orig_start, 246 err = ext4_move_extents(filp, donor_filp, me.orig_start,
243 me.donor_start, me.len, &me.moved_len); 247 me.donor_start, me.len, &me.moved_len);
244 fput(donor_filp); 248 mnt_drop_write(filp->f_path.mnt);
249 if (me.moved_len > 0)
250 file_remove_suid(donor_filp);
245 251
246 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) 252 if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
247 return -EFAULT; 253 err = -EFAULT;
248 254mext_out:
255 fput(donor_filp);
249 return err; 256 return err;
250 } 257 }
251 258
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index bba12824defa..c1e19d5b5985 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -142,7 +142,7 @@
142 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The 142 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
143 * value of s_mb_order2_reqs can be tuned via 143 * value of s_mb_order2_reqs can be tuned via
144 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to 144 * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
145 * stripe size (sbi->s_stripe), we try to search for contigous block in 145 * stripe size (sbi->s_stripe), we try to search for contiguous block in
146 * stripe size. This should result in better allocation on RAID setups. If 146 * stripe size. This should result in better allocation on RAID setups. If
147 * not, we search in the specific group using bitmap for best extents. The 147 * not, we search in the specific group using bitmap for best extents. The
148 * tunable min_to_scan and max_to_scan control the behaviour here. 148 * tunable min_to_scan and max_to_scan control the behaviour here.
@@ -2529,7 +2529,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2529 struct ext4_group_info *db; 2529 struct ext4_group_info *db;
2530 int err, count = 0, count2 = 0; 2530 int err, count = 0, count2 = 0;
2531 struct ext4_free_data *entry; 2531 struct ext4_free_data *entry;
2532 ext4_fsblk_t discard_block;
2533 struct list_head *l, *ltmp; 2532 struct list_head *l, *ltmp;
2534 2533
2535 list_for_each_safe(l, ltmp, &txn->t_private_list) { 2534 list_for_each_safe(l, ltmp, &txn->t_private_list) {
@@ -2559,13 +2558,19 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
2559 page_cache_release(e4b.bd_bitmap_page); 2558 page_cache_release(e4b.bd_bitmap_page);
2560 } 2559 }
2561 ext4_unlock_group(sb, entry->group); 2560 ext4_unlock_group(sb, entry->group);
2562 discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) 2561 if (test_opt(sb, DISCARD)) {
2563 + entry->start_blk 2562 ext4_fsblk_t discard_block;
2564 + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); 2563 struct ext4_super_block *es = EXT4_SB(sb)->s_es;
2565 trace_ext4_discard_blocks(sb, (unsigned long long)discard_block, 2564
2566 entry->count); 2565 discard_block = (ext4_fsblk_t)entry->group *
2567 sb_issue_discard(sb, discard_block, entry->count); 2566 EXT4_BLOCKS_PER_GROUP(sb)
2568 2567 + entry->start_blk
2568 + le32_to_cpu(es->s_first_data_block);
2569 trace_ext4_discard_blocks(sb,
2570 (unsigned long long)discard_block,
2571 entry->count);
2572 sb_issue_discard(sb, discard_block, entry->count);
2573 }
2569 kmem_cache_free(ext4_free_ext_cachep, entry); 2574 kmem_cache_free(ext4_free_ext_cachep, entry);
2570 ext4_mb_release_desc(&e4b); 2575 ext4_mb_release_desc(&e4b);
2571 } 2576 }
@@ -3006,6 +3011,24 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
3006} 3011}
3007 3012
3008/* 3013/*
3014 * Called on failure; free up any blocks from the inode PA for this
3015 * context. We don't need this for MB_GROUP_PA because we only change
3016 * pa_free in ext4_mb_release_context(), but on failure, we've already
3017 * zeroed out ac->ac_b_ex.fe_len, so group_pa->pa_free is not changed.
3018 */
3019static void ext4_discard_allocated_blocks(struct ext4_allocation_context *ac)
3020{
3021 struct ext4_prealloc_space *pa = ac->ac_pa;
3022 int len;
3023
3024 if (pa && pa->pa_type == MB_INODE_PA) {
3025 len = ac->ac_b_ex.fe_len;
3026 pa->pa_free += len;
3027 }
3028
3029}
3030
3031/*
3009 * use blocks preallocated to inode 3032 * use blocks preallocated to inode
3010 */ 3033 */
3011static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, 3034static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac,
@@ -4290,6 +4313,7 @@ repeat:
4290 ac->ac_status = AC_STATUS_CONTINUE; 4313 ac->ac_status = AC_STATUS_CONTINUE;
4291 goto repeat; 4314 goto repeat;
4292 } else if (*errp) { 4315 } else if (*errp) {
4316 ext4_discard_allocated_blocks(ac);
4293 ac->ac_b_ex.fe_len = 0; 4317 ac->ac_b_ex.fe_len = 0;
4294 ar->len = 0; 4318 ar->len = 0;
4295 ext4_mb_show_ac(ac); 4319 ext4_mb_show_ac(ac);
@@ -4422,18 +4446,24 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
4422 return 0; 4446 return 0;
4423} 4447}
4424 4448
4425/* 4449/**
4426 * Main entry point into mballoc to free blocks 4450 * ext4_free_blocks() -- Free given blocks and update quota
4451 * @handle: handle for this transaction
4452 * @inode: inode
4453 * @block: start physical block to free
4454 * @count: number of blocks to free
4455 * @flags: EXT4_FREE_BLOCKS_* flags (e.g. EXT4_FREE_BLOCKS_METADATA, EXT4_FREE_BLOCKS_FORGET)
4427 */ 4456 */
4428void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, 4457void ext4_free_blocks(handle_t *handle, struct inode *inode,
4429 ext4_fsblk_t block, unsigned long count, 4458 struct buffer_head *bh, ext4_fsblk_t block,
4430 int metadata, unsigned long *freed) 4459 unsigned long count, int flags)
4431{ 4460{
4432 struct buffer_head *bitmap_bh = NULL; 4461 struct buffer_head *bitmap_bh = NULL;
4433 struct super_block *sb = inode->i_sb; 4462 struct super_block *sb = inode->i_sb;
4434 struct ext4_allocation_context *ac = NULL; 4463 struct ext4_allocation_context *ac = NULL;
4435 struct ext4_group_desc *gdp; 4464 struct ext4_group_desc *gdp;
4436 struct ext4_super_block *es; 4465 struct ext4_super_block *es;
4466 unsigned long freed = 0;
4437 unsigned int overflow; 4467 unsigned int overflow;
4438 ext4_grpblk_t bit; 4468 ext4_grpblk_t bit;
4439 struct buffer_head *gd_bh; 4469 struct buffer_head *gd_bh;
@@ -4443,13 +4473,16 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4443 int err = 0; 4473 int err = 0;
4444 int ret; 4474 int ret;
4445 4475
4446 *freed = 0; 4476 if (bh) {
4477 if (block)
4478 BUG_ON(block != bh->b_blocknr);
4479 else
4480 block = bh->b_blocknr;
4481 }
4447 4482
4448 sbi = EXT4_SB(sb); 4483 sbi = EXT4_SB(sb);
4449 es = EXT4_SB(sb)->s_es; 4484 es = EXT4_SB(sb)->s_es;
4450 if (block < le32_to_cpu(es->s_first_data_block) || 4485 if (!ext4_data_block_valid(sbi, block, count)) {
4451 block + count < block ||
4452 block + count > ext4_blocks_count(es)) {
4453 ext4_error(sb, __func__, 4486 ext4_error(sb, __func__,
4454 "Freeing blocks not in datazone - " 4487 "Freeing blocks not in datazone - "
4455 "block = %llu, count = %lu", block, count); 4488 "block = %llu, count = %lu", block, count);
@@ -4457,7 +4490,32 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode,
4457 } 4490 }
4458 4491
4459 ext4_debug("freeing block %llu\n", block); 4492 ext4_debug("freeing block %llu\n", block);
4460 trace_ext4_free_blocks(inode, block, count, metadata); 4493 trace_ext4_free_blocks(inode, block, count, flags);
4494
4495 if (flags & EXT4_FREE_BLOCKS_FORGET) {
4496 struct buffer_head *tbh = bh;
4497 int i;
4498
4499 BUG_ON(bh && (count > 1));
4500
4501 for (i = 0; i < count; i++) {
4502 if (!bh)
4503 tbh = sb_find_get_block(inode->i_sb,
4504 block + i);
4505 ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
4506 inode, tbh, block + i);
4507 }
4508 }
4509
4510 /*
4511 * We need to make sure we don't reuse the freed block until
4512 * after the transaction is committed, which we can do by
4513 * treating the block as metadata, below. We make an
4514 * exception if the inode is to be written in writeback mode
4515 * since writeback mode has weak data consistency guarantees.
4516 */
4517 if (!ext4_should_writeback_data(inode))
4518 flags |= EXT4_FREE_BLOCKS_METADATA;
4461 4519
4462 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); 4520 ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS);
4463 if (ac) { 4521 if (ac) {
@@ -4533,7 +4591,8 @@ do_more:
4533 err = ext4_mb_load_buddy(sb, block_group, &e4b); 4591 err = ext4_mb_load_buddy(sb, block_group, &e4b);
4534 if (err) 4592 if (err)
4535 goto error_return; 4593 goto error_return;
4536 if (metadata && ext4_handle_valid(handle)) { 4594
4595 if ((flags & EXT4_FREE_BLOCKS_METADATA) && ext4_handle_valid(handle)) {
4537 struct ext4_free_data *new_entry; 4596 struct ext4_free_data *new_entry;
4538 /* 4597 /*
4539 * blocks being freed are metadata. these blocks shouldn't 4598 * blocks being freed are metadata. these blocks shouldn't
@@ -4572,7 +4631,7 @@ do_more:
4572 4631
4573 ext4_mb_release_desc(&e4b); 4632 ext4_mb_release_desc(&e4b);
4574 4633
4575 *freed += count; 4634 freed += count;
4576 4635
4577 /* We dirtied the bitmap block */ 4636 /* We dirtied the bitmap block */
4578 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); 4637 BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
@@ -4592,6 +4651,8 @@ do_more:
4592 } 4651 }
4593 sb->s_dirt = 1; 4652 sb->s_dirt = 1;
4594error_return: 4653error_return:
4654 if (freed)
4655 vfs_dq_free_block(inode, freed);
4595 brelse(bitmap_bh); 4656 brelse(bitmap_bh);
4596 ext4_std_error(sb, err); 4657 ext4_std_error(sb, err);
4597 if (ac) 4658 if (ac)
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index a93d5b80f3e2..81415814b00b 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -238,7 +238,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode)
238 * So allocate a credit of 3. We may update 238 * So allocate a credit of 3. We may update
239 * quota (user and group). 239 * quota (user and group).
240 */ 240 */
241 needed = 3 + 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); 241 needed = 3 + EXT4_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb);
242 242
243 if (ext4_journal_extend(handle, needed) != 0) 243 if (ext4_journal_extend(handle, needed) != 0)
244 retval = ext4_journal_restart(handle, needed); 244 retval = ext4_journal_restart(handle, needed);
@@ -262,13 +262,17 @@ static int free_dind_blocks(handle_t *handle,
262 for (i = 0; i < max_entries; i++) { 262 for (i = 0; i < max_entries; i++) {
263 if (tmp_idata[i]) { 263 if (tmp_idata[i]) {
264 extend_credit_for_blkdel(handle, inode); 264 extend_credit_for_blkdel(handle, inode);
265 ext4_free_blocks(handle, inode, 265 ext4_free_blocks(handle, inode, 0,
266 le32_to_cpu(tmp_idata[i]), 1, 1); 266 le32_to_cpu(tmp_idata[i]), 1,
267 EXT4_FREE_BLOCKS_METADATA |
268 EXT4_FREE_BLOCKS_FORGET);
267 } 269 }
268 } 270 }
269 put_bh(bh); 271 put_bh(bh);
270 extend_credit_for_blkdel(handle, inode); 272 extend_credit_for_blkdel(handle, inode);
271 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 273 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
274 EXT4_FREE_BLOCKS_METADATA |
275 EXT4_FREE_BLOCKS_FORGET);
272 return 0; 276 return 0;
273} 277}
274 278
@@ -297,7 +301,9 @@ static int free_tind_blocks(handle_t *handle,
297 } 301 }
298 put_bh(bh); 302 put_bh(bh);
299 extend_credit_for_blkdel(handle, inode); 303 extend_credit_for_blkdel(handle, inode);
300 ext4_free_blocks(handle, inode, le32_to_cpu(i_data), 1, 1); 304 ext4_free_blocks(handle, inode, 0, le32_to_cpu(i_data), 1,
305 EXT4_FREE_BLOCKS_METADATA |
306 EXT4_FREE_BLOCKS_FORGET);
301 return 0; 307 return 0;
302} 308}
303 309
@@ -308,8 +314,10 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data)
308 /* ei->i_data[EXT4_IND_BLOCK] */ 314 /* ei->i_data[EXT4_IND_BLOCK] */
309 if (i_data[0]) { 315 if (i_data[0]) {
310 extend_credit_for_blkdel(handle, inode); 316 extend_credit_for_blkdel(handle, inode);
311 ext4_free_blocks(handle, inode, 317 ext4_free_blocks(handle, inode, 0,
312 le32_to_cpu(i_data[0]), 1, 1); 318 le32_to_cpu(i_data[0]), 1,
319 EXT4_FREE_BLOCKS_METADATA |
320 EXT4_FREE_BLOCKS_FORGET);
313 } 321 }
314 322
315 /* ei->i_data[EXT4_DIND_BLOCK] */ 323 /* ei->i_data[EXT4_DIND_BLOCK] */
@@ -419,7 +427,8 @@ static int free_ext_idx(handle_t *handle, struct inode *inode,
419 } 427 }
420 put_bh(bh); 428 put_bh(bh);
421 extend_credit_for_blkdel(handle, inode); 429 extend_credit_for_blkdel(handle, inode);
422 ext4_free_blocks(handle, inode, block, 1, 1); 430 ext4_free_blocks(handle, inode, 0, block, 1,
431 EXT4_FREE_BLOCKS_METADATA | EXT4_FREE_BLOCKS_FORGET);
423 return retval; 432 return retval;
424} 433}
425 434
@@ -477,7 +486,7 @@ int ext4_ext_migrate(struct inode *inode)
477 handle = ext4_journal_start(inode, 486 handle = ext4_journal_start(inode,
478 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 487 EXT4_DATA_TRANS_BLOCKS(inode->i_sb) +
479 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 488 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
480 2 * EXT4_QUOTA_INIT_BLOCKS(inode->i_sb) 489 EXT4_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)
481 + 1); 490 + 1);
482 if (IS_ERR(handle)) { 491 if (IS_ERR(handle)) {
483 retval = PTR_ERR(handle); 492 retval = PTR_ERR(handle);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 25b6b1457360..82c415be87a4 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -77,12 +77,14 @@ static int
77mext_next_extent(struct inode *inode, struct ext4_ext_path *path, 77mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
78 struct ext4_extent **extent) 78 struct ext4_extent **extent)
79{ 79{
80 struct ext4_extent_header *eh;
80 int ppos, leaf_ppos = path->p_depth; 81 int ppos, leaf_ppos = path->p_depth;
81 82
82 ppos = leaf_ppos; 83 ppos = leaf_ppos;
83 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { 84 if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
84 /* leaf block */ 85 /* leaf block */
85 *extent = ++path[ppos].p_ext; 86 *extent = ++path[ppos].p_ext;
87 path[ppos].p_block = ext_pblock(path[ppos].p_ext);
86 return 0; 88 return 0;
87 } 89 }
88 90
@@ -119,9 +121,18 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
119 ext_block_hdr(path[cur_ppos+1].p_bh); 121 ext_block_hdr(path[cur_ppos+1].p_bh);
120 } 122 }
121 123
124 path[leaf_ppos].p_ext = *extent = NULL;
125
126 eh = path[leaf_ppos].p_hdr;
127 if (le16_to_cpu(eh->eh_entries) == 0)
128 /* empty leaf is found */
129 return -ENODATA;
130
122 /* leaf block */ 131 /* leaf block */
123 path[leaf_ppos].p_ext = *extent = 132 path[leaf_ppos].p_ext = *extent =
124 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); 133 EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
134 path[leaf_ppos].p_block =
135 ext_pblock(path[leaf_ppos].p_ext);
125 return 0; 136 return 0;
126 } 137 }
127 } 138 }
@@ -155,40 +166,15 @@ mext_check_null_inode(struct inode *inode1, struct inode *inode2,
155} 166}
156 167
157/** 168/**
158 * mext_double_down_read - Acquire two inodes' read semaphore 169 * double_down_write_data_sem - Acquire two inodes' write lock of i_data_sem
159 *
160 * @orig_inode: original inode structure
161 * @donor_inode: donor inode structure
162 * Acquire read semaphore of the two inodes (orig and donor) by i_ino order.
163 */
164static void
165mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode)
166{
167 struct inode *first = orig_inode, *second = donor_inode;
168
169 /*
170 * Use the inode number to provide the stable locking order instead
171 * of its address, because the C language doesn't guarantee you can
172 * compare pointers that don't come from the same array.
173 */
174 if (donor_inode->i_ino < orig_inode->i_ino) {
175 first = donor_inode;
176 second = orig_inode;
177 }
178
179 down_read(&EXT4_I(first)->i_data_sem);
180 down_read(&EXT4_I(second)->i_data_sem);
181}
182
183/**
184 * mext_double_down_write - Acquire two inodes' write semaphore
185 * 170 *
186 * @orig_inode: original inode structure 171 * @orig_inode: original inode structure
187 * @donor_inode: donor inode structure 172 * @donor_inode: donor inode structure
188 * Acquire write semaphore of the two inodes (orig and donor) by i_ino order. 173 * Acquire write lock of i_data_sem of the two inodes (orig and donor) by
174 * i_ino order.
189 */ 175 */
190static void 176static void
191mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) 177double_down_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
192{ 178{
193 struct inode *first = orig_inode, *second = donor_inode; 179 struct inode *first = orig_inode, *second = donor_inode;
194 180
@@ -203,32 +189,18 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode)
203 } 189 }
204 190
205 down_write(&EXT4_I(first)->i_data_sem); 191 down_write(&EXT4_I(first)->i_data_sem);
206 down_write(&EXT4_I(second)->i_data_sem); 192 down_write_nested(&EXT4_I(second)->i_data_sem, SINGLE_DEPTH_NESTING);
207} 193}
208 194
209/** 195/**
210 * mext_double_up_read - Release two inodes' read semaphore 196 * double_up_write_data_sem - Release two inodes' write lock of i_data_sem
211 * 197 *
212 * @orig_inode: original inode structure whose lock is released first 198 * @orig_inode: original inode structure whose lock is released first
213 * @donor_inode: donor inode structure whose lock is released second 199 * @donor_inode: donor inode structure whose lock is released second
214 * Release read semaphore of two inodes (orig and donor). 200 * Release write lock of i_data_sem of two inodes (orig and donor).
215 */ 201 */
216static void 202static void
217mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) 203double_up_write_data_sem(struct inode *orig_inode, struct inode *donor_inode)
218{
219 up_read(&EXT4_I(orig_inode)->i_data_sem);
220 up_read(&EXT4_I(donor_inode)->i_data_sem);
221}
222
223/**
224 * mext_double_up_write - Release two inodes' write semaphore
225 *
226 * @orig_inode: original inode structure to be released its lock first
227 * @donor_inode: donor inode structure to be released its lock second
228 * Release write semaphore of two inodes (orig and donor).
229 */
230static void
231mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode)
232{ 204{
233 up_write(&EXT4_I(orig_inode)->i_data_sem); 205 up_write(&EXT4_I(orig_inode)->i_data_sem);
234 up_write(&EXT4_I(donor_inode)->i_data_sem); 206 up_write(&EXT4_I(donor_inode)->i_data_sem);
@@ -596,7 +568,7 @@ out:
596 * @tmp_oext: the extent that will belong to the donor inode 568 * @tmp_oext: the extent that will belong to the donor inode
597 * @orig_off: block offset of original inode 569 * @orig_off: block offset of original inode
598 * @donor_off: block offset of donor inode 570 * @donor_off: block offset of donor inode
599 * @max_count: the maximun length of extents 571 * @max_count: the maximum length of extents
600 * 572 *
601 * Return 0 on success, or a negative error value on failure. 573 * Return 0 on success, or a negative error value on failure.
602 */ 574 */
@@ -661,6 +633,7 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
661 * @donor_inode: donor inode 633 * @donor_inode: donor inode
662 * @from: block offset of orig_inode 634 * @from: block offset of orig_inode
663 * @count: block count to be replaced 635 * @count: block count to be replaced
636 * @err: pointer to save return value
664 * 637 *
665 * Replace original inode extents and donor inode extents page by page. 638 * Replace original inode extents and donor inode extents page by page.
666 * We implement this replacement in the following three steps: 639 * We implement this replacement in the following three steps:
@@ -671,33 +644,33 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext,
671 * 3. Change the block information of donor inode to point at the saved 644 * 3. Change the block information of donor inode to point at the saved
672 * original inode blocks in the dummy extents. 645 * original inode blocks in the dummy extents.
673 * 646 *
674 * Return 0 on success, or a negative error value on failure. 647 * Return replaced block count.
675 */ 648 */
676static int 649static int
677mext_replace_branches(handle_t *handle, struct inode *orig_inode, 650mext_replace_branches(handle_t *handle, struct inode *orig_inode,
678 struct inode *donor_inode, ext4_lblk_t from, 651 struct inode *donor_inode, ext4_lblk_t from,
679 ext4_lblk_t count) 652 ext4_lblk_t count, int *err)
680{ 653{
681 struct ext4_ext_path *orig_path = NULL; 654 struct ext4_ext_path *orig_path = NULL;
682 struct ext4_ext_path *donor_path = NULL; 655 struct ext4_ext_path *donor_path = NULL;
683 struct ext4_extent *oext, *dext; 656 struct ext4_extent *oext, *dext;
684 struct ext4_extent tmp_dext, tmp_oext; 657 struct ext4_extent tmp_dext, tmp_oext;
685 ext4_lblk_t orig_off = from, donor_off = from; 658 ext4_lblk_t orig_off = from, donor_off = from;
686 int err = 0;
687 int depth; 659 int depth;
688 int replaced_count = 0; 660 int replaced_count = 0;
689 int dext_alen; 661 int dext_alen;
690 662
691 mext_double_down_write(orig_inode, donor_inode); 663 /* Protect extent trees against block allocations via delalloc */
664 double_down_write_data_sem(orig_inode, donor_inode);
692 665
693 /* Get the original extent for the block "orig_off" */ 666 /* Get the original extent for the block "orig_off" */
694 err = get_ext_path(orig_inode, orig_off, &orig_path); 667 *err = get_ext_path(orig_inode, orig_off, &orig_path);
695 if (err) 668 if (*err)
696 goto out; 669 goto out;
697 670
698 /* Get the donor extent for the head */ 671 /* Get the donor extent for the head */
699 err = get_ext_path(donor_inode, donor_off, &donor_path); 672 *err = get_ext_path(donor_inode, donor_off, &donor_path);
700 if (err) 673 if (*err)
701 goto out; 674 goto out;
702 depth = ext_depth(orig_inode); 675 depth = ext_depth(orig_inode);
703 oext = orig_path[depth].p_ext; 676 oext = orig_path[depth].p_ext;
@@ -707,9 +680,9 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
707 dext = donor_path[depth].p_ext; 680 dext = donor_path[depth].p_ext;
708 tmp_dext = *dext; 681 tmp_dext = *dext;
709 682
710 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 683 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
711 donor_off, count); 684 donor_off, count);
712 if (err) 685 if (*err)
713 goto out; 686 goto out;
714 687
715 /* Loop for the donor extents */ 688 /* Loop for the donor extents */
@@ -718,7 +691,7 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
718 if (!dext) { 691 if (!dext) {
719 ext4_error(donor_inode->i_sb, __func__, 692 ext4_error(donor_inode->i_sb, __func__,
720 "The extent for donor must be found"); 693 "The extent for donor must be found");
721 err = -EIO; 694 *err = -EIO;
722 goto out; 695 goto out;
723 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { 696 } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
724 ext4_error(donor_inode->i_sb, __func__, 697 ext4_error(donor_inode->i_sb, __func__,
@@ -726,20 +699,20 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
726 "extent(%u) should be equal", 699 "extent(%u) should be equal",
727 donor_off, 700 donor_off,
728 le32_to_cpu(tmp_dext.ee_block)); 701 le32_to_cpu(tmp_dext.ee_block));
729 err = -EIO; 702 *err = -EIO;
730 goto out; 703 goto out;
731 } 704 }
732 705
733 /* Set donor extent to orig extent */ 706 /* Set donor extent to orig extent */
734 err = mext_leaf_block(handle, orig_inode, 707 *err = mext_leaf_block(handle, orig_inode,
735 orig_path, &tmp_dext, &orig_off); 708 orig_path, &tmp_dext, &orig_off);
736 if (err < 0) 709 if (*err)
737 goto out; 710 goto out;
738 711
739 /* Set orig extent to donor extent */ 712 /* Set orig extent to donor extent */
740 err = mext_leaf_block(handle, donor_inode, 713 *err = mext_leaf_block(handle, donor_inode,
741 donor_path, &tmp_oext, &donor_off); 714 donor_path, &tmp_oext, &donor_off);
742 if (err < 0) 715 if (*err)
743 goto out; 716 goto out;
744 717
745 dext_alen = ext4_ext_get_actual_len(&tmp_dext); 718 dext_alen = ext4_ext_get_actual_len(&tmp_dext);
@@ -753,35 +726,25 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
753 726
754 if (orig_path) 727 if (orig_path)
755 ext4_ext_drop_refs(orig_path); 728 ext4_ext_drop_refs(orig_path);
756 err = get_ext_path(orig_inode, orig_off, &orig_path); 729 *err = get_ext_path(orig_inode, orig_off, &orig_path);
757 if (err) 730 if (*err)
758 goto out; 731 goto out;
759 depth = ext_depth(orig_inode); 732 depth = ext_depth(orig_inode);
760 oext = orig_path[depth].p_ext; 733 oext = orig_path[depth].p_ext;
761 if (le32_to_cpu(oext->ee_block) +
762 ext4_ext_get_actual_len(oext) <= orig_off) {
763 err = 0;
764 goto out;
765 }
766 tmp_oext = *oext; 734 tmp_oext = *oext;
767 735
768 if (donor_path) 736 if (donor_path)
769 ext4_ext_drop_refs(donor_path); 737 ext4_ext_drop_refs(donor_path);
770 err = get_ext_path(donor_inode, donor_off, &donor_path); 738 *err = get_ext_path(donor_inode, donor_off, &donor_path);
771 if (err) 739 if (*err)
772 goto out; 740 goto out;
773 depth = ext_depth(donor_inode); 741 depth = ext_depth(donor_inode);
774 dext = donor_path[depth].p_ext; 742 dext = donor_path[depth].p_ext;
775 if (le32_to_cpu(dext->ee_block) +
776 ext4_ext_get_actual_len(dext) <= donor_off) {
777 err = 0;
778 goto out;
779 }
780 tmp_dext = *dext; 743 tmp_dext = *dext;
781 744
782 err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, 745 *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
783 donor_off, count - replaced_count); 746 donor_off, count - replaced_count);
784 if (err) 747 if (*err)
785 goto out; 748 goto out;
786 } 749 }
787 750
@@ -795,8 +758,12 @@ out:
795 kfree(donor_path); 758 kfree(donor_path);
796 } 759 }
797 760
798 mext_double_up_write(orig_inode, donor_inode); 761 ext4_ext_invalidate_cache(orig_inode);
799 return err; 762 ext4_ext_invalidate_cache(donor_inode);
763
764 double_up_write_data_sem(orig_inode, donor_inode);
765
766 return replaced_count;
800} 767}
801 768
802/** 769/**
@@ -808,16 +775,17 @@ out:
808 * @data_offset_in_page: block index where data swapping starts 775 * @data_offset_in_page: block index where data swapping starts
809 * @block_len_in_page: the number of blocks to be swapped 776 * @block_len_in_page: the number of blocks to be swapped
810 * @uninit: orig extent is uninitialized or not 777 * @uninit: orig extent is uninitialized or not
778 * @err: pointer to save return value
811 * 779 *
812 * Save the data in original inode blocks and replace original inode extents 780 * Save the data in original inode blocks and replace original inode extents
813 * with donor inode extents by calling mext_replace_branches(). 781 * with donor inode extents by calling mext_replace_branches().
814 * Finally, write out the saved data in new original inode blocks. Return 0 782 * Finally, write out the saved data in new original inode blocks. Return
815 * on success, or a negative error value on failure. 783 * replaced block count.
816 */ 784 */
817static int 785static int
818move_extent_per_page(struct file *o_filp, struct inode *donor_inode, 786move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
819 pgoff_t orig_page_offset, int data_offset_in_page, 787 pgoff_t orig_page_offset, int data_offset_in_page,
820 int block_len_in_page, int uninit) 788 int block_len_in_page, int uninit, int *err)
821{ 789{
822 struct inode *orig_inode = o_filp->f_dentry->d_inode; 790 struct inode *orig_inode = o_filp->f_dentry->d_inode;
823 struct address_space *mapping = orig_inode->i_mapping; 791 struct address_space *mapping = orig_inode->i_mapping;
@@ -829,9 +797,11 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
829 long long offs = orig_page_offset << PAGE_CACHE_SHIFT; 797 long long offs = orig_page_offset << PAGE_CACHE_SHIFT;
830 unsigned long blocksize = orig_inode->i_sb->s_blocksize; 798 unsigned long blocksize = orig_inode->i_sb->s_blocksize;
831 unsigned int w_flags = 0; 799 unsigned int w_flags = 0;
832 unsigned int tmp_data_len, data_len; 800 unsigned int tmp_data_size, data_size, replaced_size;
833 void *fsdata; 801 void *fsdata;
834 int ret, i, jblocks; 802 int i, jblocks;
803 int err2 = 0;
804 int replaced_count = 0;
835 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; 805 int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
836 806
837 /* 807 /*
@@ -841,8 +811,8 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
841 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2; 811 jblocks = ext4_writepage_trans_blocks(orig_inode) * 2;
842 handle = ext4_journal_start(orig_inode, jblocks); 812 handle = ext4_journal_start(orig_inode, jblocks);
843 if (IS_ERR(handle)) { 813 if (IS_ERR(handle)) {
844 ret = PTR_ERR(handle); 814 *err = PTR_ERR(handle);
845 return ret; 815 return 0;
846 } 816 }
847 817
848 if (segment_eq(get_fs(), KERNEL_DS)) 818 if (segment_eq(get_fs(), KERNEL_DS))
@@ -858,39 +828,36 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
858 * Just swap data blocks between orig and donor. 828 * Just swap data blocks between orig and donor.
859 */ 829 */
860 if (uninit) { 830 if (uninit) {
861 ret = mext_replace_branches(handle, orig_inode, 831 replaced_count = mext_replace_branches(handle, orig_inode,
862 donor_inode, orig_blk_offset, 832 donor_inode, orig_blk_offset,
863 block_len_in_page); 833 block_len_in_page, err);
864
865 /* Clear the inode cache not to refer to the old data */
866 ext4_ext_invalidate_cache(orig_inode);
867 ext4_ext_invalidate_cache(donor_inode);
868 goto out2; 834 goto out2;
869 } 835 }
870 836
871 offs = (long long)orig_blk_offset << orig_inode->i_blkbits; 837 offs = (long long)orig_blk_offset << orig_inode->i_blkbits;
872 838
873 /* Calculate data_len */ 839 /* Calculate data_size */
874 if ((orig_blk_offset + block_len_in_page - 1) == 840 if ((orig_blk_offset + block_len_in_page - 1) ==
875 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { 841 ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
876 /* Replace the last block */ 842 /* Replace the last block */
877 tmp_data_len = orig_inode->i_size & (blocksize - 1); 843 tmp_data_size = orig_inode->i_size & (blocksize - 1);
878 /* 844 /*
879 * If data_len equal zero, it shows data_len is multiples of 845 * If data_size equal zero, it shows data_size is multiples of
880 * blocksize. So we set appropriate value. 846 * blocksize. So we set appropriate value.
881 */ 847 */
882 if (tmp_data_len == 0) 848 if (tmp_data_size == 0)
883 tmp_data_len = blocksize; 849 tmp_data_size = blocksize;
884 850
885 data_len = tmp_data_len + 851 data_size = tmp_data_size +
886 ((block_len_in_page - 1) << orig_inode->i_blkbits); 852 ((block_len_in_page - 1) << orig_inode->i_blkbits);
887 } else { 853 } else
888 data_len = block_len_in_page << orig_inode->i_blkbits; 854 data_size = block_len_in_page << orig_inode->i_blkbits;
889 } 855
856 replaced_size = data_size;
890 857
891 ret = a_ops->write_begin(o_filp, mapping, offs, data_len, w_flags, 858 *err = a_ops->write_begin(o_filp, mapping, offs, data_size, w_flags,
892 &page, &fsdata); 859 &page, &fsdata);
893 if (unlikely(ret < 0)) 860 if (unlikely(*err < 0))
894 goto out; 861 goto out;
895 862
896 if (!PageUptodate(page)) { 863 if (!PageUptodate(page)) {
@@ -911,14 +878,17 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
911 /* Release old bh and drop refs */ 878 /* Release old bh and drop refs */
912 try_to_release_page(page, 0); 879 try_to_release_page(page, 0);
913 880
914 ret = mext_replace_branches(handle, orig_inode, donor_inode, 881 replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
915 orig_blk_offset, block_len_in_page); 882 orig_blk_offset, block_len_in_page,
916 if (ret < 0) 883 &err2);
917 goto out; 884 if (err2) {
918 885 if (replaced_count) {
919 /* Clear the inode cache not to refer to the old data */ 886 block_len_in_page = replaced_count;
920 ext4_ext_invalidate_cache(orig_inode); 887 replaced_size =
921 ext4_ext_invalidate_cache(donor_inode); 888 block_len_in_page << orig_inode->i_blkbits;
889 } else
890 goto out;
891 }
922 892
923 if (!page_has_buffers(page)) 893 if (!page_has_buffers(page))
924 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0); 894 create_empty_buffers(page, 1 << orig_inode->i_blkbits, 0);
@@ -928,16 +898,16 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
928 bh = bh->b_this_page; 898 bh = bh->b_this_page;
929 899
930 for (i = 0; i < block_len_in_page; i++) { 900 for (i = 0; i < block_len_in_page; i++) {
931 ret = ext4_get_block(orig_inode, 901 *err = ext4_get_block(orig_inode,
932 (sector_t)(orig_blk_offset + i), bh, 0); 902 (sector_t)(orig_blk_offset + i), bh, 0);
933 if (ret < 0) 903 if (*err < 0)
934 goto out; 904 goto out;
935 905
936 if (bh->b_this_page != NULL) 906 if (bh->b_this_page != NULL)
937 bh = bh->b_this_page; 907 bh = bh->b_this_page;
938 } 908 }
939 909
940 ret = a_ops->write_end(o_filp, mapping, offs, data_len, data_len, 910 *err = a_ops->write_end(o_filp, mapping, offs, data_size, replaced_size,
941 page, fsdata); 911 page, fsdata);
942 page = NULL; 912 page = NULL;
943 913
@@ -951,7 +921,10 @@ out:
951out2: 921out2:
952 ext4_journal_stop(handle); 922 ext4_journal_stop(handle);
953 923
954 return ret < 0 ? ret : 0; 924 if (err2)
925 *err = err2;
926
927 return replaced_count;
955} 928}
956 929
957/** 930/**
@@ -962,7 +935,6 @@ out2:
962 * @orig_start: logical start offset in block for orig 935 * @orig_start: logical start offset in block for orig
963 * @donor_start: logical start offset in block for donor 936 * @donor_start: logical start offset in block for donor
964 * @len: the number of blocks to be moved 937 * @len: the number of blocks to be moved
965 * @moved_len: moved block length
966 * 938 *
967 * Check the arguments of ext4_move_extents() whether the files can be 939 * Check the arguments of ext4_move_extents() whether the files can be
968 * exchanged with each other. 940 * exchanged with each other.
@@ -970,8 +942,8 @@ out2:
970 */ 942 */
971static int 943static int
972mext_check_arguments(struct inode *orig_inode, 944mext_check_arguments(struct inode *orig_inode,
973 struct inode *donor_inode, __u64 orig_start, 945 struct inode *donor_inode, __u64 orig_start,
974 __u64 donor_start, __u64 *len, __u64 moved_len) 946 __u64 donor_start, __u64 *len)
975{ 947{
976 ext4_lblk_t orig_blocks, donor_blocks; 948 ext4_lblk_t orig_blocks, donor_blocks;
977 unsigned int blkbits = orig_inode->i_blkbits; 949 unsigned int blkbits = orig_inode->i_blkbits;
@@ -985,6 +957,13 @@ mext_check_arguments(struct inode *orig_inode,
985 return -EINVAL; 957 return -EINVAL;
986 } 958 }
987 959
960 if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
961 ext4_debug("ext4 move extent: suid or sgid is set"
962 " to donor file [ino:orig %lu, donor %lu]\n",
963 orig_inode->i_ino, donor_inode->i_ino);
964 return -EINVAL;
965 }
966
988 /* Ext4 move extent does not support swapfile */ 967 /* Ext4 move extent does not support swapfile */
989 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) { 968 if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
990 ext4_debug("ext4 move extent: The argument files should " 969 ext4_debug("ext4 move extent: The argument files should "
@@ -1025,13 +1004,6 @@ mext_check_arguments(struct inode *orig_inode,
1025 return -EINVAL; 1004 return -EINVAL;
1026 } 1005 }
1027 1006
1028 if (moved_len) {
1029 ext4_debug("ext4 move extent: moved_len should be 0 "
1030 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1031 donor_inode->i_ino);
1032 return -EINVAL;
1033 }
1034
1035 if ((orig_start > EXT_MAX_BLOCK) || 1007 if ((orig_start > EXT_MAX_BLOCK) ||
1036 (donor_start > EXT_MAX_BLOCK) || 1008 (donor_start > EXT_MAX_BLOCK) ||
1037 (*len > EXT_MAX_BLOCK) || 1009 (*len > EXT_MAX_BLOCK) ||
@@ -1088,7 +1060,7 @@ mext_check_arguments(struct inode *orig_inode,
1088 } 1060 }
1089 1061
1090 if (!*len) { 1062 if (!*len) {
1091 ext4_debug("ext4 move extent: len shoudld not be 0 " 1063 ext4_debug("ext4 move extent: len should not be 0 "
1092 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, 1064 "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
1093 donor_inode->i_ino); 1065 donor_inode->i_ino);
1094 return -EINVAL; 1066 return -EINVAL;
@@ -1232,16 +1204,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1232 return -EINVAL; 1204 return -EINVAL;
1233 } 1205 }
1234 1206
1235 /* protect orig and donor against a truncate */ 1207 /* Protect orig and donor inodes against a truncate */
1236 ret1 = mext_inode_double_lock(orig_inode, donor_inode); 1208 ret1 = mext_inode_double_lock(orig_inode, donor_inode);
1237 if (ret1 < 0) 1209 if (ret1 < 0)
1238 return ret1; 1210 return ret1;
1239 1211
1240 mext_double_down_read(orig_inode, donor_inode); 1212 /* Protect extent tree against block allocations via delalloc */
1213 double_down_write_data_sem(orig_inode, donor_inode);
1241 /* Check the filesystem environment whether move_extent can be done */ 1214 /* Check the filesystem environment whether move_extent can be done */
1242 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, 1215 ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start,
1243 donor_start, &len, *moved_len); 1216 donor_start, &len);
1244 mext_double_up_read(orig_inode, donor_inode);
1245 if (ret1) 1217 if (ret1)
1246 goto out; 1218 goto out;
1247 1219
@@ -1355,36 +1327,39 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1355 seq_start = le32_to_cpu(ext_cur->ee_block); 1327 seq_start = le32_to_cpu(ext_cur->ee_block);
1356 rest_blocks = seq_blocks; 1328 rest_blocks = seq_blocks;
1357 1329
1358 /* Discard preallocations of two inodes */ 1330 /*
1359 down_write(&EXT4_I(orig_inode)->i_data_sem); 1331 * Up semaphore to avoid following problems:
1360 ext4_discard_preallocations(orig_inode); 1332 * a. transaction deadlock among ext4_journal_start,
1361 up_write(&EXT4_I(orig_inode)->i_data_sem); 1333 * ->write_begin via pagefault, and jbd2_journal_commit
1362 1334 * b. racing with ->readpage, ->write_begin, and ext4_get_block
1363 down_write(&EXT4_I(donor_inode)->i_data_sem); 1335 * in move_extent_per_page
1364 ext4_discard_preallocations(donor_inode); 1336 */
1365 up_write(&EXT4_I(donor_inode)->i_data_sem); 1337 double_up_write_data_sem(orig_inode, donor_inode);
1366 1338
1367 while (orig_page_offset <= seq_end_page) { 1339 while (orig_page_offset <= seq_end_page) {
1368 1340
1369 /* Swap original branches with new branches */ 1341 /* Swap original branches with new branches */
1370 ret1 = move_extent_per_page(o_filp, donor_inode, 1342 block_len_in_page = move_extent_per_page(
1343 o_filp, donor_inode,
1371 orig_page_offset, 1344 orig_page_offset,
1372 data_offset_in_page, 1345 data_offset_in_page,
1373 block_len_in_page, uninit); 1346 block_len_in_page, uninit,
1374 if (ret1 < 0) 1347 &ret1);
1375 goto out; 1348
1376 orig_page_offset++;
1377 /* Count how many blocks we have exchanged */ 1349 /* Count how many blocks we have exchanged */
1378 *moved_len += block_len_in_page; 1350 *moved_len += block_len_in_page;
1351 if (ret1 < 0)
1352 break;
1379 if (*moved_len > len) { 1353 if (*moved_len > len) {
1380 ext4_error(orig_inode->i_sb, __func__, 1354 ext4_error(orig_inode->i_sb, __func__,
1381 "We replaced blocks too much! " 1355 "We replaced blocks too much! "
1382 "sum of replaced: %llu requested: %llu", 1356 "sum of replaced: %llu requested: %llu",
1383 *moved_len, len); 1357 *moved_len, len);
1384 ret1 = -EIO; 1358 ret1 = -EIO;
1385 goto out; 1359 break;
1386 } 1360 }
1387 1361
1362 orig_page_offset++;
1388 data_offset_in_page = 0; 1363 data_offset_in_page = 0;
1389 rest_blocks -= block_len_in_page; 1364 rest_blocks -= block_len_in_page;
1390 if (rest_blocks > blocks_per_page) 1365 if (rest_blocks > blocks_per_page)
@@ -1393,6 +1368,10 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1393 block_len_in_page = rest_blocks; 1368 block_len_in_page = rest_blocks;
1394 } 1369 }
1395 1370
1371 double_down_write_data_sem(orig_inode, donor_inode);
1372 if (ret1 < 0)
1373 break;
1374
1396 /* Decrease buffer counter */ 1375 /* Decrease buffer counter */
1397 if (holecheck_path) 1376 if (holecheck_path)
1398 ext4_ext_drop_refs(holecheck_path); 1377 ext4_ext_drop_refs(holecheck_path);
@@ -1414,6 +1393,11 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
1414 1393
1415 } 1394 }
1416out: 1395out:
1396 if (*moved_len) {
1397 ext4_discard_preallocations(orig_inode);
1398 ext4_discard_preallocations(donor_inode);
1399 }
1400
1417 if (orig_path) { 1401 if (orig_path) {
1418 ext4_ext_drop_refs(orig_path); 1402 ext4_ext_drop_refs(orig_path);
1419 kfree(orig_path); 1403 kfree(orig_path);
@@ -1422,7 +1406,7 @@ out:
1422 ext4_ext_drop_refs(holecheck_path); 1406 ext4_ext_drop_refs(holecheck_path);
1423 kfree(holecheck_path); 1407 kfree(holecheck_path);
1424 } 1408 }
1425 1409 double_up_write_data_sem(orig_inode, donor_inode);
1426 ret2 = mext_inode_double_unlock(orig_inode, donor_inode); 1410 ret2 = mext_inode_double_unlock(orig_inode, donor_inode);
1427 1411
1428 if (ret1) 1412 if (ret1)
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 6d2c1b897fc7..17a17e10dd60 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1292,9 +1292,6 @@ errout:
1292 * add_dirent_to_buf will attempt search the directory block for 1292 * add_dirent_to_buf will attempt search the directory block for
1293 * space. It will return -ENOSPC if no space is available, and -EIO 1293 * space. It will return -ENOSPC if no space is available, and -EIO
1294 * and -EEXIST if directory entry already exists. 1294 * and -EEXIST if directory entry already exists.
1295 *
1296 * NOTE! bh is NOT released in the case where ENOSPC is returned. In
1297 * all other cases bh is released.
1298 */ 1295 */
1299static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, 1296static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1300 struct inode *inode, struct ext4_dir_entry_2 *de, 1297 struct inode *inode, struct ext4_dir_entry_2 *de,
@@ -1315,14 +1312,10 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1315 top = bh->b_data + blocksize - reclen; 1312 top = bh->b_data + blocksize - reclen;
1316 while ((char *) de <= top) { 1313 while ((char *) de <= top) {
1317 if (!ext4_check_dir_entry("ext4_add_entry", dir, de, 1314 if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
1318 bh, offset)) { 1315 bh, offset))
1319 brelse(bh);
1320 return -EIO; 1316 return -EIO;
1321 } 1317 if (ext4_match(namelen, name, de))
1322 if (ext4_match(namelen, name, de)) {
1323 brelse(bh);
1324 return -EEXIST; 1318 return -EEXIST;
1325 }
1326 nlen = EXT4_DIR_REC_LEN(de->name_len); 1319 nlen = EXT4_DIR_REC_LEN(de->name_len);
1327 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize); 1320 rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
1328 if ((de->inode? rlen - nlen: rlen) >= reclen) 1321 if ((de->inode? rlen - nlen: rlen) >= reclen)
@@ -1337,7 +1330,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1337 err = ext4_journal_get_write_access(handle, bh); 1330 err = ext4_journal_get_write_access(handle, bh);
1338 if (err) { 1331 if (err) {
1339 ext4_std_error(dir->i_sb, err); 1332 ext4_std_error(dir->i_sb, err);
1340 brelse(bh);
1341 return err; 1333 return err;
1342 } 1334 }
1343 1335
@@ -1377,7 +1369,6 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
1377 err = ext4_handle_dirty_metadata(handle, dir, bh); 1369 err = ext4_handle_dirty_metadata(handle, dir, bh);
1378 if (err) 1370 if (err)
1379 ext4_std_error(dir->i_sb, err); 1371 ext4_std_error(dir->i_sb, err);
1380 brelse(bh);
1381 return 0; 1372 return 0;
1382} 1373}
1383 1374
@@ -1471,7 +1462,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
1471 if (!(de)) 1462 if (!(de))
1472 return retval; 1463 return retval;
1473 1464
1474 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1465 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1466 brelse(bh);
1467 return retval;
1475} 1468}
1476 1469
1477/* 1470/*
@@ -1514,8 +1507,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1514 if(!bh) 1507 if(!bh)
1515 return retval; 1508 return retval;
1516 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1509 retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1517 if (retval != -ENOSPC) 1510 if (retval != -ENOSPC) {
1511 brelse(bh);
1518 return retval; 1512 return retval;
1513 }
1519 1514
1520 if (blocks == 1 && !dx_fallback && 1515 if (blocks == 1 && !dx_fallback &&
1521 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) 1516 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX))
@@ -1528,7 +1523,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
1528 de = (struct ext4_dir_entry_2 *) bh->b_data; 1523 de = (struct ext4_dir_entry_2 *) bh->b_data;
1529 de->inode = 0; 1524 de->inode = 0;
1530 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); 1525 de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
1531 return add_dirent_to_buf(handle, dentry, inode, de, bh); 1526 retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
1527 brelse(bh);
1528 return retval;
1532} 1529}
1533 1530
1534/* 1531/*
@@ -1561,10 +1558,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1561 goto journal_error; 1558 goto journal_error;
1562 1559
1563 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); 1560 err = add_dirent_to_buf(handle, dentry, inode, NULL, bh);
1564 if (err != -ENOSPC) { 1561 if (err != -ENOSPC)
1565 bh = NULL;
1566 goto cleanup; 1562 goto cleanup;
1567 }
1568 1563
1569 /* Block full, should compress but for now just split */ 1564 /* Block full, should compress but for now just split */
1570 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", 1565 dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
@@ -1657,7 +1652,6 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
1657 if (!de) 1652 if (!de)
1658 goto cleanup; 1653 goto cleanup;
1659 err = add_dirent_to_buf(handle, dentry, inode, de, bh); 1654 err = add_dirent_to_buf(handle, dentry, inode, de, bh);
1660 bh = NULL;
1661 goto cleanup; 1655 goto cleanup;
1662 1656
1663journal_error: 1657journal_error:
@@ -1775,7 +1769,7 @@ static int ext4_create(struct inode *dir, struct dentry *dentry, int mode,
1775retry: 1769retry:
1776 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1770 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1777 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1771 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1778 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1772 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1779 if (IS_ERR(handle)) 1773 if (IS_ERR(handle))
1780 return PTR_ERR(handle); 1774 return PTR_ERR(handle);
1781 1775
@@ -1809,7 +1803,7 @@ static int ext4_mknod(struct inode *dir, struct dentry *dentry,
1809retry: 1803retry:
1810 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1804 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1811 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1805 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1812 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1806 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1813 if (IS_ERR(handle)) 1807 if (IS_ERR(handle))
1814 return PTR_ERR(handle); 1808 return PTR_ERR(handle);
1815 1809
@@ -1846,7 +1840,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1846retry: 1840retry:
1847 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 1841 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
1848 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 + 1842 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3 +
1849 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 1843 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
1850 if (IS_ERR(handle)) 1844 if (IS_ERR(handle))
1851 return PTR_ERR(handle); 1845 return PTR_ERR(handle);
1852 1846
@@ -2259,7 +2253,7 @@ static int ext4_symlink(struct inode *dir,
2259retry: 2253retry:
2260 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + 2254 handle = ext4_journal_start(dir, EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
2261 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 + 2255 EXT4_INDEX_EXTRA_TRANS_BLOCKS + 5 +
2262 2*EXT4_QUOTA_INIT_BLOCKS(dir->i_sb)); 2256 EXT4_MAXQUOTAS_INIT_BLOCKS(dir->i_sb));
2263 if (IS_ERR(handle)) 2257 if (IS_ERR(handle))
2264 return PTR_ERR(handle); 2258 return PTR_ERR(handle);
2265 2259
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 3cfc343c41b5..3b2c5541d8a6 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -247,7 +247,7 @@ static int setup_new_group_blocks(struct super_block *sb,
247 goto exit_bh; 247 goto exit_bh;
248 248
249 if (IS_ERR(gdb = bclean(handle, sb, block))) { 249 if (IS_ERR(gdb = bclean(handle, sb, block))) {
250 err = PTR_ERR(bh); 250 err = PTR_ERR(gdb);
251 goto exit_bh; 251 goto exit_bh;
252 } 252 }
253 ext4_handle_dirty_metadata(handle, NULL, gdb); 253 ext4_handle_dirty_metadata(handle, NULL, gdb);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index d4ca92aab514..8b58a144c31b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -603,10 +603,6 @@ static void ext4_put_super(struct super_block *sb)
603 if (sb->s_dirt) 603 if (sb->s_dirt)
604 ext4_commit_super(sb, 1); 604 ext4_commit_super(sb, 1);
605 605
606 ext4_release_system_zone(sb);
607 ext4_mb_release(sb);
608 ext4_ext_release(sb);
609 ext4_xattr_put_super(sb);
610 if (sbi->s_journal) { 606 if (sbi->s_journal) {
611 err = jbd2_journal_destroy(sbi->s_journal); 607 err = jbd2_journal_destroy(sbi->s_journal);
612 sbi->s_journal = NULL; 608 sbi->s_journal = NULL;
@@ -614,6 +610,12 @@ static void ext4_put_super(struct super_block *sb)
614 ext4_abort(sb, __func__, 610 ext4_abort(sb, __func__,
615 "Couldn't clean up the journal"); 611 "Couldn't clean up the journal");
616 } 612 }
613
614 ext4_release_system_zone(sb);
615 ext4_mb_release(sb);
616 ext4_ext_release(sb);
617 ext4_xattr_put_super(sb);
618
617 if (!(sb->s_flags & MS_RDONLY)) { 619 if (!(sb->s_flags & MS_RDONLY)) {
618 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); 620 EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
619 es->s_state = cpu_to_le16(sbi->s_mount_state); 621 es->s_state = cpu_to_le16(sbi->s_mount_state);
@@ -704,6 +706,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
704 spin_lock_init(&(ei->i_block_reservation_lock)); 706 spin_lock_init(&(ei->i_block_reservation_lock));
705 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); 707 INIT_LIST_HEAD(&ei->i_aio_dio_complete_list);
706 ei->cur_aio_dio = NULL; 708 ei->cur_aio_dio = NULL;
709 ei->i_sync_tid = 0;
710 ei->i_datasync_tid = 0;
707 711
708 return &ei->vfs_inode; 712 return &ei->vfs_inode;
709} 713}
@@ -899,6 +903,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
899 if (test_opt(sb, NO_AUTO_DA_ALLOC)) 903 if (test_opt(sb, NO_AUTO_DA_ALLOC))
900 seq_puts(seq, ",noauto_da_alloc"); 904 seq_puts(seq, ",noauto_da_alloc");
901 905
906 if (test_opt(sb, DISCARD))
907 seq_puts(seq, ",discard");
908
909 if (test_opt(sb, NOLOAD))
910 seq_puts(seq, ",norecovery");
911
902 ext4_show_quota_options(seq, sb); 912 ext4_show_quota_options(seq, sb);
903 913
904 return 0; 914 return 0;
@@ -1079,7 +1089,8 @@ enum {
1079 Opt_usrquota, Opt_grpquota, Opt_i_version, 1089 Opt_usrquota, Opt_grpquota, Opt_i_version,
1080 Opt_stripe, Opt_delalloc, Opt_nodelalloc, 1090 Opt_stripe, Opt_delalloc, Opt_nodelalloc,
1081 Opt_block_validity, Opt_noblock_validity, 1091 Opt_block_validity, Opt_noblock_validity,
1082 Opt_inode_readahead_blks, Opt_journal_ioprio 1092 Opt_inode_readahead_blks, Opt_journal_ioprio,
1093 Opt_discard, Opt_nodiscard,
1083}; 1094};
1084 1095
1085static const match_table_t tokens = { 1096static const match_table_t tokens = {
@@ -1104,6 +1115,7 @@ static const match_table_t tokens = {
1104 {Opt_acl, "acl"}, 1115 {Opt_acl, "acl"},
1105 {Opt_noacl, "noacl"}, 1116 {Opt_noacl, "noacl"},
1106 {Opt_noload, "noload"}, 1117 {Opt_noload, "noload"},
1118 {Opt_noload, "norecovery"},
1107 {Opt_nobh, "nobh"}, 1119 {Opt_nobh, "nobh"},
1108 {Opt_bh, "bh"}, 1120 {Opt_bh, "bh"},
1109 {Opt_commit, "commit=%u"}, 1121 {Opt_commit, "commit=%u"},
@@ -1144,6 +1156,8 @@ static const match_table_t tokens = {
1144 {Opt_auto_da_alloc, "auto_da_alloc=%u"}, 1156 {Opt_auto_da_alloc, "auto_da_alloc=%u"},
1145 {Opt_auto_da_alloc, "auto_da_alloc"}, 1157 {Opt_auto_da_alloc, "auto_da_alloc"},
1146 {Opt_noauto_da_alloc, "noauto_da_alloc"}, 1158 {Opt_noauto_da_alloc, "noauto_da_alloc"},
1159 {Opt_discard, "discard"},
1160 {Opt_nodiscard, "nodiscard"},
1147 {Opt_err, NULL}, 1161 {Opt_err, NULL},
1148}; 1162};
1149 1163
@@ -1565,6 +1579,12 @@ set_qf_format:
1565 else 1579 else
1566 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC); 1580 set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
1567 break; 1581 break;
1582 case Opt_discard:
1583 set_opt(sbi->s_mount_opt, DISCARD);
1584 break;
1585 case Opt_nodiscard:
1586 clear_opt(sbi->s_mount_opt, DISCARD);
1587 break;
1568 default: 1588 default:
1569 ext4_msg(sb, KERN_ERR, 1589 ext4_msg(sb, KERN_ERR,
1570 "Unrecognized mount option \"%s\" " 1590 "Unrecognized mount option \"%s\" "
@@ -1673,14 +1693,14 @@ static int ext4_fill_flex_info(struct super_block *sb)
1673 size_t size; 1693 size_t size;
1674 int i; 1694 int i;
1675 1695
1676 if (!sbi->s_es->s_log_groups_per_flex) { 1696 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1697 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1698
1699 if (groups_per_flex < 2) {
1677 sbi->s_log_groups_per_flex = 0; 1700 sbi->s_log_groups_per_flex = 0;
1678 return 1; 1701 return 1;
1679 } 1702 }
1680 1703
1681 sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex;
1682 groups_per_flex = 1 << sbi->s_log_groups_per_flex;
1683
1684 /* We allocate both existing and potentially added groups */ 1704 /* We allocate both existing and potentially added groups */
1685 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + 1705 flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
1686 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << 1706 ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
@@ -2721,26 +2741,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
2721 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { 2741 EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) {
2722 if (ext4_load_journal(sb, es, journal_devnum)) 2742 if (ext4_load_journal(sb, es, journal_devnum))
2723 goto failed_mount3; 2743 goto failed_mount3;
2724 if (!(sb->s_flags & MS_RDONLY) &&
2725 EXT4_SB(sb)->s_journal->j_failed_commit) {
2726 ext4_msg(sb, KERN_CRIT, "error: "
2727 "ext4_fill_super: Journal transaction "
2728 "%u is corrupt",
2729 EXT4_SB(sb)->s_journal->j_failed_commit);
2730 if (test_opt(sb, ERRORS_RO)) {
2731 ext4_msg(sb, KERN_CRIT,
2732 "Mounting filesystem read-only");
2733 sb->s_flags |= MS_RDONLY;
2734 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2735 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2736 }
2737 if (test_opt(sb, ERRORS_PANIC)) {
2738 EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
2739 es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
2740 ext4_commit_super(sb, 1);
2741 goto failed_mount4;
2742 }
2743 }
2744 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && 2744 } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
2745 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { 2745 EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
2746 ext4_msg(sb, KERN_ERR, "required journal recovery " 2746 ext4_msg(sb, KERN_ERR, "required journal recovery "
@@ -3668,13 +3668,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; 3668 buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last;
3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - 3669 buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) -
3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); 3670 percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter);
3671 ext4_free_blocks_count_set(es, buf->f_bfree);
3672 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); 3671 buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
3673 if (buf->f_bfree < ext4_r_blocks_count(es)) 3672 if (buf->f_bfree < ext4_r_blocks_count(es))
3674 buf->f_bavail = 0; 3673 buf->f_bavail = 0;
3675 buf->f_files = le32_to_cpu(es->s_inodes_count); 3674 buf->f_files = le32_to_cpu(es->s_inodes_count);
3676 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter); 3675 buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
3677 es->s_free_inodes_count = cpu_to_le32(buf->f_ffree);
3678 buf->f_namelen = EXT4_NAME_LEN; 3676 buf->f_namelen = EXT4_NAME_LEN;
3679 fsid = le64_to_cpup((void *)es->s_uuid) ^ 3677 fsid = le64_to_cpup((void *)es->s_uuid) ^
3680 le64_to_cpup((void *)es->s_uuid + sizeof(u64)); 3678 le64_to_cpup((void *)es->s_uuid + sizeof(u64));
@@ -3966,6 +3964,58 @@ static int ext4_get_sb(struct file_system_type *fs_type, int flags,
3966 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt); 3964 return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
3967} 3965}
3968 3966
3967#if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
3968static struct file_system_type ext2_fs_type = {
3969 .owner = THIS_MODULE,
3970 .name = "ext2",
3971 .get_sb = ext4_get_sb,
3972 .kill_sb = kill_block_super,
3973 .fs_flags = FS_REQUIRES_DEV,
3974};
3975
3976static inline void register_as_ext2(void)
3977{
3978 int err = register_filesystem(&ext2_fs_type);
3979 if (err)
3980 printk(KERN_WARNING
3981 "EXT4-fs: Unable to register as ext2 (%d)\n", err);
3982}
3983
3984static inline void unregister_as_ext2(void)
3985{
3986 unregister_filesystem(&ext2_fs_type);
3987}
3988#else
3989static inline void register_as_ext2(void) { }
3990static inline void unregister_as_ext2(void) { }
3991#endif
3992
3993#if !defined(CONFIG_EXT3_FS) && !defined(CONFIG_EXT3_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
3994static struct file_system_type ext3_fs_type = {
3995 .owner = THIS_MODULE,
3996 .name = "ext3",
3997 .get_sb = ext4_get_sb,
3998 .kill_sb = kill_block_super,
3999 .fs_flags = FS_REQUIRES_DEV,
4000};
4001
4002static inline void register_as_ext3(void)
4003{
4004 int err = register_filesystem(&ext3_fs_type);
4005 if (err)
4006 printk(KERN_WARNING
4007 "EXT4-fs: Unable to register as ext3 (%d)\n", err);
4008}
4009
4010static inline void unregister_as_ext3(void)
4011{
4012 unregister_filesystem(&ext3_fs_type);
4013}
4014#else
4015static inline void register_as_ext3(void) { }
4016static inline void unregister_as_ext3(void) { }
4017#endif
4018
3969static struct file_system_type ext4_fs_type = { 4019static struct file_system_type ext4_fs_type = {
3970 .owner = THIS_MODULE, 4020 .owner = THIS_MODULE,
3971 .name = "ext4", 4021 .name = "ext4",
@@ -3995,11 +4045,15 @@ static int __init init_ext4_fs(void)
3995 err = init_inodecache(); 4045 err = init_inodecache();
3996 if (err) 4046 if (err)
3997 goto out1; 4047 goto out1;
4048 register_as_ext2();
4049 register_as_ext3();
3998 err = register_filesystem(&ext4_fs_type); 4050 err = register_filesystem(&ext4_fs_type);
3999 if (err) 4051 if (err)
4000 goto out; 4052 goto out;
4001 return 0; 4053 return 0;
4002out: 4054out:
4055 unregister_as_ext2();
4056 unregister_as_ext3();
4003 destroy_inodecache(); 4057 destroy_inodecache();
4004out1: 4058out1:
4005 exit_ext4_xattr(); 4059 exit_ext4_xattr();
@@ -4015,6 +4069,8 @@ out4:
4015 4069
4016static void __exit exit_ext4_fs(void) 4070static void __exit exit_ext4_fs(void)
4017{ 4071{
4072 unregister_as_ext2();
4073 unregister_as_ext3();
4018 unregister_filesystem(&ext4_fs_type); 4074 unregister_filesystem(&ext4_fs_type);
4019 destroy_inodecache(); 4075 destroy_inodecache();
4020 exit_ext4_xattr(); 4076 exit_ext4_xattr();
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index fed5b01d7a8d..910bf9a59cb3 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -482,9 +482,10 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode,
482 ea_bdebug(bh, "refcount now=0; freeing"); 482 ea_bdebug(bh, "refcount now=0; freeing");
483 if (ce) 483 if (ce)
484 mb_cache_entry_free(ce); 484 mb_cache_entry_free(ce);
485 ext4_free_blocks(handle, inode, bh->b_blocknr, 1, 1);
486 get_bh(bh); 485 get_bh(bh);
487 ext4_forget(handle, 1, inode, bh, bh->b_blocknr); 486 ext4_free_blocks(handle, inode, bh, 0, 1,
487 EXT4_FREE_BLOCKS_METADATA |
488 EXT4_FREE_BLOCKS_FORGET);
488 } else { 489 } else {
489 le32_add_cpu(&BHDR(bh)->h_refcount, -1); 490 le32_add_cpu(&BHDR(bh)->h_refcount, -1);
490 error = ext4_handle_dirty_metadata(handle, inode, bh); 491 error = ext4_handle_dirty_metadata(handle, inode, bh);
@@ -832,7 +833,8 @@ inserted:
832 new_bh = sb_getblk(sb, block); 833 new_bh = sb_getblk(sb, block);
833 if (!new_bh) { 834 if (!new_bh) {
834getblk_failed: 835getblk_failed:
835 ext4_free_blocks(handle, inode, block, 1, 1); 836 ext4_free_blocks(handle, inode, 0, block, 1,
837 EXT4_FREE_BLOCKS_METADATA);
836 error = -EIO; 838 error = -EIO;
837 goto cleanup; 839 goto cleanup;
838 } 840 }
@@ -988,6 +990,10 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
988 if (error) 990 if (error)
989 goto cleanup; 991 goto cleanup;
990 992
993 error = ext4_journal_get_write_access(handle, is.iloc.bh);
994 if (error)
995 goto cleanup;
996
991 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) { 997 if (EXT4_I(inode)->i_state & EXT4_STATE_NEW) {
992 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc); 998 struct ext4_inode *raw_inode = ext4_raw_inode(&is.iloc);
993 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size); 999 memset(raw_inode, 0, EXT4_SB(inode->i_sb)->s_inode_size);
@@ -1013,9 +1019,6 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index,
1013 if (flags & XATTR_CREATE) 1019 if (flags & XATTR_CREATE)
1014 goto cleanup; 1020 goto cleanup;
1015 } 1021 }
1016 error = ext4_journal_get_write_access(handle, is.iloc.bh);
1017 if (error)
1018 goto cleanup;
1019 if (!value) { 1022 if (!value) {
1020 if (!is.s.not_found) 1023 if (!is.s.not_found)
1021 error = ext4_xattr_ibody_set(handle, inode, &i, &is); 1024 error = ext4_xattr_ibody_set(handle, inode, &i, &is);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index d4cfd6d2779e..8896c1d4febe 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -636,6 +636,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
636 JBUFFER_TRACE(jh, "ph3: write metadata"); 636 JBUFFER_TRACE(jh, "ph3: write metadata");
637 flags = jbd2_journal_write_metadata_buffer(commit_transaction, 637 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
638 jh, &new_jh, blocknr); 638 jh, &new_jh, blocknr);
639 if (flags < 0) {
640 jbd2_journal_abort(journal, flags);
641 continue;
642 }
639 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state); 643 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
640 wbuf[bufs++] = jh2bh(new_jh); 644 wbuf[bufs++] = jh2bh(new_jh);
641 645
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index fed85388ee86..b7ca3a92a4db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -78,6 +78,7 @@ EXPORT_SYMBOL(jbd2_journal_errno);
78EXPORT_SYMBOL(jbd2_journal_ack_err); 78EXPORT_SYMBOL(jbd2_journal_ack_err);
79EXPORT_SYMBOL(jbd2_journal_clear_err); 79EXPORT_SYMBOL(jbd2_journal_clear_err);
80EXPORT_SYMBOL(jbd2_log_wait_commit); 80EXPORT_SYMBOL(jbd2_log_wait_commit);
81EXPORT_SYMBOL(jbd2_log_start_commit);
81EXPORT_SYMBOL(jbd2_journal_start_commit); 82EXPORT_SYMBOL(jbd2_journal_start_commit);
82EXPORT_SYMBOL(jbd2_journal_force_commit_nested); 83EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
83EXPORT_SYMBOL(jbd2_journal_wipe); 84EXPORT_SYMBOL(jbd2_journal_wipe);
@@ -358,6 +359,10 @@ repeat:
358 359
359 jbd_unlock_bh_state(bh_in); 360 jbd_unlock_bh_state(bh_in);
360 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS); 361 tmp = jbd2_alloc(bh_in->b_size, GFP_NOFS);
362 if (!tmp) {
363 jbd2_journal_put_journal_head(new_jh);
364 return -ENOMEM;
365 }
361 jbd_lock_bh_state(bh_in); 366 jbd_lock_bh_state(bh_in);
362 if (jh_in->b_frozen_data) { 367 if (jh_in->b_frozen_data) {
363 jbd2_free(tmp, bh_in->b_size); 368 jbd2_free(tmp, bh_in->b_size);
@@ -1248,6 +1253,13 @@ int jbd2_journal_load(journal_t *journal)
1248 if (jbd2_journal_recover(journal)) 1253 if (jbd2_journal_recover(journal))
1249 goto recovery_error; 1254 goto recovery_error;
1250 1255
1256 if (journal->j_failed_commit) {
1257 printk(KERN_ERR "JBD2: journal transaction %u on %s "
1258 "is corrupt.\n", journal->j_failed_commit,
1259 journal->j_devname);
1260 return -EIO;
1261 }
1262
1251 /* OK, we've finished with the dynamic journal bits: 1263 /* OK, we've finished with the dynamic journal bits:
1252 * reinitialise the dynamic contents of the superblock in memory 1264 * reinitialise the dynamic contents of the superblock in memory
1253 * and reset them on disk. */ 1265 * and reset them on disk. */
diff --git a/fs/jffs2/compr.c b/fs/jffs2/compr.c
index f25e70c1b51c..f0294410868d 100644
--- a/fs/jffs2/compr.c
+++ b/fs/jffs2/compr.c
@@ -177,7 +177,7 @@ uint16_t jffs2_compress(struct jffs2_sb_info *c, struct jffs2_inode_info *f,
177 spin_unlock(&jffs2_compressor_list_lock); 177 spin_unlock(&jffs2_compressor_list_lock);
178 break; 178 break;
179 default: 179 default:
180 printk(KERN_ERR "JFFS2: unknow compression mode.\n"); 180 printk(KERN_ERR "JFFS2: unknown compression mode.\n");
181 } 181 }
182 out: 182 out:
183 if (ret == JFFS2_COMPR_NONE) { 183 if (ret == JFFS2_COMPR_NONE) {
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index 1a80301004b8..378991cfe40f 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -931,7 +931,7 @@ static inline int read_unknown(struct jffs2_sb_info *c, struct jffs2_raw_node_re
931 * Helper function for jffs2_get_inode_nodes(). 931 * Helper function for jffs2_get_inode_nodes().
932 * The function detects whether more data should be read and reads it if yes. 932 * The function detects whether more data should be read and reads it if yes.
933 * 933 *
934 * Returns: 0 on succes; 934 * Returns: 0 on success;
935 * negative error code on failure. 935 * negative error code on failure.
936 */ 936 */
937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref, 937static int read_more(struct jffs2_sb_info *c, struct jffs2_raw_node_ref *ref,
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 082e844ab2db..4b107881acd5 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -31,7 +31,7 @@
31 * is used to release xattr name/value pair and detach from c->xattrindex. 31 * is used to release xattr name/value pair and detach from c->xattrindex.
32 * reclaim_xattr_datum(c) 32 * reclaim_xattr_datum(c)
33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when 33 * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when
34 * memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold 34 * memory usage by cache is over c->xdatum_mem_threshold. Currently, this threshold
35 * is hard coded as 32KiB. 35 * is hard coded as 32KiB.
36 * do_verify_xattr_datum(c, xd) 36 * do_verify_xattr_datum(c, xd)
37 * is used to load the xdatum information without name/value pair from the medium. 37 * is used to load the xdatum information without name/value pair from the medium.
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 2bc7d8aa5740..d9b031cf69f5 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -755,7 +755,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
755 * allocation group. 755 * allocation group.
756 */ 756 */
757 if ((blkno & (bmp->db_agsize - 1)) == 0) 757 if ((blkno & (bmp->db_agsize - 1)) == 0)
758 /* check if the AG is currenly being written to. 758 /* check if the AG is currently being written to.
759 * if so, call dbNextAG() to find a non-busy 759 * if so, call dbNextAG() to find a non-busy
760 * AG with sufficient free space. 760 * AG with sufficient free space.
761 */ 761 */
@@ -3337,7 +3337,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks)
3337 for (i = 0, n = 0; i < agno; n++) { 3337 for (i = 0, n = 0; i < agno; n++) {
3338 bmp->db_agfree[n] = 0; /* init collection point */ 3338 bmp->db_agfree[n] = 0; /* init collection point */
3339 3339
3340 /* coalesce cotiguous k AGs; */ 3340 /* coalesce contiguous k AGs; */
3341 for (j = 0; j < k && i < agno; j++, i++) { 3341 for (j = 0; j < k && i < agno; j++, i++) {
3342 /* merge AGi to AGn */ 3342 /* merge AGi to AGn */
3343 bmp->db_agfree[n] += bmp->db_agfree[i]; 3343 bmp->db_agfree[n] += bmp->db_agfree[i];
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 0d58caf4a6e1..ec8f45f12e05 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -835,7 +835,7 @@ static int ncp_ioctl_need_write(unsigned int cmd)
835 case NCP_IOC_SETROOT: 835 case NCP_IOC_SETROOT:
836 return 0; 836 return 0;
837 default: 837 default:
838 /* unkown IOCTL command, assume write */ 838 /* unknown IOCTL command, assume write */
839 return 1; 839 return 1;
840 } 840 }
841} 841}
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1d1d1a2765dd..5ef5f365a5c8 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -741,10 +741,6 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
741 741
742 /* create/update an inode mark */ 742 /* create/update an inode mark */
743 ret = inotify_update_watch(group, inode, mask); 743 ret = inotify_update_watch(group, inode, mask);
744 if (unlikely(ret))
745 goto path_put_and_out;
746
747path_put_and_out:
748 path_put(&path); 744 path_put(&path);
749fput_and_out: 745fput_and_out:
750 fput_light(filp, fput_needed); 746 fput_light(filp, fput_needed);
diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c
index 9669541d0119..08f7530e9341 100644
--- a/fs/ntfs/compress.c
+++ b/fs/ntfs/compress.c
@@ -927,7 +927,7 @@ lock_retry_remap:
927 return 0; 927 return 0;
928 928
929 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ? 929 ntfs_debug("Failed. Returning error code %s.", err == -EOVERFLOW ?
930 "EOVERFLOW" : (!err ? "EIO" : "unkown error")); 930 "EOVERFLOW" : (!err ? "EIO" : "unknown error"));
931 return err < 0 ? err : -EIO; 931 return err < 0 ? err : -EIO;
932 932
933read_err: 933read_err:
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 663c0e341f8b..43179ddd336f 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -399,7 +399,7 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
399 * @cached_page: allocated but as yet unused page 399 * @cached_page: allocated but as yet unused page
400 * @lru_pvec: lru-buffering pagevec of caller 400 * @lru_pvec: lru-buffering pagevec of caller
401 * 401 *
402 * Obtain @nr_pages locked page cache pages from the mapping @maping and 402 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
403 * starting at index @index. 403 * starting at index @index.
404 * 404 *
405 * If a page is newly created, increment its refcount and add it to the 405 * If a page is newly created, increment its refcount and add it to the
@@ -1281,7 +1281,7 @@ rl_not_mapped_enoent:
1281 1281
1282/* 1282/*
1283 * Copy as much as we can into the pages and return the number of bytes which 1283 * Copy as much as we can into the pages and return the number of bytes which
1284 * were sucessfully copied. If a fault is encountered then clear the pages 1284 * were successfully copied. If a fault is encountered then clear the pages
1285 * out to (ofs + bytes) and return the number of bytes which were copied. 1285 * out to (ofs + bytes) and return the number of bytes which were copied.
1286 */ 1286 */
1287static inline size_t ntfs_copy_from_user(struct page **pages, 1287static inline size_t ntfs_copy_from_user(struct page **pages,
diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c
index 89b02985c054..4dadcdf3d451 100644
--- a/fs/ntfs/logfile.c
+++ b/fs/ntfs/logfile.c
@@ -338,7 +338,7 @@ err_out:
338 * copy of the complete multi sector transfer deprotected page. On failure, 338 * copy of the complete multi sector transfer deprotected page. On failure,
339 * *@wrp is undefined. 339 * *@wrp is undefined.
340 * 340 *
341 * Similarly, if @lsn is not NULL, on succes *@lsn will be set to the current 341 * Similarly, if @lsn is not NULL, on success *@lsn will be set to the current
342 * logfile lsn according to this restart page. On failure, *@lsn is undefined. 342 * logfile lsn according to this restart page. On failure, *@lsn is undefined.
343 * 343 *
344 * The following error codes are defined: 344 * The following error codes are defined:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 38a42f5d59ff..7c7198a5bc90 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -2398,7 +2398,7 @@ static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos)
2398 * 2398 *
2399 * The array is assumed to be large enough to hold an entire path (tree depth). 2399 * The array is assumed to be large enough to hold an entire path (tree depth).
2400 * 2400 *
2401 * Upon succesful return from this function: 2401 * Upon successful return from this function:
2402 * 2402 *
2403 * - The 'right_path' array will contain a path to the leaf block 2403 * - The 'right_path' array will contain a path to the leaf block
2404 * whose range contains e_cpos. 2404 * whose range contains e_cpos.
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index a1163b8b417c..b7428c5d0d3b 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -47,7 +47,7 @@
47 * Calculate the bit offset in the hamming code buffer based on the bit's 47 * Calculate the bit offset in the hamming code buffer based on the bit's
48 * offset in the data buffer. Since the hamming code reserves all 48 * offset in the data buffer. Since the hamming code reserves all
49 * power-of-two bits for parity, the data bit number and the code bit 49 * power-of-two bits for parity, the data bit number and the code bit
50 * number are offest by all the parity bits beforehand. 50 * number are offset by all the parity bits beforehand.
51 * 51 *
52 * Recall that bit numbers in hamming code are 1-based. This function 52 * Recall that bit numbers in hamming code are 1-based. This function
53 * takes the 0-based data bit from the caller. 53 * takes the 0-based data bit from the caller.
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 83bcaf266b35..03ccf9a7b1f4 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2586,7 +2586,7 @@ fail:
2586 * is complete everywhere. if the target dies while this is 2586 * is complete everywhere. if the target dies while this is
2587 * going on, some nodes could potentially see the target as the 2587 * going on, some nodes could potentially see the target as the
2588 * master, so it is important that my recovery finds the migration 2588 * master, so it is important that my recovery finds the migration
2589 * mle and sets the master to UNKNONWN. */ 2589 * mle and sets the master to UNKNOWN. */
2590 2590
2591 2591
2592 /* wait for new node to assert master */ 2592 /* wait for new node to assert master */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 0d38d67194cb..c5e4a49e3a12 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -1855,7 +1855,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1855 * outstanding lock request, so a cancel convert is 1855 * outstanding lock request, so a cancel convert is
1856 * required. We intentionally overwrite 'ret' - if the 1856 * required. We intentionally overwrite 'ret' - if the
1857 * cancel fails and the lock was granted, it's easier 1857 * cancel fails and the lock was granted, it's easier
1858 * to just bubble sucess back up to the user. 1858 * to just bubble success back up to the user.
1859 */ 1859 */
1860 ret = ocfs2_flock_handle_signal(lockres, level); 1860 ret = ocfs2_flock_handle_signal(lockres, level);
1861 } else if (!ret && (level > lockres->l_level)) { 1861 } else if (!ret && (level > lockres->l_level)) {
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 54c16b66327e..bf34c491ae96 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -659,7 +659,7 @@ static int __ocfs2_journal_access(handle_t *handle,
659 659
660 default: 660 default:
661 status = -EINVAL; 661 status = -EINVAL;
662 mlog(ML_ERROR, "Uknown access type!\n"); 662 mlog(ML_ERROR, "Unknown access type!\n");
663 } 663 }
664 if (!status && ocfs2_meta_ecc(osb) && triggers) 664 if (!status && ocfs2_meta_ecc(osb) && triggers)
665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers); 665 jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 3a0df7a1b810..30967e3f5e43 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2431,7 +2431,7 @@ out:
2431 * we gonna touch and whether we need to create new blocks. 2431 * we gonna touch and whether we need to create new blocks.
2432 * 2432 *
2433 * Normally the refcount blocks store these refcount should be 2433 * Normally the refcount blocks store these refcount should be
2434 * continguous also, so that we can get the number easily. 2434 * contiguous also, so that we can get the number easily.
2435 * As for meta_ac, we will at most add split 2 refcount record and 2435 * As for meta_ac, we will at most add split 2 refcount record and
2436 * 2 more refcount block, so just check it in a rough way. 2436 * 2 more refcount block, so just check it in a rough way.
2437 * 2437 *
diff --git a/fs/omfs/bitmap.c b/fs/omfs/bitmap.c
index e1c0ec0ae989..082234581d05 100644
--- a/fs/omfs/bitmap.c
+++ b/fs/omfs/bitmap.c
@@ -85,7 +85,7 @@ out:
85} 85}
86 86
87/* 87/*
88 * Tries to allocate exactly one block. Returns true if sucessful. 88 * Tries to allocate exactly one block. Returns true if successful.
89 */ 89 */
90int omfs_allocate_block(struct super_block *sb, u64 block) 90int omfs_allocate_block(struct super_block *sb, u64 block)
91{ 91{
diff --git a/fs/qnx4/bitmap.c b/fs/qnx4/bitmap.c
index 0afba069d567..32f5d131a644 100644
--- a/fs/qnx4/bitmap.c
+++ b/fs/qnx4/bitmap.c
@@ -67,7 +67,7 @@ unsigned long qnx4_count_free_blocks(struct super_block *sb)
67 67
68 while (total < size) { 68 while (total < size) {
69 if ((bh = sb_bread(sb, start + offset)) == NULL) { 69 if ((bh = sb_bread(sb, start + offset)) == NULL) {
70 printk("qnx4: I/O error in counting free blocks\n"); 70 printk(KERN_ERR "qnx4: I/O error in counting free blocks\n");
71 break; 71 break;
72 } 72 }
73 count_bits(bh->b_data, size - total, &total_free); 73 count_bits(bh->b_data, size - total, &total_free);
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 86cc39cb1398..6f30c3d5bcbf 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -26,8 +26,8 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
26 int ix, ino; 26 int ix, ino;
27 int size; 27 int size;
28 28
29 QNX4DEBUG(("qnx4_readdir:i_size = %ld\n", (long) inode->i_size)); 29 QNX4DEBUG((KERN_INFO "qnx4_readdir:i_size = %ld\n", (long) inode->i_size));
30 QNX4DEBUG(("filp->f_pos = %ld\n", (long) filp->f_pos)); 30 QNX4DEBUG((KERN_INFO "filp->f_pos = %ld\n", (long) filp->f_pos));
31 31
32 lock_kernel(); 32 lock_kernel();
33 33
@@ -50,7 +50,7 @@ static int qnx4_readdir(struct file *filp, void *dirent, filldir_t filldir)
50 size = QNX4_NAME_MAX; 50 size = QNX4_NAME_MAX;
51 51
52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) { 52 if ( ( de->di_status & (QNX4_FILE_USED|QNX4_FILE_LINK) ) != 0 ) {
53 QNX4DEBUG(("qnx4_readdir:%.*s\n", size, de->di_fname)); 53 QNX4DEBUG((KERN_INFO "qnx4_readdir:%.*s\n", size, de->di_fname));
54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 ) 54 if ( ( de->di_status & QNX4_FILE_LINK ) == 0 )
55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1; 55 ino = blknum * QNX4_INODES_PER_BLOCK + ix - 1;
56 else { 56 else {
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index d2cd1798d8c4..449f5a66dd34 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -107,7 +107,7 @@ static int qnx4_get_block( struct inode *inode, sector_t iblock, struct buffer_h
107{ 107{
108 unsigned long phys; 108 unsigned long phys;
109 109
110 QNX4DEBUG(("qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock)); 110 QNX4DEBUG((KERN_INFO "qnx4: qnx4_get_block inode=[%ld] iblock=[%ld]\n",inode->i_ino,iblock));
111 111
112 phys = qnx4_block_map( inode, iblock ); 112 phys = qnx4_block_map( inode, iblock );
113 if ( phys ) { 113 if ( phys ) {
@@ -142,12 +142,12 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
142 // read next xtnt block. 142 // read next xtnt block.
143 bh = sb_bread(inode->i_sb, i_xblk - 1); 143 bh = sb_bread(inode->i_sb, i_xblk - 1);
144 if ( !bh ) { 144 if ( !bh ) {
145 QNX4DEBUG(("qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1)); 145 QNX4DEBUG((KERN_ERR "qnx4: I/O error reading xtnt block [%ld])\n", i_xblk - 1));
146 return -EIO; 146 return -EIO;
147 } 147 }
148 xblk = (struct qnx4_xblk*)bh->b_data; 148 xblk = (struct qnx4_xblk*)bh->b_data;
149 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) { 149 if ( memcmp( xblk->xblk_signature, "IamXblk", 7 ) ) {
150 QNX4DEBUG(("qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk)); 150 QNX4DEBUG((KERN_ERR "qnx4: block at %ld is not a valid xtnt\n", qnx4_inode->i_xblk));
151 return -EIO; 151 return -EIO;
152 } 152 }
153 } 153 }
@@ -168,7 +168,7 @@ unsigned long qnx4_block_map( struct inode *inode, long iblock )
168 brelse( bh ); 168 brelse( bh );
169 } 169 }
170 170
171 QNX4DEBUG(("qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block)); 171 QNX4DEBUG((KERN_INFO "qnx4: mapping block %ld of inode %ld = %ld\n",iblock,inode->i_ino,block));
172 return block; 172 return block;
173} 173}
174 174
@@ -209,7 +209,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
209 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') { 209 if (*(qnx4_sb(sb)->sb->RootDir.di_fname) != '/') {
210 return "no qnx4 filesystem (no root dir)."; 210 return "no qnx4 filesystem (no root dir).";
211 } else { 211 } else {
212 QNX4DEBUG(("QNX4 filesystem found on dev %s.\n", sb->s_id)); 212 QNX4DEBUG((KERN_NOTICE "QNX4 filesystem found on dev %s.\n", sb->s_id));
213 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1; 213 rd = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_blk) - 1;
214 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size); 214 rl = le32_to_cpu(qnx4_sb(sb)->sb->RootDir.di_first_xtnt.xtnt_size);
215 for (j = 0; j < rl; j++) { 215 for (j = 0; j < rl; j++) {
@@ -220,7 +220,7 @@ static const char *qnx4_checkroot(struct super_block *sb)
220 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) { 220 for (i = 0; i < QNX4_INODES_PER_BLOCK; i++) {
221 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE); 221 rootdir = (struct qnx4_inode_entry *) (bh->b_data + i * QNX4_DIR_ENTRY_SIZE);
222 if (rootdir->di_fname != NULL) { 222 if (rootdir->di_fname != NULL) {
223 QNX4DEBUG(("Rootdir entry found : [%s]\n", rootdir->di_fname)); 223 QNX4DEBUG((KERN_INFO "rootdir entry found : [%s]\n", rootdir->di_fname));
224 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) { 224 if (!strncmp(rootdir->di_fname, QNX4_BMNAME, sizeof QNX4_BMNAME)) {
225 found = 1; 225 found = 1;
226 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL ); 226 qnx4_sb(sb)->BitMap = kmalloc( sizeof( struct qnx4_inode_entry ), GFP_KERNEL );
@@ -265,12 +265,12 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
265 if we don't belong here... */ 265 if we don't belong here... */
266 bh = sb_bread(s, 1); 266 bh = sb_bread(s, 1);
267 if (!bh) { 267 if (!bh) {
268 printk("qnx4: unable to read the superblock\n"); 268 printk(KERN_ERR "qnx4: unable to read the superblock\n");
269 goto outnobh; 269 goto outnobh;
270 } 270 }
271 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) { 271 if ( le32_to_cpup((__le32*) bh->b_data) != QNX4_SUPER_MAGIC ) {
272 if (!silent) 272 if (!silent)
273 printk("qnx4: wrong fsid in superblock.\n"); 273 printk(KERN_ERR "qnx4: wrong fsid in superblock.\n");
274 goto out; 274 goto out;
275 } 275 }
276 s->s_op = &qnx4_sops; 276 s->s_op = &qnx4_sops;
@@ -284,14 +284,14 @@ static int qnx4_fill_super(struct super_block *s, void *data, int silent)
284 errmsg = qnx4_checkroot(s); 284 errmsg = qnx4_checkroot(s);
285 if (errmsg != NULL) { 285 if (errmsg != NULL) {
286 if (!silent) 286 if (!silent)
287 printk("qnx4: %s\n", errmsg); 287 printk(KERN_ERR "qnx4: %s\n", errmsg);
288 goto out; 288 goto out;
289 } 289 }
290 290
291 /* does root not have inode number QNX4_ROOT_INO ?? */ 291 /* does root not have inode number QNX4_ROOT_INO ?? */
292 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK); 292 root = qnx4_iget(s, QNX4_ROOT_INO * QNX4_INODES_PER_BLOCK);
293 if (IS_ERR(root)) { 293 if (IS_ERR(root)) {
294 printk("qnx4: get inode failed\n"); 294 printk(KERN_ERR "qnx4: get inode failed\n");
295 ret = PTR_ERR(root); 295 ret = PTR_ERR(root);
296 goto out; 296 goto out;
297 } 297 }
@@ -374,7 +374,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
374 qnx4_inode = qnx4_raw_inode(inode); 374 qnx4_inode = qnx4_raw_inode(inode);
375 inode->i_mode = 0; 375 inode->i_mode = 0;
376 376
377 QNX4DEBUG(("Reading inode : [%d]\n", ino)); 377 QNX4DEBUG((KERN_INFO "reading inode : [%d]\n", ino));
378 if (!ino) { 378 if (!ino) {
379 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is " 379 printk(KERN_ERR "qnx4: bad inode number on dev %s: %lu is "
380 "out of range\n", 380 "out of range\n",
@@ -385,7 +385,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
385 block = ino / QNX4_INODES_PER_BLOCK; 385 block = ino / QNX4_INODES_PER_BLOCK;
386 386
387 if (!(bh = sb_bread(sb, block))) { 387 if (!(bh = sb_bread(sb, block))) {
388 printk("qnx4: major problem: unable to read inode from dev " 388 printk(KERN_ERR "qnx4: major problem: unable to read inode from dev "
389 "%s\n", sb->s_id); 389 "%s\n", sb->s_id);
390 iget_failed(inode); 390 iget_failed(inode);
391 return ERR_PTR(-EIO); 391 return ERR_PTR(-EIO);
@@ -499,7 +499,7 @@ static int __init init_qnx4_fs(void)
499 return err; 499 return err;
500 } 500 }
501 501
502 printk("QNX4 filesystem 0.2.3 registered.\n"); 502 printk(KERN_INFO "QNX4 filesystem 0.2.3 registered.\n");
503 return 0; 503 return 0;
504} 504}
505 505
diff --git a/fs/qnx4/namei.c b/fs/qnx4/namei.c
index ae1e7edbacd6..58703ebba879 100644
--- a/fs/qnx4/namei.c
+++ b/fs/qnx4/namei.c
@@ -30,7 +30,7 @@ static int qnx4_match(int len, const char *name,
30 int namelen, thislen; 30 int namelen, thislen;
31 31
32 if (bh == NULL) { 32 if (bh == NULL) {
33 printk("qnx4: matching unassigned buffer !\n"); 33 printk(KERN_WARNING "qnx4: matching unassigned buffer !\n");
34 return 0; 34 return 0;
35 } 35 }
36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset); 36 de = (struct qnx4_inode_entry *) (bh->b_data + *offset);
@@ -66,7 +66,7 @@ static struct buffer_head *qnx4_find_entry(int len, struct inode *dir,
66 66
67 *res_dir = NULL; 67 *res_dir = NULL;
68 if (!dir->i_sb) { 68 if (!dir->i_sb) {
69 printk("qnx4: no superblock on dir.\n"); 69 printk(KERN_WARNING "qnx4: no superblock on dir.\n");
70 return NULL; 70 return NULL;
71 } 71 }
72 bh = NULL; 72 bh = NULL;
@@ -124,7 +124,7 @@ struct dentry * qnx4_lookup(struct inode *dir, struct dentry *dentry, struct nam
124 foundinode = qnx4_iget(dir->i_sb, ino); 124 foundinode = qnx4_iget(dir->i_sb, ino);
125 if (IS_ERR(foundinode)) { 125 if (IS_ERR(foundinode)) {
126 unlock_kernel(); 126 unlock_kernel();
127 QNX4DEBUG(("qnx4: lookup->iget -> error %ld\n", 127 QNX4DEBUG((KERN_ERR "qnx4: lookup->iget -> error %ld\n",
128 PTR_ERR(foundinode))); 128 PTR_ERR(foundinode)));
129 return ERR_CAST(foundinode); 129 return ERR_CAST(foundinode);
130 } 130 }
diff --git a/fs/reiserfs/Makefile b/fs/reiserfs/Makefile
index 7c5ab6330dd6..6a9e30c041dd 100644
--- a/fs/reiserfs/Makefile
+++ b/fs/reiserfs/Makefile
@@ -7,7 +7,7 @@ obj-$(CONFIG_REISERFS_FS) += reiserfs.o
7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \ 7reiserfs-objs := bitmap.o do_balan.o namei.o inode.o file.o dir.o fix_node.o \
8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \ 8 super.o prints.o objectid.o lbalance.o ibalance.o stree.o \
9 hashes.o tail_conversion.o journal.o resize.o \ 9 hashes.o tail_conversion.o journal.o resize.o \
10 item_ops.o ioctl.o procfs.o xattr.o 10 item_ops.o ioctl.o procfs.o xattr.o lock.o
11 11
12ifeq ($(CONFIG_REISERFS_FS_XATTR),y) 12ifeq ($(CONFIG_REISERFS_FS_XATTR),y)
13reiserfs-objs += xattr_user.o xattr_trusted.o 13reiserfs-objs += xattr_user.o xattr_trusted.o
diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c
index e716161ab325..685495707181 100644
--- a/fs/reiserfs/bitmap.c
+++ b/fs/reiserfs/bitmap.c
@@ -1249,14 +1249,18 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb,
1249 else if (bitmap == 0) 1249 else if (bitmap == 0)
1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1; 1250 block = (REISERFS_DISK_OFFSET_IN_BYTES >> sb->s_blocksize_bits) + 1;
1251 1251
1252 reiserfs_write_unlock(sb);
1252 bh = sb_bread(sb, block); 1253 bh = sb_bread(sb, block);
1254 reiserfs_write_lock(sb);
1253 if (bh == NULL) 1255 if (bh == NULL)
1254 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " 1256 reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) "
1255 "reading failed", __func__, block); 1257 "reading failed", __func__, block);
1256 else { 1258 else {
1257 if (buffer_locked(bh)) { 1259 if (buffer_locked(bh)) {
1258 PROC_INFO_INC(sb, scan_bitmap.wait); 1260 PROC_INFO_INC(sb, scan_bitmap.wait);
1261 reiserfs_write_unlock(sb);
1259 __wait_on_buffer(bh); 1262 __wait_on_buffer(bh);
1263 reiserfs_write_lock(sb);
1260 } 1264 }
1261 BUG_ON(!buffer_uptodate(bh)); 1265 BUG_ON(!buffer_uptodate(bh));
1262 BUG_ON(atomic_read(&bh->b_count) == 0); 1266 BUG_ON(atomic_read(&bh->b_count) == 0);
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 6d2668fdc384..c094f58c7448 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -20,7 +20,7 @@ const struct file_operations reiserfs_dir_operations = {
20 .read = generic_read_dir, 20 .read = generic_read_dir,
21 .readdir = reiserfs_readdir, 21 .readdir = reiserfs_readdir,
22 .fsync = reiserfs_dir_fsync, 22 .fsync = reiserfs_dir_fsync,
23 .ioctl = reiserfs_ioctl, 23 .unlocked_ioctl = reiserfs_ioctl,
24#ifdef CONFIG_COMPAT 24#ifdef CONFIG_COMPAT
25 .compat_ioctl = reiserfs_compat_ioctl, 25 .compat_ioctl = reiserfs_compat_ioctl,
26#endif 26#endif
@@ -174,14 +174,22 @@ int reiserfs_readdir_dentry(struct dentry *dentry, void *dirent,
174 // user space buffer is swapped out. At that time 174 // user space buffer is swapped out. At that time
175 // entry can move to somewhere else 175 // entry can move to somewhere else
176 memcpy(local_buf, d_name, d_reclen); 176 memcpy(local_buf, d_name, d_reclen);
177
178 /*
179 * Since filldir might sleep, we can release
180 * the write lock here for other waiters
181 */
182 reiserfs_write_unlock(inode->i_sb);
177 if (filldir 183 if (filldir
178 (dirent, local_buf, d_reclen, d_off, d_ino, 184 (dirent, local_buf, d_reclen, d_off, d_ino,
179 DT_UNKNOWN) < 0) { 185 DT_UNKNOWN) < 0) {
186 reiserfs_write_lock(inode->i_sb);
180 if (local_buf != small_buf) { 187 if (local_buf != small_buf) {
181 kfree(local_buf); 188 kfree(local_buf);
182 } 189 }
183 goto end; 190 goto end;
184 } 191 }
192 reiserfs_write_lock(inode->i_sb);
185 if (local_buf != small_buf) { 193 if (local_buf != small_buf) {
186 kfree(local_buf); 194 kfree(local_buf);
187 } 195 }
diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c
index 128d3f7c8aa5..60c080440661 100644
--- a/fs/reiserfs/do_balan.c
+++ b/fs/reiserfs/do_balan.c
@@ -21,14 +21,6 @@
21#include <linux/buffer_head.h> 21#include <linux/buffer_head.h>
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23 23
24#ifdef CONFIG_REISERFS_CHECK
25
26struct tree_balance *cur_tb = NULL; /* detects whether more than one
27 copy of tb exists as a means
28 of checking whether schedule
29 is interrupting do_balance */
30#endif
31
32static inline void buffer_info_init_left(struct tree_balance *tb, 24static inline void buffer_info_init_left(struct tree_balance *tb,
33 struct buffer_info *bi) 25 struct buffer_info *bi)
34{ 26{
@@ -1840,11 +1832,12 @@ static int check_before_balancing(struct tree_balance *tb)
1840{ 1832{
1841 int retval = 0; 1833 int retval = 0;
1842 1834
1843 if (cur_tb) { 1835 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
1844 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule " 1836 reiserfs_panic(tb->tb_sb, "vs-12335", "suspect that schedule "
1845 "occurred based on cur_tb not being null at " 1837 "occurred based on cur_tb not being null at "
1846 "this point in code. do_balance cannot properly " 1838 "this point in code. do_balance cannot properly "
1847 "handle schedule occurring while it runs."); 1839 "handle concurrent tree accesses on a same "
1840 "mount point.");
1848 } 1841 }
1849 1842
1850 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have 1843 /* double check that buffers that we will modify are unlocked. (fix_nodes should already have
@@ -1986,7 +1979,7 @@ static inline void do_balance_starts(struct tree_balance *tb)
1986 "check");*/ 1979 "check");*/
1987 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB"); 1980 RFALSE(check_before_balancing(tb), "PAP-12340: locked buffers in TB");
1988#ifdef CONFIG_REISERFS_CHECK 1981#ifdef CONFIG_REISERFS_CHECK
1989 cur_tb = tb; 1982 REISERFS_SB(tb->tb_sb)->cur_tb = tb;
1990#endif 1983#endif
1991} 1984}
1992 1985
@@ -1996,7 +1989,7 @@ static inline void do_balance_completed(struct tree_balance *tb)
1996#ifdef CONFIG_REISERFS_CHECK 1989#ifdef CONFIG_REISERFS_CHECK
1997 check_leaf_level(tb); 1990 check_leaf_level(tb);
1998 check_internal_levels(tb); 1991 check_internal_levels(tb);
1999 cur_tb = NULL; 1992 REISERFS_SB(tb->tb_sb)->cur_tb = NULL;
2000#endif 1993#endif
2001 1994
2002 /* reiserfs_free_block is no longer schedule safe. So, we need to 1995 /* reiserfs_free_block is no longer schedule safe. So, we need to
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9f436668b7f8..da2dba082e2d 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -284,7 +284,7 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
284const struct file_operations reiserfs_file_operations = { 284const struct file_operations reiserfs_file_operations = {
285 .read = do_sync_read, 285 .read = do_sync_read,
286 .write = reiserfs_file_write, 286 .write = reiserfs_file_write,
287 .ioctl = reiserfs_ioctl, 287 .unlocked_ioctl = reiserfs_ioctl,
288#ifdef CONFIG_COMPAT 288#ifdef CONFIG_COMPAT
289 .compat_ioctl = reiserfs_compat_ioctl, 289 .compat_ioctl = reiserfs_compat_ioctl,
290#endif 290#endif
diff --git a/fs/reiserfs/fix_node.c b/fs/reiserfs/fix_node.c
index 5e5a4e6fbaf8..6591cb21edf6 100644
--- a/fs/reiserfs/fix_node.c
+++ b/fs/reiserfs/fix_node.c
@@ -563,9 +563,6 @@ static int get_num_ver(int mode, struct tree_balance *tb, int h,
563 return needed_nodes; 563 return needed_nodes;
564} 564}
565 565
566#ifdef CONFIG_REISERFS_CHECK
567extern struct tree_balance *cur_tb;
568#endif
569 566
570/* Set parameters for balancing. 567/* Set parameters for balancing.
571 * Performs write of results of analysis of balancing into structure tb, 568 * Performs write of results of analysis of balancing into structure tb,
@@ -834,7 +831,7 @@ static int get_empty_nodes(struct tree_balance *tb, int h)
834 RFALSE(buffer_dirty(new_bh) || 831 RFALSE(buffer_dirty(new_bh) ||
835 buffer_journaled(new_bh) || 832 buffer_journaled(new_bh) ||
836 buffer_journal_dirty(new_bh), 833 buffer_journal_dirty(new_bh),
837 "PAP-8140: journlaled or dirty buffer %b for the new block", 834 "PAP-8140: journaled or dirty buffer %b for the new block",
838 new_bh); 835 new_bh);
839 836
840 /* Put empty buffers into the array. */ 837 /* Put empty buffers into the array. */
@@ -1022,7 +1019,11 @@ static int get_far_parent(struct tree_balance *tb,
1022 /* Check whether the common parent is locked. */ 1019 /* Check whether the common parent is locked. */
1023 1020
1024 if (buffer_locked(*pcom_father)) { 1021 if (buffer_locked(*pcom_father)) {
1022
1023 /* Release the write lock while the buffer is busy */
1024 reiserfs_write_unlock(tb->tb_sb);
1025 __wait_on_buffer(*pcom_father); 1025 __wait_on_buffer(*pcom_father);
1026 reiserfs_write_lock(tb->tb_sb);
1026 if (FILESYSTEM_CHANGED_TB(tb)) { 1027 if (FILESYSTEM_CHANGED_TB(tb)) {
1027 brelse(*pcom_father); 1028 brelse(*pcom_father);
1028 return REPEAT_SEARCH; 1029 return REPEAT_SEARCH;
@@ -1927,7 +1928,9 @@ static int get_direct_parent(struct tree_balance *tb, int h)
1927 return REPEAT_SEARCH; 1928 return REPEAT_SEARCH;
1928 1929
1929 if (buffer_locked(bh)) { 1930 if (buffer_locked(bh)) {
1931 reiserfs_write_unlock(tb->tb_sb);
1930 __wait_on_buffer(bh); 1932 __wait_on_buffer(bh);
1933 reiserfs_write_lock(tb->tb_sb);
1931 if (FILESYSTEM_CHANGED_TB(tb)) 1934 if (FILESYSTEM_CHANGED_TB(tb))
1932 return REPEAT_SEARCH; 1935 return REPEAT_SEARCH;
1933 } 1936 }
@@ -1965,7 +1968,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
1965 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb-> 1968 tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->
1966 FL[h]); 1969 FL[h]);
1967 son_number = B_N_CHILD_NUM(tb->FL[h], child_position); 1970 son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
1971 reiserfs_write_unlock(sb);
1968 bh = sb_bread(sb, son_number); 1972 bh = sb_bread(sb, son_number);
1973 reiserfs_write_lock(sb);
1969 if (!bh) 1974 if (!bh)
1970 return IO_ERROR; 1975 return IO_ERROR;
1971 if (FILESYSTEM_CHANGED_TB(tb)) { 1976 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2003,7 +2008,9 @@ static int get_neighbors(struct tree_balance *tb, int h)
2003 child_position = 2008 child_position =
2004 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0; 2009 (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
2005 son_number = B_N_CHILD_NUM(tb->FR[h], child_position); 2010 son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
2011 reiserfs_write_unlock(sb);
2006 bh = sb_bread(sb, son_number); 2012 bh = sb_bread(sb, son_number);
2013 reiserfs_write_lock(sb);
2007 if (!bh) 2014 if (!bh)
2008 return IO_ERROR; 2015 return IO_ERROR;
2009 if (FILESYSTEM_CHANGED_TB(tb)) { 2016 if (FILESYSTEM_CHANGED_TB(tb)) {
@@ -2278,7 +2285,9 @@ static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
2278 REPEAT_SEARCH : CARRY_ON; 2285 REPEAT_SEARCH : CARRY_ON;
2279 } 2286 }
2280#endif 2287#endif
2288 reiserfs_write_unlock(tb->tb_sb);
2281 __wait_on_buffer(locked); 2289 __wait_on_buffer(locked);
2290 reiserfs_write_lock(tb->tb_sb);
2282 if (FILESYSTEM_CHANGED_TB(tb)) 2291 if (FILESYSTEM_CHANGED_TB(tb))
2283 return REPEAT_SEARCH; 2292 return REPEAT_SEARCH;
2284 } 2293 }
@@ -2349,12 +2358,14 @@ int fix_nodes(int op_mode, struct tree_balance *tb,
2349 2358
2350 /* if it possible in indirect_to_direct conversion */ 2359 /* if it possible in indirect_to_direct conversion */
2351 if (buffer_locked(tbS0)) { 2360 if (buffer_locked(tbS0)) {
2361 reiserfs_write_unlock(tb->tb_sb);
2352 __wait_on_buffer(tbS0); 2362 __wait_on_buffer(tbS0);
2363 reiserfs_write_lock(tb->tb_sb);
2353 if (FILESYSTEM_CHANGED_TB(tb)) 2364 if (FILESYSTEM_CHANGED_TB(tb))
2354 return REPEAT_SEARCH; 2365 return REPEAT_SEARCH;
2355 } 2366 }
2356#ifdef CONFIG_REISERFS_CHECK 2367#ifdef CONFIG_REISERFS_CHECK
2357 if (cur_tb) { 2368 if (REISERFS_SB(tb->tb_sb)->cur_tb) {
2358 print_cur_tb("fix_nodes"); 2369 print_cur_tb("fix_nodes");
2359 reiserfs_panic(tb->tb_sb, "PAP-8305", 2370 reiserfs_panic(tb->tb_sb, "PAP-8305",
2360 "there is pending do_balance"); 2371 "there is pending do_balance");
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a14d6cd9eeda..3a28e7751b3c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -251,7 +251,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
251 struct cpu_key key; 251 struct cpu_key key;
252 struct buffer_head *bh; 252 struct buffer_head *bh;
253 struct item_head *ih, tmp_ih; 253 struct item_head *ih, tmp_ih;
254 int fs_gen;
255 b_blocknr_t blocknr; 254 b_blocknr_t blocknr;
256 char *p = NULL; 255 char *p = NULL;
257 int chars; 256 int chars;
@@ -265,7 +264,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
265 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY, 264 (loff_t) block * inode->i_sb->s_blocksize + 1, TYPE_ANY,
266 3); 265 3);
267 266
268 research:
269 result = search_for_position_by_key(inode->i_sb, &key, &path); 267 result = search_for_position_by_key(inode->i_sb, &key, &path);
270 if (result != POSITION_FOUND) { 268 if (result != POSITION_FOUND) {
271 pathrelse(&path); 269 pathrelse(&path);
@@ -340,7 +338,6 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
340 } 338 }
341 // read file tail into part of page 339 // read file tail into part of page
342 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1); 340 offset = (cpu_key_k_offset(&key) - 1) & (PAGE_CACHE_SIZE - 1);
343 fs_gen = get_generation(inode->i_sb);
344 copy_item_head(&tmp_ih, ih); 341 copy_item_head(&tmp_ih, ih);
345 342
346 /* we only want to kmap if we are reading the tail into the page. 343 /* we only want to kmap if we are reading the tail into the page.
@@ -348,13 +345,9 @@ static int _get_block_create_0(struct inode *inode, sector_t block,
348 ** sure we need to. But, this means the item might move if 345 ** sure we need to. But, this means the item might move if
349 ** kmap schedules 346 ** kmap schedules
350 */ 347 */
351 if (!p) { 348 if (!p)
352 p = (char *)kmap(bh_result->b_page); 349 p = (char *)kmap(bh_result->b_page);
353 if (fs_changed(fs_gen, inode->i_sb) 350
354 && item_moved(&tmp_ih, &path)) {
355 goto research;
356 }
357 }
358 p += offset; 351 p += offset;
359 memset(p, 0, inode->i_sb->s_blocksize); 352 memset(p, 0, inode->i_sb->s_blocksize);
360 do { 353 do {
@@ -489,10 +482,14 @@ static int reiserfs_get_blocks_direct_io(struct inode *inode,
489 disappeared */ 482 disappeared */
490 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) { 483 if (REISERFS_I(inode)->i_flags & i_pack_on_close_mask) {
491 int err; 484 int err;
492 lock_kernel(); 485
486 reiserfs_write_lock(inode->i_sb);
487
493 err = reiserfs_commit_for_inode(inode); 488 err = reiserfs_commit_for_inode(inode);
494 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 489 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
495 unlock_kernel(); 490
491 reiserfs_write_unlock(inode->i_sb);
492
496 if (err < 0) 493 if (err < 0)
497 ret = err; 494 ret = err;
498 } 495 }
@@ -601,6 +598,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
601 __le32 *item; 598 __le32 *item;
602 int done; 599 int done;
603 int fs_gen; 600 int fs_gen;
601 int lock_depth;
604 struct reiserfs_transaction_handle *th = NULL; 602 struct reiserfs_transaction_handle *th = NULL;
605 /* space reserved in transaction batch: 603 /* space reserved in transaction batch:
606 . 3 balancings in direct->indirect conversion 604 . 3 balancings in direct->indirect conversion
@@ -616,12 +614,11 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
616 loff_t new_offset = 614 loff_t new_offset =
617 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1; 615 (((loff_t) block) << inode->i_sb->s_blocksize_bits) + 1;
618 616
619 /* bad.... */ 617 lock_depth = reiserfs_write_lock_once(inode->i_sb);
620 reiserfs_write_lock(inode->i_sb);
621 version = get_inode_item_key_version(inode); 618 version = get_inode_item_key_version(inode);
622 619
623 if (!file_capable(inode, block)) { 620 if (!file_capable(inode, block)) {
624 reiserfs_write_unlock(inode->i_sb); 621 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
625 return -EFBIG; 622 return -EFBIG;
626 } 623 }
627 624
@@ -633,7 +630,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
633 /* find number of block-th logical block of the file */ 630 /* find number of block-th logical block of the file */
634 ret = _get_block_create_0(inode, block, bh_result, 631 ret = _get_block_create_0(inode, block, bh_result,
635 create | GET_BLOCK_READ_DIRECT); 632 create | GET_BLOCK_READ_DIRECT);
636 reiserfs_write_unlock(inode->i_sb); 633 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
637 return ret; 634 return ret;
638 } 635 }
639 /* 636 /*
@@ -751,7 +748,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
751 if (!dangle && th) 748 if (!dangle && th)
752 retval = reiserfs_end_persistent_transaction(th); 749 retval = reiserfs_end_persistent_transaction(th);
753 750
754 reiserfs_write_unlock(inode->i_sb); 751 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
755 752
756 /* the item was found, so new blocks were not added to the file 753 /* the item was found, so new blocks were not added to the file
757 ** there is no need to make sure the inode is updated with this 754 ** there is no need to make sure the inode is updated with this
@@ -935,7 +932,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
935 if (blocks_needed == 1) { 932 if (blocks_needed == 1) {
936 un = &unf_single; 933 un = &unf_single;
937 } else { 934 } else {
938 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_ATOMIC); // We need to avoid scheduling. 935 un = kzalloc(min(blocks_needed, max_to_insert) * UNFM_P_SIZE, GFP_NOFS);
939 if (!un) { 936 if (!un) {
940 un = &unf_single; 937 un = &unf_single;
941 blocks_needed = 1; 938 blocks_needed = 1;
@@ -997,10 +994,16 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
997 if (retval) 994 if (retval)
998 goto failure; 995 goto failure;
999 } 996 }
1000 /* inserting indirect pointers for a hole can take a 997 /*
1001 ** long time. reschedule if needed 998 * inserting indirect pointers for a hole can take a
999 * long time. reschedule if needed and also release the write
1000 * lock for others.
1002 */ 1001 */
1003 cond_resched(); 1002 if (need_resched()) {
1003 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1004 schedule();
1005 lock_depth = reiserfs_write_lock_once(inode->i_sb);
1006 }
1004 1007
1005 retval = search_for_position_by_key(inode->i_sb, &key, &path); 1008 retval = search_for_position_by_key(inode->i_sb, &key, &path);
1006 if (retval == IO_ERROR) { 1009 if (retval == IO_ERROR) {
@@ -1035,7 +1038,7 @@ int reiserfs_get_block(struct inode *inode, sector_t block,
1035 retval = err; 1038 retval = err;
1036 } 1039 }
1037 1040
1038 reiserfs_write_unlock(inode->i_sb); 1041 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
1039 reiserfs_check_path(&path); 1042 reiserfs_check_path(&path);
1040 return retval; 1043 return retval;
1041} 1044}
@@ -2072,8 +2075,9 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2072 int error; 2075 int error;
2073 struct buffer_head *bh = NULL; 2076 struct buffer_head *bh = NULL;
2074 int err2; 2077 int err2;
2078 int lock_depth;
2075 2079
2076 reiserfs_write_lock(inode->i_sb); 2080 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2077 2081
2078 if (inode->i_size > 0) { 2082 if (inode->i_size > 0) {
2079 error = grab_tail_page(inode, &page, &bh); 2083 error = grab_tail_page(inode, &page, &bh);
@@ -2142,14 +2146,17 @@ int reiserfs_truncate_file(struct inode *inode, int update_timestamps)
2142 page_cache_release(page); 2146 page_cache_release(page);
2143 } 2147 }
2144 2148
2145 reiserfs_write_unlock(inode->i_sb); 2149 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2150
2146 return 0; 2151 return 0;
2147 out: 2152 out:
2148 if (page) { 2153 if (page) {
2149 unlock_page(page); 2154 unlock_page(page);
2150 page_cache_release(page); 2155 page_cache_release(page);
2151 } 2156 }
2152 reiserfs_write_unlock(inode->i_sb); 2157
2158 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2159
2153 return error; 2160 return error;
2154} 2161}
2155 2162
@@ -2608,7 +2615,10 @@ int reiserfs_prepare_write(struct file *f, struct page *page,
2608 int ret; 2615 int ret;
2609 int old_ref = 0; 2616 int old_ref = 0;
2610 2617
2618 reiserfs_write_unlock(inode->i_sb);
2611 reiserfs_wait_on_write_block(inode->i_sb); 2619 reiserfs_wait_on_write_block(inode->i_sb);
2620 reiserfs_write_lock(inode->i_sb);
2621
2612 fix_tail_page_for_writing(page); 2622 fix_tail_page_for_writing(page);
2613 if (reiserfs_transaction_running(inode->i_sb)) { 2623 if (reiserfs_transaction_running(inode->i_sb)) {
2614 struct reiserfs_transaction_handle *th; 2624 struct reiserfs_transaction_handle *th;
@@ -2664,6 +2674,8 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2664 int update_sd = 0; 2674 int update_sd = 0;
2665 struct reiserfs_transaction_handle *th; 2675 struct reiserfs_transaction_handle *th;
2666 unsigned start; 2676 unsigned start;
2677 int lock_depth = 0;
2678 bool locked = false;
2667 2679
2668 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND) 2680 if ((unsigned long)fsdata & AOP_FLAG_CONT_EXPAND)
2669 pos ++; 2681 pos ++;
@@ -2690,9 +2702,11 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2690 ** to do the i_size updates here. 2702 ** to do the i_size updates here.
2691 */ 2703 */
2692 pos += copied; 2704 pos += copied;
2705
2693 if (pos > inode->i_size) { 2706 if (pos > inode->i_size) {
2694 struct reiserfs_transaction_handle myth; 2707 struct reiserfs_transaction_handle myth;
2695 reiserfs_write_lock(inode->i_sb); 2708 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2709 locked = true;
2696 /* If the file have grown beyond the border where it 2710 /* If the file have grown beyond the border where it
2697 can have a tail, unmark it as needing a tail 2711 can have a tail, unmark it as needing a tail
2698 packing */ 2712 packing */
@@ -2703,10 +2717,9 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2703 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2717 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2704 2718
2705 ret = journal_begin(&myth, inode->i_sb, 1); 2719 ret = journal_begin(&myth, inode->i_sb, 1);
2706 if (ret) { 2720 if (ret)
2707 reiserfs_write_unlock(inode->i_sb);
2708 goto journal_error; 2721 goto journal_error;
2709 } 2722
2710 reiserfs_update_inode_transaction(inode); 2723 reiserfs_update_inode_transaction(inode);
2711 inode->i_size = pos; 2724 inode->i_size = pos;
2712 /* 2725 /*
@@ -2718,34 +2731,36 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping,
2718 reiserfs_update_sd(&myth, inode); 2731 reiserfs_update_sd(&myth, inode);
2719 update_sd = 1; 2732 update_sd = 1;
2720 ret = journal_end(&myth, inode->i_sb, 1); 2733 ret = journal_end(&myth, inode->i_sb, 1);
2721 reiserfs_write_unlock(inode->i_sb);
2722 if (ret) 2734 if (ret)
2723 goto journal_error; 2735 goto journal_error;
2724 } 2736 }
2725 if (th) { 2737 if (th) {
2726 reiserfs_write_lock(inode->i_sb); 2738 if (!locked) {
2739 lock_depth = reiserfs_write_lock_once(inode->i_sb);
2740 locked = true;
2741 }
2727 if (!update_sd) 2742 if (!update_sd)
2728 mark_inode_dirty(inode); 2743 mark_inode_dirty(inode);
2729 ret = reiserfs_end_persistent_transaction(th); 2744 ret = reiserfs_end_persistent_transaction(th);
2730 reiserfs_write_unlock(inode->i_sb);
2731 if (ret) 2745 if (ret)
2732 goto out; 2746 goto out;
2733 } 2747 }
2734 2748
2735 out: 2749 out:
2750 if (locked)
2751 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2736 unlock_page(page); 2752 unlock_page(page);
2737 page_cache_release(page); 2753 page_cache_release(page);
2738 return ret == 0 ? copied : ret; 2754 return ret == 0 ? copied : ret;
2739 2755
2740 journal_error: 2756 journal_error:
2757 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
2758 locked = false;
2741 if (th) { 2759 if (th) {
2742 reiserfs_write_lock(inode->i_sb);
2743 if (!update_sd) 2760 if (!update_sd)
2744 reiserfs_update_sd(th, inode); 2761 reiserfs_update_sd(th, inode);
2745 ret = reiserfs_end_persistent_transaction(th); 2762 ret = reiserfs_end_persistent_transaction(th);
2746 reiserfs_write_unlock(inode->i_sb);
2747 } 2763 }
2748
2749 goto out; 2764 goto out;
2750} 2765}
2751 2766
@@ -2758,7 +2773,10 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2758 int update_sd = 0; 2773 int update_sd = 0;
2759 struct reiserfs_transaction_handle *th = NULL; 2774 struct reiserfs_transaction_handle *th = NULL;
2760 2775
2776 reiserfs_write_unlock(inode->i_sb);
2761 reiserfs_wait_on_write_block(inode->i_sb); 2777 reiserfs_wait_on_write_block(inode->i_sb);
2778 reiserfs_write_lock(inode->i_sb);
2779
2762 if (reiserfs_transaction_running(inode->i_sb)) { 2780 if (reiserfs_transaction_running(inode->i_sb)) {
2763 th = current->journal_info; 2781 th = current->journal_info;
2764 } 2782 }
@@ -2770,7 +2788,6 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2770 */ 2788 */
2771 if (pos > inode->i_size) { 2789 if (pos > inode->i_size) {
2772 struct reiserfs_transaction_handle myth; 2790 struct reiserfs_transaction_handle myth;
2773 reiserfs_write_lock(inode->i_sb);
2774 /* If the file have grown beyond the border where it 2791 /* If the file have grown beyond the border where it
2775 can have a tail, unmark it as needing a tail 2792 can have a tail, unmark it as needing a tail
2776 packing */ 2793 packing */
@@ -2781,10 +2798,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2781 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask; 2798 REISERFS_I(inode)->i_flags &= ~i_pack_on_close_mask;
2782 2799
2783 ret = journal_begin(&myth, inode->i_sb, 1); 2800 ret = journal_begin(&myth, inode->i_sb, 1);
2784 if (ret) { 2801 if (ret)
2785 reiserfs_write_unlock(inode->i_sb);
2786 goto journal_error; 2802 goto journal_error;
2787 } 2803
2788 reiserfs_update_inode_transaction(inode); 2804 reiserfs_update_inode_transaction(inode);
2789 inode->i_size = pos; 2805 inode->i_size = pos;
2790 /* 2806 /*
@@ -2796,16 +2812,13 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2796 reiserfs_update_sd(&myth, inode); 2812 reiserfs_update_sd(&myth, inode);
2797 update_sd = 1; 2813 update_sd = 1;
2798 ret = journal_end(&myth, inode->i_sb, 1); 2814 ret = journal_end(&myth, inode->i_sb, 1);
2799 reiserfs_write_unlock(inode->i_sb);
2800 if (ret) 2815 if (ret)
2801 goto journal_error; 2816 goto journal_error;
2802 } 2817 }
2803 if (th) { 2818 if (th) {
2804 reiserfs_write_lock(inode->i_sb);
2805 if (!update_sd) 2819 if (!update_sd)
2806 mark_inode_dirty(inode); 2820 mark_inode_dirty(inode);
2807 ret = reiserfs_end_persistent_transaction(th); 2821 ret = reiserfs_end_persistent_transaction(th);
2808 reiserfs_write_unlock(inode->i_sb);
2809 if (ret) 2822 if (ret)
2810 goto out; 2823 goto out;
2811 } 2824 }
@@ -2815,11 +2828,9 @@ int reiserfs_commit_write(struct file *f, struct page *page,
2815 2828
2816 journal_error: 2829 journal_error:
2817 if (th) { 2830 if (th) {
2818 reiserfs_write_lock(inode->i_sb);
2819 if (!update_sd) 2831 if (!update_sd)
2820 reiserfs_update_sd(th, inode); 2832 reiserfs_update_sd(th, inode);
2821 ret = reiserfs_end_persistent_transaction(th); 2833 ret = reiserfs_end_persistent_transaction(th);
2822 reiserfs_write_unlock(inode->i_sb);
2823 } 2834 }
2824 2835
2825 return ret; 2836 return ret;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 0ccc3fdda7bf..ace77451ceb1 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -13,44 +13,52 @@
13#include <linux/compat.h> 13#include <linux/compat.h>
14 14
15/* 15/*
16** reiserfs_ioctl - handler for ioctl for inode 16 * reiserfs_ioctl - handler for ioctl for inode
17** supported commands: 17 * supported commands:
18** 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect 18 * 1) REISERFS_IOC_UNPACK - try to unpack tail from direct item into indirect
19** and prevent packing file (argument arg has to be non-zero) 19 * and prevent packing file (argument arg has to be non-zero)
20** 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION 20 * 2) REISERFS_IOC_[GS]ETFLAGS, REISERFS_IOC_[GS]ETVERSION
21** 3) That's all for a while ... 21 * 3) That's all for a while ...
22*/ 22 */
23int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 23long reiserfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
24 unsigned long arg)
25{ 24{
25 struct inode *inode = filp->f_path.dentry->d_inode;
26 unsigned int flags; 26 unsigned int flags;
27 int err = 0; 27 int err = 0;
28 28
29 reiserfs_write_lock(inode->i_sb);
30
29 switch (cmd) { 31 switch (cmd) {
30 case REISERFS_IOC_UNPACK: 32 case REISERFS_IOC_UNPACK:
31 if (S_ISREG(inode->i_mode)) { 33 if (S_ISREG(inode->i_mode)) {
32 if (arg) 34 if (arg)
33 return reiserfs_unpack(inode, filp); 35 err = reiserfs_unpack(inode, filp);
34 else
35 return 0;
36 } else 36 } else
37 return -ENOTTY; 37 err = -ENOTTY;
38 /* following two cases are taken from fs/ext2/ioctl.c by Remy 38 break;
39 Card (card@masi.ibp.fr) */ 39 /*
40 * following two cases are taken from fs/ext2/ioctl.c by Remy
41 * Card (card@masi.ibp.fr)
42 */
40 case REISERFS_IOC_GETFLAGS: 43 case REISERFS_IOC_GETFLAGS:
41 if (!reiserfs_attrs(inode->i_sb)) 44 if (!reiserfs_attrs(inode->i_sb)) {
42 return -ENOTTY; 45 err = -ENOTTY;
46 break;
47 }
43 48
44 flags = REISERFS_I(inode)->i_attrs; 49 flags = REISERFS_I(inode)->i_attrs;
45 i_attrs_to_sd_attrs(inode, (__u16 *) & flags); 50 i_attrs_to_sd_attrs(inode, (__u16 *) & flags);
46 return put_user(flags, (int __user *)arg); 51 err = put_user(flags, (int __user *)arg);
52 break;
47 case REISERFS_IOC_SETFLAGS:{ 53 case REISERFS_IOC_SETFLAGS:{
48 if (!reiserfs_attrs(inode->i_sb)) 54 if (!reiserfs_attrs(inode->i_sb)) {
49 return -ENOTTY; 55 err = -ENOTTY;
56 break;
57 }
50 58
51 err = mnt_want_write(filp->f_path.mnt); 59 err = mnt_want_write(filp->f_path.mnt);
52 if (err) 60 if (err)
53 return err; 61 break;
54 62
55 if (!is_owner_or_cap(inode)) { 63 if (!is_owner_or_cap(inode)) {
56 err = -EPERM; 64 err = -EPERM;
@@ -90,16 +98,18 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
90 mark_inode_dirty(inode); 98 mark_inode_dirty(inode);
91setflags_out: 99setflags_out:
92 mnt_drop_write(filp->f_path.mnt); 100 mnt_drop_write(filp->f_path.mnt);
93 return err; 101 break;
94 } 102 }
95 case REISERFS_IOC_GETVERSION: 103 case REISERFS_IOC_GETVERSION:
96 return put_user(inode->i_generation, (int __user *)arg); 104 err = put_user(inode->i_generation, (int __user *)arg);
105 break;
97 case REISERFS_IOC_SETVERSION: 106 case REISERFS_IOC_SETVERSION:
98 if (!is_owner_or_cap(inode)) 107 if (!is_owner_or_cap(inode))
99 return -EPERM; 108 err = -EPERM;
109 break;
100 err = mnt_want_write(filp->f_path.mnt); 110 err = mnt_want_write(filp->f_path.mnt);
101 if (err) 111 if (err)
102 return err; 112 break;
103 if (get_user(inode->i_generation, (int __user *)arg)) { 113 if (get_user(inode->i_generation, (int __user *)arg)) {
104 err = -EFAULT; 114 err = -EFAULT;
105 goto setversion_out; 115 goto setversion_out;
@@ -108,19 +118,20 @@ setflags_out:
108 mark_inode_dirty(inode); 118 mark_inode_dirty(inode);
109setversion_out: 119setversion_out:
110 mnt_drop_write(filp->f_path.mnt); 120 mnt_drop_write(filp->f_path.mnt);
111 return err; 121 break;
112 default: 122 default:
113 return -ENOTTY; 123 err = -ENOTTY;
114 } 124 }
125
126 reiserfs_write_unlock(inode->i_sb);
127
128 return err;
115} 129}
116 130
117#ifdef CONFIG_COMPAT 131#ifdef CONFIG_COMPAT
118long reiserfs_compat_ioctl(struct file *file, unsigned int cmd, 132long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
119 unsigned long arg) 133 unsigned long arg)
120{ 134{
121 struct inode *inode = file->f_path.dentry->d_inode;
122 int ret;
123
124 /* These are just misnamed, they actually get/put from/to user an int */ 135 /* These are just misnamed, they actually get/put from/to user an int */
125 switch (cmd) { 136 switch (cmd) {
126 case REISERFS_IOC32_UNPACK: 137 case REISERFS_IOC32_UNPACK:
@@ -141,10 +152,8 @@ long reiserfs_compat_ioctl(struct file *file, unsigned int cmd,
141 default: 152 default:
142 return -ENOIOCTLCMD; 153 return -ENOIOCTLCMD;
143 } 154 }
144 lock_kernel(); 155
145 ret = reiserfs_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); 156 return reiserfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
146 unlock_kernel();
147 return ret;
148} 157}
149#endif 158#endif
150 159
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 90622200b39c..2f8a7e7b8dab 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -429,21 +429,6 @@ static void clear_prepared_bits(struct buffer_head *bh)
429 clear_buffer_journal_restore_dirty(bh); 429 clear_buffer_journal_restore_dirty(bh);
430} 430}
431 431
432/* utility function to force a BUG if it is called without the big
433** kernel lock held. caller is the string printed just before calling BUG()
434*/
435void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
436{
437#ifdef CONFIG_SMP
438 if (current->lock_depth < 0) {
439 reiserfs_panic(sb, "journal-1", "%s called without kernel "
440 "lock held", caller);
441 }
442#else
443 ;
444#endif
445}
446
447/* return a cnode with same dev, block number and size in table, or null if not found */ 432/* return a cnode with same dev, block number and size in table, or null if not found */
448static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct 433static inline struct reiserfs_journal_cnode *get_journal_hash_dev(struct
449 super_block 434 super_block
@@ -556,7 +541,8 @@ static inline void insert_journal_hash(struct reiserfs_journal_cnode **table,
556static inline void lock_journal(struct super_block *sb) 541static inline void lock_journal(struct super_block *sb)
557{ 542{
558 PROC_INFO_INC(sb, journal.lock_journal); 543 PROC_INFO_INC(sb, journal.lock_journal);
559 mutex_lock(&SB_JOURNAL(sb)->j_mutex); 544
545 reiserfs_mutex_lock_safe(&SB_JOURNAL(sb)->j_mutex, sb);
560} 546}
561 547
562/* unlock the current transaction */ 548/* unlock the current transaction */
@@ -708,7 +694,9 @@ static void check_barrier_completion(struct super_block *s,
708 disable_barrier(s); 694 disable_barrier(s);
709 set_buffer_uptodate(bh); 695 set_buffer_uptodate(bh);
710 set_buffer_dirty(bh); 696 set_buffer_dirty(bh);
697 reiserfs_write_unlock(s);
711 sync_dirty_buffer(bh); 698 sync_dirty_buffer(bh);
699 reiserfs_write_lock(s);
712 } 700 }
713} 701}
714 702
@@ -996,8 +984,13 @@ static int reiserfs_async_progress_wait(struct super_block *s)
996{ 984{
997 DEFINE_WAIT(wait); 985 DEFINE_WAIT(wait);
998 struct reiserfs_journal *j = SB_JOURNAL(s); 986 struct reiserfs_journal *j = SB_JOURNAL(s);
999 if (atomic_read(&j->j_async_throttle)) 987
988 if (atomic_read(&j->j_async_throttle)) {
989 reiserfs_write_unlock(s);
1000 congestion_wait(BLK_RW_ASYNC, HZ / 10); 990 congestion_wait(BLK_RW_ASYNC, HZ / 10);
991 reiserfs_write_lock(s);
992 }
993
1001 return 0; 994 return 0;
1002} 995}
1003 996
@@ -1043,7 +1036,8 @@ static int flush_commit_list(struct super_block *s,
1043 } 1036 }
1044 1037
1045 /* make sure nobody is trying to flush this one at the same time */ 1038 /* make sure nobody is trying to flush this one at the same time */
1046 mutex_lock(&jl->j_commit_mutex); 1039 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, s);
1040
1047 if (!journal_list_still_alive(s, trans_id)) { 1041 if (!journal_list_still_alive(s, trans_id)) {
1048 mutex_unlock(&jl->j_commit_mutex); 1042 mutex_unlock(&jl->j_commit_mutex);
1049 goto put_jl; 1043 goto put_jl;
@@ -1061,12 +1055,17 @@ static int flush_commit_list(struct super_block *s,
1061 1055
1062 if (!list_empty(&jl->j_bh_list)) { 1056 if (!list_empty(&jl->j_bh_list)) {
1063 int ret; 1057 int ret;
1064 unlock_kernel(); 1058
1059 /*
1060 * We might sleep in numerous places inside
1061 * write_ordered_buffers. Relax the write lock.
1062 */
1063 reiserfs_write_unlock(s);
1065 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock, 1064 ret = write_ordered_buffers(&journal->j_dirty_buffers_lock,
1066 journal, jl, &jl->j_bh_list); 1065 journal, jl, &jl->j_bh_list);
1067 if (ret < 0 && retval == 0) 1066 if (ret < 0 && retval == 0)
1068 retval = ret; 1067 retval = ret;
1069 lock_kernel(); 1068 reiserfs_write_lock(s);
1070 } 1069 }
1071 BUG_ON(!list_empty(&jl->j_bh_list)); 1070 BUG_ON(!list_empty(&jl->j_bh_list));
1072 /* 1071 /*
@@ -1085,8 +1084,11 @@ static int flush_commit_list(struct super_block *s,
1085 SB_ONDISK_JOURNAL_SIZE(s); 1084 SB_ONDISK_JOURNAL_SIZE(s);
1086 tbh = journal_find_get_block(s, bn); 1085 tbh = journal_find_get_block(s, bn);
1087 if (tbh) { 1086 if (tbh) {
1088 if (buffer_dirty(tbh)) 1087 if (buffer_dirty(tbh)) {
1089 ll_rw_block(WRITE, 1, &tbh) ; 1088 reiserfs_write_unlock(s);
1089 ll_rw_block(WRITE, 1, &tbh);
1090 reiserfs_write_lock(s);
1091 }
1090 put_bh(tbh) ; 1092 put_bh(tbh) ;
1091 } 1093 }
1092 } 1094 }
@@ -1114,12 +1116,19 @@ static int flush_commit_list(struct super_block *s,
1114 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) + 1116 bn = SB_ONDISK_JOURNAL_1st_BLOCK(s) +
1115 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s); 1117 (jl->j_start + i) % SB_ONDISK_JOURNAL_SIZE(s);
1116 tbh = journal_find_get_block(s, bn); 1118 tbh = journal_find_get_block(s, bn);
1119
1120 reiserfs_write_unlock(s);
1117 wait_on_buffer(tbh); 1121 wait_on_buffer(tbh);
1122 reiserfs_write_lock(s);
1118 // since we're using ll_rw_blk above, it might have skipped over 1123 // since we're using ll_rw_blk above, it might have skipped over
1119 // a locked buffer. Double check here 1124 // a locked buffer. Double check here
1120 // 1125 //
1121 if (buffer_dirty(tbh)) /* redundant, sync_dirty_buffer() checks */ 1126 /* redundant, sync_dirty_buffer() checks */
1127 if (buffer_dirty(tbh)) {
1128 reiserfs_write_unlock(s);
1122 sync_dirty_buffer(tbh); 1129 sync_dirty_buffer(tbh);
1130 reiserfs_write_lock(s);
1131 }
1123 if (unlikely(!buffer_uptodate(tbh))) { 1132 if (unlikely(!buffer_uptodate(tbh))) {
1124#ifdef CONFIG_REISERFS_CHECK 1133#ifdef CONFIG_REISERFS_CHECK
1125 reiserfs_warning(s, "journal-601", 1134 reiserfs_warning(s, "journal-601",
@@ -1143,10 +1152,15 @@ static int flush_commit_list(struct super_block *s,
1143 if (buffer_dirty(jl->j_commit_bh)) 1152 if (buffer_dirty(jl->j_commit_bh))
1144 BUG(); 1153 BUG();
1145 mark_buffer_dirty(jl->j_commit_bh) ; 1154 mark_buffer_dirty(jl->j_commit_bh) ;
1155 reiserfs_write_unlock(s);
1146 sync_dirty_buffer(jl->j_commit_bh) ; 1156 sync_dirty_buffer(jl->j_commit_bh) ;
1157 reiserfs_write_lock(s);
1147 } 1158 }
1148 } else 1159 } else {
1160 reiserfs_write_unlock(s);
1149 wait_on_buffer(jl->j_commit_bh); 1161 wait_on_buffer(jl->j_commit_bh);
1162 reiserfs_write_lock(s);
1163 }
1150 1164
1151 check_barrier_completion(s, jl->j_commit_bh); 1165 check_barrier_completion(s, jl->j_commit_bh);
1152 1166
@@ -1286,7 +1300,9 @@ static int _update_journal_header_block(struct super_block *sb,
1286 1300
1287 if (trans_id >= journal->j_last_flush_trans_id) { 1301 if (trans_id >= journal->j_last_flush_trans_id) {
1288 if (buffer_locked((journal->j_header_bh))) { 1302 if (buffer_locked((journal->j_header_bh))) {
1303 reiserfs_write_unlock(sb);
1289 wait_on_buffer((journal->j_header_bh)); 1304 wait_on_buffer((journal->j_header_bh));
1305 reiserfs_write_lock(sb);
1290 if (unlikely(!buffer_uptodate(journal->j_header_bh))) { 1306 if (unlikely(!buffer_uptodate(journal->j_header_bh))) {
1291#ifdef CONFIG_REISERFS_CHECK 1307#ifdef CONFIG_REISERFS_CHECK
1292 reiserfs_warning(sb, "journal-699", 1308 reiserfs_warning(sb, "journal-699",
@@ -1312,12 +1328,16 @@ static int _update_journal_header_block(struct super_block *sb,
1312 disable_barrier(sb); 1328 disable_barrier(sb);
1313 goto sync; 1329 goto sync;
1314 } 1330 }
1331 reiserfs_write_unlock(sb);
1315 wait_on_buffer(journal->j_header_bh); 1332 wait_on_buffer(journal->j_header_bh);
1333 reiserfs_write_lock(sb);
1316 check_barrier_completion(sb, journal->j_header_bh); 1334 check_barrier_completion(sb, journal->j_header_bh);
1317 } else { 1335 } else {
1318 sync: 1336 sync:
1319 set_buffer_dirty(journal->j_header_bh); 1337 set_buffer_dirty(journal->j_header_bh);
1338 reiserfs_write_unlock(sb);
1320 sync_dirty_buffer(journal->j_header_bh); 1339 sync_dirty_buffer(journal->j_header_bh);
1340 reiserfs_write_lock(sb);
1321 } 1341 }
1322 if (!buffer_uptodate(journal->j_header_bh)) { 1342 if (!buffer_uptodate(journal->j_header_bh)) {
1323 reiserfs_warning(sb, "journal-837", 1343 reiserfs_warning(sb, "journal-837",
@@ -1409,7 +1429,7 @@ static int flush_journal_list(struct super_block *s,
1409 1429
1410 /* if flushall == 0, the lock is already held */ 1430 /* if flushall == 0, the lock is already held */
1411 if (flushall) { 1431 if (flushall) {
1412 mutex_lock(&journal->j_flush_mutex); 1432 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1413 } else if (mutex_trylock(&journal->j_flush_mutex)) { 1433 } else if (mutex_trylock(&journal->j_flush_mutex)) {
1414 BUG(); 1434 BUG();
1415 } 1435 }
@@ -1553,7 +1573,11 @@ static int flush_journal_list(struct super_block *s,
1553 reiserfs_panic(s, "journal-1011", 1573 reiserfs_panic(s, "journal-1011",
1554 "cn->bh is NULL"); 1574 "cn->bh is NULL");
1555 } 1575 }
1576
1577 reiserfs_write_unlock(s);
1556 wait_on_buffer(cn->bh); 1578 wait_on_buffer(cn->bh);
1579 reiserfs_write_lock(s);
1580
1557 if (!cn->bh) { 1581 if (!cn->bh) {
1558 reiserfs_panic(s, "journal-1012", 1582 reiserfs_panic(s, "journal-1012",
1559 "cn->bh is NULL"); 1583 "cn->bh is NULL");
@@ -1769,7 +1793,7 @@ static int kupdate_transactions(struct super_block *s,
1769 struct reiserfs_journal *journal = SB_JOURNAL(s); 1793 struct reiserfs_journal *journal = SB_JOURNAL(s);
1770 chunk.nr = 0; 1794 chunk.nr = 0;
1771 1795
1772 mutex_lock(&journal->j_flush_mutex); 1796 reiserfs_mutex_lock_safe(&journal->j_flush_mutex, s);
1773 if (!journal_list_still_alive(s, orig_trans_id)) { 1797 if (!journal_list_still_alive(s, orig_trans_id)) {
1774 goto done; 1798 goto done;
1775 } 1799 }
@@ -1973,11 +1997,19 @@ static int do_journal_release(struct reiserfs_transaction_handle *th,
1973 reiserfs_mounted_fs_count--; 1997 reiserfs_mounted_fs_count--;
1974 /* wait for all commits to finish */ 1998 /* wait for all commits to finish */
1975 cancel_delayed_work(&SB_JOURNAL(sb)->j_work); 1999 cancel_delayed_work(&SB_JOURNAL(sb)->j_work);
2000
2001 /*
2002 * We must release the write lock here because
2003 * the workqueue job (flush_async_commit) needs this lock
2004 */
2005 reiserfs_write_unlock(sb);
1976 flush_workqueue(commit_wq); 2006 flush_workqueue(commit_wq);
2007
1977 if (!reiserfs_mounted_fs_count) { 2008 if (!reiserfs_mounted_fs_count) {
1978 destroy_workqueue(commit_wq); 2009 destroy_workqueue(commit_wq);
1979 commit_wq = NULL; 2010 commit_wq = NULL;
1980 } 2011 }
2012 reiserfs_write_lock(sb);
1981 2013
1982 free_journal_ram(sb); 2014 free_journal_ram(sb);
1983 2015
@@ -2243,7 +2275,11 @@ static int journal_read_transaction(struct super_block *sb,
2243 /* read in the log blocks, memcpy to the corresponding real block */ 2275 /* read in the log blocks, memcpy to the corresponding real block */
2244 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks); 2276 ll_rw_block(READ, get_desc_trans_len(desc), log_blocks);
2245 for (i = 0; i < get_desc_trans_len(desc); i++) { 2277 for (i = 0; i < get_desc_trans_len(desc); i++) {
2278
2279 reiserfs_write_unlock(sb);
2246 wait_on_buffer(log_blocks[i]); 2280 wait_on_buffer(log_blocks[i]);
2281 reiserfs_write_lock(sb);
2282
2247 if (!buffer_uptodate(log_blocks[i])) { 2283 if (!buffer_uptodate(log_blocks[i])) {
2248 reiserfs_warning(sb, "journal-1212", 2284 reiserfs_warning(sb, "journal-1212",
2249 "REPLAY FAILURE fsck required! " 2285 "REPLAY FAILURE fsck required! "
@@ -2765,11 +2801,27 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2765 goto free_and_return; 2801 goto free_and_return;
2766 } 2802 }
2767 2803
2804 /*
2805 * We need to unlock here to avoid creating the following
2806 * dependency:
2807 * reiserfs_lock -> sysfs_mutex
2808 * Because the reiserfs mmap path creates the following dependency:
2809 * mm->mmap -> reiserfs_lock, hence we have
2810 * mm->mmap -> reiserfs_lock ->sysfs_mutex
2811 * This would ends up in a circular dependency with sysfs readdir path
2812 * which does sysfs_mutex -> mm->mmap_sem
2813 * This is fine because the reiserfs lock is useless in mount path,
2814 * at least until we call journal_begin. We keep it for paranoid
2815 * reasons.
2816 */
2817 reiserfs_write_unlock(sb);
2768 if (journal_init_dev(sb, journal, j_dev_name) != 0) { 2818 if (journal_init_dev(sb, journal, j_dev_name) != 0) {
2819 reiserfs_write_lock(sb);
2769 reiserfs_warning(sb, "sh-462", 2820 reiserfs_warning(sb, "sh-462",
2770 "unable to initialize jornal device"); 2821 "unable to initialize jornal device");
2771 goto free_and_return; 2822 goto free_and_return;
2772 } 2823 }
2824 reiserfs_write_lock(sb);
2773 2825
2774 rs = SB_DISK_SUPER_BLOCK(sb); 2826 rs = SB_DISK_SUPER_BLOCK(sb);
2775 2827
@@ -2881,8 +2933,11 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
2881 } 2933 }
2882 2934
2883 reiserfs_mounted_fs_count++; 2935 reiserfs_mounted_fs_count++;
2884 if (reiserfs_mounted_fs_count <= 1) 2936 if (reiserfs_mounted_fs_count <= 1) {
2937 reiserfs_write_unlock(sb);
2885 commit_wq = create_workqueue("reiserfs"); 2938 commit_wq = create_workqueue("reiserfs");
2939 reiserfs_write_lock(sb);
2940 }
2886 2941
2887 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits); 2942 INIT_DELAYED_WORK(&journal->j_work, flush_async_commits);
2888 journal->j_work_sb = sb; 2943 journal->j_work_sb = sb;
@@ -2964,8 +3019,11 @@ static void queue_log_writer(struct super_block *s)
2964 init_waitqueue_entry(&wait, current); 3019 init_waitqueue_entry(&wait, current);
2965 add_wait_queue(&journal->j_join_wait, &wait); 3020 add_wait_queue(&journal->j_join_wait, &wait);
2966 set_current_state(TASK_UNINTERRUPTIBLE); 3021 set_current_state(TASK_UNINTERRUPTIBLE);
2967 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) 3022 if (test_bit(J_WRITERS_QUEUED, &journal->j_state)) {
3023 reiserfs_write_unlock(s);
2968 schedule(); 3024 schedule();
3025 reiserfs_write_lock(s);
3026 }
2969 __set_current_state(TASK_RUNNING); 3027 __set_current_state(TASK_RUNNING);
2970 remove_wait_queue(&journal->j_join_wait, &wait); 3028 remove_wait_queue(&journal->j_join_wait, &wait);
2971} 3029}
@@ -2982,7 +3040,9 @@ static void let_transaction_grow(struct super_block *sb, unsigned int trans_id)
2982 struct reiserfs_journal *journal = SB_JOURNAL(sb); 3040 struct reiserfs_journal *journal = SB_JOURNAL(sb);
2983 unsigned long bcount = journal->j_bcount; 3041 unsigned long bcount = journal->j_bcount;
2984 while (1) { 3042 while (1) {
3043 reiserfs_write_unlock(sb);
2985 schedule_timeout_uninterruptible(1); 3044 schedule_timeout_uninterruptible(1);
3045 reiserfs_write_lock(sb);
2986 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING; 3046 journal->j_current_jl->j_state |= LIST_COMMIT_PENDING;
2987 while ((atomic_read(&journal->j_wcount) > 0 || 3047 while ((atomic_read(&journal->j_wcount) > 0 ||
2988 atomic_read(&journal->j_jlock)) && 3048 atomic_read(&journal->j_jlock)) &&
@@ -3033,7 +3093,9 @@ static int do_journal_begin_r(struct reiserfs_transaction_handle *th,
3033 3093
3034 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) { 3094 if (test_bit(J_WRITERS_BLOCKED, &journal->j_state)) {
3035 unlock_journal(sb); 3095 unlock_journal(sb);
3096 reiserfs_write_unlock(sb);
3036 reiserfs_wait_on_write_block(sb); 3097 reiserfs_wait_on_write_block(sb);
3098 reiserfs_write_lock(sb);
3037 PROC_INFO_INC(sb, journal.journal_relock_writers); 3099 PROC_INFO_INC(sb, journal.journal_relock_writers);
3038 goto relock; 3100 goto relock;
3039 } 3101 }
@@ -3506,14 +3568,14 @@ static void flush_async_commits(struct work_struct *work)
3506 struct reiserfs_journal_list *jl; 3568 struct reiserfs_journal_list *jl;
3507 struct list_head *entry; 3569 struct list_head *entry;
3508 3570
3509 lock_kernel(); 3571 reiserfs_write_lock(sb);
3510 if (!list_empty(&journal->j_journal_list)) { 3572 if (!list_empty(&journal->j_journal_list)) {
3511 /* last entry is the youngest, commit it and you get everything */ 3573 /* last entry is the youngest, commit it and you get everything */
3512 entry = journal->j_journal_list.prev; 3574 entry = journal->j_journal_list.prev;
3513 jl = JOURNAL_LIST_ENTRY(entry); 3575 jl = JOURNAL_LIST_ENTRY(entry);
3514 flush_commit_list(sb, jl, 1); 3576 flush_commit_list(sb, jl, 1);
3515 } 3577 }
3516 unlock_kernel(); 3578 reiserfs_write_unlock(sb);
3517} 3579}
3518 3580
3519/* 3581/*
@@ -4041,7 +4103,7 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4041 * the new transaction is fully setup, and we've already flushed the 4103 * the new transaction is fully setup, and we've already flushed the
4042 * ordered bh list 4104 * ordered bh list
4043 */ 4105 */
4044 mutex_lock(&jl->j_commit_mutex); 4106 reiserfs_mutex_lock_safe(&jl->j_commit_mutex, sb);
4045 4107
4046 /* save the transaction id in case we need to commit it later */ 4108 /* save the transaction id in case we need to commit it later */
4047 commit_trans_id = jl->j_trans_id; 4109 commit_trans_id = jl->j_trans_id;
@@ -4156,7 +4218,9 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4156 next = cn->next; 4218 next = cn->next;
4157 free_cnode(sb, cn); 4219 free_cnode(sb, cn);
4158 cn = next; 4220 cn = next;
4221 reiserfs_write_unlock(sb);
4159 cond_resched(); 4222 cond_resched();
4223 reiserfs_write_lock(sb);
4160 } 4224 }
4161 4225
4162 /* we are done with both the c_bh and d_bh, but 4226 /* we are done with both the c_bh and d_bh, but
@@ -4203,10 +4267,10 @@ static int do_journal_end(struct reiserfs_transaction_handle *th,
4203 * is lost. 4267 * is lost.
4204 */ 4268 */
4205 if (!list_empty(&jl->j_tail_bh_list)) { 4269 if (!list_empty(&jl->j_tail_bh_list)) {
4206 unlock_kernel(); 4270 reiserfs_write_unlock(sb);
4207 write_ordered_buffers(&journal->j_dirty_buffers_lock, 4271 write_ordered_buffers(&journal->j_dirty_buffers_lock,
4208 journal, jl, &jl->j_tail_bh_list); 4272 journal, jl, &jl->j_tail_bh_list);
4209 lock_kernel(); 4273 reiserfs_write_lock(sb);
4210 } 4274 }
4211 BUG_ON(!list_empty(&jl->j_tail_bh_list)); 4275 BUG_ON(!list_empty(&jl->j_tail_bh_list));
4212 mutex_unlock(&jl->j_commit_mutex); 4276 mutex_unlock(&jl->j_commit_mutex);
diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c
new file mode 100644
index 000000000000..ee2cfc0fd8a7
--- /dev/null
+++ b/fs/reiserfs/lock.c
@@ -0,0 +1,88 @@
1#include <linux/reiserfs_fs.h>
2#include <linux/mutex.h>
3
4/*
5 * The previous reiserfs locking scheme was heavily based on
6 * the tricky properties of the Bkl:
7 *
8 * - it was acquired recursively by a same task
9 * - the performances relied on the release-while-schedule() property
10 *
11 * Now that we replace it by a mutex, we still want to keep the same
12 * recursive property to avoid big changes in the code structure.
13 * We use our own lock_owner here because the owner field on a mutex
14 * is only available in SMP or mutex debugging, also we only need this field
15 * for this mutex, no need for a system wide mutex facility.
16 *
17 * Also this lock is often released before a call that could block because
18 * reiserfs performances were partialy based on the release while schedule()
19 * property of the Bkl.
20 */
21void reiserfs_write_lock(struct super_block *s)
22{
23 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
24
25 if (sb_i->lock_owner != current) {
26 mutex_lock(&sb_i->lock);
27 sb_i->lock_owner = current;
28 }
29
30 /* No need to protect it, only the current task touches it */
31 sb_i->lock_depth++;
32}
33
34void reiserfs_write_unlock(struct super_block *s)
35{
36 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
37
38 /*
39 * Are we unlocking without even holding the lock?
40 * Such a situation must raise a BUG() if we don't want
41 * to corrupt the data.
42 */
43 BUG_ON(sb_i->lock_owner != current);
44
45 if (--sb_i->lock_depth == -1) {
46 sb_i->lock_owner = NULL;
47 mutex_unlock(&sb_i->lock);
48 }
49}
50
51/*
52 * If we already own the lock, just exit and don't increase the depth.
53 * Useful when we don't want to lock more than once.
54 *
55 * We always return the lock_depth we had before calling
56 * this function.
57 */
58int reiserfs_write_lock_once(struct super_block *s)
59{
60 struct reiserfs_sb_info *sb_i = REISERFS_SB(s);
61
62 if (sb_i->lock_owner != current) {
63 mutex_lock(&sb_i->lock);
64 sb_i->lock_owner = current;
65 return sb_i->lock_depth++;
66 }
67
68 return sb_i->lock_depth;
69}
70
/*
 * Counterpart of reiserfs_write_lock_once(): @lock_depth is the value
 * that call returned. The lock is released only when that value is -1;
 * for any other value this function does nothing.
 */
void reiserfs_write_unlock_once(struct super_block *s, int lock_depth)
{
	if (lock_depth != -1)
		return;

	reiserfs_write_unlock(s);
}
76
77/*
78 * Utility function to force a BUG if it is called without the superblock
79 * write lock held. caller is the string printed just before calling BUG()
80 */
81void reiserfs_check_lock_depth(struct super_block *sb, char *caller)
82{
83 struct reiserfs_sb_info *sb_i = REISERFS_SB(sb);
84
85 if (sb_i->lock_depth < 0)
86 reiserfs_panic(sb, "%s called without kernel lock held %d",
87 caller);
88}
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 271579128634..e296ff72a6cc 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -324,6 +324,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
324 struct nameidata *nd) 324 struct nameidata *nd)
325{ 325{
326 int retval; 326 int retval;
327 int lock_depth;
327 struct inode *inode = NULL; 328 struct inode *inode = NULL;
328 struct reiserfs_dir_entry de; 329 struct reiserfs_dir_entry de;
329 INITIALIZE_PATH(path_to_entry); 330 INITIALIZE_PATH(path_to_entry);
@@ -331,7 +332,13 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
331 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len) 332 if (REISERFS_MAX_NAME(dir->i_sb->s_blocksize) < dentry->d_name.len)
332 return ERR_PTR(-ENAMETOOLONG); 333 return ERR_PTR(-ENAMETOOLONG);
333 334
334 reiserfs_write_lock(dir->i_sb); 335 /*
336 * Might be called with or without the write lock, must be careful
337 * to not recursively hold it in case we want to release the lock
338 * before rescheduling.
339 */
340 lock_depth = reiserfs_write_lock_once(dir->i_sb);
341
335 de.de_gen_number_bit_string = NULL; 342 de.de_gen_number_bit_string = NULL;
336 retval = 343 retval =
337 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len, 344 reiserfs_find_entry(dir, dentry->d_name.name, dentry->d_name.len,
@@ -341,7 +348,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
341 inode = reiserfs_iget(dir->i_sb, 348 inode = reiserfs_iget(dir->i_sb,
342 (struct cpu_key *)&(de.de_dir_id)); 349 (struct cpu_key *)&(de.de_dir_id));
343 if (!inode || IS_ERR(inode)) { 350 if (!inode || IS_ERR(inode)) {
344 reiserfs_write_unlock(dir->i_sb); 351 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
345 return ERR_PTR(-EACCES); 352 return ERR_PTR(-EACCES);
346 } 353 }
347 354
@@ -350,7 +357,7 @@ static struct dentry *reiserfs_lookup(struct inode *dir, struct dentry *dentry,
350 if (IS_PRIVATE(dir)) 357 if (IS_PRIVATE(dir))
351 inode->i_flags |= S_PRIVATE; 358 inode->i_flags |= S_PRIVATE;
352 } 359 }
353 reiserfs_write_unlock(dir->i_sb); 360 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
354 if (retval == IO_ERROR) { 361 if (retval == IO_ERROR) {
355 return ERR_PTR(-EIO); 362 return ERR_PTR(-EIO);
356 } 363 }
@@ -725,6 +732,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
725 struct inode *inode; 732 struct inode *inode;
726 struct reiserfs_transaction_handle th; 733 struct reiserfs_transaction_handle th;
727 struct reiserfs_security_handle security; 734 struct reiserfs_security_handle security;
735 int lock_depth;
728 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */ 736 /* We need blocks for transaction + (user+group)*(quotas for new inode + update of quota for directory owner) */
729 int jbegin_count = 737 int jbegin_count =
730 JOURNAL_PER_BALANCE_CNT * 3 + 738 JOURNAL_PER_BALANCE_CNT * 3 +
@@ -748,7 +756,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
748 return retval; 756 return retval;
749 } 757 }
750 jbegin_count += retval; 758 jbegin_count += retval;
751 reiserfs_write_lock(dir->i_sb); 759 lock_depth = reiserfs_write_lock_once(dir->i_sb);
752 760
753 retval = journal_begin(&th, dir->i_sb, jbegin_count); 761 retval = journal_begin(&th, dir->i_sb, jbegin_count);
754 if (retval) { 762 if (retval) {
@@ -798,8 +806,8 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
798 d_instantiate(dentry, inode); 806 d_instantiate(dentry, inode);
799 unlock_new_inode(inode); 807 unlock_new_inode(inode);
800 retval = journal_end(&th, dir->i_sb, jbegin_count); 808 retval = journal_end(&th, dir->i_sb, jbegin_count);
801 out_failed: 809out_failed:
802 reiserfs_write_unlock(dir->i_sb); 810 reiserfs_write_unlock_once(dir->i_sb, lock_depth);
803 return retval; 811 return retval;
804} 812}
805 813
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index 536eacaeb710..adbc6f538515 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -349,10 +349,6 @@ void reiserfs_debug(struct super_block *s, int level, const char *fmt, ...)
349 349
350 . */ 350 . */
351 351
352#ifdef CONFIG_REISERFS_CHECK
353extern struct tree_balance *cur_tb;
354#endif
355
356void __reiserfs_panic(struct super_block *sb, const char *id, 352void __reiserfs_panic(struct super_block *sb, const char *id,
357 const char *function, const char *fmt, ...) 353 const char *function, const char *fmt, ...)
358{ 354{
diff --git a/fs/reiserfs/resize.c b/fs/reiserfs/resize.c
index 18b315d3d104..b3a94d20f0fc 100644
--- a/fs/reiserfs/resize.c
+++ b/fs/reiserfs/resize.c
@@ -141,7 +141,9 @@ int reiserfs_resize(struct super_block *s, unsigned long block_count_new)
141 141
142 set_buffer_uptodate(bh); 142 set_buffer_uptodate(bh);
143 mark_buffer_dirty(bh); 143 mark_buffer_dirty(bh);
144 reiserfs_write_unlock(s);
144 sync_dirty_buffer(bh); 145 sync_dirty_buffer(bh);
146 reiserfs_write_lock(s);
145 // update bitmap_info stuff 147 // update bitmap_info stuff
146 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1; 148 bitmap[i].free_count = sb_blocksize(sb) * 8 - 1;
147 brelse(bh); 149 brelse(bh);
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index d036ee5b1c81..5fa7118f04e1 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -222,9 +222,6 @@ static inline int bin_search(const void *key, /* Key to search for. */
222 return ITEM_NOT_FOUND; 222 return ITEM_NOT_FOUND;
223} 223}
224 224
225#ifdef CONFIG_REISERFS_CHECK
226extern struct tree_balance *cur_tb;
227#endif
228 225
229/* Minimal possible key. It is never in the tree. */ 226/* Minimal possible key. It is never in the tree. */
230const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} }; 227const struct reiserfs_key MIN_KEY = { 0, 0, {{0, 0},} };
@@ -519,25 +516,48 @@ static int is_tree_node(struct buffer_head *bh, int level)
519 516
520#define SEARCH_BY_KEY_READA 16 517#define SEARCH_BY_KEY_READA 16
521 518
522/* The function is NOT SCHEDULE-SAFE! */ 519/*
523static void search_by_key_reada(struct super_block *s, 520 * The function is NOT SCHEDULE-SAFE!
521 * It might unlock the write lock if we needed to wait for a block
522 * to be read. Note that in this case it won't recover the lock to avoid
523 * high contention resulting from too much lock requests, especially
524 * the caller (search_by_key) will perform other schedule-unsafe
525 * operations just after calling this function.
526 *
527 * @return true if we have unlocked
528 */
529static bool search_by_key_reada(struct super_block *s,
524 struct buffer_head **bh, 530 struct buffer_head **bh,
525 b_blocknr_t *b, int num) 531 b_blocknr_t *b, int num)
526{ 532{
527 int i, j; 533 int i, j;
534 bool unlocked = false;
528 535
529 for (i = 0; i < num; i++) { 536 for (i = 0; i < num; i++) {
530 bh[i] = sb_getblk(s, b[i]); 537 bh[i] = sb_getblk(s, b[i]);
531 } 538 }
539 /*
540 * We are going to read some blocks on which we
541 * have a reference. It's safe, though we might be
542 * reading blocks concurrently changed if we release
543 * the lock. But it's still fine because we check later
544 * if the tree changed
545 */
532 for (j = 0; j < i; j++) { 546 for (j = 0; j < i; j++) {
533 /* 547 /*
534 * note, this needs attention if we are getting rid of the BKL 548 * note, this needs attention if we are getting rid of the BKL
535 * you have to make sure the prepared bit isn't set on this buffer 549 * you have to make sure the prepared bit isn't set on this buffer
536 */ 550 */
537 if (!buffer_uptodate(bh[j])) 551 if (!buffer_uptodate(bh[j])) {
552 if (!unlocked) {
553 reiserfs_write_unlock(s);
554 unlocked = true;
555 }
538 ll_rw_block(READA, 1, bh + j); 556 ll_rw_block(READA, 1, bh + j);
557 }
539 brelse(bh[j]); 558 brelse(bh[j]);
540 } 559 }
560 return unlocked;
541} 561}
542 562
543/************************************************************************** 563/**************************************************************************
@@ -625,11 +645,26 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
625 have a pointer to it. */ 645 have a pointer to it. */
626 if ((bh = last_element->pe_buffer = 646 if ((bh = last_element->pe_buffer =
627 sb_getblk(sb, block_number))) { 647 sb_getblk(sb, block_number))) {
648 bool unlocked = false;
649
628 if (!buffer_uptodate(bh) && reada_count > 1) 650 if (!buffer_uptodate(bh) && reada_count > 1)
629 search_by_key_reada(sb, reada_bh, 651 /* may unlock the write lock */
652 unlocked = search_by_key_reada(sb, reada_bh,
630 reada_blocks, reada_count); 653 reada_blocks, reada_count);
654 /*
655 * If we haven't already unlocked the write lock,
656 * then we need to do that here before reading
657 * the current block
658 */
659 if (!buffer_uptodate(bh) && !unlocked) {
660 reiserfs_write_unlock(sb);
661 unlocked = true;
662 }
631 ll_rw_block(READ, 1, &bh); 663 ll_rw_block(READ, 1, &bh);
632 wait_on_buffer(bh); 664 wait_on_buffer(bh);
665
666 if (unlocked)
667 reiserfs_write_lock(sb);
633 if (!buffer_uptodate(bh)) 668 if (!buffer_uptodate(bh))
634 goto io_error; 669 goto io_error;
635 } else { 670 } else {
@@ -673,7 +708,7 @@ int search_by_key(struct super_block *sb, const struct cpu_key *key, /* Key to s
673 !key_in_buffer(search_path, key, sb), 708 !key_in_buffer(search_path, key, sb),
674 "PAP-5130: key is not in the buffer"); 709 "PAP-5130: key is not in the buffer");
675#ifdef CONFIG_REISERFS_CHECK 710#ifdef CONFIG_REISERFS_CHECK
676 if (cur_tb) { 711 if (REISERFS_SB(sb)->cur_tb) {
677 print_cur_tb("5140"); 712 print_cur_tb("5140");
678 reiserfs_panic(sb, "PAP-5140", 713 reiserfs_panic(sb, "PAP-5140",
679 "schedule occurred in do_balance!"); 714 "schedule occurred in do_balance!");
@@ -1024,7 +1059,9 @@ static char prepare_for_delete_or_cut(struct reiserfs_transaction_handle *th, st
1024 reiserfs_free_block(th, inode, block, 1); 1059 reiserfs_free_block(th, inode, block, 1);
1025 } 1060 }
1026 1061
1062 reiserfs_write_unlock(sb);
1027 cond_resched(); 1063 cond_resched();
1064 reiserfs_write_lock(sb);
1028 1065
1029 if (item_moved (&s_ih, path)) { 1066 if (item_moved (&s_ih, path)) {
1030 need_re_search = 1; 1067 need_re_search = 1;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index f0ad05f38022..339b0baf2af6 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -465,7 +465,7 @@ static void reiserfs_put_super(struct super_block *s)
465 struct reiserfs_transaction_handle th; 465 struct reiserfs_transaction_handle th;
466 th.t_trans_id = 0; 466 th.t_trans_id = 0;
467 467
468 lock_kernel(); 468 reiserfs_write_lock(s);
469 469
470 if (s->s_dirt) 470 if (s->s_dirt)
471 reiserfs_write_super(s); 471 reiserfs_write_super(s);
@@ -499,10 +499,10 @@ static void reiserfs_put_super(struct super_block *s)
499 499
500 reiserfs_proc_info_done(s); 500 reiserfs_proc_info_done(s);
501 501
502 reiserfs_write_unlock(s);
503 mutex_destroy(&REISERFS_SB(s)->lock);
502 kfree(s->s_fs_info); 504 kfree(s->s_fs_info);
503 s->s_fs_info = NULL; 505 s->s_fs_info = NULL;
504
505 unlock_kernel();
506} 506}
507 507
508static struct kmem_cache *reiserfs_inode_cachep; 508static struct kmem_cache *reiserfs_inode_cachep;
@@ -554,25 +554,28 @@ static void reiserfs_dirty_inode(struct inode *inode)
554 struct reiserfs_transaction_handle th; 554 struct reiserfs_transaction_handle th;
555 555
556 int err = 0; 556 int err = 0;
557 int lock_depth;
558
557 if (inode->i_sb->s_flags & MS_RDONLY) { 559 if (inode->i_sb->s_flags & MS_RDONLY) {
558 reiserfs_warning(inode->i_sb, "clm-6006", 560 reiserfs_warning(inode->i_sb, "clm-6006",
559 "writing inode %lu on readonly FS", 561 "writing inode %lu on readonly FS",
560 inode->i_ino); 562 inode->i_ino);
561 return; 563 return;
562 } 564 }
563 reiserfs_write_lock(inode->i_sb); 565 lock_depth = reiserfs_write_lock_once(inode->i_sb);
564 566
565 /* this is really only used for atime updates, so they don't have 567 /* this is really only used for atime updates, so they don't have
566 ** to be included in O_SYNC or fsync 568 ** to be included in O_SYNC or fsync
567 */ 569 */
568 err = journal_begin(&th, inode->i_sb, 1); 570 err = journal_begin(&th, inode->i_sb, 1);
569 if (err) { 571 if (err)
570 reiserfs_write_unlock(inode->i_sb); 572 goto out;
571 return; 573
572 }
573 reiserfs_update_sd(&th, inode); 574 reiserfs_update_sd(&th, inode);
574 journal_end(&th, inode->i_sb, 1); 575 journal_end(&th, inode->i_sb, 1);
575 reiserfs_write_unlock(inode->i_sb); 576
577out:
578 reiserfs_write_unlock_once(inode->i_sb, lock_depth);
576} 579}
577 580
578#ifdef CONFIG_QUOTA 581#ifdef CONFIG_QUOTA
@@ -1168,11 +1171,14 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1168 unsigned int qfmt = 0; 1171 unsigned int qfmt = 0;
1169#ifdef CONFIG_QUOTA 1172#ifdef CONFIG_QUOTA
1170 int i; 1173 int i;
1174#endif
1175
1176 reiserfs_write_lock(s);
1171 1177
1178#ifdef CONFIG_QUOTA
1172 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names)); 1179 memcpy(qf_names, REISERFS_SB(s)->s_qf_names, sizeof(qf_names));
1173#endif 1180#endif
1174 1181
1175 lock_kernel();
1176 rs = SB_DISK_SUPER_BLOCK(s); 1182 rs = SB_DISK_SUPER_BLOCK(s);
1177 1183
1178 if (!reiserfs_parse_options 1184 if (!reiserfs_parse_options
@@ -1295,12 +1301,12 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
1295 1301
1296out_ok: 1302out_ok:
1297 replace_mount_options(s, new_opts); 1303 replace_mount_options(s, new_opts);
1298 unlock_kernel(); 1304 reiserfs_write_unlock(s);
1299 return 0; 1305 return 0;
1300 1306
1301out_err: 1307out_err:
1302 kfree(new_opts); 1308 kfree(new_opts);
1303 unlock_kernel(); 1309 reiserfs_write_unlock(s);
1304 return err; 1310 return err;
1305} 1311}
1306 1312
@@ -1404,7 +1410,9 @@ static int read_super_block(struct super_block *s, int offset)
1404static int reread_meta_blocks(struct super_block *s) 1410static int reread_meta_blocks(struct super_block *s)
1405{ 1411{
1406 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s))); 1412 ll_rw_block(READ, 1, &(SB_BUFFER_WITH_SB(s)));
1413 reiserfs_write_unlock(s);
1407 wait_on_buffer(SB_BUFFER_WITH_SB(s)); 1414 wait_on_buffer(SB_BUFFER_WITH_SB(s));
1415 reiserfs_write_lock(s);
1408 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) { 1416 if (!buffer_uptodate(SB_BUFFER_WITH_SB(s))) {
1409 reiserfs_warning(s, "reiserfs-2504", "error reading the super"); 1417 reiserfs_warning(s, "reiserfs-2504", "error reading the super");
1410 return 1; 1418 return 1;
@@ -1613,7 +1621,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1613 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL); 1621 sbi = kzalloc(sizeof(struct reiserfs_sb_info), GFP_KERNEL);
1614 if (!sbi) { 1622 if (!sbi) {
1615 errval = -ENOMEM; 1623 errval = -ENOMEM;
1616 goto error; 1624 goto error_alloc;
1617 } 1625 }
1618 s->s_fs_info = sbi; 1626 s->s_fs_info = sbi;
1619 /* Set default values for options: non-aggressive tails, RO on errors */ 1627 /* Set default values for options: non-aggressive tails, RO on errors */
@@ -1627,6 +1635,20 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1627 /* setup default block allocator options */ 1635 /* setup default block allocator options */
1628 reiserfs_init_alloc_options(s); 1636 reiserfs_init_alloc_options(s);
1629 1637
1638 mutex_init(&REISERFS_SB(s)->lock);
1639 REISERFS_SB(s)->lock_depth = -1;
1640
1641 /*
1642 * This function is called with the bkl, which also was the old
1643 * locking used here.
1644 * do_journal_begin() will soon check if we hold the lock (ie: was the
1645 * bkl). This is likely because do_journal_begin() has several other
1646 * callers because at this time, it doesn't seem to be necessary to
1647 * protect against anything.
1648 * Anyway, let's be conservative and lock for now.
1649 */
1650 reiserfs_write_lock(s);
1651
1630 jdev_name = NULL; 1652 jdev_name = NULL;
1631 if (reiserfs_parse_options 1653 if (reiserfs_parse_options
1632 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name, 1654 (s, (char *)data, &(sbi->s_mount_opt), &blocks, &jdev_name,
@@ -1852,9 +1874,13 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent)
1852 init_waitqueue_head(&(sbi->s_wait)); 1874 init_waitqueue_head(&(sbi->s_wait));
1853 spin_lock_init(&sbi->bitmap_lock); 1875 spin_lock_init(&sbi->bitmap_lock);
1854 1876
1877 reiserfs_write_unlock(s);
1878
1855 return (0); 1879 return (0);
1856 1880
1857error: 1881error:
1882 reiserfs_write_unlock(s);
1883error_alloc:
1858 if (jinit_done) { /* kill the commit thread, free journal ram */ 1884 if (jinit_done) { /* kill the commit thread, free journal ram */
1859 journal_release_error(NULL, s); 1885 journal_release_error(NULL, s);
1860 } 1886 }
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 6925b835a43b..58aa8e75f7f5 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -975,7 +975,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
975 int err = 0; 975 int err = 0;
976 976
977 /* If we don't have the privroot located yet - go find it */ 977 /* If we don't have the privroot located yet - go find it */
978 mutex_lock(&s->s_root->d_inode->i_mutex); 978 reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
979 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, 979 dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
980 strlen(PRIVROOT_NAME)); 980 strlen(PRIVROOT_NAME));
981 if (!IS_ERR(dentry)) { 981 if (!IS_ERR(dentry)) {
@@ -1004,14 +1004,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
1004 goto error; 1004 goto error;
1005 1005
1006 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) { 1006 if (!privroot->d_inode && !(mount_flags & MS_RDONLY)) {
1007 mutex_lock(&s->s_root->d_inode->i_mutex); 1007 reiserfs_mutex_lock_safe(&s->s_root->d_inode->i_mutex, s);
1008 err = create_privroot(REISERFS_SB(s)->priv_root); 1008 err = create_privroot(REISERFS_SB(s)->priv_root);
1009 mutex_unlock(&s->s_root->d_inode->i_mutex); 1009 mutex_unlock(&s->s_root->d_inode->i_mutex);
1010 } 1010 }
1011 1011
1012 if (privroot->d_inode) { 1012 if (privroot->d_inode) {
1013 s->s_xattr = reiserfs_xattr_handlers; 1013 s->s_xattr = reiserfs_xattr_handlers;
1014 mutex_lock(&privroot->d_inode->i_mutex); 1014 reiserfs_mutex_lock_safe(&privroot->d_inode->i_mutex, s);
1015 if (!REISERFS_SB(s)->xattr_root) { 1015 if (!REISERFS_SB(s)->xattr_root) {
1016 struct dentry *dentry; 1016 struct dentry *dentry;
1017 dentry = lookup_one_len(XAROOT_NAME, privroot, 1017 dentry = lookup_one_len(XAROOT_NAME, privroot,
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index dbc093afd946..8a771c59ac3e 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2014,7 +2014,7 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2014 inum = key_inum_flash(c, &dent->key); 2014 inum = key_inum_flash(c, &dent->key);
2015 fscki1 = read_add_inode(c, priv, inum); 2015 fscki1 = read_add_inode(c, priv, inum);
2016 if (IS_ERR(fscki1)) { 2016 if (IS_ERR(fscki1)) {
2017 err = PTR_ERR(fscki); 2017 err = PTR_ERR(fscki1);
2018 ubifs_err("error %d while processing entry node and " 2018 ubifs_err("error %d while processing entry node and "
2019 "trying to find parent inode node %lu", 2019 "trying to find parent inode node %lu",
2020 err, (unsigned long)inum); 2020 err, (unsigned long)inum);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 1009adc8d602..39849f887e72 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1389,7 +1389,6 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1389 unsigned long nr_segs, loff_t pos) 1389 unsigned long nr_segs, loff_t pos)
1390{ 1390{
1391 int err; 1391 int err;
1392 ssize_t ret;
1393 struct inode *inode = iocb->ki_filp->f_mapping->host; 1392 struct inode *inode = iocb->ki_filp->f_mapping->host;
1394 struct ubifs_info *c = inode->i_sb->s_fs_info; 1393 struct ubifs_info *c = inode->i_sb->s_fs_info;
1395 1394
@@ -1397,17 +1396,7 @@ static ssize_t ubifs_aio_write(struct kiocb *iocb, const struct iovec *iov,
1397 if (err) 1396 if (err)
1398 return err; 1397 return err;
1399 1398
1400 ret = generic_file_aio_write(iocb, iov, nr_segs, pos); 1399 return generic_file_aio_write(iocb, iov, nr_segs, pos);
1401 if (ret < 0)
1402 return ret;
1403
1404 if (ret > 0 && (IS_SYNC(inode) || iocb->ki_filp->f_flags & O_SYNC)) {
1405 err = ubifs_sync_wbufs_by_inode(c, inode);
1406 if (err)
1407 return err;
1408 }
1409
1410 return ret;
1411} 1400}
1412 1401
1413static int ubifs_set_page_dirty(struct page *page) 1402static int ubifs_set_page_dirty(struct page *page)
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index f94ddf7efba0..868a55ee080f 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -23,7 +23,7 @@
23/* 23/*
24 * This file implements functions needed to recover from unclean un-mounts. 24 * This file implements functions needed to recover from unclean un-mounts.
25 * When UBIFS is mounted, it checks a flag on the master node to determine if 25 * When UBIFS is mounted, it checks a flag on the master node to determine if
26 * an un-mount was completed sucessfully. If not, the process of mounting 26 * an un-mount was completed successfully. If not, the process of mounting
27 * incorporates additional checking and fixing of on-flash data structures. 27 * incorporates additional checking and fixing of on-flash data structures.
28 * UBIFS always cleans away all remnants of an unclean un-mount, so that 28 * UBIFS always cleans away all remnants of an unclean un-mount, so that
29 * errors do not accumulate. However UBIFS defers recovery if it is mounted 29 * errors do not accumulate. However UBIFS defers recovery if it is mounted
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 333e181ee987..943ad5624530 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1842,22 +1842,32 @@ const struct super_operations ubifs_super_operations = {
1842 * @name: UBI volume name 1842 * @name: UBI volume name
1843 * @mode: UBI volume open mode 1843 * @mode: UBI volume open mode
1844 * 1844 *
1845 * There are several ways to specify UBI volumes when mounting UBIFS: 1845 * The primary method of mounting UBIFS is by specifying the UBI volume
1846 * o ubiX_Y - UBI device number X, volume Y; 1846 * character device node path. However, UBIFS may also be mounted without any
1847 * o ubiY - UBI device number 0, volume Y; 1847 * character device node using one of the following methods:
1848 *
1849 * o ubiX_Y - mount UBI device number X, volume Y;
1850 * o ubiY - mount UBI device number 0, volume Y;
1848 * o ubiX:NAME - mount UBI device X, volume with name NAME; 1851 * o ubiX:NAME - mount UBI device X, volume with name NAME;
1849 * o ubi:NAME - mount UBI device 0, volume with name NAME. 1852 * o ubi:NAME - mount UBI device 0, volume with name NAME.
1850 * 1853 *
1851 * Alternative '!' separator may be used instead of ':' (because some shells 1854 * Alternative '!' separator may be used instead of ':' (because some shells
1852 * like busybox may interpret ':' as an NFS host name separator). This function 1855 * like busybox may interpret ':' as an NFS host name separator). This function
1853 * returns ubi volume object in case of success and a negative error code in 1856 * returns UBI volume description object in case of success and a negative
1854 * case of failure. 1857 * error code in case of failure.
1855 */ 1858 */
1856static struct ubi_volume_desc *open_ubi(const char *name, int mode) 1859static struct ubi_volume_desc *open_ubi(const char *name, int mode)
1857{ 1860{
1861 struct ubi_volume_desc *ubi;
1858 int dev, vol; 1862 int dev, vol;
1859 char *endptr; 1863 char *endptr;
1860 1864
1865 /* First, try to open using the device node path method */
1866 ubi = ubi_open_volume_path(name, mode);
1867 if (!IS_ERR(ubi))
1868 return ubi;
1869
1870 /* Try the "nodev" method */
1861 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i') 1871 if (name[0] != 'u' || name[1] != 'b' || name[2] != 'i')
1862 return ERR_PTR(-EINVAL); 1872 return ERR_PTR(-EINVAL);
1863 1873
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 6533ead9b889..a2c16bcee90b 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -98,7 +98,7 @@ typedef struct xfs_dquot {
98#define dq_flags q_lists.dqm_flags 98#define dq_flags q_lists.dqm_flags
99 99
100/* 100/*
101 * Lock hierachy for q_qlock: 101 * Lock hierarchy for q_qlock:
102 * XFS_QLOCK_NORMAL is the implicit default, 102 * XFS_QLOCK_NORMAL is the implicit default,
103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2 103 * XFS_QLOCK_NESTED is the dquot with the higher id in xfs_dqlock2
104 */ 104 */