aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/fid.c1
-rw-r--r--fs/Kconfig27
-rw-r--r--fs/Kconfig.binfmt2
-rw-r--r--fs/bio.c158
-rw-r--r--fs/cramfs/inode.c1
-rw-r--r--fs/dcache.c114
-rw-r--r--fs/dlm/Makefile1
-rw-r--r--fs/dlm/config.c50
-rw-r--r--fs/dlm/config.h3
-rw-r--r--fs/dlm/dlm_internal.h9
-rw-r--r--fs/dlm/lock.c5
-rw-r--r--fs/dlm/lock.h1
-rw-r--r--fs/dlm/main.c7
-rw-r--r--fs/dlm/member.c34
-rw-r--r--fs/dlm/plock.c (renamed from fs/gfs2/locking/dlm/plock.c)169
-rw-r--r--fs/dlm/recoverd.c1
-rw-r--r--fs/ext2/ialloc.c2
-rw-r--r--fs/ext2/inode.c4
-rw-r--r--fs/ext2/ioctl.c57
-rw-r--r--fs/ext3/ialloc.c2
-rw-r--r--fs/ext3/inode.c6
-rw-r--r--fs/ext3/ioctl.c103
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/inode.c6
-rw-r--r--fs/ext4/ioctl.c86
-rw-r--r--fs/fat/file.c12
-rw-r--r--fs/file_table.c42
-rw-r--r--fs/gfs2/Kconfig2
-rw-r--r--fs/gfs2/Makefile2
-rw-r--r--fs/gfs2/acl.c6
-rw-r--r--fs/gfs2/bmap.c670
-rw-r--r--fs/gfs2/dir.c84
-rw-r--r--fs/gfs2/eattr.c58
-rw-r--r--fs/gfs2/glock.c188
-rw-r--r--fs/gfs2/glock.h14
-rw-r--r--fs/gfs2/glops.c10
-rw-r--r--fs/gfs2/incore.h40
-rw-r--r--fs/gfs2/inode.c72
-rw-r--r--fs/gfs2/inode.h22
-rw-r--r--fs/gfs2/lm.c210
-rw-r--r--fs/gfs2/lm.h42
-rw-r--r--fs/gfs2/locking/dlm/Makefile2
-rw-r--r--fs/gfs2/locking/dlm/lock.c7
-rw-r--r--fs/gfs2/locking/dlm/lock_dlm.h13
-rw-r--r--fs/gfs2/locking/dlm/main.c10
-rw-r--r--fs/gfs2/locking/dlm/mount.c21
-rw-r--r--fs/gfs2/locking/dlm/sysfs.c2
-rw-r--r--fs/gfs2/locking/dlm/thread.c10
-rw-r--r--fs/gfs2/locking/nolock/main.c2
-rw-r--r--fs/gfs2/log.c19
-rw-r--r--fs/gfs2/lops.c21
-rw-r--r--fs/gfs2/lops.h11
-rw-r--r--fs/gfs2/main.c10
-rw-r--r--fs/gfs2/ops_address.c44
-rw-r--r--fs/gfs2/ops_dentry.c4
-rw-r--r--fs/gfs2/ops_export.c2
-rw-r--r--fs/gfs2/ops_file.c37
-rw-r--r--fs/gfs2/ops_fstype.c80
-rw-r--r--fs/gfs2/ops_inode.c42
-rw-r--r--fs/gfs2/ops_inode.h1
-rw-r--r--fs/gfs2/ops_super.c1
-rw-r--r--fs/gfs2/quota.c74
-rw-r--r--fs/gfs2/quota.h17
-rw-r--r--fs/gfs2/recovery.c15
-rw-r--r--fs/gfs2/rgrp.c370
-rw-r--r--fs/gfs2/rgrp.h8
-rw-r--r--fs/gfs2/super.c6
-rw-r--r--fs/gfs2/super.h1
-rw-r--r--fs/gfs2/sys.c7
-rw-r--r--fs/gfs2/trans.c25
-rw-r--r--fs/gfs2/trans.h2
-rw-r--r--fs/gfs2/util.c24
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/hfsplus/ioctl.c40
-rw-r--r--fs/inode.c51
-rw-r--r--fs/internal.h11
-rw-r--r--fs/jffs2/jffs2_fs_i.h2
-rw-r--r--fs/jffs2/jffs2_fs_sb.h2
-rw-r--r--fs/jfs/ioctl.c33
-rw-r--r--fs/jfs/jfs_dmap.c11
-rw-r--r--fs/jfs/jfs_dmap.h2
-rw-r--r--fs/jfs/jfs_imap.c15
-rw-r--r--fs/jfs/jfs_xtree.c26
-rw-r--r--fs/locks.c1
-rw-r--r--fs/namei.c275
-rw-r--r--fs/namespace.c647
-rw-r--r--fs/ncpfs/ioctl.c54
-rw-r--r--fs/nfs/dir.c3
-rw-r--r--fs/nfsd/nfs4proc.c7
-rw-r--r--fs/nfsd/nfs4recover.c16
-rw-r--r--fs/nfsd/nfs4state.c3
-rw-r--r--fs/nfsd/vfs.c72
-rw-r--r--fs/ocfs2/Makefile14
-rw-r--r--fs/ocfs2/alloc.c465
-rw-r--r--fs/ocfs2/aops.c6
-rw-r--r--fs/ocfs2/cluster/Makefile2
-rw-r--r--fs/ocfs2/cluster/netdebug.c441
-rw-r--r--fs/ocfs2/cluster/nodemanager.c5
-rw-r--r--fs/ocfs2/cluster/sys.c9
-rw-r--r--fs/ocfs2/cluster/tcp.c164
-rw-r--r--fs/ocfs2/cluster/tcp.h32
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h26
-rw-r--r--fs/ocfs2/dlm/Makefile2
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h49
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c911
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h86
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c70
-rw-r--r--fs/ocfs2/dlm/dlmlock.c22
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c200
-rw-r--r--fs/ocfs2/dlmglue.c645
-rw-r--r--fs/ocfs2/dlmglue.h5
-rw-r--r--fs/ocfs2/file.c4
-rw-r--r--fs/ocfs2/heartbeat.c184
-rw-r--r--fs/ocfs2/heartbeat.h17
-rw-r--r--fs/ocfs2/ioctl.c24
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/journal.c211
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/namei.c4
-rw-r--r--fs/ocfs2/ocfs2.h77
-rw-r--r--fs/ocfs2/ocfs2_fs.h79
-rw-r--r--fs/ocfs2/ocfs2_lockid.h2
-rw-r--r--fs/ocfs2/slot_map.c454
-rw-r--r--fs/ocfs2/slot_map.h32
-rw-r--r--fs/ocfs2/stack_o2cb.c420
-rw-r--r--fs/ocfs2/stack_user.c883
-rw-r--r--fs/ocfs2/stackglue.c568
-rw-r--r--fs/ocfs2/stackglue.h261
-rw-r--r--fs/ocfs2/suballoc.c103
-rw-r--r--fs/ocfs2/suballoc.h1
-rw-r--r--fs/ocfs2/super.c208
-rw-r--r--fs/open.c149
-rw-r--r--fs/partitions/check.c4
-rw-r--r--fs/pipe.c3
-rw-r--r--fs/pnode.c60
-rw-r--r--fs/pnode.h2
-rw-r--r--fs/proc/base.c125
-rw-r--r--fs/proc/proc_net.c6
-rw-r--r--fs/read_write.c6
-rw-r--r--fs/reiserfs/ioctl.c63
-rw-r--r--fs/reiserfs/journal.c2
-rw-r--r--fs/reiserfs/xattr.c1
-rw-r--r--fs/select.c2
-rw-r--r--fs/seq_file.c113
-rw-r--r--fs/super.c25
-rw-r--r--fs/sysfs/dir.c1
-rw-r--r--fs/sysfs/file.c6
-rw-r--r--fs/sysfs/symlink.c9
-rw-r--r--fs/udf/Makefile2
-rw-r--r--fs/udf/balloc.c13
-rw-r--r--fs/udf/crc.c172
-rw-r--r--fs/udf/dir.c83
-rw-r--r--fs/udf/ecma_167.h13
-rw-r--r--fs/udf/file.c47
-rw-r--r--fs/udf/ialloc.c13
-rw-r--r--fs/udf/inode.c208
-rw-r--r--fs/udf/lowlevel.c1
-rw-r--r--fs/udf/misc.c26
-rw-r--r--fs/udf/namei.c218
-rw-r--r--fs/udf/partition.c67
-rw-r--r--fs/udf/super.c1262
-rw-r--r--fs/udf/symlink.c1
-rw-r--r--fs/udf/truncate.c81
-rw-r--r--fs/udf/udf_i.h30
-rw-r--r--fs/udf/udf_sb.h109
-rw-r--r--fs/udf/udfdecl.h67
-rw-r--r--fs/udf/udfend.h22
-rw-r--r--fs/udf/udftime.c35
-rw-r--r--fs/udf/unicode.c62
-rw-r--r--fs/utimes.c18
-rw-r--r--fs/xattr.c39
-rw-r--r--fs/xfs/Kconfig12
-rw-r--r--fs/xfs/linux-2.6/kmem.c6
-rw-r--r--fs/xfs/linux-2.6/sema.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c12
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c14
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c13
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c36
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c689
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c230
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c79
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h4
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c27
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h30
-rw-r--r--fs/xfs/quota/xfs_dquot.c20
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c14
-rw-r--r--fs/xfs/quota/xfs_qm.c76
-rw-r--r--fs/xfs/quota/xfs_qm.h2
-rw-r--r--fs/xfs/quota/xfs_qm_stats.h4
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c44
-rw-r--r--fs/xfs/support/ktrace.c37
-rw-r--r--fs/xfs/support/ktrace.h3
-rw-r--r--fs/xfs/xfs.h2
-rw-r--r--fs/xfs/xfs_acl.c16
-rw-r--r--fs/xfs/xfs_alloc.c65
-rw-r--r--fs/xfs/xfs_attr.c10
-rw-r--r--fs/xfs/xfs_attr_leaf.c2
-rw-r--r--fs/xfs/xfs_bmap.c59
-rw-r--r--fs/xfs/xfs_bmap.h2
-rw-r--r--fs/xfs/xfs_bmap_btree.c54
-rw-r--r--fs/xfs/xfs_buf_item.c7
-rw-r--r--fs/xfs/xfs_dir2.c62
-rw-r--r--fs/xfs/xfs_dir2.h12
-rw-r--r--fs/xfs/xfs_filestream.c2
-rw-r--r--fs/xfs/xfs_ialloc.c44
-rw-r--r--fs/xfs/xfs_iget.c49
-rw-r--r--fs/xfs/xfs_inode.c823
-rw-r--r--fs/xfs/xfs_inode.h23
-rw-r--r--fs/xfs/xfs_inode_item.c8
-rw-r--r--fs/xfs/xfs_inode_item.h8
-rw-r--r--fs/xfs/xfs_iomap.c7
-rw-r--r--fs/xfs/xfs_itable.c7
-rw-r--r--fs/xfs/xfs_log.c259
-rw-r--r--fs/xfs/xfs_log.h5
-rw-r--r--fs/xfs/xfs_log_priv.h93
-rw-r--r--fs/xfs/xfs_log_recover.c123
-rw-r--r--fs/xfs/xfs_mount.c66
-rw-r--r--fs/xfs/xfs_mount.h30
-rw-r--r--fs/xfs/xfs_rename.c121
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rw.c8
-rw-r--r--fs/xfs/xfs_trans.h8
-rw-r--r--fs/xfs/xfs_trans_ail.c151
-rw-r--r--fs/xfs/xfs_trans_buf.c15
-rw-r--r--fs/xfs/xfs_types.h5
-rw-r--r--fs/xfs/xfs_utils.c26
-rw-r--r--fs/xfs/xfs_utils.h15
-rw-r--r--fs/xfs/xfs_vfsops.c76
-rw-r--r--fs/xfs/xfs_vnodeops.c505
-rw-r--r--fs/xfs/xfs_vnodeops.h33
238 files changed, 12005 insertions, 6571 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c
index dfebdbe7440e..3031e3233dd6 100644
--- a/fs/9p/fid.c
+++ b/fs/9p/fid.c
@@ -26,7 +26,6 @@
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/idr.h> 28#include <linux/idr.h>
29#include <asm/semaphore.h>
30#include <net/9p/9p.h> 29#include <net/9p/9p.h>
31#include <net/9p/client.h> 30#include <net/9p/client.h>
32 31
diff --git a/fs/Kconfig b/fs/Kconfig
index c509123bea49..8b18a8758677 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -444,6 +444,32 @@ config OCFS2_FS
444 For more information on OCFS2, see the file 444 For more information on OCFS2, see the file
445 <file:Documentation/filesystems/ocfs2.txt>. 445 <file:Documentation/filesystems/ocfs2.txt>.
446 446
447config OCFS2_FS_O2CB
448 tristate "O2CB Kernelspace Clustering"
449 depends on OCFS2_FS
450 default y
451 help
452 OCFS2 includes a simple kernelspace clustering package, the OCFS2
453 Cluster Base. It only requires a very small userspace component
454 to configure it. This comes with the standard ocfs2-tools package.
455 O2CB is limited to maintaining a cluster for OCFS2 file systems.
456 It cannot manage any other cluster applications.
457
458 It is always safe to say Y here, as the clustering method is
459 run-time selectable.
460
461config OCFS2_FS_USERSPACE_CLUSTER
462 tristate "OCFS2 Userspace Clustering"
463 depends on OCFS2_FS && DLM
464 default y
465 help
466 This option will allow OCFS2 to use userspace clustering services
467 in conjunction with the DLM in fs/dlm. If you are using a
468 userspace cluster manager, say Y here.
469
470 It is safe to say Y, as the clustering method is run-time
471 selectable.
472
447config OCFS2_DEBUG_MASKLOG 473config OCFS2_DEBUG_MASKLOG
448 bool "OCFS2 logging support" 474 bool "OCFS2 logging support"
449 depends on OCFS2_FS 475 depends on OCFS2_FS
@@ -663,6 +689,7 @@ config ZISOFS
663 689
664config UDF_FS 690config UDF_FS
665 tristate "UDF file system support" 691 tristate "UDF file system support"
692 select CRC_ITU_T
666 help 693 help
667 This is the new file system used on some CD-ROMs and DVDs. Say Y if 694 This is the new file system used on some CD-ROMs and DVDs. Say Y if
668 you intend to mount DVD discs or CDRW's written in packet mode, or 695 you intend to mount DVD discs or CDRW's written in packet mode, or
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index b5c3b6114add..853845abcca6 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -62,7 +62,7 @@ config BINFMT_SHARED_FLAT
62config BINFMT_AOUT 62config BINFMT_AOUT
63 tristate "Kernel support for a.out and ECOFF binaries" 63 tristate "Kernel support for a.out and ECOFF binaries"
64 depends on ARCH_SUPPORTS_AOUT && \ 64 depends on ARCH_SUPPORTS_AOUT && \
65 (X86_32 || ALPHA || ARM || M68K || SPARC32) 65 (X86_32 || ALPHA || ARM || M68K)
66 ---help--- 66 ---help---
67 A.out (Assembler.OUTput) is a set of formats for libraries and 67 A.out (Assembler.OUTput) is a set of formats for libraries and
68 executables used in the earliest versions of UNIX. Linux used 68 executables used in the earliest versions of UNIX. Linux used
diff --git a/fs/bio.c b/fs/bio.c
index 553b5b7960ad..6e0b6f66df03 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -444,22 +444,27 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
444 444
445struct bio_map_data { 445struct bio_map_data {
446 struct bio_vec *iovecs; 446 struct bio_vec *iovecs;
447 void __user *userptr; 447 int nr_sgvecs;
448 struct sg_iovec *sgvecs;
448}; 449};
449 450
450static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio) 451static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
452 struct sg_iovec *iov, int iov_count)
451{ 453{
452 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); 454 memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt);
455 memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
456 bmd->nr_sgvecs = iov_count;
453 bio->bi_private = bmd; 457 bio->bi_private = bmd;
454} 458}
455 459
456static void bio_free_map_data(struct bio_map_data *bmd) 460static void bio_free_map_data(struct bio_map_data *bmd)
457{ 461{
458 kfree(bmd->iovecs); 462 kfree(bmd->iovecs);
463 kfree(bmd->sgvecs);
459 kfree(bmd); 464 kfree(bmd);
460} 465}
461 466
462static struct bio_map_data *bio_alloc_map_data(int nr_segs) 467static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count)
463{ 468{
464 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); 469 struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL);
465 470
@@ -467,13 +472,71 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
467 return NULL; 472 return NULL;
468 473
469 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); 474 bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL);
470 if (bmd->iovecs) 475 if (!bmd->iovecs) {
476 kfree(bmd);
477 return NULL;
478 }
479
480 bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL);
481 if (bmd->sgvecs)
471 return bmd; 482 return bmd;
472 483
484 kfree(bmd->iovecs);
473 kfree(bmd); 485 kfree(bmd);
474 return NULL; 486 return NULL;
475} 487}
476 488
489static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count,
490 int uncopy)
491{
492 int ret = 0, i;
493 struct bio_vec *bvec;
494 int iov_idx = 0;
495 unsigned int iov_off = 0;
496 int read = bio_data_dir(bio) == READ;
497
498 __bio_for_each_segment(bvec, bio, i, 0) {
499 char *bv_addr = page_address(bvec->bv_page);
500 unsigned int bv_len = bvec->bv_len;
501
502 while (bv_len && iov_idx < iov_count) {
503 unsigned int bytes;
504 char *iov_addr;
505
506 bytes = min_t(unsigned int,
507 iov[iov_idx].iov_len - iov_off, bv_len);
508 iov_addr = iov[iov_idx].iov_base + iov_off;
509
510 if (!ret) {
511 if (!read && !uncopy)
512 ret = copy_from_user(bv_addr, iov_addr,
513 bytes);
514 if (read && uncopy)
515 ret = copy_to_user(iov_addr, bv_addr,
516 bytes);
517
518 if (ret)
519 ret = -EFAULT;
520 }
521
522 bv_len -= bytes;
523 bv_addr += bytes;
524 iov_addr += bytes;
525 iov_off += bytes;
526
527 if (iov[iov_idx].iov_len == iov_off) {
528 iov_idx++;
529 iov_off = 0;
530 }
531 }
532
533 if (uncopy)
534 __free_page(bvec->bv_page);
535 }
536
537 return ret;
538}
539
477/** 540/**
478 * bio_uncopy_user - finish previously mapped bio 541 * bio_uncopy_user - finish previously mapped bio
479 * @bio: bio being terminated 542 * @bio: bio being terminated
@@ -484,55 +547,56 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs)
484int bio_uncopy_user(struct bio *bio) 547int bio_uncopy_user(struct bio *bio)
485{ 548{
486 struct bio_map_data *bmd = bio->bi_private; 549 struct bio_map_data *bmd = bio->bi_private;
487 const int read = bio_data_dir(bio) == READ; 550 int ret;
488 struct bio_vec *bvec;
489 int i, ret = 0;
490 551
491 __bio_for_each_segment(bvec, bio, i, 0) { 552 ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1);
492 char *addr = page_address(bvec->bv_page);
493 unsigned int len = bmd->iovecs[i].bv_len;
494 553
495 if (read && !ret && copy_to_user(bmd->userptr, addr, len))
496 ret = -EFAULT;
497
498 __free_page(bvec->bv_page);
499 bmd->userptr += len;
500 }
501 bio_free_map_data(bmd); 554 bio_free_map_data(bmd);
502 bio_put(bio); 555 bio_put(bio);
503 return ret; 556 return ret;
504} 557}
505 558
506/** 559/**
507 * bio_copy_user - copy user data to bio 560 * bio_copy_user_iov - copy user data to bio
508 * @q: destination block queue 561 * @q: destination block queue
509 * @uaddr: start of user address 562 * @iov: the iovec.
510 * @len: length in bytes 563 * @iov_count: number of elements in the iovec
511 * @write_to_vm: bool indicating writing to pages or not 564 * @write_to_vm: bool indicating writing to pages or not
512 * 565 *
513 * Prepares and returns a bio for indirect user io, bouncing data 566 * Prepares and returns a bio for indirect user io, bouncing data
514 * to/from kernel pages as necessary. Must be paired with 567 * to/from kernel pages as necessary. Must be paired with
515 * call bio_uncopy_user() on io completion. 568 * call bio_uncopy_user() on io completion.
516 */ 569 */
517struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, 570struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov,
518 unsigned int len, int write_to_vm) 571 int iov_count, int write_to_vm)
519{ 572{
520 unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
521 unsigned long start = uaddr >> PAGE_SHIFT;
522 struct bio_map_data *bmd; 573 struct bio_map_data *bmd;
523 struct bio_vec *bvec; 574 struct bio_vec *bvec;
524 struct page *page; 575 struct page *page;
525 struct bio *bio; 576 struct bio *bio;
526 int i, ret; 577 int i, ret;
578 int nr_pages = 0;
579 unsigned int len = 0;
527 580
528 bmd = bio_alloc_map_data(end - start); 581 for (i = 0; i < iov_count; i++) {
582 unsigned long uaddr;
583 unsigned long end;
584 unsigned long start;
585
586 uaddr = (unsigned long)iov[i].iov_base;
587 end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
588 start = uaddr >> PAGE_SHIFT;
589
590 nr_pages += end - start;
591 len += iov[i].iov_len;
592 }
593
594 bmd = bio_alloc_map_data(nr_pages, iov_count);
529 if (!bmd) 595 if (!bmd)
530 return ERR_PTR(-ENOMEM); 596 return ERR_PTR(-ENOMEM);
531 597
532 bmd->userptr = (void __user *) uaddr;
533
534 ret = -ENOMEM; 598 ret = -ENOMEM;
535 bio = bio_alloc(GFP_KERNEL, end - start); 599 bio = bio_alloc(GFP_KERNEL, nr_pages);
536 if (!bio) 600 if (!bio)
537 goto out_bmd; 601 goto out_bmd;
538 602
@@ -564,22 +628,12 @@ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
564 * success 628 * success
565 */ 629 */
566 if (!write_to_vm) { 630 if (!write_to_vm) {
567 char __user *p = (char __user *) uaddr; 631 ret = __bio_copy_iov(bio, iov, iov_count, 0);
568 632 if (ret)
569 /* 633 goto cleanup;
570 * for a write, copy in data to kernel pages
571 */
572 ret = -EFAULT;
573 bio_for_each_segment(bvec, bio, i) {
574 char *addr = page_address(bvec->bv_page);
575
576 if (copy_from_user(addr, p, bvec->bv_len))
577 goto cleanup;
578 p += bvec->bv_len;
579 }
580 } 634 }
581 635
582 bio_set_map_data(bmd, bio); 636 bio_set_map_data(bmd, bio, iov, iov_count);
583 return bio; 637 return bio;
584cleanup: 638cleanup:
585 bio_for_each_segment(bvec, bio, i) 639 bio_for_each_segment(bvec, bio, i)
@@ -591,6 +645,28 @@ out_bmd:
591 return ERR_PTR(ret); 645 return ERR_PTR(ret);
592} 646}
593 647
648/**
649 * bio_copy_user - copy user data to bio
650 * @q: destination block queue
651 * @uaddr: start of user address
652 * @len: length in bytes
653 * @write_to_vm: bool indicating writing to pages or not
654 *
655 * Prepares and returns a bio for indirect user io, bouncing data
656 * to/from kernel pages as necessary. Must be paired with
657 * call bio_uncopy_user() on io completion.
658 */
659struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr,
660 unsigned int len, int write_to_vm)
661{
662 struct sg_iovec iov;
663
664 iov.iov_base = (void __user *)uaddr;
665 iov.iov_len = len;
666
667 return bio_copy_user_iov(q, &iov, 1, write_to_vm);
668}
669
594static struct bio *__bio_map_user_iov(struct request_queue *q, 670static struct bio *__bio_map_user_iov(struct request_queue *q,
595 struct block_device *bdev, 671 struct block_device *bdev,
596 struct sg_iovec *iov, int iov_count, 672 struct sg_iovec *iov, int iov_count,
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 350680fd7da7..0c3b618c15b3 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -23,7 +23,6 @@
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/vfs.h> 24#include <linux/vfs.h>
25#include <linux/mutex.h> 25#include <linux/mutex.h>
26#include <asm/semaphore.h>
27 26
28#include <asm/uaccess.h> 27#include <asm/uaccess.h>
29 28
diff --git a/fs/dcache.c b/fs/dcache.c
index 43455776711e..3ee588d5f585 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1746,12 +1746,21 @@ shouldnt_be_hashed:
1746 goto shouldnt_be_hashed; 1746 goto shouldnt_be_hashed;
1747} 1747}
1748 1748
1749static int prepend(char **buffer, int *buflen, const char *str,
1750 int namelen)
1751{
1752 *buflen -= namelen;
1753 if (*buflen < 0)
1754 return -ENAMETOOLONG;
1755 *buffer -= namelen;
1756 memcpy(*buffer, str, namelen);
1757 return 0;
1758}
1759
1749/** 1760/**
1750 * d_path - return the path of a dentry 1761 * d_path - return the path of a dentry
1751 * @dentry: dentry to report 1762 * @path: the dentry/vfsmount to report
1752 * @vfsmnt: vfsmnt to which the dentry belongs 1763 * @root: root vfsmnt/dentry (may be modified by this function)
1753 * @root: root dentry
1754 * @rootmnt: vfsmnt to which the root dentry belongs
1755 * @buffer: buffer to return value in 1764 * @buffer: buffer to return value in
1756 * @buflen: buffer length 1765 * @buflen: buffer length
1757 * 1766 *
@@ -1761,23 +1770,22 @@ shouldnt_be_hashed:
1761 * Returns the buffer or an error code if the path was too long. 1770 * Returns the buffer or an error code if the path was too long.
1762 * 1771 *
1763 * "buflen" should be positive. Caller holds the dcache_lock. 1772 * "buflen" should be positive. Caller holds the dcache_lock.
1773 *
1774 * If path is not reachable from the supplied root, then the value of
1775 * root is changed (without modifying refcounts).
1764 */ 1776 */
1765static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, 1777char *__d_path(const struct path *path, struct path *root,
1766 struct path *root, char *buffer, int buflen) 1778 char *buffer, int buflen)
1767{ 1779{
1780 struct dentry *dentry = path->dentry;
1781 struct vfsmount *vfsmnt = path->mnt;
1768 char * end = buffer+buflen; 1782 char * end = buffer+buflen;
1769 char * retval; 1783 char * retval;
1770 int namelen; 1784
1771 1785 prepend(&end, &buflen, "\0", 1);
1772 *--end = '\0'; 1786 if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
1773 buflen--; 1787 (prepend(&end, &buflen, " (deleted)", 10) != 0))
1774 if (!IS_ROOT(dentry) && d_unhashed(dentry)) {
1775 buflen -= 10;
1776 end -= 10;
1777 if (buflen < 0)
1778 goto Elong; 1788 goto Elong;
1779 memcpy(end, " (deleted)", 10);
1780 }
1781 1789
1782 if (buflen < 1) 1790 if (buflen < 1)
1783 goto Elong; 1791 goto Elong;
@@ -1804,13 +1812,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
1804 } 1812 }
1805 parent = dentry->d_parent; 1813 parent = dentry->d_parent;
1806 prefetch(parent); 1814 prefetch(parent);
1807 namelen = dentry->d_name.len; 1815 if ((prepend(&end, &buflen, dentry->d_name.name,
1808 buflen -= namelen + 1; 1816 dentry->d_name.len) != 0) ||
1809 if (buflen < 0) 1817 (prepend(&end, &buflen, "/", 1) != 0))
1810 goto Elong; 1818 goto Elong;
1811 end -= namelen;
1812 memcpy(end, dentry->d_name.name, namelen);
1813 *--end = '/';
1814 retval = end; 1819 retval = end;
1815 dentry = parent; 1820 dentry = parent;
1816 } 1821 }
@@ -1818,12 +1823,12 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt,
1818 return retval; 1823 return retval;
1819 1824
1820global_root: 1825global_root:
1821 namelen = dentry->d_name.len; 1826 retval += 1; /* hit the slash */
1822 buflen -= namelen; 1827 if (prepend(&retval, &buflen, dentry->d_name.name,
1823 if (buflen < 0) 1828 dentry->d_name.len) != 0)
1824 goto Elong; 1829 goto Elong;
1825 retval -= namelen-1; /* hit the slash */ 1830 root->mnt = vfsmnt;
1826 memcpy(retval, dentry->d_name.name, namelen); 1831 root->dentry = dentry;
1827 return retval; 1832 return retval;
1828Elong: 1833Elong:
1829 return ERR_PTR(-ENAMETOOLONG); 1834 return ERR_PTR(-ENAMETOOLONG);
@@ -1846,6 +1851,7 @@ char *d_path(struct path *path, char *buf, int buflen)
1846{ 1851{
1847 char *res; 1852 char *res;
1848 struct path root; 1853 struct path root;
1854 struct path tmp;
1849 1855
1850 /* 1856 /*
1851 * We have various synthetic filesystems that never get mounted. On 1857 * We have various synthetic filesystems that never get mounted. On
@@ -1859,10 +1865,11 @@ char *d_path(struct path *path, char *buf, int buflen)
1859 1865
1860 read_lock(&current->fs->lock); 1866 read_lock(&current->fs->lock);
1861 root = current->fs->root; 1867 root = current->fs->root;
1862 path_get(&current->fs->root); 1868 path_get(&root);
1863 read_unlock(&current->fs->lock); 1869 read_unlock(&current->fs->lock);
1864 spin_lock(&dcache_lock); 1870 spin_lock(&dcache_lock);
1865 res = __d_path(path->dentry, path->mnt, &root, buf, buflen); 1871 tmp = root;
1872 res = __d_path(path, &tmp, buf, buflen);
1866 spin_unlock(&dcache_lock); 1873 spin_unlock(&dcache_lock);
1867 path_put(&root); 1874 path_put(&root);
1868 return res; 1875 return res;
@@ -1890,6 +1897,48 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen,
1890} 1897}
1891 1898
1892/* 1899/*
1900 * Write full pathname from the root of the filesystem into the buffer.
1901 */
1902char *dentry_path(struct dentry *dentry, char *buf, int buflen)
1903{
1904 char *end = buf + buflen;
1905 char *retval;
1906
1907 spin_lock(&dcache_lock);
1908 prepend(&end, &buflen, "\0", 1);
1909 if (!IS_ROOT(dentry) && d_unhashed(dentry) &&
1910 (prepend(&end, &buflen, "//deleted", 9) != 0))
1911 goto Elong;
1912 if (buflen < 1)
1913 goto Elong;
1914 /* Get '/' right */
1915 retval = end-1;
1916 *retval = '/';
1917
1918 for (;;) {
1919 struct dentry *parent;
1920 if (IS_ROOT(dentry))
1921 break;
1922
1923 parent = dentry->d_parent;
1924 prefetch(parent);
1925
1926 if ((prepend(&end, &buflen, dentry->d_name.name,
1927 dentry->d_name.len) != 0) ||
1928 (prepend(&end, &buflen, "/", 1) != 0))
1929 goto Elong;
1930
1931 retval = end;
1932 dentry = parent;
1933 }
1934 spin_unlock(&dcache_lock);
1935 return retval;
1936Elong:
1937 spin_unlock(&dcache_lock);
1938 return ERR_PTR(-ENAMETOOLONG);
1939}
1940
1941/*
1893 * NOTE! The user-level library version returns a 1942 * NOTE! The user-level library version returns a
1894 * character pointer. The kernel system call just 1943 * character pointer. The kernel system call just
1895 * returns the length of the buffer filled (which 1944 * returns the length of the buffer filled (which
@@ -1918,9 +1967,9 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
1918 1967
1919 read_lock(&current->fs->lock); 1968 read_lock(&current->fs->lock);
1920 pwd = current->fs->pwd; 1969 pwd = current->fs->pwd;
1921 path_get(&current->fs->pwd); 1970 path_get(&pwd);
1922 root = current->fs->root; 1971 root = current->fs->root;
1923 path_get(&current->fs->root); 1972 path_get(&root);
1924 read_unlock(&current->fs->lock); 1973 read_unlock(&current->fs->lock);
1925 1974
1926 error = -ENOENT; 1975 error = -ENOENT;
@@ -1928,9 +1977,10 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size)
1928 spin_lock(&dcache_lock); 1977 spin_lock(&dcache_lock);
1929 if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { 1978 if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) {
1930 unsigned long len; 1979 unsigned long len;
1980 struct path tmp = root;
1931 char * cwd; 1981 char * cwd;
1932 1982
1933 cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE); 1983 cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE);
1934 spin_unlock(&dcache_lock); 1984 spin_unlock(&dcache_lock);
1935 1985
1936 error = PTR_ERR(cwd); 1986 error = PTR_ERR(cwd);
diff --git a/fs/dlm/Makefile b/fs/dlm/Makefile
index d248e60951ba..ca1c9124c8ce 100644
--- a/fs/dlm/Makefile
+++ b/fs/dlm/Makefile
@@ -10,6 +10,7 @@ dlm-y := ast.o \
10 midcomms.o \ 10 midcomms.o \
11 netlink.o \ 11 netlink.o \
12 lowcomms.o \ 12 lowcomms.o \
13 plock.o \
13 rcom.o \ 14 rcom.o \
14 recover.o \ 15 recover.o \
15 recoverd.o \ 16 recoverd.o \
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index c3ad1dff3b25..eac23bd288b2 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -114,7 +114,7 @@ struct cluster_attribute {
114}; 114};
115 115
116static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field, 116static ssize_t cluster_set(struct cluster *cl, unsigned int *cl_field,
117 unsigned int *info_field, int check_zero, 117 int *info_field, int check_zero,
118 const char *buf, size_t len) 118 const char *buf, size_t len)
119{ 119{
120 unsigned int x; 120 unsigned int x;
@@ -284,6 +284,7 @@ struct node {
284 struct list_head list; /* space->members */ 284 struct list_head list; /* space->members */
285 int nodeid; 285 int nodeid;
286 int weight; 286 int weight;
287 int new;
287}; 288};
288 289
289static struct configfs_group_operations clusters_ops = { 290static struct configfs_group_operations clusters_ops = {
@@ -565,6 +566,7 @@ static struct config_item *make_node(struct config_group *g, const char *name)
565 config_item_init_type_name(&nd->item, name, &node_type); 566 config_item_init_type_name(&nd->item, name, &node_type);
566 nd->nodeid = -1; 567 nd->nodeid = -1;
567 nd->weight = 1; /* default weight of 1 if none is set */ 568 nd->weight = 1; /* default weight of 1 if none is set */
569 nd->new = 1; /* set to 0 once it's been read by dlm_nodeid_list() */
568 570
569 mutex_lock(&sp->members_lock); 571 mutex_lock(&sp->members_lock);
570 list_add(&nd->list, &sp->members); 572 list_add(&nd->list, &sp->members);
@@ -805,12 +807,13 @@ static void put_comm(struct comm *cm)
805} 807}
806 808
807/* caller must free mem */ 809/* caller must free mem */
808int dlm_nodeid_list(char *lsname, int **ids_out) 810int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
811 int **new_out, int *new_count_out)
809{ 812{
810 struct space *sp; 813 struct space *sp;
811 struct node *nd; 814 struct node *nd;
812 int i = 0, rv = 0; 815 int i = 0, rv = 0, ids_count = 0, new_count = 0;
813 int *ids; 816 int *ids, *new;
814 817
815 sp = get_space(lsname); 818 sp = get_space(lsname);
816 if (!sp) 819 if (!sp)
@@ -818,23 +821,50 @@ int dlm_nodeid_list(char *lsname, int **ids_out)
818 821
819 mutex_lock(&sp->members_lock); 822 mutex_lock(&sp->members_lock);
820 if (!sp->members_count) { 823 if (!sp->members_count) {
821 rv = 0; 824 rv = -EINVAL;
825 printk(KERN_ERR "dlm: zero members_count\n");
822 goto out; 826 goto out;
823 } 827 }
824 828
825 ids = kcalloc(sp->members_count, sizeof(int), GFP_KERNEL); 829 ids_count = sp->members_count;
830
831 ids = kcalloc(ids_count, sizeof(int), GFP_KERNEL);
826 if (!ids) { 832 if (!ids) {
827 rv = -ENOMEM; 833 rv = -ENOMEM;
828 goto out; 834 goto out;
829 } 835 }
830 836
831 rv = sp->members_count; 837 list_for_each_entry(nd, &sp->members, list) {
832 list_for_each_entry(nd, &sp->members, list)
833 ids[i++] = nd->nodeid; 838 ids[i++] = nd->nodeid;
839 if (nd->new)
840 new_count++;
841 }
842
843 if (ids_count != i)
844 printk(KERN_ERR "dlm: bad nodeid count %d %d\n", ids_count, i);
845
846 if (!new_count)
847 goto out_ids;
848
849 new = kcalloc(new_count, sizeof(int), GFP_KERNEL);
850 if (!new) {
851 kfree(ids);
852 rv = -ENOMEM;
853 goto out;
854 }
834 855
835 if (rv != i) 856 i = 0;
836 printk("bad nodeid count %d %d\n", rv, i); 857 list_for_each_entry(nd, &sp->members, list) {
858 if (nd->new) {
859 new[i++] = nd->nodeid;
860 nd->new = 0;
861 }
862 }
863 *new_count_out = new_count;
864 *new_out = new;
837 865
866 out_ids:
867 *ids_count_out = ids_count;
838 *ids_out = ids; 868 *ids_out = ids;
839 out: 869 out:
840 mutex_unlock(&sp->members_lock); 870 mutex_unlock(&sp->members_lock);
diff --git a/fs/dlm/config.h b/fs/dlm/config.h
index a3170fe22090..4f1d6fce58c5 100644
--- a/fs/dlm/config.h
+++ b/fs/dlm/config.h
@@ -35,7 +35,8 @@ extern struct dlm_config_info dlm_config;
35int dlm_config_init(void); 35int dlm_config_init(void);
36void dlm_config_exit(void); 36void dlm_config_exit(void);
37int dlm_node_weight(char *lsname, int nodeid); 37int dlm_node_weight(char *lsname, int nodeid);
38int dlm_nodeid_list(char *lsname, int **ids_out); 38int dlm_nodeid_list(char *lsname, int **ids_out, int *ids_count_out,
39 int **new_out, int *new_count_out);
39int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr); 40int dlm_nodeid_to_addr(int nodeid, struct sockaddr_storage *addr);
40int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid); 41int dlm_addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid);
41int dlm_our_nodeid(void); 42int dlm_our_nodeid(void);
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index d30ea8b433a2..5a7ac33b629c 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -37,14 +37,11 @@
37#include <linux/jhash.h> 37#include <linux/jhash.h>
38#include <linux/miscdevice.h> 38#include <linux/miscdevice.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <asm/semaphore.h>
41#include <asm/uaccess.h> 40#include <asm/uaccess.h>
42 41
43#include <linux/dlm.h> 42#include <linux/dlm.h>
44#include "config.h" 43#include "config.h"
45 44
46#define DLM_LOCKSPACE_LEN 64
47
48/* Size of the temp buffer midcomms allocates on the stack. 45/* Size of the temp buffer midcomms allocates on the stack.
49 We try to make this large enough so most messages fit. 46 We try to make this large enough so most messages fit.
50 FIXME: should sctp make this unnecessary? */ 47 FIXME: should sctp make this unnecessary? */
@@ -133,8 +130,10 @@ struct dlm_member {
133 130
134struct dlm_recover { 131struct dlm_recover {
135 struct list_head list; 132 struct list_head list;
136 int *nodeids; 133 int *nodeids; /* nodeids of all members */
137 int node_count; 134 int node_count;
135 int *new; /* nodeids of new members */
136 int new_count;
138 uint64_t seq; 137 uint64_t seq;
139}; 138};
140 139
@@ -580,6 +579,8 @@ static inline int dlm_no_directory(struct dlm_ls *ls)
580int dlm_netlink_init(void); 579int dlm_netlink_init(void);
581void dlm_netlink_exit(void); 580void dlm_netlink_exit(void);
582void dlm_timeout_warn(struct dlm_lkb *lkb); 581void dlm_timeout_warn(struct dlm_lkb *lkb);
582int dlm_plock_init(void);
583void dlm_plock_exit(void);
583 584
584#ifdef CONFIG_DLM_DEBUG 585#ifdef CONFIG_DLM_DEBUG
585int dlm_register_debugfs(void); 586int dlm_register_debugfs(void);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 8f250ac8b928..2d3d1027ce2b 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -165,7 +165,7 @@ void dlm_print_lkb(struct dlm_lkb *lkb)
165 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type); 165 lkb->lkb_grmode, lkb->lkb_wait_type, lkb->lkb_ast_type);
166} 166}
167 167
168void dlm_print_rsb(struct dlm_rsb *r) 168static void dlm_print_rsb(struct dlm_rsb *r)
169{ 169{
170 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n", 170 printk(KERN_ERR "rsb: nodeid %d flags %lx first %x rlc %d name %s\n",
171 r->res_nodeid, r->res_flags, r->res_first_lkid, 171 r->res_nodeid, r->res_flags, r->res_first_lkid,
@@ -1956,8 +1956,7 @@ static void confirm_master(struct dlm_rsb *r, int error)
1956 list_del_init(&lkb->lkb_rsb_lookup); 1956 list_del_init(&lkb->lkb_rsb_lookup);
1957 r->res_first_lkid = lkb->lkb_id; 1957 r->res_first_lkid = lkb->lkb_id;
1958 _request_lock(r, lkb); 1958 _request_lock(r, lkb);
1959 } else 1959 }
1960 r->res_nodeid = -1;
1961 break; 1960 break;
1962 1961
1963 default: 1962 default:
diff --git a/fs/dlm/lock.h b/fs/dlm/lock.h
index 05d9c82e646b..88e93c80cc22 100644
--- a/fs/dlm/lock.h
+++ b/fs/dlm/lock.h
@@ -13,7 +13,6 @@
13#ifndef __LOCK_DOT_H__ 13#ifndef __LOCK_DOT_H__
14#define __LOCK_DOT_H__ 14#define __LOCK_DOT_H__
15 15
16void dlm_print_rsb(struct dlm_rsb *r);
17void dlm_dump_rsb(struct dlm_rsb *r); 16void dlm_dump_rsb(struct dlm_rsb *r);
18void dlm_print_lkb(struct dlm_lkb *lkb); 17void dlm_print_lkb(struct dlm_lkb *lkb);
19void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms); 18void dlm_receive_message_saved(struct dlm_ls *ls, struct dlm_message *ms);
diff --git a/fs/dlm/main.c b/fs/dlm/main.c
index 58487fb95a4c..b80e0aa3cfa5 100644
--- a/fs/dlm/main.c
+++ b/fs/dlm/main.c
@@ -46,10 +46,16 @@ static int __init init_dlm(void)
46 if (error) 46 if (error)
47 goto out_user; 47 goto out_user;
48 48
49 error = dlm_plock_init();
50 if (error)
51 goto out_netlink;
52
49 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__); 53 printk("DLM (built %s %s) installed\n", __DATE__, __TIME__);
50 54
51 return 0; 55 return 0;
52 56
57 out_netlink:
58 dlm_netlink_exit();
53 out_user: 59 out_user:
54 dlm_user_exit(); 60 dlm_user_exit();
55 out_debug: 61 out_debug:
@@ -66,6 +72,7 @@ static int __init init_dlm(void)
66 72
67static void __exit exit_dlm(void) 73static void __exit exit_dlm(void)
68{ 74{
75 dlm_plock_exit();
69 dlm_netlink_exit(); 76 dlm_netlink_exit();
70 dlm_user_exit(); 77 dlm_user_exit();
71 dlm_config_exit(); 78 dlm_config_exit();
diff --git a/fs/dlm/member.c b/fs/dlm/member.c
index fa17f5a27883..26133f05ae3a 100644
--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -210,6 +210,23 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
210 } 210 }
211 } 211 }
212 212
213 /* Add an entry to ls_nodes_gone for members that were removed and
214 then added again, so that previous state for these nodes will be
215 cleared during recovery. */
216
217 for (i = 0; i < rv->new_count; i++) {
218 if (!dlm_is_member(ls, rv->new[i]))
219 continue;
220 log_debug(ls, "new nodeid %d is a re-added member", rv->new[i]);
221
222 memb = kzalloc(sizeof(struct dlm_member), GFP_KERNEL);
223 if (!memb)
224 return -ENOMEM;
225 memb->nodeid = rv->new[i];
226 list_add_tail(&memb->list, &ls->ls_nodes_gone);
227 neg++;
228 }
229
213 /* add new members to ls_nodes */ 230 /* add new members to ls_nodes */
214 231
215 for (i = 0; i < rv->node_count; i++) { 232 for (i = 0; i < rv->node_count; i++) {
@@ -314,15 +331,16 @@ int dlm_ls_stop(struct dlm_ls *ls)
314int dlm_ls_start(struct dlm_ls *ls) 331int dlm_ls_start(struct dlm_ls *ls)
315{ 332{
316 struct dlm_recover *rv = NULL, *rv_old; 333 struct dlm_recover *rv = NULL, *rv_old;
317 int *ids = NULL; 334 int *ids = NULL, *new = NULL;
318 int error, count; 335 int error, ids_count = 0, new_count = 0;
319 336
320 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL); 337 rv = kzalloc(sizeof(struct dlm_recover), GFP_KERNEL);
321 if (!rv) 338 if (!rv)
322 return -ENOMEM; 339 return -ENOMEM;
323 340
324 error = count = dlm_nodeid_list(ls->ls_name, &ids); 341 error = dlm_nodeid_list(ls->ls_name, &ids, &ids_count,
325 if (error <= 0) 342 &new, &new_count);
343 if (error < 0)
326 goto fail; 344 goto fail;
327 345
328 spin_lock(&ls->ls_recover_lock); 346 spin_lock(&ls->ls_recover_lock);
@@ -337,14 +355,19 @@ int dlm_ls_start(struct dlm_ls *ls)
337 } 355 }
338 356
339 rv->nodeids = ids; 357 rv->nodeids = ids;
340 rv->node_count = count; 358 rv->node_count = ids_count;
359 rv->new = new;
360 rv->new_count = new_count;
341 rv->seq = ++ls->ls_recover_seq; 361 rv->seq = ++ls->ls_recover_seq;
342 rv_old = ls->ls_recover_args; 362 rv_old = ls->ls_recover_args;
343 ls->ls_recover_args = rv; 363 ls->ls_recover_args = rv;
344 spin_unlock(&ls->ls_recover_lock); 364 spin_unlock(&ls->ls_recover_lock);
345 365
346 if (rv_old) { 366 if (rv_old) {
367 log_error(ls, "unused recovery %llx %d",
368 (unsigned long long)rv_old->seq, rv_old->node_count);
347 kfree(rv_old->nodeids); 369 kfree(rv_old->nodeids);
370 kfree(rv_old->new);
348 kfree(rv_old); 371 kfree(rv_old);
349 } 372 }
350 373
@@ -354,6 +377,7 @@ int dlm_ls_start(struct dlm_ls *ls)
354 fail: 377 fail:
355 kfree(rv); 378 kfree(rv);
356 kfree(ids); 379 kfree(ids);
380 kfree(new);
357 return error; 381 return error;
358} 382}
359 383
diff --git a/fs/gfs2/locking/dlm/plock.c b/fs/dlm/plock.c
index 2ebd374b3143..d6d6e370f89c 100644
--- a/fs/gfs2/locking/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -1,17 +1,19 @@
1/* 1/*
2 * Copyright (C) 2005 Red Hat, Inc. All rights reserved. 2 * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
3 * 3 *
4 * This copyrighted material is made available to anyone wishing to use, 4 * This copyrighted material is made available to anyone wishing to use,
5 * modify, copy, or redistribute it subject to the terms and conditions 5 * modify, copy, or redistribute it subject to the terms and conditions
6 * of the GNU General Public License version 2. 6 * of the GNU General Public License version 2.
7 */ 7 */
8 8
9#include <linux/fs.h>
9#include <linux/miscdevice.h> 10#include <linux/miscdevice.h>
10#include <linux/lock_dlm_plock.h>
11#include <linux/poll.h> 11#include <linux/poll.h>
12#include <linux/dlm.h>
13#include <linux/dlm_plock.h>
12 14
13#include "lock_dlm.h" 15#include "dlm_internal.h"
14 16#include "lockspace.h"
15 17
16static spinlock_t ops_lock; 18static spinlock_t ops_lock;
17static struct list_head send_list; 19static struct list_head send_list;
@@ -22,7 +24,7 @@ static wait_queue_head_t recv_wq;
22struct plock_op { 24struct plock_op {
23 struct list_head list; 25 struct list_head list;
24 int done; 26 int done;
25 struct gdlm_plock_info info; 27 struct dlm_plock_info info;
26}; 28};
27 29
28struct plock_xop { 30struct plock_xop {
@@ -34,22 +36,22 @@ struct plock_xop {
34}; 36};
35 37
36 38
37static inline void set_version(struct gdlm_plock_info *info) 39static inline void set_version(struct dlm_plock_info *info)
38{ 40{
39 info->version[0] = GDLM_PLOCK_VERSION_MAJOR; 41 info->version[0] = DLM_PLOCK_VERSION_MAJOR;
40 info->version[1] = GDLM_PLOCK_VERSION_MINOR; 42 info->version[1] = DLM_PLOCK_VERSION_MINOR;
41 info->version[2] = GDLM_PLOCK_VERSION_PATCH; 43 info->version[2] = DLM_PLOCK_VERSION_PATCH;
42} 44}
43 45
44static int check_version(struct gdlm_plock_info *info) 46static int check_version(struct dlm_plock_info *info)
45{ 47{
46 if ((GDLM_PLOCK_VERSION_MAJOR != info->version[0]) || 48 if ((DLM_PLOCK_VERSION_MAJOR != info->version[0]) ||
47 (GDLM_PLOCK_VERSION_MINOR < info->version[1])) { 49 (DLM_PLOCK_VERSION_MINOR < info->version[1])) {
48 log_error("plock device version mismatch: " 50 log_print("plock device version mismatch: "
49 "kernel (%u.%u.%u), user (%u.%u.%u)", 51 "kernel (%u.%u.%u), user (%u.%u.%u)",
50 GDLM_PLOCK_VERSION_MAJOR, 52 DLM_PLOCK_VERSION_MAJOR,
51 GDLM_PLOCK_VERSION_MINOR, 53 DLM_PLOCK_VERSION_MINOR,
52 GDLM_PLOCK_VERSION_PATCH, 54 DLM_PLOCK_VERSION_PATCH,
53 info->version[0], 55 info->version[0],
54 info->version[1], 56 info->version[1],
55 info->version[2]); 57 info->version[2]);
@@ -68,25 +70,31 @@ static void send_op(struct plock_op *op)
68 wake_up(&send_wq); 70 wake_up(&send_wq);
69} 71}
70 72
71int gdlm_plock(void *lockspace, struct lm_lockname *name, 73int dlm_posix_lock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
72 struct file *file, int cmd, struct file_lock *fl) 74 int cmd, struct file_lock *fl)
73{ 75{
74 struct gdlm_ls *ls = lockspace; 76 struct dlm_ls *ls;
75 struct plock_op *op; 77 struct plock_op *op;
76 struct plock_xop *xop; 78 struct plock_xop *xop;
77 int rv; 79 int rv;
78 80
81 ls = dlm_find_lockspace_local(lockspace);
82 if (!ls)
83 return -EINVAL;
84
79 xop = kzalloc(sizeof(*xop), GFP_KERNEL); 85 xop = kzalloc(sizeof(*xop), GFP_KERNEL);
80 if (!xop) 86 if (!xop) {
81 return -ENOMEM; 87 rv = -ENOMEM;
88 goto out;
89 }
82 90
83 op = &xop->xop; 91 op = &xop->xop;
84 op->info.optype = GDLM_PLOCK_OP_LOCK; 92 op->info.optype = DLM_PLOCK_OP_LOCK;
85 op->info.pid = fl->fl_pid; 93 op->info.pid = fl->fl_pid;
86 op->info.ex = (fl->fl_type == F_WRLCK); 94 op->info.ex = (fl->fl_type == F_WRLCK);
87 op->info.wait = IS_SETLKW(cmd); 95 op->info.wait = IS_SETLKW(cmd);
88 op->info.fsid = ls->id; 96 op->info.fsid = ls->ls_global_id;
89 op->info.number = name->ln_number; 97 op->info.number = number;
90 op->info.start = fl->fl_start; 98 op->info.start = fl->fl_start;
91 op->info.end = fl->fl_end; 99 op->info.end = fl->fl_end;
92 if (fl->fl_lmops && fl->fl_lmops->fl_grant) { 100 if (fl->fl_lmops && fl->fl_lmops->fl_grant) {
@@ -107,12 +115,15 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
107 115
108 if (xop->callback == NULL) 116 if (xop->callback == NULL)
109 wait_event(recv_wq, (op->done != 0)); 117 wait_event(recv_wq, (op->done != 0));
110 else 118 else {
111 return -EINPROGRESS; 119 rv = -EINPROGRESS;
120 goto out;
121 }
112 122
113 spin_lock(&ops_lock); 123 spin_lock(&ops_lock);
114 if (!list_empty(&op->list)) { 124 if (!list_empty(&op->list)) {
115 printk(KERN_INFO "plock op on list\n"); 125 log_error(ls, "dlm_posix_lock: op on list %llx",
126 (unsigned long long)number);
116 list_del(&op->list); 127 list_del(&op->list);
117 } 128 }
118 spin_unlock(&ops_lock); 129 spin_unlock(&ops_lock);
@@ -121,17 +132,19 @@ int gdlm_plock(void *lockspace, struct lm_lockname *name,
121 132
122 if (!rv) { 133 if (!rv) {
123 if (posix_lock_file_wait(file, fl) < 0) 134 if (posix_lock_file_wait(file, fl) < 0)
124 log_error("gdlm_plock: vfs lock error %x,%llx", 135 log_error(ls, "dlm_posix_lock: vfs lock error %llx",
125 name->ln_type, 136 (unsigned long long)number);
126 (unsigned long long)name->ln_number);
127 } 137 }
128 138
129 kfree(xop); 139 kfree(xop);
140out:
141 dlm_put_lockspace(ls);
130 return rv; 142 return rv;
131} 143}
144EXPORT_SYMBOL_GPL(dlm_posix_lock);
132 145
133/* Returns failure iff a succesful lock operation should be canceled */ 146/* Returns failure iff a succesful lock operation should be canceled */
134static int gdlm_plock_callback(struct plock_op *op) 147static int dlm_plock_callback(struct plock_op *op)
135{ 148{
136 struct file *file; 149 struct file *file;
137 struct file_lock *fl; 150 struct file_lock *fl;
@@ -142,7 +155,8 @@ static int gdlm_plock_callback(struct plock_op *op)
142 155
143 spin_lock(&ops_lock); 156 spin_lock(&ops_lock);
144 if (!list_empty(&op->list)) { 157 if (!list_empty(&op->list)) {
145 printk(KERN_INFO "plock op on list\n"); 158 log_print("dlm_plock_callback: op on list %llx",
159 (unsigned long long)op->info.number);
146 list_del(&op->list); 160 list_del(&op->list);
147 } 161 }
148 spin_unlock(&ops_lock); 162 spin_unlock(&ops_lock);
@@ -165,19 +179,19 @@ static int gdlm_plock_callback(struct plock_op *op)
165 * This can only happen in the case of kmalloc() failure. 179 * This can only happen in the case of kmalloc() failure.
166 * The filesystem's own lock is the authoritative lock, 180 * The filesystem's own lock is the authoritative lock,
167 * so a failure to get the lock locally is not a disaster. 181 * so a failure to get the lock locally is not a disaster.
168 * As long as GFS cannot reliably cancel locks (especially 182 * As long as the fs cannot reliably cancel locks (especially
169 * in a low-memory situation), we're better off ignoring 183 * in a low-memory situation), we're better off ignoring
170 * this failure than trying to recover. 184 * this failure than trying to recover.
171 */ 185 */
172 log_error("gdlm_plock: vfs lock error file %p fl %p", 186 log_print("dlm_plock_callback: vfs lock error %llx file %p fl %p",
173 file, fl); 187 (unsigned long long)op->info.number, file, fl);
174 } 188 }
175 189
176 rv = notify(flc, NULL, 0); 190 rv = notify(flc, NULL, 0);
177 if (rv) { 191 if (rv) {
178 /* XXX: We need to cancel the fs lock here: */ 192 /* XXX: We need to cancel the fs lock here: */
179 printk("gfs2 lock granted after lock request failed;" 193 log_print("dlm_plock_callback: lock granted after lock request "
180 " dangling lock!\n"); 194 "failed; dangling lock!\n");
181 goto out; 195 goto out;
182 } 196 }
183 197
@@ -186,25 +200,31 @@ out:
186 return rv; 200 return rv;
187} 201}
188 202
189int gdlm_punlock(void *lockspace, struct lm_lockname *name, 203int dlm_posix_unlock(dlm_lockspace_t *lockspace, u64 number, struct file *file,
190 struct file *file, struct file_lock *fl) 204 struct file_lock *fl)
191{ 205{
192 struct gdlm_ls *ls = lockspace; 206 struct dlm_ls *ls;
193 struct plock_op *op; 207 struct plock_op *op;
194 int rv; 208 int rv;
195 209
210 ls = dlm_find_lockspace_local(lockspace);
211 if (!ls)
212 return -EINVAL;
213
196 op = kzalloc(sizeof(*op), GFP_KERNEL); 214 op = kzalloc(sizeof(*op), GFP_KERNEL);
197 if (!op) 215 if (!op) {
198 return -ENOMEM; 216 rv = -ENOMEM;
217 goto out;
218 }
199 219
200 if (posix_lock_file_wait(file, fl) < 0) 220 if (posix_lock_file_wait(file, fl) < 0)
201 log_error("gdlm_punlock: vfs unlock error %x,%llx", 221 log_error(ls, "dlm_posix_unlock: vfs unlock error %llx",
202 name->ln_type, (unsigned long long)name->ln_number); 222 (unsigned long long)number);
203 223
204 op->info.optype = GDLM_PLOCK_OP_UNLOCK; 224 op->info.optype = DLM_PLOCK_OP_UNLOCK;
205 op->info.pid = fl->fl_pid; 225 op->info.pid = fl->fl_pid;
206 op->info.fsid = ls->id; 226 op->info.fsid = ls->ls_global_id;
207 op->info.number = name->ln_number; 227 op->info.number = number;
208 op->info.start = fl->fl_start; 228 op->info.start = fl->fl_start;
209 op->info.end = fl->fl_end; 229 op->info.end = fl->fl_end;
210 if (fl->fl_lmops && fl->fl_lmops->fl_grant) 230 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
@@ -217,7 +237,8 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
217 237
218 spin_lock(&ops_lock); 238 spin_lock(&ops_lock);
219 if (!list_empty(&op->list)) { 239 if (!list_empty(&op->list)) {
220 printk(KERN_INFO "punlock op on list\n"); 240 log_error(ls, "dlm_posix_unlock: op on list %llx",
241 (unsigned long long)number);
221 list_del(&op->list); 242 list_del(&op->list);
222 } 243 }
223 spin_unlock(&ops_lock); 244 spin_unlock(&ops_lock);
@@ -228,25 +249,34 @@ int gdlm_punlock(void *lockspace, struct lm_lockname *name,
228 rv = 0; 249 rv = 0;
229 250
230 kfree(op); 251 kfree(op);
252out:
253 dlm_put_lockspace(ls);
231 return rv; 254 return rv;
232} 255}
256EXPORT_SYMBOL_GPL(dlm_posix_unlock);
233 257
234int gdlm_plock_get(void *lockspace, struct lm_lockname *name, 258int dlm_posix_get(dlm_lockspace_t *lockspace, u64 number, struct file *file,
235 struct file *file, struct file_lock *fl) 259 struct file_lock *fl)
236{ 260{
237 struct gdlm_ls *ls = lockspace; 261 struct dlm_ls *ls;
238 struct plock_op *op; 262 struct plock_op *op;
239 int rv; 263 int rv;
240 264
265 ls = dlm_find_lockspace_local(lockspace);
266 if (!ls)
267 return -EINVAL;
268
241 op = kzalloc(sizeof(*op), GFP_KERNEL); 269 op = kzalloc(sizeof(*op), GFP_KERNEL);
242 if (!op) 270 if (!op) {
243 return -ENOMEM; 271 rv = -ENOMEM;
272 goto out;
273 }
244 274
245 op->info.optype = GDLM_PLOCK_OP_GET; 275 op->info.optype = DLM_PLOCK_OP_GET;
246 op->info.pid = fl->fl_pid; 276 op->info.pid = fl->fl_pid;
247 op->info.ex = (fl->fl_type == F_WRLCK); 277 op->info.ex = (fl->fl_type == F_WRLCK);
248 op->info.fsid = ls->id; 278 op->info.fsid = ls->ls_global_id;
249 op->info.number = name->ln_number; 279 op->info.number = number;
250 op->info.start = fl->fl_start; 280 op->info.start = fl->fl_start;
251 op->info.end = fl->fl_end; 281 op->info.end = fl->fl_end;
252 if (fl->fl_lmops && fl->fl_lmops->fl_grant) 282 if (fl->fl_lmops && fl->fl_lmops->fl_grant)
@@ -259,7 +289,8 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
259 289
260 spin_lock(&ops_lock); 290 spin_lock(&ops_lock);
261 if (!list_empty(&op->list)) { 291 if (!list_empty(&op->list)) {
262 printk(KERN_INFO "plock_get op on list\n"); 292 log_error(ls, "dlm_posix_get: op on list %llx",
293 (unsigned long long)number);
263 list_del(&op->list); 294 list_del(&op->list);
264 } 295 }
265 spin_unlock(&ops_lock); 296 spin_unlock(&ops_lock);
@@ -281,14 +312,17 @@ int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
281 } 312 }
282 313
283 kfree(op); 314 kfree(op);
315out:
316 dlm_put_lockspace(ls);
284 return rv; 317 return rv;
285} 318}
319EXPORT_SYMBOL_GPL(dlm_posix_get);
286 320
287/* a read copies out one plock request from the send list */ 321/* a read copies out one plock request from the send list */
288static ssize_t dev_read(struct file *file, char __user *u, size_t count, 322static ssize_t dev_read(struct file *file, char __user *u, size_t count,
289 loff_t *ppos) 323 loff_t *ppos)
290{ 324{
291 struct gdlm_plock_info info; 325 struct dlm_plock_info info;
292 struct plock_op *op = NULL; 326 struct plock_op *op = NULL;
293 327
294 if (count < sizeof(info)) 328 if (count < sizeof(info))
@@ -315,7 +349,7 @@ static ssize_t dev_read(struct file *file, char __user *u, size_t count,
315static ssize_t dev_write(struct file *file, const char __user *u, size_t count, 349static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
316 loff_t *ppos) 350 loff_t *ppos)
317{ 351{
318 struct gdlm_plock_info info; 352 struct dlm_plock_info info;
319 struct plock_op *op; 353 struct plock_op *op;
320 int found = 0; 354 int found = 0;
321 355
@@ -345,12 +379,12 @@ static ssize_t dev_write(struct file *file, const char __user *u, size_t count,
345 struct plock_xop *xop; 379 struct plock_xop *xop;
346 xop = (struct plock_xop *)op; 380 xop = (struct plock_xop *)op;
347 if (xop->callback) 381 if (xop->callback)
348 count = gdlm_plock_callback(op); 382 count = dlm_plock_callback(op);
349 else 383 else
350 wake_up(&recv_wq); 384 wake_up(&recv_wq);
351 } else 385 } else
352 printk(KERN_INFO "gdlm dev_write no op %x %llx\n", info.fsid, 386 log_print("dev_write no op %x %llx", info.fsid,
353 (unsigned long long)info.number); 387 (unsigned long long)info.number);
354 return count; 388 return count;
355} 389}
356 390
@@ -377,11 +411,11 @@ static const struct file_operations dev_fops = {
377 411
378static struct miscdevice plock_dev_misc = { 412static struct miscdevice plock_dev_misc = {
379 .minor = MISC_DYNAMIC_MINOR, 413 .minor = MISC_DYNAMIC_MINOR,
380 .name = GDLM_PLOCK_MISC_NAME, 414 .name = DLM_PLOCK_MISC_NAME,
381 .fops = &dev_fops 415 .fops = &dev_fops
382}; 416};
383 417
384int gdlm_plock_init(void) 418int dlm_plock_init(void)
385{ 419{
386 int rv; 420 int rv;
387 421
@@ -393,14 +427,13 @@ int gdlm_plock_init(void)
393 427
394 rv = misc_register(&plock_dev_misc); 428 rv = misc_register(&plock_dev_misc);
395 if (rv) 429 if (rv)
396 printk(KERN_INFO "gdlm_plock_init: misc_register failed %d", 430 log_print("dlm_plock_init: misc_register failed %d", rv);
397 rv);
398 return rv; 431 return rv;
399} 432}
400 433
401void gdlm_plock_exit(void) 434void dlm_plock_exit(void)
402{ 435{
403 if (misc_deregister(&plock_dev_misc) < 0) 436 if (misc_deregister(&plock_dev_misc) < 0)
404 printk(KERN_INFO "gdlm_plock_exit: misc_deregister failed"); 437 log_print("dlm_plock_exit: misc_deregister failed");
405} 438}
406 439
diff --git a/fs/dlm/recoverd.c b/fs/dlm/recoverd.c
index 997f9531d594..fd677c8c3d3b 100644
--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -257,6 +257,7 @@ static void do_ls_recovery(struct dlm_ls *ls)
257 if (rv) { 257 if (rv) {
258 ls_recover(ls, rv); 258 ls_recover(ls, rv);
259 kfree(rv->nodeids); 259 kfree(rv->nodeids);
260 kfree(rv->new);
260 kfree(rv); 261 kfree(rv);
261 } 262 }
262} 263}
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 5deb8b74e649..08f647d8188d 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -253,7 +253,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
253 * it has too few free inodes left (min_inodes) or 253 * it has too few free inodes left (min_inodes) or
254 * it has too few free blocks left (min_blocks) or 254 * it has too few free blocks left (min_blocks) or
255 * it's already running too large debt (max_debt). 255 * it's already running too large debt (max_debt).
256 * Parent's group is prefered, if it doesn't satisfy these 256 * Parent's group is preferred, if it doesn't satisfy these
257 * conditions we search cyclically through the rest. If none 257 * conditions we search cyclically through the rest. If none
258 * of the groups look good we just look for a group with more 258 * of the groups look good we just look for a group with more
259 * free inodes than average (starting at parent's group). 259 * free inodes than average (starting at parent's group).
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index c62006805427..b8a2990bab83 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -239,7 +239,7 @@ no_block:
239 * @inode: owner 239 * @inode: owner
240 * @ind: descriptor of indirect block. 240 * @ind: descriptor of indirect block.
241 * 241 *
242 * This function returns the prefered place for block allocation. 242 * This function returns the preferred place for block allocation.
243 * It is used when heuristic for sequential allocation fails. 243 * It is used when heuristic for sequential allocation fails.
244 * Rules are: 244 * Rules are:
245 * + if there is a block to the left of our position - allocate near it. 245 * + if there is a block to the left of our position - allocate near it.
@@ -283,7 +283,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind)
283} 283}
284 284
285/** 285/**
286 * ext2_find_goal - find a prefered place for allocation. 286 * ext2_find_goal - find a preferred place for allocation.
287 * @inode: owner 287 * @inode: owner
288 * @block: block we want 288 * @block: block we want
289 * @partial: pointer to the last triple within a chain 289 * @partial: pointer to the last triple within a chain
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index b8ea11fee5c6..de876fa793e1 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -12,6 +12,7 @@
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/sched.h> 13#include <linux/sched.h>
14#include <linux/compat.h> 14#include <linux/compat.h>
15#include <linux/mount.h>
15#include <linux/smp_lock.h> 16#include <linux/smp_lock.h>
16#include <asm/current.h> 17#include <asm/current.h>
17#include <asm/uaccess.h> 18#include <asm/uaccess.h>
@@ -23,6 +24,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
23 struct ext2_inode_info *ei = EXT2_I(inode); 24 struct ext2_inode_info *ei = EXT2_I(inode);
24 unsigned int flags; 25 unsigned int flags;
25 unsigned short rsv_window_size; 26 unsigned short rsv_window_size;
27 int ret;
26 28
27 ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg); 29 ext2_debug ("cmd = %u, arg = %lu\n", cmd, arg);
28 30
@@ -34,14 +36,19 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
34 case EXT2_IOC_SETFLAGS: { 36 case EXT2_IOC_SETFLAGS: {
35 unsigned int oldflags; 37 unsigned int oldflags;
36 38
37 if (IS_RDONLY(inode)) 39 ret = mnt_want_write(filp->f_path.mnt);
38 return -EROFS; 40 if (ret)
41 return ret;
39 42
40 if (!is_owner_or_cap(inode)) 43 if (!is_owner_or_cap(inode)) {
41 return -EACCES; 44 ret = -EACCES;
45 goto setflags_out;
46 }
42 47
43 if (get_user(flags, (int __user *) arg)) 48 if (get_user(flags, (int __user *) arg)) {
44 return -EFAULT; 49 ret = -EFAULT;
50 goto setflags_out;
51 }
45 52
46 if (!S_ISDIR(inode->i_mode)) 53 if (!S_ISDIR(inode->i_mode))
47 flags &= ~EXT2_DIRSYNC_FL; 54 flags &= ~EXT2_DIRSYNC_FL;
@@ -50,7 +57,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
50 /* Is it quota file? Do not allow user to mess with it */ 57 /* Is it quota file? Do not allow user to mess with it */
51 if (IS_NOQUOTA(inode)) { 58 if (IS_NOQUOTA(inode)) {
52 mutex_unlock(&inode->i_mutex); 59 mutex_unlock(&inode->i_mutex);
53 return -EPERM; 60 ret = -EPERM;
61 goto setflags_out;
54 } 62 }
55 oldflags = ei->i_flags; 63 oldflags = ei->i_flags;
56 64
@@ -63,7 +71,8 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
63 if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { 71 if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
64 if (!capable(CAP_LINUX_IMMUTABLE)) { 72 if (!capable(CAP_LINUX_IMMUTABLE)) {
65 mutex_unlock(&inode->i_mutex); 73 mutex_unlock(&inode->i_mutex);
66 return -EPERM; 74 ret = -EPERM;
75 goto setflags_out;
67 } 76 }
68 } 77 }
69 78
@@ -75,20 +84,26 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
75 ext2_set_inode_flags(inode); 84 ext2_set_inode_flags(inode);
76 inode->i_ctime = CURRENT_TIME_SEC; 85 inode->i_ctime = CURRENT_TIME_SEC;
77 mark_inode_dirty(inode); 86 mark_inode_dirty(inode);
78 return 0; 87setflags_out:
88 mnt_drop_write(filp->f_path.mnt);
89 return ret;
79 } 90 }
80 case EXT2_IOC_GETVERSION: 91 case EXT2_IOC_GETVERSION:
81 return put_user(inode->i_generation, (int __user *) arg); 92 return put_user(inode->i_generation, (int __user *) arg);
82 case EXT2_IOC_SETVERSION: 93 case EXT2_IOC_SETVERSION:
83 if (!is_owner_or_cap(inode)) 94 if (!is_owner_or_cap(inode))
84 return -EPERM; 95 return -EPERM;
85 if (IS_RDONLY(inode)) 96 ret = mnt_want_write(filp->f_path.mnt);
86 return -EROFS; 97 if (ret)
87 if (get_user(inode->i_generation, (int __user *) arg)) 98 return ret;
88 return -EFAULT; 99 if (get_user(inode->i_generation, (int __user *) arg)) {
89 inode->i_ctime = CURRENT_TIME_SEC; 100 ret = -EFAULT;
90 mark_inode_dirty(inode); 101 } else {
91 return 0; 102 inode->i_ctime = CURRENT_TIME_SEC;
103 mark_inode_dirty(inode);
104 }
105 mnt_drop_write(filp->f_path.mnt);
106 return ret;
92 case EXT2_IOC_GETRSVSZ: 107 case EXT2_IOC_GETRSVSZ:
93 if (test_opt(inode->i_sb, RESERVATION) 108 if (test_opt(inode->i_sb, RESERVATION)
94 && S_ISREG(inode->i_mode) 109 && S_ISREG(inode->i_mode)
@@ -102,15 +117,16 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
102 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 117 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
103 return -ENOTTY; 118 return -ENOTTY;
104 119
105 if (IS_RDONLY(inode)) 120 if (!is_owner_or_cap(inode))
106 return -EROFS;
107
108 if ((current->fsuid != inode->i_uid) && !capable(CAP_FOWNER))
109 return -EACCES; 121 return -EACCES;
110 122
111 if (get_user(rsv_window_size, (int __user *)arg)) 123 if (get_user(rsv_window_size, (int __user *)arg))
112 return -EFAULT; 124 return -EFAULT;
113 125
126 ret = mnt_want_write(filp->f_path.mnt);
127 if (ret)
128 return ret;
129
114 if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS) 130 if (rsv_window_size > EXT2_MAX_RESERVE_BLOCKS)
115 rsv_window_size = EXT2_MAX_RESERVE_BLOCKS; 131 rsv_window_size = EXT2_MAX_RESERVE_BLOCKS;
116 132
@@ -131,6 +147,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
131 rsv->rsv_goal_size = rsv_window_size; 147 rsv->rsv_goal_size = rsv_window_size;
132 } 148 }
133 mutex_unlock(&ei->truncate_mutex); 149 mutex_unlock(&ei->truncate_mutex);
150 mnt_drop_write(filp->f_path.mnt);
134 return 0; 151 return 0;
135 } 152 }
136 default: 153 default:
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 4f4020c54683..96dd5573e49b 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -239,7 +239,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent)
239 * it has too few free inodes left (min_inodes) or 239 * it has too few free inodes left (min_inodes) or
240 * it has too few free blocks left (min_blocks) or 240 * it has too few free blocks left (min_blocks) or
241 * it's already running too large debt (max_debt). 241 * it's already running too large debt (max_debt).
242 * Parent's group is prefered, if it doesn't satisfy these 242 * Parent's group is preferred, if it doesn't satisfy these
243 * conditions we search cyclically through the rest. If none 243 * conditions we search cyclically through the rest. If none
244 * of the groups look good we just look for a group with more 244 * of the groups look good we just look for a group with more
245 * free inodes than average (starting at parent's group). 245 * free inodes than average (starting at parent's group).
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index eb95670a27eb..c683609b0e3a 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -392,7 +392,7 @@ no_block:
392 * @inode: owner 392 * @inode: owner
393 * @ind: descriptor of indirect block. 393 * @ind: descriptor of indirect block.
394 * 394 *
395 * This function returns the prefered place for block allocation. 395 * This function returns the preferred place for block allocation.
396 * It is used when heuristic for sequential allocation fails. 396 * It is used when heuristic for sequential allocation fails.
397 * Rules are: 397 * Rules are:
398 * + if there is a block to the left of our position - allocate near it. 398 * + if there is a block to the left of our position - allocate near it.
@@ -436,12 +436,12 @@ static ext3_fsblk_t ext3_find_near(struct inode *inode, Indirect *ind)
436} 436}
437 437
438/** 438/**
439 * ext3_find_goal - find a prefered place for allocation. 439 * ext3_find_goal - find a preferred place for allocation.
440 * @inode: owner 440 * @inode: owner
441 * @block: block we want 441 * @block: block we want
442 * @partial: pointer to the last triple within a chain 442 * @partial: pointer to the last triple within a chain
443 * 443 *
444 * Normally this function find the prefered place for block allocation, 444 * Normally this function find the preferred place for block allocation,
445 * returns it. 445 * returns it.
446 */ 446 */
447 447
diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c
index 023a070f55f1..0d0c70151642 100644
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -12,6 +12,7 @@
12#include <linux/capability.h> 12#include <linux/capability.h>
13#include <linux/ext3_fs.h> 13#include <linux/ext3_fs.h>
14#include <linux/ext3_jbd.h> 14#include <linux/ext3_jbd.h>
15#include <linux/mount.h>
15#include <linux/time.h> 16#include <linux/time.h>
16#include <linux/compat.h> 17#include <linux/compat.h>
17#include <linux/smp_lock.h> 18#include <linux/smp_lock.h>
@@ -38,14 +39,19 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
38 unsigned int oldflags; 39 unsigned int oldflags;
39 unsigned int jflag; 40 unsigned int jflag;
40 41
41 if (IS_RDONLY(inode)) 42 err = mnt_want_write(filp->f_path.mnt);
42 return -EROFS; 43 if (err)
44 return err;
43 45
44 if (!is_owner_or_cap(inode)) 46 if (!is_owner_or_cap(inode)) {
45 return -EACCES; 47 err = -EACCES;
48 goto flags_out;
49 }
46 50
47 if (get_user(flags, (int __user *) arg)) 51 if (get_user(flags, (int __user *) arg)) {
48 return -EFAULT; 52 err = -EFAULT;
53 goto flags_out;
54 }
49 55
50 if (!S_ISDIR(inode->i_mode)) 56 if (!S_ISDIR(inode->i_mode))
51 flags &= ~EXT3_DIRSYNC_FL; 57 flags &= ~EXT3_DIRSYNC_FL;
@@ -54,7 +60,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
54 /* Is it quota file? Do not allow user to mess with it */ 60 /* Is it quota file? Do not allow user to mess with it */
55 if (IS_NOQUOTA(inode)) { 61 if (IS_NOQUOTA(inode)) {
56 mutex_unlock(&inode->i_mutex); 62 mutex_unlock(&inode->i_mutex);
57 return -EPERM; 63 err = -EPERM;
64 goto flags_out;
58 } 65 }
59 oldflags = ei->i_flags; 66 oldflags = ei->i_flags;
60 67
@@ -70,7 +77,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
70 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) { 77 if ((flags ^ oldflags) & (EXT3_APPEND_FL | EXT3_IMMUTABLE_FL)) {
71 if (!capable(CAP_LINUX_IMMUTABLE)) { 78 if (!capable(CAP_LINUX_IMMUTABLE)) {
72 mutex_unlock(&inode->i_mutex); 79 mutex_unlock(&inode->i_mutex);
73 return -EPERM; 80 err = -EPERM;
81 goto flags_out;
74 } 82 }
75 } 83 }
76 84
@@ -81,7 +89,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
81 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) { 89 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
82 if (!capable(CAP_SYS_RESOURCE)) { 90 if (!capable(CAP_SYS_RESOURCE)) {
83 mutex_unlock(&inode->i_mutex); 91 mutex_unlock(&inode->i_mutex);
84 return -EPERM; 92 err = -EPERM;
93 goto flags_out;
85 } 94 }
86 } 95 }
87 96
@@ -89,7 +98,8 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
89 handle = ext3_journal_start(inode, 1); 98 handle = ext3_journal_start(inode, 1);
90 if (IS_ERR(handle)) { 99 if (IS_ERR(handle)) {
91 mutex_unlock(&inode->i_mutex); 100 mutex_unlock(&inode->i_mutex);
92 return PTR_ERR(handle); 101 err = PTR_ERR(handle);
102 goto flags_out;
93 } 103 }
94 if (IS_SYNC(inode)) 104 if (IS_SYNC(inode))
95 handle->h_sync = 1; 105 handle->h_sync = 1;
@@ -115,6 +125,8 @@ flags_err:
115 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) 125 if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL))
116 err = ext3_change_inode_journal_flag(inode, jflag); 126 err = ext3_change_inode_journal_flag(inode, jflag);
117 mutex_unlock(&inode->i_mutex); 127 mutex_unlock(&inode->i_mutex);
128flags_out:
129 mnt_drop_write(filp->f_path.mnt);
118 return err; 130 return err;
119 } 131 }
120 case EXT3_IOC_GETVERSION: 132 case EXT3_IOC_GETVERSION:
@@ -129,14 +141,18 @@ flags_err:
129 141
130 if (!is_owner_or_cap(inode)) 142 if (!is_owner_or_cap(inode))
131 return -EPERM; 143 return -EPERM;
132 if (IS_RDONLY(inode)) 144 err = mnt_want_write(filp->f_path.mnt);
133 return -EROFS; 145 if (err)
134 if (get_user(generation, (int __user *) arg)) 146 return err;
135 return -EFAULT; 147 if (get_user(generation, (int __user *) arg)) {
136 148 err = -EFAULT;
149 goto setversion_out;
150 }
137 handle = ext3_journal_start(inode, 1); 151 handle = ext3_journal_start(inode, 1);
138 if (IS_ERR(handle)) 152 if (IS_ERR(handle)) {
139 return PTR_ERR(handle); 153 err = PTR_ERR(handle);
154 goto setversion_out;
155 }
140 err = ext3_reserve_inode_write(handle, inode, &iloc); 156 err = ext3_reserve_inode_write(handle, inode, &iloc);
141 if (err == 0) { 157 if (err == 0) {
142 inode->i_ctime = CURRENT_TIME_SEC; 158 inode->i_ctime = CURRENT_TIME_SEC;
@@ -144,6 +160,8 @@ flags_err:
144 err = ext3_mark_iloc_dirty(handle, inode, &iloc); 160 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
145 } 161 }
146 ext3_journal_stop(handle); 162 ext3_journal_stop(handle);
163setversion_out:
164 mnt_drop_write(filp->f_path.mnt);
147 return err; 165 return err;
148 } 166 }
149#ifdef CONFIG_JBD_DEBUG 167#ifdef CONFIG_JBD_DEBUG
@@ -179,18 +197,24 @@ flags_err:
179 } 197 }
180 return -ENOTTY; 198 return -ENOTTY;
181 case EXT3_IOC_SETRSVSZ: { 199 case EXT3_IOC_SETRSVSZ: {
200 int err;
182 201
183 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 202 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
184 return -ENOTTY; 203 return -ENOTTY;
185 204
186 if (IS_RDONLY(inode)) 205 err = mnt_want_write(filp->f_path.mnt);
187 return -EROFS; 206 if (err)
207 return err;
188 208
189 if (!is_owner_or_cap(inode)) 209 if (!is_owner_or_cap(inode)) {
190 return -EACCES; 210 err = -EACCES;
211 goto setrsvsz_out;
212 }
191 213
192 if (get_user(rsv_window_size, (int __user *)arg)) 214 if (get_user(rsv_window_size, (int __user *)arg)) {
193 return -EFAULT; 215 err = -EFAULT;
216 goto setrsvsz_out;
217 }
194 218
195 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS) 219 if (rsv_window_size > EXT3_MAX_RESERVE_BLOCKS)
196 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS; 220 rsv_window_size = EXT3_MAX_RESERVE_BLOCKS;
@@ -208,7 +232,9 @@ flags_err:
208 rsv->rsv_goal_size = rsv_window_size; 232 rsv->rsv_goal_size = rsv_window_size;
209 } 233 }
210 mutex_unlock(&ei->truncate_mutex); 234 mutex_unlock(&ei->truncate_mutex);
211 return 0; 235setrsvsz_out:
236 mnt_drop_write(filp->f_path.mnt);
237 return err;
212 } 238 }
213 case EXT3_IOC_GROUP_EXTEND: { 239 case EXT3_IOC_GROUP_EXTEND: {
214 ext3_fsblk_t n_blocks_count; 240 ext3_fsblk_t n_blocks_count;
@@ -218,17 +244,20 @@ flags_err:
218 if (!capable(CAP_SYS_RESOURCE)) 244 if (!capable(CAP_SYS_RESOURCE))
219 return -EPERM; 245 return -EPERM;
220 246
221 if (IS_RDONLY(inode)) 247 err = mnt_want_write(filp->f_path.mnt);
222 return -EROFS; 248 if (err)
223 249 return err;
224 if (get_user(n_blocks_count, (__u32 __user *)arg))
225 return -EFAULT;
226 250
251 if (get_user(n_blocks_count, (__u32 __user *)arg)) {
252 err = -EFAULT;
253 goto group_extend_out;
254 }
227 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count); 255 err = ext3_group_extend(sb, EXT3_SB(sb)->s_es, n_blocks_count);
228 journal_lock_updates(EXT3_SB(sb)->s_journal); 256 journal_lock_updates(EXT3_SB(sb)->s_journal);
229 journal_flush(EXT3_SB(sb)->s_journal); 257 journal_flush(EXT3_SB(sb)->s_journal);
230 journal_unlock_updates(EXT3_SB(sb)->s_journal); 258 journal_unlock_updates(EXT3_SB(sb)->s_journal);
231 259group_extend_out:
260 mnt_drop_write(filp->f_path.mnt);
232 return err; 261 return err;
233 } 262 }
234 case EXT3_IOC_GROUP_ADD: { 263 case EXT3_IOC_GROUP_ADD: {
@@ -239,18 +268,22 @@ flags_err:
239 if (!capable(CAP_SYS_RESOURCE)) 268 if (!capable(CAP_SYS_RESOURCE))
240 return -EPERM; 269 return -EPERM;
241 270
242 if (IS_RDONLY(inode)) 271 err = mnt_want_write(filp->f_path.mnt);
243 return -EROFS; 272 if (err)
273 return err;
244 274
245 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg, 275 if (copy_from_user(&input, (struct ext3_new_group_input __user *)arg,
246 sizeof(input))) 276 sizeof(input))) {
247 return -EFAULT; 277 err = -EFAULT;
278 goto group_add_out;
279 }
248 280
249 err = ext3_group_add(sb, &input); 281 err = ext3_group_add(sb, &input);
250 journal_lock_updates(EXT3_SB(sb)->s_journal); 282 journal_lock_updates(EXT3_SB(sb)->s_journal);
251 journal_flush(EXT3_SB(sb)->s_journal); 283 journal_flush(EXT3_SB(sb)->s_journal);
252 journal_unlock_updates(EXT3_SB(sb)->s_journal); 284 journal_unlock_updates(EXT3_SB(sb)->s_journal);
253 285group_add_out:
286 mnt_drop_write(filp->f_path.mnt);
254 return err; 287 return err;
255 } 288 }
256 289
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 8036b9b5376b..486e46a3918d 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -305,7 +305,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent,
305 * it has too few free inodes left (min_inodes) or 305 * it has too few free inodes left (min_inodes) or
306 * it has too few free blocks left (min_blocks) or 306 * it has too few free blocks left (min_blocks) or
307 * it's already running too large debt (max_debt). 307 * it's already running too large debt (max_debt).
308 * Parent's group is prefered, if it doesn't satisfy these 308 * Parent's group is preferred, if it doesn't satisfy these
309 * conditions we search cyclically through the rest. If none 309 * conditions we search cyclically through the rest. If none
310 * of the groups look good we just look for a group with more 310 * of the groups look good we just look for a group with more
311 * free inodes than average (starting at parent's group). 311 * free inodes than average (starting at parent's group).
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 945cbf6cb1fc..8fab233cb05f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -382,7 +382,7 @@ no_block:
382 * @inode: owner 382 * @inode: owner
383 * @ind: descriptor of indirect block. 383 * @ind: descriptor of indirect block.
384 * 384 *
385 * This function returns the prefered place for block allocation. 385 * This function returns the preferred place for block allocation.
386 * It is used when heuristic for sequential allocation fails. 386 * It is used when heuristic for sequential allocation fails.
387 * Rules are: 387 * Rules are:
388 * + if there is a block to the left of our position - allocate near it. 388 * + if there is a block to the left of our position - allocate near it.
@@ -432,12 +432,12 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
432} 432}
433 433
434/** 434/**
435 * ext4_find_goal - find a prefered place for allocation. 435 * ext4_find_goal - find a preferred place for allocation.
436 * @inode: owner 436 * @inode: owner
437 * @block: block we want 437 * @block: block we want
438 * @partial: pointer to the last triple within a chain 438 * @partial: pointer to the last triple within a chain
439 * 439 *
440 * Normally this function find the prefered place for block allocation, 440 * Normally this function find the preferred place for block allocation,
441 * returns it. 441 * returns it.
442 */ 442 */
443static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, 443static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block,
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 2ed7c37f897e..25b13ede8086 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -15,6 +15,7 @@
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/compat.h> 16#include <linux/compat.h>
17#include <linux/smp_lock.h> 17#include <linux/smp_lock.h>
18#include <linux/mount.h>
18#include <asm/uaccess.h> 19#include <asm/uaccess.h>
19 20
20int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, 21int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
@@ -38,24 +39,25 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
38 unsigned int oldflags; 39 unsigned int oldflags;
39 unsigned int jflag; 40 unsigned int jflag;
40 41
41 if (IS_RDONLY(inode))
42 return -EROFS;
43
44 if (!is_owner_or_cap(inode)) 42 if (!is_owner_or_cap(inode))
45 return -EACCES; 43 return -EACCES;
46 44
47 if (get_user(flags, (int __user *) arg)) 45 if (get_user(flags, (int __user *) arg))
48 return -EFAULT; 46 return -EFAULT;
49 47
48 err = mnt_want_write(filp->f_path.mnt);
49 if (err)
50 return err;
51
50 if (!S_ISDIR(inode->i_mode)) 52 if (!S_ISDIR(inode->i_mode))
51 flags &= ~EXT4_DIRSYNC_FL; 53 flags &= ~EXT4_DIRSYNC_FL;
52 54
55 err = -EPERM;
53 mutex_lock(&inode->i_mutex); 56 mutex_lock(&inode->i_mutex);
54 /* Is it quota file? Do not allow user to mess with it */ 57 /* Is it quota file? Do not allow user to mess with it */
55 if (IS_NOQUOTA(inode)) { 58 if (IS_NOQUOTA(inode))
56 mutex_unlock(&inode->i_mutex); 59 goto flags_out;
57 return -EPERM; 60
58 }
59 oldflags = ei->i_flags; 61 oldflags = ei->i_flags;
60 62
61 /* The JOURNAL_DATA flag is modifiable only by root */ 63 /* The JOURNAL_DATA flag is modifiable only by root */
@@ -68,10 +70,8 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
68 * This test looks nicer. Thanks to Pauline Middelink 70 * This test looks nicer. Thanks to Pauline Middelink
69 */ 71 */
70 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { 72 if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
71 if (!capable(CAP_LINUX_IMMUTABLE)) { 73 if (!capable(CAP_LINUX_IMMUTABLE))
72 mutex_unlock(&inode->i_mutex); 74 goto flags_out;
73 return -EPERM;
74 }
75 } 75 }
76 76
77 /* 77 /*
@@ -79,17 +79,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
79 * the relevant capability. 79 * the relevant capability.
80 */ 80 */
81 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { 81 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
82 if (!capable(CAP_SYS_RESOURCE)) { 82 if (!capable(CAP_SYS_RESOURCE))
83 mutex_unlock(&inode->i_mutex); 83 goto flags_out;
84 return -EPERM;
85 }
86 } 84 }
87 85
88
89 handle = ext4_journal_start(inode, 1); 86 handle = ext4_journal_start(inode, 1);
90 if (IS_ERR(handle)) { 87 if (IS_ERR(handle)) {
91 mutex_unlock(&inode->i_mutex); 88 err = PTR_ERR(handle);
92 return PTR_ERR(handle); 89 goto flags_out;
93 } 90 }
94 if (IS_SYNC(inode)) 91 if (IS_SYNC(inode))
95 handle->h_sync = 1; 92 handle->h_sync = 1;
@@ -107,14 +104,14 @@ int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd,
107 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 104 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
108flags_err: 105flags_err:
109 ext4_journal_stop(handle); 106 ext4_journal_stop(handle);
110 if (err) { 107 if (err)
111 mutex_unlock(&inode->i_mutex); 108 goto flags_out;
112 return err;
113 }
114 109
115 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) 110 if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
116 err = ext4_change_inode_journal_flag(inode, jflag); 111 err = ext4_change_inode_journal_flag(inode, jflag);
112flags_out:
117 mutex_unlock(&inode->i_mutex); 113 mutex_unlock(&inode->i_mutex);
114 mnt_drop_write(filp->f_path.mnt);
118 return err; 115 return err;
119 } 116 }
120 case EXT4_IOC_GETVERSION: 117 case EXT4_IOC_GETVERSION:
@@ -129,14 +126,20 @@ flags_err:
129 126
130 if (!is_owner_or_cap(inode)) 127 if (!is_owner_or_cap(inode))
131 return -EPERM; 128 return -EPERM;
132 if (IS_RDONLY(inode)) 129
133 return -EROFS; 130 err = mnt_want_write(filp->f_path.mnt);
134 if (get_user(generation, (int __user *) arg)) 131 if (err)
135 return -EFAULT; 132 return err;
133 if (get_user(generation, (int __user *) arg)) {
134 err = -EFAULT;
135 goto setversion_out;
136 }
136 137
137 handle = ext4_journal_start(inode, 1); 138 handle = ext4_journal_start(inode, 1);
138 if (IS_ERR(handle)) 139 if (IS_ERR(handle)) {
139 return PTR_ERR(handle); 140 err = PTR_ERR(handle);
141 goto setversion_out;
142 }
140 err = ext4_reserve_inode_write(handle, inode, &iloc); 143 err = ext4_reserve_inode_write(handle, inode, &iloc);
141 if (err == 0) { 144 if (err == 0) {
142 inode->i_ctime = ext4_current_time(inode); 145 inode->i_ctime = ext4_current_time(inode);
@@ -144,6 +147,8 @@ flags_err:
144 err = ext4_mark_iloc_dirty(handle, inode, &iloc); 147 err = ext4_mark_iloc_dirty(handle, inode, &iloc);
145 } 148 }
146 ext4_journal_stop(handle); 149 ext4_journal_stop(handle);
150setversion_out:
151 mnt_drop_write(filp->f_path.mnt);
147 return err; 152 return err;
148 } 153 }
149#ifdef CONFIG_JBD2_DEBUG 154#ifdef CONFIG_JBD2_DEBUG
@@ -179,19 +184,21 @@ flags_err:
179 } 184 }
180 return -ENOTTY; 185 return -ENOTTY;
181 case EXT4_IOC_SETRSVSZ: { 186 case EXT4_IOC_SETRSVSZ: {
187 int err;
182 188
183 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) 189 if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode))
184 return -ENOTTY; 190 return -ENOTTY;
185 191
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode)) 192 if (!is_owner_or_cap(inode))
190 return -EACCES; 193 return -EACCES;
191 194
192 if (get_user(rsv_window_size, (int __user *)arg)) 195 if (get_user(rsv_window_size, (int __user *)arg))
193 return -EFAULT; 196 return -EFAULT;
194 197
198 err = mnt_want_write(filp->f_path.mnt);
199 if (err)
200 return err;
201
195 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) 202 if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS)
196 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; 203 rsv_window_size = EXT4_MAX_RESERVE_BLOCKS;
197 204
@@ -208,6 +215,7 @@ flags_err:
208 rsv->rsv_goal_size = rsv_window_size; 215 rsv->rsv_goal_size = rsv_window_size;
209 } 216 }
210 up_write(&ei->i_data_sem); 217 up_write(&ei->i_data_sem);
218 mnt_drop_write(filp->f_path.mnt);
211 return 0; 219 return 0;
212 } 220 }
213 case EXT4_IOC_GROUP_EXTEND: { 221 case EXT4_IOC_GROUP_EXTEND: {
@@ -218,16 +226,18 @@ flags_err:
218 if (!capable(CAP_SYS_RESOURCE)) 226 if (!capable(CAP_SYS_RESOURCE))
219 return -EPERM; 227 return -EPERM;
220 228
221 if (IS_RDONLY(inode))
222 return -EROFS;
223
224 if (get_user(n_blocks_count, (__u32 __user *)arg)) 229 if (get_user(n_blocks_count, (__u32 __user *)arg))
225 return -EFAULT; 230 return -EFAULT;
226 231
232 err = mnt_want_write(filp->f_path.mnt);
233 if (err)
234 return err;
235
227 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); 236 err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count);
228 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 237 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
229 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 238 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
230 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 239 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
240 mnt_drop_write(filp->f_path.mnt);
231 241
232 return err; 242 return err;
233 } 243 }
@@ -239,17 +249,19 @@ flags_err:
239 if (!capable(CAP_SYS_RESOURCE)) 249 if (!capable(CAP_SYS_RESOURCE))
240 return -EPERM; 250 return -EPERM;
241 251
242 if (IS_RDONLY(inode))
243 return -EROFS;
244
245 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg, 252 if (copy_from_user(&input, (struct ext4_new_group_input __user *)arg,
246 sizeof(input))) 253 sizeof(input)))
247 return -EFAULT; 254 return -EFAULT;
248 255
256 err = mnt_want_write(filp->f_path.mnt);
257 if (err)
258 return err;
259
249 err = ext4_group_add(sb, &input); 260 err = ext4_group_add(sb, &input);
250 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); 261 jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
251 jbd2_journal_flush(EXT4_SB(sb)->s_journal); 262 jbd2_journal_flush(EXT4_SB(sb)->s_journal);
252 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); 263 jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
264 mnt_drop_write(filp->f_path.mnt);
253 265
254 return err; 266 return err;
255 } 267 }
diff --git a/fs/fat/file.c b/fs/fat/file.c
index c614175876e0..2a3bed967041 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/capability.h> 9#include <linux/capability.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/mount.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/msdos_fs.h> 13#include <linux/msdos_fs.h>
13#include <linux/smp_lock.h> 14#include <linux/smp_lock.h>
@@ -46,10 +47,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
46 47
47 mutex_lock(&inode->i_mutex); 48 mutex_lock(&inode->i_mutex);
48 49
49 if (IS_RDONLY(inode)) { 50 err = mnt_want_write(filp->f_path.mnt);
50 err = -EROFS; 51 if (err)
51 goto up; 52 goto up_no_drop_write;
52 }
53 53
54 /* 54 /*
55 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also 55 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +105,9 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
105 105
106 MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED; 106 MSDOS_I(inode)->i_attrs = attr & ATTR_UNUSED;
107 mark_inode_dirty(inode); 107 mark_inode_dirty(inode);
108 up: 108up:
109 mnt_drop_write(filp->f_path.mnt);
110up_no_drop_write:
109 mutex_unlock(&inode->i_mutex); 111 mutex_unlock(&inode->i_mutex);
110 return err; 112 return err;
111 } 113 }
diff --git a/fs/file_table.c b/fs/file_table.c
index 986ff4ed0a7c..7a0a9b872251 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -42,6 +42,7 @@ static inline void file_free_rcu(struct rcu_head *head)
42static inline void file_free(struct file *f) 42static inline void file_free(struct file *f)
43{ 43{
44 percpu_counter_dec(&nr_files); 44 percpu_counter_dec(&nr_files);
45 file_check_state(f);
45 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu); 46 call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
46} 47}
47 48
@@ -199,6 +200,18 @@ int init_file(struct file *file, struct vfsmount *mnt, struct dentry *dentry,
199 file->f_mapping = dentry->d_inode->i_mapping; 200 file->f_mapping = dentry->d_inode->i_mapping;
200 file->f_mode = mode; 201 file->f_mode = mode;
201 file->f_op = fop; 202 file->f_op = fop;
203
204 /*
205 * These mounts don't really matter in practice
206 * for r/o bind mounts. They aren't userspace-
207 * visible. We do this for consistency, and so
208 * that we can do debugging checks at __fput()
209 */
210 if ((mode & FMODE_WRITE) && !special_file(dentry->d_inode->i_mode)) {
211 file_take_write(file);
212 error = mnt_want_write(mnt);
213 WARN_ON(error);
214 }
202 return error; 215 return error;
203} 216}
204EXPORT_SYMBOL(init_file); 217EXPORT_SYMBOL(init_file);
@@ -211,6 +224,31 @@ void fput(struct file *file)
211 224
212EXPORT_SYMBOL(fput); 225EXPORT_SYMBOL(fput);
213 226
227/**
228 * drop_file_write_access - give up ability to write to a file
229 * @file: the file to which we will stop writing
230 *
231 * This is a central place which will give up the ability
232 * to write to @file, along with access to write through
233 * its vfsmount.
234 */
235void drop_file_write_access(struct file *file)
236{
237 struct vfsmount *mnt = file->f_path.mnt;
238 struct dentry *dentry = file->f_path.dentry;
239 struct inode *inode = dentry->d_inode;
240
241 put_write_access(inode);
242
243 if (special_file(inode->i_mode))
244 return;
245 if (file_check_writeable(file) != 0)
246 return;
247 mnt_drop_write(mnt);
248 file_release_write(file);
249}
250EXPORT_SYMBOL_GPL(drop_file_write_access);
251
214/* __fput is called from task context when aio completion releases the last 252/* __fput is called from task context when aio completion releases the last
215 * last use of a struct file *. Do not use otherwise. 253 * last use of a struct file *. Do not use otherwise.
216 */ 254 */
@@ -236,10 +274,10 @@ void __fput(struct file *file)
236 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) 274 if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL))
237 cdev_put(inode->i_cdev); 275 cdev_put(inode->i_cdev);
238 fops_put(file->f_op); 276 fops_put(file->f_op);
239 if (file->f_mode & FMODE_WRITE)
240 put_write_access(inode);
241 put_pid(file->f_owner.pid); 277 put_pid(file->f_owner.pid);
242 file_kill(file); 278 file_kill(file);
279 if (file->f_mode & FMODE_WRITE)
280 drop_file_write_access(file);
243 file->f_path.dentry = NULL; 281 file->f_path.dentry = NULL;
244 file->f_path.mnt = NULL; 282 file->f_path.mnt = NULL;
245 file_free(file); 283 file_free(file);
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index de8e64c03f73..7f7947e3dfbb 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
1config GFS2_FS 1config GFS2_FS
2 tristate "GFS2 file system support" 2 tristate "GFS2 file system support"
3 depends on EXPERIMENTAL 3 depends on EXPERIMENTAL && (64BIT || (LSF && LBD))
4 select FS_POSIX_ACL 4 select FS_POSIX_ACL
5 select CRC32 5 select CRC32
6 help 6 help
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile
index 8fff11058cee..e2350df02a07 100644
--- a/fs/gfs2/Makefile
+++ b/fs/gfs2/Makefile
@@ -1,6 +1,6 @@
1obj-$(CONFIG_GFS2_FS) += gfs2.o 1obj-$(CONFIG_GFS2_FS) += gfs2.o
2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ 2gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \
3 glops.o inode.o lm.o log.o lops.o locking.o main.o meta_io.o \ 3 glops.o inode.o log.o lops.o locking.o main.o meta_io.o \
4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ 4 mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \
5 ops_fstype.o ops_inode.o ops_super.o quota.o \ 5 ops_fstype.o ops_inode.o ops_super.o quota.o \
6 recovery.o rgrp.o super.o sys.o trans.o util.o 6 recovery.o rgrp.o super.o sys.o trans.o util.o
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1047a8c7226a..3e9bd46f27e3 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -116,7 +116,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl,
116 goto out; 116 goto out;
117 117
118 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); 118 er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea);
119 er.er_data = kmalloc(er.er_data_len, GFP_KERNEL); 119 er.er_data = kmalloc(er.er_data_len, GFP_NOFS);
120 error = -ENOMEM; 120 error = -ENOMEM;
121 if (!er.er_data) 121 if (!er.er_data)
122 goto out; 122 goto out;
@@ -222,7 +222,7 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip)
222 return error; 222 return error;
223 } 223 }
224 224
225 clone = posix_acl_clone(acl, GFP_KERNEL); 225 clone = posix_acl_clone(acl, GFP_NOFS);
226 error = -ENOMEM; 226 error = -ENOMEM;
227 if (!clone) 227 if (!clone)
228 goto out; 228 goto out;
@@ -272,7 +272,7 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr)
272 if (!acl) 272 if (!acl)
273 return gfs2_setattr_simple(ip, attr); 273 return gfs2_setattr_simple(ip, attr);
274 274
275 clone = posix_acl_clone(acl, GFP_KERNEL); 275 clone = posix_acl_clone(acl, GFP_NOFS);
276 error = -ENOMEM; 276 error = -ENOMEM;
277 if (!clone) 277 if (!clone)
278 goto out; 278 goto out;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e9456ebd3bb6..c19184f2e70e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -33,6 +33,7 @@
33 * keep it small. 33 * keep it small.
34 */ 34 */
35struct metapath { 35struct metapath {
36 struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
36 __u16 mp_list[GFS2_MAX_META_HEIGHT]; 37 __u16 mp_list[GFS2_MAX_META_HEIGHT];
37}; 38};
38 39
@@ -135,9 +136,10 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
135 /* Get a free block, fill it with the stuffed data, 136 /* Get a free block, fill it with the stuffed data,
136 and write it out to disk */ 137 and write it out to disk */
137 138
139 unsigned int n = 1;
140 block = gfs2_alloc_block(ip, &n);
138 if (isdir) { 141 if (isdir) {
139 block = gfs2_alloc_meta(ip); 142 gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1);
140
141 error = gfs2_dir_get_new_buffer(ip, block, &bh); 143 error = gfs2_dir_get_new_buffer(ip, block, &bh);
142 if (error) 144 if (error)
143 goto out_brelse; 145 goto out_brelse;
@@ -145,8 +147,6 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
145 dibh, sizeof(struct gfs2_dinode)); 147 dibh, sizeof(struct gfs2_dinode));
146 brelse(bh); 148 brelse(bh);
147 } else { 149 } else {
148 block = gfs2_alloc_data(ip);
149
150 error = gfs2_unstuffer_page(ip, dibh, block, page); 150 error = gfs2_unstuffer_page(ip, dibh, block, page);
151 if (error) 151 if (error)
152 goto out_brelse; 152 goto out_brelse;
@@ -161,12 +161,11 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page)
161 161
162 if (ip->i_di.di_size) { 162 if (ip->i_di.di_size) {
163 *(__be64 *)(di + 1) = cpu_to_be64(block); 163 *(__be64 *)(di + 1) = cpu_to_be64(block);
164 ip->i_di.di_blocks++; 164 gfs2_add_inode_blocks(&ip->i_inode, 1);
165 gfs2_set_inode_blocks(&ip->i_inode); 165 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
166 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
167 } 166 }
168 167
169 ip->i_di.di_height = 1; 168 ip->i_height = 1;
170 di->di_height = cpu_to_be16(1); 169 di->di_height = cpu_to_be16(1);
171 170
172out_brelse: 171out_brelse:
@@ -176,114 +175,13 @@ out:
176 return error; 175 return error;
177} 176}
178 177
179/**
180 * calc_tree_height - Calculate the height of a metadata tree
181 * @ip: The GFS2 inode
182 * @size: The proposed size of the file
183 *
184 * Work out how tall a metadata tree needs to be in order to accommodate a
185 * file of a particular size. If size is less than the current size of
186 * the inode, then the current size of the inode is used instead of the
187 * supplied one.
188 *
189 * Returns: the height the tree should be
190 */
191
192static unsigned int calc_tree_height(struct gfs2_inode *ip, u64 size)
193{
194 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
195 u64 *arr;
196 unsigned int max, height;
197
198 if (ip->i_di.di_size > size)
199 size = ip->i_di.di_size;
200
201 if (gfs2_is_dir(ip)) {
202 arr = sdp->sd_jheightsize;
203 max = sdp->sd_max_jheight;
204 } else {
205 arr = sdp->sd_heightsize;
206 max = sdp->sd_max_height;
207 }
208
209 for (height = 0; height < max; height++)
210 if (arr[height] >= size)
211 break;
212
213 return height;
214}
215
216/**
217 * build_height - Build a metadata tree of the requested height
218 * @ip: The GFS2 inode
219 * @height: The height to build to
220 *
221 *
222 * Returns: errno
223 */
224
225static int build_height(struct inode *inode, unsigned height)
226{
227 struct gfs2_inode *ip = GFS2_I(inode);
228 unsigned new_height = height - ip->i_di.di_height;
229 struct buffer_head *dibh;
230 struct buffer_head *blocks[GFS2_MAX_META_HEIGHT];
231 struct gfs2_dinode *di;
232 int error;
233 __be64 *bp;
234 u64 bn;
235 unsigned n;
236
237 if (height <= ip->i_di.di_height)
238 return 0;
239
240 error = gfs2_meta_inode_buffer(ip, &dibh);
241 if (error)
242 return error;
243
244 for(n = 0; n < new_height; n++) {
245 bn = gfs2_alloc_meta(ip);
246 blocks[n] = gfs2_meta_new(ip->i_gl, bn);
247 gfs2_trans_add_bh(ip->i_gl, blocks[n], 1);
248 }
249
250 n = 0;
251 bn = blocks[0]->b_blocknr;
252 if (new_height > 1) {
253 for(; n < new_height-1; n++) {
254 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN,
255 GFS2_FORMAT_IN);
256 gfs2_buffer_clear_tail(blocks[n],
257 sizeof(struct gfs2_meta_header));
258 bp = (__be64 *)(blocks[n]->b_data +
259 sizeof(struct gfs2_meta_header));
260 *bp = cpu_to_be64(blocks[n+1]->b_blocknr);
261 brelse(blocks[n]);
262 blocks[n] = NULL;
263 }
264 }
265 gfs2_metatype_set(blocks[n], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
266 gfs2_buffer_copy_tail(blocks[n], sizeof(struct gfs2_meta_header),
267 dibh, sizeof(struct gfs2_dinode));
268 brelse(blocks[n]);
269 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
270 di = (struct gfs2_dinode *)dibh->b_data;
271 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
272 *(__be64 *)(di + 1) = cpu_to_be64(bn);
273 ip->i_di.di_height += new_height;
274 ip->i_di.di_blocks += new_height;
275 gfs2_set_inode_blocks(&ip->i_inode);
276 di->di_height = cpu_to_be16(ip->i_di.di_height);
277 di->di_blocks = cpu_to_be64(ip->i_di.di_blocks);
278 brelse(dibh);
279 return error;
280}
281 178
282/** 179/**
283 * find_metapath - Find path through the metadata tree 180 * find_metapath - Find path through the metadata tree
284 * @ip: The inode pointer 181 * @sdp: The superblock
285 * @mp: The metapath to return the result in 182 * @mp: The metapath to return the result in
286 * @block: The disk block to look up 183 * @block: The disk block to look up
184 * @height: The pre-calculated height of the metadata tree
287 * 185 *
288 * This routine returns a struct metapath structure that defines a path 186 * This routine returns a struct metapath structure that defines a path
289 * through the metadata of inode "ip" to get to block "block". 187 * through the metadata of inode "ip" to get to block "block".
@@ -338,21 +236,29 @@ static int build_height(struct inode *inode, unsigned height)
338 * 236 *
339 */ 237 */
340 238
341static void find_metapath(struct gfs2_inode *ip, u64 block, 239static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
342 struct metapath *mp) 240 struct metapath *mp, unsigned int height)
343{ 241{
344 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
345 u64 b = block;
346 unsigned int i; 242 unsigned int i;
347 243
348 for (i = ip->i_di.di_height; i--;) 244 for (i = height; i--;)
349 mp->mp_list[i] = do_div(b, sdp->sd_inptrs); 245 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
246
247}
350 248
249static inline unsigned int zero_metapath_length(const struct metapath *mp,
250 unsigned height)
251{
252 unsigned int i;
253 for (i = 0; i < height - 1; i++) {
254 if (mp->mp_list[i] != 0)
255 return i;
256 }
257 return height;
351} 258}
352 259
353/** 260/**
354 * metapointer - Return pointer to start of metadata in a buffer 261 * metapointer - Return pointer to start of metadata in a buffer
355 * @bh: The buffer
356 * @height: The metadata height (0 = dinode) 262 * @height: The metadata height (0 = dinode)
357 * @mp: The metapath 263 * @mp: The metapath
358 * 264 *
@@ -361,93 +267,302 @@ static void find_metapath(struct gfs2_inode *ip, u64 block,
361 * metadata tree. 267 * metadata tree.
362 */ 268 */
363 269
364static inline __be64 *metapointer(struct buffer_head *bh, int *boundary, 270static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
365 unsigned int height, const struct metapath *mp)
366{ 271{
272 struct buffer_head *bh = mp->mp_bh[height];
367 unsigned int head_size = (height > 0) ? 273 unsigned int head_size = (height > 0) ?
368 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode); 274 sizeof(struct gfs2_meta_header) : sizeof(struct gfs2_dinode);
369 __be64 *ptr; 275 return ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
370 *boundary = 0;
371 ptr = ((__be64 *)(bh->b_data + head_size)) + mp->mp_list[height];
372 if (ptr + 1 == (__be64 *)(bh->b_data + bh->b_size))
373 *boundary = 1;
374 return ptr;
375} 276}
376 277
377/** 278/**
378 * lookup_block - Get the next metadata block in metadata tree 279 * lookup_metapath - Walk the metadata tree to a specific point
379 * @ip: The GFS2 inode 280 * @ip: The inode
380 * @bh: Buffer containing the pointers to metadata blocks
381 * @height: The height of the tree (0 = dinode)
382 * @mp: The metapath 281 * @mp: The metapath
383 * @create: Non-zero if we may create a new meatdata block
384 * @new: Used to indicate if we did create a new metadata block
385 * @block: the returned disk block number
386 * 282 *
387 * Given a metatree, complete to a particular height, checks to see if the next 283 * Assumes that the inode's buffer has already been looked up and
388 * height of the tree exists. If not the next height of the tree is created. 284 * hooked onto mp->mp_bh[0] and that the metapath has been initialised
389 * The block number of the next height of the metadata tree is returned. 285 * by find_metapath().
286 *
287 * If this function encounters part of the tree which has not been
288 * allocated, it returns the current height of the tree at the point
289 * at which it found the unallocated block. Blocks which are found are
290 * added to the mp->mp_bh[] list.
390 * 291 *
292 * Returns: error or height of metadata tree
391 */ 293 */
392 294
393static int lookup_block(struct gfs2_inode *ip, struct buffer_head *bh, 295static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
394 unsigned int height, struct metapath *mp, int create,
395 int *new, u64 *block)
396{ 296{
397 int boundary; 297 unsigned int end_of_metadata = ip->i_height - 1;
398 __be64 *ptr = metapointer(bh, &boundary, height, mp); 298 unsigned int x;
299 __be64 *ptr;
300 u64 dblock;
301 int ret;
399 302
400 if (*ptr) { 303 for (x = 0; x < end_of_metadata; x++) {
401 *block = be64_to_cpu(*ptr); 304 ptr = metapointer(x, mp);
402 return boundary; 305 dblock = be64_to_cpu(*ptr);
403 } 306 if (!dblock)
307 return x + 1;
404 308
405 *block = 0; 309 ret = gfs2_meta_indirect_buffer(ip, x+1, dblock, 0, &mp->mp_bh[x+1]);
310 if (ret)
311 return ret;
312 }
406 313
407 if (!create) 314 return ip->i_height;
408 return 0; 315}
409 316
410 if (height == ip->i_di.di_height - 1 && !gfs2_is_dir(ip)) 317static inline void release_metapath(struct metapath *mp)
411 *block = gfs2_alloc_data(ip); 318{
412 else 319 int i;
413 *block = gfs2_alloc_meta(ip);
414 320
415 gfs2_trans_add_bh(ip->i_gl, bh, 1); 321 for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
322 if (mp->mp_bh[i] == NULL)
323 break;
324 brelse(mp->mp_bh[i]);
325 }
326}
416 327
417 *ptr = cpu_to_be64(*block); 328/**
418 ip->i_di.di_blocks++; 329 * gfs2_extent_length - Returns length of an extent of blocks
419 gfs2_set_inode_blocks(&ip->i_inode); 330 * @start: Start of the buffer
331 * @len: Length of the buffer in bytes
332 * @ptr: Current position in the buffer
333 * @limit: Max extent length to return (0 = unlimited)
334 * @eob: Set to 1 if we hit "end of block"
335 *
336 * If the first block is zero (unallocated) it will return the number of
337 * unallocated blocks in the extent, otherwise it will return the number
338 * of contiguous blocks in the extent.
339 *
340 * Returns: The length of the extent (minimum of one block)
341 */
420 342
421 *new = 1; 343static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
422 return 0; 344{
345 const __be64 *end = (start + len);
346 const __be64 *first = ptr;
347 u64 d = be64_to_cpu(*ptr);
348
349 *eob = 0;
350 do {
351 ptr++;
352 if (ptr >= end)
353 break;
354 if (limit && --limit == 0)
355 break;
356 if (d)
357 d++;
358 } while(be64_to_cpu(*ptr) == d);
359 if (ptr >= end)
360 *eob = 1;
361 return (ptr - first);
423} 362}
424 363
425static inline void bmap_lock(struct inode *inode, int create) 364static inline void bmap_lock(struct gfs2_inode *ip, int create)
426{ 365{
427 struct gfs2_inode *ip = GFS2_I(inode);
428 if (create) 366 if (create)
429 down_write(&ip->i_rw_mutex); 367 down_write(&ip->i_rw_mutex);
430 else 368 else
431 down_read(&ip->i_rw_mutex); 369 down_read(&ip->i_rw_mutex);
432} 370}
433 371
434static inline void bmap_unlock(struct inode *inode, int create) 372static inline void bmap_unlock(struct gfs2_inode *ip, int create)
435{ 373{
436 struct gfs2_inode *ip = GFS2_I(inode);
437 if (create) 374 if (create)
438 up_write(&ip->i_rw_mutex); 375 up_write(&ip->i_rw_mutex);
439 else 376 else
440 up_read(&ip->i_rw_mutex); 377 up_read(&ip->i_rw_mutex);
441} 378}
442 379
380static inline __be64 *gfs2_indirect_init(struct metapath *mp,
381 struct gfs2_glock *gl, unsigned int i,
382 unsigned offset, u64 bn)
383{
384 __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
385 ((i > 1) ? sizeof(struct gfs2_meta_header) :
386 sizeof(struct gfs2_dinode)));
387 BUG_ON(i < 1);
388 BUG_ON(mp->mp_bh[i] != NULL);
389 mp->mp_bh[i] = gfs2_meta_new(gl, bn);
390 gfs2_trans_add_bh(gl, mp->mp_bh[i], 1);
391 gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
392 gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
393 ptr += offset;
394 *ptr = cpu_to_be64(bn);
395 return ptr;
396}
397
398enum alloc_state {
399 ALLOC_DATA = 0,
400 ALLOC_GROW_DEPTH = 1,
401 ALLOC_GROW_HEIGHT = 2,
402 /* ALLOC_UNSTUFF = 3, TBD and rather complicated */
403};
404
405/**
406 * gfs2_bmap_alloc - Build a metadata tree of the requested height
407 * @inode: The GFS2 inode
408 * @lblock: The logical starting block of the extent
409 * @bh_map: This is used to return the mapping details
410 * @mp: The metapath
411 * @sheight: The starting height (i.e. whats already mapped)
412 * @height: The height to build to
413 * @maxlen: The max number of data blocks to alloc
414 *
415 * In this routine we may have to alloc:
416 * i) Indirect blocks to grow the metadata tree height
417 * ii) Indirect blocks to fill in lower part of the metadata tree
418 * iii) Data blocks
419 *
420 * The function is in two parts. The first part works out the total
421 * number of blocks which we need. The second part does the actual
422 * allocation asking for an extent at a time (if enough contiguous free
423 * blocks are available, there will only be one request per bmap call)
424 * and uses the state machine to initialise the blocks in order.
425 *
426 * Returns: errno on error
427 */
428
429static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
430 struct buffer_head *bh_map, struct metapath *mp,
431 const unsigned int sheight,
432 const unsigned int height,
433 const unsigned int maxlen)
434{
435 struct gfs2_inode *ip = GFS2_I(inode);
436 struct gfs2_sbd *sdp = GFS2_SB(inode);
437 struct buffer_head *dibh = mp->mp_bh[0];
438 u64 bn, dblock = 0;
439 unsigned n, i, blks, alloced = 0, iblks = 0, zmpl = 0;
440 unsigned dblks = 0;
441 unsigned ptrs_per_blk;
442 const unsigned end_of_metadata = height - 1;
443 int eob = 0;
444 enum alloc_state state;
445 __be64 *ptr;
446 __be64 zero_bn = 0;
447
448 BUG_ON(sheight < 1);
449 BUG_ON(dibh == NULL);
450
451 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
452
453 if (height == sheight) {
454 struct buffer_head *bh;
455 /* Bottom indirect block exists, find unalloced extent size */
456 ptr = metapointer(end_of_metadata, mp);
457 bh = mp->mp_bh[end_of_metadata];
458 dblks = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen,
459 &eob);
460 BUG_ON(dblks < 1);
461 state = ALLOC_DATA;
462 } else {
463 /* Need to allocate indirect blocks */
464 ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
465 dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
466 if (height == ip->i_height) {
467 /* Writing into existing tree, extend tree down */
468 iblks = height - sheight;
469 state = ALLOC_GROW_DEPTH;
470 } else {
471 /* Building up tree height */
472 state = ALLOC_GROW_HEIGHT;
473 iblks = height - ip->i_height;
474 zmpl = zero_metapath_length(mp, height);
475 iblks -= zmpl;
476 iblks += height;
477 }
478 }
479
480 /* start of the second part of the function (state machine) */
481
482 blks = dblks + iblks;
483 i = sheight;
484 do {
485 n = blks - alloced;
486 bn = gfs2_alloc_block(ip, &n);
487 alloced += n;
488 if (state != ALLOC_DATA || gfs2_is_jdata(ip))
489 gfs2_trans_add_unrevoke(sdp, bn, n);
490 switch (state) {
491 /* Growing height of tree */
492 case ALLOC_GROW_HEIGHT:
493 if (i == 1) {
494 ptr = (__be64 *)(dibh->b_data +
495 sizeof(struct gfs2_dinode));
496 zero_bn = *ptr;
497 }
498 for (; i - 1 < height - ip->i_height && n > 0; i++, n--)
499 gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
500 if (i - 1 == height - ip->i_height) {
501 i--;
502 gfs2_buffer_copy_tail(mp->mp_bh[i],
503 sizeof(struct gfs2_meta_header),
504 dibh, sizeof(struct gfs2_dinode));
505 gfs2_buffer_clear_tail(dibh,
506 sizeof(struct gfs2_dinode) +
507 sizeof(__be64));
508 ptr = (__be64 *)(mp->mp_bh[i]->b_data +
509 sizeof(struct gfs2_meta_header));
510 *ptr = zero_bn;
511 state = ALLOC_GROW_DEPTH;
512 for(i = zmpl; i < height; i++) {
513 if (mp->mp_bh[i] == NULL)
514 break;
515 brelse(mp->mp_bh[i]);
516 mp->mp_bh[i] = NULL;
517 }
518 i = zmpl;
519 }
520 if (n == 0)
521 break;
522 /* Branching from existing tree */
523 case ALLOC_GROW_DEPTH:
524 if (i > 1 && i < height)
525 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1);
526 for (; i < height && n > 0; i++, n--)
527 gfs2_indirect_init(mp, ip->i_gl, i,
528 mp->mp_list[i-1], bn++);
529 if (i == height)
530 state = ALLOC_DATA;
531 if (n == 0)
532 break;
533 /* Tree complete, adding data blocks */
534 case ALLOC_DATA:
535 BUG_ON(n > dblks);
536 BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
537 gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1);
538 dblks = n;
539 ptr = metapointer(end_of_metadata, mp);
540 dblock = bn;
541 while (n-- > 0)
542 *ptr++ = cpu_to_be64(bn++);
543 break;
544 }
545 } while (state != ALLOC_DATA);
546
547 ip->i_height = height;
548 gfs2_add_inode_blocks(&ip->i_inode, alloced);
549 gfs2_dinode_out(ip, mp->mp_bh[0]->b_data);
550 map_bh(bh_map, inode->i_sb, dblock);
551 bh_map->b_size = dblks << inode->i_blkbits;
552 set_buffer_new(bh_map);
553 return 0;
554}
555
443/** 556/**
444 * gfs2_block_map - Map a block from an inode to a disk block 557 * gfs2_block_map - Map a block from an inode to a disk block
445 * @inode: The inode 558 * @inode: The inode
446 * @lblock: The logical block number 559 * @lblock: The logical block number
447 * @bh_map: The bh to be mapped 560 * @bh_map: The bh to be mapped
561 * @create: True if its ok to alloc blocks to satify the request
448 * 562 *
449 * Find the block number on the current device which corresponds to an 563 * Sets buffer_mapped() if successful, sets buffer_boundary() if a
450 * inode's block. If the block had to be created, "new" will be set. 564 * read of metadata will be required before the next block can be
565 * mapped. Sets buffer_new() if new blocks were allocated.
451 * 566 *
452 * Returns: errno 567 * Returns: errno
453 */ 568 */
@@ -457,97 +572,78 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
457{ 572{
458 struct gfs2_inode *ip = GFS2_I(inode); 573 struct gfs2_inode *ip = GFS2_I(inode);
459 struct gfs2_sbd *sdp = GFS2_SB(inode); 574 struct gfs2_sbd *sdp = GFS2_SB(inode);
460 struct buffer_head *bh; 575 unsigned int bsize = sdp->sd_sb.sb_bsize;
461 unsigned int bsize; 576 const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
462 unsigned int height; 577 const u64 *arr = sdp->sd_heightsize;
463 unsigned int end_of_metadata; 578 __be64 *ptr;
464 unsigned int x;
465 int error = 0;
466 int new = 0;
467 u64 dblock = 0;
468 int boundary;
469 unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
470 struct metapath mp;
471 u64 size; 579 u64 size;
472 struct buffer_head *dibh = NULL; 580 struct metapath mp;
581 int ret;
582 int eob;
583 unsigned int len;
584 struct buffer_head *bh;
585 u8 height;
473 586
474 BUG_ON(maxlen == 0); 587 BUG_ON(maxlen == 0);
475 588
476 if (gfs2_assert_warn(sdp, !gfs2_is_stuffed(ip))) 589 memset(mp.mp_bh, 0, sizeof(mp.mp_bh));
477 return 0; 590 bmap_lock(ip, create);
478
479 bmap_lock(inode, create);
480 clear_buffer_mapped(bh_map); 591 clear_buffer_mapped(bh_map);
481 clear_buffer_new(bh_map); 592 clear_buffer_new(bh_map);
482 clear_buffer_boundary(bh_map); 593 clear_buffer_boundary(bh_map);
483 bsize = gfs2_is_dir(ip) ? sdp->sd_jbsize : sdp->sd_sb.sb_bsize; 594 if (gfs2_is_dir(ip)) {
484 size = (lblock + 1) * bsize; 595 bsize = sdp->sd_jbsize;
485 596 arr = sdp->sd_jheightsize;
486 if (size > ip->i_di.di_size) {
487 height = calc_tree_height(ip, size);
488 if (ip->i_di.di_height < height) {
489 if (!create)
490 goto out_ok;
491
492 error = build_height(inode, height);
493 if (error)
494 goto out_fail;
495 }
496 } 597 }
497 598
498 find_metapath(ip, lblock, &mp); 599 ret = gfs2_meta_inode_buffer(ip, &mp.mp_bh[0]);
499 end_of_metadata = ip->i_di.di_height - 1; 600 if (ret)
500 error = gfs2_meta_inode_buffer(ip, &bh); 601 goto out;
501 if (error)
502 goto out_fail;
503 dibh = bh;
504 get_bh(dibh);
505 602
506 for (x = 0; x < end_of_metadata; x++) { 603 height = ip->i_height;
507 lookup_block(ip, bh, x, &mp, create, &new, &dblock); 604 size = (lblock + 1) * bsize;
508 brelse(bh); 605 while (size > arr[height])
509 if (!dblock) 606 height++;
510 goto out_ok; 607 find_metapath(sdp, lblock, &mp, height);
608 ret = 1;
609 if (height > ip->i_height || gfs2_is_stuffed(ip))
610 goto do_alloc;
611 ret = lookup_metapath(ip, &mp);
612 if (ret < 0)
613 goto out;
614 if (ret != ip->i_height)
615 goto do_alloc;
616 ptr = metapointer(ip->i_height - 1, &mp);
617 if (*ptr == 0)
618 goto do_alloc;
619 map_bh(bh_map, inode->i_sb, be64_to_cpu(*ptr));
620 bh = mp.mp_bh[ip->i_height - 1];
621 len = gfs2_extent_length(bh->b_data, bh->b_size, ptr, maxlen, &eob);
622 bh_map->b_size = (len << inode->i_blkbits);
623 if (eob)
624 set_buffer_boundary(bh_map);
625 ret = 0;
626out:
627 release_metapath(&mp);
628 bmap_unlock(ip, create);
629 return ret;
511 630
512 error = gfs2_meta_indirect_buffer(ip, x+1, dblock, new, &bh); 631do_alloc:
513 if (error) 632 /* All allocations are done here, firstly check create flag */
514 goto out_fail; 633 if (!create) {
634 BUG_ON(gfs2_is_stuffed(ip));
635 ret = 0;
636 goto out;
515 } 637 }
516 638
517 boundary = lookup_block(ip, bh, end_of_metadata, &mp, create, &new, &dblock); 639 /* At this point ret is the tree depth of already allocated blocks */
518 if (dblock) { 640 ret = gfs2_bmap_alloc(inode, lblock, bh_map, &mp, ret, height, maxlen);
519 map_bh(bh_map, inode->i_sb, dblock); 641 goto out;
520 if (boundary)
521 set_buffer_boundary(bh_map);
522 if (new) {
523 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
524 gfs2_dinode_out(ip, dibh->b_data);
525 set_buffer_new(bh_map);
526 goto out_brelse;
527 }
528 while(--maxlen && !buffer_boundary(bh_map)) {
529 u64 eblock;
530
531 mp.mp_list[end_of_metadata]++;
532 boundary = lookup_block(ip, bh, end_of_metadata, &mp, 0, &new, &eblock);
533 if (eblock != ++dblock)
534 break;
535 bh_map->b_size += (1 << inode->i_blkbits);
536 if (boundary)
537 set_buffer_boundary(bh_map);
538 }
539 }
540out_brelse:
541 brelse(bh);
542out_ok:
543 error = 0;
544out_fail:
545 if (dibh)
546 brelse(dibh);
547 bmap_unlock(inode, create);
548 return error;
549} 642}
550 643
644/*
645 * Deprecated: do not use in new code
646 */
551int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen) 647int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen)
552{ 648{
553 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 }; 649 struct buffer_head bh = { .b_state = 0, .b_blocknr = 0 };
@@ -558,7 +654,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
558 BUG_ON(!dblock); 654 BUG_ON(!dblock);
559 BUG_ON(!new); 655 BUG_ON(!new);
560 656
561 bh.b_size = 1 << (inode->i_blkbits + 5); 657 bh.b_size = 1 << (inode->i_blkbits + (create ? 0 : 5));
562 ret = gfs2_block_map(inode, lblock, &bh, create); 658 ret = gfs2_block_map(inode, lblock, &bh, create);
563 *extlen = bh.b_size >> inode->i_blkbits; 659 *extlen = bh.b_size >> inode->i_blkbits;
564 *dblock = bh.b_blocknr; 660 *dblock = bh.b_blocknr;
@@ -621,7 +717,7 @@ static int recursive_scan(struct gfs2_inode *ip, struct buffer_head *dibh,
621 if (error) 717 if (error)
622 goto out; 718 goto out;
623 719
624 if (height < ip->i_di.di_height - 1) 720 if (height < ip->i_height - 1)
625 for (; top < bottom; top++, first = 0) { 721 for (; top < bottom; top++, first = 0) {
626 if (!*top) 722 if (!*top)
627 continue; 723 continue;
@@ -679,7 +775,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
679 sm->sm_first = 0; 775 sm->sm_first = 0;
680 } 776 }
681 777
682 metadata = (height != ip->i_di.di_height - 1); 778 metadata = (height != ip->i_height - 1);
683 if (metadata) 779 if (metadata)
684 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs; 780 revokes = (height) ? sdp->sd_inptrs : sdp->sd_diptrs;
685 781
@@ -713,7 +809,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
713 else 809 else
714 goto out; /* Nothing to do */ 810 goto out; /* Nothing to do */
715 811
716 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); 812 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
717 813
718 for (x = 0; x < rlist.rl_rgrps; x++) { 814 for (x = 0; x < rlist.rl_rgrps; x++) {
719 struct gfs2_rgrpd *rgd; 815 struct gfs2_rgrpd *rgd;
@@ -760,10 +856,7 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
760 } 856 }
761 857
762 *p = 0; 858 *p = 0;
763 if (!ip->i_di.di_blocks) 859 gfs2_add_inode_blocks(&ip->i_inode, -1);
764 gfs2_consist_inode(ip);
765 ip->i_di.di_blocks--;
766 gfs2_set_inode_blocks(&ip->i_inode);
767 } 860 }
768 if (bstart) { 861 if (bstart) {
769 if (metadata) 862 if (metadata)
@@ -804,19 +897,16 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
804 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 897 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
805 struct gfs2_alloc *al; 898 struct gfs2_alloc *al;
806 struct buffer_head *dibh; 899 struct buffer_head *dibh;
807 unsigned int h;
808 int error; 900 int error;
809 901
810 al = gfs2_alloc_get(ip); 902 al = gfs2_alloc_get(ip);
903 if (!al)
904 return -ENOMEM;
811 905
812 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 906 error = gfs2_quota_lock_check(ip);
813 if (error) 907 if (error)
814 goto out; 908 goto out;
815 909
816 error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
817 if (error)
818 goto out_gunlock_q;
819
820 al->al_requested = sdp->sd_max_height + RES_DATA; 910 al->al_requested = sdp->sd_max_height + RES_DATA;
821 911
822 error = gfs2_inplace_reserve(ip); 912 error = gfs2_inplace_reserve(ip);
@@ -829,34 +919,25 @@ static int do_grow(struct gfs2_inode *ip, u64 size)
829 if (error) 919 if (error)
830 goto out_ipres; 920 goto out_ipres;
831 921
922 error = gfs2_meta_inode_buffer(ip, &dibh);
923 if (error)
924 goto out_end_trans;
925
832 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) { 926 if (size > sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode)) {
833 if (gfs2_is_stuffed(ip)) { 927 if (gfs2_is_stuffed(ip)) {
834 error = gfs2_unstuff_dinode(ip, NULL); 928 error = gfs2_unstuff_dinode(ip, NULL);
835 if (error) 929 if (error)
836 goto out_end_trans; 930 goto out_brelse;
837 }
838
839 h = calc_tree_height(ip, size);
840 if (ip->i_di.di_height < h) {
841 down_write(&ip->i_rw_mutex);
842 error = build_height(&ip->i_inode, h);
843 up_write(&ip->i_rw_mutex);
844 if (error)
845 goto out_end_trans;
846 } 931 }
847 } 932 }
848 933
849 ip->i_di.di_size = size; 934 ip->i_di.di_size = size;
850 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 935 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
851
852 error = gfs2_meta_inode_buffer(ip, &dibh);
853 if (error)
854 goto out_end_trans;
855
856 gfs2_trans_add_bh(ip->i_gl, dibh, 1); 936 gfs2_trans_add_bh(ip->i_gl, dibh, 1);
857 gfs2_dinode_out(ip, dibh->b_data); 937 gfs2_dinode_out(ip, dibh->b_data);
858 brelse(dibh);
859 938
939out_brelse:
940 brelse(dibh);
860out_end_trans: 941out_end_trans:
861 gfs2_trans_end(sdp); 942 gfs2_trans_end(sdp);
862out_ipres: 943out_ipres:
@@ -986,7 +1067,8 @@ out:
986 1067
987static int trunc_dealloc(struct gfs2_inode *ip, u64 size) 1068static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
988{ 1069{
989 unsigned int height = ip->i_di.di_height; 1070 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1071 unsigned int height = ip->i_height;
990 u64 lblock; 1072 u64 lblock;
991 struct metapath mp; 1073 struct metapath mp;
992 int error; 1074 int error;
@@ -994,10 +1076,11 @@ static int trunc_dealloc(struct gfs2_inode *ip, u64 size)
994 if (!size) 1076 if (!size)
995 lblock = 0; 1077 lblock = 0;
996 else 1078 else
997 lblock = (size - 1) >> GFS2_SB(&ip->i_inode)->sd_sb.sb_bsize_shift; 1079 lblock = (size - 1) >> sdp->sd_sb.sb_bsize_shift;
998 1080
999 find_metapath(ip, lblock, &mp); 1081 find_metapath(sdp, lblock, &mp, ip->i_height);
1000 gfs2_alloc_get(ip); 1082 if (!gfs2_alloc_get(ip))
1083 return -ENOMEM;
1001 1084
1002 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1085 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1003 if (error) 1086 if (error)
@@ -1037,10 +1120,8 @@ static int trunc_end(struct gfs2_inode *ip)
1037 goto out; 1120 goto out;
1038 1121
1039 if (!ip->i_di.di_size) { 1122 if (!ip->i_di.di_size) {
1040 ip->i_di.di_height = 0; 1123 ip->i_height = 0;
1041 ip->i_di.di_goal_meta = 1124 ip->i_goal = ip->i_no_addr;
1042 ip->i_di.di_goal_data =
1043 ip->i_no_addr;
1044 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); 1125 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1045 } 1126 }
1046 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; 1127 ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME;
@@ -1197,10 +1278,9 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1197 unsigned int len, int *alloc_required) 1278 unsigned int len, int *alloc_required)
1198{ 1279{
1199 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1280 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1200 u64 lblock, lblock_stop, dblock; 1281 struct buffer_head bh;
1201 u32 extlen; 1282 unsigned int shift;
1202 int new = 0; 1283 u64 lblock, lblock_stop, size;
1203 int error = 0;
1204 1284
1205 *alloc_required = 0; 1285 *alloc_required = 0;
1206 1286
@@ -1214,6 +1294,8 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1214 return 0; 1294 return 0;
1215 } 1295 }
1216 1296
1297 *alloc_required = 1;
1298 shift = sdp->sd_sb.sb_bsize_shift;
1217 if (gfs2_is_dir(ip)) { 1299 if (gfs2_is_dir(ip)) {
1218 unsigned int bsize = sdp->sd_jbsize; 1300 unsigned int bsize = sdp->sd_jbsize;
1219 lblock = offset; 1301 lblock = offset;
@@ -1221,27 +1303,25 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
1221 lblock_stop = offset + len + bsize - 1; 1303 lblock_stop = offset + len + bsize - 1;
1222 do_div(lblock_stop, bsize); 1304 do_div(lblock_stop, bsize);
1223 } else { 1305 } else {
1224 unsigned int shift = sdp->sd_sb.sb_bsize_shift;
1225 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; 1306 u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift;
1226 lblock = offset >> shift; 1307 lblock = offset >> shift;
1227 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; 1308 lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
1228 if (lblock_stop > end_of_file) { 1309 if (lblock_stop > end_of_file)
1229 *alloc_required = 1;
1230 return 0; 1310 return 0;
1231 }
1232 } 1311 }
1233 1312
1234 for (; lblock < lblock_stop; lblock += extlen) { 1313 size = (lblock_stop - lblock) << shift;
1235 error = gfs2_extent_map(&ip->i_inode, lblock, &new, &dblock, &extlen); 1314 do {
1236 if (error) 1315 bh.b_state = 0;
1237 return error; 1316 bh.b_size = size;
1238 1317 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
1239 if (!dblock) { 1318 if (!buffer_mapped(&bh))
1240 *alloc_required = 1;
1241 return 0; 1319 return 0;
1242 } 1320 size -= bh.b_size;
1243 } 1321 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
1322 } while(size > 0);
1244 1323
1324 *alloc_required = 0;
1245 return 0; 1325 return 0;
1246} 1326}
1247 1327
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c34709512b19..eed040d8ba3a 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -159,6 +159,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
159 unsigned int o; 159 unsigned int o;
160 int copied = 0; 160 int copied = 0;
161 int error = 0; 161 int error = 0;
162 int new = 0;
162 163
163 if (!size) 164 if (!size)
164 return 0; 165 return 0;
@@ -183,7 +184,6 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf,
183 while (copied < size) { 184 while (copied < size) {
184 unsigned int amount; 185 unsigned int amount;
185 struct buffer_head *bh; 186 struct buffer_head *bh;
186 int new = 0;
187 187
188 amount = size - copied; 188 amount = size - copied;
189 if (amount > sdp->sd_sb.sb_bsize - o) 189 if (amount > sdp->sd_sb.sb_bsize - o)
@@ -757,7 +757,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
757 757
758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 758 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
759 struct gfs2_leaf *leaf; 759 struct gfs2_leaf *leaf;
760 unsigned hsize = 1 << ip->i_di.di_depth; 760 unsigned hsize = 1 << ip->i_depth;
761 unsigned index; 761 unsigned index;
762 u64 ln; 762 u64 ln;
763 if (hsize * sizeof(u64) != ip->i_di.di_size) { 763 if (hsize * sizeof(u64) != ip->i_di.di_size) {
@@ -765,7 +765,7 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode,
765 return ERR_PTR(-EIO); 765 return ERR_PTR(-EIO);
766 } 766 }
767 767
768 index = name->hash >> (32 - ip->i_di.di_depth); 768 index = name->hash >> (32 - ip->i_depth);
769 error = get_first_leaf(ip, index, &bh); 769 error = get_first_leaf(ip, index, &bh);
770 if (error) 770 if (error)
771 return ERR_PTR(error); 771 return ERR_PTR(error);
@@ -803,14 +803,15 @@ got_dent:
803static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth) 803static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, u16 depth)
804{ 804{
805 struct gfs2_inode *ip = GFS2_I(inode); 805 struct gfs2_inode *ip = GFS2_I(inode);
806 u64 bn = gfs2_alloc_meta(ip); 806 unsigned int n = 1;
807 u64 bn = gfs2_alloc_block(ip, &n);
807 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); 808 struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn);
808 struct gfs2_leaf *leaf; 809 struct gfs2_leaf *leaf;
809 struct gfs2_dirent *dent; 810 struct gfs2_dirent *dent;
810 struct qstr name = { .name = "", .len = 0, .hash = 0 }; 811 struct qstr name = { .name = "", .len = 0, .hash = 0 };
811 if (!bh) 812 if (!bh)
812 return NULL; 813 return NULL;
813 814 gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1);
814 gfs2_trans_add_bh(ip->i_gl, bh, 1); 815 gfs2_trans_add_bh(ip->i_gl, bh, 1);
815 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); 816 gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF);
816 leaf = (struct gfs2_leaf *)bh->b_data; 817 leaf = (struct gfs2_leaf *)bh->b_data;
@@ -905,12 +906,11 @@ static int dir_make_exhash(struct inode *inode)
905 *lp = cpu_to_be64(bn); 906 *lp = cpu_to_be64(bn);
906 907
907 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; 908 dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2;
908 dip->i_di.di_blocks++; 909 gfs2_add_inode_blocks(&dip->i_inode, 1);
909 gfs2_set_inode_blocks(&dip->i_inode);
910 dip->i_di.di_flags |= GFS2_DIF_EXHASH; 910 dip->i_di.di_flags |= GFS2_DIF_EXHASH;
911 911
912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; 912 for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ;
913 dip->i_di.di_depth = y; 913 dip->i_depth = y;
914 914
915 gfs2_dinode_out(dip, dibh->b_data); 915 gfs2_dinode_out(dip, dibh->b_data);
916 916
@@ -941,7 +941,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
941 int x, moved = 0; 941 int x, moved = 0;
942 int error; 942 int error;
943 943
944 index = name->hash >> (32 - dip->i_di.di_depth); 944 index = name->hash >> (32 - dip->i_depth);
945 error = get_leaf_nr(dip, index, &leaf_no); 945 error = get_leaf_nr(dip, index, &leaf_no);
946 if (error) 946 if (error)
947 return error; 947 return error;
@@ -952,7 +952,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
952 return error; 952 return error;
953 953
954 oleaf = (struct gfs2_leaf *)obh->b_data; 954 oleaf = (struct gfs2_leaf *)obh->b_data;
955 if (dip->i_di.di_depth == be16_to_cpu(oleaf->lf_depth)) { 955 if (dip->i_depth == be16_to_cpu(oleaf->lf_depth)) {
956 brelse(obh); 956 brelse(obh);
957 return 1; /* can't split */ 957 return 1; /* can't split */
958 } 958 }
@@ -967,10 +967,10 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
967 bn = nbh->b_blocknr; 967 bn = nbh->b_blocknr;
968 968
969 /* Compute the start and len of leaf pointers in the hash table. */ 969 /* Compute the start and len of leaf pointers in the hash table. */
970 len = 1 << (dip->i_di.di_depth - be16_to_cpu(oleaf->lf_depth)); 970 len = 1 << (dip->i_depth - be16_to_cpu(oleaf->lf_depth));
971 half_len = len >> 1; 971 half_len = len >> 1;
972 if (!half_len) { 972 if (!half_len) {
973 printk(KERN_WARNING "di_depth %u lf_depth %u index %u\n", dip->i_di.di_depth, be16_to_cpu(oleaf->lf_depth), index); 973 printk(KERN_WARNING "i_depth %u lf_depth %u index %u\n", dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
974 gfs2_consist_inode(dip); 974 gfs2_consist_inode(dip);
975 error = -EIO; 975 error = -EIO;
976 goto fail_brelse; 976 goto fail_brelse;
@@ -997,7 +997,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
997 kfree(lp); 997 kfree(lp);
998 998
999 /* Compute the divider */ 999 /* Compute the divider */
1000 divider = (start + half_len) << (32 - dip->i_di.di_depth); 1000 divider = (start + half_len) << (32 - dip->i_depth);
1001 1001
1002 /* Copy the entries */ 1002 /* Copy the entries */
1003 dirent_first(dip, obh, &dent); 1003 dirent_first(dip, obh, &dent);
@@ -1021,13 +1021,13 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1021 1021
1022 new->de_inum = dent->de_inum; /* No endian worries */ 1022 new->de_inum = dent->de_inum; /* No endian worries */
1023 new->de_type = dent->de_type; /* No endian worries */ 1023 new->de_type = dent->de_type; /* No endian worries */
1024 nleaf->lf_entries = cpu_to_be16(be16_to_cpu(nleaf->lf_entries)+1); 1024 be16_add_cpu(&nleaf->lf_entries, 1);
1025 1025
1026 dirent_del(dip, obh, prev, dent); 1026 dirent_del(dip, obh, prev, dent);
1027 1027
1028 if (!oleaf->lf_entries) 1028 if (!oleaf->lf_entries)
1029 gfs2_consist_inode(dip); 1029 gfs2_consist_inode(dip);
1030 oleaf->lf_entries = cpu_to_be16(be16_to_cpu(oleaf->lf_entries)-1); 1030 be16_add_cpu(&oleaf->lf_entries, -1);
1031 1031
1032 if (!prev) 1032 if (!prev)
1033 prev = dent; 1033 prev = dent;
@@ -1044,8 +1044,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
1044 error = gfs2_meta_inode_buffer(dip, &dibh); 1044 error = gfs2_meta_inode_buffer(dip, &dibh);
1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { 1045 if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) {
1046 gfs2_trans_add_bh(dip->i_gl, dibh, 1); 1046 gfs2_trans_add_bh(dip->i_gl, dibh, 1);
1047 dip->i_di.di_blocks++; 1047 gfs2_add_inode_blocks(&dip->i_inode, 1);
1048 gfs2_set_inode_blocks(&dip->i_inode);
1049 gfs2_dinode_out(dip, dibh->b_data); 1048 gfs2_dinode_out(dip, dibh->b_data);
1050 brelse(dibh); 1049 brelse(dibh);
1051 } 1050 }
@@ -1082,7 +1081,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1082 int x; 1081 int x;
1083 int error = 0; 1082 int error = 0;
1084 1083
1085 hsize = 1 << dip->i_di.di_depth; 1084 hsize = 1 << dip->i_depth;
1086 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1085 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1087 gfs2_consist_inode(dip); 1086 gfs2_consist_inode(dip);
1088 return -EIO; 1087 return -EIO;
@@ -1090,7 +1089,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1090 1089
1091 /* Allocate both the "from" and "to" buffers in one big chunk */ 1090 /* Allocate both the "from" and "to" buffers in one big chunk */
1092 1091
1093 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_KERNEL | __GFP_NOFAIL); 1092 buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL);
1094 1093
1095 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { 1094 for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) {
1096 error = gfs2_dir_read_data(dip, (char *)buf, 1095 error = gfs2_dir_read_data(dip, (char *)buf,
@@ -1125,7 +1124,7 @@ static int dir_double_exhash(struct gfs2_inode *dip)
1125 1124
1126 error = gfs2_meta_inode_buffer(dip, &dibh); 1125 error = gfs2_meta_inode_buffer(dip, &dibh);
1127 if (!gfs2_assert_withdraw(sdp, !error)) { 1126 if (!gfs2_assert_withdraw(sdp, !error)) {
1128 dip->i_di.di_depth++; 1127 dip->i_depth++;
1129 gfs2_dinode_out(dip, dibh->b_data); 1128 gfs2_dinode_out(dip, dibh->b_data);
1130 brelse(dibh); 1129 brelse(dibh);
1131 } 1130 }
@@ -1370,16 +1369,16 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1370 int error = 0; 1369 int error = 0;
1371 unsigned depth = 0; 1370 unsigned depth = 0;
1372 1371
1373 hsize = 1 << dip->i_di.di_depth; 1372 hsize = 1 << dip->i_depth;
1374 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1373 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1375 gfs2_consist_inode(dip); 1374 gfs2_consist_inode(dip);
1376 return -EIO; 1375 return -EIO;
1377 } 1376 }
1378 1377
1379 hash = gfs2_dir_offset2hash(*offset); 1378 hash = gfs2_dir_offset2hash(*offset);
1380 index = hash >> (32 - dip->i_di.di_depth); 1379 index = hash >> (32 - dip->i_depth);
1381 1380
1382 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); 1381 lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1383 if (!lp) 1382 if (!lp)
1384 return -ENOMEM; 1383 return -ENOMEM;
1385 1384
@@ -1405,7 +1404,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque,
1405 if (error) 1404 if (error)
1406 break; 1405 break;
1407 1406
1408 len = 1 << (dip->i_di.di_depth - depth); 1407 len = 1 << (dip->i_depth - depth);
1409 index = (index & ~(len - 1)) + len; 1408 index = (index & ~(len - 1)) + len;
1410 } 1409 }
1411 1410
@@ -1444,7 +1443,7 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque,
1444 1443
1445 error = -ENOMEM; 1444 error = -ENOMEM;
1446 /* 96 is max number of dirents which can be stuffed into an inode */ 1445 /* 96 is max number of dirents which can be stuffed into an inode */
1447 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_KERNEL); 1446 darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
1448 if (darr) { 1447 if (darr) {
1449 g.pdent = darr; 1448 g.pdent = darr;
1450 g.offset = 0; 1449 g.offset = 0;
@@ -1549,7 +1548,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1549 u32 index; 1548 u32 index;
1550 u64 bn; 1549 u64 bn;
1551 1550
1552 index = name->hash >> (32 - ip->i_di.di_depth); 1551 index = name->hash >> (32 - ip->i_depth);
1553 error = get_first_leaf(ip, index, &obh); 1552 error = get_first_leaf(ip, index, &obh);
1554 if (error) 1553 if (error)
1555 return error; 1554 return error;
@@ -1579,8 +1578,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name)
1579 if (error) 1578 if (error)
1580 return error; 1579 return error;
1581 gfs2_trans_add_bh(ip->i_gl, bh, 1); 1580 gfs2_trans_add_bh(ip->i_gl, bh, 1);
1582 ip->i_di.di_blocks++; 1581 gfs2_add_inode_blocks(&ip->i_inode, 1);
1583 gfs2_set_inode_blocks(&ip->i_inode);
1584 gfs2_dinode_out(ip, bh->b_data); 1582 gfs2_dinode_out(ip, bh->b_data);
1585 brelse(bh); 1583 brelse(bh);
1586 return 0; 1584 return 0;
@@ -1616,7 +1614,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1616 dent->de_type = cpu_to_be16(type); 1614 dent->de_type = cpu_to_be16(type);
1617 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { 1615 if (ip->i_di.di_flags & GFS2_DIF_EXHASH) {
1618 leaf = (struct gfs2_leaf *)bh->b_data; 1616 leaf = (struct gfs2_leaf *)bh->b_data;
1619 leaf->lf_entries = cpu_to_be16(be16_to_cpu(leaf->lf_entries) + 1); 1617 be16_add_cpu(&leaf->lf_entries, 1);
1620 } 1618 }
1621 brelse(bh); 1619 brelse(bh);
1622 error = gfs2_meta_inode_buffer(ip, &bh); 1620 error = gfs2_meta_inode_buffer(ip, &bh);
@@ -1641,7 +1639,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name,
1641 continue; 1639 continue;
1642 if (error < 0) 1640 if (error < 0)
1643 break; 1641 break;
1644 if (ip->i_di.di_depth < GFS2_DIR_MAX_DEPTH) { 1642 if (ip->i_depth < GFS2_DIR_MAX_DEPTH) {
1645 error = dir_double_exhash(ip); 1643 error = dir_double_exhash(ip);
1646 if (error) 1644 if (error)
1647 break; 1645 break;
@@ -1785,13 +1783,13 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1785 u64 leaf_no; 1783 u64 leaf_no;
1786 int error = 0; 1784 int error = 0;
1787 1785
1788 hsize = 1 << dip->i_di.di_depth; 1786 hsize = 1 << dip->i_depth;
1789 if (hsize * sizeof(u64) != dip->i_di.di_size) { 1787 if (hsize * sizeof(u64) != dip->i_di.di_size) {
1790 gfs2_consist_inode(dip); 1788 gfs2_consist_inode(dip);
1791 return -EIO; 1789 return -EIO;
1792 } 1790 }
1793 1791
1794 lp = kmalloc(sdp->sd_hash_bsize, GFP_KERNEL); 1792 lp = kmalloc(sdp->sd_hash_bsize, GFP_NOFS);
1795 if (!lp) 1793 if (!lp)
1796 return -ENOMEM; 1794 return -ENOMEM;
1797 1795
@@ -1817,7 +1815,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data)
1817 if (error) 1815 if (error)
1818 goto out; 1816 goto out;
1819 leaf = (struct gfs2_leaf *)bh->b_data; 1817 leaf = (struct gfs2_leaf *)bh->b_data;
1820 len = 1 << (dip->i_di.di_depth - be16_to_cpu(leaf->lf_depth)); 1818 len = 1 << (dip->i_depth - be16_to_cpu(leaf->lf_depth));
1821 brelse(bh); 1819 brelse(bh);
1822 1820
1823 error = lc(dip, index, len, leaf_no, data); 1821 error = lc(dip, index, len, leaf_no, data);
@@ -1866,15 +1864,18 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1866 1864
1867 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); 1865 memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
1868 1866
1869 ht = kzalloc(size, GFP_KERNEL); 1867 ht = kzalloc(size, GFP_NOFS);
1870 if (!ht) 1868 if (!ht)
1871 return -ENOMEM; 1869 return -ENOMEM;
1872 1870
1873 gfs2_alloc_get(dip); 1871 if (!gfs2_alloc_get(dip)) {
1872 error = -ENOMEM;
1873 goto out;
1874 }
1874 1875
1875 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1876 error = gfs2_quota_hold(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1876 if (error) 1877 if (error)
1877 goto out; 1878 goto out_put;
1878 1879
1879 error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh); 1880 error = gfs2_rindex_hold(sdp, &dip->i_alloc->al_ri_gh);
1880 if (error) 1881 if (error)
@@ -1894,7 +1895,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1894 l_blocks++; 1895 l_blocks++;
1895 } 1896 }
1896 1897
1897 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); 1898 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
1898 1899
1899 for (x = 0; x < rlist.rl_rgrps; x++) { 1900 for (x = 0; x < rlist.rl_rgrps; x++) {
1900 struct gfs2_rgrpd *rgd; 1901 struct gfs2_rgrpd *rgd;
@@ -1921,11 +1922,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1921 brelse(bh); 1922 brelse(bh);
1922 1923
1923 gfs2_free_meta(dip, blk, 1); 1924 gfs2_free_meta(dip, blk, 1);
1924 1925 gfs2_add_inode_blocks(&dip->i_inode, -1);
1925 if (!dip->i_di.di_blocks)
1926 gfs2_consist_inode(dip);
1927 dip->i_di.di_blocks--;
1928 gfs2_set_inode_blocks(&dip->i_inode);
1929 } 1926 }
1930 1927
1931 error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size); 1928 error = gfs2_dir_write_data(dip, ht, index * sizeof(u64), size);
@@ -1952,8 +1949,9 @@ out_rlist:
1952 gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh); 1949 gfs2_glock_dq_uninit(&dip->i_alloc->al_ri_gh);
1953out_qs: 1950out_qs:
1954 gfs2_quota_unhold(dip); 1951 gfs2_quota_unhold(dip);
1955out: 1952out_put:
1956 gfs2_alloc_put(dip); 1953 gfs2_alloc_put(dip);
1954out:
1957 kfree(ht); 1955 kfree(ht);
1958 return error; 1956 return error;
1959} 1957}
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c
index bee99704ea10..e3f76f451b0a 100644
--- a/fs/gfs2/eattr.c
+++ b/fs/gfs2/eattr.c
@@ -277,10 +277,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
277 } 277 }
278 278
279 *dataptrs = 0; 279 *dataptrs = 0;
280 if (!ip->i_di.di_blocks) 280 gfs2_add_inode_blocks(&ip->i_inode, -1);
281 gfs2_consist_inode(ip);
282 ip->i_di.di_blocks--;
283 gfs2_set_inode_blocks(&ip->i_inode);
284 } 281 }
285 if (bstart) 282 if (bstart)
286 gfs2_free_meta(ip, bstart, blen); 283 gfs2_free_meta(ip, bstart, blen);
@@ -321,6 +318,8 @@ static int ea_remove_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
321 int error; 318 int error;
322 319
323 al = gfs2_alloc_get(ip); 320 al = gfs2_alloc_get(ip);
321 if (!al)
322 return -ENOMEM;
324 323
325 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 324 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
326 if (error) 325 if (error)
@@ -449,7 +448,7 @@ static int ea_get_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
449 unsigned int x; 448 unsigned int x;
450 int error = 0; 449 int error = 0;
451 450
452 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); 451 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
453 if (!bh) 452 if (!bh)
454 return -ENOMEM; 453 return -ENOMEM;
455 454
@@ -582,10 +581,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
582{ 581{
583 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 582 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
584 struct gfs2_ea_header *ea; 583 struct gfs2_ea_header *ea;
584 unsigned int n = 1;
585 u64 block; 585 u64 block;
586 586
587 block = gfs2_alloc_meta(ip); 587 block = gfs2_alloc_block(ip, &n);
588 588 gfs2_trans_add_unrevoke(sdp, block, 1);
589 *bhp = gfs2_meta_new(ip->i_gl, block); 589 *bhp = gfs2_meta_new(ip->i_gl, block);
590 gfs2_trans_add_bh(ip->i_gl, *bhp, 1); 590 gfs2_trans_add_bh(ip->i_gl, *bhp, 1);
591 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); 591 gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA);
@@ -597,8 +597,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp)
597 ea->ea_flags = GFS2_EAFLAG_LAST; 597 ea->ea_flags = GFS2_EAFLAG_LAST;
598 ea->ea_num_ptrs = 0; 598 ea->ea_num_ptrs = 0;
599 599
600 ip->i_di.di_blocks++; 600 gfs2_add_inode_blocks(&ip->i_inode, 1);
601 gfs2_set_inode_blocks(&ip->i_inode);
602 601
603 return 0; 602 return 0;
604} 603}
@@ -642,15 +641,15 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
642 struct buffer_head *bh; 641 struct buffer_head *bh;
643 u64 block; 642 u64 block;
644 int mh_size = sizeof(struct gfs2_meta_header); 643 int mh_size = sizeof(struct gfs2_meta_header);
644 unsigned int n = 1;
645 645
646 block = gfs2_alloc_meta(ip); 646 block = gfs2_alloc_block(ip, &n);
647 647 gfs2_trans_add_unrevoke(sdp, block, 1);
648 bh = gfs2_meta_new(ip->i_gl, block); 648 bh = gfs2_meta_new(ip->i_gl, block);
649 gfs2_trans_add_bh(ip->i_gl, bh, 1); 649 gfs2_trans_add_bh(ip->i_gl, bh, 1);
650 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); 650 gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED);
651 651
652 ip->i_di.di_blocks++; 652 gfs2_add_inode_blocks(&ip->i_inode, 1);
653 gfs2_set_inode_blocks(&ip->i_inode);
654 653
655 copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize : 654 copy = data_len > sdp->sd_jbsize ? sdp->sd_jbsize :
656 data_len; 655 data_len;
@@ -684,15 +683,13 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er,
684 int error; 683 int error;
685 684
686 al = gfs2_alloc_get(ip); 685 al = gfs2_alloc_get(ip);
686 if (!al)
687 return -ENOMEM;
687 688
688 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 689 error = gfs2_quota_lock_check(ip);
689 if (error) 690 if (error)
690 goto out; 691 goto out;
691 692
692 error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
693 if (error)
694 goto out_gunlock_q;
695
696 al->al_requested = blks; 693 al->al_requested = blks;
697 694
698 error = gfs2_inplace_reserve(ip); 695 error = gfs2_inplace_reserve(ip);
@@ -966,9 +963,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
966 gfs2_trans_add_bh(ip->i_gl, indbh, 1); 963 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
967 } else { 964 } else {
968 u64 blk; 965 u64 blk;
969 966 unsigned int n = 1;
970 blk = gfs2_alloc_meta(ip); 967 blk = gfs2_alloc_block(ip, &n);
971 968 gfs2_trans_add_unrevoke(sdp, blk, 1);
972 indbh = gfs2_meta_new(ip->i_gl, blk); 969 indbh = gfs2_meta_new(ip->i_gl, blk);
973 gfs2_trans_add_bh(ip->i_gl, indbh, 1); 970 gfs2_trans_add_bh(ip->i_gl, indbh, 1);
974 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); 971 gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN);
@@ -978,8 +975,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
978 *eablk = cpu_to_be64(ip->i_di.di_eattr); 975 *eablk = cpu_to_be64(ip->i_di.di_eattr);
979 ip->i_di.di_eattr = blk; 976 ip->i_di.di_eattr = blk;
980 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; 977 ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT;
981 ip->i_di.di_blocks++; 978 gfs2_add_inode_blocks(&ip->i_inode, 1);
982 gfs2_set_inode_blocks(&ip->i_inode);
983 979
984 eablk++; 980 eablk++;
985 } 981 }
@@ -1210,7 +1206,7 @@ static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
1210 unsigned int x; 1206 unsigned int x;
1211 int error; 1207 int error;
1212 1208
1213 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_KERNEL); 1209 bh = kcalloc(nptrs, sizeof(struct buffer_head *), GFP_NOFS);
1214 if (!bh) 1210 if (!bh)
1215 return -ENOMEM; 1211 return -ENOMEM;
1216 1212
@@ -1347,7 +1343,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1347 else 1343 else
1348 goto out; 1344 goto out;
1349 1345
1350 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE, 0); 1346 gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
1351 1347
1352 for (x = 0; x < rlist.rl_rgrps; x++) { 1348 for (x = 0; x < rlist.rl_rgrps; x++) {
1353 struct gfs2_rgrpd *rgd; 1349 struct gfs2_rgrpd *rgd;
@@ -1387,10 +1383,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
1387 } 1383 }
1388 1384
1389 *eablk = 0; 1385 *eablk = 0;
1390 if (!ip->i_di.di_blocks) 1386 gfs2_add_inode_blocks(&ip->i_inode, -1);
1391 gfs2_consist_inode(ip);
1392 ip->i_di.di_blocks--;
1393 gfs2_set_inode_blocks(&ip->i_inode);
1394 } 1387 }
1395 if (bstart) 1388 if (bstart)
1396 gfs2_free_meta(ip, bstart, blen); 1389 gfs2_free_meta(ip, bstart, blen);
@@ -1442,10 +1435,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
1442 gfs2_free_meta(ip, ip->i_di.di_eattr, 1); 1435 gfs2_free_meta(ip, ip->i_di.di_eattr, 1);
1443 1436
1444 ip->i_di.di_eattr = 0; 1437 ip->i_di.di_eattr = 0;
1445 if (!ip->i_di.di_blocks) 1438 gfs2_add_inode_blocks(&ip->i_inode, -1);
1446 gfs2_consist_inode(ip);
1447 ip->i_di.di_blocks--;
1448 gfs2_set_inode_blocks(&ip->i_inode);
1449 1439
1450 error = gfs2_meta_inode_buffer(ip, &dibh); 1440 error = gfs2_meta_inode_buffer(ip, &dibh);
1451 if (!error) { 1441 if (!error) {
@@ -1474,6 +1464,8 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip)
1474 int error; 1464 int error;
1475 1465
1476 al = gfs2_alloc_get(ip); 1466 al = gfs2_alloc_get(ip);
1467 if (!al)
1468 return -ENOMEM;
1477 1469
1478 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 1470 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
1479 if (error) 1471 if (error)
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 7175a4d06435..d636b3e80f5d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -35,7 +35,6 @@
35#include "glock.h" 35#include "glock.h"
36#include "glops.h" 36#include "glops.h"
37#include "inode.h" 37#include "inode.h"
38#include "lm.h"
39#include "lops.h" 38#include "lops.h"
40#include "meta_io.h" 39#include "meta_io.h"
41#include "quota.h" 40#include "quota.h"
@@ -183,7 +182,8 @@ static void glock_free(struct gfs2_glock *gl)
183 struct gfs2_sbd *sdp = gl->gl_sbd; 182 struct gfs2_sbd *sdp = gl->gl_sbd;
184 struct inode *aspace = gl->gl_aspace; 183 struct inode *aspace = gl->gl_aspace;
185 184
186 gfs2_lm_put_lock(sdp, gl->gl_lock); 185 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
186 sdp->sd_lockstruct.ls_ops->lm_put_lock(gl->gl_lock);
187 187
188 if (aspace) 188 if (aspace)
189 gfs2_aspace_put(aspace); 189 gfs2_aspace_put(aspace);
@@ -197,7 +197,7 @@ static void glock_free(struct gfs2_glock *gl)
197 * 197 *
198 */ 198 */
199 199
200void gfs2_glock_hold(struct gfs2_glock *gl) 200static void gfs2_glock_hold(struct gfs2_glock *gl)
201{ 201{
202 atomic_inc(&gl->gl_ref); 202 atomic_inc(&gl->gl_ref);
203} 203}
@@ -293,6 +293,16 @@ static void glock_work_func(struct work_struct *work)
293 gfs2_glock_put(gl); 293 gfs2_glock_put(gl);
294} 294}
295 295
296static int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
297 void **lockp)
298{
299 int error = -EIO;
300 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
301 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
302 sdp->sd_lockstruct.ls_lockspace, name, lockp);
303 return error;
304}
305
296/** 306/**
297 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist 307 * gfs2_glock_get() - Get a glock, or create one if one doesn't exist
298 * @sdp: The GFS2 superblock 308 * @sdp: The GFS2 superblock
@@ -338,8 +348,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
338 gl->gl_ip = 0; 348 gl->gl_ip = 0;
339 gl->gl_ops = glops; 349 gl->gl_ops = glops;
340 gl->gl_req_gh = NULL; 350 gl->gl_req_gh = NULL;
341 gl->gl_req_bh = NULL;
342 gl->gl_vn = 0;
343 gl->gl_stamp = jiffies; 351 gl->gl_stamp = jiffies;
344 gl->gl_tchange = jiffies; 352 gl->gl_tchange = jiffies;
345 gl->gl_object = NULL; 353 gl->gl_object = NULL;
@@ -595,11 +603,12 @@ static void run_queue(struct gfs2_glock *gl)
595 blocked = rq_mutex(gh); 603 blocked = rq_mutex(gh);
596 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) { 604 } else if (test_bit(GLF_DEMOTE, &gl->gl_flags)) {
597 blocked = rq_demote(gl); 605 blocked = rq_demote(gl);
598 if (gl->gl_waiters2 && !blocked) { 606 if (test_bit(GLF_WAITERS2, &gl->gl_flags) &&
607 !blocked) {
599 set_bit(GLF_DEMOTE, &gl->gl_flags); 608 set_bit(GLF_DEMOTE, &gl->gl_flags);
600 gl->gl_demote_state = LM_ST_UNLOCKED; 609 gl->gl_demote_state = LM_ST_UNLOCKED;
601 } 610 }
602 gl->gl_waiters2 = 0; 611 clear_bit(GLF_WAITERS2, &gl->gl_flags);
603 } else if (!list_empty(&gl->gl_waiters3)) { 612 } else if (!list_empty(&gl->gl_waiters3)) {
604 gh = list_entry(gl->gl_waiters3.next, 613 gh = list_entry(gl->gl_waiters3.next,
605 struct gfs2_holder, gh_list); 614 struct gfs2_holder, gh_list);
@@ -710,7 +719,7 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state,
710 } else if (gl->gl_demote_state != LM_ST_UNLOCKED && 719 } else if (gl->gl_demote_state != LM_ST_UNLOCKED &&
711 gl->gl_demote_state != state) { 720 gl->gl_demote_state != state) {
712 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) 721 if (test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags))
713 gl->gl_waiters2 = 1; 722 set_bit(GLF_WAITERS2, &gl->gl_flags);
714 else 723 else
715 gl->gl_demote_state = LM_ST_UNLOCKED; 724 gl->gl_demote_state = LM_ST_UNLOCKED;
716 } 725 }
@@ -743,6 +752,43 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state)
743} 752}
744 753
745/** 754/**
755 * drop_bh - Called after a lock module unlock completes
756 * @gl: the glock
757 * @ret: the return status
758 *
759 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
760 * Doesn't drop the reference on the glock the top half took out
761 *
762 */
763
764static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
765{
766 struct gfs2_sbd *sdp = gl->gl_sbd;
767 struct gfs2_holder *gh = gl->gl_req_gh;
768
769 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
770 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
771 gfs2_assert_warn(sdp, !ret);
772
773 state_change(gl, LM_ST_UNLOCKED);
774
775 if (test_and_clear_bit(GLF_CONV_DEADLK, &gl->gl_flags)) {
776 spin_lock(&gl->gl_spin);
777 gh->gh_error = 0;
778 spin_unlock(&gl->gl_spin);
779 gfs2_glock_xmote_th(gl, gl->gl_req_gh);
780 gfs2_glock_put(gl);
781 return;
782 }
783
784 spin_lock(&gl->gl_spin);
785 gfs2_demote_wake(gl);
786 clear_bit(GLF_LOCK, &gl->gl_flags);
787 spin_unlock(&gl->gl_spin);
788 gfs2_glock_put(gl);
789}
790
791/**
746 * xmote_bh - Called after the lock module is done acquiring a lock 792 * xmote_bh - Called after the lock module is done acquiring a lock
747 * @gl: The glock in question 793 * @gl: The glock in question
748 * @ret: the int returned from the lock module 794 * @ret: the int returned from the lock module
@@ -754,25 +800,19 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
754 struct gfs2_sbd *sdp = gl->gl_sbd; 800 struct gfs2_sbd *sdp = gl->gl_sbd;
755 const struct gfs2_glock_operations *glops = gl->gl_ops; 801 const struct gfs2_glock_operations *glops = gl->gl_ops;
756 struct gfs2_holder *gh = gl->gl_req_gh; 802 struct gfs2_holder *gh = gl->gl_req_gh;
757 int prev_state = gl->gl_state;
758 int op_done = 1; 803 int op_done = 1;
759 804
805 if (!gh && (ret & LM_OUT_ST_MASK) == LM_ST_UNLOCKED) {
806 drop_bh(gl, ret);
807 return;
808 }
809
760 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 810 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
761 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); 811 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
762 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC)); 812 gfs2_assert_warn(sdp, !(ret & LM_OUT_ASYNC));
763 813
764 state_change(gl, ret & LM_OUT_ST_MASK); 814 state_change(gl, ret & LM_OUT_ST_MASK);
765 815
766 if (prev_state != LM_ST_UNLOCKED && !(ret & LM_OUT_CACHEABLE)) {
767 if (glops->go_inval)
768 glops->go_inval(gl, DIO_METADATA);
769 } else if (gl->gl_state == LM_ST_DEFERRED) {
770 /* We might not want to do this here.
771 Look at moving to the inode glops. */
772 if (glops->go_inval)
773 glops->go_inval(gl, 0);
774 }
775
776 /* Deal with each possible exit condition */ 816 /* Deal with each possible exit condition */
777 817
778 if (!gh) { 818 if (!gh) {
@@ -782,7 +822,6 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
782 } else { 822 } else {
783 spin_lock(&gl->gl_spin); 823 spin_lock(&gl->gl_spin);
784 if (gl->gl_state != gl->gl_demote_state) { 824 if (gl->gl_state != gl->gl_demote_state) {
785 gl->gl_req_bh = NULL;
786 spin_unlock(&gl->gl_spin); 825 spin_unlock(&gl->gl_spin);
787 gfs2_glock_drop_th(gl); 826 gfs2_glock_drop_th(gl);
788 gfs2_glock_put(gl); 827 gfs2_glock_put(gl);
@@ -793,6 +832,14 @@ static void xmote_bh(struct gfs2_glock *gl, unsigned int ret)
793 } 832 }
794 } else { 833 } else {
795 spin_lock(&gl->gl_spin); 834 spin_lock(&gl->gl_spin);
835 if (ret & LM_OUT_CONV_DEADLK) {
836 gh->gh_error = 0;
837 set_bit(GLF_CONV_DEADLK, &gl->gl_flags);
838 spin_unlock(&gl->gl_spin);
839 gfs2_glock_drop_th(gl);
840 gfs2_glock_put(gl);
841 return;
842 }
796 list_del_init(&gh->gh_list); 843 list_del_init(&gh->gh_list);
797 gh->gh_error = -EIO; 844 gh->gh_error = -EIO;
798 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) 845 if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
@@ -824,7 +871,6 @@ out:
824 if (op_done) { 871 if (op_done) {
825 spin_lock(&gl->gl_spin); 872 spin_lock(&gl->gl_spin);
826 gl->gl_req_gh = NULL; 873 gl->gl_req_gh = NULL;
827 gl->gl_req_bh = NULL;
828 clear_bit(GLF_LOCK, &gl->gl_flags); 874 clear_bit(GLF_LOCK, &gl->gl_flags);
829 spin_unlock(&gl->gl_spin); 875 spin_unlock(&gl->gl_spin);
830 } 876 }
@@ -835,6 +881,17 @@ out:
835 gfs2_holder_wake(gh); 881 gfs2_holder_wake(gh);
836} 882}
837 883
884static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
885 unsigned int cur_state, unsigned int req_state,
886 unsigned int flags)
887{
888 int ret = 0;
889 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
890 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
891 req_state, flags);
892 return ret;
893}
894
838/** 895/**
839 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock 896 * gfs2_glock_xmote_th - Call into the lock module to acquire or change a glock
840 * @gl: The glock in question 897 * @gl: The glock in question
@@ -856,6 +913,8 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
856 913
857 if (glops->go_xmote_th) 914 if (glops->go_xmote_th)
858 glops->go_xmote_th(gl); 915 glops->go_xmote_th(gl);
916 if (state == LM_ST_DEFERRED && glops->go_inval)
917 glops->go_inval(gl, DIO_METADATA);
859 918
860 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 919 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
861 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); 920 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
@@ -863,7 +922,6 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
863 gfs2_assert_warn(sdp, state != gl->gl_state); 922 gfs2_assert_warn(sdp, state != gl->gl_state);
864 923
865 gfs2_glock_hold(gl); 924 gfs2_glock_hold(gl);
866 gl->gl_req_bh = xmote_bh;
867 925
868 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags); 926 lck_ret = gfs2_lm_lock(sdp, gl->gl_lock, gl->gl_state, state, lck_flags);
869 927
@@ -876,49 +934,13 @@ static void gfs2_glock_xmote_th(struct gfs2_glock *gl, struct gfs2_holder *gh)
876 xmote_bh(gl, lck_ret); 934 xmote_bh(gl, lck_ret);
877} 935}
878 936
879/** 937static unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
880 * drop_bh - Called after a lock module unlock completes 938 unsigned int cur_state)
881 * @gl: the glock
882 * @ret: the return status
883 *
884 * Doesn't wake up the process waiting on the struct gfs2_holder (if any)
885 * Doesn't drop the reference on the glock the top half took out
886 *
887 */
888
889static void drop_bh(struct gfs2_glock *gl, unsigned int ret)
890{ 939{
891 struct gfs2_sbd *sdp = gl->gl_sbd; 940 int ret = 0;
892 const struct gfs2_glock_operations *glops = gl->gl_ops; 941 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
893 struct gfs2_holder *gh = gl->gl_req_gh; 942 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
894 943 return ret;
895 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
896 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
897 gfs2_assert_warn(sdp, !ret);
898
899 state_change(gl, LM_ST_UNLOCKED);
900
901 if (glops->go_inval)
902 glops->go_inval(gl, DIO_METADATA);
903
904 if (gh) {
905 spin_lock(&gl->gl_spin);
906 list_del_init(&gh->gh_list);
907 gh->gh_error = 0;
908 spin_unlock(&gl->gl_spin);
909 }
910
911 spin_lock(&gl->gl_spin);
912 gfs2_demote_wake(gl);
913 gl->gl_req_gh = NULL;
914 gl->gl_req_bh = NULL;
915 clear_bit(GLF_LOCK, &gl->gl_flags);
916 spin_unlock(&gl->gl_spin);
917
918 gfs2_glock_put(gl);
919
920 if (gh)
921 gfs2_holder_wake(gh);
922} 944}
923 945
924/** 946/**
@@ -935,13 +957,14 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
935 957
936 if (glops->go_xmote_th) 958 if (glops->go_xmote_th)
937 glops->go_xmote_th(gl); 959 glops->go_xmote_th(gl);
960 if (glops->go_inval)
961 glops->go_inval(gl, DIO_METADATA);
938 962
939 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags)); 963 gfs2_assert_warn(sdp, test_bit(GLF_LOCK, &gl->gl_flags));
940 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders)); 964 gfs2_assert_warn(sdp, list_empty(&gl->gl_holders));
941 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED); 965 gfs2_assert_warn(sdp, gl->gl_state != LM_ST_UNLOCKED);
942 966
943 gfs2_glock_hold(gl); 967 gfs2_glock_hold(gl);
944 gl->gl_req_bh = drop_bh;
945 968
946 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state); 969 ret = gfs2_lm_unlock(sdp, gl->gl_lock, gl->gl_state);
947 970
@@ -964,16 +987,17 @@ static void gfs2_glock_drop_th(struct gfs2_glock *gl)
964static void do_cancels(struct gfs2_holder *gh) 987static void do_cancels(struct gfs2_holder *gh)
965{ 988{
966 struct gfs2_glock *gl = gh->gh_gl; 989 struct gfs2_glock *gl = gh->gh_gl;
990 struct gfs2_sbd *sdp = gl->gl_sbd;
967 991
968 spin_lock(&gl->gl_spin); 992 spin_lock(&gl->gl_spin);
969 993
970 while (gl->gl_req_gh != gh && 994 while (gl->gl_req_gh != gh &&
971 !test_bit(HIF_HOLDER, &gh->gh_iflags) && 995 !test_bit(HIF_HOLDER, &gh->gh_iflags) &&
972 !list_empty(&gh->gh_list)) { 996 !list_empty(&gh->gh_list)) {
973 if (gl->gl_req_bh && !(gl->gl_req_gh && 997 if (!(gl->gl_req_gh && (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
974 (gl->gl_req_gh->gh_flags & GL_NOCANCEL))) {
975 spin_unlock(&gl->gl_spin); 998 spin_unlock(&gl->gl_spin);
976 gfs2_lm_cancel(gl->gl_sbd, gl->gl_lock); 999 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1000 sdp->sd_lockstruct.ls_ops->lm_cancel(gl->gl_lock);
977 msleep(100); 1001 msleep(100);
978 spin_lock(&gl->gl_spin); 1002 spin_lock(&gl->gl_spin);
979 } else { 1003 } else {
@@ -1041,7 +1065,6 @@ static int glock_wait_internal(struct gfs2_holder *gh)
1041 1065
1042 spin_lock(&gl->gl_spin); 1066 spin_lock(&gl->gl_spin);
1043 gl->gl_req_gh = NULL; 1067 gl->gl_req_gh = NULL;
1044 gl->gl_req_bh = NULL;
1045 clear_bit(GLF_LOCK, &gl->gl_flags); 1068 clear_bit(GLF_LOCK, &gl->gl_flags);
1046 run_queue(gl); 1069 run_queue(gl);
1047 spin_unlock(&gl->gl_spin); 1070 spin_unlock(&gl->gl_spin);
@@ -1428,6 +1451,14 @@ void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs)
1428 gfs2_glock_dq_uninit(&ghs[x]); 1451 gfs2_glock_dq_uninit(&ghs[x]);
1429} 1452}
1430 1453
1454static int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
1455{
1456 int error = -EIO;
1457 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1458 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
1459 return error;
1460}
1461
1431/** 1462/**
1432 * gfs2_lvb_hold - attach a LVB from a glock 1463 * gfs2_lvb_hold - attach a LVB from a glock
1433 * @gl: The glock in question 1464 * @gl: The glock in question
@@ -1463,12 +1494,15 @@ int gfs2_lvb_hold(struct gfs2_glock *gl)
1463 1494
1464void gfs2_lvb_unhold(struct gfs2_glock *gl) 1495void gfs2_lvb_unhold(struct gfs2_glock *gl)
1465{ 1496{
1497 struct gfs2_sbd *sdp = gl->gl_sbd;
1498
1466 gfs2_glock_hold(gl); 1499 gfs2_glock_hold(gl);
1467 gfs2_glmutex_lock(gl); 1500 gfs2_glmutex_lock(gl);
1468 1501
1469 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0); 1502 gfs2_assert(gl->gl_sbd, atomic_read(&gl->gl_lvb_count) > 0);
1470 if (atomic_dec_and_test(&gl->gl_lvb_count)) { 1503 if (atomic_dec_and_test(&gl->gl_lvb_count)) {
1471 gfs2_lm_unhold_lvb(gl->gl_sbd, gl->gl_lock, gl->gl_lvb); 1504 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
1505 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(gl->gl_lock, gl->gl_lvb);
1472 gl->gl_lvb = NULL; 1506 gl->gl_lvb = NULL;
1473 gfs2_glock_put(gl); 1507 gfs2_glock_put(gl);
1474 } 1508 }
@@ -1534,8 +1568,7 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data)
1534 gl = gfs2_glock_find(sdp, &async->lc_name); 1568 gl = gfs2_glock_find(sdp, &async->lc_name);
1535 if (gfs2_assert_warn(sdp, gl)) 1569 if (gfs2_assert_warn(sdp, gl))
1536 return; 1570 return;
1537 if (!gfs2_assert_warn(sdp, gl->gl_req_bh)) 1571 xmote_bh(gl, async->lc_ret);
1538 gl->gl_req_bh(gl, async->lc_ret);
1539 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) 1572 if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
1540 gfs2_glock_put(gl); 1573 gfs2_glock_put(gl);
1541 up_read(&gfs2_umount_flush_sem); 1574 up_read(&gfs2_umount_flush_sem);
@@ -1594,10 +1627,10 @@ void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl)
1594 gfs2_glock_hold(gl); 1627 gfs2_glock_hold(gl);
1595 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); 1628 list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list);
1596 atomic_inc(&sdp->sd_reclaim_count); 1629 atomic_inc(&sdp->sd_reclaim_count);
1597 } 1630 spin_unlock(&sdp->sd_reclaim_lock);
1598 spin_unlock(&sdp->sd_reclaim_lock); 1631 wake_up(&sdp->sd_reclaim_wq);
1599 1632 } else
1600 wake_up(&sdp->sd_reclaim_wq); 1633 spin_unlock(&sdp->sd_reclaim_lock);
1601} 1634}
1602 1635
1603/** 1636/**
@@ -1897,7 +1930,6 @@ static int dump_glock(struct glock_iter *gi, struct gfs2_glock *gl)
1897 print_dbg(gi, " gl_owner = -1\n"); 1930 print_dbg(gi, " gl_owner = -1\n");
1898 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip); 1931 print_dbg(gi, " gl_ip = %lu\n", gl->gl_ip);
1899 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no"); 1932 print_dbg(gi, " req_gh = %s\n", (gl->gl_req_gh) ? "yes" : "no");
1900 print_dbg(gi, " req_bh = %s\n", (gl->gl_req_bh) ? "yes" : "no");
1901 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count)); 1933 print_dbg(gi, " lvb_count = %d\n", atomic_read(&gl->gl_lvb_count));
1902 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no"); 1934 print_dbg(gi, " object = %s\n", (gl->gl_object) ? "yes" : "no");
1903 print_dbg(gi, " reclaim = %s\n", 1935 print_dbg(gi, " reclaim = %s\n",
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index 2f9c6d136b37..cdad3e6f8150 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -32,24 +32,23 @@
32#define GLR_TRYFAILED 13 32#define GLR_TRYFAILED 13
33#define GLR_CANCELED 14 33#define GLR_CANCELED 14
34 34
35static inline int gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) 35static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl)
36{ 36{
37 struct gfs2_holder *gh; 37 struct gfs2_holder *gh;
38 int locked = 0;
39 struct pid *pid; 38 struct pid *pid;
40 39
41 /* Look in glock's list of holders for one with current task as owner */ 40 /* Look in glock's list of holders for one with current task as owner */
42 spin_lock(&gl->gl_spin); 41 spin_lock(&gl->gl_spin);
43 pid = task_pid(current); 42 pid = task_pid(current);
44 list_for_each_entry(gh, &gl->gl_holders, gh_list) { 43 list_for_each_entry(gh, &gl->gl_holders, gh_list) {
45 if (gh->gh_owner_pid == pid) { 44 if (gh->gh_owner_pid == pid)
46 locked = 1; 45 goto out;
47 break;
48 }
49 } 46 }
47 gh = NULL;
48out:
50 spin_unlock(&gl->gl_spin); 49 spin_unlock(&gl->gl_spin);
51 50
52 return locked; 51 return gh;
53} 52}
54 53
55static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl) 54static inline int gfs2_glock_is_held_excl(struct gfs2_glock *gl)
@@ -79,7 +78,6 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl)
79int gfs2_glock_get(struct gfs2_sbd *sdp, 78int gfs2_glock_get(struct gfs2_sbd *sdp,
80 u64 number, const struct gfs2_glock_operations *glops, 79 u64 number, const struct gfs2_glock_operations *glops,
81 int create, struct gfs2_glock **glp); 80 int create, struct gfs2_glock **glp);
82void gfs2_glock_hold(struct gfs2_glock *gl);
83int gfs2_glock_put(struct gfs2_glock *gl); 81int gfs2_glock_put(struct gfs2_glock *gl);
84void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, 82void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
85 struct gfs2_holder *gh); 83 struct gfs2_holder *gh);
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index c663b7a0f410..d31badadef8f 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -126,7 +126,13 @@ static void meta_go_inval(struct gfs2_glock *gl, int flags)
126 return; 126 return;
127 127
128 gfs2_meta_inval(gl); 128 gfs2_meta_inval(gl);
129 gl->gl_vn++; 129 if (gl->gl_object == GFS2_I(gl->gl_sbd->sd_rindex))
130 gl->gl_sbd->sd_rindex_uptodate = 0;
131 else if (gl->gl_ops == &gfs2_rgrp_glops && gl->gl_object) {
132 struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object;
133
134 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
135 }
130} 136}
131 137
132/** 138/**
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 525dcae352d6..9c2c0b90b22a 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -44,7 +44,6 @@ struct gfs2_log_header_host {
44 44
45struct gfs2_log_operations { 45struct gfs2_log_operations {
46 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le); 46 void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_log_element *le);
47 void (*lo_incore_commit) (struct gfs2_sbd *sdp, struct gfs2_trans *tr);
48 void (*lo_before_commit) (struct gfs2_sbd *sdp); 47 void (*lo_before_commit) (struct gfs2_sbd *sdp);
49 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); 48 void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai);
50 void (*lo_before_scan) (struct gfs2_jdesc *jd, 49 void (*lo_before_scan) (struct gfs2_jdesc *jd,
@@ -70,7 +69,6 @@ struct gfs2_bitmap {
70}; 69};
71 70
72struct gfs2_rgrp_host { 71struct gfs2_rgrp_host {
73 u32 rg_flags;
74 u32 rg_free; 72 u32 rg_free;
75 u32 rg_dinodes; 73 u32 rg_dinodes;
76 u64 rg_igeneration; 74 u64 rg_igeneration;
@@ -87,17 +85,17 @@ struct gfs2_rgrpd {
87 u32 rd_data; /* num of data blocks in rgrp */ 85 u32 rd_data; /* num of data blocks in rgrp */
88 u32 rd_bitbytes; /* number of bytes in data bitmaps */ 86 u32 rd_bitbytes; /* number of bytes in data bitmaps */
89 struct gfs2_rgrp_host rd_rg; 87 struct gfs2_rgrp_host rd_rg;
90 u64 rd_rg_vn;
91 struct gfs2_bitmap *rd_bits; 88 struct gfs2_bitmap *rd_bits;
92 unsigned int rd_bh_count; 89 unsigned int rd_bh_count;
93 struct mutex rd_mutex; 90 struct mutex rd_mutex;
94 u32 rd_free_clone; 91 u32 rd_free_clone;
95 struct gfs2_log_element rd_le; 92 struct gfs2_log_element rd_le;
96 u32 rd_last_alloc_data; 93 u32 rd_last_alloc;
97 u32 rd_last_alloc_meta;
98 struct gfs2_sbd *rd_sbd; 94 struct gfs2_sbd *rd_sbd;
99 unsigned long rd_flags; 95 unsigned char rd_flags;
100#define GFS2_RDF_CHECK 0x0001 /* Need to check for unlinked inodes */ 96#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */
97#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */
98#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */
101}; 99};
102 100
103enum gfs2_state_bits { 101enum gfs2_state_bits {
@@ -168,6 +166,8 @@ enum {
168 GLF_DIRTY = 5, 166 GLF_DIRTY = 5,
169 GLF_DEMOTE_IN_PROGRESS = 6, 167 GLF_DEMOTE_IN_PROGRESS = 6,
170 GLF_LFLUSH = 7, 168 GLF_LFLUSH = 7,
169 GLF_WAITERS2 = 8,
170 GLF_CONV_DEADLK = 9,
171}; 171};
172 172
173struct gfs2_glock { 173struct gfs2_glock {
@@ -187,18 +187,15 @@ struct gfs2_glock {
187 struct list_head gl_holders; 187 struct list_head gl_holders;
188 struct list_head gl_waiters1; /* HIF_MUTEX */ 188 struct list_head gl_waiters1; /* HIF_MUTEX */
189 struct list_head gl_waiters3; /* HIF_PROMOTE */ 189 struct list_head gl_waiters3; /* HIF_PROMOTE */
190 int gl_waiters2; /* GIF_DEMOTE */
191 190
192 const struct gfs2_glock_operations *gl_ops; 191 const struct gfs2_glock_operations *gl_ops;
193 192
194 struct gfs2_holder *gl_req_gh; 193 struct gfs2_holder *gl_req_gh;
195 gfs2_glop_bh_t gl_req_bh;
196 194
197 void *gl_lock; 195 void *gl_lock;
198 char *gl_lvb; 196 char *gl_lvb;
199 atomic_t gl_lvb_count; 197 atomic_t gl_lvb_count;
200 198
201 u64 gl_vn;
202 unsigned long gl_stamp; 199 unsigned long gl_stamp;
203 unsigned long gl_tchange; 200 unsigned long gl_tchange;
204 void *gl_object; 201 void *gl_object;
@@ -213,6 +210,8 @@ struct gfs2_glock {
213 struct delayed_work gl_work; 210 struct delayed_work gl_work;
214}; 211};
215 212
213#define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */
214
216struct gfs2_alloc { 215struct gfs2_alloc {
217 /* Quota stuff */ 216 /* Quota stuff */
218 217
@@ -241,14 +240,9 @@ enum {
241 240
242struct gfs2_dinode_host { 241struct gfs2_dinode_host {
243 u64 di_size; /* number of bytes in file */ 242 u64 di_size; /* number of bytes in file */
244 u64 di_blocks; /* number of blocks in file */
245 u64 di_goal_meta; /* rgrp to alloc from next */
246 u64 di_goal_data; /* data block goal */
247 u64 di_generation; /* generation number for NFS */ 243 u64 di_generation; /* generation number for NFS */
248 u32 di_flags; /* GFS2_DIF_... */ 244 u32 di_flags; /* GFS2_DIF_... */
249 u16 di_height; /* height of metadata */
250 /* These only apply to directories */ 245 /* These only apply to directories */
251 u16 di_depth; /* Number of bits in the table */
252 u32 di_entries; /* The number of entries in the directory */ 246 u32 di_entries; /* The number of entries in the directory */
253 u64 di_eattr; /* extended attribute block number */ 247 u64 di_eattr; /* extended attribute block number */
254}; 248};
@@ -265,9 +259,10 @@ struct gfs2_inode {
265 struct gfs2_holder i_iopen_gh; 259 struct gfs2_holder i_iopen_gh;
266 struct gfs2_holder i_gh; /* for prepare/commit_write only */ 260 struct gfs2_holder i_gh; /* for prepare/commit_write only */
267 struct gfs2_alloc *i_alloc; 261 struct gfs2_alloc *i_alloc;
268 u64 i_last_rg_alloc; 262 u64 i_goal; /* goal block for allocations */
269
270 struct rw_semaphore i_rw_mutex; 263 struct rw_semaphore i_rw_mutex;
264 u8 i_height;
265 u8 i_depth;
271}; 266};
272 267
273/* 268/*
@@ -490,9 +485,9 @@ struct gfs2_sbd {
490 u32 sd_qc_per_block; 485 u32 sd_qc_per_block;
491 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ 486 u32 sd_max_dirres; /* Max blocks needed to add a directory entry */
492 u32 sd_max_height; /* Max height of a file's metadata tree */ 487 u32 sd_max_height; /* Max height of a file's metadata tree */
493 u64 sd_heightsize[GFS2_MAX_META_HEIGHT]; 488 u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
494 u32 sd_max_jheight; /* Max height of journaled file's meta tree */ 489 u32 sd_max_jheight; /* Max height of journaled file's meta tree */
495 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT]; 490 u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
496 491
497 struct gfs2_args sd_args; /* Mount arguments */ 492 struct gfs2_args sd_args; /* Mount arguments */
498 struct gfs2_tune sd_tune; /* Filesystem tuning structure */ 493 struct gfs2_tune sd_tune; /* Filesystem tuning structure */
@@ -533,7 +528,7 @@ struct gfs2_sbd {
533 528
534 /* Resource group stuff */ 529 /* Resource group stuff */
535 530
536 u64 sd_rindex_vn; 531 int sd_rindex_uptodate;
537 spinlock_t sd_rindex_spin; 532 spinlock_t sd_rindex_spin;
538 struct mutex sd_rindex_mutex; 533 struct mutex sd_rindex_mutex;
539 struct list_head sd_rindex_list; 534 struct list_head sd_rindex_list;
@@ -637,9 +632,6 @@ struct gfs2_sbd {
637 632
638 /* Counters */ 633 /* Counters */
639 634
640 atomic_t sd_glock_count;
641 atomic_t sd_glock_held_count;
642 atomic_t sd_inode_count;
643 atomic_t sd_reclaimed; 635 atomic_t sd_reclaimed;
644 636
645 char sd_fsname[GFS2_FSNAME_LEN]; 637 char sd_fsname[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 37725ade3c51..3a9ef526c308 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -149,7 +149,8 @@ void gfs2_set_iop(struct inode *inode)
149 } else if (S_ISLNK(mode)) { 149 } else if (S_ISLNK(mode)) {
150 inode->i_op = &gfs2_symlink_iops; 150 inode->i_op = &gfs2_symlink_iops;
151 } else { 151 } else {
152 inode->i_op = &gfs2_dev_iops; 152 inode->i_op = &gfs2_file_iops;
153 init_special_inode(inode, inode->i_mode, inode->i_rdev);
153 } 154 }
154 155
155 unlock_new_inode(inode); 156 unlock_new_inode(inode);
@@ -248,12 +249,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
248{ 249{
249 struct gfs2_dinode_host *di = &ip->i_di; 250 struct gfs2_dinode_host *di = &ip->i_di;
250 const struct gfs2_dinode *str = buf; 251 const struct gfs2_dinode *str = buf;
252 u16 height, depth;
251 253
252 if (ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)) { 254 if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr)))
253 if (gfs2_consist_inode(ip)) 255 goto corrupt;
254 gfs2_dinode_print(ip);
255 return -EIO;
256 }
257 ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino); 256 ip->i_no_formal_ino = be64_to_cpu(str->di_num.no_formal_ino);
258 ip->i_inode.i_mode = be32_to_cpu(str->di_mode); 257 ip->i_inode.i_mode = be32_to_cpu(str->di_mode);
259 ip->i_inode.i_rdev = 0; 258 ip->i_inode.i_rdev = 0;
@@ -275,8 +274,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
275 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); 274 ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink);
276 di->di_size = be64_to_cpu(str->di_size); 275 di->di_size = be64_to_cpu(str->di_size);
277 i_size_write(&ip->i_inode, di->di_size); 276 i_size_write(&ip->i_inode, di->di_size);
278 di->di_blocks = be64_to_cpu(str->di_blocks); 277 gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks));
279 gfs2_set_inode_blocks(&ip->i_inode);
280 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); 278 ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime);
281 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); 279 ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec);
282 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); 280 ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime);
@@ -284,15 +282,20 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
284 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); 282 ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime);
285 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); 283 ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec);
286 284
287 di->di_goal_meta = be64_to_cpu(str->di_goal_meta); 285 ip->i_goal = be64_to_cpu(str->di_goal_meta);
288 di->di_goal_data = be64_to_cpu(str->di_goal_data);
289 di->di_generation = be64_to_cpu(str->di_generation); 286 di->di_generation = be64_to_cpu(str->di_generation);
290 287
291 di->di_flags = be32_to_cpu(str->di_flags); 288 di->di_flags = be32_to_cpu(str->di_flags);
292 gfs2_set_inode_flags(&ip->i_inode); 289 gfs2_set_inode_flags(&ip->i_inode);
293 di->di_height = be16_to_cpu(str->di_height); 290 height = be16_to_cpu(str->di_height);
294 291 if (unlikely(height > GFS2_MAX_META_HEIGHT))
295 di->di_depth = be16_to_cpu(str->di_depth); 292 goto corrupt;
293 ip->i_height = (u8)height;
294
295 depth = be16_to_cpu(str->di_depth);
296 if (unlikely(depth > GFS2_DIR_MAX_DEPTH))
297 goto corrupt;
298 ip->i_depth = (u8)depth;
296 di->di_entries = be32_to_cpu(str->di_entries); 299 di->di_entries = be32_to_cpu(str->di_entries);
297 300
298 di->di_eattr = be64_to_cpu(str->di_eattr); 301 di->di_eattr = be64_to_cpu(str->di_eattr);
@@ -300,6 +303,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
300 gfs2_set_aops(&ip->i_inode); 303 gfs2_set_aops(&ip->i_inode);
301 304
302 return 0; 305 return 0;
306corrupt:
307 if (gfs2_consist_inode(ip))
308 gfs2_dinode_print(ip);
309 return -EIO;
303} 310}
304 311
305/** 312/**
@@ -337,13 +344,15 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip)
337 struct gfs2_rgrpd *rgd; 344 struct gfs2_rgrpd *rgd;
338 int error; 345 int error;
339 346
340 if (ip->i_di.di_blocks != 1) { 347 if (gfs2_get_inode_blocks(&ip->i_inode) != 1) {
341 if (gfs2_consist_inode(ip)) 348 if (gfs2_consist_inode(ip))
342 gfs2_dinode_print(ip); 349 gfs2_dinode_print(ip);
343 return -EIO; 350 return -EIO;
344 } 351 }
345 352
346 al = gfs2_alloc_get(ip); 353 al = gfs2_alloc_get(ip);
354 if (!al)
355 return -ENOMEM;
347 356
348 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 357 error = gfs2_quota_hold(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
349 if (error) 358 if (error)
@@ -487,7 +496,7 @@ struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name,
487 return dir; 496 return dir;
488 } 497 }
489 498
490 if (gfs2_glock_is_locked_by_me(dip->i_gl) == 0) { 499 if (gfs2_glock_is_locked_by_me(dip->i_gl) == NULL) {
491 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 500 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
492 if (error) 501 if (error)
493 return ERR_PTR(error); 502 return ERR_PTR(error);
@@ -818,7 +827,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl,
818 int error; 827 int error;
819 828
820 munge_mode_uid_gid(dip, &mode, &uid, &gid); 829 munge_mode_uid_gid(dip, &mode, &uid, &gid);
821 gfs2_alloc_get(dip); 830 if (!gfs2_alloc_get(dip))
831 return -ENOMEM;
822 832
823 error = gfs2_quota_lock(dip, uid, gid); 833 error = gfs2_quota_lock(dip, uid, gid);
824 if (error) 834 if (error)
@@ -853,6 +863,8 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
853 int error; 863 int error;
854 864
855 al = gfs2_alloc_get(dip); 865 al = gfs2_alloc_get(dip);
866 if (!al)
867 return -ENOMEM;
856 868
857 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 869 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
858 if (error) 870 if (error)
@@ -1219,7 +1231,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len)
1219 1231
1220 x = ip->i_di.di_size + 1; 1232 x = ip->i_di.di_size + 1;
1221 if (x > *len) { 1233 if (x > *len) {
1222 *buf = kmalloc(x, GFP_KERNEL); 1234 *buf = kmalloc(x, GFP_NOFS);
1223 if (!*buf) { 1235 if (!*buf) {
1224 error = -ENOMEM; 1236 error = -ENOMEM;
1225 goto out_brelse; 1237 goto out_brelse;
@@ -1391,21 +1403,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf)
1391 str->di_gid = cpu_to_be32(ip->i_inode.i_gid); 1403 str->di_gid = cpu_to_be32(ip->i_inode.i_gid);
1392 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); 1404 str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink);
1393 str->di_size = cpu_to_be64(di->di_size); 1405 str->di_size = cpu_to_be64(di->di_size);
1394 str->di_blocks = cpu_to_be64(di->di_blocks); 1406 str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
1395 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); 1407 str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec);
1396 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); 1408 str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec);
1397 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec); 1409 str->di_ctime = cpu_to_be64(ip->i_inode.i_ctime.tv_sec);
1398 1410
1399 str->di_goal_meta = cpu_to_be64(di->di_goal_meta); 1411 str->di_goal_meta = cpu_to_be64(ip->i_goal);
1400 str->di_goal_data = cpu_to_be64(di->di_goal_data); 1412 str->di_goal_data = cpu_to_be64(ip->i_goal);
1401 str->di_generation = cpu_to_be64(di->di_generation); 1413 str->di_generation = cpu_to_be64(di->di_generation);
1402 1414
1403 str->di_flags = cpu_to_be32(di->di_flags); 1415 str->di_flags = cpu_to_be32(di->di_flags);
1404 str->di_height = cpu_to_be16(di->di_height); 1416 str->di_height = cpu_to_be16(ip->i_height);
1405 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && 1417 str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) &&
1406 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? 1418 !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ?
1407 GFS2_FORMAT_DE : 0); 1419 GFS2_FORMAT_DE : 0);
1408 str->di_depth = cpu_to_be16(di->di_depth); 1420 str->di_depth = cpu_to_be16(ip->i_depth);
1409 str->di_entries = cpu_to_be32(di->di_entries); 1421 str->di_entries = cpu_to_be32(di->di_entries);
1410 1422
1411 str->di_eattr = cpu_to_be64(di->di_eattr); 1423 str->di_eattr = cpu_to_be64(di->di_eattr);
@@ -1423,15 +1435,13 @@ void gfs2_dinode_print(const struct gfs2_inode *ip)
1423 printk(KERN_INFO " no_addr = %llu\n", 1435 printk(KERN_INFO " no_addr = %llu\n",
1424 (unsigned long long)ip->i_no_addr); 1436 (unsigned long long)ip->i_no_addr);
1425 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); 1437 printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size);
1426 printk(KERN_INFO " di_blocks = %llu\n", 1438 printk(KERN_INFO " blocks = %llu\n",
1427 (unsigned long long)di->di_blocks); 1439 (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode));
1428 printk(KERN_INFO " di_goal_meta = %llu\n", 1440 printk(KERN_INFO " i_goal = %llu\n",
1429 (unsigned long long)di->di_goal_meta); 1441 (unsigned long long)ip->i_goal);
1430 printk(KERN_INFO " di_goal_data = %llu\n",
1431 (unsigned long long)di->di_goal_data);
1432 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); 1442 printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags);
1433 printk(KERN_INFO " di_height = %u\n", di->di_height); 1443 printk(KERN_INFO " i_height = %u\n", ip->i_height);
1434 printk(KERN_INFO " di_depth = %u\n", di->di_depth); 1444 printk(KERN_INFO " i_depth = %u\n", ip->i_depth);
1435 printk(KERN_INFO " di_entries = %u\n", di->di_entries); 1445 printk(KERN_INFO " di_entries = %u\n", di->di_entries);
1436 printk(KERN_INFO " di_eattr = %llu\n", 1446 printk(KERN_INFO " di_eattr = %llu\n",
1437 (unsigned long long)di->di_eattr); 1447 (unsigned long long)di->di_eattr);
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index d44650662615..580da454b38f 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -10,9 +10,11 @@
10#ifndef __INODE_DOT_H__ 10#ifndef __INODE_DOT_H__
11#define __INODE_DOT_H__ 11#define __INODE_DOT_H__
12 12
13#include "util.h"
14
13static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) 15static inline int gfs2_is_stuffed(const struct gfs2_inode *ip)
14{ 16{
15 return !ip->i_di.di_height; 17 return !ip->i_height;
16} 18}
17 19
18static inline int gfs2_is_jdata(const struct gfs2_inode *ip) 20static inline int gfs2_is_jdata(const struct gfs2_inode *ip)
@@ -37,13 +39,25 @@ static inline int gfs2_is_dir(const struct gfs2_inode *ip)
37 return S_ISDIR(ip->i_inode.i_mode); 39 return S_ISDIR(ip->i_inode.i_mode);
38} 40}
39 41
40static inline void gfs2_set_inode_blocks(struct inode *inode) 42static inline void gfs2_set_inode_blocks(struct inode *inode, u64 blocks)
43{
44 inode->i_blocks = blocks <<
45 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
46}
47
48static inline u64 gfs2_get_inode_blocks(const struct inode *inode)
41{ 49{
42 struct gfs2_inode *ip = GFS2_I(inode); 50 return inode->i_blocks >>
43 inode->i_blocks = ip->i_di.di_blocks <<
44 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT); 51 (GFS2_SB(inode)->sd_sb.sb_bsize_shift - GFS2_BASIC_BLOCK_SHIFT);
45} 52}
46 53
54static inline void gfs2_add_inode_blocks(struct inode *inode, s64 change)
55{
56 gfs2_assert(GFS2_SB(inode), (change >= 0 || inode->i_blocks > -change));
57 change *= (GFS2_SB(inode)->sd_sb.sb_bsize/GFS2_BASIC_BLOCK);
58 inode->i_blocks += change;
59}
60
47static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr, 61static inline int gfs2_check_inum(const struct gfs2_inode *ip, u64 no_addr,
48 u64 no_formal_ino) 62 u64 no_formal_ino)
49{ 63{
diff --git a/fs/gfs2/lm.c b/fs/gfs2/lm.c
deleted file mode 100644
index cfcc39b86a53..000000000000
--- a/fs/gfs2/lm.c
+++ /dev/null
@@ -1,210 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#include <linux/slab.h>
11#include <linux/spinlock.h>
12#include <linux/completion.h>
13#include <linux/buffer_head.h>
14#include <linux/delay.h>
15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h>
17
18#include "gfs2.h"
19#include "incore.h"
20#include "glock.h"
21#include "lm.h"
22#include "super.h"
23#include "util.h"
24
25/**
26 * gfs2_lm_mount - mount a locking protocol
27 * @sdp: the filesystem
28 * @args: mount arguements
29 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
30 *
31 * Returns: errno
32 */
33
34int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
35{
36 char *proto = sdp->sd_proto_name;
37 char *table = sdp->sd_table_name;
38 int flags = 0;
39 int error;
40
41 if (sdp->sd_args.ar_spectator)
42 flags |= LM_MFLAG_SPECTATOR;
43
44 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
45
46 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
47 gfs2_glock_cb, sdp,
48 GFS2_MIN_LVB_SIZE, flags,
49 &sdp->sd_lockstruct, &sdp->sd_kobj);
50 if (error) {
51 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
52 proto, table, sdp->sd_args.ar_hostdata);
53 goto out;
54 }
55
56 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
57 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
58 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
59 GFS2_MIN_LVB_SIZE)) {
60 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
61 goto out;
62 }
63
64 if (sdp->sd_args.ar_spectator)
65 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
66 else
67 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
68 sdp->sd_lockstruct.ls_jid);
69
70 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
71
72 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
73 !sdp->sd_args.ar_ignore_local_fs) {
74 sdp->sd_args.ar_localflocks = 1;
75 sdp->sd_args.ar_localcaching = 1;
76 }
77
78out:
79 return error;
80}
81
82void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
83{
84 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
85 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
86 sdp->sd_lockstruct.ls_lockspace);
87}
88
89void gfs2_lm_unmount(struct gfs2_sbd *sdp)
90{
91 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
92 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
93}
94
95int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
96{
97 va_list args;
98
99 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
100 return 0;
101
102 va_start(args, fmt);
103 vprintk(fmt, args);
104 va_end(args);
105
106 fs_err(sdp, "about to withdraw this file system\n");
107 BUG_ON(sdp->sd_args.ar_debug);
108
109 fs_err(sdp, "telling LM to withdraw\n");
110 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
111 fs_err(sdp, "withdrawn\n");
112 dump_stack();
113
114 return -1;
115}
116
117int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
118 void **lockp)
119{
120 int error = -EIO;
121 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
122 error = sdp->sd_lockstruct.ls_ops->lm_get_lock(
123 sdp->sd_lockstruct.ls_lockspace, name, lockp);
124 return error;
125}
126
127void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock)
128{
129 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
130 sdp->sd_lockstruct.ls_ops->lm_put_lock(lock);
131}
132
133unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
134 unsigned int cur_state, unsigned int req_state,
135 unsigned int flags)
136{
137 int ret = 0;
138 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
139 ret = sdp->sd_lockstruct.ls_ops->lm_lock(lock, cur_state,
140 req_state, flags);
141 return ret;
142}
143
144unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
145 unsigned int cur_state)
146{
147 int ret = 0;
148 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
149 ret = sdp->sd_lockstruct.ls_ops->lm_unlock(lock, cur_state);
150 return ret;
151}
152
153void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock)
154{
155 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
156 sdp->sd_lockstruct.ls_ops->lm_cancel(lock);
157}
158
159int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp)
160{
161 int error = -EIO;
162 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
163 error = sdp->sd_lockstruct.ls_ops->lm_hold_lvb(lock, lvbp);
164 return error;
165}
166
167void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb)
168{
169 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
170 sdp->sd_lockstruct.ls_ops->lm_unhold_lvb(lock, lvb);
171}
172
173int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
174 struct file *file, struct file_lock *fl)
175{
176 int error = -EIO;
177 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
178 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
179 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
180 return error;
181}
182
183int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
184 struct file *file, int cmd, struct file_lock *fl)
185{
186 int error = -EIO;
187 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
188 error = sdp->sd_lockstruct.ls_ops->lm_plock(
189 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
190 return error;
191}
192
193int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
194 struct file *file, struct file_lock *fl)
195{
196 int error = -EIO;
197 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
198 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
199 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
200 return error;
201}
202
203void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
204 unsigned int message)
205{
206 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
207 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
208 sdp->sd_lockstruct.ls_lockspace, jid, message);
209}
210
diff --git a/fs/gfs2/lm.h b/fs/gfs2/lm.h
deleted file mode 100644
index 21cdc30ee08c..000000000000
--- a/fs/gfs2/lm.h
+++ /dev/null
@@ -1,42 +0,0 @@
1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved.
4 *
5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions
7 * of the GNU General Public License version 2.
8 */
9
10#ifndef __LM_DOT_H__
11#define __LM_DOT_H__
12
13struct gfs2_sbd;
14
15#define GFS2_MIN_LVB_SIZE 32
16
17int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent);
18void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp);
19void gfs2_lm_unmount(struct gfs2_sbd *sdp);
20int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
21 __attribute__ ((format(printf, 2, 3)));
22int gfs2_lm_get_lock(struct gfs2_sbd *sdp, struct lm_lockname *name,
23 void **lockp);
24void gfs2_lm_put_lock(struct gfs2_sbd *sdp, void *lock);
25unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock,
26 unsigned int cur_state, unsigned int req_state,
27 unsigned int flags);
28unsigned int gfs2_lm_unlock(struct gfs2_sbd *sdp, void *lock,
29 unsigned int cur_state);
30void gfs2_lm_cancel(struct gfs2_sbd *sdp, void *lock);
31int gfs2_lm_hold_lvb(struct gfs2_sbd *sdp, void *lock, char **lvbp);
32void gfs2_lm_unhold_lvb(struct gfs2_sbd *sdp, void *lock, char *lvb);
33int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
34 struct file *file, struct file_lock *fl);
35int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
36 struct file *file, int cmd, struct file_lock *fl);
37int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
38 struct file *file, struct file_lock *fl);
39void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
40 unsigned int message);
41
42#endif /* __LM_DOT_H__ */
diff --git a/fs/gfs2/locking/dlm/Makefile b/fs/gfs2/locking/dlm/Makefile
index 89b93b6b45cf..2609bb6cd013 100644
--- a/fs/gfs2/locking/dlm/Makefile
+++ b/fs/gfs2/locking/dlm/Makefile
@@ -1,3 +1,3 @@
1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o 1obj-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o
2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o plock.o 2lock_dlm-y := lock.o main.o mount.o sysfs.o thread.o
3 3
diff --git a/fs/gfs2/locking/dlm/lock.c b/fs/gfs2/locking/dlm/lock.c
index 542a797ac89a..cf7ea8abec87 100644
--- a/fs/gfs2/locking/dlm/lock.c
+++ b/fs/gfs2/locking/dlm/lock.c
@@ -137,7 +137,8 @@ static inline unsigned int make_flags(struct gdlm_lock *lp,
137 137
138 /* Conversion deadlock avoidance by DLM */ 138 /* Conversion deadlock avoidance by DLM */
139 139
140 if (!test_bit(LFL_FORCE_PROMOTE, &lp->flags) && 140 if (!(lp->ls->fsflags & LM_MFLAG_CONV_NODROP) &&
141 !test_bit(LFL_FORCE_PROMOTE, &lp->flags) &&
141 !(lkf & DLM_LKF_NOQUEUE) && 142 !(lkf & DLM_LKF_NOQUEUE) &&
142 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req) 143 cur > DLM_LOCK_NL && req > DLM_LOCK_NL && cur != req)
143 lkf |= DLM_LKF_CONVDEADLK; 144 lkf |= DLM_LKF_CONVDEADLK;
@@ -164,7 +165,7 @@ static int gdlm_create_lp(struct gdlm_ls *ls, struct lm_lockname *name,
164{ 165{
165 struct gdlm_lock *lp; 166 struct gdlm_lock *lp;
166 167
167 lp = kzalloc(sizeof(struct gdlm_lock), GFP_KERNEL); 168 lp = kzalloc(sizeof(struct gdlm_lock), GFP_NOFS);
168 if (!lp) 169 if (!lp)
169 return -ENOMEM; 170 return -ENOMEM;
170 171
@@ -382,7 +383,7 @@ static int gdlm_add_lvb(struct gdlm_lock *lp)
382{ 383{
383 char *lvb; 384 char *lvb;
384 385
385 lvb = kzalloc(GDLM_LVB_SIZE, GFP_KERNEL); 386 lvb = kzalloc(GDLM_LVB_SIZE, GFP_NOFS);
386 if (!lvb) 387 if (!lvb)
387 return -ENOMEM; 388 return -ENOMEM;
388 389
diff --git a/fs/gfs2/locking/dlm/lock_dlm.h b/fs/gfs2/locking/dlm/lock_dlm.h
index 9e8265d28377..a243cf69c54e 100644
--- a/fs/gfs2/locking/dlm/lock_dlm.h
+++ b/fs/gfs2/locking/dlm/lock_dlm.h
@@ -25,6 +25,7 @@
25#include <net/sock.h> 25#include <net/sock.h>
26 26
27#include <linux/dlm.h> 27#include <linux/dlm.h>
28#include <linux/dlm_plock.h>
28#include <linux/lm_interface.h> 29#include <linux/lm_interface.h>
29 30
30/* 31/*
@@ -173,15 +174,9 @@ void gdlm_cancel(void *);
173int gdlm_hold_lvb(void *, char **); 174int gdlm_hold_lvb(void *, char **);
174void gdlm_unhold_lvb(void *, char *); 175void gdlm_unhold_lvb(void *, char *);
175 176
176/* plock.c */ 177/* mount.c */
178
179extern const struct lm_lockops gdlm_ops;
177 180
178int gdlm_plock_init(void);
179void gdlm_plock_exit(void);
180int gdlm_plock(void *, struct lm_lockname *, struct file *, int,
181 struct file_lock *);
182int gdlm_plock_get(void *, struct lm_lockname *, struct file *,
183 struct file_lock *);
184int gdlm_punlock(void *, struct lm_lockname *, struct file *,
185 struct file_lock *);
186#endif 181#endif
187 182
diff --git a/fs/gfs2/locking/dlm/main.c b/fs/gfs2/locking/dlm/main.c
index a0e7eda643ed..b9a03a7ff801 100644
--- a/fs/gfs2/locking/dlm/main.c
+++ b/fs/gfs2/locking/dlm/main.c
@@ -11,8 +11,6 @@
11 11
12#include "lock_dlm.h" 12#include "lock_dlm.h"
13 13
14extern struct lm_lockops gdlm_ops;
15
16static int __init init_lock_dlm(void) 14static int __init init_lock_dlm(void)
17{ 15{
18 int error; 16 int error;
@@ -30,13 +28,6 @@ static int __init init_lock_dlm(void)
30 return error; 28 return error;
31 } 29 }
32 30
33 error = gdlm_plock_init();
34 if (error) {
35 gdlm_sysfs_exit();
36 gfs2_unregister_lockproto(&gdlm_ops);
37 return error;
38 }
39
40 printk(KERN_INFO 31 printk(KERN_INFO
41 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__); 32 "Lock_DLM (built %s %s) installed\n", __DATE__, __TIME__);
42 return 0; 33 return 0;
@@ -44,7 +35,6 @@ static int __init init_lock_dlm(void)
44 35
45static void __exit exit_lock_dlm(void) 36static void __exit exit_lock_dlm(void)
46{ 37{
47 gdlm_plock_exit();
48 gdlm_sysfs_exit(); 38 gdlm_sysfs_exit();
49 gfs2_unregister_lockproto(&gdlm_ops); 39 gfs2_unregister_lockproto(&gdlm_ops);
50} 40}
diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c
index f2efff424224..470bdf650b50 100644
--- a/fs/gfs2/locking/dlm/mount.c
+++ b/fs/gfs2/locking/dlm/mount.c
@@ -236,6 +236,27 @@ static void gdlm_withdraw(void *lockspace)
236 gdlm_kobject_release(ls); 236 gdlm_kobject_release(ls);
237} 237}
238 238
/*
 * gdlm_plock - plock entry point of the lock_dlm module.
 *
 * Treats @lockspace as a struct gdlm_ls and hands the request to
 * dlm_posix_lock(), using ls->dlm_lockspace and name->ln_number as the
 * first two arguments. Returns whatever dlm_posix_lock() returns.
 */
239static int gdlm_plock(void *lockspace, struct lm_lockname *name,
240 struct file *file, int cmd, struct file_lock *fl)
241{
242 struct gdlm_ls *ls = lockspace;
243 return dlm_posix_lock(ls->dlm_lockspace, name->ln_number, file, cmd, fl);
244}
245
/*
 * gdlm_punlock - plock unlock entry point of the lock_dlm module.
 *
 * Treats @lockspace as a struct gdlm_ls and hands the request to
 * dlm_posix_unlock(), using ls->dlm_lockspace and name->ln_number as the
 * first two arguments. Returns whatever dlm_posix_unlock() returns.
 */
246static int gdlm_punlock(void *lockspace, struct lm_lockname *name,
247 struct file *file, struct file_lock *fl)
248{
249 struct gdlm_ls *ls = lockspace;
250 return dlm_posix_unlock(ls->dlm_lockspace, name->ln_number, file, fl);
251}
252
/*
 * gdlm_plock_get - plock "get" entry point of the lock_dlm module.
 *
 * Treats @lockspace as a struct gdlm_ls and hands the request to
 * dlm_posix_get(), using ls->dlm_lockspace and name->ln_number as the
 * first two arguments. Returns whatever dlm_posix_get() returns.
 */
253static int gdlm_plock_get(void *lockspace, struct lm_lockname *name,
254 struct file *file, struct file_lock *fl)
255{
256 struct gdlm_ls *ls = lockspace;
257 return dlm_posix_get(ls->dlm_lockspace, name->ln_number, file, fl);
258}
259
239const struct lm_lockops gdlm_ops = { 260const struct lm_lockops gdlm_ops = {
240 .lm_proto_name = "lock_dlm", 261 .lm_proto_name = "lock_dlm",
241 .lm_mount = gdlm_mount, 262 .lm_mount = gdlm_mount,
diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c
index a87b09839761..8479da47049c 100644
--- a/fs/gfs2/locking/dlm/sysfs.c
+++ b/fs/gfs2/locking/dlm/sysfs.c
@@ -12,8 +12,6 @@
12 12
13#include "lock_dlm.h" 13#include "lock_dlm.h"
14 14
15extern struct lm_lockops gdlm_ops;
16
17static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf) 15static ssize_t proto_name_show(struct gdlm_ls *ls, char *buf)
18{ 16{
19 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name); 17 return sprintf(buf, "%s\n", gdlm_ops.lm_proto_name);
diff --git a/fs/gfs2/locking/dlm/thread.c b/fs/gfs2/locking/dlm/thread.c
index 521694fc19d6..e53db6fd28ab 100644
--- a/fs/gfs2/locking/dlm/thread.c
+++ b/fs/gfs2/locking/dlm/thread.c
@@ -135,7 +135,15 @@ static void process_complete(struct gdlm_lock *lp)
135 lp->lksb.sb_status, lp->lockname.ln_type, 135 lp->lksb.sb_status, lp->lockname.ln_type,
136 (unsigned long long)lp->lockname.ln_number, 136 (unsigned long long)lp->lockname.ln_number,
137 lp->flags); 137 lp->flags);
138 return; 138 if (lp->lksb.sb_status == -EDEADLOCK &&
139 lp->ls->fsflags & LM_MFLAG_CONV_NODROP) {
140 lp->req = lp->cur;
141 acb.lc_ret |= LM_OUT_CONV_DEADLK;
142 if (lp->cur == DLM_LOCK_IV)
143 lp->lksb.sb_lkid = 0;
144 goto out;
145 } else
146 return;
139 } 147 }
140 148
141 /* 149 /*
diff --git a/fs/gfs2/locking/nolock/main.c b/fs/gfs2/locking/nolock/main.c
index d3b8ce6fbbe3..284a5ece8d94 100644
--- a/fs/gfs2/locking/nolock/main.c
+++ b/fs/gfs2/locking/nolock/main.c
@@ -140,7 +140,7 @@ static int nolock_hold_lvb(void *lock, char **lvbp)
140 struct nolock_lockspace *nl = lock; 140 struct nolock_lockspace *nl = lock;
141 int error = 0; 141 int error = 0;
142 142
143 *lvbp = kzalloc(nl->nl_lvb_size, GFP_KERNEL); 143 *lvbp = kzalloc(nl->nl_lvb_size, GFP_NOFS);
144 if (!*lvbp) 144 if (!*lvbp)
145 error = -ENOMEM; 145 error = -ENOMEM;
146 146
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 161ab6f2058e..548264b1836d 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -769,8 +769,8 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm; 769 sdp->sd_log_commited_revoke += tr->tr_num_revoke - tr->tr_num_revoke_rm;
770 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0); 770 gfs2_assert_withdraw(sdp, ((int)sdp->sd_log_commited_revoke) >= 0);
771 reserved = calc_reserved(sdp); 771 reserved = calc_reserved(sdp);
772 gfs2_assert_withdraw(sdp, sdp->sd_log_blks_reserved + tr->tr_reserved >= reserved);
772 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved; 773 unused = sdp->sd_log_blks_reserved - reserved + tr->tr_reserved;
773 gfs2_assert_withdraw(sdp, unused >= 0);
774 atomic_add(unused, &sdp->sd_log_blks_free); 774 atomic_add(unused, &sdp->sd_log_blks_free);
775 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <= 775 gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
776 sdp->sd_jdesc->jd_blocks); 776 sdp->sd_jdesc->jd_blocks);
@@ -779,6 +779,21 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
779 gfs2_log_unlock(sdp); 779 gfs2_log_unlock(sdp);
780} 780}
781 781
/*
 * buf_lo_incore_commit - empty a transaction's buffer list.
 *
 * While holding the log lock, unlinks every gfs2_bufdata from
 * tr->tr_list_buf (via its bd_list_tr member) and decrements
 * tr->tr_num_buf once per entry. After the lock is dropped, warns if
 * tr->tr_num_buf did not reach zero.
 */
782static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
783{
784 struct list_head *head = &tr->tr_list_buf;
785 struct gfs2_bufdata *bd;
786
787 gfs2_log_lock(sdp);
788 while (!list_empty(head)) {
789 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
790 list_del_init(&bd->bd_list_tr);
791 tr->tr_num_buf--;
792 }
793 gfs2_log_unlock(sdp);
794 gfs2_assert_warn(sdp, !tr->tr_num_buf);
795}
796
782/** 797/**
783 * gfs2_log_commit - Commit a transaction to the log 798 * gfs2_log_commit - Commit a transaction to the log
784 * @sdp: the filesystem 799 * @sdp: the filesystem
@@ -790,7 +805,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
790void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) 805void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
791{ 806{
792 log_refund(sdp, tr); 807 log_refund(sdp, tr);
793 lops_incore_commit(sdp, tr); 808 buf_lo_incore_commit(sdp, tr);
794 809
795 sdp->sd_vfs->s_dirt = 1; 810 sdp->sd_vfs->s_dirt = 1;
796 up_read(&sdp->sd_log_flush_lock); 811 up_read(&sdp->sd_log_flush_lock);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index fae59d69d01a..4390f6f4047d 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -152,21 +152,6 @@ out:
152 unlock_buffer(bd->bd_bh); 152 unlock_buffer(bd->bd_bh);
153} 153}
154 154
/*
 * buf_lo_incore_commit - empty a transaction's buffer list.
 *
 * While holding the log lock, unlinks every gfs2_bufdata from
 * tr->tr_list_buf (via its bd_list_tr member) and decrements
 * tr->tr_num_buf once per entry. After the lock is dropped, warns if
 * tr->tr_num_buf did not reach zero.
 */
155static void buf_lo_incore_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
156{
157 struct list_head *head = &tr->tr_list_buf;
158 struct gfs2_bufdata *bd;
159
160 gfs2_log_lock(sdp);
161 while (!list_empty(head)) {
162 bd = list_entry(head->next, struct gfs2_bufdata, bd_list_tr);
163 list_del_init(&bd->bd_list_tr);
164 tr->tr_num_buf--;
165 }
166 gfs2_log_unlock(sdp);
167 gfs2_assert_warn(sdp, !tr->tr_num_buf);
168}
169
170static void buf_lo_before_commit(struct gfs2_sbd *sdp) 155static void buf_lo_before_commit(struct gfs2_sbd *sdp)
171{ 156{
172 struct buffer_head *bh; 157 struct buffer_head *bh;
@@ -419,8 +404,10 @@ static int revoke_lo_scan_elements(struct gfs2_jdesc *jd, unsigned int start,
419 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset)); 404 blkno = be64_to_cpu(*(__be64 *)(bh->b_data + offset));
420 405
421 error = gfs2_revoke_add(sdp, blkno, start); 406 error = gfs2_revoke_add(sdp, blkno, start);
422 if (error < 0) 407 if (error < 0) {
408 brelse(bh);
423 return error; 409 return error;
410 }
424 else if (error) 411 else if (error)
425 sdp->sd_found_revokes++; 412 sdp->sd_found_revokes++;
426 413
@@ -737,7 +724,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai)
737 724
738const struct gfs2_log_operations gfs2_buf_lops = { 725const struct gfs2_log_operations gfs2_buf_lops = {
739 .lo_add = buf_lo_add, 726 .lo_add = buf_lo_add,
740 .lo_incore_commit = buf_lo_incore_commit,
741 .lo_before_commit = buf_lo_before_commit, 727 .lo_before_commit = buf_lo_before_commit,
742 .lo_after_commit = buf_lo_after_commit, 728 .lo_after_commit = buf_lo_after_commit,
743 .lo_before_scan = buf_lo_before_scan, 729 .lo_before_scan = buf_lo_before_scan,
@@ -763,7 +749,6 @@ const struct gfs2_log_operations gfs2_rg_lops = {
763 749
764const struct gfs2_log_operations gfs2_databuf_lops = { 750const struct gfs2_log_operations gfs2_databuf_lops = {
765 .lo_add = databuf_lo_add, 751 .lo_add = databuf_lo_add,
766 .lo_incore_commit = buf_lo_incore_commit,
767 .lo_before_commit = databuf_lo_before_commit, 752 .lo_before_commit = databuf_lo_before_commit,
768 .lo_after_commit = databuf_lo_after_commit, 753 .lo_after_commit = databuf_lo_after_commit,
769 .lo_scan_elements = databuf_lo_scan_elements, 754 .lo_scan_elements = databuf_lo_scan_elements,
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h
index 41a00df75587..3c0b2737658a 100644
--- a/fs/gfs2/lops.h
+++ b/fs/gfs2/lops.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -57,15 +57,6 @@ static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_log_element *le)
57 le->le_ops->lo_add(sdp, le); 57 le->le_ops->lo_add(sdp, le);
58} 58}
59 59
/*
 * lops_incore_commit - run every registered lo_incore_commit hook.
 *
 * Walks gfs2_log_ops[] until the first NULL entry and calls
 * lo_incore_commit(sdp, tr) on each entry that provides one; entries
 * whose lo_incore_commit pointer is NULL are skipped.
 */
60static inline void lops_incore_commit(struct gfs2_sbd *sdp,
61 struct gfs2_trans *tr)
62{
63 int x;
64 for (x = 0; gfs2_log_ops[x]; x++)
65 if (gfs2_log_ops[x]->lo_incore_commit)
66 gfs2_log_ops[x]->lo_incore_commit(sdp, tr);
67}
68
69static inline void lops_before_commit(struct gfs2_sbd *sdp) 60static inline void lops_before_commit(struct gfs2_sbd *sdp)
70{ 61{
71 int x; 62 int x;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 9c7765c12d62..053e2ebbbd50 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -89,6 +89,12 @@ static int __init init_gfs2_fs(void)
89 if (!gfs2_bufdata_cachep) 89 if (!gfs2_bufdata_cachep)
90 goto fail; 90 goto fail;
91 91
92 gfs2_rgrpd_cachep = kmem_cache_create("gfs2_rgrpd",
93 sizeof(struct gfs2_rgrpd),
94 0, 0, NULL);
95 if (!gfs2_rgrpd_cachep)
96 goto fail;
97
92 error = register_filesystem(&gfs2_fs_type); 98 error = register_filesystem(&gfs2_fs_type);
93 if (error) 99 if (error)
94 goto fail; 100 goto fail;
@@ -108,6 +114,9 @@ fail_unregister:
108fail: 114fail:
109 gfs2_glock_exit(); 115 gfs2_glock_exit();
110 116
117 if (gfs2_rgrpd_cachep)
118 kmem_cache_destroy(gfs2_rgrpd_cachep);
119
111 if (gfs2_bufdata_cachep) 120 if (gfs2_bufdata_cachep)
112 kmem_cache_destroy(gfs2_bufdata_cachep); 121 kmem_cache_destroy(gfs2_bufdata_cachep);
113 122
@@ -133,6 +142,7 @@ static void __exit exit_gfs2_fs(void)
133 unregister_filesystem(&gfs2_fs_type); 142 unregister_filesystem(&gfs2_fs_type);
134 unregister_filesystem(&gfs2meta_fs_type); 143 unregister_filesystem(&gfs2meta_fs_type);
135 144
145 kmem_cache_destroy(gfs2_rgrpd_cachep);
136 kmem_cache_destroy(gfs2_bufdata_cachep); 146 kmem_cache_destroy(gfs2_bufdata_cachep);
137 kmem_cache_destroy(gfs2_inode_cachep); 147 kmem_cache_destroy(gfs2_inode_cachep);
138 kmem_cache_destroy(gfs2_glock_cachep); 148 kmem_cache_destroy(gfs2_glock_cachep);
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index ac772b6d9dbb..90a04a6e3789 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -21,7 +21,6 @@
21#include <linux/gfs2_ondisk.h> 21#include <linux/gfs2_ondisk.h>
22#include <linux/lm_interface.h> 22#include <linux/lm_interface.h>
23#include <linux/backing-dev.h> 23#include <linux/backing-dev.h>
24#include <linux/pagevec.h>
25 24
26#include "gfs2.h" 25#include "gfs2.h"
27#include "incore.h" 26#include "incore.h"
@@ -104,11 +103,9 @@ static int gfs2_writepage_common(struct page *page,
104 loff_t i_size = i_size_read(inode); 103 loff_t i_size = i_size_read(inode);
105 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; 104 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
106 unsigned offset; 105 unsigned offset;
107 int ret = -EIO;
108 106
109 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl))) 107 if (gfs2_assert_withdraw(sdp, gfs2_glock_is_held_excl(ip->i_gl)))
110 goto out; 108 goto out;
111 ret = 0;
112 if (current->journal_info) 109 if (current->journal_info)
113 goto redirty; 110 goto redirty;
114 /* Is the page fully outside i_size? (truncate in progress) */ 111 /* Is the page fully outside i_size? (truncate in progress) */
@@ -280,7 +277,7 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping,
280 int i; 277 int i;
281 int ret; 278 int ret;
282 279
283 ret = gfs2_trans_begin(sdp, nrblocks, 0); 280 ret = gfs2_trans_begin(sdp, nrblocks, nrblocks);
284 if (ret < 0) 281 if (ret < 0)
285 return ret; 282 return ret;
286 283
@@ -510,23 +507,26 @@ static int __gfs2_readpage(void *file, struct page *page)
510static int gfs2_readpage(struct file *file, struct page *page) 507static int gfs2_readpage(struct file *file, struct page *page)
511{ 508{
512 struct gfs2_inode *ip = GFS2_I(page->mapping->host); 509 struct gfs2_inode *ip = GFS2_I(page->mapping->host);
513 struct gfs2_holder gh; 510 struct gfs2_holder *gh;
514 int error; 511 int error;
515 512
516 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME|LM_FLAG_TRY_1CB, &gh); 513 gh = gfs2_glock_is_locked_by_me(ip->i_gl);
517 error = gfs2_glock_nq_atime(&gh); 514 if (!gh) {
518 if (unlikely(error)) { 515 gh = kmalloc(sizeof(struct gfs2_holder), GFP_NOFS);
516 if (!gh)
517 return -ENOBUFS;
518 gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, gh);
519 unlock_page(page); 519 unlock_page(page);
520 goto out; 520 error = gfs2_glock_nq_atime(gh);
521 if (likely(error != 0))
522 goto out;
523 return AOP_TRUNCATED_PAGE;
521 } 524 }
522 error = __gfs2_readpage(file, page); 525 error = __gfs2_readpage(file, page);
523 gfs2_glock_dq(&gh); 526 gfs2_glock_dq(gh);
524out: 527out:
525 gfs2_holder_uninit(&gh); 528 gfs2_holder_uninit(gh);
526 if (error == GLR_TRYFAILED) { 529 kfree(gh);
527 yield();
528 return AOP_TRUNCATED_PAGE;
529 }
530 return error; 530 return error;
531} 531}
532 532
@@ -648,15 +648,15 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
648 648
649 if (alloc_required) { 649 if (alloc_required) {
650 al = gfs2_alloc_get(ip); 650 al = gfs2_alloc_get(ip);
651 if (!al) {
652 error = -ENOMEM;
653 goto out_unlock;
654 }
651 655
652 error = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 656 error = gfs2_quota_lock_check(ip);
653 if (error) 657 if (error)
654 goto out_alloc_put; 658 goto out_alloc_put;
655 659
656 error = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
657 if (error)
658 goto out_qunlock;
659
660 al->al_requested = data_blocks + ind_blocks; 660 al->al_requested = data_blocks + ind_blocks;
661 error = gfs2_inplace_reserve(ip); 661 error = gfs2_inplace_reserve(ip);
662 if (error) 662 if (error)
@@ -828,7 +828,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
828 unsigned int to = from + len; 828 unsigned int to = from + len;
829 int ret; 829 int ret;
830 830
831 BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == 0); 831 BUG_ON(gfs2_glock_is_locked_by_me(ip->i_gl) == NULL);
832 832
833 ret = gfs2_meta_inode_buffer(ip, &dibh); 833 ret = gfs2_meta_inode_buffer(ip, &dibh);
834 if (unlikely(ret)) { 834 if (unlikely(ret)) {
diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c
index 793e334d098e..4a5e676b4420 100644
--- a/fs/gfs2/ops_dentry.c
+++ b/fs/gfs2/ops_dentry.c
@@ -43,7 +43,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
43 struct gfs2_holder d_gh; 43 struct gfs2_holder d_gh;
44 struct gfs2_inode *ip = NULL; 44 struct gfs2_inode *ip = NULL;
45 int error; 45 int error;
46 int had_lock=0; 46 int had_lock = 0;
47 47
48 if (inode) { 48 if (inode) {
49 if (is_bad_inode(inode)) 49 if (is_bad_inode(inode))
@@ -54,7 +54,7 @@ static int gfs2_drevalidate(struct dentry *dentry, struct nameidata *nd)
54 if (sdp->sd_args.ar_localcaching) 54 if (sdp->sd_args.ar_localcaching)
55 goto valid; 55 goto valid;
56 56
57 had_lock = gfs2_glock_is_locked_by_me(dip->i_gl); 57 had_lock = (gfs2_glock_is_locked_by_me(dip->i_gl) != NULL);
58 if (!had_lock) { 58 if (!had_lock) {
59 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); 59 error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh);
60 if (error) 60 if (error)
diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c
index 334c7f85351b..990d9f4bc463 100644
--- a/fs/gfs2/ops_export.c
+++ b/fs/gfs2/ops_export.c
@@ -204,8 +204,6 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb,
204 inode = gfs2_inode_lookup(sb, DT_UNKNOWN, 204 inode = gfs2_inode_lookup(sb, DT_UNKNOWN,
205 inum->no_addr, 205 inum->no_addr,
206 0, 0); 206 0, 0);
207 if (!inode)
208 goto fail;
209 if (IS_ERR(inode)) { 207 if (IS_ERR(inode)) {
210 error = PTR_ERR(inode); 208 error = PTR_ERR(inode);
211 goto fail; 209 goto fail;
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index f4842f2548cd..e1b7d525a066 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -30,7 +30,6 @@
30#include "glock.h" 30#include "glock.h"
31#include "glops.h" 31#include "glops.h"
32#include "inode.h" 32#include "inode.h"
33#include "lm.h"
34#include "log.h" 33#include "log.h"
35#include "meta_io.h" 34#include "meta_io.h"
36#include "quota.h" 35#include "quota.h"
@@ -39,6 +38,7 @@
39#include "util.h" 38#include "util.h"
40#include "eaops.h" 39#include "eaops.h"
41#include "ops_address.h" 40#include "ops_address.h"
41#include "ops_inode.h"
42 42
43/** 43/**
44 * gfs2_llseek - seek to a location in a file 44 * gfs2_llseek - seek to a location in a file
@@ -369,12 +369,9 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
369 if (al == NULL) 369 if (al == NULL)
370 goto out_unlock; 370 goto out_unlock;
371 371
372 ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 372 ret = gfs2_quota_lock_check(ip);
373 if (ret) 373 if (ret)
374 goto out_alloc_put; 374 goto out_alloc_put;
375 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
376 if (ret)
377 goto out_quota_unlock;
378 al->al_requested = data_blocks + ind_blocks; 375 al->al_requested = data_blocks + ind_blocks;
379 ret = gfs2_inplace_reserve(ip); 376 ret = gfs2_inplace_reserve(ip);
380 if (ret) 377 if (ret)
@@ -596,6 +593,36 @@ static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl)
596 return generic_setlease(file, arg, fl); 593 return generic_setlease(file, arg, fl);
597} 594}
598 595
/*
 * gfs2_lm_plock_get - forward a plock "get" request to the lock module.
 *
 * Calls lm_plock_get with sdp->sd_lockstruct.ls_lockspace followed by
 * @name, @file and @fl. Returns -EIO without calling the lock module if
 * SDF_SHUTDOWN is set in sdp->sd_flags; otherwise returns the lock
 * module's result.
 */
596static int gfs2_lm_plock_get(struct gfs2_sbd *sdp, struct lm_lockname *name,
597 struct file *file, struct file_lock *fl)
598{
599 int error = -EIO;
600 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
601 error = sdp->sd_lockstruct.ls_ops->lm_plock_get(
602 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
603 return error;
604}
605
/*
 * gfs2_lm_plock - forward a plock request to the lock module.
 *
 * Calls lm_plock with sdp->sd_lockstruct.ls_lockspace followed by @name,
 * @file, @cmd and @fl. Returns -EIO without calling the lock module if
 * SDF_SHUTDOWN is set in sdp->sd_flags; otherwise returns the lock
 * module's result.
 */
606static int gfs2_lm_plock(struct gfs2_sbd *sdp, struct lm_lockname *name,
607 struct file *file, int cmd, struct file_lock *fl)
608{
609 int error = -EIO;
610 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
611 error = sdp->sd_lockstruct.ls_ops->lm_plock(
612 sdp->sd_lockstruct.ls_lockspace, name, file, cmd, fl);
613 return error;
614}
615
/*
 * gfs2_lm_punlock - forward a plock unlock request to the lock module.
 *
 * Calls lm_punlock with sdp->sd_lockstruct.ls_lockspace followed by
 * @name, @file and @fl. Returns -EIO without calling the lock module if
 * SDF_SHUTDOWN is set in sdp->sd_flags; otherwise returns the lock
 * module's result.
 */
616static int gfs2_lm_punlock(struct gfs2_sbd *sdp, struct lm_lockname *name,
617 struct file *file, struct file_lock *fl)
618{
619 int error = -EIO;
620 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
621 error = sdp->sd_lockstruct.ls_ops->lm_punlock(
622 sdp->sd_lockstruct.ls_lockspace, name, file, fl);
623 return error;
624}
625
599/** 626/**
600 * gfs2_lock - acquire/release a posix lock on a file 627 * gfs2_lock - acquire/release a posix lock on a file
601 * @file: the file pointer 628 * @file: the file pointer
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 4bee6aa845e4..ef9c6c4f80f6 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -26,7 +26,6 @@
26#include "glock.h" 26#include "glock.h"
27#include "glops.h" 27#include "glops.h"
28#include "inode.h" 28#include "inode.h"
29#include "lm.h"
30#include "mount.h" 29#include "mount.h"
31#include "ops_fstype.h" 30#include "ops_fstype.h"
32#include "ops_dentry.h" 31#include "ops_dentry.h"
@@ -363,6 +362,13 @@ static int map_journal_extents(struct gfs2_sbd *sdp)
363 return rc; 362 return rc;
364} 363}
365 364
/*
 * gfs2_lm_others_may_mount - forward "others may mount" to the lock module.
 *
 * Calls lm_others_may_mount with sdp->sd_lockstruct.ls_lockspace unless
 * SDF_SHUTDOWN is set in sdp->sd_flags, in which case nothing is done.
 */
365static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp)
366{
367 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
368 sdp->sd_lockstruct.ls_ops->lm_others_may_mount(
369 sdp->sd_lockstruct.ls_lockspace);
370}
371
366static int init_journal(struct gfs2_sbd *sdp, int undo) 372static int init_journal(struct gfs2_sbd *sdp, int undo)
367{ 373{
368 struct gfs2_holder ji_gh; 374 struct gfs2_holder ji_gh;
@@ -542,7 +548,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
542 } 548 }
543 ip = GFS2_I(sdp->sd_rindex); 549 ip = GFS2_I(sdp->sd_rindex);
544 set_bit(GLF_STICKY, &ip->i_gl->gl_flags); 550 set_bit(GLF_STICKY, &ip->i_gl->gl_flags);
545 sdp->sd_rindex_vn = ip->i_gl->gl_vn - 1; 551 sdp->sd_rindex_uptodate = 0;
546 552
547 /* Read in the quota inode */ 553 /* Read in the quota inode */
548 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); 554 sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota");
@@ -705,6 +711,69 @@ fail:
705} 711}
706 712
707/** 713/**
714 * gfs2_lm_mount - mount a locking protocol
715 * @sdp: the filesystem
716 * @args: mount arguments
717 * @silent: if 1, don't complain if the FS isn't a GFS2 fs
718 *
719 * Returns: errno
720 */
721
722static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
723{
724 char *proto = sdp->sd_proto_name;
725 char *table = sdp->sd_table_name;
726 int flags = LM_MFLAG_CONV_NODROP;
727 int error;
728
729 if (sdp->sd_args.ar_spectator)
730 flags |= LM_MFLAG_SPECTATOR;
731
732 fs_info(sdp, "Trying to join cluster \"%s\", \"%s\"\n", proto, table);
733
734 error = gfs2_mount_lockproto(proto, table, sdp->sd_args.ar_hostdata,
735 gfs2_glock_cb, sdp,
736 GFS2_MIN_LVB_SIZE, flags,
737 &sdp->sd_lockstruct, &sdp->sd_kobj);
738 if (error) {
739 fs_info(sdp, "can't mount proto=%s, table=%s, hostdata=%s\n",
740 proto, table, sdp->sd_args.ar_hostdata);
741 goto out;
742 }
743
744 if (gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lockspace) ||
745 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_ops) ||
746 gfs2_assert_warn(sdp, sdp->sd_lockstruct.ls_lvb_size >=
747 GFS2_MIN_LVB_SIZE)) {
748 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
749 goto out;
750 }
751
752 if (sdp->sd_args.ar_spectator)
753 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
754 else
755 snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
756 sdp->sd_lockstruct.ls_jid);
757
758 fs_info(sdp, "Joined cluster. Now mounting FS...\n");
759
760 if ((sdp->sd_lockstruct.ls_flags & LM_LSFLAG_LOCAL) &&
761 !sdp->sd_args.ar_ignore_local_fs) {
762 sdp->sd_args.ar_localflocks = 1;
763 sdp->sd_args.ar_localcaching = 1;
764 }
765
766out:
767 return error;
768}
769
/*
 * gfs2_lm_unmount - unmount the lock protocol for a filesystem.
 *
 * Calls gfs2_unmount_lockproto() on sdp->sd_lockstruct unless SDF_SHUTDOWN
 * is set in sdp->sd_flags, in which case nothing is done.
 */
770void gfs2_lm_unmount(struct gfs2_sbd *sdp)
771{
772 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
773 gfs2_unmount_lockproto(&sdp->sd_lockstruct);
774}
775
776/**
708 * fill_super - Read in superblock 777 * fill_super - Read in superblock
709 * @sb: The VFS superblock 778 * @sb: The VFS superblock
710 * @data: Mount options 779 * @data: Mount options
@@ -874,7 +943,6 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
874{ 943{
875 struct kstat stat; 944 struct kstat stat;
876 struct nameidata nd; 945 struct nameidata nd;
877 struct file_system_type *fstype;
878 struct super_block *sb = NULL, *s; 946 struct super_block *sb = NULL, *s;
879 int error; 947 int error;
880 948
@@ -886,8 +954,7 @@ static struct super_block* get_gfs2_sb(const char *dev_name)
886 } 954 }
887 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); 955 error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat);
888 956
889 fstype = get_fs_type("gfs2"); 957 list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) {
890 list_for_each_entry(s, &fstype->fs_supers, s_instances) {
891 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || 958 if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) ||
892 (S_ISDIR(stat.mode) && 959 (S_ISDIR(stat.mode) &&
893 s == nd.path.dentry->d_inode->i_sb)) { 960 s == nd.path.dentry->d_inode->i_sb)) {
@@ -931,7 +998,6 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags,
931 error = PTR_ERR(new); 998 error = PTR_ERR(new);
932 goto error; 999 goto error;
933 } 1000 }
934 module_put(fs_type->owner);
935 new->s_flags = flags; 1001 new->s_flags = flags;
936 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); 1002 strlcpy(new->s_id, sb->s_id, sizeof(new->s_id));
937 sb_set_blocksize(new, sb->s_blocksize); 1003 sb_set_blocksize(new, sb->s_blocksize);
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index e87412902bed..2686ad4c0029 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -200,15 +200,15 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
200 200
201 if (alloc_required) { 201 if (alloc_required) {
202 struct gfs2_alloc *al = gfs2_alloc_get(dip); 202 struct gfs2_alloc *al = gfs2_alloc_get(dip);
203 if (!al) {
204 error = -ENOMEM;
205 goto out_gunlock;
206 }
203 207
204 error = gfs2_quota_lock(dip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 208 error = gfs2_quota_lock_check(dip);
205 if (error) 209 if (error)
206 goto out_alloc; 210 goto out_alloc;
207 211
208 error = gfs2_quota_check(dip, dip->i_inode.i_uid, dip->i_inode.i_gid);
209 if (error)
210 goto out_gunlock_q;
211
212 al->al_requested = sdp->sd_max_dirres; 212 al->al_requested = sdp->sd_max_dirres;
213 213
214 error = gfs2_inplace_reserve(dip); 214 error = gfs2_inplace_reserve(dip);
@@ -716,15 +716,15 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
716 716
717 if (alloc_required) { 717 if (alloc_required) {
718 struct gfs2_alloc *al = gfs2_alloc_get(ndip); 718 struct gfs2_alloc *al = gfs2_alloc_get(ndip);
719 if (!al) {
720 error = -ENOMEM;
721 goto out_gunlock;
722 }
719 723
720 error = gfs2_quota_lock(ndip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE); 724 error = gfs2_quota_lock_check(ndip);
721 if (error) 725 if (error)
722 goto out_alloc; 726 goto out_alloc;
723 727
724 error = gfs2_quota_check(ndip, ndip->i_inode.i_uid, ndip->i_inode.i_gid);
725 if (error)
726 goto out_gunlock_q;
727
728 al->al_requested = sdp->sd_max_dirres; 728 al->al_requested = sdp->sd_max_dirres;
729 729
730 error = gfs2_inplace_reserve(ndip); 730 error = gfs2_inplace_reserve(ndip);
@@ -898,7 +898,7 @@ static int gfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
898 int error; 898 int error;
899 int unlock = 0; 899 int unlock = 0;
900 900
901 if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { 901 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
902 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); 902 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
903 if (error) 903 if (error)
904 return error; 904 return error;
@@ -953,7 +953,8 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
953 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid) 953 if (!(attr->ia_valid & ATTR_GID) || ogid == ngid)
954 ogid = ngid = NO_QUOTA_CHANGE; 954 ogid = ngid = NO_QUOTA_CHANGE;
955 955
956 gfs2_alloc_get(ip); 956 if (!gfs2_alloc_get(ip))
957 return -ENOMEM;
957 958
958 error = gfs2_quota_lock(ip, nuid, ngid); 959 error = gfs2_quota_lock(ip, nuid, ngid);
959 if (error) 960 if (error)
@@ -981,8 +982,9 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
981 brelse(dibh); 982 brelse(dibh);
982 983
983 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) { 984 if (ouid != NO_QUOTA_CHANGE || ogid != NO_QUOTA_CHANGE) {
984 gfs2_quota_change(ip, -ip->i_di.di_blocks, ouid, ogid); 985 u64 blocks = gfs2_get_inode_blocks(&ip->i_inode);
985 gfs2_quota_change(ip, ip->i_di.di_blocks, nuid, ngid); 986 gfs2_quota_change(ip, -blocks, ouid, ogid);
987 gfs2_quota_change(ip, blocks, nuid, ngid);
986 } 988 }
987 989
988out_end_trans: 990out_end_trans:
@@ -1064,7 +1066,7 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
1064 int error; 1066 int error;
1065 int unlock = 0; 1067 int unlock = 0;
1066 1068
1067 if (gfs2_glock_is_locked_by_me(ip->i_gl) == 0) { 1069 if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
1068 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); 1070 error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
1069 if (error) 1071 if (error)
1070 return error; 1072 return error;
@@ -1148,16 +1150,6 @@ const struct inode_operations gfs2_file_iops = {
1148 .removexattr = gfs2_removexattr, 1150 .removexattr = gfs2_removexattr,
1149}; 1151};
1150 1152
1151const struct inode_operations gfs2_dev_iops = {
1152 .permission = gfs2_permission,
1153 .setattr = gfs2_setattr,
1154 .getattr = gfs2_getattr,
1155 .setxattr = gfs2_setxattr,
1156 .getxattr = gfs2_getxattr,
1157 .listxattr = gfs2_listxattr,
1158 .removexattr = gfs2_removexattr,
1159};
1160
1161const struct inode_operations gfs2_dir_iops = { 1153const struct inode_operations gfs2_dir_iops = {
1162 .create = gfs2_create, 1154 .create = gfs2_create,
1163 .lookup = gfs2_lookup, 1155 .lookup = gfs2_lookup,
diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h
index fd8cee231e1d..14b4b797622a 100644
--- a/fs/gfs2/ops_inode.h
+++ b/fs/gfs2/ops_inode.h
@@ -15,7 +15,6 @@
15extern const struct inode_operations gfs2_file_iops; 15extern const struct inode_operations gfs2_file_iops;
16extern const struct inode_operations gfs2_dir_iops; 16extern const struct inode_operations gfs2_dir_iops;
17extern const struct inode_operations gfs2_symlink_iops; 17extern const struct inode_operations gfs2_symlink_iops;
18extern const struct inode_operations gfs2_dev_iops;
19extern const struct file_operations gfs2_file_fops; 18extern const struct file_operations gfs2_file_fops;
20extern const struct file_operations gfs2_dir_fops; 19extern const struct file_operations gfs2_dir_fops;
21extern const struct file_operations gfs2_file_fops_nolock; 20extern const struct file_operations gfs2_file_fops_nolock;
diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c
index 5e524217944a..2278c68b7e35 100644
--- a/fs/gfs2/ops_super.c
+++ b/fs/gfs2/ops_super.c
@@ -25,7 +25,6 @@
25#include "incore.h" 25#include "incore.h"
26#include "glock.h" 26#include "glock.h"
27#include "inode.h" 27#include "inode.h"
28#include "lm.h"
29#include "log.h" 28#include "log.h"
30#include "mount.h" 29#include "mount.h"
31#include "ops_super.h" 30#include "ops_super.h"
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index a08dabd6ce90..56aaf915c59a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -94,7 +94,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id,
94 struct gfs2_quota_data *qd; 94 struct gfs2_quota_data *qd;
95 int error; 95 int error;
96 96
97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_KERNEL); 97 qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS);
98 if (!qd) 98 if (!qd)
99 return -ENOMEM; 99 return -ENOMEM;
100 100
@@ -616,16 +616,9 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
616 s64 value; 616 s64 value;
617 int err = -EIO; 617 int err = -EIO;
618 618
619 if (gfs2_is_stuffed(ip)) { 619 if (gfs2_is_stuffed(ip))
620 struct gfs2_alloc *al = NULL;
621 al = gfs2_alloc_get(ip);
622 /* just request 1 blk */
623 al->al_requested = 1;
624 gfs2_inplace_reserve(ip);
625 gfs2_unstuff_dinode(ip, NULL); 620 gfs2_unstuff_dinode(ip, NULL);
626 gfs2_inplace_release(ip); 621
627 gfs2_alloc_put(ip);
628 }
629 page = grab_cache_page(mapping, index); 622 page = grab_cache_page(mapping, index);
630 if (!page) 623 if (!page)
631 return -ENOMEM; 624 return -ENOMEM;
@@ -690,14 +683,14 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
690 unsigned int qx, x; 683 unsigned int qx, x;
691 struct gfs2_quota_data *qd; 684 struct gfs2_quota_data *qd;
692 loff_t offset; 685 loff_t offset;
693 unsigned int nalloc = 0; 686 unsigned int nalloc = 0, blocks;
694 struct gfs2_alloc *al = NULL; 687 struct gfs2_alloc *al = NULL;
695 int error; 688 int error;
696 689
697 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), 690 gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota),
698 &data_blocks, &ind_blocks); 691 &data_blocks, &ind_blocks);
699 692
700 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_KERNEL); 693 ghs = kcalloc(num_qd, sizeof(struct gfs2_holder), GFP_NOFS);
701 if (!ghs) 694 if (!ghs)
702 return -ENOMEM; 695 return -ENOMEM;
703 696
@@ -727,30 +720,33 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
727 nalloc++; 720 nalloc++;
728 } 721 }
729 722
730 if (nalloc) { 723 al = gfs2_alloc_get(ip);
731 al = gfs2_alloc_get(ip); 724 if (!al) {
725 error = -ENOMEM;
726 goto out_gunlock;
727 }
728 /*
729 * 1 blk for unstuffing inode if stuffed. We add this extra
730 * block to the reservation unconditionally. If the inode
731 * doesn't need unstuffing, the block will be released to the
732 * rgrp since it won't be allocated during the transaction
733 */
734 al->al_requested = 1;
735 /* +1 in the end for block requested above for unstuffing */
736 blocks = num_qd * data_blocks + RES_DINODE + num_qd + 1;
732 737
733 al->al_requested = nalloc * (data_blocks + ind_blocks); 738 if (nalloc)
739 al->al_requested += nalloc * (data_blocks + ind_blocks);
740 error = gfs2_inplace_reserve(ip);
741 if (error)
742 goto out_alloc;
734 743
735 error = gfs2_inplace_reserve(ip); 744 if (nalloc)
736 if (error) 745 blocks += al->al_rgd->rd_length + nalloc * ind_blocks + RES_STATFS;
737 goto out_alloc; 746
738 747 error = gfs2_trans_begin(sdp, blocks, 0);
739 error = gfs2_trans_begin(sdp, 748 if (error)
740 al->al_rgd->rd_length + 749 goto out_ipres;
741 num_qd * data_blocks +
742 nalloc * ind_blocks +
743 RES_DINODE + num_qd +
744 RES_STATFS, 0);
745 if (error)
746 goto out_ipres;
747 } else {
748 error = gfs2_trans_begin(sdp,
749 num_qd * data_blocks +
750 RES_DINODE + num_qd, 0);
751 if (error)
752 goto out_gunlock;
753 }
754 750
755 for (x = 0; x < num_qd; x++) { 751 for (x = 0; x < num_qd; x++) {
756 qd = qda[x]; 752 qd = qda[x];
@@ -769,11 +765,9 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
769out_end_trans: 765out_end_trans:
770 gfs2_trans_end(sdp); 766 gfs2_trans_end(sdp);
771out_ipres: 767out_ipres:
772 if (nalloc) 768 gfs2_inplace_release(ip);
773 gfs2_inplace_release(ip);
774out_alloc: 769out_alloc:
775 if (nalloc) 770 gfs2_alloc_put(ip);
776 gfs2_alloc_put(ip);
777out_gunlock: 771out_gunlock:
778 gfs2_glock_dq_uninit(&i_gh); 772 gfs2_glock_dq_uninit(&i_gh);
779out: 773out:
@@ -1124,12 +1118,12 @@ int gfs2_quota_init(struct gfs2_sbd *sdp)
1124 error = -ENOMEM; 1118 error = -ENOMEM;
1125 1119
1126 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks, 1120 sdp->sd_quota_bitmap = kcalloc(sdp->sd_quota_chunks,
1127 sizeof(unsigned char *), GFP_KERNEL); 1121 sizeof(unsigned char *), GFP_NOFS);
1128 if (!sdp->sd_quota_bitmap) 1122 if (!sdp->sd_quota_bitmap)
1129 return error; 1123 return error;
1130 1124
1131 for (x = 0; x < sdp->sd_quota_chunks; x++) { 1125 for (x = 0; x < sdp->sd_quota_chunks; x++) {
1132 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_KERNEL); 1126 sdp->sd_quota_bitmap[x] = kzalloc(PAGE_SIZE, GFP_NOFS);
1133 if (!sdp->sd_quota_bitmap[x]) 1127 if (!sdp->sd_quota_bitmap[x])
1134 goto fail; 1128 goto fail;
1135 } 1129 }
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index a8be1417051f..3b7f4b0e5dfe 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -32,4 +32,21 @@ int gfs2_quota_init(struct gfs2_sbd *sdp);
32void gfs2_quota_scan(struct gfs2_sbd *sdp); 32void gfs2_quota_scan(struct gfs2_sbd *sdp);
33void gfs2_quota_cleanup(struct gfs2_sbd *sdp); 33void gfs2_quota_cleanup(struct gfs2_sbd *sdp);
34 34
35static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
36{
37 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
38 int ret;
39 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
40 return 0;
41 ret = gfs2_quota_lock(ip, NO_QUOTA_CHANGE, NO_QUOTA_CHANGE);
42 if (ret)
43 return ret;
44 if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
45 return 0;
46 ret = gfs2_quota_check(ip, ip->i_inode.i_uid, ip->i_inode.i_gid);
47 if (ret)
48 gfs2_quota_unlock(ip);
49 return ret;
50}
51
35#endif /* __QUOTA_DOT_H__ */ 52#endif /* __QUOTA_DOT_H__ */
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 6fb07d67ca8a..2888e4b4b1c5 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -20,7 +20,6 @@
20#include "bmap.h" 20#include "bmap.h"
21#include "glock.h" 21#include "glock.h"
22#include "glops.h" 22#include "glops.h"
23#include "lm.h"
24#include "lops.h" 23#include "lops.h"
25#include "meta_io.h" 24#include "meta_io.h"
26#include "recovery.h" 25#include "recovery.h"
@@ -69,7 +68,7 @@ int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where)
69 return 0; 68 return 0;
70 } 69 }
71 70
72 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_KERNEL); 71 rr = kmalloc(sizeof(struct gfs2_revoke_replay), GFP_NOFS);
73 if (!rr) 72 if (!rr)
74 return -ENOMEM; 73 return -ENOMEM;
75 74
@@ -150,7 +149,7 @@ static int get_log_header(struct gfs2_jdesc *jd, unsigned int blk,
150 struct gfs2_log_header_host *head) 149 struct gfs2_log_header_host *head)
151{ 150{
152 struct buffer_head *bh; 151 struct buffer_head *bh;
153 struct gfs2_log_header_host lh; 152 struct gfs2_log_header_host uninitialized_var(lh);
154 const u32 nothing = 0; 153 const u32 nothing = 0;
155 u32 hash; 154 u32 hash;
156 int error; 155 int error;
@@ -425,6 +424,16 @@ static int clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *hea
425 return error; 424 return error;
426} 425}
427 426
427
428static void gfs2_lm_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
429 unsigned int message)
430{
431 if (likely(!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)))
432 sdp->sd_lockstruct.ls_ops->lm_recovery_done(
433 sdp->sd_lockstruct.ls_lockspace, jid, message);
434}
435
436
428/** 437/**
429 * gfs2_recover_journal - recovery a given journal 438 * gfs2_recover_journal - recovery a given journal
430 * @jd: the struct gfs2_jdesc describing the journal 439 * @jd: the struct gfs2_jdesc describing the journal
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3552110b2e5f..7e8f0b1d6c6e 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -14,6 +14,7 @@
14#include <linux/fs.h> 14#include <linux/fs.h>
15#include <linux/gfs2_ondisk.h> 15#include <linux/gfs2_ondisk.h>
16#include <linux/lm_interface.h> 16#include <linux/lm_interface.h>
17#include <linux/prefetch.h>
17 18
18#include "gfs2.h" 19#include "gfs2.h"
19#include "incore.h" 20#include "incore.h"
@@ -33,6 +34,16 @@
33#define BFITNOENT ((u32)~0) 34#define BFITNOENT ((u32)~0)
34#define NO_BLOCK ((u64)~0) 35#define NO_BLOCK ((u64)~0)
35 36
37#if BITS_PER_LONG == 32
38#define LBITMASK (0x55555555UL)
39#define LBITSKIP55 (0x55555555UL)
40#define LBITSKIP00 (0x00000000UL)
41#else
42#define LBITMASK (0x5555555555555555UL)
43#define LBITSKIP55 (0x5555555555555555UL)
44#define LBITSKIP00 (0x0000000000000000UL)
45#endif
46
36/* 47/*
37 * These routines are used by the resource group routines (rgrp.c) 48 * These routines are used by the resource group routines (rgrp.c)
38 * to keep track of block allocation. Each block is represented by two 49 * to keep track of block allocation. Each block is represented by two
@@ -53,7 +64,8 @@ static const char valid_change[16] = {
53}; 64};
54 65
55static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, 66static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
56 unsigned char old_state, unsigned char new_state); 67 unsigned char old_state, unsigned char new_state,
68 unsigned int *n);
57 69
58/** 70/**
59 * gfs2_setbit - Set a bit in the bitmaps 71 * gfs2_setbit - Set a bit in the bitmaps
@@ -64,26 +76,32 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
64 * 76 *
65 */ 77 */
66 78
67static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, 79static inline void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buf1,
68 unsigned int buflen, u32 block, 80 unsigned char *buf2, unsigned int offset,
69 unsigned char new_state) 81 unsigned int buflen, u32 block,
82 unsigned char new_state)
70{ 83{
71 unsigned char *byte, *end, cur_state; 84 unsigned char *byte1, *byte2, *end, cur_state;
72 unsigned int bit; 85 const unsigned int bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE;
73 86
74 byte = buffer + (block / GFS2_NBBY); 87 byte1 = buf1 + offset + (block / GFS2_NBBY);
75 bit = (block % GFS2_NBBY) * GFS2_BIT_SIZE; 88 end = buf1 + offset + buflen;
76 end = buffer + buflen;
77 89
78 gfs2_assert(rgd->rd_sbd, byte < end); 90 BUG_ON(byte1 >= end);
79 91
80 cur_state = (*byte >> bit) & GFS2_BIT_MASK; 92 cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
81 93
82 if (valid_change[new_state * 4 + cur_state]) { 94 if (unlikely(!valid_change[new_state * 4 + cur_state])) {
83 *byte ^= cur_state << bit;
84 *byte |= new_state << bit;
85 } else
86 gfs2_consist_rgrpd(rgd); 95 gfs2_consist_rgrpd(rgd);
96 return;
97 }
98 *byte1 ^= (cur_state ^ new_state) << bit;
99
100 if (buf2) {
101 byte2 = buf2 + offset + (block / GFS2_NBBY);
102 cur_state = (*byte2 >> bit) & GFS2_BIT_MASK;
103 *byte2 ^= (cur_state ^ new_state) << bit;
104 }
87} 105}
88 106
89/** 107/**
@@ -94,10 +112,12 @@ static void gfs2_setbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
94 * 112 *
95 */ 113 */
96 114
97static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer, 115static inline unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd,
98 unsigned int buflen, u32 block) 116 const unsigned char *buffer,
117 unsigned int buflen, u32 block)
99{ 118{
100 unsigned char *byte, *end, cur_state; 119 const unsigned char *byte, *end;
120 unsigned char cur_state;
101 unsigned int bit; 121 unsigned int bit;
102 122
103 byte = buffer + (block / GFS2_NBBY); 123 byte = buffer + (block / GFS2_NBBY);
@@ -126,47 +146,66 @@ static unsigned char gfs2_testbit(struct gfs2_rgrpd *rgd, unsigned char *buffer,
126 * Return: the block number (bitmap buffer scope) that was found 146 * Return: the block number (bitmap buffer scope) that was found
127 */ 147 */
128 148
129static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal, 149static u32 gfs2_bitfit(const u8 *buffer, unsigned int buflen, u32 goal,
130 unsigned char old_state) 150 u8 old_state)
131{ 151{
132 unsigned char *byte; 152 const u8 *byte, *start, *end;
133 u32 blk = goal; 153 int bit, startbit;
134 unsigned int bit, bitlong; 154 u32 g1, g2, misaligned;
135 unsigned long *plong, plong55; 155 unsigned long *plong;
136 156 unsigned long lskipval;
137 byte = buffer + (goal / GFS2_NBBY); 157
138 plong = (unsigned long *)(buffer + (goal / GFS2_NBBY)); 158 lskipval = (old_state & GFS2_BLKST_USED) ? LBITSKIP00 : LBITSKIP55;
139 bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE; 159 g1 = (goal / GFS2_NBBY);
140 bitlong = bit; 160 start = buffer + g1;
141#if BITS_PER_LONG == 32 161 byte = start;
142 plong55 = 0x55555555; 162 end = buffer + buflen;
143#else 163 g2 = ALIGN(g1, sizeof(unsigned long));
144 plong55 = 0x5555555555555555; 164 plong = (unsigned long *)(buffer + g2);
145#endif 165 startbit = bit = (goal % GFS2_NBBY) * GFS2_BIT_SIZE;
146 while (byte < buffer + buflen) { 166 misaligned = g2 - g1;
147 167 if (!misaligned)
148 if (bitlong == 0 && old_state == 0 && *plong == plong55) { 168 goto ulong_aligned;
149 plong++; 169/* parse the bitmap a byte at a time */
150 byte += sizeof(unsigned long); 170misaligned:
151 blk += sizeof(unsigned long) * GFS2_NBBY; 171 while (byte < end) {
152 continue; 172 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state) {
173 return goal +
174 (((byte - start) * GFS2_NBBY) +
175 ((bit - startbit) >> 1));
153 } 176 }
154 if (((*byte >> bit) & GFS2_BIT_MASK) == old_state)
155 return blk;
156 bit += GFS2_BIT_SIZE; 177 bit += GFS2_BIT_SIZE;
157 if (bit >= 8) { 178 if (bit >= GFS2_NBBY * GFS2_BIT_SIZE) {
158 bit = 0; 179 bit = 0;
159 byte++; 180 byte++;
181 misaligned--;
182 if (!misaligned) {
183 plong = (unsigned long *)byte;
184 goto ulong_aligned;
185 }
160 } 186 }
161 bitlong += GFS2_BIT_SIZE;
162 if (bitlong >= sizeof(unsigned long) * 8) {
163 bitlong = 0;
164 plong++;
165 }
166
167 blk++;
168 } 187 }
188 return BFITNOENT;
169 189
190/* parse the bitmap a unsigned long at a time */
191ulong_aligned:
192 /* Stop at "end - 1" or else prefetch can go past the end and segfault.
193 We could "if" it but we'd lose some of the performance gained.
194 This way will only slow down searching the very last 4/8 bytes
195 depending on architecture. I've experimented with several ways
196 of writing this section such as using an else before the goto
197 but this one seems to be the fastest. */
198 while ((unsigned char *)plong < end - 1) {
199 prefetch(plong + 1);
200 if (((*plong) & LBITMASK) != lskipval)
201 break;
202 plong++;
203 }
204 if ((unsigned char *)plong < end) {
205 byte = (const u8 *)plong;
206 misaligned += sizeof(unsigned long) - 1;
207 goto misaligned;
208 }
170 return BFITNOENT; 209 return BFITNOENT;
171} 210}
172 211
@@ -179,14 +218,14 @@ static u32 gfs2_bitfit(unsigned char *buffer, unsigned int buflen, u32 goal,
179 * Returns: The number of bits 218 * Returns: The number of bits
180 */ 219 */
181 220
182static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, unsigned char *buffer, 221static u32 gfs2_bitcount(struct gfs2_rgrpd *rgd, const u8 *buffer,
183 unsigned int buflen, unsigned char state) 222 unsigned int buflen, u8 state)
184{ 223{
185 unsigned char *byte = buffer; 224 const u8 *byte = buffer;
186 unsigned char *end = buffer + buflen; 225 const u8 *end = buffer + buflen;
187 unsigned char state1 = state << 2; 226 const u8 state1 = state << 2;
188 unsigned char state2 = state << 4; 227 const u8 state2 = state << 4;
189 unsigned char state3 = state << 6; 228 const u8 state3 = state << 6;
190 u32 count = 0; 229 u32 count = 0;
191 230
192 for (; byte < end; byte++) { 231 for (; byte < end; byte++) {
@@ -353,7 +392,7 @@ static void clear_rgrpdi(struct gfs2_sbd *sdp)
353 } 392 }
354 393
355 kfree(rgd->rd_bits); 394 kfree(rgd->rd_bits);
356 kfree(rgd); 395 kmem_cache_free(gfs2_rgrpd_cachep, rgd);
357 } 396 }
358} 397}
359 398
@@ -516,7 +555,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
516 return error; 555 return error;
517 } 556 }
518 557
519 rgd = kzalloc(sizeof(struct gfs2_rgrpd), GFP_NOFS); 558 rgd = kmem_cache_zalloc(gfs2_rgrpd_cachep, GFP_NOFS);
520 error = -ENOMEM; 559 error = -ENOMEM;
521 if (!rgd) 560 if (!rgd)
522 return error; 561 return error;
@@ -539,7 +578,7 @@ static int read_rindex_entry(struct gfs2_inode *ip,
539 return error; 578 return error;
540 579
541 rgd->rd_gl->gl_object = rgd; 580 rgd->rd_gl->gl_object = rgd;
542 rgd->rd_rg_vn = rgd->rd_gl->gl_vn - 1; 581 rgd->rd_flags &= ~GFS2_RDF_UPTODATE;
543 rgd->rd_flags |= GFS2_RDF_CHECK; 582 rgd->rd_flags |= GFS2_RDF_CHECK;
544 return error; 583 return error;
545} 584}
@@ -575,7 +614,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip)
575 } 614 }
576 } 615 }
577 616
578 sdp->sd_rindex_vn = ip->i_gl->gl_vn; 617 sdp->sd_rindex_uptodate = 1;
579 return 0; 618 return 0;
580} 619}
581 620
@@ -609,7 +648,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip)
609 } 648 }
610 } 649 }
611 650
612 sdp->sd_rindex_vn = ip->i_gl->gl_vn; 651 sdp->sd_rindex_uptodate = 1;
613 return 0; 652 return 0;
614} 653}
615 654
@@ -642,9 +681,9 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
642 return error; 681 return error;
643 682
644 /* Read new copy from disk if we don't have the latest */ 683 /* Read new copy from disk if we don't have the latest */
645 if (sdp->sd_rindex_vn != gl->gl_vn) { 684 if (!sdp->sd_rindex_uptodate) {
646 mutex_lock(&sdp->sd_rindex_mutex); 685 mutex_lock(&sdp->sd_rindex_mutex);
647 if (sdp->sd_rindex_vn != gl->gl_vn) { 686 if (!sdp->sd_rindex_uptodate) {
648 error = gfs2_ri_update(ip); 687 error = gfs2_ri_update(ip);
649 if (error) 688 if (error)
650 gfs2_glock_dq_uninit(ri_gh); 689 gfs2_glock_dq_uninit(ri_gh);
@@ -655,21 +694,31 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh)
655 return error; 694 return error;
656} 695}
657 696
658static void gfs2_rgrp_in(struct gfs2_rgrp_host *rg, const void *buf) 697static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf)
659{ 698{
660 const struct gfs2_rgrp *str = buf; 699 const struct gfs2_rgrp *str = buf;
700 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
701 u32 rg_flags;
661 702
662 rg->rg_flags = be32_to_cpu(str->rg_flags); 703 rg_flags = be32_to_cpu(str->rg_flags);
704 if (rg_flags & GFS2_RGF_NOALLOC)
705 rgd->rd_flags |= GFS2_RDF_NOALLOC;
706 else
707 rgd->rd_flags &= ~GFS2_RDF_NOALLOC;
663 rg->rg_free = be32_to_cpu(str->rg_free); 708 rg->rg_free = be32_to_cpu(str->rg_free);
664 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); 709 rg->rg_dinodes = be32_to_cpu(str->rg_dinodes);
665 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); 710 rg->rg_igeneration = be64_to_cpu(str->rg_igeneration);
666} 711}
667 712
668static void gfs2_rgrp_out(const struct gfs2_rgrp_host *rg, void *buf) 713static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf)
669{ 714{
670 struct gfs2_rgrp *str = buf; 715 struct gfs2_rgrp *str = buf;
716 struct gfs2_rgrp_host *rg = &rgd->rd_rg;
717 u32 rg_flags = 0;
671 718
672 str->rg_flags = cpu_to_be32(rg->rg_flags); 719 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
720 rg_flags |= GFS2_RGF_NOALLOC;
721 str->rg_flags = cpu_to_be32(rg_flags);
673 str->rg_free = cpu_to_be32(rg->rg_free); 722 str->rg_free = cpu_to_be32(rg->rg_free);
674 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); 723 str->rg_dinodes = cpu_to_be32(rg->rg_dinodes);
675 str->__pad = cpu_to_be32(0); 724 str->__pad = cpu_to_be32(0);
@@ -726,9 +775,9 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
726 } 775 }
727 } 776 }
728 777
729 if (rgd->rd_rg_vn != gl->gl_vn) { 778 if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) {
730 gfs2_rgrp_in(&rgd->rd_rg, (rgd->rd_bits[0].bi_bh)->b_data); 779 gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data);
731 rgd->rd_rg_vn = gl->gl_vn; 780 rgd->rd_flags |= GFS2_RDF_UPTODATE;
732 } 781 }
733 782
734 spin_lock(&sdp->sd_rindex_spin); 783 spin_lock(&sdp->sd_rindex_spin);
@@ -840,7 +889,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al)
840 struct gfs2_sbd *sdp = rgd->rd_sbd; 889 struct gfs2_sbd *sdp = rgd->rd_sbd;
841 int ret = 0; 890 int ret = 0;
842 891
843 if (rgd->rd_rg.rg_flags & GFS2_RGF_NOALLOC) 892 if (rgd->rd_flags & GFS2_RDF_NOALLOC)
844 return 0; 893 return 0;
845 894
846 spin_lock(&sdp->sd_rindex_spin); 895 spin_lock(&sdp->sd_rindex_spin);
@@ -866,13 +915,15 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
866 u32 goal = 0, block; 915 u32 goal = 0, block;
867 u64 no_addr; 916 u64 no_addr;
868 struct gfs2_sbd *sdp = rgd->rd_sbd; 917 struct gfs2_sbd *sdp = rgd->rd_sbd;
918 unsigned int n;
869 919
870 for(;;) { 920 for(;;) {
871 if (goal >= rgd->rd_data) 921 if (goal >= rgd->rd_data)
872 break; 922 break;
873 down_write(&sdp->sd_log_flush_lock); 923 down_write(&sdp->sd_log_flush_lock);
924 n = 1;
874 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED, 925 block = rgblk_search(rgd, goal, GFS2_BLKST_UNLINKED,
875 GFS2_BLKST_UNLINKED); 926 GFS2_BLKST_UNLINKED, &n);
876 up_write(&sdp->sd_log_flush_lock); 927 up_write(&sdp->sd_log_flush_lock);
877 if (block == BFITNOENT) 928 if (block == BFITNOENT)
878 break; 929 break;
@@ -904,24 +955,20 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked)
904static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp, 955static struct gfs2_rgrpd *recent_rgrp_first(struct gfs2_sbd *sdp,
905 u64 rglast) 956 u64 rglast)
906{ 957{
907 struct gfs2_rgrpd *rgd = NULL; 958 struct gfs2_rgrpd *rgd;
908 959
909 spin_lock(&sdp->sd_rindex_spin); 960 spin_lock(&sdp->sd_rindex_spin);
910 961
911 if (list_empty(&sdp->sd_rindex_recent_list)) 962 if (rglast) {
912 goto out; 963 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
913 964 if (rgrp_contains_block(rgd, rglast))
914 if (!rglast) 965 goto out;
915 goto first; 966 }
916
917 list_for_each_entry(rgd, &sdp->sd_rindex_recent_list, rd_recent) {
918 if (rgd->rd_addr == rglast)
919 goto out;
920 } 967 }
921 968 rgd = NULL;
922first: 969 if (!list_empty(&sdp->sd_rindex_recent_list))
923 rgd = list_entry(sdp->sd_rindex_recent_list.next, struct gfs2_rgrpd, 970 rgd = list_entry(sdp->sd_rindex_recent_list.next,
924 rd_recent); 971 struct gfs2_rgrpd, rd_recent);
925out: 972out:
926 spin_unlock(&sdp->sd_rindex_spin); 973 spin_unlock(&sdp->sd_rindex_spin);
927 return rgd; 974 return rgd;
@@ -1067,7 +1114,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1067 1114
1068 /* Try recently successful rgrps */ 1115 /* Try recently successful rgrps */
1069 1116
1070 rgd = recent_rgrp_first(sdp, ip->i_last_rg_alloc); 1117 rgd = recent_rgrp_first(sdp, ip->i_goal);
1071 1118
1072 while (rgd) { 1119 while (rgd) {
1073 rg_locked = 0; 1120 rg_locked = 0;
@@ -1151,8 +1198,6 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked)
1151 } 1198 }
1152 1199
1153out: 1200out:
1154 ip->i_last_rg_alloc = rgd->rd_addr;
1155
1156 if (begin) { 1201 if (begin) {
1157 recent_rgrp_add(rgd); 1202 recent_rgrp_add(rgd);
1158 rgd = gfs2_rgrpd_get_next(rgd); 1203 rgd = gfs2_rgrpd_get_next(rgd);
@@ -1275,6 +1320,7 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1275 * @goal: the goal block within the RG (start here to search for avail block) 1320 * @goal: the goal block within the RG (start here to search for avail block)
1276 * @old_state: GFS2_BLKST_XXX the before-allocation state to find 1321 * @old_state: GFS2_BLKST_XXX the before-allocation state to find
1277 * @new_state: GFS2_BLKST_XXX the after-allocation block state 1322 * @new_state: GFS2_BLKST_XXX the after-allocation block state
1323 * @n: The extent length
1278 * 1324 *
1279 * Walk rgrp's bitmap to find bits that represent a block in @old_state. 1325 * Walk rgrp's bitmap to find bits that represent a block in @old_state.
1280 * Add the found bitmap buffer to the transaction. 1326 * Add the found bitmap buffer to the transaction.
@@ -1290,13 +1336,17 @@ unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block)
1290 */ 1336 */
1291 1337
1292static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, 1338static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1293 unsigned char old_state, unsigned char new_state) 1339 unsigned char old_state, unsigned char new_state,
1340 unsigned int *n)
1294{ 1341{
1295 struct gfs2_bitmap *bi = NULL; 1342 struct gfs2_bitmap *bi = NULL;
1296 u32 length = rgd->rd_length; 1343 const u32 length = rgd->rd_length;
1297 u32 blk = 0; 1344 u32 blk = 0;
1298 unsigned int buf, x; 1345 unsigned int buf, x;
1346 const unsigned int elen = *n;
1347 const u8 *buffer;
1299 1348
1349 *n = 0;
1300 /* Find bitmap block that contains bits for goal block */ 1350 /* Find bitmap block that contains bits for goal block */
1301 for (buf = 0; buf < length; buf++) { 1351 for (buf = 0; buf < length; buf++) {
1302 bi = rgd->rd_bits + buf; 1352 bi = rgd->rd_bits + buf;
@@ -1317,12 +1367,11 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1317 for (x = 0; x <= length; x++) { 1367 for (x = 0; x <= length; x++) {
1318 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone 1368 /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone
1319 bitmaps, so we must search the originals for that. */ 1369 bitmaps, so we must search the originals for that. */
1370 buffer = bi->bi_bh->b_data + bi->bi_offset;
1320 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone) 1371 if (old_state != GFS2_BLKST_UNLINKED && bi->bi_clone)
1321 blk = gfs2_bitfit(bi->bi_clone + bi->bi_offset, 1372 buffer = bi->bi_clone + bi->bi_offset;
1322 bi->bi_len, goal, old_state); 1373
1323 else 1374 blk = gfs2_bitfit(buffer, bi->bi_len, goal, old_state);
1324 blk = gfs2_bitfit(bi->bi_bh->b_data + bi->bi_offset,
1325 bi->bi_len, goal, old_state);
1326 if (blk != BFITNOENT) 1375 if (blk != BFITNOENT)
1327 break; 1376 break;
1328 1377
@@ -1333,12 +1382,23 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal,
1333 } 1382 }
1334 1383
1335 if (blk != BFITNOENT && old_state != new_state) { 1384 if (blk != BFITNOENT && old_state != new_state) {
1385 *n = 1;
1336 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1386 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1337 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1387 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset,
1338 bi->bi_len, blk, new_state); 1388 bi->bi_len, blk, new_state);
1339 if (bi->bi_clone) 1389 goal = blk;
1340 gfs2_setbit(rgd, bi->bi_clone + bi->bi_offset, 1390 while (*n < elen) {
1341 bi->bi_len, blk, new_state); 1391 goal++;
1392 if (goal >= (bi->bi_len * GFS2_NBBY))
1393 break;
1394 if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) !=
1395 GFS2_BLKST_FREE)
1396 break;
1397 gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone,
1398 bi->bi_offset, bi->bi_len, goal,
1399 new_state);
1400 (*n)++;
1401 }
1342 } 1402 }
1343 1403
1344 return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; 1404 return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk;
@@ -1393,7 +1453,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1393 bi->bi_len); 1453 bi->bi_len);
1394 } 1454 }
1395 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); 1455 gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1);
1396 gfs2_setbit(rgd, bi->bi_bh->b_data + bi->bi_offset, 1456 gfs2_setbit(rgd, bi->bi_bh->b_data, NULL, bi->bi_offset,
1397 bi->bi_len, buf_blk, new_state); 1457 bi->bi_len, buf_blk, new_state);
1398 } 1458 }
1399 1459
@@ -1401,13 +1461,13 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
1401} 1461}
1402 1462
1403/** 1463/**
1404 * gfs2_alloc_data - Allocate a data block 1464 * gfs2_alloc_block - Allocate a block
1405 * @ip: the inode to allocate the data block for 1465 * @ip: the inode to allocate the block for
1406 * 1466 *
1407 * Returns: the allocated block 1467 * Returns: the allocated block
1408 */ 1468 */
1409 1469
1410u64 gfs2_alloc_data(struct gfs2_inode *ip) 1470u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n)
1411{ 1471{
1412 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); 1472 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1413 struct gfs2_alloc *al = ip->i_alloc; 1473 struct gfs2_alloc *al = ip->i_alloc;
@@ -1415,77 +1475,31 @@ u64 gfs2_alloc_data(struct gfs2_inode *ip)
1415 u32 goal, blk; 1475 u32 goal, blk;
1416 u64 block; 1476 u64 block;
1417 1477
1418 if (rgrp_contains_block(rgd, ip->i_di.di_goal_data)) 1478 if (rgrp_contains_block(rgd, ip->i_goal))
1419 goal = ip->i_di.di_goal_data - rgd->rd_data0; 1479 goal = ip->i_goal - rgd->rd_data0;
1420 else 1480 else
1421 goal = rgd->rd_last_alloc_data; 1481 goal = rgd->rd_last_alloc;
1422 1482
1423 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED); 1483 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n);
1424 BUG_ON(blk == BFITNOENT); 1484 BUG_ON(blk == BFITNOENT);
1425 rgd->rd_last_alloc_data = blk;
1426 1485
1486 rgd->rd_last_alloc = blk;
1427 block = rgd->rd_data0 + blk; 1487 block = rgd->rd_data0 + blk;
1428 ip->i_di.di_goal_data = block; 1488 ip->i_goal = block;
1429 1489
1430 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); 1490 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n);
1431 rgd->rd_rg.rg_free--; 1491 rgd->rd_rg.rg_free -= *n;
1432 1492
1433 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1493 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1434 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1494 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1435 1495
1436 al->al_alloced++; 1496 al->al_alloced += *n;
1437 1497
1438 gfs2_statfs_change(sdp, 0, -1, 0); 1498 gfs2_statfs_change(sdp, 0, -*n, 0);
1439 gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid); 1499 gfs2_quota_change(ip, *n, ip->i_inode.i_uid, ip->i_inode.i_gid);
1440 1500
1441 spin_lock(&sdp->sd_rindex_spin); 1501 spin_lock(&sdp->sd_rindex_spin);
1442 rgd->rd_free_clone--; 1502 rgd->rd_free_clone -= *n;
1443 spin_unlock(&sdp->sd_rindex_spin);
1444
1445 return block;
1446}
1447
1448/**
1449 * gfs2_alloc_meta - Allocate a metadata block
1450 * @ip: the inode to allocate the metadata block for
1451 *
1452 * Returns: the allocated block
1453 */
1454
1455u64 gfs2_alloc_meta(struct gfs2_inode *ip)
1456{
1457 struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1458 struct gfs2_alloc *al = ip->i_alloc;
1459 struct gfs2_rgrpd *rgd = al->al_rgd;
1460 u32 goal, blk;
1461 u64 block;
1462
1463 if (rgrp_contains_block(rgd, ip->i_di.di_goal_meta))
1464 goal = ip->i_di.di_goal_meta - rgd->rd_data0;
1465 else
1466 goal = rgd->rd_last_alloc_meta;
1467
1468 blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED);
1469 BUG_ON(blk == BFITNOENT);
1470 rgd->rd_last_alloc_meta = blk;
1471
1472 block = rgd->rd_data0 + blk;
1473 ip->i_di.di_goal_meta = block;
1474
1475 gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free);
1476 rgd->rd_rg.rg_free--;
1477
1478 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1479 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data);
1480
1481 al->al_alloced++;
1482
1483 gfs2_statfs_change(sdp, 0, -1, 0);
1484 gfs2_quota_change(ip, +1, ip->i_inode.i_uid, ip->i_inode.i_gid);
1485 gfs2_trans_add_unrevoke(sdp, block);
1486
1487 spin_lock(&sdp->sd_rindex_spin);
1488 rgd->rd_free_clone--;
1489 spin_unlock(&sdp->sd_rindex_spin); 1503 spin_unlock(&sdp->sd_rindex_spin);
1490 1504
1491 return block; 1505 return block;
@@ -1505,12 +1519,13 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1505 struct gfs2_rgrpd *rgd = al->al_rgd; 1519 struct gfs2_rgrpd *rgd = al->al_rgd;
1506 u32 blk; 1520 u32 blk;
1507 u64 block; 1521 u64 block;
1522 unsigned int n = 1;
1508 1523
1509 blk = rgblk_search(rgd, rgd->rd_last_alloc_meta, 1524 blk = rgblk_search(rgd, rgd->rd_last_alloc,
1510 GFS2_BLKST_FREE, GFS2_BLKST_DINODE); 1525 GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n);
1511 BUG_ON(blk == BFITNOENT); 1526 BUG_ON(blk == BFITNOENT);
1512 1527
1513 rgd->rd_last_alloc_meta = blk; 1528 rgd->rd_last_alloc = blk;
1514 1529
1515 block = rgd->rd_data0 + blk; 1530 block = rgd->rd_data0 + blk;
1516 1531
@@ -1519,12 +1534,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation)
1519 rgd->rd_rg.rg_dinodes++; 1534 rgd->rd_rg.rg_dinodes++;
1520 *generation = rgd->rd_rg.rg_igeneration++; 1535 *generation = rgd->rd_rg.rg_igeneration++;
1521 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1536 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1522 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1537 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1523 1538
1524 al->al_alloced++; 1539 al->al_alloced++;
1525 1540
1526 gfs2_statfs_change(sdp, 0, -1, +1); 1541 gfs2_statfs_change(sdp, 0, -1, +1);
1527 gfs2_trans_add_unrevoke(sdp, block); 1542 gfs2_trans_add_unrevoke(sdp, block, 1);
1528 1543
1529 spin_lock(&sdp->sd_rindex_spin); 1544 spin_lock(&sdp->sd_rindex_spin);
1530 rgd->rd_free_clone--; 1545 rgd->rd_free_clone--;
@@ -1553,7 +1568,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen)
1553 rgd->rd_rg.rg_free += blen; 1568 rgd->rd_rg.rg_free += blen;
1554 1569
1555 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1570 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1556 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1571 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1557 1572
1558 gfs2_trans_add_rg(rgd); 1573 gfs2_trans_add_rg(rgd);
1559 1574
@@ -1581,7 +1596,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
1581 rgd->rd_rg.rg_free += blen; 1596 rgd->rd_rg.rg_free += blen;
1582 1597
1583 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1598 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1584 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1599 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1585 1600
1586 gfs2_trans_add_rg(rgd); 1601 gfs2_trans_add_rg(rgd);
1587 1602
@@ -1601,7 +1616,7 @@ void gfs2_unlink_di(struct inode *inode)
1601 if (!rgd) 1616 if (!rgd)
1602 return; 1617 return;
1603 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1618 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1604 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1619 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1605 gfs2_trans_add_rg(rgd); 1620 gfs2_trans_add_rg(rgd);
1606} 1621}
1607 1622
@@ -1621,7 +1636,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno)
1621 rgd->rd_rg.rg_free++; 1636 rgd->rd_rg.rg_free++;
1622 1637
1623 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); 1638 gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1);
1624 gfs2_rgrp_out(&rgd->rd_rg, rgd->rd_bits[0].bi_bh->b_data); 1639 gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
1625 1640
1626 gfs2_statfs_change(sdp, 0, +1, -1); 1641 gfs2_statfs_change(sdp, 0, +1, -1);
1627 gfs2_trans_add_rg(rgd); 1642 gfs2_trans_add_rg(rgd);
@@ -1699,8 +1714,7 @@ void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
1699 * 1714 *
1700 */ 1715 */
1701 1716
1702void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, 1717void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
1703 int flags)
1704{ 1718{
1705 unsigned int x; 1719 unsigned int x;
1706 1720
@@ -1708,7 +1722,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state,
1708 GFP_NOFS | __GFP_NOFAIL); 1722 GFP_NOFS | __GFP_NOFAIL);
1709 for (x = 0; x < rlist->rl_rgrps; x++) 1723 for (x = 0; x < rlist->rl_rgrps; x++)
1710 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl, 1724 gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
1711 state, flags, 1725 state, 0,
1712 &rlist->rl_ghs[x]); 1726 &rlist->rl_ghs[x]);
1713} 1727}
1714 1728
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 149bb161f4b6..3181c7e624bf 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. 2 * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved.
3 * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This copyrighted material is made available to anyone wishing to use, 5 * This copyrighted material is made available to anyone wishing to use,
6 * modify, copy, or redistribute it subject to the terms and conditions 6 * modify, copy, or redistribute it subject to the terms and conditions
@@ -46,8 +46,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip);
46 46
47unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); 47unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block);
48 48
49u64 gfs2_alloc_data(struct gfs2_inode *ip); 49u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n);
50u64 gfs2_alloc_meta(struct gfs2_inode *ip);
51u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); 50u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation);
52 51
53void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); 52void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen);
@@ -64,8 +63,7 @@ struct gfs2_rgrp_list {
64 63
65void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, 64void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist,
66 u64 block); 65 u64 block);
67void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state, 66void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
68 int flags);
69void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); 67void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
70u64 gfs2_ri_total(struct gfs2_sbd *sdp); 68u64 gfs2_ri_total(struct gfs2_sbd *sdp);
71 69
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index ef0562c3bc71..7aeacbc65f35 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -210,7 +210,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
210 struct page *page; 210 struct page *page;
211 struct bio *bio; 211 struct bio *bio;
212 212
213 page = alloc_page(GFP_KERNEL); 213 page = alloc_page(GFP_NOFS);
214 if (unlikely(!page)) 214 if (unlikely(!page))
215 return -ENOBUFS; 215 return -ENOBUFS;
216 216
@@ -218,7 +218,7 @@ int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector)
218 ClearPageDirty(page); 218 ClearPageDirty(page);
219 lock_page(page); 219 lock_page(page);
220 220
221 bio = bio_alloc(GFP_KERNEL, 1); 221 bio = bio_alloc(GFP_NOFS, 1);
222 if (unlikely(!bio)) { 222 if (unlikely(!bio)) {
223 __free_page(page); 223 __free_page(page);
224 return -ENOBUFS; 224 return -ENOBUFS;
@@ -316,6 +316,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
316 sdp->sd_heightsize[x] = space; 316 sdp->sd_heightsize[x] = space;
317 } 317 }
318 sdp->sd_max_height = x; 318 sdp->sd_max_height = x;
319 sdp->sd_heightsize[x] = ~0;
319 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); 320 gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT);
320 321
321 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - 322 sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize -
@@ -334,6 +335,7 @@ int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent)
334 sdp->sd_jheightsize[x] = space; 335 sdp->sd_jheightsize[x] = space;
335 } 336 }
336 sdp->sd_max_jheight = x; 337 sdp->sd_max_jheight = x;
338 sdp->sd_jheightsize[x] = ~0;
337 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); 339 gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
338 340
339 return 0; 341 return 0;
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h
index 60a870e430be..44361ecc44f7 100644
--- a/fs/gfs2/super.h
+++ b/fs/gfs2/super.h
@@ -17,6 +17,7 @@ void gfs2_tune_init(struct gfs2_tune *gt);
17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); 17int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent);
18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); 18int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent);
19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); 19int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector);
20void gfs2_lm_unmount(struct gfs2_sbd *sdp);
20 21
21static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) 22static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp)
22{ 23{
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index eaa3b7b2f99e..9ab9fc85ecd0 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -20,7 +20,6 @@
20 20
21#include "gfs2.h" 21#include "gfs2.h"
22#include "incore.h" 22#include "incore.h"
23#include "lm.h"
24#include "sys.h" 23#include "sys.h"
25#include "super.h" 24#include "super.h"
26#include "glock.h" 25#include "glock.h"
@@ -328,15 +327,9 @@ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \
328} \ 327} \
329static struct counters_attr counters_attr_##name = __ATTR_RO(name) 328static struct counters_attr counters_attr_##name = __ATTR_RO(name)
330 329
331COUNTERS_ATTR(glock_count, "%u\n");
332COUNTERS_ATTR(glock_held_count, "%u\n");
333COUNTERS_ATTR(inode_count, "%u\n");
334COUNTERS_ATTR(reclaimed, "%u\n"); 330COUNTERS_ATTR(reclaimed, "%u\n");
335 331
336static struct attribute *counters_attrs[] = { 332static struct attribute *counters_attrs[] = {
337 &counters_attr_glock_count.attr,
338 &counters_attr_glock_held_count.attr,
339 &counters_attr_inode_count.attr,
340 &counters_attr_reclaimed.attr, 333 &counters_attr_reclaimed.attr,
341 NULL, 334 NULL,
342}; 335};
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 73e5d92a657c..f677b8a83f0c 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -146,30 +146,25 @@ void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
146 lops_add(sdp, &bd->bd_le); 146 lops_add(sdp, &bd->bd_le);
147} 147}
148 148
149void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno) 149void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len)
150{ 150{
151 struct gfs2_bufdata *bd; 151 struct gfs2_bufdata *bd, *tmp;
152 int found = 0; 152 struct gfs2_trans *tr = current->journal_info;
153 unsigned int n = len;
153 154
154 gfs2_log_lock(sdp); 155 gfs2_log_lock(sdp);
155 156 list_for_each_entry_safe(bd, tmp, &sdp->sd_log_le_revoke, bd_le.le_list) {
156 list_for_each_entry(bd, &sdp->sd_log_le_revoke, bd_le.le_list) { 157 if ((bd->bd_blkno >= blkno) && (bd->bd_blkno < (blkno + len))) {
157 if (bd->bd_blkno == blkno) {
158 list_del_init(&bd->bd_le.le_list); 158 list_del_init(&bd->bd_le.le_list);
159 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke); 159 gfs2_assert_withdraw(sdp, sdp->sd_log_num_revoke);
160 sdp->sd_log_num_revoke--; 160 sdp->sd_log_num_revoke--;
161 found = 1; 161 kmem_cache_free(gfs2_bufdata_cachep, bd);
162 break; 162 tr->tr_num_revoke_rm++;
163 if (--n == 0)
164 break;
163 } 165 }
164 } 166 }
165
166 gfs2_log_unlock(sdp); 167 gfs2_log_unlock(sdp);
167
168 if (found) {
169 struct gfs2_trans *tr = current->journal_info;
170 kmem_cache_free(gfs2_bufdata_cachep, bd);
171 tr->tr_num_revoke_rm++;
172 }
173} 168}
174 169
175void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd) 170void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd)
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h
index e826f0dab80a..edf9d4bd908e 100644
--- a/fs/gfs2/trans.h
+++ b/fs/gfs2/trans.h
@@ -32,7 +32,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp);
32 32
33void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); 33void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta);
34void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); 34void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
35void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno); 35void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len);
36void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd); 36void gfs2_trans_add_rg(struct gfs2_rgrpd *rgd);
37 37
38#endif /* __TRANS_DOT_H__ */ 38#endif /* __TRANS_DOT_H__ */
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 424a0774eda8..d31e355c61fb 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -19,12 +19,12 @@
19#include "gfs2.h" 19#include "gfs2.h"
20#include "incore.h" 20#include "incore.h"
21#include "glock.h" 21#include "glock.h"
22#include "lm.h"
23#include "util.h" 22#include "util.h"
24 23
25struct kmem_cache *gfs2_glock_cachep __read_mostly; 24struct kmem_cache *gfs2_glock_cachep __read_mostly;
26struct kmem_cache *gfs2_inode_cachep __read_mostly; 25struct kmem_cache *gfs2_inode_cachep __read_mostly;
27struct kmem_cache *gfs2_bufdata_cachep __read_mostly; 26struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
27struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
28 28
29void gfs2_assert_i(struct gfs2_sbd *sdp) 29void gfs2_assert_i(struct gfs2_sbd *sdp)
30{ 30{
@@ -32,6 +32,28 @@ void gfs2_assert_i(struct gfs2_sbd *sdp)
32 sdp->sd_fsname); 32 sdp->sd_fsname);
33} 33}
34 34
35int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...)
36{
37 va_list args;
38
39 if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags))
40 return 0;
41
42 va_start(args, fmt);
43 vprintk(fmt, args);
44 va_end(args);
45
46 fs_err(sdp, "about to withdraw this file system\n");
47 BUG_ON(sdp->sd_args.ar_debug);
48
49 fs_err(sdp, "telling LM to withdraw\n");
50 gfs2_withdraw_lockproto(&sdp->sd_lockstruct);
51 fs_err(sdp, "withdrawn\n");
52 dump_stack();
53
54 return -1;
55}
56
35/** 57/**
36 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false 58 * gfs2_assert_withdraw_i - Cause the machine to withdraw if @assertion is false
37 * Returns: -1 if this call withdrew the machine, 59 * Returns: -1 if this call withdrew the machine,
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 28938a46cf47..509c5d60bd80 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -147,6 +147,7 @@ gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__);
147extern struct kmem_cache *gfs2_glock_cachep; 147extern struct kmem_cache *gfs2_glock_cachep;
148extern struct kmem_cache *gfs2_inode_cachep; 148extern struct kmem_cache *gfs2_inode_cachep;
149extern struct kmem_cache *gfs2_bufdata_cachep; 149extern struct kmem_cache *gfs2_bufdata_cachep;
150extern struct kmem_cache *gfs2_rgrpd_cachep;
150 151
151static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, 152static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
152 unsigned int *p) 153 unsigned int *p)
@@ -163,6 +164,7 @@ gfs2_tune_get_i(&(sdp)->sd_tune, &(sdp)->sd_tune.field)
163 164
164void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap, 165void gfs2_icbit_munge(struct gfs2_sbd *sdp, unsigned char **bitmap,
165 unsigned int bit, int new_value); 166 unsigned int bit, int new_value);
167int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...);
166 168
167#endif /* __UTIL_DOT_H__ */ 169#endif /* __UTIL_DOT_H__ */
168 170
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index b60c0affbec5..f457d2ca51ab 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -14,6 +14,7 @@
14 14
15#include <linux/capability.h> 15#include <linux/capability.h>
16#include <linux/fs.h> 16#include <linux/fs.h>
17#include <linux/mount.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18#include <linux/xattr.h> 19#include <linux/xattr.h>
19#include <asm/uaccess.h> 20#include <asm/uaccess.h>
@@ -35,25 +36,32 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
35 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */ 36 flags |= FS_NODUMP_FL; /* EXT2_NODUMP_FL */
36 return put_user(flags, (int __user *)arg); 37 return put_user(flags, (int __user *)arg);
37 case HFSPLUS_IOC_EXT2_SETFLAGS: { 38 case HFSPLUS_IOC_EXT2_SETFLAGS: {
38 if (IS_RDONLY(inode)) 39 int err = 0;
39 return -EROFS; 40 err = mnt_want_write(filp->f_path.mnt);
40 41 if (err)
41 if (!is_owner_or_cap(inode)) 42 return err;
42 return -EACCES; 43
43 44 if (!is_owner_or_cap(inode)) {
44 if (get_user(flags, (int __user *)arg)) 45 err = -EACCES;
45 return -EFAULT; 46 goto setflags_out;
46 47 }
48 if (get_user(flags, (int __user *)arg)) {
49 err = -EFAULT;
50 goto setflags_out;
51 }
47 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) || 52 if (flags & (FS_IMMUTABLE_FL|FS_APPEND_FL) ||
48 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) { 53 HFSPLUS_I(inode).rootflags & (HFSPLUS_FLG_IMMUTABLE|HFSPLUS_FLG_APPEND)) {
49 if (!capable(CAP_LINUX_IMMUTABLE)) 54 if (!capable(CAP_LINUX_IMMUTABLE)) {
50 return -EPERM; 55 err = -EPERM;
56 goto setflags_out;
57 }
51 } 58 }
52 59
53 /* don't silently ignore unsupported ext2 flags */ 60 /* don't silently ignore unsupported ext2 flags */
54 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) 61 if (flags & ~(FS_IMMUTABLE_FL|FS_APPEND_FL|FS_NODUMP_FL)) {
55 return -EOPNOTSUPP; 62 err = -EOPNOTSUPP;
56 63 goto setflags_out;
64 }
57 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */ 65 if (flags & FS_IMMUTABLE_FL) { /* EXT2_IMMUTABLE_FL */
58 inode->i_flags |= S_IMMUTABLE; 66 inode->i_flags |= S_IMMUTABLE;
59 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE; 67 HFSPLUS_I(inode).rootflags |= HFSPLUS_FLG_IMMUTABLE;
@@ -75,7 +83,9 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
75 83
76 inode->i_ctime = CURRENT_TIME_SEC; 84 inode->i_ctime = CURRENT_TIME_SEC;
77 mark_inode_dirty(inode); 85 mark_inode_dirty(inode);
78 return 0; 86setflags_out:
87 mnt_drop_write(filp->f_path.mnt);
88 return err;
79 } 89 }
80 default: 90 default:
81 return -ENOTTY; 91 return -ENOTTY;
diff --git a/fs/inode.c b/fs/inode.c
index 53245ffcf93d..27ee1af50d02 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1199,42 +1199,37 @@ void touch_atime(struct vfsmount *mnt, struct dentry *dentry)
1199 struct inode *inode = dentry->d_inode; 1199 struct inode *inode = dentry->d_inode;
1200 struct timespec now; 1200 struct timespec now;
1201 1201
1202 if (inode->i_flags & S_NOATIME) 1202 if (mnt_want_write(mnt))
1203 return; 1203 return;
1204 if (inode->i_flags & S_NOATIME)
1205 goto out;
1204 if (IS_NOATIME(inode)) 1206 if (IS_NOATIME(inode))
1205 return; 1207 goto out;
1206 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) 1208 if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))
1207 return; 1209 goto out;
1208 1210
1209 /* 1211 if (mnt->mnt_flags & MNT_NOATIME)
1210 * We may have a NULL vfsmount when coming from NFSD 1212 goto out;
1211 */ 1213 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))
1212 if (mnt) { 1214 goto out;
1213 if (mnt->mnt_flags & MNT_NOATIME) 1215 if (mnt->mnt_flags & MNT_RELATIME) {
1214 return; 1216 /*
1215 if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) 1217 * With relative atime, only update atime if the previous
1216 return; 1218 * atime is earlier than either the ctime or mtime.
1217 1219 */
1218 if (mnt->mnt_flags & MNT_RELATIME) { 1220 if (timespec_compare(&inode->i_mtime, &inode->i_atime) < 0 &&
1219 /* 1221 timespec_compare(&inode->i_ctime, &inode->i_atime) < 0)
1220 * With relative atime, only update atime if the 1222 goto out;
1221 * previous atime is earlier than either the ctime or
1222 * mtime.
1223 */
1224 if (timespec_compare(&inode->i_mtime,
1225 &inode->i_atime) < 0 &&
1226 timespec_compare(&inode->i_ctime,
1227 &inode->i_atime) < 0)
1228 return;
1229 }
1230 } 1223 }
1231 1224
1232 now = current_fs_time(inode->i_sb); 1225 now = current_fs_time(inode->i_sb);
1233 if (timespec_equal(&inode->i_atime, &now)) 1226 if (timespec_equal(&inode->i_atime, &now))
1234 return; 1227 goto out;
1235 1228
1236 inode->i_atime = now; 1229 inode->i_atime = now;
1237 mark_inode_dirty_sync(inode); 1230 mark_inode_dirty_sync(inode);
1231out:
1232 mnt_drop_write(mnt);
1238} 1233}
1239EXPORT_SYMBOL(touch_atime); 1234EXPORT_SYMBOL(touch_atime);
1240 1235
@@ -1255,10 +1250,13 @@ void file_update_time(struct file *file)
1255 struct inode *inode = file->f_path.dentry->d_inode; 1250 struct inode *inode = file->f_path.dentry->d_inode;
1256 struct timespec now; 1251 struct timespec now;
1257 int sync_it = 0; 1252 int sync_it = 0;
1253 int err;
1258 1254
1259 if (IS_NOCMTIME(inode)) 1255 if (IS_NOCMTIME(inode))
1260 return; 1256 return;
1261 if (IS_RDONLY(inode)) 1257
1258 err = mnt_want_write(file->f_path.mnt);
1259 if (err)
1262 return; 1260 return;
1263 1261
1264 now = current_fs_time(inode->i_sb); 1262 now = current_fs_time(inode->i_sb);
@@ -1279,6 +1277,7 @@ void file_update_time(struct file *file)
1279 1277
1280 if (sync_it) 1278 if (sync_it)
1281 mark_inode_dirty_sync(inode); 1279 mark_inode_dirty_sync(inode);
1280 mnt_drop_write(file->f_path.mnt);
1282} 1281}
1283 1282
1284EXPORT_SYMBOL(file_update_time); 1283EXPORT_SYMBOL(file_update_time);
diff --git a/fs/internal.h b/fs/internal.h
index 392e8ccd6fc4..80aa9a023372 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,3 +43,14 @@ extern void __init chrdev_init(void);
43 * namespace.c 43 * namespace.c
44 */ 44 */
45extern int copy_mount_options(const void __user *, unsigned long *); 45extern int copy_mount_options(const void __user *, unsigned long *);
46
47extern void free_vfsmnt(struct vfsmount *);
48extern struct vfsmount *alloc_vfsmnt(const char *);
49extern struct vfsmount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
50extern void mnt_set_mountpoint(struct vfsmount *, struct dentry *,
51 struct vfsmount *);
52extern void release_mounts(struct list_head *);
53extern void umount_tree(struct vfsmount *, int, struct list_head *);
54extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
55
56extern void __init mnt_init(void);
diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h
index 0b78fdc9773b..a841f4973a74 100644
--- a/fs/jffs2/jffs2_fs_i.h
+++ b/fs/jffs2/jffs2_fs_i.h
@@ -15,7 +15,7 @@
15#include <linux/version.h> 15#include <linux/version.h>
16#include <linux/rbtree.h> 16#include <linux/rbtree.h>
17#include <linux/posix_acl.h> 17#include <linux/posix_acl.h>
18#include <asm/semaphore.h> 18#include <linux/semaphore.h>
19 19
20struct jffs2_inode_info { 20struct jffs2_inode_info {
21 /* We need an internal mutex similar to inode->i_mutex. 21 /* We need an internal mutex similar to inode->i_mutex.
diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h
index 3a2197f3c812..18fca2b9e531 100644
--- a/fs/jffs2/jffs2_fs_sb.h
+++ b/fs/jffs2/jffs2_fs_sb.h
@@ -16,7 +16,7 @@
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/workqueue.h> 17#include <linux/workqueue.h>
18#include <linux/completion.h> 18#include <linux/completion.h>
19#include <asm/semaphore.h> 19#include <linux/semaphore.h>
20#include <linux/timer.h> 20#include <linux/timer.h>
21#include <linux/wait.h> 21#include <linux/wait.h>
22#include <linux/list.h> 22#include <linux/list.h>
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index a1f8e375ad21..afe222bf300f 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -8,6 +8,7 @@
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/ctype.h> 9#include <linux/ctype.h>
10#include <linux/capability.h> 10#include <linux/capability.h>
11#include <linux/mount.h>
11#include <linux/time.h> 12#include <linux/time.h>
12#include <linux/sched.h> 13#include <linux/sched.h>
13#include <asm/current.h> 14#include <asm/current.h>
@@ -65,23 +66,30 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
65 return put_user(flags, (int __user *) arg); 66 return put_user(flags, (int __user *) arg);
66 case JFS_IOC_SETFLAGS: { 67 case JFS_IOC_SETFLAGS: {
67 unsigned int oldflags; 68 unsigned int oldflags;
69 int err;
68 70
69 if (IS_RDONLY(inode)) 71 err = mnt_want_write(filp->f_path.mnt);
70 return -EROFS; 72 if (err)
73 return err;
71 74
72 if (!is_owner_or_cap(inode)) 75 if (!is_owner_or_cap(inode)) {
73 return -EACCES; 76 err = -EACCES;
74 77 goto setflags_out;
75 if (get_user(flags, (int __user *) arg)) 78 }
76 return -EFAULT; 79 if (get_user(flags, (int __user *) arg)) {
80 err = -EFAULT;
81 goto setflags_out;
82 }
77 83
78 flags = jfs_map_ext2(flags, 1); 84 flags = jfs_map_ext2(flags, 1);
79 if (!S_ISDIR(inode->i_mode)) 85 if (!S_ISDIR(inode->i_mode))
80 flags &= ~JFS_DIRSYNC_FL; 86 flags &= ~JFS_DIRSYNC_FL;
81 87
82 /* Is it quota file? Do not allow user to mess with it */ 88 /* Is it quota file? Do not allow user to mess with it */
83 if (IS_NOQUOTA(inode)) 89 if (IS_NOQUOTA(inode)) {
84 return -EPERM; 90 err = -EPERM;
91 goto setflags_out;
92 }
85 93
86 /* Lock against other parallel changes of flags */ 94 /* Lock against other parallel changes of flags */
87 mutex_lock(&inode->i_mutex); 95 mutex_lock(&inode->i_mutex);
@@ -98,7 +106,8 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
98 (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { 106 (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
99 if (!capable(CAP_LINUX_IMMUTABLE)) { 107 if (!capable(CAP_LINUX_IMMUTABLE)) {
100 mutex_unlock(&inode->i_mutex); 108 mutex_unlock(&inode->i_mutex);
101 return -EPERM; 109 err = -EPERM;
110 goto setflags_out;
102 } 111 }
103 } 112 }
104 113
@@ -110,7 +119,9 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
110 mutex_unlock(&inode->i_mutex); 119 mutex_unlock(&inode->i_mutex);
111 inode->i_ctime = CURRENT_TIME_SEC; 120 inode->i_ctime = CURRENT_TIME_SEC;
112 mark_inode_dirty(inode); 121 mark_inode_dirty(inode);
113 return 0; 122setflags_out:
123 mnt_drop_write(filp->f_path.mnt);
124 return err;
114 } 125 }
115 default: 126 default:
116 return -ENOTTY; 127 return -ENOTTY;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index e1985066b1c6..2bc7d8aa5740 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -2172,7 +2172,7 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2172 } 2172 }
2173 2173
2174 /* update the free count for this dmap */ 2174 /* update the free count for this dmap */
2175 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); 2175 le32_add_cpu(&dp->nfree, -nblocks);
2176 2176
2177 BMAP_LOCK(bmp); 2177 BMAP_LOCK(bmp);
2178 2178
@@ -2316,7 +2316,7 @@ static int dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2316 2316
2317 /* update the free count for this dmap. 2317 /* update the free count for this dmap.
2318 */ 2318 */
2319 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks); 2319 le32_add_cpu(&dp->nfree, nblocks);
2320 2320
2321 BMAP_LOCK(bmp); 2321 BMAP_LOCK(bmp);
2322 2322
@@ -3226,7 +3226,7 @@ static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3226 } 3226 }
3227 3227
3228 /* update the free count for this dmap */ 3228 /* update the free count for this dmap */
3229 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks); 3229 le32_add_cpu(&dp->nfree, -nblocks);
3230 3230
3231 /* reconstruct summary tree */ 3231 /* reconstruct summary tree */
3232 dbInitDmapTree(dp); 3232 dbInitDmapTree(dp);
@@ -3660,9 +3660,8 @@ static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3660 goto initTree; 3660 goto initTree;
3661 } 3661 }
3662 } else { 3662 } else {
3663 dp->nblocks = 3663 le32_add_cpu(&dp->nblocks, nblocks);
3664 cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks); 3664 le32_add_cpu(&dp->nfree, nblocks);
3665 dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
3666 } 3665 }
3667 3666
3668 /* word number containing start block number */ 3667 /* word number containing start block number */
diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h
index 11e6d471b364..1a6eb41569bc 100644
--- a/fs/jfs/jfs_dmap.h
+++ b/fs/jfs/jfs_dmap.h
@@ -61,7 +61,7 @@
61 * determine the maximum free string for four (lower level) nodes 61 * determine the maximum free string for four (lower level) nodes
62 * of the tree. 62 * of the tree.
63 */ 63 */
64static __inline signed char TREEMAX(signed char *cp) 64static inline signed char TREEMAX(signed char *cp)
65{ 65{
66 signed char tmp1, tmp2; 66 signed char tmp1, tmp2;
67 67
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 9bf29f771737..734ec916beaf 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -1019,8 +1019,7 @@ int diFree(struct inode *ip)
1019 /* update the free inode counts at the iag, ag and 1019 /* update the free inode counts at the iag, ag and
1020 * map level. 1020 * map level.
1021 */ 1021 */
1022 iagp->nfreeinos = 1022 le32_add_cpu(&iagp->nfreeinos, 1);
1023 cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 1);
1024 imap->im_agctl[agno].numfree += 1; 1023 imap->im_agctl[agno].numfree += 1;
1025 atomic_inc(&imap->im_numfree); 1024 atomic_inc(&imap->im_numfree);
1026 1025
@@ -1219,9 +1218,8 @@ int diFree(struct inode *ip)
1219 /* update the number of free inodes and number of free extents 1218 /* update the number of free inodes and number of free extents
1220 * for the iag. 1219 * for the iag.
1221 */ 1220 */
1222 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1221 le32_add_cpu(&iagp->nfreeinos, -(INOSPEREXT - 1));
1223 (INOSPEREXT - 1)); 1222 le32_add_cpu(&iagp->nfreeexts, 1);
1224 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) + 1);
1225 1223
1226 /* update the number of free inodes and backed inodes 1224 /* update the number of free inodes and backed inodes
1227 * at the ag and inode map level. 1225 * at the ag and inode map level.
@@ -2124,7 +2122,7 @@ static int diAllocBit(struct inomap * imap, struct iag * iagp, int ino)
2124 /* update the free inode count at the iag, ag, inode 2122 /* update the free inode count at the iag, ag, inode
2125 * map levels. 2123 * map levels.
2126 */ 2124 */
2127 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) - 1); 2125 le32_add_cpu(&iagp->nfreeinos, -1);
2128 imap->im_agctl[agno].numfree -= 1; 2126 imap->im_agctl[agno].numfree -= 1;
2129 atomic_dec(&imap->im_numfree); 2127 atomic_dec(&imap->im_numfree);
2130 2128
@@ -2378,9 +2376,8 @@ static int diNewExt(struct inomap * imap, struct iag * iagp, int extno)
2378 /* update the free inode and free extent counts for the 2376 /* update the free inode and free extent counts for the
2379 * iag. 2377 * iag.
2380 */ 2378 */
2381 iagp->nfreeinos = cpu_to_le32(le32_to_cpu(iagp->nfreeinos) + 2379 le32_add_cpu(&iagp->nfreeinos, (INOSPEREXT - 1));
2382 (INOSPEREXT - 1)); 2380 le32_add_cpu(&iagp->nfreeexts, -1);
2383 iagp->nfreeexts = cpu_to_le32(le32_to_cpu(iagp->nfreeexts) - 1);
2384 2381
2385 /* update the free and backed inode counts for the ag. 2382 /* update the free and backed inode counts for the ag.
2386 */ 2383 */
diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index a000aaa75136..5a61ebf2cbcc 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -905,8 +905,7 @@ int xtInsert(tid_t tid, /* transaction id */
905 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); 905 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
906 906
907 /* advance next available entry index */ 907 /* advance next available entry index */
908 p->header.nextindex = 908 le16_add_cpu(&p->header.nextindex, 1);
909 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
910 909
911 /* Don't log it if there are no links to the file */ 910 /* Don't log it if there are no links to the file */
912 if (!test_cflag(COMMIT_Nolink, ip)) { 911 if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -997,8 +996,7 @@ xtSplitUp(tid_t tid,
997 split->addr); 996 split->addr);
998 997
999 /* advance next available entry index */ 998 /* advance next available entry index */
1000 sp->header.nextindex = 999 le16_add_cpu(&sp->header.nextindex, 1);
1001 cpu_to_le16(le16_to_cpu(sp->header.nextindex) + 1);
1002 1000
1003 /* Don't log it if there are no links to the file */ 1001 /* Don't log it if there are no links to the file */
1004 if (!test_cflag(COMMIT_Nolink, ip)) { 1002 if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1167,9 +1165,7 @@ xtSplitUp(tid_t tid,
1167 JFS_SBI(ip->i_sb)->nbperpage, rcbn); 1165 JFS_SBI(ip->i_sb)->nbperpage, rcbn);
1168 1166
1169 /* advance next available entry index. */ 1167 /* advance next available entry index. */
1170 sp->header.nextindex = 1168 le16_add_cpu(&sp->header.nextindex, 1);
1171 cpu_to_le16(le16_to_cpu(sp->header.nextindex) +
1172 1);
1173 1169
1174 /* Don't log it if there are no links to the file */ 1170 /* Don't log it if there are no links to the file */
1175 if (!test_cflag(COMMIT_Nolink, ip)) { 1171 if (!test_cflag(COMMIT_Nolink, ip)) {
@@ -1738,8 +1734,7 @@ int xtExtend(tid_t tid, /* transaction id */
1738 XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr); 1734 XT_PUTENTRY(xad, XAD_NEW, xoff, len, xaddr);
1739 1735
1740 /* advance next available entry index */ 1736 /* advance next available entry index */
1741 p->header.nextindex = 1737 le16_add_cpu(&p->header.nextindex, 1);
1742 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1743 } 1738 }
1744 1739
1745 /* get back old entry */ 1740 /* get back old entry */
@@ -1905,8 +1900,7 @@ printf("xtTailgate: xoff:0x%lx xlen:0x%x xaddr:0x%lx\n",
1905 XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr); 1900 XT_PUTENTRY(xad, XAD_NEW, xoff, xlen, xaddr);
1906 1901
1907 /* advance next available entry index */ 1902 /* advance next available entry index */
1908 p->header.nextindex = 1903 le16_add_cpu(&p->header.nextindex, 1);
1909 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
1910 } 1904 }
1911 1905
1912 /* get back old XAD */ 1906 /* get back old XAD */
@@ -2567,8 +2561,7 @@ int xtAppend(tid_t tid, /* transaction id */
2567 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr); 2561 XT_PUTENTRY(xad, xflag, xoff, xlen, xaddr);
2568 2562
2569 /* advance next available entry index */ 2563 /* advance next available entry index */
2570 p->header.nextindex = 2564 le16_add_cpu(&p->header.nextindex, 1);
2571 cpu_to_le16(le16_to_cpu(p->header.nextindex) + 1);
2572 2565
2573 xtlck->lwm.offset = 2566 xtlck->lwm.offset =
2574 (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index; 2567 (xtlck->lwm.offset) ? min(index,(int) xtlck->lwm.offset) : index;
@@ -2631,8 +2624,7 @@ int xtDelete(tid_t tid, struct inode *ip, s64 xoff, s32 xlen, int flag)
2631 * delete the entry from the leaf page 2624 * delete the entry from the leaf page
2632 */ 2625 */
2633 nextindex = le16_to_cpu(p->header.nextindex); 2626 nextindex = le16_to_cpu(p->header.nextindex);
2634 p->header.nextindex = 2627 le16_add_cpu(&p->header.nextindex, -1);
2635 cpu_to_le16(le16_to_cpu(p->header.nextindex) - 1);
2636 2628
2637 /* 2629 /*
2638 * if the leaf page bocome empty, free the page 2630 * if the leaf page bocome empty, free the page
@@ -2795,9 +2787,7 @@ xtDeleteUp(tid_t tid, struct inode *ip,
2795 (nextindex - index - 2787 (nextindex - index -
2796 1) << L2XTSLOTSIZE); 2788 1) << L2XTSLOTSIZE);
2797 2789
2798 p->header.nextindex = 2790 le16_add_cpu(&p->header.nextindex, -1);
2799 cpu_to_le16(le16_to_cpu(p->header.nextindex) -
2800 1);
2801 jfs_info("xtDeleteUp(entry): 0x%lx[%d]", 2791 jfs_info("xtDeleteUp(entry): 0x%lx[%d]",
2802 (ulong) parent->bn, index); 2792 (ulong) parent->bn, index);
2803 } 2793 }
diff --git a/fs/locks.c b/fs/locks.c
index 43c0af21a0c5..592faadbcec1 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -127,7 +127,6 @@
127#include <linux/rcupdate.h> 127#include <linux/rcupdate.h>
128#include <linux/pid_namespace.h> 128#include <linux/pid_namespace.h>
129 129
130#include <asm/semaphore.h>
131#include <asm/uaccess.h> 130#include <asm/uaccess.h>
132 131
133#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 132#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
diff --git a/fs/namei.c b/fs/namei.c
index 8cf9bb9c2fc0..e179f71bfcb0 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1623,8 +1623,7 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1623 return -EACCES; 1623 return -EACCES;
1624 1624
1625 flag &= ~O_TRUNC; 1625 flag &= ~O_TRUNC;
1626 } else if (IS_RDONLY(inode) && (acc_mode & MAY_WRITE)) 1626 }
1627 return -EROFS;
1628 1627
1629 error = vfs_permission(nd, acc_mode); 1628 error = vfs_permission(nd, acc_mode);
1630 if (error) 1629 if (error)
@@ -1677,7 +1676,12 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
1677 return 0; 1676 return 0;
1678} 1677}
1679 1678
1680static int open_namei_create(struct nameidata *nd, struct path *path, 1679/*
1680 * Be careful about ever adding any more callers of this
1681 * function. Its flags must be in the namei format, not
1682 * what get passed to sys_open().
1683 */
1684static int __open_namei_create(struct nameidata *nd, struct path *path,
1681 int flag, int mode) 1685 int flag, int mode)
1682{ 1686{
1683 int error; 1687 int error;
@@ -1696,26 +1700,56 @@ static int open_namei_create(struct nameidata *nd, struct path *path,
1696} 1700}
1697 1701
1698/* 1702/*
1699 * open_namei() 1703 * Note that while the flag value (low two bits) for sys_open means:
1704 * 00 - read-only
1705 * 01 - write-only
1706 * 10 - read-write
1707 * 11 - special
1708 * it is changed into
1709 * 00 - no permissions needed
1710 * 01 - read-permission
1711 * 10 - write-permission
1712 * 11 - read-write
1713 * for the internal routines (ie open_namei()/follow_link() etc)
1714 * This is more logical, and also allows the 00 "no perm needed"
1715 * to be used for symlinks (where the permissions are checked
1716 * later).
1700 * 1717 *
1701 * namei for open - this is in fact almost the whole open-routine. 1718*/
1702 * 1719static inline int open_to_namei_flags(int flag)
1703 * Note that the low bits of "flag" aren't the same as in the open 1720{
1704 * system call - they are 00 - no permissions needed 1721 if ((flag+1) & O_ACCMODE)
1705 * 01 - read permission needed 1722 flag++;
1706 * 10 - write permission needed 1723 return flag;
1707 * 11 - read/write permissions needed 1724}
1708 * which is a lot more logical, and also allows the "no perm" needed 1725
1709 * for symlinks (where the permissions are checked later). 1726static int open_will_write_to_fs(int flag, struct inode *inode)
1710 * SMP-safe 1727{
1728 /*
1729 * We'll never write to the fs underlying
1730 * a device file.
1731 */
1732 if (special_file(inode->i_mode))
1733 return 0;
1734 return (flag & O_TRUNC);
1735}
1736
1737/*
1738 * Note that the low bits of the passed in "open_flag"
1739 * are not the same as in the local variable "flag". See
1740 * open_to_namei_flags() for more details.
1711 */ 1741 */
1712int open_namei(int dfd, const char *pathname, int flag, 1742struct file *do_filp_open(int dfd, const char *pathname,
1713 int mode, struct nameidata *nd) 1743 int open_flag, int mode)
1714{ 1744{
1745 struct file *filp;
1746 struct nameidata nd;
1715 int acc_mode, error; 1747 int acc_mode, error;
1716 struct path path; 1748 struct path path;
1717 struct dentry *dir; 1749 struct dentry *dir;
1718 int count = 0; 1750 int count = 0;
1751 int will_write;
1752 int flag = open_to_namei_flags(open_flag);
1719 1753
1720 acc_mode = ACC_MODE(flag); 1754 acc_mode = ACC_MODE(flag);
1721 1755
@@ -1733,18 +1767,19 @@ int open_namei(int dfd, const char *pathname, int flag,
1733 */ 1767 */
1734 if (!(flag & O_CREAT)) { 1768 if (!(flag & O_CREAT)) {
1735 error = path_lookup_open(dfd, pathname, lookup_flags(flag), 1769 error = path_lookup_open(dfd, pathname, lookup_flags(flag),
1736 nd, flag); 1770 &nd, flag);
1737 if (error) 1771 if (error)
1738 return error; 1772 return ERR_PTR(error);
1739 goto ok; 1773 goto ok;
1740 } 1774 }
1741 1775
1742 /* 1776 /*
1743 * Create - we need to know the parent. 1777 * Create - we need to know the parent.
1744 */ 1778 */
1745 error = path_lookup_create(dfd,pathname,LOOKUP_PARENT,nd,flag,mode); 1779 error = path_lookup_create(dfd, pathname, LOOKUP_PARENT,
1780 &nd, flag, mode);
1746 if (error) 1781 if (error)
1747 return error; 1782 return ERR_PTR(error);
1748 1783
1749 /* 1784 /*
1750 * We have the parent and last component. First of all, check 1785 * We have the parent and last component. First of all, check
@@ -1752,14 +1787,14 @@ int open_namei(int dfd, const char *pathname, int flag,
1752 * will not do. 1787 * will not do.
1753 */ 1788 */
1754 error = -EISDIR; 1789 error = -EISDIR;
1755 if (nd->last_type != LAST_NORM || nd->last.name[nd->last.len]) 1790 if (nd.last_type != LAST_NORM || nd.last.name[nd.last.len])
1756 goto exit; 1791 goto exit;
1757 1792
1758 dir = nd->path.dentry; 1793 dir = nd.path.dentry;
1759 nd->flags &= ~LOOKUP_PARENT; 1794 nd.flags &= ~LOOKUP_PARENT;
1760 mutex_lock(&dir->d_inode->i_mutex); 1795 mutex_lock(&dir->d_inode->i_mutex);
1761 path.dentry = lookup_hash(nd); 1796 path.dentry = lookup_hash(&nd);
1762 path.mnt = nd->path.mnt; 1797 path.mnt = nd.path.mnt;
1763 1798
1764do_last: 1799do_last:
1765 error = PTR_ERR(path.dentry); 1800 error = PTR_ERR(path.dentry);
@@ -1768,18 +1803,31 @@ do_last:
1768 goto exit; 1803 goto exit;
1769 } 1804 }
1770 1805
1771 if (IS_ERR(nd->intent.open.file)) { 1806 if (IS_ERR(nd.intent.open.file)) {
1772 mutex_unlock(&dir->d_inode->i_mutex); 1807 error = PTR_ERR(nd.intent.open.file);
1773 error = PTR_ERR(nd->intent.open.file); 1808 goto exit_mutex_unlock;
1774 goto exit_dput;
1775 } 1809 }
1776 1810
1777 /* Negative dentry, just create the file */ 1811 /* Negative dentry, just create the file */
1778 if (!path.dentry->d_inode) { 1812 if (!path.dentry->d_inode) {
1779 error = open_namei_create(nd, &path, flag, mode); 1813 /*
1814 * This write is needed to ensure that a
1815 * ro->rw transition does not occur between
1816 * the time when the file is created and when
1817 * a permanent write count is taken through
1818 * the 'struct file' in nameidata_to_filp().
1819 */
1820 error = mnt_want_write(nd.path.mnt);
1780 if (error) 1821 if (error)
1822 goto exit_mutex_unlock;
1823 error = __open_namei_create(&nd, &path, flag, mode);
1824 if (error) {
1825 mnt_drop_write(nd.path.mnt);
1781 goto exit; 1826 goto exit;
1782 return 0; 1827 }
1828 filp = nameidata_to_filp(&nd, open_flag);
1829 mnt_drop_write(nd.path.mnt);
1830 return filp;
1783 } 1831 }
1784 1832
1785 /* 1833 /*
@@ -1804,23 +1852,52 @@ do_last:
1804 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) 1852 if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
1805 goto do_link; 1853 goto do_link;
1806 1854
1807 path_to_nameidata(&path, nd); 1855 path_to_nameidata(&path, &nd);
1808 error = -EISDIR; 1856 error = -EISDIR;
1809 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode)) 1857 if (path.dentry->d_inode && S_ISDIR(path.dentry->d_inode->i_mode))
1810 goto exit; 1858 goto exit;
1811ok: 1859ok:
1812 error = may_open(nd, acc_mode, flag); 1860 /*
1813 if (error) 1861 * Consider:
1862 * 1. may_open() truncates a file
1863 * 2. a rw->ro mount transition occurs
1864 * 3. nameidata_to_filp() fails due to
1865 * the ro mount.
1866 * That would be inconsistent, and should
1867 * be avoided. Taking this mnt write here
1868 * ensures that (2) can not occur.
1869 */
1870 will_write = open_will_write_to_fs(flag, nd.path.dentry->d_inode);
1871 if (will_write) {
1872 error = mnt_want_write(nd.path.mnt);
1873 if (error)
1874 goto exit;
1875 }
1876 error = may_open(&nd, acc_mode, flag);
1877 if (error) {
1878 if (will_write)
1879 mnt_drop_write(nd.path.mnt);
1814 goto exit; 1880 goto exit;
1815 return 0; 1881 }
1882 filp = nameidata_to_filp(&nd, open_flag);
1883 /*
1884 * It is now safe to drop the mnt write
1885 * because the filp has had a write taken
1886 * on its behalf.
1887 */
1888 if (will_write)
1889 mnt_drop_write(nd.path.mnt);
1890 return filp;
1816 1891
1892exit_mutex_unlock:
1893 mutex_unlock(&dir->d_inode->i_mutex);
1817exit_dput: 1894exit_dput:
1818 path_put_conditional(&path, nd); 1895 path_put_conditional(&path, &nd);
1819exit: 1896exit:
1820 if (!IS_ERR(nd->intent.open.file)) 1897 if (!IS_ERR(nd.intent.open.file))
1821 release_open_intent(nd); 1898 release_open_intent(&nd);
1822 path_put(&nd->path); 1899 path_put(&nd.path);
1823 return error; 1900 return ERR_PTR(error);
1824 1901
1825do_link: 1902do_link:
1826 error = -ELOOP; 1903 error = -ELOOP;
@@ -1836,43 +1913,60 @@ do_link:
1836 * stored in nd->last.name and we will have to putname() it when we 1913 * stored in nd->last.name and we will have to putname() it when we
1837 * are done. Procfs-like symlinks just set LAST_BIND. 1914 * are done. Procfs-like symlinks just set LAST_BIND.
1838 */ 1915 */
1839 nd->flags |= LOOKUP_PARENT; 1916 nd.flags |= LOOKUP_PARENT;
1840 error = security_inode_follow_link(path.dentry, nd); 1917 error = security_inode_follow_link(path.dentry, &nd);
1841 if (error) 1918 if (error)
1842 goto exit_dput; 1919 goto exit_dput;
1843 error = __do_follow_link(&path, nd); 1920 error = __do_follow_link(&path, &nd);
1844 if (error) { 1921 if (error) {
1845 /* Does someone understand code flow here? Or it is only 1922 /* Does someone understand code flow here? Or it is only
1846 * me so stupid? Anathema to whoever designed this non-sense 1923 * me so stupid? Anathema to whoever designed this non-sense
1847 * with "intent.open". 1924 * with "intent.open".
1848 */ 1925 */
1849 release_open_intent(nd); 1926 release_open_intent(&nd);
1850 return error; 1927 return ERR_PTR(error);
1851 } 1928 }
1852 nd->flags &= ~LOOKUP_PARENT; 1929 nd.flags &= ~LOOKUP_PARENT;
1853 if (nd->last_type == LAST_BIND) 1930 if (nd.last_type == LAST_BIND)
1854 goto ok; 1931 goto ok;
1855 error = -EISDIR; 1932 error = -EISDIR;
1856 if (nd->last_type != LAST_NORM) 1933 if (nd.last_type != LAST_NORM)
1857 goto exit; 1934 goto exit;
1858 if (nd->last.name[nd->last.len]) { 1935 if (nd.last.name[nd.last.len]) {
1859 __putname(nd->last.name); 1936 __putname(nd.last.name);
1860 goto exit; 1937 goto exit;
1861 } 1938 }
1862 error = -ELOOP; 1939 error = -ELOOP;
1863 if (count++==32) { 1940 if (count++==32) {
1864 __putname(nd->last.name); 1941 __putname(nd.last.name);
1865 goto exit; 1942 goto exit;
1866 } 1943 }
1867 dir = nd->path.dentry; 1944 dir = nd.path.dentry;
1868 mutex_lock(&dir->d_inode->i_mutex); 1945 mutex_lock(&dir->d_inode->i_mutex);
1869 path.dentry = lookup_hash(nd); 1946 path.dentry = lookup_hash(&nd);
1870 path.mnt = nd->path.mnt; 1947 path.mnt = nd.path.mnt;
1871 __putname(nd->last.name); 1948 __putname(nd.last.name);
1872 goto do_last; 1949 goto do_last;
1873} 1950}
1874 1951
1875/** 1952/**
1953 * filp_open - open file and return file pointer
1954 *
1955 * @filename: path to open
1956 * @flags: open flags as per the open(2) second argument
1957 * @mode: mode for the new file if O_CREAT is set, else ignored
1958 *
1959 * This is the helper to open a file from kernelspace if you really
1960 * have to. But in generally you should not do this, so please move
1961 * along, nothing to see here..
1962 */
1963struct file *filp_open(const char *filename, int flags, int mode)
1964{
1965 return do_filp_open(AT_FDCWD, filename, flags, mode);
1966}
1967EXPORT_SYMBOL(filp_open);
1968
1969/**
1876 * lookup_create - lookup a dentry, creating it if it doesn't exist 1970 * lookup_create - lookup a dentry, creating it if it doesn't exist
1877 * @nd: nameidata info 1971 * @nd: nameidata info
1878 * @is_dir: directory flag 1972 * @is_dir: directory flag
@@ -1945,6 +2039,23 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
1945 return error; 2039 return error;
1946} 2040}
1947 2041
2042static int may_mknod(mode_t mode)
2043{
2044 switch (mode & S_IFMT) {
2045 case S_IFREG:
2046 case S_IFCHR:
2047 case S_IFBLK:
2048 case S_IFIFO:
2049 case S_IFSOCK:
2050 case 0: /* zero mode translates to S_IFREG */
2051 return 0;
2052 case S_IFDIR:
2053 return -EPERM;
2054 default:
2055 return -EINVAL;
2056 }
2057}
2058
1948asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode, 2059asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1949 unsigned dev) 2060 unsigned dev)
1950{ 2061{
@@ -1963,12 +2074,19 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1963 if (error) 2074 if (error)
1964 goto out; 2075 goto out;
1965 dentry = lookup_create(&nd, 0); 2076 dentry = lookup_create(&nd, 0);
1966 error = PTR_ERR(dentry); 2077 if (IS_ERR(dentry)) {
1967 2078 error = PTR_ERR(dentry);
2079 goto out_unlock;
2080 }
1968 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2081 if (!IS_POSIXACL(nd.path.dentry->d_inode))
1969 mode &= ~current->fs->umask; 2082 mode &= ~current->fs->umask;
1970 if (!IS_ERR(dentry)) { 2083 error = may_mknod(mode);
1971 switch (mode & S_IFMT) { 2084 if (error)
2085 goto out_dput;
2086 error = mnt_want_write(nd.path.mnt);
2087 if (error)
2088 goto out_dput;
2089 switch (mode & S_IFMT) {
1972 case 0: case S_IFREG: 2090 case 0: case S_IFREG:
1973 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd); 2091 error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
1974 break; 2092 break;
@@ -1979,14 +2097,11 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
1979 case S_IFIFO: case S_IFSOCK: 2097 case S_IFIFO: case S_IFSOCK:
1980 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0); 2098 error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
1981 break; 2099 break;
1982 case S_IFDIR:
1983 error = -EPERM;
1984 break;
1985 default:
1986 error = -EINVAL;
1987 }
1988 dput(dentry);
1989 } 2100 }
2101 mnt_drop_write(nd.path.mnt);
2102out_dput:
2103 dput(dentry);
2104out_unlock:
1990 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2105 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
1991 path_put(&nd.path); 2106 path_put(&nd.path);
1992out: 2107out:
@@ -2044,7 +2159,12 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
2044 2159
2045 if (!IS_POSIXACL(nd.path.dentry->d_inode)) 2160 if (!IS_POSIXACL(nd.path.dentry->d_inode))
2046 mode &= ~current->fs->umask; 2161 mode &= ~current->fs->umask;
2162 error = mnt_want_write(nd.path.mnt);
2163 if (error)
2164 goto out_dput;
2047 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode); 2165 error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
2166 mnt_drop_write(nd.path.mnt);
2167out_dput:
2048 dput(dentry); 2168 dput(dentry);
2049out_unlock: 2169out_unlock:
2050 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2170 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2151,7 +2271,12 @@ static long do_rmdir(int dfd, const char __user *pathname)
2151 error = PTR_ERR(dentry); 2271 error = PTR_ERR(dentry);
2152 if (IS_ERR(dentry)) 2272 if (IS_ERR(dentry))
2153 goto exit2; 2273 goto exit2;
2274 error = mnt_want_write(nd.path.mnt);
2275 if (error)
2276 goto exit3;
2154 error = vfs_rmdir(nd.path.dentry->d_inode, dentry); 2277 error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
2278 mnt_drop_write(nd.path.mnt);
2279exit3:
2155 dput(dentry); 2280 dput(dentry);
2156exit2: 2281exit2:
2157 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2282 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2232,7 +2357,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
2232 inode = dentry->d_inode; 2357 inode = dentry->d_inode;
2233 if (inode) 2358 if (inode)
2234 atomic_inc(&inode->i_count); 2359 atomic_inc(&inode->i_count);
2360 error = mnt_want_write(nd.path.mnt);
2361 if (error)
2362 goto exit2;
2235 error = vfs_unlink(nd.path.dentry->d_inode, dentry); 2363 error = vfs_unlink(nd.path.dentry->d_inode, dentry);
2364 mnt_drop_write(nd.path.mnt);
2236 exit2: 2365 exit2:
2237 dput(dentry); 2366 dput(dentry);
2238 } 2367 }
@@ -2313,7 +2442,12 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
2313 if (IS_ERR(dentry)) 2442 if (IS_ERR(dentry))
2314 goto out_unlock; 2443 goto out_unlock;
2315 2444
2445 error = mnt_want_write(nd.path.mnt);
2446 if (error)
2447 goto out_dput;
2316 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO); 2448 error = vfs_symlink(nd.path.dentry->d_inode, dentry, from, S_IALLUGO);
2449 mnt_drop_write(nd.path.mnt);
2450out_dput:
2317 dput(dentry); 2451 dput(dentry);
2318out_unlock: 2452out_unlock:
2319 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2453 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2408,7 +2542,12 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
2408 error = PTR_ERR(new_dentry); 2542 error = PTR_ERR(new_dentry);
2409 if (IS_ERR(new_dentry)) 2543 if (IS_ERR(new_dentry))
2410 goto out_unlock; 2544 goto out_unlock;
2545 error = mnt_want_write(nd.path.mnt);
2546 if (error)
2547 goto out_dput;
2411 error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry); 2548 error = vfs_link(old_nd.path.dentry, nd.path.dentry->d_inode, new_dentry);
2549 mnt_drop_write(nd.path.mnt);
2550out_dput:
2412 dput(new_dentry); 2551 dput(new_dentry);
2413out_unlock: 2552out_unlock:
2414 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2553 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
@@ -2634,8 +2773,12 @@ static int do_rename(int olddfd, const char *oldname,
2634 if (new_dentry == trap) 2773 if (new_dentry == trap)
2635 goto exit5; 2774 goto exit5;
2636 2775
2776 error = mnt_want_write(oldnd.path.mnt);
2777 if (error)
2778 goto exit5;
2637 error = vfs_rename(old_dir->d_inode, old_dentry, 2779 error = vfs_rename(old_dir->d_inode, old_dentry,
2638 new_dir->d_inode, new_dentry); 2780 new_dir->d_inode, new_dentry);
2781 mnt_drop_write(oldnd.path.mnt);
2639exit5: 2782exit5:
2640 dput(new_dentry); 2783 dput(new_dentry);
2641exit4: 2784exit4:
diff --git a/fs/namespace.c b/fs/namespace.c
index 94f026ec990a..0505fb61aa74 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -17,6 +17,7 @@
17#include <linux/quotaops.h> 17#include <linux/quotaops.h>
18#include <linux/acct.h> 18#include <linux/acct.h>
19#include <linux/capability.h> 19#include <linux/capability.h>
20#include <linux/cpumask.h>
20#include <linux/module.h> 21#include <linux/module.h>
21#include <linux/sysfs.h> 22#include <linux/sysfs.h>
22#include <linux/seq_file.h> 23#include <linux/seq_file.h>
@@ -26,6 +27,7 @@
26#include <linux/mount.h> 27#include <linux/mount.h>
27#include <linux/ramfs.h> 28#include <linux/ramfs.h>
28#include <linux/log2.h> 29#include <linux/log2.h>
30#include <linux/idr.h>
29#include <asm/uaccess.h> 31#include <asm/uaccess.h>
30#include <asm/unistd.h> 32#include <asm/unistd.h>
31#include "pnode.h" 33#include "pnode.h"
@@ -38,6 +40,8 @@
38__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); 40__cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock);
39 41
40static int event; 42static int event;
43static DEFINE_IDA(mnt_id_ida);
44static DEFINE_IDA(mnt_group_ida);
41 45
42static struct list_head *mount_hashtable __read_mostly; 46static struct list_head *mount_hashtable __read_mostly;
43static struct kmem_cache *mnt_cache __read_mostly; 47static struct kmem_cache *mnt_cache __read_mostly;
@@ -55,10 +59,65 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry)
55 return tmp & (HASH_SIZE - 1); 59 return tmp & (HASH_SIZE - 1);
56} 60}
57 61
62#define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16)
63
64/* allocation is serialized by namespace_sem */
65static int mnt_alloc_id(struct vfsmount *mnt)
66{
67 int res;
68
69retry:
70 ida_pre_get(&mnt_id_ida, GFP_KERNEL);
71 spin_lock(&vfsmount_lock);
72 res = ida_get_new(&mnt_id_ida, &mnt->mnt_id);
73 spin_unlock(&vfsmount_lock);
74 if (res == -EAGAIN)
75 goto retry;
76
77 return res;
78}
79
80static void mnt_free_id(struct vfsmount *mnt)
81{
82 spin_lock(&vfsmount_lock);
83 ida_remove(&mnt_id_ida, mnt->mnt_id);
84 spin_unlock(&vfsmount_lock);
85}
86
87/*
88 * Allocate a new peer group ID
89 *
90 * mnt_group_ida is protected by namespace_sem
91 */
92static int mnt_alloc_group_id(struct vfsmount *mnt)
93{
94 if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL))
95 return -ENOMEM;
96
97 return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id);
98}
99
100/*
101 * Release a peer group ID
102 */
103void mnt_release_group_id(struct vfsmount *mnt)
104{
105 ida_remove(&mnt_group_ida, mnt->mnt_group_id);
106 mnt->mnt_group_id = 0;
107}
108
58struct vfsmount *alloc_vfsmnt(const char *name) 109struct vfsmount *alloc_vfsmnt(const char *name)
59{ 110{
60 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 111 struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
61 if (mnt) { 112 if (mnt) {
113 int err;
114
115 err = mnt_alloc_id(mnt);
116 if (err) {
117 kmem_cache_free(mnt_cache, mnt);
118 return NULL;
119 }
120
62 atomic_set(&mnt->mnt_count, 1); 121 atomic_set(&mnt->mnt_count, 1);
63 INIT_LIST_HEAD(&mnt->mnt_hash); 122 INIT_LIST_HEAD(&mnt->mnt_hash);
64 INIT_LIST_HEAD(&mnt->mnt_child); 123 INIT_LIST_HEAD(&mnt->mnt_child);
@@ -68,6 +127,7 @@ struct vfsmount *alloc_vfsmnt(const char *name)
68 INIT_LIST_HEAD(&mnt->mnt_share); 127 INIT_LIST_HEAD(&mnt->mnt_share);
69 INIT_LIST_HEAD(&mnt->mnt_slave_list); 128 INIT_LIST_HEAD(&mnt->mnt_slave_list);
70 INIT_LIST_HEAD(&mnt->mnt_slave); 129 INIT_LIST_HEAD(&mnt->mnt_slave);
130 atomic_set(&mnt->__mnt_writers, 0);
71 if (name) { 131 if (name) {
72 int size = strlen(name) + 1; 132 int size = strlen(name) + 1;
73 char *newname = kmalloc(size, GFP_KERNEL); 133 char *newname = kmalloc(size, GFP_KERNEL);
@@ -80,6 +140,263 @@ struct vfsmount *alloc_vfsmnt(const char *name)
80 return mnt; 140 return mnt;
81} 141}
82 142
143/*
144 * Most r/o checks on a fs are for operations that take
145 * discrete amounts of time, like a write() or unlink().
146 * We must keep track of when those operations start
147 * (for permission checks) and when they end, so that
148 * we can determine when writes are able to occur to
149 * a filesystem.
150 */
151/*
152 * __mnt_is_readonly: check whether a mount is read-only
153 * @mnt: the mount to check for its write status
154 *
155 * This shouldn't be used directly outside of the VFS.
156 * It does not guarantee that the filesystem will stay
157 * r/w, just that it is right *now*. This can not and
158 * should not be used in place of IS_RDONLY(inode).
159 * mnt_want/drop_write() will _keep_ the filesystem
160 * r/w.
161 */
162int __mnt_is_readonly(struct vfsmount *mnt)
163{
164 if (mnt->mnt_flags & MNT_READONLY)
165 return 1;
166 if (mnt->mnt_sb->s_flags & MS_RDONLY)
167 return 1;
168 return 0;
169}
170EXPORT_SYMBOL_GPL(__mnt_is_readonly);
171
172struct mnt_writer {
173 /*
174 * If holding multiple instances of this lock, they
175 * must be ordered by cpu number.
176 */
177 spinlock_t lock;
178 struct lock_class_key lock_class; /* compiles out with !lockdep */
179 unsigned long count;
180 struct vfsmount *mnt;
181} ____cacheline_aligned_in_smp;
182static DEFINE_PER_CPU(struct mnt_writer, mnt_writers);
183
184static int __init init_mnt_writers(void)
185{
186 int cpu;
187 for_each_possible_cpu(cpu) {
188 struct mnt_writer *writer = &per_cpu(mnt_writers, cpu);
189 spin_lock_init(&writer->lock);
190 lockdep_set_class(&writer->lock, &writer->lock_class);
191 writer->count = 0;
192 }
193 return 0;
194}
195fs_initcall(init_mnt_writers);
196
197static void unlock_mnt_writers(void)
198{
199 int cpu;
200 struct mnt_writer *cpu_writer;
201
202 for_each_possible_cpu(cpu) {
203 cpu_writer = &per_cpu(mnt_writers, cpu);
204 spin_unlock(&cpu_writer->lock);
205 }
206}
207
208static inline void __clear_mnt_count(struct mnt_writer *cpu_writer)
209{
210 if (!cpu_writer->mnt)
211 return;
212 /*
213 * This is in case anyone ever leaves an invalid,
214 * old ->mnt and a count of 0.
215 */
216 if (!cpu_writer->count)
217 return;
218 atomic_add(cpu_writer->count, &cpu_writer->mnt->__mnt_writers);
219 cpu_writer->count = 0;
220}
221 /*
222 * must hold cpu_writer->lock
223 */
224static inline void use_cpu_writer_for_mount(struct mnt_writer *cpu_writer,
225 struct vfsmount *mnt)
226{
227 if (cpu_writer->mnt == mnt)
228 return;
229 __clear_mnt_count(cpu_writer);
230 cpu_writer->mnt = mnt;
231}
232
233/*
234 * Most r/o checks on a fs are for operations that take
235 * discrete amounts of time, like a write() or unlink().
236 * We must keep track of when those operations start
237 * (for permission checks) and when they end, so that
238 * we can determine when writes are able to occur to
239 * a filesystem.
240 */
241/**
242 * mnt_want_write - get write access to a mount
243 * @mnt: the mount on which to take a write
244 *
245 * This tells the low-level filesystem that a write is
246 * about to be performed to it, and makes sure that
247 * writes are allowed before returning success. When
248 * the write operation is finished, mnt_drop_write()
249 * must be called. This is effectively a refcount.
250 */
251int mnt_want_write(struct vfsmount *mnt)
252{
253 int ret = 0;
254 struct mnt_writer *cpu_writer;
255
256 cpu_writer = &get_cpu_var(mnt_writers);
257 spin_lock(&cpu_writer->lock);
258 if (__mnt_is_readonly(mnt)) {
259 ret = -EROFS;
260 goto out;
261 }
262 use_cpu_writer_for_mount(cpu_writer, mnt);
263 cpu_writer->count++;
264out:
265 spin_unlock(&cpu_writer->lock);
266 put_cpu_var(mnt_writers);
267 return ret;
268}
269EXPORT_SYMBOL_GPL(mnt_want_write);
270
271static void lock_mnt_writers(void)
272{
273 int cpu;
274 struct mnt_writer *cpu_writer;
275
276 for_each_possible_cpu(cpu) {
277 cpu_writer = &per_cpu(mnt_writers, cpu);
278 spin_lock(&cpu_writer->lock);
279 __clear_mnt_count(cpu_writer);
280 cpu_writer->mnt = NULL;
281 }
282}
283
284/*
285 * These per-cpu write counts are not guaranteed to have
286 * matched increments and decrements on any given cpu.
287 * A file open()ed for write on one cpu and close()d on
288 * another cpu will imbalance this count. Make sure it
289 * does not get too far out of whack.
290 */
291static void handle_write_count_underflow(struct vfsmount *mnt)
292{
293 if (atomic_read(&mnt->__mnt_writers) >=
294 MNT_WRITER_UNDERFLOW_LIMIT)
295 return;
296 /*
297 * It isn't necessary to hold all of the locks
298 * at the same time, but doing it this way makes
299 * us share a lot more code.
300 */
301 lock_mnt_writers();
302 /*
303 * vfsmount_lock is for mnt_flags.
304 */
305 spin_lock(&vfsmount_lock);
306 /*
307 * If coalescing the per-cpu writer counts did not
308 * get us back to a positive writer count, we have
309 * a bug.
310 */
311 if ((atomic_read(&mnt->__mnt_writers) < 0) &&
312 !(mnt->mnt_flags & MNT_IMBALANCED_WRITE_COUNT)) {
313 printk(KERN_DEBUG "leak detected on mount(%p) writers "
314 "count: %d\n",
315 mnt, atomic_read(&mnt->__mnt_writers));
316 WARN_ON(1);
317 /* use the flag to keep the dmesg spam down */
318 mnt->mnt_flags |= MNT_IMBALANCED_WRITE_COUNT;
319 }
320 spin_unlock(&vfsmount_lock);
321 unlock_mnt_writers();
322}
323
324/**
325 * mnt_drop_write - give up write access to a mount
326 * @mnt: the mount on which to give up write access
327 *
328 * Tells the low-level filesystem that we are done
329 * performing writes to it. Must be matched with
330 * mnt_want_write() call above.
331 */
332void mnt_drop_write(struct vfsmount *mnt)
333{
334 int must_check_underflow = 0;
335 struct mnt_writer *cpu_writer;
336
337 cpu_writer = &get_cpu_var(mnt_writers);
338 spin_lock(&cpu_writer->lock);
339
340 use_cpu_writer_for_mount(cpu_writer, mnt);
341 if (cpu_writer->count > 0) {
342 cpu_writer->count--;
343 } else {
344 must_check_underflow = 1;
345 atomic_dec(&mnt->__mnt_writers);
346 }
347
348 spin_unlock(&cpu_writer->lock);
349 /*
350 * Logically, we could call this each time,
351 * but the __mnt_writers cacheline tends to
352 * be cold, and makes this expensive.
353 */
354 if (must_check_underflow)
355 handle_write_count_underflow(mnt);
356 /*
357 * This could be done right after the spinlock
358 * is taken because the spinlock keeps us on
359 * the cpu, and disables preemption. However,
360 * putting it here bounds the amount that
361 * __mnt_writers can underflow. Without it,
362 * we could theoretically wrap __mnt_writers.
363 */
364 put_cpu_var(mnt_writers);
365}
366EXPORT_SYMBOL_GPL(mnt_drop_write);
367
368static int mnt_make_readonly(struct vfsmount *mnt)
369{
370 int ret = 0;
371
372 lock_mnt_writers();
373 /*
374 * With all the locks held, this value is stable
375 */
376 if (atomic_read(&mnt->__mnt_writers) > 0) {
377 ret = -EBUSY;
378 goto out;
379 }
380 /*
381 * nobody can do a successful mnt_want_write() with all
382 * of the counts in MNT_DENIED_WRITE and the locks held.
383 */
384 spin_lock(&vfsmount_lock);
385 if (!ret)
386 mnt->mnt_flags |= MNT_READONLY;
387 spin_unlock(&vfsmount_lock);
388out:
389 unlock_mnt_writers();
390 return ret;
391}
392
393static void __mnt_unmake_readonly(struct vfsmount *mnt)
394{
395 spin_lock(&vfsmount_lock);
396 mnt->mnt_flags &= ~MNT_READONLY;
397 spin_unlock(&vfsmount_lock);
398}
399
83int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb) 400int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
84{ 401{
85 mnt->mnt_sb = sb; 402 mnt->mnt_sb = sb;
@@ -92,6 +409,7 @@ EXPORT_SYMBOL(simple_set_mnt);
92void free_vfsmnt(struct vfsmount *mnt) 409void free_vfsmnt(struct vfsmount *mnt)
93{ 410{
94 kfree(mnt->mnt_devname); 411 kfree(mnt->mnt_devname);
412 mnt_free_id(mnt);
95 kmem_cache_free(mnt_cache, mnt); 413 kmem_cache_free(mnt_cache, mnt);
96} 414}
97 415
@@ -238,6 +556,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
238 struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); 556 struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname);
239 557
240 if (mnt) { 558 if (mnt) {
559 if (flag & (CL_SLAVE | CL_PRIVATE))
560 mnt->mnt_group_id = 0; /* not a peer of original */
561 else
562 mnt->mnt_group_id = old->mnt_group_id;
563
564 if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) {
565 int err = mnt_alloc_group_id(mnt);
566 if (err)
567 goto out_free;
568 }
569
241 mnt->mnt_flags = old->mnt_flags; 570 mnt->mnt_flags = old->mnt_flags;
242 atomic_inc(&sb->s_active); 571 atomic_inc(&sb->s_active);
243 mnt->mnt_sb = sb; 572 mnt->mnt_sb = sb;
@@ -267,11 +596,44 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root,
267 } 596 }
268 } 597 }
269 return mnt; 598 return mnt;
599
600 out_free:
601 free_vfsmnt(mnt);
602 return NULL;
270} 603}
271 604
272static inline void __mntput(struct vfsmount *mnt) 605static inline void __mntput(struct vfsmount *mnt)
273{ 606{
607 int cpu;
274 struct super_block *sb = mnt->mnt_sb; 608 struct super_block *sb = mnt->mnt_sb;
609 /*
610 * We don't have to hold all of the locks at the
611 * same time here because we know that we're the
612 * last reference to mnt and that no new writers
613 * can come in.
614 */
615 for_each_possible_cpu(cpu) {
616 struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu);
617 if (cpu_writer->mnt != mnt)
618 continue;
619 spin_lock(&cpu_writer->lock);
620 atomic_add(cpu_writer->count, &mnt->__mnt_writers);
621 cpu_writer->count = 0;
622 /*
623 * Might as well do this so that no one
624 * ever sees the pointer and expects
625 * it to be valid.
626 */
627 cpu_writer->mnt = NULL;
628 spin_unlock(&cpu_writer->lock);
629 }
630 /*
631 * This probably indicates that somebody messed
632 * up a mnt_want/drop_write() pair. If this
633 * happens, the filesystem was probably unable
634 * to make r/w->r/o transitions.
635 */
636 WARN_ON(atomic_read(&mnt->__mnt_writers));
275 dput(mnt->mnt_root); 637 dput(mnt->mnt_root);
276 free_vfsmnt(mnt); 638 free_vfsmnt(mnt);
277 deactivate_super(sb); 639 deactivate_super(sb);
@@ -362,20 +724,21 @@ void save_mount_options(struct super_block *sb, char *options)
362} 724}
363EXPORT_SYMBOL(save_mount_options); 725EXPORT_SYMBOL(save_mount_options);
364 726
727#ifdef CONFIG_PROC_FS
365/* iterator */ 728/* iterator */
366static void *m_start(struct seq_file *m, loff_t *pos) 729static void *m_start(struct seq_file *m, loff_t *pos)
367{ 730{
368 struct mnt_namespace *n = m->private; 731 struct proc_mounts *p = m->private;
369 732
370 down_read(&namespace_sem); 733 down_read(&namespace_sem);
371 return seq_list_start(&n->list, *pos); 734 return seq_list_start(&p->ns->list, *pos);
372} 735}
373 736
374static void *m_next(struct seq_file *m, void *v, loff_t *pos) 737static void *m_next(struct seq_file *m, void *v, loff_t *pos)
375{ 738{
376 struct mnt_namespace *n = m->private; 739 struct proc_mounts *p = m->private;
377 740
378 return seq_list_next(v, &n->list, pos); 741 return seq_list_next(v, &p->ns->list, pos);
379} 742}
380 743
381static void m_stop(struct seq_file *m, void *v) 744static void m_stop(struct seq_file *m, void *v)
@@ -383,20 +746,30 @@ static void m_stop(struct seq_file *m, void *v)
383 up_read(&namespace_sem); 746 up_read(&namespace_sem);
384} 747}
385 748
386static int show_vfsmnt(struct seq_file *m, void *v) 749struct proc_fs_info {
750 int flag;
751 const char *str;
752};
753
754static void show_sb_opts(struct seq_file *m, struct super_block *sb)
387{ 755{
388 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 756 static const struct proc_fs_info fs_info[] = {
389 int err = 0;
390 static struct proc_fs_info {
391 int flag;
392 char *str;
393 } fs_info[] = {
394 { MS_SYNCHRONOUS, ",sync" }, 757 { MS_SYNCHRONOUS, ",sync" },
395 { MS_DIRSYNC, ",dirsync" }, 758 { MS_DIRSYNC, ",dirsync" },
396 { MS_MANDLOCK, ",mand" }, 759 { MS_MANDLOCK, ",mand" },
397 { 0, NULL } 760 { 0, NULL }
398 }; 761 };
399 static struct proc_fs_info mnt_info[] = { 762 const struct proc_fs_info *fs_infop;
763
764 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
765 if (sb->s_flags & fs_infop->flag)
766 seq_puts(m, fs_infop->str);
767 }
768}
769
770static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt)
771{
772 static const struct proc_fs_info mnt_info[] = {
400 { MNT_NOSUID, ",nosuid" }, 773 { MNT_NOSUID, ",nosuid" },
401 { MNT_NODEV, ",nodev" }, 774 { MNT_NODEV, ",nodev" },
402 { MNT_NOEXEC, ",noexec" }, 775 { MNT_NOEXEC, ",noexec" },
@@ -405,40 +778,108 @@ static int show_vfsmnt(struct seq_file *m, void *v)
405 { MNT_RELATIME, ",relatime" }, 778 { MNT_RELATIME, ",relatime" },
406 { 0, NULL } 779 { 0, NULL }
407 }; 780 };
408 struct proc_fs_info *fs_infop; 781 const struct proc_fs_info *fs_infop;
782
783 for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
784 if (mnt->mnt_flags & fs_infop->flag)
785 seq_puts(m, fs_infop->str);
786 }
787}
788
789static void show_type(struct seq_file *m, struct super_block *sb)
790{
791 mangle(m, sb->s_type->name);
792 if (sb->s_subtype && sb->s_subtype[0]) {
793 seq_putc(m, '.');
794 mangle(m, sb->s_subtype);
795 }
796}
797
798static int show_vfsmnt(struct seq_file *m, void *v)
799{
800 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
801 int err = 0;
409 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; 802 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
410 803
411 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); 804 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
412 seq_putc(m, ' '); 805 seq_putc(m, ' ');
413 seq_path(m, &mnt_path, " \t\n\\"); 806 seq_path(m, &mnt_path, " \t\n\\");
414 seq_putc(m, ' '); 807 seq_putc(m, ' ');
415 mangle(m, mnt->mnt_sb->s_type->name); 808 show_type(m, mnt->mnt_sb);
416 if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { 809 seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw");
417 seq_putc(m, '.'); 810 show_sb_opts(m, mnt->mnt_sb);
418 mangle(m, mnt->mnt_sb->s_subtype); 811 show_mnt_opts(m, mnt);
419 }
420 seq_puts(m, mnt->mnt_sb->s_flags & MS_RDONLY ? " ro" : " rw");
421 for (fs_infop = fs_info; fs_infop->flag; fs_infop++) {
422 if (mnt->mnt_sb->s_flags & fs_infop->flag)
423 seq_puts(m, fs_infop->str);
424 }
425 for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) {
426 if (mnt->mnt_flags & fs_infop->flag)
427 seq_puts(m, fs_infop->str);
428 }
429 if (mnt->mnt_sb->s_op->show_options) 812 if (mnt->mnt_sb->s_op->show_options)
430 err = mnt->mnt_sb->s_op->show_options(m, mnt); 813 err = mnt->mnt_sb->s_op->show_options(m, mnt);
431 seq_puts(m, " 0 0\n"); 814 seq_puts(m, " 0 0\n");
432 return err; 815 return err;
433} 816}
434 817
435struct seq_operations mounts_op = { 818const struct seq_operations mounts_op = {
436 .start = m_start, 819 .start = m_start,
437 .next = m_next, 820 .next = m_next,
438 .stop = m_stop, 821 .stop = m_stop,
439 .show = show_vfsmnt 822 .show = show_vfsmnt
440}; 823};
441 824
825static int show_mountinfo(struct seq_file *m, void *v)
826{
827 struct proc_mounts *p = m->private;
828 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
829 struct super_block *sb = mnt->mnt_sb;
830 struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
831 struct path root = p->root;
832 int err = 0;
833
834 seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id,
835 MAJOR(sb->s_dev), MINOR(sb->s_dev));
836 seq_dentry(m, mnt->mnt_root, " \t\n\\");
837 seq_putc(m, ' ');
838 seq_path_root(m, &mnt_path, &root, " \t\n\\");
839 if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) {
840 /*
841 * Mountpoint is outside root, discard that one. Ugly,
842 * but less so than trying to do that in iterator in a
843 * race-free way (due to renames).
844 */
845 return SEQ_SKIP;
846 }
847 seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw");
848 show_mnt_opts(m, mnt);
849
850 /* Tagged fields ("foo:X" or "bar") */
851 if (IS_MNT_SHARED(mnt))
852 seq_printf(m, " shared:%i", mnt->mnt_group_id);
853 if (IS_MNT_SLAVE(mnt)) {
854 int master = mnt->mnt_master->mnt_group_id;
855 int dom = get_dominating_id(mnt, &p->root);
856 seq_printf(m, " master:%i", master);
857 if (dom && dom != master)
858 seq_printf(m, " propagate_from:%i", dom);
859 }
860 if (IS_MNT_UNBINDABLE(mnt))
861 seq_puts(m, " unbindable");
862
863 /* Filesystem specific data */
864 seq_puts(m, " - ");
865 show_type(m, sb);
866 seq_putc(m, ' ');
867 mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none");
868 seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
869 show_sb_opts(m, sb);
870 if (sb->s_op->show_options)
871 err = sb->s_op->show_options(m, mnt);
872 seq_putc(m, '\n');
873 return err;
874}
875
876const struct seq_operations mountinfo_op = {
877 .start = m_start,
878 .next = m_next,
879 .stop = m_stop,
880 .show = show_mountinfo,
881};
882
442static int show_vfsstat(struct seq_file *m, void *v) 883static int show_vfsstat(struct seq_file *m, void *v)
443{ 884{
444 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); 885 struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list);
@@ -459,7 +900,7 @@ static int show_vfsstat(struct seq_file *m, void *v)
459 900
460 /* file system type */ 901 /* file system type */
461 seq_puts(m, "with fstype "); 902 seq_puts(m, "with fstype ");
462 mangle(m, mnt->mnt_sb->s_type->name); 903 show_type(m, mnt->mnt_sb);
463 904
464 /* optional statistics */ 905 /* optional statistics */
465 if (mnt->mnt_sb->s_op->show_stats) { 906 if (mnt->mnt_sb->s_op->show_stats) {
@@ -471,12 +912,13 @@ static int show_vfsstat(struct seq_file *m, void *v)
471 return err; 912 return err;
472} 913}
473 914
474struct seq_operations mountstats_op = { 915const struct seq_operations mountstats_op = {
475 .start = m_start, 916 .start = m_start,
476 .next = m_next, 917 .next = m_next,
477 .stop = m_stop, 918 .stop = m_stop,
478 .show = show_vfsstat, 919 .show = show_vfsstat,
479}; 920};
921#endif /* CONFIG_PROC_FS */
480 922
481/** 923/**
482 * may_umount_tree - check if a mount tree is busy 924 * may_umount_tree - check if a mount tree is busy
@@ -801,23 +1243,50 @@ Enomem:
801struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry) 1243struct vfsmount *collect_mounts(struct vfsmount *mnt, struct dentry *dentry)
802{ 1244{
803 struct vfsmount *tree; 1245 struct vfsmount *tree;
804 down_read(&namespace_sem); 1246 down_write(&namespace_sem);
805 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE); 1247 tree = copy_tree(mnt, dentry, CL_COPY_ALL | CL_PRIVATE);
806 up_read(&namespace_sem); 1248 up_write(&namespace_sem);
807 return tree; 1249 return tree;
808} 1250}
809 1251
810void drop_collected_mounts(struct vfsmount *mnt) 1252void drop_collected_mounts(struct vfsmount *mnt)
811{ 1253{
812 LIST_HEAD(umount_list); 1254 LIST_HEAD(umount_list);
813 down_read(&namespace_sem); 1255 down_write(&namespace_sem);
814 spin_lock(&vfsmount_lock); 1256 spin_lock(&vfsmount_lock);
815 umount_tree(mnt, 0, &umount_list); 1257 umount_tree(mnt, 0, &umount_list);
816 spin_unlock(&vfsmount_lock); 1258 spin_unlock(&vfsmount_lock);
817 up_read(&namespace_sem); 1259 up_write(&namespace_sem);
818 release_mounts(&umount_list); 1260 release_mounts(&umount_list);
819} 1261}
820 1262
1263static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end)
1264{
1265 struct vfsmount *p;
1266
1267 for (p = mnt; p != end; p = next_mnt(p, mnt)) {
1268 if (p->mnt_group_id && !IS_MNT_SHARED(p))
1269 mnt_release_group_id(p);
1270 }
1271}
1272
1273static int invent_group_ids(struct vfsmount *mnt, bool recurse)
1274{
1275 struct vfsmount *p;
1276
1277 for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) {
1278 if (!p->mnt_group_id && !IS_MNT_SHARED(p)) {
1279 int err = mnt_alloc_group_id(p);
1280 if (err) {
1281 cleanup_group_ids(mnt, p);
1282 return err;
1283 }
1284 }
1285 }
1286
1287 return 0;
1288}
1289
821/* 1290/*
822 * @source_mnt : mount tree to be attached 1291 * @source_mnt : mount tree to be attached
823 * @nd : place the mount tree @source_mnt is attached 1292 * @nd : place the mount tree @source_mnt is attached
@@ -888,9 +1357,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
888 struct vfsmount *dest_mnt = path->mnt; 1357 struct vfsmount *dest_mnt = path->mnt;
889 struct dentry *dest_dentry = path->dentry; 1358 struct dentry *dest_dentry = path->dentry;
890 struct vfsmount *child, *p; 1359 struct vfsmount *child, *p;
1360 int err;
891 1361
892 if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) 1362 if (IS_MNT_SHARED(dest_mnt)) {
893 return -EINVAL; 1363 err = invent_group_ids(source_mnt, true);
1364 if (err)
1365 goto out;
1366 }
1367 err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list);
1368 if (err)
1369 goto out_cleanup_ids;
894 1370
895 if (IS_MNT_SHARED(dest_mnt)) { 1371 if (IS_MNT_SHARED(dest_mnt)) {
896 for (p = source_mnt; p; p = next_mnt(p, source_mnt)) 1372 for (p = source_mnt; p; p = next_mnt(p, source_mnt))
@@ -913,34 +1389,40 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt,
913 } 1389 }
914 spin_unlock(&vfsmount_lock); 1390 spin_unlock(&vfsmount_lock);
915 return 0; 1391 return 0;
1392
1393 out_cleanup_ids:
1394 if (IS_MNT_SHARED(dest_mnt))
1395 cleanup_group_ids(source_mnt, NULL);
1396 out:
1397 return err;
916} 1398}
917 1399
918static int graft_tree(struct vfsmount *mnt, struct nameidata *nd) 1400static int graft_tree(struct vfsmount *mnt, struct path *path)
919{ 1401{
920 int err; 1402 int err;
921 if (mnt->mnt_sb->s_flags & MS_NOUSER) 1403 if (mnt->mnt_sb->s_flags & MS_NOUSER)
922 return -EINVAL; 1404 return -EINVAL;
923 1405
924 if (S_ISDIR(nd->path.dentry->d_inode->i_mode) != 1406 if (S_ISDIR(path->dentry->d_inode->i_mode) !=
925 S_ISDIR(mnt->mnt_root->d_inode->i_mode)) 1407 S_ISDIR(mnt->mnt_root->d_inode->i_mode))
926 return -ENOTDIR; 1408 return -ENOTDIR;
927 1409
928 err = -ENOENT; 1410 err = -ENOENT;
929 mutex_lock(&nd->path.dentry->d_inode->i_mutex); 1411 mutex_lock(&path->dentry->d_inode->i_mutex);
930 if (IS_DEADDIR(nd->path.dentry->d_inode)) 1412 if (IS_DEADDIR(path->dentry->d_inode))
931 goto out_unlock; 1413 goto out_unlock;
932 1414
933 err = security_sb_check_sb(mnt, nd); 1415 err = security_sb_check_sb(mnt, path);
934 if (err) 1416 if (err)
935 goto out_unlock; 1417 goto out_unlock;
936 1418
937 err = -ENOENT; 1419 err = -ENOENT;
938 if (IS_ROOT(nd->path.dentry) || !d_unhashed(nd->path.dentry)) 1420 if (IS_ROOT(path->dentry) || !d_unhashed(path->dentry))
939 err = attach_recursive_mnt(mnt, &nd->path, NULL); 1421 err = attach_recursive_mnt(mnt, path, NULL);
940out_unlock: 1422out_unlock:
941 mutex_unlock(&nd->path.dentry->d_inode->i_mutex); 1423 mutex_unlock(&path->dentry->d_inode->i_mutex);
942 if (!err) 1424 if (!err)
943 security_sb_post_addmount(mnt, nd); 1425 security_sb_post_addmount(mnt, path);
944 return err; 1426 return err;
945} 1427}
946 1428
@@ -953,6 +1435,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
953 struct vfsmount *m, *mnt = nd->path.mnt; 1435 struct vfsmount *m, *mnt = nd->path.mnt;
954 int recurse = flag & MS_REC; 1436 int recurse = flag & MS_REC;
955 int type = flag & ~MS_REC; 1437 int type = flag & ~MS_REC;
1438 int err = 0;
956 1439
957 if (!capable(CAP_SYS_ADMIN)) 1440 if (!capable(CAP_SYS_ADMIN))
958 return -EPERM; 1441 return -EPERM;
@@ -961,12 +1444,20 @@ static noinline int do_change_type(struct nameidata *nd, int flag)
961 return -EINVAL; 1444 return -EINVAL;
962 1445
963 down_write(&namespace_sem); 1446 down_write(&namespace_sem);
1447 if (type == MS_SHARED) {
1448 err = invent_group_ids(mnt, recurse);
1449 if (err)
1450 goto out_unlock;
1451 }
1452
964 spin_lock(&vfsmount_lock); 1453 spin_lock(&vfsmount_lock);
965 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) 1454 for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL))
966 change_mnt_propagation(m, type); 1455 change_mnt_propagation(m, type);
967 spin_unlock(&vfsmount_lock); 1456 spin_unlock(&vfsmount_lock);
1457
1458 out_unlock:
968 up_write(&namespace_sem); 1459 up_write(&namespace_sem);
969 return 0; 1460 return err;
970} 1461}
971 1462
972/* 1463/*
@@ -1004,7 +1495,7 @@ static noinline int do_loopback(struct nameidata *nd, char *old_name,
1004 if (!mnt) 1495 if (!mnt)
1005 goto out; 1496 goto out;
1006 1497
1007 err = graft_tree(mnt, nd); 1498 err = graft_tree(mnt, &nd->path);
1008 if (err) { 1499 if (err) {
1009 LIST_HEAD(umount_list); 1500 LIST_HEAD(umount_list);
1010 spin_lock(&vfsmount_lock); 1501 spin_lock(&vfsmount_lock);
@@ -1019,6 +1510,23 @@ out:
1019 return err; 1510 return err;
1020} 1511}
1021 1512
1513static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
1514{
1515 int error = 0;
1516 int readonly_request = 0;
1517
1518 if (ms_flags & MS_RDONLY)
1519 readonly_request = 1;
1520 if (readonly_request == __mnt_is_readonly(mnt))
1521 return 0;
1522
1523 if (readonly_request)
1524 error = mnt_make_readonly(mnt);
1525 else
1526 __mnt_unmake_readonly(mnt);
1527 return error;
1528}
1529
1022/* 1530/*
1023 * change filesystem flags. dir should be a physical root of filesystem. 1531 * change filesystem flags. dir should be a physical root of filesystem.
1024 * If you've mounted a non-root directory somewhere and want to do remount 1532 * If you've mounted a non-root directory somewhere and want to do remount
@@ -1041,7 +1549,10 @@ static noinline int do_remount(struct nameidata *nd, int flags, int mnt_flags,
1041 return -EINVAL; 1549 return -EINVAL;
1042 1550
1043 down_write(&sb->s_umount); 1551 down_write(&sb->s_umount);
1044 err = do_remount_sb(sb, flags, data, 0); 1552 if (flags & MS_BIND)
1553 err = change_mount_flags(nd->path.mnt, flags);
1554 else
1555 err = do_remount_sb(sb, flags, data, 0);
1045 if (!err) 1556 if (!err)
1046 nd->path.mnt->mnt_flags = mnt_flags; 1557 nd->path.mnt->mnt_flags = mnt_flags;
1047 up_write(&sb->s_umount); 1558 up_write(&sb->s_umount);
@@ -1191,7 +1702,7 @@ int do_add_mount(struct vfsmount *newmnt, struct nameidata *nd,
1191 goto unlock; 1702 goto unlock;
1192 1703
1193 newmnt->mnt_flags = mnt_flags; 1704 newmnt->mnt_flags = mnt_flags;
1194 if ((err = graft_tree(newmnt, nd))) 1705 if ((err = graft_tree(newmnt, &nd->path)))
1195 goto unlock; 1706 goto unlock;
1196 1707
1197 if (fslist) /* add to the specified expiration list */ 1708 if (fslist) /* add to the specified expiration list */
@@ -1425,6 +1936,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1425 mnt_flags |= MNT_NODIRATIME; 1936 mnt_flags |= MNT_NODIRATIME;
1426 if (flags & MS_RELATIME) 1937 if (flags & MS_RELATIME)
1427 mnt_flags |= MNT_RELATIME; 1938 mnt_flags |= MNT_RELATIME;
1939 if (flags & MS_RDONLY)
1940 mnt_flags |= MNT_READONLY;
1428 1941
1429 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE | 1942 flags &= ~(MS_NOSUID | MS_NOEXEC | MS_NODEV | MS_ACTIVE |
1430 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT); 1943 MS_NOATIME | MS_NODIRATIME | MS_RELATIME| MS_KERNMOUNT);
@@ -1434,7 +1947,8 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
1434 if (retval) 1947 if (retval)
1435 return retval; 1948 return retval;
1436 1949
1437 retval = security_sb_mount(dev_name, &nd, type_page, flags, data_page); 1950 retval = security_sb_mount(dev_name, &nd.path,
1951 type_page, flags, data_page);
1438 if (retval) 1952 if (retval)
1439 goto dput_out; 1953 goto dput_out;
1440 1954
@@ -1674,15 +2188,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1674 const char __user * put_old) 2188 const char __user * put_old)
1675{ 2189{
1676 struct vfsmount *tmp; 2190 struct vfsmount *tmp;
1677 struct nameidata new_nd, old_nd, user_nd; 2191 struct nameidata new_nd, old_nd;
1678 struct path parent_path, root_parent; 2192 struct path parent_path, root_parent, root;
1679 int error; 2193 int error;
1680 2194
1681 if (!capable(CAP_SYS_ADMIN)) 2195 if (!capable(CAP_SYS_ADMIN))
1682 return -EPERM; 2196 return -EPERM;
1683 2197
1684 lock_kernel();
1685
1686 error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, 2198 error = __user_walk(new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
1687 &new_nd); 2199 &new_nd);
1688 if (error) 2200 if (error)
@@ -1695,14 +2207,14 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1695 if (error) 2207 if (error)
1696 goto out1; 2208 goto out1;
1697 2209
1698 error = security_sb_pivotroot(&old_nd, &new_nd); 2210 error = security_sb_pivotroot(&old_nd.path, &new_nd.path);
1699 if (error) { 2211 if (error) {
1700 path_put(&old_nd.path); 2212 path_put(&old_nd.path);
1701 goto out1; 2213 goto out1;
1702 } 2214 }
1703 2215
1704 read_lock(&current->fs->lock); 2216 read_lock(&current->fs->lock);
1705 user_nd.path = current->fs->root; 2217 root = current->fs->root;
1706 path_get(&current->fs->root); 2218 path_get(&current->fs->root);
1707 read_unlock(&current->fs->lock); 2219 read_unlock(&current->fs->lock);
1708 down_write(&namespace_sem); 2220 down_write(&namespace_sem);
@@ -1710,9 +2222,9 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1710 error = -EINVAL; 2222 error = -EINVAL;
1711 if (IS_MNT_SHARED(old_nd.path.mnt) || 2223 if (IS_MNT_SHARED(old_nd.path.mnt) ||
1712 IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) || 2224 IS_MNT_SHARED(new_nd.path.mnt->mnt_parent) ||
1713 IS_MNT_SHARED(user_nd.path.mnt->mnt_parent)) 2225 IS_MNT_SHARED(root.mnt->mnt_parent))
1714 goto out2; 2226 goto out2;
1715 if (!check_mnt(user_nd.path.mnt)) 2227 if (!check_mnt(root.mnt))
1716 goto out2; 2228 goto out2;
1717 error = -ENOENT; 2229 error = -ENOENT;
1718 if (IS_DEADDIR(new_nd.path.dentry->d_inode)) 2230 if (IS_DEADDIR(new_nd.path.dentry->d_inode))
@@ -1722,13 +2234,13 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1722 if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry)) 2234 if (d_unhashed(old_nd.path.dentry) && !IS_ROOT(old_nd.path.dentry))
1723 goto out2; 2235 goto out2;
1724 error = -EBUSY; 2236 error = -EBUSY;
1725 if (new_nd.path.mnt == user_nd.path.mnt || 2237 if (new_nd.path.mnt == root.mnt ||
1726 old_nd.path.mnt == user_nd.path.mnt) 2238 old_nd.path.mnt == root.mnt)
1727 goto out2; /* loop, on the same file system */ 2239 goto out2; /* loop, on the same file system */
1728 error = -EINVAL; 2240 error = -EINVAL;
1729 if (user_nd.path.mnt->mnt_root != user_nd.path.dentry) 2241 if (root.mnt->mnt_root != root.dentry)
1730 goto out2; /* not a mountpoint */ 2242 goto out2; /* not a mountpoint */
1731 if (user_nd.path.mnt->mnt_parent == user_nd.path.mnt) 2243 if (root.mnt->mnt_parent == root.mnt)
1732 goto out2; /* not attached */ 2244 goto out2; /* not attached */
1733 if (new_nd.path.mnt->mnt_root != new_nd.path.dentry) 2245 if (new_nd.path.mnt->mnt_root != new_nd.path.dentry)
1734 goto out2; /* not a mountpoint */ 2246 goto out2; /* not a mountpoint */
@@ -1750,27 +2262,26 @@ asmlinkage long sys_pivot_root(const char __user * new_root,
1750 } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry)) 2262 } else if (!is_subdir(old_nd.path.dentry, new_nd.path.dentry))
1751 goto out3; 2263 goto out3;
1752 detach_mnt(new_nd.path.mnt, &parent_path); 2264 detach_mnt(new_nd.path.mnt, &parent_path);
1753 detach_mnt(user_nd.path.mnt, &root_parent); 2265 detach_mnt(root.mnt, &root_parent);
1754 /* mount old root on put_old */ 2266 /* mount old root on put_old */
1755 attach_mnt(user_nd.path.mnt, &old_nd.path); 2267 attach_mnt(root.mnt, &old_nd.path);
1756 /* mount new_root on / */ 2268 /* mount new_root on / */
1757 attach_mnt(new_nd.path.mnt, &root_parent); 2269 attach_mnt(new_nd.path.mnt, &root_parent);
1758 touch_mnt_namespace(current->nsproxy->mnt_ns); 2270 touch_mnt_namespace(current->nsproxy->mnt_ns);
1759 spin_unlock(&vfsmount_lock); 2271 spin_unlock(&vfsmount_lock);
1760 chroot_fs_refs(&user_nd.path, &new_nd.path); 2272 chroot_fs_refs(&root, &new_nd.path);
1761 security_sb_post_pivotroot(&user_nd, &new_nd); 2273 security_sb_post_pivotroot(&root, &new_nd.path);
1762 error = 0; 2274 error = 0;
1763 path_put(&root_parent); 2275 path_put(&root_parent);
1764 path_put(&parent_path); 2276 path_put(&parent_path);
1765out2: 2277out2:
1766 mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex); 2278 mutex_unlock(&old_nd.path.dentry->d_inode->i_mutex);
1767 up_write(&namespace_sem); 2279 up_write(&namespace_sem);
1768 path_put(&user_nd.path); 2280 path_put(&root);
1769 path_put(&old_nd.path); 2281 path_put(&old_nd.path);
1770out1: 2282out1:
1771 path_put(&new_nd.path); 2283 path_put(&new_nd.path);
1772out0: 2284out0:
1773 unlock_kernel();
1774 return error; 2285 return error;
1775out3: 2286out3:
1776 spin_unlock(&vfsmount_lock); 2287 spin_unlock(&vfsmount_lock);
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index c67b4bdcf719..ad8f167e54bc 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -14,6 +14,7 @@
14#include <linux/ioctl.h> 14#include <linux/ioctl.h>
15#include <linux/time.h> 15#include <linux/time.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/mount.h>
17#include <linux/highuid.h> 18#include <linux/highuid.h>
18#include <linux/smp_lock.h> 19#include <linux/smp_lock.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
@@ -261,7 +262,7 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
261} 262}
262#endif /* CONFIG_NCPFS_NLS */ 263#endif /* CONFIG_NCPFS_NLS */
263 264
264int ncp_ioctl(struct inode *inode, struct file *filp, 265static int __ncp_ioctl(struct inode *inode, struct file *filp,
265 unsigned int cmd, unsigned long arg) 266 unsigned int cmd, unsigned long arg)
266{ 267{
267 struct ncp_server *server = NCP_SERVER(inode); 268 struct ncp_server *server = NCP_SERVER(inode);
@@ -822,6 +823,57 @@ outrel:
822 return -EINVAL; 823 return -EINVAL;
823} 824}
824 825
826static int ncp_ioctl_need_write(unsigned int cmd)
827{
828 switch (cmd) {
829 case NCP_IOC_GET_FS_INFO:
830 case NCP_IOC_GET_FS_INFO_V2:
831 case NCP_IOC_NCPREQUEST:
832 case NCP_IOC_SETDENTRYTTL:
833 case NCP_IOC_SIGN_INIT:
834 case NCP_IOC_LOCKUNLOCK:
835 case NCP_IOC_SET_SIGN_WANTED:
836 return 1;
837 case NCP_IOC_GETOBJECTNAME:
838 case NCP_IOC_SETOBJECTNAME:
839 case NCP_IOC_GETPRIVATEDATA:
840 case NCP_IOC_SETPRIVATEDATA:
841 case NCP_IOC_SETCHARSETS:
842 case NCP_IOC_GETCHARSETS:
843 case NCP_IOC_CONN_LOGGED_IN:
844 case NCP_IOC_GETDENTRYTTL:
845 case NCP_IOC_GETMOUNTUID2:
846 case NCP_IOC_SIGN_WANTED:
847 case NCP_IOC_GETROOT:
848 case NCP_IOC_SETROOT:
849 return 0;
850 default:
851 /* unknown IOCTL command, assume write */
852 return 1;
853 }
854}
855
856int ncp_ioctl(struct inode *inode, struct file *filp,
857 unsigned int cmd, unsigned long arg)
858{
859 int ret;
860
861 if (ncp_ioctl_need_write(cmd)) {
862 /*
863 * inside the ioctl(), any failures which
864 * are because of file_permission() are
865 * -EACCES, so it seems consistent to keep
866 * that here.
867 */
868 if (mnt_want_write(filp->f_path.mnt))
869 return -EACCES;
870 }
871 ret = __ncp_ioctl(inode, filp, cmd, arg);
872 if (ncp_ioctl_need_write(cmd))
873 mnt_drop_write(filp->f_path.mnt);
874 return ret;
875}
876
825#ifdef CONFIG_COMPAT 877#ifdef CONFIG_COMPAT
826long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) 878long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
827{ 879{
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 6cea7479c5b4..d9e30ac2798d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -967,7 +967,8 @@ static int is_atomic_open(struct inode *dir, struct nameidata *nd)
967 if (nd->flags & LOOKUP_DIRECTORY) 967 if (nd->flags & LOOKUP_DIRECTORY)
968 return 0; 968 return 0;
969 /* Are we trying to write to a read only partition? */ 969 /* Are we trying to write to a read only partition? */
970 if (IS_RDONLY(dir) && (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE))) 970 if (__mnt_is_readonly(nd->path.mnt) &&
971 (nd->intent.open.flags & (O_CREAT|O_TRUNC|FMODE_WRITE)))
971 return 0; 972 return 0;
972 return 1; 973 return 1;
973} 974}
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index c593db047d8b..c309c881bd4e 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -658,14 +658,19 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
658 return status; 658 return status;
659 } 659 }
660 } 660 }
661 status = mnt_want_write(cstate->current_fh.fh_export->ex_path.mnt);
662 if (status)
663 return status;
661 status = nfs_ok; 664 status = nfs_ok;
662 if (setattr->sa_acl != NULL) 665 if (setattr->sa_acl != NULL)
663 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh, 666 status = nfsd4_set_nfs4_acl(rqstp, &cstate->current_fh,
664 setattr->sa_acl); 667 setattr->sa_acl);
665 if (status) 668 if (status)
666 return status; 669 goto out;
667 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr, 670 status = nfsd_setattr(rqstp, &cstate->current_fh, &setattr->sa_iattr,
668 0, (time_t)0); 671 0, (time_t)0);
672out:
673 mnt_drop_write(cstate->current_fh.fh_export->ex_path.mnt);
669 return status; 674 return status;
670} 675}
671 676
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 1ff90625860f..145b3c877a27 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -46,6 +46,7 @@
46#include <linux/scatterlist.h> 46#include <linux/scatterlist.h>
47#include <linux/crypto.h> 47#include <linux/crypto.h>
48#include <linux/sched.h> 48#include <linux/sched.h>
49#include <linux/mount.h>
49 50
50#define NFSDDBG_FACILITY NFSDDBG_PROC 51#define NFSDDBG_FACILITY NFSDDBG_PROC
51 52
@@ -154,7 +155,11 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
154 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n"); 155 dprintk("NFSD: nfsd4_create_clid_dir: DIRECTORY EXISTS\n");
155 goto out_put; 156 goto out_put;
156 } 157 }
158 status = mnt_want_write(rec_dir.path.mnt);
159 if (status)
160 goto out_put;
157 status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU); 161 status = vfs_mkdir(rec_dir.path.dentry->d_inode, dentry, S_IRWXU);
162 mnt_drop_write(rec_dir.path.mnt);
158out_put: 163out_put:
159 dput(dentry); 164 dput(dentry);
160out_unlock: 165out_unlock:
@@ -313,12 +318,17 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp)
313 if (!rec_dir_init || !clp->cl_firststate) 318 if (!rec_dir_init || !clp->cl_firststate)
314 return; 319 return;
315 320
321 status = mnt_want_write(rec_dir.path.mnt);
322 if (status)
323 goto out;
316 clp->cl_firststate = 0; 324 clp->cl_firststate = 0;
317 nfs4_save_user(&uid, &gid); 325 nfs4_save_user(&uid, &gid);
318 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1); 326 status = nfsd4_unlink_clid_dir(clp->cl_recdir, HEXDIR_LEN-1);
319 nfs4_reset_user(uid, gid); 327 nfs4_reset_user(uid, gid);
320 if (status == 0) 328 if (status == 0)
321 nfsd4_sync_rec_dir(); 329 nfsd4_sync_rec_dir();
330 mnt_drop_write(rec_dir.path.mnt);
331out:
322 if (status) 332 if (status)
323 printk("NFSD: Failed to remove expired client state directory" 333 printk("NFSD: Failed to remove expired client state directory"
324 " %.*s\n", HEXDIR_LEN, clp->cl_recdir); 334 " %.*s\n", HEXDIR_LEN, clp->cl_recdir);
@@ -347,13 +357,17 @@ nfsd4_recdir_purge_old(void) {
347 357
348 if (!rec_dir_init) 358 if (!rec_dir_init)
349 return; 359 return;
360 status = mnt_want_write(rec_dir.path.mnt);
361 if (status)
362 goto out;
350 status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old); 363 status = nfsd4_list_rec_dir(rec_dir.path.dentry, purge_old);
351 if (status == 0) 364 if (status == 0)
352 nfsd4_sync_rec_dir(); 365 nfsd4_sync_rec_dir();
366 mnt_drop_write(rec_dir.path.mnt);
367out:
353 if (status) 368 if (status)
354 printk("nfsd4: failed to purge old clients from recovery" 369 printk("nfsd4: failed to purge old clients from recovery"
355 " directory %s\n", rec_dir.path.dentry->d_name.name); 370 " directory %s\n", rec_dir.path.dentry->d_name.name);
356 return;
357} 371}
358 372
359static int 373static int
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bcb97d8e8b8b..81a75f3081f4 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -41,6 +41,7 @@
41#include <linux/sunrpc/svc.h> 41#include <linux/sunrpc/svc.h>
42#include <linux/nfsd/nfsd.h> 42#include <linux/nfsd/nfsd.h>
43#include <linux/nfsd/cache.h> 43#include <linux/nfsd/cache.h>
44#include <linux/file.h>
44#include <linux/mount.h> 45#include <linux/mount.h>
45#include <linux/workqueue.h> 46#include <linux/workqueue.h>
46#include <linux/smp_lock.h> 47#include <linux/smp_lock.h>
@@ -1239,7 +1240,7 @@ static inline void
1239nfs4_file_downgrade(struct file *filp, unsigned int share_access) 1240nfs4_file_downgrade(struct file *filp, unsigned int share_access)
1240{ 1241{
1241 if (share_access & NFS4_SHARE_ACCESS_WRITE) { 1242 if (share_access & NFS4_SHARE_ACCESS_WRITE) {
1242 put_write_access(filp->f_path.dentry->d_inode); 1243 drop_file_write_access(filp);
1243 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; 1244 filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE;
1244 } 1245 }
1245} 1246}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 46f59d5365a0..304bf5f643c9 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -1255,23 +1255,35 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1255 err = 0; 1255 err = 0;
1256 switch (type) { 1256 switch (type) {
1257 case S_IFREG: 1257 case S_IFREG:
1258 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1259 if (host_err)
1260 goto out_nfserr;
1258 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1261 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1259 break; 1262 break;
1260 case S_IFDIR: 1263 case S_IFDIR:
1264 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1265 if (host_err)
1266 goto out_nfserr;
1261 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1267 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
1262 break; 1268 break;
1263 case S_IFCHR: 1269 case S_IFCHR:
1264 case S_IFBLK: 1270 case S_IFBLK:
1265 case S_IFIFO: 1271 case S_IFIFO:
1266 case S_IFSOCK: 1272 case S_IFSOCK:
1273 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1274 if (host_err)
1275 goto out_nfserr;
1267 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev); 1276 host_err = vfs_mknod(dirp, dchild, iap->ia_mode, rdev);
1268 break; 1277 break;
1269 default: 1278 default:
1270 printk("nfsd: bad file type %o in nfsd_create\n", type); 1279 printk("nfsd: bad file type %o in nfsd_create\n", type);
1271 host_err = -EINVAL; 1280 host_err = -EINVAL;
1281 goto out_nfserr;
1272 } 1282 }
1273 if (host_err < 0) 1283 if (host_err < 0) {
1284 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1274 goto out_nfserr; 1285 goto out_nfserr;
1286 }
1275 1287
1276 if (EX_ISSYNC(fhp->fh_export)) { 1288 if (EX_ISSYNC(fhp->fh_export)) {
1277 err = nfserrno(nfsd_sync_dir(dentry)); 1289 err = nfserrno(nfsd_sync_dir(dentry));
@@ -1282,6 +1294,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1282 err2 = nfsd_create_setattr(rqstp, resfhp, iap); 1294 err2 = nfsd_create_setattr(rqstp, resfhp, iap);
1283 if (err2) 1295 if (err2)
1284 err = err2; 1296 err = err2;
1297 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1285 /* 1298 /*
1286 * Update the file handle to get the new inode info. 1299 * Update the file handle to get the new inode info.
1287 */ 1300 */
@@ -1359,6 +1372,9 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1359 v_atime = verifier[1]&0x7fffffff; 1372 v_atime = verifier[1]&0x7fffffff;
1360 } 1373 }
1361 1374
1375 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1376 if (host_err)
1377 goto out_nfserr;
1362 if (dchild->d_inode) { 1378 if (dchild->d_inode) {
1363 err = 0; 1379 err = 0;
1364 1380
@@ -1390,12 +1406,15 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1390 case NFS3_CREATE_GUARDED: 1406 case NFS3_CREATE_GUARDED:
1391 err = nfserr_exist; 1407 err = nfserr_exist;
1392 } 1408 }
1409 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1393 goto out; 1410 goto out;
1394 } 1411 }
1395 1412
1396 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1413 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1397 if (host_err < 0) 1414 if (host_err < 0) {
1415 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1398 goto out_nfserr; 1416 goto out_nfserr;
1417 }
1399 if (created) 1418 if (created)
1400 *created = 1; 1419 *created = 1;
1401 1420
@@ -1420,6 +1439,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1420 if (err2) 1439 if (err2)
1421 err = err2; 1440 err = err2;
1422 1441
1442 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1423 /* 1443 /*
1424 * Update the filehandle to get the new inode info. 1444 * Update the filehandle to get the new inode info.
1425 */ 1445 */
@@ -1522,6 +1542,10 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1522 if (iap && (iap->ia_valid & ATTR_MODE)) 1542 if (iap && (iap->ia_valid & ATTR_MODE))
1523 mode = iap->ia_mode & S_IALLUGO; 1543 mode = iap->ia_mode & S_IALLUGO;
1524 1544
1545 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1546 if (host_err)
1547 goto out_nfserr;
1548
1525 if (unlikely(path[plen] != 0)) { 1549 if (unlikely(path[plen] != 0)) {
1526 char *path_alloced = kmalloc(plen+1, GFP_KERNEL); 1550 char *path_alloced = kmalloc(plen+1, GFP_KERNEL);
1527 if (path_alloced == NULL) 1551 if (path_alloced == NULL)
@@ -1542,6 +1566,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
1542 err = nfserrno(host_err); 1566 err = nfserrno(host_err);
1543 fh_unlock(fhp); 1567 fh_unlock(fhp);
1544 1568
1569 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1570
1545 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp); 1571 cerr = fh_compose(resfhp, fhp->fh_export, dnew, fhp);
1546 dput(dnew); 1572 dput(dnew);
1547 if (err==0) err = cerr; 1573 if (err==0) err = cerr;
@@ -1592,6 +1618,11 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1592 dold = tfhp->fh_dentry; 1618 dold = tfhp->fh_dentry;
1593 dest = dold->d_inode; 1619 dest = dold->d_inode;
1594 1620
1621 host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt);
1622 if (host_err) {
1623 err = nfserrno(host_err);
1624 goto out_dput;
1625 }
1595 host_err = vfs_link(dold, dirp, dnew); 1626 host_err = vfs_link(dold, dirp, dnew);
1596 if (!host_err) { 1627 if (!host_err) {
1597 if (EX_ISSYNC(ffhp->fh_export)) { 1628 if (EX_ISSYNC(ffhp->fh_export)) {
@@ -1605,7 +1636,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
1605 else 1636 else
1606 err = nfserrno(host_err); 1637 err = nfserrno(host_err);
1607 } 1638 }
1608 1639 mnt_drop_write(tfhp->fh_export->ex_path.mnt);
1640out_dput:
1609 dput(dnew); 1641 dput(dnew);
1610out_unlock: 1642out_unlock:
1611 fh_unlock(ffhp); 1643 fh_unlock(ffhp);
@@ -1678,13 +1710,20 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1678 if (ndentry == trap) 1710 if (ndentry == trap)
1679 goto out_dput_new; 1711 goto out_dput_new;
1680 1712
1681#ifdef MSNFS 1713 if (svc_msnfs(ffhp) &&
1682 if ((ffhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
1683 ((atomic_read(&odentry->d_count) > 1) 1714 ((atomic_read(&odentry->d_count) > 1)
1684 || (atomic_read(&ndentry->d_count) > 1))) { 1715 || (atomic_read(&ndentry->d_count) > 1))) {
1685 host_err = -EPERM; 1716 host_err = -EPERM;
1686 } else 1717 goto out_dput_new;
1687#endif 1718 }
1719
1720 host_err = -EXDEV;
1721 if (ffhp->fh_export->ex_path.mnt != tfhp->fh_export->ex_path.mnt)
1722 goto out_dput_new;
1723 host_err = mnt_want_write(ffhp->fh_export->ex_path.mnt);
1724 if (host_err)
1725 goto out_dput_new;
1726
1688 host_err = vfs_rename(fdir, odentry, tdir, ndentry); 1727 host_err = vfs_rename(fdir, odentry, tdir, ndentry);
1689 if (!host_err && EX_ISSYNC(tfhp->fh_export)) { 1728 if (!host_err && EX_ISSYNC(tfhp->fh_export)) {
1690 host_err = nfsd_sync_dir(tdentry); 1729 host_err = nfsd_sync_dir(tdentry);
@@ -1692,6 +1731,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
1692 host_err = nfsd_sync_dir(fdentry); 1731 host_err = nfsd_sync_dir(fdentry);
1693 } 1732 }
1694 1733
1734 mnt_drop_write(ffhp->fh_export->ex_path.mnt);
1735
1695 out_dput_new: 1736 out_dput_new:
1696 dput(ndentry); 1737 dput(ndentry);
1697 out_dput_old: 1738 out_dput_old:
@@ -1750,6 +1791,10 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1750 if (!type) 1791 if (!type)
1751 type = rdentry->d_inode->i_mode & S_IFMT; 1792 type = rdentry->d_inode->i_mode & S_IFMT;
1752 1793
1794 host_err = mnt_want_write(fhp->fh_export->ex_path.mnt);
1795 if (host_err)
1796 goto out_nfserr;
1797
1753 if (type != S_IFDIR) { /* It's UNLINK */ 1798 if (type != S_IFDIR) { /* It's UNLINK */
1754#ifdef MSNFS 1799#ifdef MSNFS
1755 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 1800 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
@@ -1765,10 +1810,12 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
1765 dput(rdentry); 1810 dput(rdentry);
1766 1811
1767 if (host_err) 1812 if (host_err)
1768 goto out_nfserr; 1813 goto out_drop;
1769 if (EX_ISSYNC(fhp->fh_export)) 1814 if (EX_ISSYNC(fhp->fh_export))
1770 host_err = nfsd_sync_dir(dentry); 1815 host_err = nfsd_sync_dir(dentry);
1771 1816
1817out_drop:
1818 mnt_drop_write(fhp->fh_export->ex_path.mnt);
1772out_nfserr: 1819out_nfserr:
1773 err = nfserrno(host_err); 1820 err = nfserrno(host_err);
1774out: 1821out:
@@ -1865,7 +1912,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1865 inode->i_mode, 1912 inode->i_mode,
1866 IS_IMMUTABLE(inode)? " immut" : "", 1913 IS_IMMUTABLE(inode)? " immut" : "",
1867 IS_APPEND(inode)? " append" : "", 1914 IS_APPEND(inode)? " append" : "",
1868 IS_RDONLY(inode)? " ro" : ""); 1915 __mnt_is_readonly(exp->ex_path.mnt)? " ro" : "");
1869 dprintk(" owner %d/%d user %d/%d\n", 1916 dprintk(" owner %d/%d user %d/%d\n",
1870 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid); 1917 inode->i_uid, inode->i_gid, current->fsuid, current->fsgid);
1871#endif 1918#endif
@@ -1876,7 +1923,8 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
1876 */ 1923 */
1877 if (!(acc & MAY_LOCAL_ACCESS)) 1924 if (!(acc & MAY_LOCAL_ACCESS))
1878 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) { 1925 if (acc & (MAY_WRITE | MAY_SATTR | MAY_TRUNC)) {
1879 if (exp_rdonly(rqstp, exp) || IS_RDONLY(inode)) 1926 if (exp_rdonly(rqstp, exp) ||
1927 __mnt_is_readonly(exp->ex_path.mnt))
1880 return nfserr_rofs; 1928 return nfserr_rofs;
1881 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode)) 1929 if (/* (acc & MAY_WRITE) && */ IS_IMMUTABLE(inode))
1882 return nfserr_perm; 1930 return nfserr_perm;
@@ -2039,6 +2087,9 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2039 } else 2087 } else
2040 size = 0; 2088 size = 0;
2041 2089
2090 error = mnt_want_write(fhp->fh_export->ex_path.mnt);
2091 if (error)
2092 goto getout;
2042 if (size) 2093 if (size)
2043 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0); 2094 error = vfs_setxattr(fhp->fh_dentry, name, value, size, 0);
2044 else { 2095 else {
@@ -2050,6 +2101,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
2050 error = 0; 2101 error = 0;
2051 } 2102 }
2052 } 2103 }
2104 mnt_drop_write(fhp->fh_export->ex_path.mnt);
2053 2105
2054getout: 2106getout:
2055 kfree(value); 2107 kfree(value);
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 4d4ce48bb42c..f6956de56fdb 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -2,7 +2,12 @@ EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES 3EXTRA_CFLAGS += -DCATCH_BH_JBD_RACES
4 4
5obj-$(CONFIG_OCFS2_FS) += ocfs2.o 5obj-$(CONFIG_OCFS2_FS) += \
6 ocfs2.o \
7 ocfs2_stackglue.o
8
9obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_stack_o2cb.o
10obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
6 11
7ocfs2-objs := \ 12ocfs2-objs := \
8 alloc.o \ 13 alloc.o \
@@ -31,5 +36,10 @@ ocfs2-objs := \
31 uptodate.o \ 36 uptodate.o \
32 ver.o 37 ver.o
33 38
39ocfs2_stackglue-objs := stackglue.o
40ocfs2_stack_o2cb-objs := stack_o2cb.o
41ocfs2_stack_user-objs := stack_user.o
42
43# cluster/ is always needed when OCFS2_FS for masklog support
34obj-$(CONFIG_OCFS2_FS) += cluster/ 44obj-$(CONFIG_OCFS2_FS) += cluster/
35obj-$(CONFIG_OCFS2_FS) += dlm/ 45obj-$(CONFIG_OCFS2_FS_O2CB) += dlm/
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 447206eb5c2e..41f84c92094f 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -1029,8 +1029,7 @@ static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
1029 BUG_ON(!next_free); 1029 BUG_ON(!next_free);
1030 1030
1031 /* The tree code before us didn't allow enough room in the leaf. */ 1031 /* The tree code before us didn't allow enough room in the leaf. */
1032 if (el->l_next_free_rec == el->l_count && !has_empty) 1032 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty);
1033 BUG();
1034 1033
1035 /* 1034 /*
1036 * The easiest way to approach this is to just remove the 1035 * The easiest way to approach this is to just remove the
@@ -1450,6 +1449,8 @@ static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
1450 * - When our insert into the right path leaf is at the leftmost edge 1449 * - When our insert into the right path leaf is at the leftmost edge
1451 * and requires an update of the path immediately to its left. This 1450 * and requires an update of the path immediately to its left. This
1452 * can occur at the end of some types of rotation and appending inserts. 1451 * can occur at the end of some types of rotation and appending inserts.
1452 * - When we've adjusted the last extent record in the left path leaf and the
1453 * 1st extent record in the right path leaf during cross extent block merge.
1453 */ 1454 */
1454static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1455static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
1455 struct ocfs2_path *left_path, 1456 struct ocfs2_path *left_path,
@@ -2712,24 +2713,147 @@ static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el,
2712 } 2713 }
2713} 2714}
2714 2715
2716static int ocfs2_get_right_path(struct inode *inode,
2717 struct ocfs2_path *left_path,
2718 struct ocfs2_path **ret_right_path)
2719{
2720 int ret;
2721 u32 right_cpos;
2722 struct ocfs2_path *right_path = NULL;
2723 struct ocfs2_extent_list *left_el;
2724
2725 *ret_right_path = NULL;
2726
2727 /* This function shouldn't be called for non-trees. */
2728 BUG_ON(left_path->p_tree_depth == 0);
2729
2730 left_el = path_leaf_el(left_path);
2731 BUG_ON(left_el->l_next_free_rec != left_el->l_count);
2732
2733 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path,
2734 &right_cpos);
2735 if (ret) {
2736 mlog_errno(ret);
2737 goto out;
2738 }
2739
2740 /* This function shouldn't be called for the rightmost leaf. */
2741 BUG_ON(right_cpos == 0);
2742
2743 right_path = ocfs2_new_path(path_root_bh(left_path),
2744 path_root_el(left_path));
2745 if (!right_path) {
2746 ret = -ENOMEM;
2747 mlog_errno(ret);
2748 goto out;
2749 }
2750
2751 ret = ocfs2_find_path(inode, right_path, right_cpos);
2752 if (ret) {
2753 mlog_errno(ret);
2754 goto out;
2755 }
2756
2757 *ret_right_path = right_path;
2758out:
2759 if (ret)
2760 ocfs2_free_path(right_path);
2761 return ret;
2762}
2763
2715/* 2764/*
2716 * Remove split_rec clusters from the record at index and merge them 2765 * Remove split_rec clusters from the record at index and merge them
2717 * onto the beginning of the record at index + 1. 2766 * onto the beginning of the record "next" to it.
2767 * For index < l_count - 1, the next means the extent rec at index + 1.
2768 * For index == l_count - 1, the "next" means the 1st extent rec of the
2769 * next extent block.
2718 */ 2770 */
2719static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh, 2771static int ocfs2_merge_rec_right(struct inode *inode,
2720 handle_t *handle, 2772 struct ocfs2_path *left_path,
2721 struct ocfs2_extent_rec *split_rec, 2773 handle_t *handle,
2722 struct ocfs2_extent_list *el, int index) 2774 struct ocfs2_extent_rec *split_rec,
2775 int index)
2723{ 2776{
2724 int ret; 2777 int ret, next_free, i;
2725 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2778 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2726 struct ocfs2_extent_rec *left_rec; 2779 struct ocfs2_extent_rec *left_rec;
2727 struct ocfs2_extent_rec *right_rec; 2780 struct ocfs2_extent_rec *right_rec;
2781 struct ocfs2_extent_list *right_el;
2782 struct ocfs2_path *right_path = NULL;
2783 int subtree_index = 0;
2784 struct ocfs2_extent_list *el = path_leaf_el(left_path);
2785 struct buffer_head *bh = path_leaf_bh(left_path);
2786 struct buffer_head *root_bh = NULL;
2728 2787
2729 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 2788 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec));
2730
2731 left_rec = &el->l_recs[index]; 2789 left_rec = &el->l_recs[index];
2732 right_rec = &el->l_recs[index + 1]; 2790
2791 if (index == le16_to_cpu(el->l_next_free_rec - 1) &&
2792 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) {
2793 /* we meet with a cross extent block merge. */
2794 ret = ocfs2_get_right_path(inode, left_path, &right_path);
2795 if (ret) {
2796 mlog_errno(ret);
2797 goto out;
2798 }
2799
2800 right_el = path_leaf_el(right_path);
2801 next_free = le16_to_cpu(right_el->l_next_free_rec);
2802 BUG_ON(next_free <= 0);
2803 right_rec = &right_el->l_recs[0];
2804 if (ocfs2_is_empty_extent(right_rec)) {
2805 BUG_ON(le16_to_cpu(next_free) <= 1);
2806 right_rec = &right_el->l_recs[1];
2807 }
2808
2809 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2810 le16_to_cpu(left_rec->e_leaf_clusters) !=
2811 le32_to_cpu(right_rec->e_cpos));
2812
2813 subtree_index = ocfs2_find_subtree_root(inode,
2814 left_path, right_path);
2815
2816 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2817 handle->h_buffer_credits,
2818 right_path);
2819 if (ret) {
2820 mlog_errno(ret);
2821 goto out;
2822 }
2823
2824 root_bh = left_path->p_node[subtree_index].bh;
2825 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2826
2827 ret = ocfs2_journal_access(handle, inode, root_bh,
2828 OCFS2_JOURNAL_ACCESS_WRITE);
2829 if (ret) {
2830 mlog_errno(ret);
2831 goto out;
2832 }
2833
2834 for (i = subtree_index + 1;
2835 i < path_num_items(right_path); i++) {
2836 ret = ocfs2_journal_access(handle, inode,
2837 right_path->p_node[i].bh,
2838 OCFS2_JOURNAL_ACCESS_WRITE);
2839 if (ret) {
2840 mlog_errno(ret);
2841 goto out;
2842 }
2843
2844 ret = ocfs2_journal_access(handle, inode,
2845 left_path->p_node[i].bh,
2846 OCFS2_JOURNAL_ACCESS_WRITE);
2847 if (ret) {
2848 mlog_errno(ret);
2849 goto out;
2850 }
2851 }
2852
2853 } else {
2854 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1);
2855 right_rec = &el->l_recs[index + 1];
2856 }
2733 2857
2734 ret = ocfs2_journal_access(handle, inode, bh, 2858 ret = ocfs2_journal_access(handle, inode, bh,
2735 OCFS2_JOURNAL_ACCESS_WRITE); 2859 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2751,30 +2875,156 @@ static int ocfs2_merge_rec_right(struct inode *inode, struct buffer_head *bh,
2751 if (ret) 2875 if (ret)
2752 mlog_errno(ret); 2876 mlog_errno(ret);
2753 2877
2878 if (right_path) {
2879 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path));
2880 if (ret)
2881 mlog_errno(ret);
2882
2883 ocfs2_complete_edge_insert(inode, handle, left_path,
2884 right_path, subtree_index);
2885 }
2886out:
2887 if (right_path)
2888 ocfs2_free_path(right_path);
2889 return ret;
2890}
2891
2892static int ocfs2_get_left_path(struct inode *inode,
2893 struct ocfs2_path *right_path,
2894 struct ocfs2_path **ret_left_path)
2895{
2896 int ret;
2897 u32 left_cpos;
2898 struct ocfs2_path *left_path = NULL;
2899
2900 *ret_left_path = NULL;
2901
2902 /* This function shouldn't be called for non-trees. */
2903 BUG_ON(right_path->p_tree_depth == 0);
2904
2905 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
2906 right_path, &left_cpos);
2907 if (ret) {
2908 mlog_errno(ret);
2909 goto out;
2910 }
2911
2912 /* This function shouldn't be called for the leftmost leaf. */
2913 BUG_ON(left_cpos == 0);
2914
2915 left_path = ocfs2_new_path(path_root_bh(right_path),
2916 path_root_el(right_path));
2917 if (!left_path) {
2918 ret = -ENOMEM;
2919 mlog_errno(ret);
2920 goto out;
2921 }
2922
2923 ret = ocfs2_find_path(inode, left_path, left_cpos);
2924 if (ret) {
2925 mlog_errno(ret);
2926 goto out;
2927 }
2928
2929 *ret_left_path = left_path;
2754out: 2930out:
2931 if (ret)
2932 ocfs2_free_path(left_path);
2755 return ret; 2933 return ret;
2756} 2934}
2757 2935
2758/* 2936/*
2759 * Remove split_rec clusters from the record at index and merge them 2937 * Remove split_rec clusters from the record at index and merge them
2760 * onto the tail of the record at index - 1. 2938 * onto the tail of the record "before" it.
2939 * For index > 0, the "before" means the extent rec at index - 1.
2940 *
2941 * For index == 0, the "before" means the last record of the previous
2942 * extent block. And there is also a situation that we may need to
2943 * remove the rightmost leaf extent block in the right_path and change
2944 * the right path to indicate the new rightmost path.
2761 */ 2945 */
2762static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh, 2946static int ocfs2_merge_rec_left(struct inode *inode,
2947 struct ocfs2_path *right_path,
2763 handle_t *handle, 2948 handle_t *handle,
2764 struct ocfs2_extent_rec *split_rec, 2949 struct ocfs2_extent_rec *split_rec,
2765 struct ocfs2_extent_list *el, int index) 2950 struct ocfs2_cached_dealloc_ctxt *dealloc,
2951 int index)
2766{ 2952{
2767 int ret, has_empty_extent = 0; 2953 int ret, i, subtree_index = 0, has_empty_extent = 0;
2768 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2954 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters);
2769 struct ocfs2_extent_rec *left_rec; 2955 struct ocfs2_extent_rec *left_rec;
2770 struct ocfs2_extent_rec *right_rec; 2956 struct ocfs2_extent_rec *right_rec;
2957 struct ocfs2_extent_list *el = path_leaf_el(right_path);
2958 struct buffer_head *bh = path_leaf_bh(right_path);
2959 struct buffer_head *root_bh = NULL;
2960 struct ocfs2_path *left_path = NULL;
2961 struct ocfs2_extent_list *left_el;
2771 2962
2772 BUG_ON(index <= 0); 2963 BUG_ON(index < 0);
2773 2964
2774 left_rec = &el->l_recs[index - 1];
2775 right_rec = &el->l_recs[index]; 2965 right_rec = &el->l_recs[index];
2776 if (ocfs2_is_empty_extent(&el->l_recs[0])) 2966 if (index == 0) {
2777 has_empty_extent = 1; 2967 /* we meet with a cross extent block merge. */
2968 ret = ocfs2_get_left_path(inode, right_path, &left_path);
2969 if (ret) {
2970 mlog_errno(ret);
2971 goto out;
2972 }
2973
2974 left_el = path_leaf_el(left_path);
2975 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) !=
2976 le16_to_cpu(left_el->l_count));
2977
2978 left_rec = &left_el->l_recs[
2979 le16_to_cpu(left_el->l_next_free_rec) - 1];
2980 BUG_ON(le32_to_cpu(left_rec->e_cpos) +
2981 le16_to_cpu(left_rec->e_leaf_clusters) !=
2982 le32_to_cpu(split_rec->e_cpos));
2983
2984 subtree_index = ocfs2_find_subtree_root(inode,
2985 left_path, right_path);
2986
2987 ret = ocfs2_extend_rotate_transaction(handle, subtree_index,
2988 handle->h_buffer_credits,
2989 left_path);
2990 if (ret) {
2991 mlog_errno(ret);
2992 goto out;
2993 }
2994
2995 root_bh = left_path->p_node[subtree_index].bh;
2996 BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
2997
2998 ret = ocfs2_journal_access(handle, inode, root_bh,
2999 OCFS2_JOURNAL_ACCESS_WRITE);
3000 if (ret) {
3001 mlog_errno(ret);
3002 goto out;
3003 }
3004
3005 for (i = subtree_index + 1;
3006 i < path_num_items(right_path); i++) {
3007 ret = ocfs2_journal_access(handle, inode,
3008 right_path->p_node[i].bh,
3009 OCFS2_JOURNAL_ACCESS_WRITE);
3010 if (ret) {
3011 mlog_errno(ret);
3012 goto out;
3013 }
3014
3015 ret = ocfs2_journal_access(handle, inode,
3016 left_path->p_node[i].bh,
3017 OCFS2_JOURNAL_ACCESS_WRITE);
3018 if (ret) {
3019 mlog_errno(ret);
3020 goto out;
3021 }
3022 }
3023 } else {
3024 left_rec = &el->l_recs[index - 1];
3025 if (ocfs2_is_empty_extent(&el->l_recs[0]))
3026 has_empty_extent = 1;
3027 }
2778 3028
2779 ret = ocfs2_journal_access(handle, inode, bh, 3029 ret = ocfs2_journal_access(handle, inode, bh,
2780 OCFS2_JOURNAL_ACCESS_WRITE); 3030 OCFS2_JOURNAL_ACCESS_WRITE);
@@ -2790,9 +3040,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2790 *left_rec = *split_rec; 3040 *left_rec = *split_rec;
2791 3041
2792 has_empty_extent = 0; 3042 has_empty_extent = 0;
2793 } else { 3043 } else
2794 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3044 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters);
2795 }
2796 3045
2797 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3046 le32_add_cpu(&right_rec->e_cpos, split_clusters);
2798 le64_add_cpu(&right_rec->e_blkno, 3047 le64_add_cpu(&right_rec->e_blkno,
@@ -2805,13 +3054,44 @@ static int ocfs2_merge_rec_left(struct inode *inode, struct buffer_head *bh,
2805 if (ret) 3054 if (ret)
2806 mlog_errno(ret); 3055 mlog_errno(ret);
2807 3056
3057 if (left_path) {
3058 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path));
3059 if (ret)
3060 mlog_errno(ret);
3061
3062 /*
3063 * In the situation that the right_rec is empty and the extent
3064 * block is empty also, ocfs2_complete_edge_insert can't handle
3065 * it and we need to delete the right extent block.
3066 */
3067 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 &&
3068 le16_to_cpu(el->l_next_free_rec) == 1) {
3069
3070 ret = ocfs2_remove_rightmost_path(inode, handle,
3071 right_path, dealloc);
3072 if (ret) {
3073 mlog_errno(ret);
3074 goto out;
3075 }
3076
3077 /* Now the rightmost extent block has been deleted.
3078 * So we use the new rightmost path.
3079 */
3080 ocfs2_mv_path(right_path, left_path);
3081 left_path = NULL;
3082 } else
3083 ocfs2_complete_edge_insert(inode, handle, left_path,
3084 right_path, subtree_index);
3085 }
2808out: 3086out:
3087 if (left_path)
3088 ocfs2_free_path(left_path);
2809 return ret; 3089 return ret;
2810} 3090}
2811 3091
2812static int ocfs2_try_to_merge_extent(struct inode *inode, 3092static int ocfs2_try_to_merge_extent(struct inode *inode,
2813 handle_t *handle, 3093 handle_t *handle,
2814 struct ocfs2_path *left_path, 3094 struct ocfs2_path *path,
2815 int split_index, 3095 int split_index,
2816 struct ocfs2_extent_rec *split_rec, 3096 struct ocfs2_extent_rec *split_rec,
2817 struct ocfs2_cached_dealloc_ctxt *dealloc, 3097 struct ocfs2_cached_dealloc_ctxt *dealloc,
@@ -2819,7 +3099,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2819 3099
2820{ 3100{
2821 int ret = 0; 3101 int ret = 0;
2822 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3102 struct ocfs2_extent_list *el = path_leaf_el(path);
2823 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3103 struct ocfs2_extent_rec *rec = &el->l_recs[split_index];
2824 3104
2825 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3105 BUG_ON(ctxt->c_contig_type == CONTIG_NONE);
@@ -2832,7 +3112,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2832 * extents - having more than one in a leaf is 3112 * extents - having more than one in a leaf is
2833 * illegal. 3113 * illegal.
2834 */ 3114 */
2835 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3115 ret = ocfs2_rotate_tree_left(inode, handle, path,
2836 dealloc); 3116 dealloc);
2837 if (ret) { 3117 if (ret) {
2838 mlog_errno(ret); 3118 mlog_errno(ret);
@@ -2847,7 +3127,6 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2847 * Left-right contig implies this. 3127 * Left-right contig implies this.
2848 */ 3128 */
2849 BUG_ON(!ctxt->c_split_covers_rec); 3129 BUG_ON(!ctxt->c_split_covers_rec);
2850 BUG_ON(split_index == 0);
2851 3130
2852 /* 3131 /*
2853 * Since the leftright insert always covers the entire 3132 * Since the leftright insert always covers the entire
@@ -2858,9 +3137,14 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2858 * Since the adding of an empty extent shifts 3137 * Since the adding of an empty extent shifts
2859 * everything back to the right, there's no need to 3138 * everything back to the right, there's no need to
2860 * update split_index here. 3139 * update split_index here.
3140 *
3141 * When the split_index is zero, we need to merge it to the
3142 * previous extent block. It is more efficient and easier
3143 * if we do merge_right first and merge_left later.
2861 */ 3144 */
2862 ret = ocfs2_merge_rec_left(inode, path_leaf_bh(left_path), 3145 ret = ocfs2_merge_rec_right(inode, path,
2863 handle, split_rec, el, split_index); 3146 handle, split_rec,
3147 split_index);
2864 if (ret) { 3148 if (ret) {
2865 mlog_errno(ret); 3149 mlog_errno(ret);
2866 goto out; 3150 goto out;
@@ -2871,32 +3155,30 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2871 */ 3155 */
2872 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3156 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0]));
2873 3157
2874 /* 3158 /* The merge left us with an empty extent, remove it. */
2875 * The left merge left us with an empty extent, remove 3159 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc);
2876 * it.
2877 */
2878 ret = ocfs2_rotate_tree_left(inode, handle, left_path, dealloc);
2879 if (ret) { 3160 if (ret) {
2880 mlog_errno(ret); 3161 mlog_errno(ret);
2881 goto out; 3162 goto out;
2882 } 3163 }
2883 split_index--; 3164
2884 rec = &el->l_recs[split_index]; 3165 rec = &el->l_recs[split_index];
2885 3166
2886 /* 3167 /*
2887 * Note that we don't pass split_rec here on purpose - 3168 * Note that we don't pass split_rec here on purpose -
2888 * we've merged it into the left side. 3169 * we've merged it into the rec already.
2889 */ 3170 */
2890 ret = ocfs2_merge_rec_right(inode, path_leaf_bh(left_path), 3171 ret = ocfs2_merge_rec_left(inode, path,
2891 handle, rec, el, split_index); 3172 handle, rec,
3173 dealloc,
3174 split_index);
3175
2892 if (ret) { 3176 if (ret) {
2893 mlog_errno(ret); 3177 mlog_errno(ret);
2894 goto out; 3178 goto out;
2895 } 3179 }
2896 3180
2897 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3181 ret = ocfs2_rotate_tree_left(inode, handle, path,
2898
2899 ret = ocfs2_rotate_tree_left(inode, handle, left_path,
2900 dealloc); 3182 dealloc);
2901 /* 3183 /*
2902 * Error from this last rotate is not critical, so 3184 * Error from this last rotate is not critical, so
@@ -2915,8 +3197,9 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2915 */ 3197 */
2916 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3198 if (ctxt->c_contig_type == CONTIG_RIGHT) {
2917 ret = ocfs2_merge_rec_left(inode, 3199 ret = ocfs2_merge_rec_left(inode,
2918 path_leaf_bh(left_path), 3200 path,
2919 handle, split_rec, el, 3201 handle, split_rec,
3202 dealloc,
2920 split_index); 3203 split_index);
2921 if (ret) { 3204 if (ret) {
2922 mlog_errno(ret); 3205 mlog_errno(ret);
@@ -2924,8 +3207,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2924 } 3207 }
2925 } else { 3208 } else {
2926 ret = ocfs2_merge_rec_right(inode, 3209 ret = ocfs2_merge_rec_right(inode,
2927 path_leaf_bh(left_path), 3210 path,
2928 handle, split_rec, el, 3211 handle, split_rec,
2929 split_index); 3212 split_index);
2930 if (ret) { 3213 if (ret) {
2931 mlog_errno(ret); 3214 mlog_errno(ret);
@@ -2938,7 +3221,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode,
2938 * The merge may have left an empty extent in 3221 * The merge may have left an empty extent in
2939 * our leaf. Try to rotate it away. 3222 * our leaf. Try to rotate it away.
2940 */ 3223 */
2941 ret = ocfs2_rotate_tree_left(inode, handle, left_path, 3224 ret = ocfs2_rotate_tree_left(inode, handle, path,
2942 dealloc); 3225 dealloc);
2943 if (ret) 3226 if (ret)
2944 mlog_errno(ret); 3227 mlog_errno(ret);
@@ -3498,20 +3781,57 @@ out:
3498} 3781}
3499 3782
3500static enum ocfs2_contig_type 3783static enum ocfs2_contig_type
3501ocfs2_figure_merge_contig_type(struct inode *inode, 3784ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
3502 struct ocfs2_extent_list *el, int index, 3785 struct ocfs2_extent_list *el, int index,
3503 struct ocfs2_extent_rec *split_rec) 3786 struct ocfs2_extent_rec *split_rec)
3504{ 3787{
3505 struct ocfs2_extent_rec *rec; 3788 int status;
3506 enum ocfs2_contig_type ret = CONTIG_NONE; 3789 enum ocfs2_contig_type ret = CONTIG_NONE;
3790 u32 left_cpos, right_cpos;
3791 struct ocfs2_extent_rec *rec = NULL;
3792 struct ocfs2_extent_list *new_el;
3793 struct ocfs2_path *left_path = NULL, *right_path = NULL;
3794 struct buffer_head *bh;
3795 struct ocfs2_extent_block *eb;
3796
3797 if (index > 0) {
3798 rec = &el->l_recs[index - 1];
3799 } else if (path->p_tree_depth > 0) {
3800 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb,
3801 path, &left_cpos);
3802 if (status)
3803 goto out;
3804
3805 if (left_cpos != 0) {
3806 left_path = ocfs2_new_path(path_root_bh(path),
3807 path_root_el(path));
3808 if (!left_path)
3809 goto out;
3810
3811 status = ocfs2_find_path(inode, left_path, left_cpos);
3812 if (status)
3813 goto out;
3814
3815 new_el = path_leaf_el(left_path);
3816
3817 if (le16_to_cpu(new_el->l_next_free_rec) !=
3818 le16_to_cpu(new_el->l_count)) {
3819 bh = path_leaf_bh(left_path);
3820 eb = (struct ocfs2_extent_block *)bh->b_data;
3821 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3822 eb);
3823 goto out;
3824 }
3825 rec = &new_el->l_recs[
3826 le16_to_cpu(new_el->l_next_free_rec) - 1];
3827 }
3828 }
3507 3829
3508 /* 3830 /*
3509 * We're careful to check for an empty extent record here - 3831 * We're careful to check for an empty extent record here -
3510 * the merge code will know what to do if it sees one. 3832 * the merge code will know what to do if it sees one.
3511 */ 3833 */
3512 3834 if (rec) {
3513 if (index > 0) {
3514 rec = &el->l_recs[index - 1];
3515 if (index == 1 && ocfs2_is_empty_extent(rec)) { 3835 if (index == 1 && ocfs2_is_empty_extent(rec)) {
3516 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 3836 if (split_rec->e_cpos == el->l_recs[index].e_cpos)
3517 ret = CONTIG_RIGHT; 3837 ret = CONTIG_RIGHT;
@@ -3520,10 +3840,45 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3520 } 3840 }
3521 } 3841 }
3522 3842
3523 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) { 3843 rec = NULL;
3844 if (index < (le16_to_cpu(el->l_next_free_rec) - 1))
3845 rec = &el->l_recs[index + 1];
3846 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) &&
3847 path->p_tree_depth > 0) {
3848 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb,
3849 path, &right_cpos);
3850 if (status)
3851 goto out;
3852
3853 if (right_cpos == 0)
3854 goto out;
3855
3856 right_path = ocfs2_new_path(path_root_bh(path),
3857 path_root_el(path));
3858 if (!right_path)
3859 goto out;
3860
3861 status = ocfs2_find_path(inode, right_path, right_cpos);
3862 if (status)
3863 goto out;
3864
3865 new_el = path_leaf_el(right_path);
3866 rec = &new_el->l_recs[0];
3867 if (ocfs2_is_empty_extent(rec)) {
3868 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
3869 bh = path_leaf_bh(right_path);
3870 eb = (struct ocfs2_extent_block *)bh->b_data;
3871 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
3872 eb);
3873 goto out;
3874 }
3875 rec = &new_el->l_recs[1];
3876 }
3877 }
3878
3879 if (rec) {
3524 enum ocfs2_contig_type contig_type; 3880 enum ocfs2_contig_type contig_type;
3525 3881
3526 rec = &el->l_recs[index + 1];
3527 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 3882 contig_type = ocfs2_extent_contig(inode, rec, split_rec);
3528 3883
3529 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 3884 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT)
@@ -3532,6 +3887,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode,
3532 ret = contig_type; 3887 ret = contig_type;
3533 } 3888 }
3534 3889
3890out:
3891 if (left_path)
3892 ocfs2_free_path(left_path);
3893 if (right_path)
3894 ocfs2_free_path(right_path);
3895
3535 return ret; 3896 return ret;
3536} 3897}
3537 3898
@@ -3994,7 +4355,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
3994 goto out; 4355 goto out;
3995 } 4356 }
3996 4357
3997 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, el, 4358 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el,
3998 split_index, 4359 split_index,
3999 split_rec); 4360 split_rec);
4000 4361
@@ -4788,6 +5149,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work)
4788 status = ocfs2_flush_truncate_log(osb); 5149 status = ocfs2_flush_truncate_log(osb);
4789 if (status < 0) 5150 if (status < 0)
4790 mlog_errno(status); 5151 mlog_errno(status);
5152 else
5153 ocfs2_init_inode_steal_slot(osb);
4791 5154
4792 mlog_exit(status); 5155 mlog_exit(status);
4793} 5156}
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 90383ed61005..17964c0505a9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -467,11 +467,11 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
467 unsigned to) 467 unsigned to)
468{ 468{
469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 469 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
470 handle_t *handle = NULL; 470 handle_t *handle;
471 int ret = 0; 471 int ret = 0;
472 472
473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 473 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
474 if (!handle) { 474 if (IS_ERR(handle)) {
475 ret = -ENOMEM; 475 ret = -ENOMEM;
476 mlog_errno(ret); 476 mlog_errno(ret);
477 goto out; 477 goto out;
@@ -487,7 +487,7 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
487 } 487 }
488out: 488out:
489 if (ret) { 489 if (ret) {
490 if (handle) 490 if (!IS_ERR(handle))
491 ocfs2_commit_trans(osb, handle); 491 ocfs2_commit_trans(osb, handle);
492 handle = ERR_PTR(ret); 492 handle = ERR_PTR(ret);
493 } 493 }
diff --git a/fs/ocfs2/cluster/Makefile b/fs/ocfs2/cluster/Makefile
index cdd162f13650..bc8c5e7d8608 100644
--- a/fs/ocfs2/cluster/Makefile
+++ b/fs/ocfs2/cluster/Makefile
@@ -1,4 +1,4 @@
1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o 1obj-$(CONFIG_OCFS2_FS) += ocfs2_nodemanager.o
2 2
3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \ 3ocfs2_nodemanager-objs := heartbeat.o masklog.o sys.o nodemanager.o \
4 quorum.o tcp.o ver.o 4 quorum.o tcp.o netdebug.o ver.o
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
new file mode 100644
index 000000000000..7bf3c0ea7bd9
--- /dev/null
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -0,0 +1,441 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * netdebug.c
5 *
6 * debug functionality for o2net
7 *
8 * Copyright (C) 2005, 2008 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation; either
13 * version 2 of the License, or (at your option) any later version.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 *
20 * You should have received a copy of the GNU General Public
21 * License along with this program; if not, write to the
22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
23 * Boston, MA 02111-1307, USA.
24 *
25 */
26
27#ifdef CONFIG_DEBUG_FS
28
29#include <linux/module.h>
30#include <linux/types.h>
31#include <linux/slab.h>
32#include <linux/idr.h>
33#include <linux/kref.h>
34#include <linux/seq_file.h>
35#include <linux/debugfs.h>
36
37#include <linux/uaccess.h>
38
39#include "tcp.h"
40#include "nodemanager.h"
41#define MLOG_MASK_PREFIX ML_TCP
42#include "masklog.h"
43
44#include "tcp_internal.h"
45
46#define O2NET_DEBUG_DIR "o2net"
47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking"
49
50static struct dentry *o2net_dentry;
51static struct dentry *sc_dentry;
52static struct dentry *nst_dentry;
53
54static DEFINE_SPINLOCK(o2net_debug_lock);
55
56static LIST_HEAD(sock_containers);
57static LIST_HEAD(send_tracking);
58
59void o2net_debug_add_nst(struct o2net_send_tracking *nst)
60{
61 spin_lock(&o2net_debug_lock);
62 list_add(&nst->st_net_debug_item, &send_tracking);
63 spin_unlock(&o2net_debug_lock);
64}
65
66void o2net_debug_del_nst(struct o2net_send_tracking *nst)
67{
68 spin_lock(&o2net_debug_lock);
69 if (!list_empty(&nst->st_net_debug_item))
70 list_del_init(&nst->st_net_debug_item);
71 spin_unlock(&o2net_debug_lock);
72}
73
74static struct o2net_send_tracking
75 *next_nst(struct o2net_send_tracking *nst_start)
76{
77 struct o2net_send_tracking *nst, *ret = NULL;
78
79 assert_spin_locked(&o2net_debug_lock);
80
81 list_for_each_entry(nst, &nst_start->st_net_debug_item,
82 st_net_debug_item) {
83 /* discover the head of the list */
84 if (&nst->st_net_debug_item == &send_tracking)
85 break;
86
87 /* use st_task to detect real nsts in the list */
88 if (nst->st_task != NULL) {
89 ret = nst;
90 break;
91 }
92 }
93
94 return ret;
95}
96
97static void *nst_seq_start(struct seq_file *seq, loff_t *pos)
98{
99 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
100
101 spin_lock(&o2net_debug_lock);
102 nst = next_nst(dummy_nst);
103 spin_unlock(&o2net_debug_lock);
104
105 return nst;
106}
107
108static void *nst_seq_next(struct seq_file *seq, void *v, loff_t *pos)
109{
110 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
111
112 spin_lock(&o2net_debug_lock);
113 nst = next_nst(dummy_nst);
114 list_del_init(&dummy_nst->st_net_debug_item);
115 if (nst)
116 list_add(&dummy_nst->st_net_debug_item,
117 &nst->st_net_debug_item);
118 spin_unlock(&o2net_debug_lock);
119
120 return nst; /* unused, just needs to be null when done */
121}
122
123static int nst_seq_show(struct seq_file *seq, void *v)
124{
125 struct o2net_send_tracking *nst, *dummy_nst = seq->private;
126
127 spin_lock(&o2net_debug_lock);
128 nst = next_nst(dummy_nst);
129
130 if (nst != NULL) {
131 /* get_task_comm isn't exported. oh well. */
132 seq_printf(seq, "%p:\n"
133 " pid: %lu\n"
134 " tgid: %lu\n"
135 " process name: %s\n"
136 " node: %u\n"
137 " sc: %p\n"
138 " message id: %d\n"
139 " message type: %u\n"
140 " message key: 0x%08x\n"
141 " sock acquiry: %lu.%lu\n"
142 " send start: %lu.%lu\n"
143 " wait start: %lu.%lu\n",
144 nst, (unsigned long)nst->st_task->pid,
145 (unsigned long)nst->st_task->tgid,
146 nst->st_task->comm, nst->st_node,
147 nst->st_sc, nst->st_id, nst->st_msg_type,
148 nst->st_msg_key,
149 nst->st_sock_time.tv_sec, nst->st_sock_time.tv_usec,
150 nst->st_send_time.tv_sec, nst->st_send_time.tv_usec,
151 nst->st_status_time.tv_sec,
152 nst->st_status_time.tv_usec);
153 }
154
155 spin_unlock(&o2net_debug_lock);
156
157 return 0;
158}
159
160static void nst_seq_stop(struct seq_file *seq, void *v)
161{
162}
163
164static struct seq_operations nst_seq_ops = {
165 .start = nst_seq_start,
166 .next = nst_seq_next,
167 .stop = nst_seq_stop,
168 .show = nst_seq_show,
169};
170
171static int nst_fop_open(struct inode *inode, struct file *file)
172{
173 struct o2net_send_tracking *dummy_nst;
174 struct seq_file *seq;
175 int ret;
176
177 dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL);
178 if (dummy_nst == NULL) {
179 ret = -ENOMEM;
180 goto out;
181 }
182 dummy_nst->st_task = NULL;
183
184 ret = seq_open(file, &nst_seq_ops);
185 if (ret)
186 goto out;
187
188 seq = file->private_data;
189 seq->private = dummy_nst;
190 o2net_debug_add_nst(dummy_nst);
191
192 dummy_nst = NULL;
193
194out:
195 kfree(dummy_nst);
196 return ret;
197}
198
199static int nst_fop_release(struct inode *inode, struct file *file)
200{
201 struct seq_file *seq = file->private_data;
202 struct o2net_send_tracking *dummy_nst = seq->private;
203
204 o2net_debug_del_nst(dummy_nst);
205 return seq_release_private(inode, file);
206}
207
208static struct file_operations nst_seq_fops = {
209 .open = nst_fop_open,
210 .read = seq_read,
211 .llseek = seq_lseek,
212 .release = nst_fop_release,
213};
214
215void o2net_debug_add_sc(struct o2net_sock_container *sc)
216{
217 spin_lock(&o2net_debug_lock);
218 list_add(&sc->sc_net_debug_item, &sock_containers);
219 spin_unlock(&o2net_debug_lock);
220}
221
222void o2net_debug_del_sc(struct o2net_sock_container *sc)
223{
224 spin_lock(&o2net_debug_lock);
225 list_del_init(&sc->sc_net_debug_item);
226 spin_unlock(&o2net_debug_lock);
227}
228
229static struct o2net_sock_container
230 *next_sc(struct o2net_sock_container *sc_start)
231{
232 struct o2net_sock_container *sc, *ret = NULL;
233
234 assert_spin_locked(&o2net_debug_lock);
235
236 list_for_each_entry(sc, &sc_start->sc_net_debug_item,
237 sc_net_debug_item) {
238 /* discover the head of the list miscast as a sc */
239 if (&sc->sc_net_debug_item == &sock_containers)
240 break;
241
242 /* use sc_page to detect real scs in the list */
243 if (sc->sc_page != NULL) {
244 ret = sc;
245 break;
246 }
247 }
248
249 return ret;
250}
251
252static void *sc_seq_start(struct seq_file *seq, loff_t *pos)
253{
254 struct o2net_sock_container *sc, *dummy_sc = seq->private;
255
256 spin_lock(&o2net_debug_lock);
257 sc = next_sc(dummy_sc);
258 spin_unlock(&o2net_debug_lock);
259
260 return sc;
261}
262
263static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
264{
265 struct o2net_sock_container *sc, *dummy_sc = seq->private;
266
267 spin_lock(&o2net_debug_lock);
268 sc = next_sc(dummy_sc);
269 list_del_init(&dummy_sc->sc_net_debug_item);
270 if (sc)
271 list_add(&dummy_sc->sc_net_debug_item, &sc->sc_net_debug_item);
272 spin_unlock(&o2net_debug_lock);
273
274 return sc; /* unused, just needs to be null when done */
275}
276
277#define TV_SEC_USEC(TV) TV.tv_sec, TV.tv_usec
278
279static int sc_seq_show(struct seq_file *seq, void *v)
280{
281 struct o2net_sock_container *sc, *dummy_sc = seq->private;
282
283 spin_lock(&o2net_debug_lock);
284 sc = next_sc(dummy_sc);
285
286 if (sc != NULL) {
287 struct inet_sock *inet = NULL;
288
289 __be32 saddr = 0, daddr = 0;
290 __be16 sport = 0, dport = 0;
291
292 if (sc->sc_sock) {
293 inet = inet_sk(sc->sc_sock->sk);
294 /* the stack's structs aren't sparse endian clean */
295 saddr = (__force __be32)inet->saddr;
296 daddr = (__force __be32)inet->daddr;
297 sport = (__force __be16)inet->sport;
298 dport = (__force __be16)inet->dport;
299 }
300
301 /* XXX sigh, inet-> doesn't have sparse annotation so any
302 * use of it here generates a warning with -Wbitwise */
303 seq_printf(seq, "%p:\n"
304 " krefs: %d\n"
305 " sock: %u.%u.%u.%u:%u -> "
306 "%u.%u.%u.%u:%u\n"
307 " remote node: %s\n"
308 " page off: %zu\n"
309 " handshake ok: %u\n"
310 " timer: %lu.%lu\n"
311 " data ready: %lu.%lu\n"
312 " advance start: %lu.%lu\n"
313 " advance stop: %lu.%lu\n"
314 " func start: %lu.%lu\n"
315 " func stop: %lu.%lu\n"
316 " func key: %u\n"
317 " func type: %u\n",
318 sc,
319 atomic_read(&sc->sc_kref.refcount),
320 NIPQUAD(saddr), inet ? ntohs(sport) : 0,
321 NIPQUAD(daddr), inet ? ntohs(dport) : 0,
322 sc->sc_node->nd_name,
323 sc->sc_page_off,
324 sc->sc_handshake_ok,
325 TV_SEC_USEC(sc->sc_tv_timer),
326 TV_SEC_USEC(sc->sc_tv_data_ready),
327 TV_SEC_USEC(sc->sc_tv_advance_start),
328 TV_SEC_USEC(sc->sc_tv_advance_stop),
329 TV_SEC_USEC(sc->sc_tv_func_start),
330 TV_SEC_USEC(sc->sc_tv_func_stop),
331 sc->sc_msg_key,
332 sc->sc_msg_type);
333 }
334
335
336 spin_unlock(&o2net_debug_lock);
337
338 return 0;
339}
340
341static void sc_seq_stop(struct seq_file *seq, void *v)
342{
343}
344
345static struct seq_operations sc_seq_ops = {
346 .start = sc_seq_start,
347 .next = sc_seq_next,
348 .stop = sc_seq_stop,
349 .show = sc_seq_show,
350};
351
352static int sc_fop_open(struct inode *inode, struct file *file)
353{
354 struct o2net_sock_container *dummy_sc;
355 struct seq_file *seq;
356 int ret;
357
358 dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL);
359 if (dummy_sc == NULL) {
360 ret = -ENOMEM;
361 goto out;
362 }
363 dummy_sc->sc_page = NULL;
364
365 ret = seq_open(file, &sc_seq_ops);
366 if (ret)
367 goto out;
368
369 seq = file->private_data;
370 seq->private = dummy_sc;
371 o2net_debug_add_sc(dummy_sc);
372
373 dummy_sc = NULL;
374
375out:
376 kfree(dummy_sc);
377 return ret;
378}
379
380static int sc_fop_release(struct inode *inode, struct file *file)
381{
382 struct seq_file *seq = file->private_data;
383 struct o2net_sock_container *dummy_sc = seq->private;
384
385 o2net_debug_del_sc(dummy_sc);
386 return seq_release_private(inode, file);
387}
388
389static struct file_operations sc_seq_fops = {
390 .open = sc_fop_open,
391 .read = seq_read,
392 .llseek = seq_lseek,
393 .release = sc_fop_release,
394};
395
396int o2net_debugfs_init(void)
397{
398 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
399 if (!o2net_dentry) {
400 mlog_errno(-ENOMEM);
401 goto bail;
402 }
403
404 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR,
405 o2net_dentry, NULL,
406 &nst_seq_fops);
407 if (!nst_dentry) {
408 mlog_errno(-ENOMEM);
409 goto bail;
410 }
411
412 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR,
413 o2net_dentry, NULL,
414 &sc_seq_fops);
415 if (!sc_dentry) {
416 mlog_errno(-ENOMEM);
417 goto bail;
418 }
419
420 return 0;
421bail:
422 if (sc_dentry)
423 debugfs_remove(sc_dentry);
424 if (nst_dentry)
425 debugfs_remove(nst_dentry);
426 if (o2net_dentry)
427 debugfs_remove(o2net_dentry);
428 return -ENOMEM;
429}
430
431void o2net_debugfs_exit(void)
432{
433 if (sc_dentry)
434 debugfs_remove(sc_dentry);
435 if (nst_dentry)
436 debugfs_remove(nst_dentry);
437 if (o2net_dentry)
438 debugfs_remove(o2net_dentry);
439}
440
441#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 709fba25bf7e..cf9401e8cd0b 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -959,7 +959,10 @@ static int __init init_o2nm(void)
959 cluster_print_version(); 959 cluster_print_version();
960 960
961 o2hb_init(); 961 o2hb_init();
962 o2net_init(); 962
963 ret = o2net_init();
964 if (ret)
965 goto out;
963 966
964 ocfs2_table_header = register_sysctl_table(ocfs2_root_table); 967 ocfs2_table_header = register_sysctl_table(ocfs2_root_table);
965 if (!ocfs2_table_header) { 968 if (!ocfs2_table_header) {
diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c
index 0c095ce7723d..98429fd68499 100644
--- a/fs/ocfs2/cluster/sys.c
+++ b/fs/ocfs2/cluster/sys.c
@@ -57,6 +57,7 @@ static struct kset *o2cb_kset;
57void o2cb_sys_shutdown(void) 57void o2cb_sys_shutdown(void)
58{ 58{
59 mlog_sys_shutdown(); 59 mlog_sys_shutdown();
60 sysfs_remove_link(NULL, "o2cb");
60 kset_unregister(o2cb_kset); 61 kset_unregister(o2cb_kset);
61} 62}
62 63
@@ -68,6 +69,14 @@ int o2cb_sys_init(void)
68 if (!o2cb_kset) 69 if (!o2cb_kset)
69 return -ENOMEM; 70 return -ENOMEM;
70 71
72 /*
73 * Create this symlink for backwards compatibility with old
74 * versions of ocfs2-tools which look for things in /sys/o2cb.
75 */
76 ret = sysfs_create_link(NULL, &o2cb_kset->kobj, "o2cb");
77 if (ret)
78 goto error;
79
71 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group); 80 ret = sysfs_create_group(&o2cb_kset->kobj, &o2cb_attr_group);
72 if (ret) 81 if (ret)
73 goto error; 82 goto error;
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index b8057c51b205..1e44ad14881a 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -142,23 +142,65 @@ static void o2net_idle_timer(unsigned long data);
142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); 142static void o2net_sc_postpone_idle(struct o2net_sock_container *sc);
143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); 143static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc);
144 144
145/* 145static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype,
146 * FIXME: These should use to_o2nm_cluster_from_node(), but we end up 146 u32 msgkey, struct task_struct *task, u8 node)
147 * losing our parent link to the cluster during shutdown. This can be 147{
148 * solved by adding a pre-removal callback to configfs, or passing 148#ifdef CONFIG_DEBUG_FS
149 * around the cluster with the node. -jeffm 149 INIT_LIST_HEAD(&nst->st_net_debug_item);
150 */ 150 nst->st_task = task;
151static inline int o2net_reconnect_delay(struct o2nm_node *node) 151 nst->st_msg_type = msgtype;
152 nst->st_msg_key = msgkey;
153 nst->st_node = node;
154#endif
155}
156
157static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst)
158{
159#ifdef CONFIG_DEBUG_FS
160 do_gettimeofday(&nst->st_sock_time);
161#endif
162}
163
164static void o2net_set_nst_send_time(struct o2net_send_tracking *nst)
165{
166#ifdef CONFIG_DEBUG_FS
167 do_gettimeofday(&nst->st_send_time);
168#endif
169}
170
171static void o2net_set_nst_status_time(struct o2net_send_tracking *nst)
172{
173#ifdef CONFIG_DEBUG_FS
174 do_gettimeofday(&nst->st_status_time);
175#endif
176}
177
178static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst,
179 struct o2net_sock_container *sc)
180{
181#ifdef CONFIG_DEBUG_FS
182 nst->st_sc = sc;
183#endif
184}
185
186static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id)
187{
188#ifdef CONFIG_DEBUG_FS
189 nst->st_id = msg_id;
190#endif
191}
192
193static inline int o2net_reconnect_delay(void)
152{ 194{
153 return o2nm_single_cluster->cl_reconnect_delay_ms; 195 return o2nm_single_cluster->cl_reconnect_delay_ms;
154} 196}
155 197
156static inline int o2net_keepalive_delay(struct o2nm_node *node) 198static inline int o2net_keepalive_delay(void)
157{ 199{
158 return o2nm_single_cluster->cl_keepalive_delay_ms; 200 return o2nm_single_cluster->cl_keepalive_delay_ms;
159} 201}
160 202
161static inline int o2net_idle_timeout(struct o2nm_node *node) 203static inline int o2net_idle_timeout(void)
162{ 204{
163 return o2nm_single_cluster->cl_idle_timeout_ms; 205 return o2nm_single_cluster->cl_idle_timeout_ms;
164} 206}
@@ -296,6 +338,7 @@ static void sc_kref_release(struct kref *kref)
296 o2nm_node_put(sc->sc_node); 338 o2nm_node_put(sc->sc_node);
297 sc->sc_node = NULL; 339 sc->sc_node = NULL;
298 340
341 o2net_debug_del_sc(sc);
299 kfree(sc); 342 kfree(sc);
300} 343}
301 344
@@ -336,6 +379,7 @@ static struct o2net_sock_container *sc_alloc(struct o2nm_node *node)
336 379
337 ret = sc; 380 ret = sc;
338 sc->sc_page = page; 381 sc->sc_page = page;
382 o2net_debug_add_sc(sc);
339 sc = NULL; 383 sc = NULL;
340 page = NULL; 384 page = NULL;
341 385
@@ -399,8 +443,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
399 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid); 443 mlog_bug_on_msg(err && valid, "err %d valid %u\n", err, valid);
400 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc); 444 mlog_bug_on_msg(valid && !sc, "valid %u sc %p\n", valid, sc);
401 445
402 /* we won't reconnect after our valid conn goes away for
403 * this hb iteration.. here so it shows up in the logs */
404 if (was_valid && !valid && err == 0) 446 if (was_valid && !valid && err == 0)
405 err = -ENOTCONN; 447 err = -ENOTCONN;
406 448
@@ -430,11 +472,6 @@ static void o2net_set_nn_state(struct o2net_node *nn,
430 472
431 if (!was_valid && valid) { 473 if (!was_valid && valid) {
432 o2quo_conn_up(o2net_num_from_nn(nn)); 474 o2quo_conn_up(o2net_num_from_nn(nn));
433 /* this is a bit of a hack. we only try reconnecting
434 * when heartbeating starts until we get a connection.
435 * if that connection then dies we don't try reconnecting.
436 * the only way to start connecting again is to down
437 * heartbeat and bring it back up. */
438 cancel_delayed_work(&nn->nn_connect_expired); 475 cancel_delayed_work(&nn->nn_connect_expired);
439 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n", 476 printk(KERN_INFO "o2net: %s " SC_NODEF_FMT "\n",
440 o2nm_this_node() > sc->sc_node->nd_num ? 477 o2nm_this_node() > sc->sc_node->nd_num ?
@@ -451,12 +488,24 @@ static void o2net_set_nn_state(struct o2net_node *nn,
451 /* delay if we're withing a RECONNECT_DELAY of the 488 /* delay if we're withing a RECONNECT_DELAY of the
452 * last attempt */ 489 * last attempt */
453 delay = (nn->nn_last_connect_attempt + 490 delay = (nn->nn_last_connect_attempt +
454 msecs_to_jiffies(o2net_reconnect_delay(NULL))) 491 msecs_to_jiffies(o2net_reconnect_delay()))
455 - jiffies; 492 - jiffies;
456 if (delay > msecs_to_jiffies(o2net_reconnect_delay(NULL))) 493 if (delay > msecs_to_jiffies(o2net_reconnect_delay()))
457 delay = 0; 494 delay = 0;
458 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay); 495 mlog(ML_CONN, "queueing conn attempt in %lu jiffies\n", delay);
459 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay); 496 queue_delayed_work(o2net_wq, &nn->nn_connect_work, delay);
497
498 /*
499 * Delay the expired work after idle timeout.
500 *
501 * We might have lots of failed connection attempts that run
502 * through here but we only cancel the connect_expired work when
503 * a connection attempt succeeds. So only the first enqueue of
504 * the connect_expired work will do anything. The rest will see
505 * that it's already queued and do nothing.
506 */
507 delay += msecs_to_jiffies(o2net_idle_timeout());
508 queue_delayed_work(o2net_wq, &nn->nn_connect_expired, delay);
460 } 509 }
461 510
462 /* keep track of the nn's sc ref for the caller */ 511 /* keep track of the nn's sc ref for the caller */
@@ -914,6 +963,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
914 struct o2net_status_wait nsw = { 963 struct o2net_status_wait nsw = {
915 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item), 964 .ns_node_item = LIST_HEAD_INIT(nsw.ns_node_item),
916 }; 965 };
966 struct o2net_send_tracking nst;
967
968 o2net_init_nst(&nst, msg_type, key, current, target_node);
917 969
918 if (o2net_wq == NULL) { 970 if (o2net_wq == NULL) {
919 mlog(0, "attempt to tx without o2netd running\n"); 971 mlog(0, "attempt to tx without o2netd running\n");
@@ -939,6 +991,10 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
939 goto out; 991 goto out;
940 } 992 }
941 993
994 o2net_debug_add_nst(&nst);
995
996 o2net_set_nst_sock_time(&nst);
997
942 ret = wait_event_interruptible(nn->nn_sc_wq, 998 ret = wait_event_interruptible(nn->nn_sc_wq,
943 o2net_tx_can_proceed(nn, &sc, &error)); 999 o2net_tx_can_proceed(nn, &sc, &error));
944 if (!ret && error) 1000 if (!ret && error)
@@ -946,6 +1002,8 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
946 if (ret) 1002 if (ret)
947 goto out; 1003 goto out;
948 1004
1005 o2net_set_nst_sock_container(&nst, sc);
1006
949 veclen = caller_veclen + 1; 1007 veclen = caller_veclen + 1;
950 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC); 1008 vec = kmalloc(sizeof(struct kvec) * veclen, GFP_ATOMIC);
951 if (vec == NULL) { 1009 if (vec == NULL) {
@@ -972,6 +1030,9 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
972 goto out; 1030 goto out;
973 1031
974 msg->msg_num = cpu_to_be32(nsw.ns_id); 1032 msg->msg_num = cpu_to_be32(nsw.ns_id);
1033 o2net_set_nst_msg_id(&nst, nsw.ns_id);
1034
1035 o2net_set_nst_send_time(&nst);
975 1036
976 /* finally, convert the message header to network byte-order 1037 /* finally, convert the message header to network byte-order
977 * and send */ 1038 * and send */
@@ -986,6 +1047,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
986 } 1047 }
987 1048
988 /* wait on other node's handler */ 1049 /* wait on other node's handler */
1050 o2net_set_nst_status_time(&nst);
989 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw)); 1051 wait_event(nsw.ns_wq, o2net_nsw_completed(nn, &nsw));
990 1052
991 /* Note that we avoid overwriting the callers status return 1053 /* Note that we avoid overwriting the callers status return
@@ -998,6 +1060,7 @@ int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
998 mlog(0, "woken, returning system status %d, user status %d\n", 1060 mlog(0, "woken, returning system status %d, user status %d\n",
999 ret, nsw.ns_status); 1061 ret, nsw.ns_status);
1000out: 1062out:
1063 o2net_debug_del_nst(&nst); /* must be before dropping sc and node */
1001 if (sc) 1064 if (sc)
1002 sc_put(sc); 1065 sc_put(sc);
1003 if (vec) 1066 if (vec)
@@ -1154,23 +1217,23 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1154 * but isn't. This can ultimately cause corruption. 1217 * but isn't. This can ultimately cause corruption.
1155 */ 1218 */
1156 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1219 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1157 o2net_idle_timeout(sc->sc_node)) { 1220 o2net_idle_timeout()) {
1158 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1221 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of "
1159 "%u ms, but we use %u ms locally. disconnecting\n", 1222 "%u ms, but we use %u ms locally. disconnecting\n",
1160 SC_NODEF_ARGS(sc), 1223 SC_NODEF_ARGS(sc),
1161 be32_to_cpu(hand->o2net_idle_timeout_ms), 1224 be32_to_cpu(hand->o2net_idle_timeout_ms),
1162 o2net_idle_timeout(sc->sc_node)); 1225 o2net_idle_timeout());
1163 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1226 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1164 return -1; 1227 return -1;
1165 } 1228 }
1166 1229
1167 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1230 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1168 o2net_keepalive_delay(sc->sc_node)) { 1231 o2net_keepalive_delay()) {
1169 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1232 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of "
1170 "%u ms, but we use %u ms locally. disconnecting\n", 1233 "%u ms, but we use %u ms locally. disconnecting\n",
1171 SC_NODEF_ARGS(sc), 1234 SC_NODEF_ARGS(sc),
1172 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1235 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1173 o2net_keepalive_delay(sc->sc_node)); 1236 o2net_keepalive_delay());
1174 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1237 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1175 return -1; 1238 return -1;
1176 } 1239 }
@@ -1193,6 +1256,7 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1193 * shut down already */ 1256 * shut down already */
1194 if (nn->nn_sc == sc) { 1257 if (nn->nn_sc == sc) {
1195 o2net_sc_reset_idle_timer(sc); 1258 o2net_sc_reset_idle_timer(sc);
1259 atomic_set(&nn->nn_timeout, 0);
1196 o2net_set_nn_state(nn, sc, 1, 0); 1260 o2net_set_nn_state(nn, sc, 1, 0);
1197 } 1261 }
1198 spin_unlock(&nn->nn_lock); 1262 spin_unlock(&nn->nn_lock);
@@ -1347,12 +1411,11 @@ static void o2net_initialize_handshake(void)
1347{ 1411{
1348 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32( 1412 o2net_hand->o2hb_heartbeat_timeout_ms = cpu_to_be32(
1349 O2HB_MAX_WRITE_TIMEOUT_MS); 1413 O2HB_MAX_WRITE_TIMEOUT_MS);
1350 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32( 1414 o2net_hand->o2net_idle_timeout_ms = cpu_to_be32(o2net_idle_timeout());
1351 o2net_idle_timeout(NULL));
1352 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32( 1415 o2net_hand->o2net_keepalive_delay_ms = cpu_to_be32(
1353 o2net_keepalive_delay(NULL)); 1416 o2net_keepalive_delay());
1354 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32( 1417 o2net_hand->o2net_reconnect_delay_ms = cpu_to_be32(
1355 o2net_reconnect_delay(NULL)); 1418 o2net_reconnect_delay());
1356} 1419}
1357 1420
1358/* ------------------------------------------------------------ */ 1421/* ------------------------------------------------------------ */
@@ -1391,14 +1454,15 @@ static void o2net_sc_send_keep_req(struct work_struct *work)
1391static void o2net_idle_timer(unsigned long data) 1454static void o2net_idle_timer(unsigned long data)
1392{ 1455{
1393 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1456 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1457 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1394 struct timeval now; 1458 struct timeval now;
1395 1459
1396 do_gettimeofday(&now); 1460 do_gettimeofday(&now);
1397 1461
1398 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1462 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u "
1399 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1463 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc),
1400 o2net_idle_timeout(sc->sc_node) / 1000, 1464 o2net_idle_timeout() / 1000,
1401 o2net_idle_timeout(sc->sc_node) % 1000); 1465 o2net_idle_timeout() % 1000);
1402 mlog(ML_NOTICE, "here are some times that might help debug the " 1466 mlog(ML_NOTICE, "here are some times that might help debug the "
1403 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv " 1467 "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
1404 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n", 1468 "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
@@ -1413,6 +1477,12 @@ static void o2net_idle_timer(unsigned long data)
1413 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec, 1477 sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
1414 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec); 1478 sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
1415 1479
1480 /*
1481 * Initialize the nn_timeout so that the next connection attempt
1482 * will continue in o2net_start_connect.
1483 */
1484 atomic_set(&nn->nn_timeout, 1);
1485
1416 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 1486 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
1417} 1487}
1418 1488
@@ -1420,10 +1490,10 @@ static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc)
1420{ 1490{
1421 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); 1491 o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work);
1422 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work, 1492 o2net_sc_queue_delayed_work(sc, &sc->sc_keepalive_work,
1423 msecs_to_jiffies(o2net_keepalive_delay(sc->sc_node))); 1493 msecs_to_jiffies(o2net_keepalive_delay()));
1424 do_gettimeofday(&sc->sc_tv_timer); 1494 do_gettimeofday(&sc->sc_tv_timer);
1425 mod_timer(&sc->sc_idle_timeout, 1495 mod_timer(&sc->sc_idle_timeout,
1426 jiffies + msecs_to_jiffies(o2net_idle_timeout(sc->sc_node))); 1496 jiffies + msecs_to_jiffies(o2net_idle_timeout()));
1427} 1497}
1428 1498
1429static void o2net_sc_postpone_idle(struct o2net_sock_container *sc) 1499static void o2net_sc_postpone_idle(struct o2net_sock_container *sc)
@@ -1447,6 +1517,7 @@ static void o2net_start_connect(struct work_struct *work)
1447 struct socket *sock = NULL; 1517 struct socket *sock = NULL;
1448 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; 1518 struct sockaddr_in myaddr = {0, }, remoteaddr = {0, };
1449 int ret = 0, stop; 1519 int ret = 0, stop;
1520 unsigned int timeout;
1450 1521
1451 /* if we're greater we initiate tx, otherwise we accept */ 1522 /* if we're greater we initiate tx, otherwise we accept */
1452 if (o2nm_this_node() <= o2net_num_from_nn(nn)) 1523 if (o2nm_this_node() <= o2net_num_from_nn(nn))
@@ -1466,8 +1537,17 @@ static void o2net_start_connect(struct work_struct *work)
1466 } 1537 }
1467 1538
1468 spin_lock(&nn->nn_lock); 1539 spin_lock(&nn->nn_lock);
1469 /* see if we already have one pending or have given up */ 1540 /*
1470 stop = (nn->nn_sc || nn->nn_persistent_error); 1541 * see if we already have one pending or have given up.
1542 * For nn_timeout, it is set when we close the connection
1543 * because of the idle time out. So it means that we have
1544 * at least connected to that node successfully once,
1545 * now try to connect to it again.
1546 */
1547 timeout = atomic_read(&nn->nn_timeout);
1548 stop = (nn->nn_sc ||
1549 (nn->nn_persistent_error &&
1550 (nn->nn_persistent_error != -ENOTCONN || timeout == 0)));
1471 spin_unlock(&nn->nn_lock); 1551 spin_unlock(&nn->nn_lock);
1472 if (stop) 1552 if (stop)
1473 goto out; 1553 goto out;
@@ -1555,8 +1635,8 @@ static void o2net_connect_expired(struct work_struct *work)
1555 mlog(ML_ERROR, "no connection established with node %u after " 1635 mlog(ML_ERROR, "no connection established with node %u after "
1556 "%u.%u seconds, giving up and returning errors.\n", 1636 "%u.%u seconds, giving up and returning errors.\n",
1557 o2net_num_from_nn(nn), 1637 o2net_num_from_nn(nn),
1558 o2net_idle_timeout(NULL) / 1000, 1638 o2net_idle_timeout() / 1000,
1559 o2net_idle_timeout(NULL) % 1000); 1639 o2net_idle_timeout() % 1000);
1560 1640
1561 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1641 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1562 } 1642 }
@@ -1579,6 +1659,7 @@ void o2net_disconnect_node(struct o2nm_node *node)
1579 1659
1580 /* don't reconnect until it's heartbeating again */ 1660 /* don't reconnect until it's heartbeating again */
1581 spin_lock(&nn->nn_lock); 1661 spin_lock(&nn->nn_lock);
1662 atomic_set(&nn->nn_timeout, 0);
1582 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN); 1663 o2net_set_nn_state(nn, NULL, 0, -ENOTCONN);
1583 spin_unlock(&nn->nn_lock); 1664 spin_unlock(&nn->nn_lock);
1584 1665
@@ -1610,20 +1691,15 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1610 1691
1611 /* ensure an immediate connect attempt */ 1692 /* ensure an immediate connect attempt */
1612 nn->nn_last_connect_attempt = jiffies - 1693 nn->nn_last_connect_attempt = jiffies -
1613 (msecs_to_jiffies(o2net_reconnect_delay(node)) + 1); 1694 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);
1614 1695
1615 if (node_num != o2nm_this_node()) { 1696 if (node_num != o2nm_this_node()) {
1616 /* heartbeat doesn't work unless a local node number is
1617 * configured and doing so brings up the o2net_wq, so we can
1618 * use it.. */
1619 queue_delayed_work(o2net_wq, &nn->nn_connect_expired,
1620 msecs_to_jiffies(o2net_idle_timeout(node)));
1621
1622 /* believe it or not, accept and node hearbeating testing 1697 /* believe it or not, accept and node hearbeating testing
1623 * can succeed for this node before we got here.. so 1698 * can succeed for this node before we got here.. so
1624 * only use set_nn_state to clear the persistent error 1699 * only use set_nn_state to clear the persistent error
1625 * if that hasn't already happened */ 1700 * if that hasn't already happened */
1626 spin_lock(&nn->nn_lock); 1701 spin_lock(&nn->nn_lock);
1702 atomic_set(&nn->nn_timeout, 0);
1627 if (nn->nn_persistent_error) 1703 if (nn->nn_persistent_error)
1628 o2net_set_nn_state(nn, NULL, 0, 0); 1704 o2net_set_nn_state(nn, NULL, 0, 0);
1629 spin_unlock(&nn->nn_lock); 1705 spin_unlock(&nn->nn_lock);
@@ -1747,6 +1823,7 @@ static int o2net_accept_one(struct socket *sock)
1747 new_sock = NULL; 1823 new_sock = NULL;
1748 1824
1749 spin_lock(&nn->nn_lock); 1825 spin_lock(&nn->nn_lock);
1826 atomic_set(&nn->nn_timeout, 0);
1750 o2net_set_nn_state(nn, sc, 0, 0); 1827 o2net_set_nn_state(nn, sc, 0, 0);
1751 spin_unlock(&nn->nn_lock); 1828 spin_unlock(&nn->nn_lock);
1752 1829
@@ -1922,6 +1999,9 @@ int o2net_init(void)
1922 1999
1923 o2quo_init(); 2000 o2quo_init();
1924 2001
2002 if (o2net_debugfs_init())
2003 return -ENOMEM;
2004
1925 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); 2005 o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL);
1926 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2006 o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
1927 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); 2007 o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL);
@@ -1941,6 +2021,7 @@ int o2net_init(void)
1941 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) { 2021 for (i = 0; i < ARRAY_SIZE(o2net_nodes); i++) {
1942 struct o2net_node *nn = o2net_nn_from_num(i); 2022 struct o2net_node *nn = o2net_nn_from_num(i);
1943 2023
2024 atomic_set(&nn->nn_timeout, 0);
1944 spin_lock_init(&nn->nn_lock); 2025 spin_lock_init(&nn->nn_lock);
1945 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect); 2026 INIT_DELAYED_WORK(&nn->nn_connect_work, o2net_start_connect);
1946 INIT_DELAYED_WORK(&nn->nn_connect_expired, 2027 INIT_DELAYED_WORK(&nn->nn_connect_expired,
@@ -1962,4 +2043,5 @@ void o2net_exit(void)
1962 kfree(o2net_hand); 2043 kfree(o2net_hand);
1963 kfree(o2net_keep_req); 2044 kfree(o2net_keep_req);
1964 kfree(o2net_keep_resp); 2045 kfree(o2net_keep_resp);
2046 o2net_debugfs_exit();
1965} 2047}
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index f36f66aab3dd..a705d5d19036 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -117,4 +117,36 @@ int o2net_num_connected_peers(void);
117int o2net_init(void); 117int o2net_init(void);
118void o2net_exit(void); 118void o2net_exit(void);
119 119
120struct o2net_send_tracking;
121struct o2net_sock_container;
122
123#ifdef CONFIG_DEBUG_FS
124int o2net_debugfs_init(void);
125void o2net_debugfs_exit(void);
126void o2net_debug_add_nst(struct o2net_send_tracking *nst);
127void o2net_debug_del_nst(struct o2net_send_tracking *nst);
128void o2net_debug_add_sc(struct o2net_sock_container *sc);
129void o2net_debug_del_sc(struct o2net_sock_container *sc);
130#else
131static int o2net_debugfs_init(void)
132{
133 return 0;
134}
135static void o2net_debugfs_exit(void)
136{
137}
138static void o2net_debug_add_nst(struct o2net_send_tracking *nst)
139{
140}
141static void o2net_debug_del_nst(struct o2net_send_tracking *nst)
142{
143}
144static void o2net_debug_add_sc(struct o2net_sock_container *sc)
145{
146}
147static void o2net_debug_del_sc(struct o2net_sock_container *sc)
148{
149}
150#endif /* CONFIG_DEBUG_FS */
151
120#endif /* O2CLUSTER_TCP_H */ 152#endif /* O2CLUSTER_TCP_H */
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index d25b9af28500..8d58cfe410b1 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -95,6 +95,8 @@ struct o2net_node {
95 unsigned nn_sc_valid:1; 95 unsigned nn_sc_valid:1;
96 /* if this is set tx just returns it */ 96 /* if this is set tx just returns it */
97 int nn_persistent_error; 97 int nn_persistent_error;
98 /* It is only set to 1 after the idle time out. */
99 atomic_t nn_timeout;
98 100
99 /* threads waiting for an sc to arrive wait on the wq for generation 101 /* threads waiting for an sc to arrive wait on the wq for generation
100 * to increase. it is increased when a connecting socket succeeds 102 * to increase. it is increased when a connecting socket succeeds
@@ -164,7 +166,9 @@ struct o2net_sock_container {
164 /* original handlers for the sockets */ 166 /* original handlers for the sockets */
165 void (*sc_state_change)(struct sock *sk); 167 void (*sc_state_change)(struct sock *sk);
166 void (*sc_data_ready)(struct sock *sk, int bytes); 168 void (*sc_data_ready)(struct sock *sk, int bytes);
167 169#ifdef CONFIG_DEBUG_FS
170 struct list_head sc_net_debug_item;
171#endif
168 struct timeval sc_tv_timer; 172 struct timeval sc_tv_timer;
169 struct timeval sc_tv_data_ready; 173 struct timeval sc_tv_data_ready;
170 struct timeval sc_tv_advance_start; 174 struct timeval sc_tv_advance_start;
@@ -206,4 +210,24 @@ struct o2net_status_wait {
206 struct list_head ns_node_item; 210 struct list_head ns_node_item;
207}; 211};
208 212
213#ifdef CONFIG_DEBUG_FS
214/* just for state dumps */
215struct o2net_send_tracking {
216 struct list_head st_net_debug_item;
217 struct task_struct *st_task;
218 struct o2net_sock_container *st_sc;
219 u32 st_id;
220 u32 st_msg_type;
221 u32 st_msg_key;
222 u8 st_node;
223 struct timeval st_sock_time;
224 struct timeval st_send_time;
225 struct timeval st_status_time;
226};
227#else
228struct o2net_send_tracking {
229 u32 dummy;
230};
231#endif /* CONFIG_DEBUG_FS */
232
209#endif /* O2CLUSTER_TCP_INTERNAL_H */ 233#endif /* O2CLUSTER_TCP_INTERNAL_H */
diff --git a/fs/ocfs2/dlm/Makefile b/fs/ocfs2/dlm/Makefile
index ce3f7c29d270..190361375700 100644
--- a/fs/ocfs2/dlm/Makefile
+++ b/fs/ocfs2/dlm/Makefile
@@ -1,6 +1,6 @@
1EXTRA_CFLAGS += -Ifs/ocfs2 1EXTRA_CFLAGS += -Ifs/ocfs2
2 2
3obj-$(CONFIG_OCFS2_FS) += ocfs2_dlm.o ocfs2_dlmfs.o 3obj-$(CONFIG_OCFS2_FS_O2CB) += ocfs2_dlm.o ocfs2_dlmfs.o
4 4
5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \ 5ocfs2_dlm-objs := dlmdomain.o dlmdebug.o dlmthread.o dlmrecovery.o \
6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o 6 dlmmaster.o dlmast.o dlmconvert.o dlmlock.o dlmunlock.o dlmver.o
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index dc8ea666efdb..d5a86fb81a49 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -49,6 +49,41 @@
49/* Intended to make it easier for us to switch out hash functions */ 49/* Intended to make it easier for us to switch out hash functions */
50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) 50#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
51 51
52enum dlm_mle_type {
53 DLM_MLE_BLOCK,
54 DLM_MLE_MASTER,
55 DLM_MLE_MIGRATION
56};
57
58struct dlm_lock_name {
59 u8 len;
60 u8 name[DLM_LOCKID_NAME_MAX];
61};
62
63struct dlm_master_list_entry {
64 struct list_head list;
65 struct list_head hb_events;
66 struct dlm_ctxt *dlm;
67 spinlock_t spinlock;
68 wait_queue_head_t wq;
69 atomic_t woken;
70 struct kref mle_refs;
71 int inuse;
72 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
73 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
74 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
75 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
76 u8 master;
77 u8 new_master;
78 enum dlm_mle_type type;
79 struct o2hb_callback_func mle_hb_up;
80 struct o2hb_callback_func mle_hb_down;
81 union {
82 struct dlm_lock_resource *res;
83 struct dlm_lock_name name;
84 } u;
85};
86
52enum dlm_ast_type { 87enum dlm_ast_type {
53 DLM_AST = 0, 88 DLM_AST = 0,
54 DLM_BAST, 89 DLM_BAST,
@@ -101,6 +136,7 @@ struct dlm_ctxt
101 struct list_head purge_list; 136 struct list_head purge_list;
102 struct list_head pending_asts; 137 struct list_head pending_asts;
103 struct list_head pending_basts; 138 struct list_head pending_basts;
139 struct list_head tracking_list;
104 unsigned int purge_count; 140 unsigned int purge_count;
105 spinlock_t spinlock; 141 spinlock_t spinlock;
106 spinlock_t ast_lock; 142 spinlock_t ast_lock;
@@ -122,6 +158,9 @@ struct dlm_ctxt
122 atomic_t remote_resources; 158 atomic_t remote_resources;
123 atomic_t unknown_resources; 159 atomic_t unknown_resources;
124 160
161 struct dlm_debug_ctxt *dlm_debug_ctxt;
162 struct dentry *dlm_debugfs_subroot;
163
125 /* NOTE: Next three are protected by dlm_domain_lock */ 164 /* NOTE: Next three are protected by dlm_domain_lock */
126 struct kref dlm_refs; 165 struct kref dlm_refs;
127 enum dlm_ctxt_state dlm_state; 166 enum dlm_ctxt_state dlm_state;
@@ -270,6 +309,9 @@ struct dlm_lock_resource
270 struct list_head dirty; 309 struct list_head dirty;
271 struct list_head recovering; // dlm_recovery_ctxt.resources list 310 struct list_head recovering; // dlm_recovery_ctxt.resources list
272 311
312 /* Added during init and removed during release */
313 struct list_head tracking; /* dlm->tracking_list */
314
273 /* unused lock resources have their last_used stamped and are 315 /* unused lock resources have their last_used stamped and are
274 * put on a list for the dlm thread to run. */ 316 * put on a list for the dlm thread to run. */
275 unsigned long last_used; 317 unsigned long last_used;
@@ -963,9 +1005,16 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
963 DLM_LOCK_RES_MIGRATING)); 1005 DLM_LOCK_RES_MIGRATING));
964} 1006}
965 1007
1008/* create/destroy slab caches */
1009int dlm_init_master_caches(void);
1010void dlm_destroy_master_caches(void);
1011
1012int dlm_init_lock_cache(void);
1013void dlm_destroy_lock_cache(void);
966 1014
967int dlm_init_mle_cache(void); 1015int dlm_init_mle_cache(void);
968void dlm_destroy_mle_cache(void); 1016void dlm_destroy_mle_cache(void);
1017
969void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up); 1018void dlm_hb_event_notify_attached(struct dlm_ctxt *dlm, int idx, int node_up);
970int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, 1019int dlm_drop_lockres_ref(struct dlm_ctxt *dlm,
971 struct dlm_lock_resource *res); 1020 struct dlm_lock_resource *res);
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 64239b37e5d4..5f6d858770a2 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * debug functionality for the dlm 6 * debug functionality for the dlm
7 * 7 *
8 * Copyright (C) 2004 Oracle. All rights reserved. 8 * Copyright (C) 2004, 2008 Oracle. All rights reserved.
9 * 9 *
10 * This program is free software; you can redistribute it and/or 10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public 11 * modify it under the terms of the GNU General Public
@@ -30,6 +30,7 @@
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/sysctl.h> 31#include <linux/sysctl.h>
32#include <linux/spinlock.h> 32#include <linux/spinlock.h>
33#include <linux/debugfs.h>
33 34
34#include "cluster/heartbeat.h" 35#include "cluster/heartbeat.h"
35#include "cluster/nodemanager.h" 36#include "cluster/nodemanager.h"
@@ -37,17 +38,16 @@
37 38
38#include "dlmapi.h" 39#include "dlmapi.h"
39#include "dlmcommon.h" 40#include "dlmcommon.h"
40
41#include "dlmdomain.h" 41#include "dlmdomain.h"
42#include "dlmdebug.h"
42 43
43#define MLOG_MASK_PREFIX ML_DLM 44#define MLOG_MASK_PREFIX ML_DLM
44#include "cluster/masklog.h" 45#include "cluster/masklog.h"
45 46
47int stringify_lockname(const char *lockname, int locklen, char *buf, int len);
48
46void dlm_print_one_lock_resource(struct dlm_lock_resource *res) 49void dlm_print_one_lock_resource(struct dlm_lock_resource *res)
47{ 50{
48 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n",
49 res->lockname.len, res->lockname.name,
50 res->owner, res->state);
51 spin_lock(&res->spinlock); 51 spin_lock(&res->spinlock);
52 __dlm_print_one_lock_resource(res); 52 __dlm_print_one_lock_resource(res);
53 spin_unlock(&res->spinlock); 53 spin_unlock(&res->spinlock);
@@ -58,7 +58,7 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
58 int bit; 58 int bit;
59 assert_spin_locked(&res->spinlock); 59 assert_spin_locked(&res->spinlock);
60 60
61 mlog(ML_NOTICE, " refmap nodes: [ "); 61 printk(" refmap nodes: [ ");
62 bit = 0; 62 bit = 0;
63 while (1) { 63 while (1) {
64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); 64 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit);
@@ -70,63 +70,66 @@ static void dlm_print_lockres_refmap(struct dlm_lock_resource *res)
70 printk("], inflight=%u\n", res->inflight_locks); 70 printk("], inflight=%u\n", res->inflight_locks);
71} 71}
72 72
73static void __dlm_print_lock(struct dlm_lock *lock)
74{
75 spin_lock(&lock->spinlock);
76
77 printk(" type=%d, conv=%d, node=%u, cookie=%u:%llu, "
78 "ref=%u, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c), "
79 "pending=(conv=%c,lock=%c,cancel=%c,unlock=%c)\n",
80 lock->ml.type, lock->ml.convert_type, lock->ml.node,
81 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
82 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
83 atomic_read(&lock->lock_refs.refcount),
84 (list_empty(&lock->ast_list) ? 'y' : 'n'),
85 (lock->ast_pending ? 'y' : 'n'),
86 (list_empty(&lock->bast_list) ? 'y' : 'n'),
87 (lock->bast_pending ? 'y' : 'n'),
88 (lock->convert_pending ? 'y' : 'n'),
89 (lock->lock_pending ? 'y' : 'n'),
90 (lock->cancel_pending ? 'y' : 'n'),
91 (lock->unlock_pending ? 'y' : 'n'));
92
93 spin_unlock(&lock->spinlock);
94}
95
73void __dlm_print_one_lock_resource(struct dlm_lock_resource *res) 96void __dlm_print_one_lock_resource(struct dlm_lock_resource *res)
74{ 97{
75 struct list_head *iter2; 98 struct list_head *iter2;
76 struct dlm_lock *lock; 99 struct dlm_lock *lock;
100 char buf[DLM_LOCKID_NAME_MAX];
77 101
78 assert_spin_locked(&res->spinlock); 102 assert_spin_locked(&res->spinlock);
79 103
80 mlog(ML_NOTICE, "lockres: %.*s, owner=%u, state=%u\n", 104 stringify_lockname(res->lockname.name, res->lockname.len,
81 res->lockname.len, res->lockname.name, 105 buf, sizeof(buf) - 1);
82 res->owner, res->state); 106 printk("lockres: %s, owner=%u, state=%u\n",
83 mlog(ML_NOTICE, " last used: %lu, on purge list: %s\n", 107 buf, res->owner, res->state);
84 res->last_used, list_empty(&res->purge) ? "no" : "yes"); 108 printk(" last used: %lu, refcnt: %u, on purge list: %s\n",
109 res->last_used, atomic_read(&res->refs.refcount),
110 list_empty(&res->purge) ? "no" : "yes");
111 printk(" on dirty list: %s, on reco list: %s, "
112 "migrating pending: %s\n",
113 list_empty(&res->dirty) ? "no" : "yes",
114 list_empty(&res->recovering) ? "no" : "yes",
115 res->migration_pending ? "yes" : "no");
116 printk(" inflight locks: %d, asts reserved: %d\n",
117 res->inflight_locks, atomic_read(&res->asts_reserved));
85 dlm_print_lockres_refmap(res); 118 dlm_print_lockres_refmap(res);
86 mlog(ML_NOTICE, " granted queue: \n"); 119 printk(" granted queue:\n");
87 list_for_each(iter2, &res->granted) { 120 list_for_each(iter2, &res->granted) {
88 lock = list_entry(iter2, struct dlm_lock, list); 121 lock = list_entry(iter2, struct dlm_lock, list);
89 spin_lock(&lock->spinlock); 122 __dlm_print_lock(lock);
90 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
91 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
92 lock->ml.type, lock->ml.convert_type, lock->ml.node,
93 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
94 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
95 list_empty(&lock->ast_list) ? 'y' : 'n',
96 lock->ast_pending ? 'y' : 'n',
97 list_empty(&lock->bast_list) ? 'y' : 'n',
98 lock->bast_pending ? 'y' : 'n');
99 spin_unlock(&lock->spinlock);
100 } 123 }
101 mlog(ML_NOTICE, " converting queue: \n"); 124 printk(" converting queue:\n");
102 list_for_each(iter2, &res->converting) { 125 list_for_each(iter2, &res->converting) {
103 lock = list_entry(iter2, struct dlm_lock, list); 126 lock = list_entry(iter2, struct dlm_lock, list);
104 spin_lock(&lock->spinlock); 127 __dlm_print_lock(lock);
105 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
106 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
107 lock->ml.type, lock->ml.convert_type, lock->ml.node,
108 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
109 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
110 list_empty(&lock->ast_list) ? 'y' : 'n',
111 lock->ast_pending ? 'y' : 'n',
112 list_empty(&lock->bast_list) ? 'y' : 'n',
113 lock->bast_pending ? 'y' : 'n');
114 spin_unlock(&lock->spinlock);
115 } 128 }
116 mlog(ML_NOTICE, " blocked queue: \n"); 129 printk(" blocked queue:\n");
117 list_for_each(iter2, &res->blocked) { 130 list_for_each(iter2, &res->blocked) {
118 lock = list_entry(iter2, struct dlm_lock, list); 131 lock = list_entry(iter2, struct dlm_lock, list);
119 spin_lock(&lock->spinlock); 132 __dlm_print_lock(lock);
120 mlog(ML_NOTICE, " type=%d, conv=%d, node=%u, "
121 "cookie=%u:%llu, ast=(empty=%c,pend=%c), bast=(empty=%c,pend=%c)\n",
122 lock->ml.type, lock->ml.convert_type, lock->ml.node,
123 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
124 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
125 list_empty(&lock->ast_list) ? 'y' : 'n',
126 lock->ast_pending ? 'y' : 'n',
127 list_empty(&lock->bast_list) ? 'y' : 'n',
128 lock->bast_pending ? 'y' : 'n');
129 spin_unlock(&lock->spinlock);
130 } 133 }
131} 134}
132 135
@@ -136,31 +139,6 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
136} 139}
137EXPORT_SYMBOL_GPL(dlm_print_one_lock); 140EXPORT_SYMBOL_GPL(dlm_print_one_lock);
138 141
139#if 0
140void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
141{
142 struct dlm_lock_resource *res;
143 struct hlist_node *iter;
144 struct hlist_head *bucket;
145 int i;
146
147 mlog(ML_NOTICE, "struct dlm_ctxt: %s, node=%u, key=%u\n",
148 dlm->name, dlm->node_num, dlm->key);
149 if (!dlm || !dlm->name) {
150 mlog(ML_ERROR, "dlm=%p\n", dlm);
151 return;
152 }
153
154 spin_lock(&dlm->spinlock);
155 for (i=0; i<DLM_HASH_BUCKETS; i++) {
156 bucket = dlm_lockres_hash(dlm, i);
157 hlist_for_each_entry(res, iter, bucket, hash_node)
158 dlm_print_one_lock_resource(res);
159 }
160 spin_unlock(&dlm->spinlock);
161}
162#endif /* 0 */
163
164static const char *dlm_errnames[] = { 142static const char *dlm_errnames[] = {
165 [DLM_NORMAL] = "DLM_NORMAL", 143 [DLM_NORMAL] = "DLM_NORMAL",
166 [DLM_GRANTED] = "DLM_GRANTED", 144 [DLM_GRANTED] = "DLM_GRANTED",
@@ -266,3 +244,792 @@ const char *dlm_errname(enum dlm_status err)
266 return dlm_errnames[err]; 244 return dlm_errnames[err];
267} 245}
268EXPORT_SYMBOL_GPL(dlm_errname); 246EXPORT_SYMBOL_GPL(dlm_errname);
247
248/* NOTE: This function converts a lockname into a string. It uses knowledge
249 * of the format of the lockname that should be outside the purview of the dlm.
250 * We are adding only to make dlm debugging slightly easier.
251 *
252 * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h.
253 */
254int stringify_lockname(const char *lockname, int locklen, char *buf, int len)
255{
256 int out = 0;
257 __be64 inode_blkno_be;
258
259#define OCFS2_DENTRY_LOCK_INO_START 18
260 if (*lockname == 'N') {
261 memcpy((__be64 *)&inode_blkno_be,
262 (char *)&lockname[OCFS2_DENTRY_LOCK_INO_START],
263 sizeof(__be64));
264 out += snprintf(buf + out, len - out, "%.*s%08x",
265 OCFS2_DENTRY_LOCK_INO_START - 1, lockname,
266 (unsigned int)be64_to_cpu(inode_blkno_be));
267 } else
268 out += snprintf(buf + out, len - out, "%.*s",
269 locklen, lockname);
270 return out;
271}
272
/*
 * Append the index of every set bit in nodemap (searching the first
 * maxnodes bits) to buf as "<n> ", each followed by a single space.
 *
 * Returns the number of characters actually stored, excluding the NUL.
 * scnprintf() keeps the running offset inside the buffer even when the
 * output is truncated.
 */
static int stringify_nodemap(unsigned long *nodemap, int maxnodes,
			     char *buf, int len)
{
	int out = 0;
	int i = -1;

	while ((i = find_next_bit(nodemap, maxnodes, i + 1)) < maxnodes)
		out += scnprintf(buf + out, len - out, "%d ", i);

	return out;
}
284
285static int dump_mle(struct dlm_master_list_entry *mle, char *buf, int len)
286{
287 int out = 0;
288 unsigned int namelen;
289 const char *name;
290 char *mle_type;
291
292 if (mle->type != DLM_MLE_MASTER) {
293 namelen = mle->u.name.len;
294 name = mle->u.name.name;
295 } else {
296 namelen = mle->u.res->lockname.len;
297 name = mle->u.res->lockname.name;
298 }
299
300 if (mle->type == DLM_MLE_BLOCK)
301 mle_type = "BLK";
302 else if (mle->type == DLM_MLE_MASTER)
303 mle_type = "MAS";
304 else
305 mle_type = "MIG";
306
307 out += stringify_lockname(name, namelen, buf + out, len - out);
308 out += snprintf(buf + out, len - out,
309 "\t%3s\tmas=%3u\tnew=%3u\tevt=%1d\tuse=%1d\tref=%3d\n",
310 mle_type, mle->master, mle->new_master,
311 !list_empty(&mle->hb_events),
312 !!mle->inuse,
313 atomic_read(&mle->mle_refs.refcount));
314
315 out += snprintf(buf + out, len - out, "Maybe=");
316 out += stringify_nodemap(mle->maybe_map, O2NM_MAX_NODES,
317 buf + out, len - out);
318 out += snprintf(buf + out, len - out, "\n");
319
320 out += snprintf(buf + out, len - out, "Vote=");
321 out += stringify_nodemap(mle->vote_map, O2NM_MAX_NODES,
322 buf + out, len - out);
323 out += snprintf(buf + out, len - out, "\n");
324
325 out += snprintf(buf + out, len - out, "Response=");
326 out += stringify_nodemap(mle->response_map, O2NM_MAX_NODES,
327 buf + out, len - out);
328 out += snprintf(buf + out, len - out, "\n");
329
330 out += snprintf(buf + out, len - out, "Node=");
331 out += stringify_nodemap(mle->node_map, O2NM_MAX_NODES,
332 buf + out, len - out);
333 out += snprintf(buf + out, len - out, "\n");
334
335 out += snprintf(buf + out, len - out, "\n");
336
337 return out;
338}
339
340void dlm_print_one_mle(struct dlm_master_list_entry *mle)
341{
342 char *buf;
343
344 buf = (char *) get_zeroed_page(GFP_NOFS);
345 if (buf) {
346 dump_mle(mle, buf, PAGE_SIZE - 1);
347 free_page((unsigned long)buf);
348 }
349}
350
351#ifdef CONFIG_DEBUG_FS
352
353static struct dentry *dlm_debugfs_root = NULL;
354
355#define DLM_DEBUGFS_DIR "o2dlm"
356#define DLM_DEBUGFS_DLM_STATE "dlm_state"
357#define DLM_DEBUGFS_LOCKING_STATE "locking_state"
358#define DLM_DEBUGFS_MLE_STATE "mle_state"
359#define DLM_DEBUGFS_PURGE_LIST "purge_list"
360
361/* begin - util funcs */
362static void dlm_debug_free(struct kref *kref)
363{
364 struct dlm_debug_ctxt *dc;
365
366 dc = container_of(kref, struct dlm_debug_ctxt, debug_refcnt);
367
368 kfree(dc);
369}
370
371void dlm_debug_put(struct dlm_debug_ctxt *dc)
372{
373 if (dc)
374 kref_put(&dc->debug_refcnt, dlm_debug_free);
375}
376
377static void dlm_debug_get(struct dlm_debug_ctxt *dc)
378{
379 kref_get(&dc->debug_refcnt);
380}
381
382static struct debug_buffer *debug_buffer_allocate(void)
383{
384 struct debug_buffer *db = NULL;
385
386 db = kzalloc(sizeof(struct debug_buffer), GFP_KERNEL);
387 if (!db)
388 goto bail;
389
390 db->len = PAGE_SIZE;
391 db->buf = kmalloc(db->len, GFP_KERNEL);
392 if (!db->buf)
393 goto bail;
394
395 return db;
396bail:
397 kfree(db);
398 return NULL;
399}
400
401static ssize_t debug_buffer_read(struct file *file, char __user *buf,
402 size_t nbytes, loff_t *ppos)
403{
404 struct debug_buffer *db = file->private_data;
405
406 return simple_read_from_buffer(buf, nbytes, ppos, db->buf, db->len);
407}
408
409static loff_t debug_buffer_llseek(struct file *file, loff_t off, int whence)
410{
411 struct debug_buffer *db = file->private_data;
412 loff_t new = -1;
413
414 switch (whence) {
415 case 0:
416 new = off;
417 break;
418 case 1:
419 new = file->f_pos + off;
420 break;
421 }
422
423 if (new < 0 || new > db->len)
424 return -EINVAL;
425
426 return (file->f_pos = new);
427}
428
429static int debug_buffer_release(struct inode *inode, struct file *file)
430{
431 struct debug_buffer *db = (struct debug_buffer *)file->private_data;
432
433 if (db)
434 kfree(db->buf);
435 kfree(db);
436
437 return 0;
438}
439/* end - util funcs */
440
441/* begin - purge list funcs */
442static int debug_purgelist_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
443{
444 struct dlm_lock_resource *res;
445 int out = 0;
446 unsigned long total = 0;
447
448 out += snprintf(db->buf + out, db->len - out,
449 "Dumping Purgelist for Domain: %s\n", dlm->name);
450
451 spin_lock(&dlm->spinlock);
452 list_for_each_entry(res, &dlm->purge_list, purge) {
453 ++total;
454 if (db->len - out < 100)
455 continue;
456 spin_lock(&res->spinlock);
457 out += stringify_lockname(res->lockname.name,
458 res->lockname.len,
459 db->buf + out, db->len - out);
460 out += snprintf(db->buf + out, db->len - out, "\t%ld\n",
461 (jiffies - res->last_used)/HZ);
462 spin_unlock(&res->spinlock);
463 }
464 spin_unlock(&dlm->spinlock);
465
466 out += snprintf(db->buf + out, db->len - out,
467 "Total on list: %ld\n", total);
468
469 return out;
470}
471
472static int debug_purgelist_open(struct inode *inode, struct file *file)
473{
474 struct dlm_ctxt *dlm = inode->i_private;
475 struct debug_buffer *db;
476
477 db = debug_buffer_allocate();
478 if (!db)
479 goto bail;
480
481 db->len = debug_purgelist_print(dlm, db);
482
483 file->private_data = db;
484
485 return 0;
486bail:
487 return -ENOMEM;
488}
489
490static struct file_operations debug_purgelist_fops = {
491 .open = debug_purgelist_open,
492 .release = debug_buffer_release,
493 .read = debug_buffer_read,
494 .llseek = debug_buffer_llseek,
495};
496/* end - purge list funcs */
497
498/* begin - debug mle funcs */
499static int debug_mle_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
500{
501 struct dlm_master_list_entry *mle;
502 int out = 0;
503 unsigned long total = 0;
504
505 out += snprintf(db->buf + out, db->len - out,
506 "Dumping MLEs for Domain: %s\n", dlm->name);
507
508 spin_lock(&dlm->master_lock);
509 list_for_each_entry(mle, &dlm->master_list, list) {
510 ++total;
511 if (db->len - out < 200)
512 continue;
513 out += dump_mle(mle, db->buf + out, db->len - out);
514 }
515 spin_unlock(&dlm->master_lock);
516
517 out += snprintf(db->buf + out, db->len - out,
518 "Total on list: %ld\n", total);
519 return out;
520}
521
522static int debug_mle_open(struct inode *inode, struct file *file)
523{
524 struct dlm_ctxt *dlm = inode->i_private;
525 struct debug_buffer *db;
526
527 db = debug_buffer_allocate();
528 if (!db)
529 goto bail;
530
531 db->len = debug_mle_print(dlm, db);
532
533 file->private_data = db;
534
535 return 0;
536bail:
537 return -ENOMEM;
538}
539
540static struct file_operations debug_mle_fops = {
541 .open = debug_mle_open,
542 .release = debug_buffer_release,
543 .read = debug_buffer_read,
544 .llseek = debug_buffer_llseek,
545};
546
547/* end - debug mle funcs */
548
549/* begin - debug lockres funcs */
550static int dump_lock(struct dlm_lock *lock, int list_type, char *buf, int len)
551{
552 int out;
553
554#define DEBUG_LOCK_VERSION 1
555 spin_lock(&lock->spinlock);
556 out = snprintf(buf, len, "LOCK:%d,%d,%d,%d,%d,%d:%lld,%d,%d,%d,%d,%d,"
557 "%d,%d,%d,%d\n",
558 DEBUG_LOCK_VERSION,
559 list_type, lock->ml.type, lock->ml.convert_type,
560 lock->ml.node,
561 dlm_get_lock_cookie_node(be64_to_cpu(lock->ml.cookie)),
562 dlm_get_lock_cookie_seq(be64_to_cpu(lock->ml.cookie)),
563 !list_empty(&lock->ast_list),
564 !list_empty(&lock->bast_list),
565 lock->ast_pending, lock->bast_pending,
566 lock->convert_pending, lock->lock_pending,
567 lock->cancel_pending, lock->unlock_pending,
568 atomic_read(&lock->lock_refs.refcount));
569 spin_unlock(&lock->spinlock);
570
571 return out;
572}
573
574static int dump_lockres(struct dlm_lock_resource *res, char *buf, int len)
575{
576 struct dlm_lock *lock;
577 int i;
578 int out = 0;
579
580 out += snprintf(buf + out, len - out, "NAME:");
581 out += stringify_lockname(res->lockname.name, res->lockname.len,
582 buf + out, len - out);
583 out += snprintf(buf + out, len - out, "\n");
584
585#define DEBUG_LRES_VERSION 1
586 out += snprintf(buf + out, len - out,
587 "LRES:%d,%d,%d,%ld,%d,%d,%d,%d,%d,%d,%d\n",
588 DEBUG_LRES_VERSION,
589 res->owner, res->state, res->last_used,
590 !list_empty(&res->purge),
591 !list_empty(&res->dirty),
592 !list_empty(&res->recovering),
593 res->inflight_locks, res->migration_pending,
594 atomic_read(&res->asts_reserved),
595 atomic_read(&res->refs.refcount));
596
597 /* refmap */
598 out += snprintf(buf + out, len - out, "RMAP:");
599 out += stringify_nodemap(res->refmap, O2NM_MAX_NODES,
600 buf + out, len - out);
601 out += snprintf(buf + out, len - out, "\n");
602
603 /* lvb */
604 out += snprintf(buf + out, len - out, "LVBX:");
605 for (i = 0; i < DLM_LVB_LEN; i++)
606 out += snprintf(buf + out, len - out,
607 "%02x", (unsigned char)res->lvb[i]);
608 out += snprintf(buf + out, len - out, "\n");
609
610 /* granted */
611 list_for_each_entry(lock, &res->granted, list)
612 out += dump_lock(lock, 0, buf + out, len - out);
613
614 /* converting */
615 list_for_each_entry(lock, &res->converting, list)
616 out += dump_lock(lock, 1, buf + out, len - out);
617
618 /* blocked */
619 list_for_each_entry(lock, &res->blocked, list)
620 out += dump_lock(lock, 2, buf + out, len - out);
621
622 out += snprintf(buf + out, len - out, "\n");
623
624 return out;
625}
626
627static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
628{
629 struct debug_lockres *dl = m->private;
630 struct dlm_ctxt *dlm = dl->dl_ctxt;
631 struct dlm_lock_resource *res = NULL;
632
633 spin_lock(&dlm->spinlock);
634
635 if (dl->dl_res) {
636 list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
637 if (dl->dl_res) {
638 dlm_lockres_put(dl->dl_res);
639 dl->dl_res = NULL;
640 }
641 if (&res->tracking == &dlm->tracking_list) {
642 mlog(0, "End of list found, %p\n", res);
643 dl = NULL;
644 break;
645 }
646 dlm_lockres_get(res);
647 dl->dl_res = res;
648 break;
649 }
650 } else {
651 if (!list_empty(&dlm->tracking_list)) {
652 list_for_each_entry(res, &dlm->tracking_list, tracking)
653 break;
654 dlm_lockres_get(res);
655 dl->dl_res = res;
656 } else
657 dl = NULL;
658 }
659
660 if (dl) {
661 spin_lock(&dl->dl_res->spinlock);
662 dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
663 spin_unlock(&dl->dl_res->spinlock);
664 }
665
666 spin_unlock(&dlm->spinlock);
667
668 return dl;
669}
670
/*
 * seq_file .stop callback.  Intentionally empty: lockres_seq_start()
 * releases every spinlock it takes before it returns, so there is
 * nothing left to undo here.
 */
671static void lockres_seq_stop(struct seq_file *m, void *v)
672{
673}
674
/*
 * seq_file .next callback.  Always returns NULL and leaves *pos alone;
 * the walk position lives in the debug_lockres cursor and is advanced by
 * lockres_seq_start() instead.
 * NOTE(review): presumably relies on seq_file calling .start again after
 * a NULL .next - confirm against the seq_file implementation in use.
 */
675static void *lockres_seq_next(struct seq_file *m, void *v, loff_t *pos)
676{
677 return NULL;
678}
679
680static int lockres_seq_show(struct seq_file *s, void *v)
681{
682 struct debug_lockres *dl = (struct debug_lockres *)v;
683
684 seq_printf(s, "%s", dl->dl_buf);
685
686 return 0;
687}
688
689static struct seq_operations debug_lockres_ops = {
690 .start = lockres_seq_start,
691 .stop = lockres_seq_stop,
692 .next = lockres_seq_next,
693 .show = lockres_seq_show,
694};
695
696static int debug_lockres_open(struct inode *inode, struct file *file)
697{
698 struct dlm_ctxt *dlm = inode->i_private;
699 int ret = -ENOMEM;
700 struct seq_file *seq;
701 struct debug_lockres *dl = NULL;
702
703 dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL);
704 if (!dl) {
705 mlog_errno(ret);
706 goto bail;
707 }
708
709 dl->dl_len = PAGE_SIZE;
710 dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL);
711 if (!dl->dl_buf) {
712 mlog_errno(ret);
713 goto bail;
714 }
715
716 ret = seq_open(file, &debug_lockres_ops);
717 if (ret) {
718 mlog_errno(ret);
719 goto bail;
720 }
721
722 seq = (struct seq_file *) file->private_data;
723 seq->private = dl;
724
725 dlm_grab(dlm);
726 dl->dl_ctxt = dlm;
727
728 return 0;
729bail:
730 if (dl)
731 kfree(dl->dl_buf);
732 kfree(dl);
733 return ret;
734}
735
736static int debug_lockres_release(struct inode *inode, struct file *file)
737{
738 struct seq_file *seq = (struct seq_file *)file->private_data;
739 struct debug_lockres *dl = (struct debug_lockres *)seq->private;
740
741 if (dl->dl_res)
742 dlm_lockres_put(dl->dl_res);
743 dlm_put(dl->dl_ctxt);
744 kfree(dl->dl_buf);
745 return seq_release_private(inode, file);
746}
747
748static struct file_operations debug_lockres_fops = {
749 .open = debug_lockres_open,
750 .release = debug_lockres_release,
751 .read = seq_read,
752 .llseek = seq_lseek,
753};
754/* end - debug lockres funcs */
755
756/* begin - debug state funcs */
757static int debug_state_print(struct dlm_ctxt *dlm, struct debug_buffer *db)
758{
759 int out = 0;
760 struct dlm_reco_node_data *node;
761 char *state;
762 int lres, rres, ures, tres;
763
764 lres = atomic_read(&dlm->local_resources);
765 rres = atomic_read(&dlm->remote_resources);
766 ures = atomic_read(&dlm->unknown_resources);
767 tres = lres + rres + ures;
768
769 spin_lock(&dlm->spinlock);
770
771 switch (dlm->dlm_state) {
772 case DLM_CTXT_NEW:
773 state = "NEW"; break;
774 case DLM_CTXT_JOINED:
775 state = "JOINED"; break;
776 case DLM_CTXT_IN_SHUTDOWN:
777 state = "SHUTDOWN"; break;
778 case DLM_CTXT_LEAVING:
779 state = "LEAVING"; break;
780 default:
781 state = "UNKNOWN"; break;
782 }
783
784 /* Domain: xxxxxxxxxx Key: 0xdfbac769 */
785 out += snprintf(db->buf + out, db->len - out,
786 "Domain: %s Key: 0x%08x\n", dlm->name, dlm->key);
787
788 /* Thread Pid: xxx Node: xxx State: xxxxx */
789 out += snprintf(db->buf + out, db->len - out,
790 "Thread Pid: %d Node: %d State: %s\n",
791 dlm->dlm_thread_task->pid, dlm->node_num, state);
792
793 /* Number of Joins: xxx Joining Node: xxx */
794 out += snprintf(db->buf + out, db->len - out,
795 "Number of Joins: %d Joining Node: %d\n",
796 dlm->num_joins, dlm->joining_node);
797
798 /* Domain Map: xx xx xx */
799 out += snprintf(db->buf + out, db->len - out, "Domain Map: ");
800 out += stringify_nodemap(dlm->domain_map, O2NM_MAX_NODES,
801 db->buf + out, db->len - out);
802 out += snprintf(db->buf + out, db->len - out, "\n");
803
804 /* Live Map: xx xx xx */
805 out += snprintf(db->buf + out, db->len - out, "Live Map: ");
806 out += stringify_nodemap(dlm->live_nodes_map, O2NM_MAX_NODES,
807 db->buf + out, db->len - out);
808 out += snprintf(db->buf + out, db->len - out, "\n");
809
810 /* Mastered Resources Total: xxx Locally: xxx Remotely: ... */
811 out += snprintf(db->buf + out, db->len - out,
812 "Mastered Resources Total: %d Locally: %d "
813 "Remotely: %d Unknown: %d\n",
814 tres, lres, rres, ures);
815
816 /* Lists: Dirty=Empty Purge=InUse PendingASTs=Empty ... */
817 out += snprintf(db->buf + out, db->len - out,
818 "Lists: Dirty=%s Purge=%s PendingASTs=%s "
819 "PendingBASTs=%s Master=%s\n",
820 (list_empty(&dlm->dirty_list) ? "Empty" : "InUse"),
821 (list_empty(&dlm->purge_list) ? "Empty" : "InUse"),
822 (list_empty(&dlm->pending_asts) ? "Empty" : "InUse"),
823 (list_empty(&dlm->pending_basts) ? "Empty" : "InUse"),
824 (list_empty(&dlm->master_list) ? "Empty" : "InUse"));
825
826 /* Purge Count: xxx Refs: xxx */
827 out += snprintf(db->buf + out, db->len - out,
828 "Purge Count: %d Refs: %d\n", dlm->purge_count,
829 atomic_read(&dlm->dlm_refs.refcount));
830
831 /* Dead Node: xxx */
832 out += snprintf(db->buf + out, db->len - out,
833 "Dead Node: %d\n", dlm->reco.dead_node);
834
835 /* What about DLM_RECO_STATE_FINALIZE? */
836 if (dlm->reco.state == DLM_RECO_STATE_ACTIVE)
837 state = "ACTIVE";
838 else
839 state = "INACTIVE";
840
841 /* Recovery Pid: xxxx Master: xxx State: xxxx */
842 out += snprintf(db->buf + out, db->len - out,
843 "Recovery Pid: %d Master: %d State: %s\n",
844 dlm->dlm_reco_thread_task->pid,
845 dlm->reco.new_master, state);
846
847 /* Recovery Map: xx xx */
848 out += snprintf(db->buf + out, db->len - out, "Recovery Map: ");
849 out += stringify_nodemap(dlm->recovery_map, O2NM_MAX_NODES,
850 db->buf + out, db->len - out);
851 out += snprintf(db->buf + out, db->len - out, "\n");
852
853 /* Recovery Node State: */
854 out += snprintf(db->buf + out, db->len - out, "Recovery Node State:\n");
855 list_for_each_entry(node, &dlm->reco.node_data, list) {
856 switch (node->state) {
857 case DLM_RECO_NODE_DATA_INIT:
858 state = "INIT";
859 break;
860 case DLM_RECO_NODE_DATA_REQUESTING:
861 state = "REQUESTING";
862 break;
863 case DLM_RECO_NODE_DATA_DEAD:
864 state = "DEAD";
865 break;
866 case DLM_RECO_NODE_DATA_RECEIVING:
867 state = "RECEIVING";
868 break;
869 case DLM_RECO_NODE_DATA_REQUESTED:
870 state = "REQUESTED";
871 break;
872 case DLM_RECO_NODE_DATA_DONE:
873 state = "DONE";
874 break;
875 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
876 state = "FINALIZE-SENT";
877 break;
878 default:
879 state = "BAD";
880 break;
881 }
882 out += snprintf(db->buf + out, db->len - out, "\t%u - %s\n",
883 node->node_num, state);
884 }
885
886 spin_unlock(&dlm->spinlock);
887
888 return out;
889}
890
891static int debug_state_open(struct inode *inode, struct file *file)
892{
893 struct dlm_ctxt *dlm = inode->i_private;
894 struct debug_buffer *db = NULL;
895
896 db = debug_buffer_allocate();
897 if (!db)
898 goto bail;
899
900 db->len = debug_state_print(dlm, db);
901
902 file->private_data = db;
903
904 return 0;
905bail:
906 return -ENOMEM;
907}
908
909static struct file_operations debug_state_fops = {
910 .open = debug_state_open,
911 .release = debug_buffer_release,
912 .read = debug_buffer_read,
913 .llseek = debug_buffer_llseek,
914};
915/* end - debug state funcs */
916
917/* files in subroot */
918int dlm_debug_init(struct dlm_ctxt *dlm)
919{
920 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
921
922 /* for dumping dlm_ctxt */
923 dc->debug_state_dentry = debugfs_create_file(DLM_DEBUGFS_DLM_STATE,
924 S_IFREG|S_IRUSR,
925 dlm->dlm_debugfs_subroot,
926 dlm, &debug_state_fops);
927 if (!dc->debug_state_dentry) {
928 mlog_errno(-ENOMEM);
929 goto bail;
930 }
931
932 /* for dumping lockres */
933 dc->debug_lockres_dentry =
934 debugfs_create_file(DLM_DEBUGFS_LOCKING_STATE,
935 S_IFREG|S_IRUSR,
936 dlm->dlm_debugfs_subroot,
937 dlm, &debug_lockres_fops);
938 if (!dc->debug_lockres_dentry) {
939 mlog_errno(-ENOMEM);
940 goto bail;
941 }
942
943 /* for dumping mles */
944 dc->debug_mle_dentry = debugfs_create_file(DLM_DEBUGFS_MLE_STATE,
945 S_IFREG|S_IRUSR,
946 dlm->dlm_debugfs_subroot,
947 dlm, &debug_mle_fops);
948 if (!dc->debug_mle_dentry) {
949 mlog_errno(-ENOMEM);
950 goto bail;
951 }
952
953 /* for dumping lockres on the purge list */
954 dc->debug_purgelist_dentry =
955 debugfs_create_file(DLM_DEBUGFS_PURGE_LIST,
956 S_IFREG|S_IRUSR,
957 dlm->dlm_debugfs_subroot,
958 dlm, &debug_purgelist_fops);
959 if (!dc->debug_purgelist_dentry) {
960 mlog_errno(-ENOMEM);
961 goto bail;
962 }
963
964 dlm_debug_get(dc);
965 return 0;
966
967bail:
968 dlm_debug_shutdown(dlm);
969 return -ENOMEM;
970}
971
972void dlm_debug_shutdown(struct dlm_ctxt *dlm)
973{
974 struct dlm_debug_ctxt *dc = dlm->dlm_debug_ctxt;
975
976 if (dc) {
977 if (dc->debug_purgelist_dentry)
978 debugfs_remove(dc->debug_purgelist_dentry);
979 if (dc->debug_mle_dentry)
980 debugfs_remove(dc->debug_mle_dentry);
981 if (dc->debug_lockres_dentry)
982 debugfs_remove(dc->debug_lockres_dentry);
983 if (dc->debug_state_dentry)
984 debugfs_remove(dc->debug_state_dentry);
985 dlm_debug_put(dc);
986 }
987}
988
989/* subroot - domain dir */
990int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
991{
992 dlm->dlm_debugfs_subroot = debugfs_create_dir(dlm->name,
993 dlm_debugfs_root);
994 if (!dlm->dlm_debugfs_subroot) {
995 mlog_errno(-ENOMEM);
996 goto bail;
997 }
998
999 dlm->dlm_debug_ctxt = kzalloc(sizeof(struct dlm_debug_ctxt),
1000 GFP_KERNEL);
1001 if (!dlm->dlm_debug_ctxt) {
1002 mlog_errno(-ENOMEM);
1003 goto bail;
1004 }
1005 kref_init(&dlm->dlm_debug_ctxt->debug_refcnt);
1006
1007 return 0;
1008bail:
1009 dlm_destroy_debugfs_subroot(dlm);
1010 return -ENOMEM;
1011}
1012
1013void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
1014{
1015 if (dlm->dlm_debugfs_subroot)
1016 debugfs_remove(dlm->dlm_debugfs_subroot);
1017}
1018
1019/* debugfs root */
1020int dlm_create_debugfs_root(void)
1021{
1022 dlm_debugfs_root = debugfs_create_dir(DLM_DEBUGFS_DIR, NULL);
1023 if (!dlm_debugfs_root) {
1024 mlog_errno(-ENOMEM);
1025 return -ENOMEM;
1026 }
1027 return 0;
1028}
1029
1030void dlm_destroy_debugfs_root(void)
1031{
1032 if (dlm_debugfs_root)
1033 debugfs_remove(dlm_debugfs_root);
1034}
1035#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
new file mode 100644
index 000000000000..d34a62a3a625
--- /dev/null
+++ b/fs/ocfs2/dlm/dlmdebug.h
@@ -0,0 +1,86 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * dlmdebug.h
5 *
6 * Copyright (C) 2008 Oracle. All rights reserved.
7 *
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public
10 * License as published by the Free Software Foundation; either
11 * version 2 of the License, or (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public
19 * License along with this program; if not, write to the
20 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
21 * Boston, MA 02111-1307, USA.
22 *
23 */
24
25#ifndef DLMDEBUG_H
26#define DLMDEBUG_H
27
28void dlm_print_one_mle(struct dlm_master_list_entry *mle);
29
30#ifdef CONFIG_DEBUG_FS
31
/*
 * Per-domain debugfs bookkeeping.  Allocated by
 * dlm_create_debugfs_subroot() and freed when debug_refcnt drops to zero.
 */
32struct dlm_debug_ctxt {
	/* reference count; the release callback frees the structure */
33 struct kref debug_refcnt;
	/* "dlm_state" file: summary of the dlm context */
34 struct dentry *debug_state_dentry;
	/* "locking_state" file: dump of the tracked lock resources */
35 struct dentry *debug_lockres_dentry;
	/* "mle_state" file: dump of the master list entries */
36 struct dentry *debug_mle_dentry;
	/* "purge_list" file: lock resources on the purge list */
37 struct dentry *debug_purgelist_dentry;
38};
39
/*
 * Text snapshot served by the dlm_state, mle_state and purge_list files.
 * Built once at open time and freed on release.
 */
40struct debug_buffer {
	/* PAGE_SIZE while being filled, then the number of bytes produced */
41 int len;
	/* kmalloc'ed backing store for the snapshot text */
42 char *buf;
43};
44
/*
 * Per-open cursor for the locking_state seq_file; stored as the
 * seq_file's private data.
 */
45struct debug_lockres {
	/* size of dl_buf (PAGE_SIZE) */
46 int dl_len;
	/* scratch buffer holding the text of the lockres being shown */
47 char *dl_buf;
	/* dlm context, grabbed at open and put at release */
48 struct dlm_ctxt *dl_ctxt;
	/* lockres most recently dumped, with a reference held; NULL before the first */
49 struct dlm_lock_resource *dl_res;
50};
51
52int dlm_debug_init(struct dlm_ctxt *dlm);
53void dlm_debug_shutdown(struct dlm_ctxt *dlm);
54
55int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm);
56void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm);
57
58int dlm_create_debugfs_root(void);
59void dlm_destroy_debugfs_root(void);
60
61#else
62
/*
 * !CONFIG_DEBUG_FS stub: nothing to create, always succeeds.
 * Declared inline so that files including this header without calling
 * it do not get an unused-function warning or a private copy.
 */
static inline int dlm_debug_init(struct dlm_ctxt *dlm)
{
	return 0;
}
/* !CONFIG_DEBUG_FS stub: nothing to tear down.  Inline, see header use. */
static inline void dlm_debug_shutdown(struct dlm_ctxt *dlm)
{
}
/*
 * !CONFIG_DEBUG_FS stub: no directory is created, always succeeds.
 * Inline so that includers that never call it stay warning-free.
 */
static inline int dlm_create_debugfs_subroot(struct dlm_ctxt *dlm)
{
	return 0;
}
/* !CONFIG_DEBUG_FS stub: nothing to remove.  Inline, see header use. */
static inline void dlm_destroy_debugfs_subroot(struct dlm_ctxt *dlm)
{
}
/*
 * !CONFIG_DEBUG_FS stub: no root directory is created, always succeeds.
 * Inline so that includers that never call it stay warning-free.
 */
static inline int dlm_create_debugfs_root(void)
{
	return 0;
}
/* !CONFIG_DEBUG_FS stub: nothing to remove.  Inline, see header use. */
static inline void dlm_destroy_debugfs_root(void)
{
}
84
85#endif /* CONFIG_DEBUG_FS */
86#endif /* DLMDEBUG_H */
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 0879d86113e3..63f8125824e8 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -33,6 +33,7 @@
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/delay.h> 34#include <linux/delay.h>
35#include <linux/err.h> 35#include <linux/err.h>
36#include <linux/debugfs.h>
36 37
37#include "cluster/heartbeat.h" 38#include "cluster/heartbeat.h"
38#include "cluster/nodemanager.h" 39#include "cluster/nodemanager.h"
@@ -40,8 +41,8 @@
40 41
41#include "dlmapi.h" 42#include "dlmapi.h"
42#include "dlmcommon.h" 43#include "dlmcommon.h"
43
44#include "dlmdomain.h" 44#include "dlmdomain.h"
45#include "dlmdebug.h"
45 46
46#include "dlmver.h" 47#include "dlmver.h"
47 48
@@ -298,6 +299,8 @@ static int dlm_wait_on_domain_helper(const char *domain)
298 299
299static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 300static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
300{ 301{
302 dlm_destroy_debugfs_subroot(dlm);
303
301 if (dlm->lockres_hash) 304 if (dlm->lockres_hash)
302 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); 305 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
303 306
@@ -395,6 +398,7 @@ static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
395static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) 398static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
396{ 399{
397 dlm_unregister_domain_handlers(dlm); 400 dlm_unregister_domain_handlers(dlm);
401 dlm_debug_shutdown(dlm);
398 dlm_complete_thread(dlm); 402 dlm_complete_thread(dlm);
399 dlm_complete_recovery_thread(dlm); 403 dlm_complete_recovery_thread(dlm);
400 dlm_destroy_dlm_worker(dlm); 404 dlm_destroy_dlm_worker(dlm);
@@ -644,6 +648,7 @@ int dlm_shutting_down(struct dlm_ctxt *dlm)
644void dlm_unregister_domain(struct dlm_ctxt *dlm) 648void dlm_unregister_domain(struct dlm_ctxt *dlm)
645{ 649{
646 int leave = 0; 650 int leave = 0;
651 struct dlm_lock_resource *res;
647 652
648 spin_lock(&dlm_domain_lock); 653 spin_lock(&dlm_domain_lock);
649 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED); 654 BUG_ON(dlm->dlm_state != DLM_CTXT_JOINED);
@@ -673,6 +678,15 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
673 msleep(500); 678 msleep(500);
674 mlog(0, "%s: more migration to do\n", dlm->name); 679 mlog(0, "%s: more migration to do\n", dlm->name);
675 } 680 }
681
682 /* This list should be empty. If not, print remaining lockres */
683 if (!list_empty(&dlm->tracking_list)) {
684 mlog(ML_ERROR, "Following lockres' are still on the "
685 "tracking list:\n");
686 list_for_each_entry(res, &dlm->tracking_list, tracking)
687 dlm_print_one_lock_resource(res);
688 }
689
676 dlm_mark_domain_leaving(dlm); 690 dlm_mark_domain_leaving(dlm);
677 dlm_leave_domain(dlm); 691 dlm_leave_domain(dlm);
678 dlm_complete_dlm_shutdown(dlm); 692 dlm_complete_dlm_shutdown(dlm);
@@ -1405,6 +1419,12 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
1405 goto bail; 1419 goto bail;
1406 } 1420 }
1407 1421
1422 status = dlm_debug_init(dlm);
1423 if (status < 0) {
1424 mlog_errno(status);
1425 goto bail;
1426 }
1427
1408 status = dlm_launch_thread(dlm); 1428 status = dlm_launch_thread(dlm);
1409 if (status < 0) { 1429 if (status < 0) {
1410 mlog_errno(status); 1430 mlog_errno(status);
@@ -1472,6 +1492,7 @@ bail:
1472 1492
1473 if (status) { 1493 if (status) {
1474 dlm_unregister_domain_handlers(dlm); 1494 dlm_unregister_domain_handlers(dlm);
1495 dlm_debug_shutdown(dlm);
1475 dlm_complete_thread(dlm); 1496 dlm_complete_thread(dlm);
1476 dlm_complete_recovery_thread(dlm); 1497 dlm_complete_recovery_thread(dlm);
1477 dlm_destroy_dlm_worker(dlm); 1498 dlm_destroy_dlm_worker(dlm);
@@ -1484,6 +1505,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1484 u32 key) 1505 u32 key)
1485{ 1506{
1486 int i; 1507 int i;
1508 int ret;
1487 struct dlm_ctxt *dlm = NULL; 1509 struct dlm_ctxt *dlm = NULL;
1488 1510
1489 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); 1511 dlm = kzalloc(sizeof(*dlm), GFP_KERNEL);
@@ -1516,6 +1538,15 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1516 dlm->key = key; 1538 dlm->key = key;
1517 dlm->node_num = o2nm_this_node(); 1539 dlm->node_num = o2nm_this_node();
1518 1540
1541 ret = dlm_create_debugfs_subroot(dlm);
1542 if (ret < 0) {
1543 dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
1544 kfree(dlm->name);
1545 kfree(dlm);
1546 dlm = NULL;
1547 goto leave;
1548 }
1549
1519 spin_lock_init(&dlm->spinlock); 1550 spin_lock_init(&dlm->spinlock);
1520 spin_lock_init(&dlm->master_lock); 1551 spin_lock_init(&dlm->master_lock);
1521 spin_lock_init(&dlm->ast_lock); 1552 spin_lock_init(&dlm->ast_lock);
@@ -1526,6 +1557,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
1526 INIT_LIST_HEAD(&dlm->reco.node_data); 1557 INIT_LIST_HEAD(&dlm->reco.node_data);
1527 INIT_LIST_HEAD(&dlm->purge_list); 1558 INIT_LIST_HEAD(&dlm->purge_list);
1528 INIT_LIST_HEAD(&dlm->dlm_domain_handlers); 1559 INIT_LIST_HEAD(&dlm->dlm_domain_handlers);
1560 INIT_LIST_HEAD(&dlm->tracking_list);
1529 dlm->reco.state = 0; 1561 dlm->reco.state = 0;
1530 1562
1531 INIT_LIST_HEAD(&dlm->pending_asts); 1563 INIT_LIST_HEAD(&dlm->pending_asts);
@@ -1816,21 +1848,49 @@ static int __init dlm_init(void)
1816 dlm_print_version(); 1848 dlm_print_version();
1817 1849
1818 status = dlm_init_mle_cache(); 1850 status = dlm_init_mle_cache();
1819 if (status) 1851 if (status) {
1820 return -1; 1852 mlog(ML_ERROR, "Could not create o2dlm_mle slabcache\n");
1853 goto error;
1854 }
1855
1856 status = dlm_init_master_caches();
1857 if (status) {
1858 mlog(ML_ERROR, "Could not create o2dlm_lockres and "
1859 "o2dlm_lockname slabcaches\n");
1860 goto error;
1861 }
1862
1863 status = dlm_init_lock_cache();
1864 if (status) {
1865 mlog(ML_ERROR, "Count not create o2dlm_lock slabcache\n");
1866 goto error;
1867 }
1821 1868
1822 status = dlm_register_net_handlers(); 1869 status = dlm_register_net_handlers();
1823 if (status) { 1870 if (status) {
1824 dlm_destroy_mle_cache(); 1871 mlog(ML_ERROR, "Unable to register network handlers\n");
1825 return -1; 1872 goto error;
1826 } 1873 }
1827 1874
1875 status = dlm_create_debugfs_root();
1876 if (status)
1877 goto error;
1878
1828 return 0; 1879 return 0;
1880error:
1881 dlm_unregister_net_handlers();
1882 dlm_destroy_lock_cache();
1883 dlm_destroy_master_caches();
1884 dlm_destroy_mle_cache();
1885 return -1;
1829} 1886}
1830 1887
1831static void __exit dlm_exit (void) 1888static void __exit dlm_exit (void)
1832{ 1889{
1890 dlm_destroy_debugfs_root();
1833 dlm_unregister_net_handlers(); 1891 dlm_unregister_net_handlers();
1892 dlm_destroy_lock_cache();
1893 dlm_destroy_master_caches();
1834 dlm_destroy_mle_cache(); 1894 dlm_destroy_mle_cache();
1835} 1895}
1836 1896
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 52578d907d9a..83a9f2972ac8 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -53,6 +53,8 @@
53#define MLOG_MASK_PREFIX ML_DLM 53#define MLOG_MASK_PREFIX ML_DLM
54#include "cluster/masklog.h" 54#include "cluster/masklog.h"
55 55
56static struct kmem_cache *dlm_lock_cache = NULL;
57
56static DEFINE_SPINLOCK(dlm_cookie_lock); 58static DEFINE_SPINLOCK(dlm_cookie_lock);
57static u64 dlm_next_cookie = 1; 59static u64 dlm_next_cookie = 1;
58 60
@@ -64,6 +66,22 @@ static void dlm_init_lock(struct dlm_lock *newlock, int type,
64static void dlm_lock_release(struct kref *kref); 66static void dlm_lock_release(struct kref *kref);
65static void dlm_lock_detach_lockres(struct dlm_lock *lock); 67static void dlm_lock_detach_lockres(struct dlm_lock *lock);
66 68
69int dlm_init_lock_cache(void)
70{
71 dlm_lock_cache = kmem_cache_create("o2dlm_lock",
72 sizeof(struct dlm_lock),
73 0, SLAB_HWCACHE_ALIGN, NULL);
74 if (dlm_lock_cache == NULL)
75 return -ENOMEM;
76 return 0;
77}
78
79void dlm_destroy_lock_cache(void)
80{
81 if (dlm_lock_cache)
82 kmem_cache_destroy(dlm_lock_cache);
83}
84
67/* Tell us whether we can grant a new lock request. 85/* Tell us whether we can grant a new lock request.
68 * locking: 86 * locking:
69 * caller needs: res->spinlock 87 * caller needs: res->spinlock
@@ -353,7 +371,7 @@ static void dlm_lock_release(struct kref *kref)
353 mlog(0, "freeing kernel-allocated lksb\n"); 371 mlog(0, "freeing kernel-allocated lksb\n");
354 kfree(lock->lksb); 372 kfree(lock->lksb);
355 } 373 }
356 kfree(lock); 374 kmem_cache_free(dlm_lock_cache, lock);
357} 375}
358 376
359/* associate a lock with its lockres, getting a ref on the lockres */ 377/* associate a lock with its lockres, getting a ref on the lockres */
@@ -412,7 +430,7 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
412 struct dlm_lock *lock; 430 struct dlm_lock *lock;
413 int kernel_allocated = 0; 431 int kernel_allocated = 0;
414 432
415 lock = kzalloc(sizeof(*lock), GFP_NOFS); 433 lock = (struct dlm_lock *) kmem_cache_zalloc(dlm_lock_cache, GFP_NOFS);
416 if (!lock) 434 if (!lock)
417 return NULL; 435 return NULL;
418 436
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index ea6b89577860..efc015c6128a 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -48,47 +48,11 @@
48#include "dlmapi.h" 48#include "dlmapi.h"
49#include "dlmcommon.h" 49#include "dlmcommon.h"
50#include "dlmdomain.h" 50#include "dlmdomain.h"
51#include "dlmdebug.h"
51 52
52#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) 53#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
53#include "cluster/masklog.h" 54#include "cluster/masklog.h"
54 55
55enum dlm_mle_type {
56 DLM_MLE_BLOCK,
57 DLM_MLE_MASTER,
58 DLM_MLE_MIGRATION
59};
60
61struct dlm_lock_name
62{
63 u8 len;
64 u8 name[DLM_LOCKID_NAME_MAX];
65};
66
67struct dlm_master_list_entry
68{
69 struct list_head list;
70 struct list_head hb_events;
71 struct dlm_ctxt *dlm;
72 spinlock_t spinlock;
73 wait_queue_head_t wq;
74 atomic_t woken;
75 struct kref mle_refs;
76 int inuse;
77 unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
78 unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
79 unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
80 unsigned long node_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
81 u8 master;
82 u8 new_master;
83 enum dlm_mle_type type;
84 struct o2hb_callback_func mle_hb_up;
85 struct o2hb_callback_func mle_hb_down;
86 union {
87 struct dlm_lock_resource *res;
88 struct dlm_lock_name name;
89 } u;
90};
91
92static void dlm_mle_node_down(struct dlm_ctxt *dlm, 56static void dlm_mle_node_down(struct dlm_ctxt *dlm,
93 struct dlm_master_list_entry *mle, 57 struct dlm_master_list_entry *mle,
94 struct o2nm_node *node, 58 struct o2nm_node *node,
@@ -128,98 +92,10 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
128 return 1; 92 return 1;
129} 93}
130 94
131#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) 95static struct kmem_cache *dlm_lockres_cache = NULL;
132static void _dlm_print_nodemap(unsigned long *map, const char *mapname) 96static struct kmem_cache *dlm_lockname_cache = NULL;
133{
134 int i;
135 printk("%s=[ ", mapname);
136 for (i=0; i<O2NM_MAX_NODES; i++)
137 if (test_bit(i, map))
138 printk("%d ", i);
139 printk("]");
140}
141
142static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
143{
144 int refs;
145 char *type;
146 char attached;
147 u8 master;
148 unsigned int namelen;
149 const char *name;
150 struct kref *k;
151 unsigned long *maybe = mle->maybe_map,
152 *vote = mle->vote_map,
153 *resp = mle->response_map,
154 *node = mle->node_map;
155
156 k = &mle->mle_refs;
157 if (mle->type == DLM_MLE_BLOCK)
158 type = "BLK";
159 else if (mle->type == DLM_MLE_MASTER)
160 type = "MAS";
161 else
162 type = "MIG";
163 refs = atomic_read(&k->refcount);
164 master = mle->master;
165 attached = (list_empty(&mle->hb_events) ? 'N' : 'Y');
166
167 if (mle->type != DLM_MLE_MASTER) {
168 namelen = mle->u.name.len;
169 name = mle->u.name.name;
170 } else {
171 namelen = mle->u.res->lockname.len;
172 name = mle->u.res->lockname.name;
173 }
174
175 mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
176 namelen, name, type, refs, master, mle->new_master, attached,
177 mle->inuse);
178 dlm_print_nodemap(maybe);
179 printk(", ");
180 dlm_print_nodemap(vote);
181 printk(", ");
182 dlm_print_nodemap(resp);
183 printk(", ");
184 dlm_print_nodemap(node);
185 printk(", ");
186 printk("\n");
187}
188
189#if 0
190/* Code here is included but defined out as it aids debugging */
191
192static void dlm_dump_mles(struct dlm_ctxt *dlm)
193{
194 struct dlm_master_list_entry *mle;
195
196 mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
197 spin_lock(&dlm->master_lock);
198 list_for_each_entry(mle, &dlm->master_list, list)
199 dlm_print_one_mle(mle);
200 spin_unlock(&dlm->master_lock);
201}
202
203int dlm_dump_all_mles(const char __user *data, unsigned int len)
204{
205 struct dlm_ctxt *dlm;
206
207 spin_lock(&dlm_domain_lock);
208 list_for_each_entry(dlm, &dlm_domains, list) {
209 mlog(ML_NOTICE, "found dlm: %p, name=%s\n", dlm, dlm->name);
210 dlm_dump_mles(dlm);
211 }
212 spin_unlock(&dlm_domain_lock);
213 return len;
214}
215EXPORT_SYMBOL_GPL(dlm_dump_all_mles);
216
217#endif /* 0 */
218
219
220static struct kmem_cache *dlm_mle_cache = NULL; 97static struct kmem_cache *dlm_mle_cache = NULL;
221 98
222
223static void dlm_mle_release(struct kref *kref); 99static void dlm_mle_release(struct kref *kref);
224static void dlm_init_mle(struct dlm_master_list_entry *mle, 100static void dlm_init_mle(struct dlm_master_list_entry *mle,
225 enum dlm_mle_type type, 101 enum dlm_mle_type type,
@@ -507,7 +383,7 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm,
507 383
508int dlm_init_mle_cache(void) 384int dlm_init_mle_cache(void)
509{ 385{
510 dlm_mle_cache = kmem_cache_create("dlm_mle_cache", 386 dlm_mle_cache = kmem_cache_create("o2dlm_mle",
511 sizeof(struct dlm_master_list_entry), 387 sizeof(struct dlm_master_list_entry),
512 0, SLAB_HWCACHE_ALIGN, 388 0, SLAB_HWCACHE_ALIGN,
513 NULL); 389 NULL);
@@ -560,6 +436,35 @@ static void dlm_mle_release(struct kref *kref)
560 * LOCK RESOURCE FUNCTIONS 436 * LOCK RESOURCE FUNCTIONS
561 */ 437 */
562 438
439int dlm_init_master_caches(void)
440{
441 dlm_lockres_cache = kmem_cache_create("o2dlm_lockres",
442 sizeof(struct dlm_lock_resource),
443 0, SLAB_HWCACHE_ALIGN, NULL);
444 if (!dlm_lockres_cache)
445 goto bail;
446
447 dlm_lockname_cache = kmem_cache_create("o2dlm_lockname",
448 DLM_LOCKID_NAME_MAX, 0,
449 SLAB_HWCACHE_ALIGN, NULL);
450 if (!dlm_lockname_cache)
451 goto bail;
452
453 return 0;
454bail:
455 dlm_destroy_master_caches();
456 return -ENOMEM;
457}
458
459void dlm_destroy_master_caches(void)
460{
461 if (dlm_lockname_cache)
462 kmem_cache_destroy(dlm_lockname_cache);
463
464 if (dlm_lockres_cache)
465 kmem_cache_destroy(dlm_lockres_cache);
466}
467
563static void dlm_set_lockres_owner(struct dlm_ctxt *dlm, 468static void dlm_set_lockres_owner(struct dlm_ctxt *dlm,
564 struct dlm_lock_resource *res, 469 struct dlm_lock_resource *res,
565 u8 owner) 470 u8 owner)
@@ -610,6 +515,14 @@ static void dlm_lockres_release(struct kref *kref)
610 mlog(0, "destroying lockres %.*s\n", res->lockname.len, 515 mlog(0, "destroying lockres %.*s\n", res->lockname.len,
611 res->lockname.name); 516 res->lockname.name);
612 517
518 if (!list_empty(&res->tracking))
519 list_del_init(&res->tracking);
520 else {
521 mlog(ML_ERROR, "Resource %.*s not on the Tracking list\n",
522 res->lockname.len, res->lockname.name);
523 dlm_print_one_lock_resource(res);
524 }
525
613 if (!hlist_unhashed(&res->hash_node) || 526 if (!hlist_unhashed(&res->hash_node) ||
614 !list_empty(&res->granted) || 527 !list_empty(&res->granted) ||
615 !list_empty(&res->converting) || 528 !list_empty(&res->converting) ||
@@ -642,9 +555,9 @@ static void dlm_lockres_release(struct kref *kref)
642 BUG_ON(!list_empty(&res->recovering)); 555 BUG_ON(!list_empty(&res->recovering));
643 BUG_ON(!list_empty(&res->purge)); 556 BUG_ON(!list_empty(&res->purge));
644 557
645 kfree(res->lockname.name); 558 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
646 559
647 kfree(res); 560 kmem_cache_free(dlm_lockres_cache, res);
648} 561}
649 562
650void dlm_lockres_put(struct dlm_lock_resource *res) 563void dlm_lockres_put(struct dlm_lock_resource *res)
@@ -677,6 +590,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
677 INIT_LIST_HEAD(&res->dirty); 590 INIT_LIST_HEAD(&res->dirty);
678 INIT_LIST_HEAD(&res->recovering); 591 INIT_LIST_HEAD(&res->recovering);
679 INIT_LIST_HEAD(&res->purge); 592 INIT_LIST_HEAD(&res->purge);
593 INIT_LIST_HEAD(&res->tracking);
680 atomic_set(&res->asts_reserved, 0); 594 atomic_set(&res->asts_reserved, 0);
681 res->migration_pending = 0; 595 res->migration_pending = 0;
682 res->inflight_locks = 0; 596 res->inflight_locks = 0;
@@ -692,6 +606,8 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
692 606
693 res->last_used = 0; 607 res->last_used = 0;
694 608
609 list_add_tail(&res->tracking, &dlm->tracking_list);
610
695 memset(res->lvb, 0, DLM_LVB_LEN); 611 memset(res->lvb, 0, DLM_LVB_LEN);
696 memset(res->refmap, 0, sizeof(res->refmap)); 612 memset(res->refmap, 0, sizeof(res->refmap));
697} 613}
@@ -700,20 +616,28 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
700 const char *name, 616 const char *name,
701 unsigned int namelen) 617 unsigned int namelen)
702{ 618{
703 struct dlm_lock_resource *res; 619 struct dlm_lock_resource *res = NULL;
704 620
705 res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); 621 res = (struct dlm_lock_resource *)
622 kmem_cache_zalloc(dlm_lockres_cache, GFP_NOFS);
706 if (!res) 623 if (!res)
707 return NULL; 624 goto error;
708 625
709 res->lockname.name = kmalloc(namelen, GFP_NOFS); 626 res->lockname.name = (char *)
710 if (!res->lockname.name) { 627 kmem_cache_zalloc(dlm_lockname_cache, GFP_NOFS);
711 kfree(res); 628 if (!res->lockname.name)
712 return NULL; 629 goto error;
713 }
714 630
715 dlm_init_lockres(dlm, res, name, namelen); 631 dlm_init_lockres(dlm, res, name, namelen);
716 return res; 632 return res;
633
634error:
635 if (res && res->lockname.name)
636 kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name);
637
638 if (res)
639 kmem_cache_free(dlm_lockres_cache, res);
640 return NULL;
717} 641}
718 642
719void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 643void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1f1873bf41fb..394d25a131a5 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -27,18 +27,11 @@
27#include <linux/slab.h> 27#include <linux/slab.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/mm.h> 29#include <linux/mm.h>
30#include <linux/crc32.h>
31#include <linux/kthread.h> 30#include <linux/kthread.h>
32#include <linux/pagemap.h> 31#include <linux/pagemap.h>
33#include <linux/debugfs.h> 32#include <linux/debugfs.h>
34#include <linux/seq_file.h> 33#include <linux/seq_file.h>
35 34
36#include <cluster/heartbeat.h>
37#include <cluster/nodemanager.h>
38#include <cluster/tcp.h>
39
40#include <dlm/dlmapi.h>
41
42#define MLOG_MASK_PREFIX ML_DLM_GLUE 35#define MLOG_MASK_PREFIX ML_DLM_GLUE
43#include <cluster/masklog.h> 36#include <cluster/masklog.h>
44 37
@@ -53,6 +46,7 @@
53#include "heartbeat.h" 46#include "heartbeat.h"
54#include "inode.h" 47#include "inode.h"
55#include "journal.h" 48#include "journal.h"
49#include "stackglue.h"
56#include "slot_map.h" 50#include "slot_map.h"
57#include "super.h" 51#include "super.h"
58#include "uptodate.h" 52#include "uptodate.h"
@@ -113,7 +107,8 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
113 unsigned int line, 107 unsigned int line,
114 struct ocfs2_lock_res *lockres) 108 struct ocfs2_lock_res *lockres)
115{ 109{
116 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 110 struct ocfs2_meta_lvb *lvb =
111 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
117 112
118 mlog(level, "LVB information for %s (called from %s:%u):\n", 113 mlog(level, "LVB information for %s (called from %s:%u):\n",
119 lockres->l_name, function, line); 114 lockres->l_name, function, line);
@@ -259,31 +254,6 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
259 .flags = 0, 254 .flags = 0,
260}; 255};
261 256
262/*
263 * This is the filesystem locking protocol version.
264 *
265 * Whenever the filesystem does new things with locks (adds or removes a
266 * lock, orders them differently, does different things underneath a lock),
267 * the version must be changed. The protocol is negotiated when joining
268 * the dlm domain. A node may join the domain if its major version is
269 * identical to all other nodes and its minor version is greater than
270 * or equal to all other nodes. When its minor version is greater than
271 * the other nodes, it will run at the minor version specified by the
272 * other nodes.
273 *
274 * If a locking change is made that will not be compatible with older
275 * versions, the major number must be increased and the minor version set
276 * to zero. If a change merely adds a behavior that can be disabled when
277 * speaking to older versions, the minor version must be increased. If a
278 * change adds a fully backwards compatible change (eg, LVB changes that
279 * are just ignored by older versions), the version does not need to be
280 * updated.
281 */
282const struct dlm_protocol_version ocfs2_locking_protocol = {
283 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
284 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
285};
286
287static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) 257static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
288{ 258{
289 return lockres->l_type == OCFS2_LOCK_TYPE_META || 259 return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -316,7 +286,7 @@ static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *l
316static int ocfs2_lock_create(struct ocfs2_super *osb, 286static int ocfs2_lock_create(struct ocfs2_super *osb,
317 struct ocfs2_lock_res *lockres, 287 struct ocfs2_lock_res *lockres,
318 int level, 288 int level,
319 int dlm_flags); 289 u32 dlm_flags);
320static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres, 290static inline int ocfs2_may_continue_on_blocked_lock(struct ocfs2_lock_res *lockres,
321 int wanted); 291 int wanted);
322static void ocfs2_cluster_unlock(struct ocfs2_super *osb, 292static void ocfs2_cluster_unlock(struct ocfs2_super *osb,
@@ -330,10 +300,9 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb,
330 struct ocfs2_lock_res *lockres); 300 struct ocfs2_lock_res *lockres);
331static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, 301static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
332 int convert); 302 int convert);
333#define ocfs2_log_dlm_error(_func, _stat, _lockres) do { \ 303#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \
334 mlog(ML_ERROR, "Dlm error \"%s\" while calling %s on " \ 304 mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \
335 "resource %s: %s\n", dlm_errname(_stat), _func, \ 305 _err, _func, _lockres->l_name); \
336 _lockres->l_name, dlm_errmsg(_stat)); \
337} while (0) 306} while (0)
338static int ocfs2_downconvert_thread(void *arg); 307static int ocfs2_downconvert_thread(void *arg);
339static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, 308static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
@@ -342,12 +311,13 @@ static int ocfs2_inode_lock_update(struct inode *inode,
342 struct buffer_head **bh); 311 struct buffer_head **bh);
343static void ocfs2_drop_osb_locks(struct ocfs2_super *osb); 312static void ocfs2_drop_osb_locks(struct ocfs2_super *osb);
344static inline int ocfs2_highest_compat_lock_level(int level); 313static inline int ocfs2_highest_compat_lock_level(int level);
345static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 314static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
346 int new_level); 315 int new_level);
347static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 316static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
348 struct ocfs2_lock_res *lockres, 317 struct ocfs2_lock_res *lockres,
349 int new_level, 318 int new_level,
350 int lvb); 319 int lvb,
320 unsigned int generation);
351static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 321static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
352 struct ocfs2_lock_res *lockres); 322 struct ocfs2_lock_res *lockres);
353static int ocfs2_cancel_convert(struct ocfs2_super *osb, 323static int ocfs2_cancel_convert(struct ocfs2_super *osb,
@@ -406,9 +376,9 @@ static void ocfs2_lock_res_init_common(struct ocfs2_super *osb,
406 res->l_ops = ops; 376 res->l_ops = ops;
407 res->l_priv = priv; 377 res->l_priv = priv;
408 378
409 res->l_level = LKM_IVMODE; 379 res->l_level = DLM_LOCK_IV;
410 res->l_requested = LKM_IVMODE; 380 res->l_requested = DLM_LOCK_IV;
411 res->l_blocking = LKM_IVMODE; 381 res->l_blocking = DLM_LOCK_IV;
412 res->l_action = OCFS2_AST_INVALID; 382 res->l_action = OCFS2_AST_INVALID;
413 res->l_unlock_action = OCFS2_UNLOCK_INVALID; 383 res->l_unlock_action = OCFS2_UNLOCK_INVALID;
414 384
@@ -604,10 +574,10 @@ static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres,
604 BUG_ON(!lockres); 574 BUG_ON(!lockres);
605 575
606 switch(level) { 576 switch(level) {
607 case LKM_EXMODE: 577 case DLM_LOCK_EX:
608 lockres->l_ex_holders++; 578 lockres->l_ex_holders++;
609 break; 579 break;
610 case LKM_PRMODE: 580 case DLM_LOCK_PR:
611 lockres->l_ro_holders++; 581 lockres->l_ro_holders++;
612 break; 582 break;
613 default: 583 default:
@@ -625,11 +595,11 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
625 BUG_ON(!lockres); 595 BUG_ON(!lockres);
626 596
627 switch(level) { 597 switch(level) {
628 case LKM_EXMODE: 598 case DLM_LOCK_EX:
629 BUG_ON(!lockres->l_ex_holders); 599 BUG_ON(!lockres->l_ex_holders);
630 lockres->l_ex_holders--; 600 lockres->l_ex_holders--;
631 break; 601 break;
632 case LKM_PRMODE: 602 case DLM_LOCK_PR:
633 BUG_ON(!lockres->l_ro_holders); 603 BUG_ON(!lockres->l_ro_holders);
634 lockres->l_ro_holders--; 604 lockres->l_ro_holders--;
635 break; 605 break;
@@ -644,12 +614,12 @@ static inline void ocfs2_dec_holders(struct ocfs2_lock_res *lockres,
644 * lock types are added. */ 614 * lock types are added. */
645static inline int ocfs2_highest_compat_lock_level(int level) 615static inline int ocfs2_highest_compat_lock_level(int level)
646{ 616{
647 int new_level = LKM_EXMODE; 617 int new_level = DLM_LOCK_EX;
648 618
649 if (level == LKM_EXMODE) 619 if (level == DLM_LOCK_EX)
650 new_level = LKM_NLMODE; 620 new_level = DLM_LOCK_NL;
651 else if (level == LKM_PRMODE) 621 else if (level == DLM_LOCK_PR)
652 new_level = LKM_PRMODE; 622 new_level = DLM_LOCK_PR;
653 return new_level; 623 return new_level;
654} 624}
655 625
@@ -688,12 +658,12 @@ static inline void ocfs2_generic_handle_downconvert_action(struct ocfs2_lock_res
688 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY)); 658 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
689 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED)); 659 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_ATTACHED));
690 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED)); 660 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BLOCKED));
691 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 661 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
692 662
693 lockres->l_level = lockres->l_requested; 663 lockres->l_level = lockres->l_requested;
694 if (lockres->l_level <= 664 if (lockres->l_level <=
695 ocfs2_highest_compat_lock_level(lockres->l_blocking)) { 665 ocfs2_highest_compat_lock_level(lockres->l_blocking)) {
696 lockres->l_blocking = LKM_NLMODE; 666 lockres->l_blocking = DLM_LOCK_NL;
697 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED); 667 lockres_clear_flags(lockres, OCFS2_LOCK_BLOCKED);
698 } 668 }
699 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 669 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
@@ -712,7 +682,7 @@ static inline void ocfs2_generic_handle_convert_action(struct ocfs2_lock_res *lo
712 * information is already up to date. Convert from NL to 682 * information is already up to date. Convert from NL to
713 * *anything* however should mark ourselves as needing an 683 * *anything* however should mark ourselves as needing an
714 * update */ 684 * update */
715 if (lockres->l_level == LKM_NLMODE && 685 if (lockres->l_level == DLM_LOCK_NL &&
716 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 686 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
717 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 687 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
718 688
@@ -729,7 +699,7 @@ static inline void ocfs2_generic_handle_attach_action(struct ocfs2_lock_res *loc
729 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY))); 699 BUG_ON((!(lockres->l_flags & OCFS2_LOCK_BUSY)));
730 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 700 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
731 701
732 if (lockres->l_requested > LKM_NLMODE && 702 if (lockres->l_requested > DLM_LOCK_NL &&
733 !(lockres->l_flags & OCFS2_LOCK_LOCAL) && 703 !(lockres->l_flags & OCFS2_LOCK_LOCAL) &&
734 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH) 704 lockres->l_ops->flags & LOCK_TYPE_REQUIRES_REFRESH)
735 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH); 705 lockres_or_flags(lockres, OCFS2_LOCK_NEEDS_REFRESH);
@@ -767,6 +737,113 @@ static int ocfs2_generic_handle_bast(struct ocfs2_lock_res *lockres,
767 return needs_downconvert; 737 return needs_downconvert;
768} 738}
769 739
740/*
741 * OCFS2_LOCK_PENDING and l_pending_gen.
742 *
743 * Why does OCFS2_LOCK_PENDING exist? To close a race between setting
744 * OCFS2_LOCK_BUSY and calling ocfs2_dlm_lock(). See ocfs2_unblock_lock()
745 * for more details on the race.
746 *
747 * OCFS2_LOCK_PENDING closes the race quite nicely. However, it introduces
748 * a race of its own. In o2dlm, we can get the ast before ocfs2_dlm_lock()
749 * returns. The ast clears OCFS2_LOCK_BUSY, and must therefore clear
750 * OCFS2_LOCK_PENDING at the same time. When ocfs2_dlm_lock() returns,
751 * the caller is going to try to clear PENDING again. If nothing else is
752 * happening, __lockres_clear_pending() sees PENDING is unset and does
753 * nothing.
754 *
755 * But what if another path (eg downconvert thread) has just started a
756 * new locking action? The other path has re-set PENDING. Our path
757 * cannot clear PENDING, because that will re-open the original race
758 * window.
759 *
760 * [Example]
761 *
762 * ocfs2_meta_lock()
763 *  ocfs2_cluster_lock()
764 *   set BUSY
765 *   set PENDING
766 *   drop l_lock
767 *   ocfs2_dlm_lock()
768 *    ocfs2_locking_ast()           ocfs2_downconvert_thread()
769 *     clear PENDING                 ocfs2_unblock_lock()
770 *                                    take l_lock
771 *                                    !BUSY
772 *                                    ocfs2_prepare_downconvert()
773 *                                     set BUSY
774 *                                     set PENDING
775 *                                    drop l_lock
776 *   take l_lock
777 *   clear PENDING
778 *   drop l_lock
779 *                      <window>
780 *                                    ocfs2_dlm_lock()
781 *
782 * So as you can see, we now have a window where l_lock is not held,
783 * PENDING is not set, and ocfs2_dlm_lock() has not been called.
784 *
785 * The core problem is that ocfs2_cluster_lock() has cleared the PENDING
786 * set by ocfs2_prepare_downconvert(). That wasn't nice.
787 *
788 * To solve this we introduce l_pending_gen. A call to
789 * lockres_clear_pending() will only do so when it is passed a generation
790 * number that matches the lockres. lockres_set_pending() will return the
791 * current generation number. When ocfs2_cluster_lock() goes to clear
792 * PENDING, it passes the generation it got from set_pending(). In our
793 * example above, the generation numbers will *not* match. Thus,
794 * ocfs2_cluster_lock() will not clear the PENDING set by
795 * ocfs2_prepare_downconvert().
796 */
797
798/* Unlocked version for ocfs2_locking_ast() */
799static void __lockres_clear_pending(struct ocfs2_lock_res *lockres,
800 unsigned int generation,
801 struct ocfs2_super *osb)
802{
803 assert_spin_locked(&lockres->l_lock);
804
805 /*
806 * The ast and locking functions can race us here. The winner
807 * will clear pending, the loser will not.
808 */
809 if (!(lockres->l_flags & OCFS2_LOCK_PENDING) ||
810 (lockres->l_pending_gen != generation))
811 return;
812
813 lockres_clear_flags(lockres, OCFS2_LOCK_PENDING);
814 lockres->l_pending_gen++;
815
816 /*
817 * The downconvert thread may have skipped us because we
818 * were PENDING. Wake it up.
819 */
820 if (lockres->l_flags & OCFS2_LOCK_BLOCKED)
821 ocfs2_wake_downconvert_thread(osb);
822}
823
824/* Locked version for callers of ocfs2_dlm_lock() */
825static void lockres_clear_pending(struct ocfs2_lock_res *lockres,
826 unsigned int generation,
827 struct ocfs2_super *osb)
828{
829 unsigned long flags;
830
831 spin_lock_irqsave(&lockres->l_lock, flags);
832 __lockres_clear_pending(lockres, generation, osb);
833 spin_unlock_irqrestore(&lockres->l_lock, flags);
834}
835
836static unsigned int lockres_set_pending(struct ocfs2_lock_res *lockres)
837{
838 assert_spin_locked(&lockres->l_lock);
839 BUG_ON(!(lockres->l_flags & OCFS2_LOCK_BUSY));
840
841 lockres_or_flags(lockres, OCFS2_LOCK_PENDING);
842
843 return lockres->l_pending_gen;
844}
845
846
770static void ocfs2_blocking_ast(void *opaque, int level) 847static void ocfs2_blocking_ast(void *opaque, int level)
771{ 848{
772 struct ocfs2_lock_res *lockres = opaque; 849 struct ocfs2_lock_res *lockres = opaque;
@@ -774,7 +851,7 @@ static void ocfs2_blocking_ast(void *opaque, int level)
774 int needs_downconvert; 851 int needs_downconvert;
775 unsigned long flags; 852 unsigned long flags;
776 853
777 BUG_ON(level <= LKM_NLMODE); 854 BUG_ON(level <= DLM_LOCK_NL);
778 855
779 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n", 856 mlog(0, "BAST fired for lockres %s, blocking %d, level %d type %s\n",
780 lockres->l_name, level, lockres->l_level, 857 lockres->l_name, level, lockres->l_level,
@@ -801,14 +878,22 @@ static void ocfs2_blocking_ast(void *opaque, int level)
801static void ocfs2_locking_ast(void *opaque) 878static void ocfs2_locking_ast(void *opaque)
802{ 879{
803 struct ocfs2_lock_res *lockres = opaque; 880 struct ocfs2_lock_res *lockres = opaque;
804 struct dlm_lockstatus *lksb = &lockres->l_lksb; 881 struct ocfs2_super *osb = ocfs2_get_lockres_osb(lockres);
805 unsigned long flags; 882 unsigned long flags;
883 int status;
806 884
807 spin_lock_irqsave(&lockres->l_lock, flags); 885 spin_lock_irqsave(&lockres->l_lock, flags);
808 886
809 if (lksb->status != DLM_NORMAL) { 887 status = ocfs2_dlm_lock_status(&lockres->l_lksb);
810 mlog(ML_ERROR, "lockres %s: lksb status value of %u!\n", 888
811 lockres->l_name, lksb->status); 889 if (status == -EAGAIN) {
890 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
891 goto out;
892 }
893
894 if (status) {
895 mlog(ML_ERROR, "lockres %s: lksb status value of %d!\n",
896 lockres->l_name, status);
812 spin_unlock_irqrestore(&lockres->l_lock, flags); 897 spin_unlock_irqrestore(&lockres->l_lock, flags);
813 return; 898 return;
814 } 899 }
@@ -831,11 +916,23 @@ static void ocfs2_locking_ast(void *opaque)
831 lockres->l_unlock_action); 916 lockres->l_unlock_action);
832 BUG(); 917 BUG();
833 } 918 }
834 919out:
835 /* set it to something invalid so if we get called again we 920 /* set it to something invalid so if we get called again we
836 * can catch it. */ 921 * can catch it. */
837 lockres->l_action = OCFS2_AST_INVALID; 922 lockres->l_action = OCFS2_AST_INVALID;
838 923
924 /* Did we try to cancel this lock? Clear that state */
925 if (lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT)
926 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
927
928 /*
929 * We may have beaten the locking functions here. We certainly
930 * know that dlm_lock() has been called :-)
931 * Because we can't have two lock calls in flight at once, we
932 * can use lockres->l_pending_gen.
933 */
934 __lockres_clear_pending(lockres, lockres->l_pending_gen, osb);
935
839 wake_up(&lockres->l_event); 936 wake_up(&lockres->l_event);
840 spin_unlock_irqrestore(&lockres->l_lock, flags); 937 spin_unlock_irqrestore(&lockres->l_lock, flags);
841} 938}
@@ -865,15 +962,15 @@ static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres,
865static int ocfs2_lock_create(struct ocfs2_super *osb, 962static int ocfs2_lock_create(struct ocfs2_super *osb,
866 struct ocfs2_lock_res *lockres, 963 struct ocfs2_lock_res *lockres,
867 int level, 964 int level,
868 int dlm_flags) 965 u32 dlm_flags)
869{ 966{
870 int ret = 0; 967 int ret = 0;
871 enum dlm_status status = DLM_NORMAL;
872 unsigned long flags; 968 unsigned long flags;
969 unsigned int gen;
873 970
874 mlog_entry_void(); 971 mlog_entry_void();
875 972
876 mlog(0, "lock %s, level = %d, flags = %d\n", lockres->l_name, level, 973 mlog(0, "lock %s, level = %d, flags = %u\n", lockres->l_name, level,
877 dlm_flags); 974 dlm_flags);
878 975
879 spin_lock_irqsave(&lockres->l_lock, flags); 976 spin_lock_irqsave(&lockres->l_lock, flags);
@@ -886,24 +983,23 @@ static int ocfs2_lock_create(struct ocfs2_super *osb,
886 lockres->l_action = OCFS2_AST_ATTACH; 983 lockres->l_action = OCFS2_AST_ATTACH;
887 lockres->l_requested = level; 984 lockres->l_requested = level;
888 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 985 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
986 gen = lockres_set_pending(lockres);
889 spin_unlock_irqrestore(&lockres->l_lock, flags); 987 spin_unlock_irqrestore(&lockres->l_lock, flags);
890 988
891 status = dlmlock(osb->dlm, 989 ret = ocfs2_dlm_lock(osb->cconn,
892 level, 990 level,
893 &lockres->l_lksb, 991 &lockres->l_lksb,
894 dlm_flags, 992 dlm_flags,
895 lockres->l_name, 993 lockres->l_name,
896 OCFS2_LOCK_ID_MAX_LEN - 1, 994 OCFS2_LOCK_ID_MAX_LEN - 1,
897 ocfs2_locking_ast, 995 lockres);
898 lockres, 996 lockres_clear_pending(lockres, gen, osb);
899 ocfs2_blocking_ast); 997 if (ret) {
900 if (status != DLM_NORMAL) { 998 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
901 ocfs2_log_dlm_error("dlmlock", status, lockres);
902 ret = -EINVAL;
903 ocfs2_recover_from_dlm_error(lockres, 1); 999 ocfs2_recover_from_dlm_error(lockres, 1);
904 } 1000 }
905 1001
906 mlog(0, "lock %s, successfull return from dlmlock\n", lockres->l_name); 1002 mlog(0, "lock %s, return from ocfs2_dlm_lock\n", lockres->l_name);
907 1003
908bail: 1004bail:
909 mlog_exit(ret); 1005 mlog_exit(ret);
@@ -1016,21 +1112,22 @@ static int ocfs2_wait_for_mask_interruptible(struct ocfs2_mask_waiter *mw,
1016static int ocfs2_cluster_lock(struct ocfs2_super *osb, 1112static int ocfs2_cluster_lock(struct ocfs2_super *osb,
1017 struct ocfs2_lock_res *lockres, 1113 struct ocfs2_lock_res *lockres,
1018 int level, 1114 int level,
1019 int lkm_flags, 1115 u32 lkm_flags,
1020 int arg_flags) 1116 int arg_flags)
1021{ 1117{
1022 struct ocfs2_mask_waiter mw; 1118 struct ocfs2_mask_waiter mw;
1023 enum dlm_status status;
1024 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR); 1119 int wait, catch_signals = !(osb->s_mount_opt & OCFS2_MOUNT_NOINTR);
1025 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */ 1120 int ret = 0; /* gcc doesn't realize wait = 1 guarantees ret is set */
1026 unsigned long flags; 1121 unsigned long flags;
1122 unsigned int gen;
1123 int noqueue_attempted = 0;
1027 1124
1028 mlog_entry_void(); 1125 mlog_entry_void();
1029 1126
1030 ocfs2_init_mask_waiter(&mw); 1127 ocfs2_init_mask_waiter(&mw);
1031 1128
1032 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 1129 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
1033 lkm_flags |= LKM_VALBLK; 1130 lkm_flags |= DLM_LKF_VALBLK;
1034 1131
1035again: 1132again:
1036 wait = 0; 1133 wait = 0;
@@ -1068,52 +1165,56 @@ again:
1068 } 1165 }
1069 1166
1070 if (level > lockres->l_level) { 1167 if (level > lockres->l_level) {
1168 if (noqueue_attempted > 0) {
1169 ret = -EAGAIN;
1170 goto unlock;
1171 }
1172 if (lkm_flags & DLM_LKF_NOQUEUE)
1173 noqueue_attempted = 1;
1174
1071 if (lockres->l_action != OCFS2_AST_INVALID) 1175 if (lockres->l_action != OCFS2_AST_INVALID)
1072 mlog(ML_ERROR, "lockres %s has action %u pending\n", 1176 mlog(ML_ERROR, "lockres %s has action %u pending\n",
1073 lockres->l_name, lockres->l_action); 1177 lockres->l_name, lockres->l_action);
1074 1178
1075 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) { 1179 if (!(lockres->l_flags & OCFS2_LOCK_ATTACHED)) {
1076 lockres->l_action = OCFS2_AST_ATTACH; 1180 lockres->l_action = OCFS2_AST_ATTACH;
1077 lkm_flags &= ~LKM_CONVERT; 1181 lkm_flags &= ~DLM_LKF_CONVERT;
1078 } else { 1182 } else {
1079 lockres->l_action = OCFS2_AST_CONVERT; 1183 lockres->l_action = OCFS2_AST_CONVERT;
1080 lkm_flags |= LKM_CONVERT; 1184 lkm_flags |= DLM_LKF_CONVERT;
1081 } 1185 }
1082 1186
1083 lockres->l_requested = level; 1187 lockres->l_requested = level;
1084 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 1188 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
1189 gen = lockres_set_pending(lockres);
1085 spin_unlock_irqrestore(&lockres->l_lock, flags); 1190 spin_unlock_irqrestore(&lockres->l_lock, flags);
1086 1191
1087 BUG_ON(level == LKM_IVMODE); 1192 BUG_ON(level == DLM_LOCK_IV);
1088 BUG_ON(level == LKM_NLMODE); 1193 BUG_ON(level == DLM_LOCK_NL);
1089 1194
1090 mlog(0, "lock %s, convert from %d to level = %d\n", 1195 mlog(0, "lock %s, convert from %d to level = %d\n",
1091 lockres->l_name, lockres->l_level, level); 1196 lockres->l_name, lockres->l_level, level);
1092 1197
1093 /* call dlm_lock to upgrade lock now */ 1198 /* call dlm_lock to upgrade lock now */
1094 status = dlmlock(osb->dlm, 1199 ret = ocfs2_dlm_lock(osb->cconn,
1095 level, 1200 level,
1096 &lockres->l_lksb, 1201 &lockres->l_lksb,
1097 lkm_flags, 1202 lkm_flags,
1098 lockres->l_name, 1203 lockres->l_name,
1099 OCFS2_LOCK_ID_MAX_LEN - 1, 1204 OCFS2_LOCK_ID_MAX_LEN - 1,
1100 ocfs2_locking_ast, 1205 lockres);
1101 lockres, 1206 lockres_clear_pending(lockres, gen, osb);
1102 ocfs2_blocking_ast); 1207 if (ret) {
1103 if (status != DLM_NORMAL) { 1208 if (!(lkm_flags & DLM_LKF_NOQUEUE) ||
1104 if ((lkm_flags & LKM_NOQUEUE) && 1209 (ret != -EAGAIN)) {
1105 (status == DLM_NOTQUEUED)) 1210 ocfs2_log_dlm_error("ocfs2_dlm_lock",
1106 ret = -EAGAIN; 1211 ret, lockres);
1107 else {
1108 ocfs2_log_dlm_error("dlmlock", status,
1109 lockres);
1110 ret = -EINVAL;
1111 } 1212 }
1112 ocfs2_recover_from_dlm_error(lockres, 1); 1213 ocfs2_recover_from_dlm_error(lockres, 1);
1113 goto out; 1214 goto out;
1114 } 1215 }
1115 1216
1116 mlog(0, "lock %s, successfull return from dlmlock\n", 1217 mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n",
1117 lockres->l_name); 1218 lockres->l_name);
1118 1219
1119 /* At this point we've gone inside the dlm and need to 1220 /* At this point we've gone inside the dlm and need to
@@ -1177,9 +1278,9 @@ static int ocfs2_create_new_lock(struct ocfs2_super *osb,
1177 int ex, 1278 int ex,
1178 int local) 1279 int local)
1179{ 1280{
1180 int level = ex ? LKM_EXMODE : LKM_PRMODE; 1281 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1181 unsigned long flags; 1282 unsigned long flags;
1182 int lkm_flags = local ? LKM_LOCAL : 0; 1283 u32 lkm_flags = local ? DLM_LKF_LOCAL : 0;
1183 1284
1184 spin_lock_irqsave(&lockres->l_lock, flags); 1285 spin_lock_irqsave(&lockres->l_lock, flags);
1185 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED); 1286 BUG_ON(lockres->l_flags & OCFS2_LOCK_ATTACHED);
@@ -1222,7 +1323,7 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
1222 } 1323 }
1223 1324
1224 /* 1325 /*
1225 * We don't want to use LKM_LOCAL on a meta data lock as they 1326 * We don't want to use DLM_LKF_LOCAL on a meta data lock as they
1226 * don't use a generation in their lock names. 1327 * don't use a generation in their lock names.
1227 */ 1328 */
1228 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0); 1329 ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_inode_lockres, 1, 0);
@@ -1261,7 +1362,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1261 1362
1262 lockres = &OCFS2_I(inode)->ip_rw_lockres; 1363 lockres = &OCFS2_I(inode)->ip_rw_lockres;
1263 1364
1264 level = write ? LKM_EXMODE : LKM_PRMODE; 1365 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1265 1366
1266 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0, 1367 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, level, 0,
1267 0); 1368 0);
@@ -1274,7 +1375,7 @@ int ocfs2_rw_lock(struct inode *inode, int write)
1274 1375
1275void ocfs2_rw_unlock(struct inode *inode, int write) 1376void ocfs2_rw_unlock(struct inode *inode, int write)
1276{ 1377{
1277 int level = write ? LKM_EXMODE : LKM_PRMODE; 1378 int level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1278 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres; 1379 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_rw_lockres;
1279 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1380 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1280 1381
@@ -1312,7 +1413,7 @@ int ocfs2_open_lock(struct inode *inode)
1312 lockres = &OCFS2_I(inode)->ip_open_lockres; 1413 lockres = &OCFS2_I(inode)->ip_open_lockres;
1313 1414
1314 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1415 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1315 LKM_PRMODE, 0, 0); 1416 DLM_LOCK_PR, 0, 0);
1316 if (status < 0) 1417 if (status < 0)
1317 mlog_errno(status); 1418 mlog_errno(status);
1318 1419
@@ -1340,16 +1441,16 @@ int ocfs2_try_open_lock(struct inode *inode, int write)
1340 1441
1341 lockres = &OCFS2_I(inode)->ip_open_lockres; 1442 lockres = &OCFS2_I(inode)->ip_open_lockres;
1342 1443
1343 level = write ? LKM_EXMODE : LKM_PRMODE; 1444 level = write ? DLM_LOCK_EX : DLM_LOCK_PR;
1344 1445
1345 /* 1446 /*
1346 * The file system may already holding a PRMODE/EXMODE open lock. 1447 * The file system may already holding a PRMODE/EXMODE open lock.
1347 * Since we pass LKM_NOQUEUE, the request won't block waiting on 1448 * Since we pass DLM_LKF_NOQUEUE, the request won't block waiting on
1348 * other nodes and the -EAGAIN will indicate to the caller that 1449 * other nodes and the -EAGAIN will indicate to the caller that
1349 * this inode is still in use. 1450 * this inode is still in use.
1350 */ 1451 */
1351 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres, 1452 status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
1352 level, LKM_NOQUEUE, 0); 1453 level, DLM_LKF_NOQUEUE, 0);
1353 1454
1354out: 1455out:
1355 mlog_exit(status); 1456 mlog_exit(status);
@@ -1374,10 +1475,10 @@ void ocfs2_open_unlock(struct inode *inode)
1374 1475
1375 if(lockres->l_ro_holders) 1476 if(lockres->l_ro_holders)
1376 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1477 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1377 LKM_PRMODE); 1478 DLM_LOCK_PR);
1378 if(lockres->l_ex_holders) 1479 if(lockres->l_ex_holders)
1379 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres, 1480 ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
1380 LKM_EXMODE); 1481 DLM_LOCK_EX);
1381 1482
1382out: 1483out:
1383 mlog_exit_void(); 1484 mlog_exit_void();
@@ -1464,7 +1565,7 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1464 ocfs2_init_mask_waiter(&mw); 1565 ocfs2_init_mask_waiter(&mw);
1465 1566
1466 if ((lockres->l_flags & OCFS2_LOCK_BUSY) || 1567 if ((lockres->l_flags & OCFS2_LOCK_BUSY) ||
1467 (lockres->l_level > LKM_NLMODE)) { 1568 (lockres->l_level > DLM_LOCK_NL)) {
1468 mlog(ML_ERROR, 1569 mlog(ML_ERROR,
1469 "File lock \"%s\" has busy or locked state: flags: 0x%lx, " 1570 "File lock \"%s\" has busy or locked state: flags: 0x%lx, "
1470 "level: %u\n", lockres->l_name, lockres->l_flags, 1571 "level: %u\n", lockres->l_name, lockres->l_flags,
@@ -1503,14 +1604,12 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1503 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1604 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1504 spin_unlock_irqrestore(&lockres->l_lock, flags); 1605 spin_unlock_irqrestore(&lockres->l_lock, flags);
1505 1606
1506 ret = dlmlock(osb->dlm, level, &lockres->l_lksb, lkm_flags, 1607 ret = ocfs2_dlm_lock(osb->cconn, level, &lockres->l_lksb, lkm_flags,
1507 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1, 1608 lockres->l_name, OCFS2_LOCK_ID_MAX_LEN - 1,
1508 ocfs2_locking_ast, lockres, ocfs2_blocking_ast); 1609 lockres);
1509 if (ret != DLM_NORMAL) { 1610 if (ret) {
1510 if (trylock && ret == DLM_NOTQUEUED) 1611 if (!trylock || (ret != -EAGAIN)) {
1511 ret = -EAGAIN; 1612 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
1512 else {
1513 ocfs2_log_dlm_error("dlmlock", ret, lockres);
1514 ret = -EINVAL; 1613 ret = -EINVAL;
1515 } 1614 }
1516 1615
@@ -1537,6 +1636,10 @@ int ocfs2_file_lock(struct file *file, int ex, int trylock)
1537 * to just bubble sucess back up to the user. 1636 * to just bubble sucess back up to the user.
1538 */ 1637 */
1539 ret = ocfs2_flock_handle_signal(lockres, level); 1638 ret = ocfs2_flock_handle_signal(lockres, level);
1639 } else if (!ret && (level > lockres->l_level)) {
1640 /* Trylock failed asynchronously */
1641 BUG_ON(!trylock);
1642 ret = -EAGAIN;
1540 } 1643 }
1541 1644
1542out: 1645out:
@@ -1549,6 +1652,7 @@ out:
1549void ocfs2_file_unlock(struct file *file) 1652void ocfs2_file_unlock(struct file *file)
1550{ 1653{
1551 int ret; 1654 int ret;
1655 unsigned int gen;
1552 unsigned long flags; 1656 unsigned long flags;
1553 struct ocfs2_file_private *fp = file->private_data; 1657 struct ocfs2_file_private *fp = file->private_data;
1554 struct ocfs2_lock_res *lockres = &fp->fp_flock; 1658 struct ocfs2_lock_res *lockres = &fp->fp_flock;
@@ -1572,13 +1676,13 @@ void ocfs2_file_unlock(struct file *file)
1572 * Fake a blocking ast for the downconvert code. 1676 * Fake a blocking ast for the downconvert code.
1573 */ 1677 */
1574 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED); 1678 lockres_or_flags(lockres, OCFS2_LOCK_BLOCKED);
1575 lockres->l_blocking = LKM_EXMODE; 1679 lockres->l_blocking = DLM_LOCK_EX;
1576 1680
1577 ocfs2_prepare_downconvert(lockres, LKM_NLMODE); 1681 gen = ocfs2_prepare_downconvert(lockres, LKM_NLMODE);
1578 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0); 1682 lockres_add_mask_waiter(lockres, &mw, OCFS2_LOCK_BUSY, 0);
1579 spin_unlock_irqrestore(&lockres->l_lock, flags); 1683 spin_unlock_irqrestore(&lockres->l_lock, flags);
1580 1684
1581 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0); 1685 ret = ocfs2_downconvert_lock(osb, lockres, LKM_NLMODE, 0, gen);
1582 if (ret) { 1686 if (ret) {
1583 mlog_errno(ret); 1687 mlog_errno(ret);
1584 return; 1688 return;
@@ -1601,11 +1705,11 @@ static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb,
1601 * condition. */ 1705 * condition. */
1602 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) { 1706 if (lockres->l_flags & OCFS2_LOCK_BLOCKED) {
1603 switch(lockres->l_blocking) { 1707 switch(lockres->l_blocking) {
1604 case LKM_EXMODE: 1708 case DLM_LOCK_EX:
1605 if (!lockres->l_ex_holders && !lockres->l_ro_holders) 1709 if (!lockres->l_ex_holders && !lockres->l_ro_holders)
1606 kick = 1; 1710 kick = 1;
1607 break; 1711 break;
1608 case LKM_PRMODE: 1712 case DLM_LOCK_PR:
1609 if (!lockres->l_ex_holders) 1713 if (!lockres->l_ex_holders)
1610 kick = 1; 1714 kick = 1;
1611 break; 1715 break;
@@ -1648,7 +1752,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
1648 1752
1649 mlog_entry_void(); 1753 mlog_entry_void();
1650 1754
1651 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1755 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1652 1756
1653 /* 1757 /*
1654 * Invalidate the LVB of a deleted inode - this way other 1758 * Invalidate the LVB of a deleted inode - this way other
@@ -1700,7 +1804,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1700 1804
1701 mlog_meta_lvb(0, lockres); 1805 mlog_meta_lvb(0, lockres);
1702 1806
1703 lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1807 lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1704 1808
1705 /* We're safe here without the lockres lock... */ 1809 /* We're safe here without the lockres lock... */
1706 spin_lock(&oi->ip_lock); 1810 spin_lock(&oi->ip_lock);
@@ -1735,7 +1839,8 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
1735static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, 1839static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
1736 struct ocfs2_lock_res *lockres) 1840 struct ocfs2_lock_res *lockres)
1737{ 1841{
1738 struct ocfs2_meta_lvb *lvb = (struct ocfs2_meta_lvb *) lockres->l_lksb.lvb; 1842 struct ocfs2_meta_lvb *lvb =
1843 (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
1739 1844
1740 if (lvb->lvb_version == OCFS2_LVB_VERSION 1845 if (lvb->lvb_version == OCFS2_LVB_VERSION
1741 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) 1846 && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -1923,7 +2028,8 @@ int ocfs2_inode_lock_full(struct inode *inode,
1923 int ex, 2028 int ex,
1924 int arg_flags) 2029 int arg_flags)
1925{ 2030{
1926 int status, level, dlm_flags, acquired; 2031 int status, level, acquired;
2032 u32 dlm_flags;
1927 struct ocfs2_lock_res *lockres = NULL; 2033 struct ocfs2_lock_res *lockres = NULL;
1928 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2034 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
1929 struct buffer_head *local_bh = NULL; 2035 struct buffer_head *local_bh = NULL;
@@ -1950,14 +2056,13 @@ int ocfs2_inode_lock_full(struct inode *inode,
1950 goto local; 2056 goto local;
1951 2057
1952 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2058 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1953 wait_event(osb->recovery_event, 2059 ocfs2_wait_for_recovery(osb);
1954 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1955 2060
1956 lockres = &OCFS2_I(inode)->ip_inode_lockres; 2061 lockres = &OCFS2_I(inode)->ip_inode_lockres;
1957 level = ex ? LKM_EXMODE : LKM_PRMODE; 2062 level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
1958 dlm_flags = 0; 2063 dlm_flags = 0;
1959 if (arg_flags & OCFS2_META_LOCK_NOQUEUE) 2064 if (arg_flags & OCFS2_META_LOCK_NOQUEUE)
1960 dlm_flags |= LKM_NOQUEUE; 2065 dlm_flags |= DLM_LKF_NOQUEUE;
1961 2066
1962 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags); 2067 status = ocfs2_cluster_lock(osb, lockres, level, dlm_flags, arg_flags);
1963 if (status < 0) { 2068 if (status < 0) {
@@ -1974,8 +2079,7 @@ int ocfs2_inode_lock_full(struct inode *inode,
1974 * committed to owning this lock so we don't allow signals to 2079 * committed to owning this lock so we don't allow signals to
1975 * abort the operation. */ 2080 * abort the operation. */
1976 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY)) 2081 if (!(arg_flags & OCFS2_META_LOCK_RECOVERY))
1977 wait_event(osb->recovery_event, 2082 ocfs2_wait_for_recovery(osb);
1978 ocfs2_node_map_is_empty(osb, &osb->recovery_map));
1979 2083
1980local: 2084local:
1981 /* 2085 /*
@@ -2109,7 +2213,7 @@ int ocfs2_inode_lock_atime(struct inode *inode,
2109void ocfs2_inode_unlock(struct inode *inode, 2213void ocfs2_inode_unlock(struct inode *inode,
2110 int ex) 2214 int ex)
2111{ 2215{
2112 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2216 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2113 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres; 2217 struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_inode_lockres;
2114 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 2218 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
2115 2219
@@ -2130,10 +2234,8 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2130 int ex) 2234 int ex)
2131{ 2235{
2132 int status = 0; 2236 int status = 0;
2133 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2237 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2134 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2238 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2135 struct buffer_head *bh;
2136 struct ocfs2_slot_info *si = osb->slot_info;
2137 2239
2138 mlog_entry_void(); 2240 mlog_entry_void();
2139 2241
@@ -2159,11 +2261,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb,
2159 goto bail; 2261 goto bail;
2160 } 2262 }
2161 if (status) { 2263 if (status) {
2162 bh = si->si_bh; 2264 status = ocfs2_refresh_slot_info(osb);
2163 status = ocfs2_read_block(osb, bh->b_blocknr, &bh, 0,
2164 si->si_inode);
2165 if (status == 0)
2166 ocfs2_update_slot_info(si);
2167 2265
2168 ocfs2_complete_lock_res_refresh(lockres, status); 2266 ocfs2_complete_lock_res_refresh(lockres, status);
2169 2267
@@ -2178,7 +2276,7 @@ bail:
2178void ocfs2_super_unlock(struct ocfs2_super *osb, 2276void ocfs2_super_unlock(struct ocfs2_super *osb,
2179 int ex) 2277 int ex)
2180{ 2278{
2181 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2279 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2182 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres; 2280 struct ocfs2_lock_res *lockres = &osb->osb_super_lockres;
2183 2281
2184 if (!ocfs2_mount_local(osb)) 2282 if (!ocfs2_mount_local(osb))
@@ -2196,7 +2294,7 @@ int ocfs2_rename_lock(struct ocfs2_super *osb)
2196 if (ocfs2_mount_local(osb)) 2294 if (ocfs2_mount_local(osb))
2197 return 0; 2295 return 0;
2198 2296
2199 status = ocfs2_cluster_lock(osb, lockres, LKM_EXMODE, 0, 0); 2297 status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, 0, 0);
2200 if (status < 0) 2298 if (status < 0)
2201 mlog_errno(status); 2299 mlog_errno(status);
2202 2300
@@ -2208,13 +2306,13 @@ void ocfs2_rename_unlock(struct ocfs2_super *osb)
2208 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres; 2306 struct ocfs2_lock_res *lockres = &osb->osb_rename_lockres;
2209 2307
2210 if (!ocfs2_mount_local(osb)) 2308 if (!ocfs2_mount_local(osb))
2211 ocfs2_cluster_unlock(osb, lockres, LKM_EXMODE); 2309 ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX);
2212} 2310}
2213 2311
2214int ocfs2_dentry_lock(struct dentry *dentry, int ex) 2312int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2215{ 2313{
2216 int ret; 2314 int ret;
2217 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2315 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2218 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2316 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2219 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2317 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2220 2318
@@ -2235,7 +2333,7 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex)
2235 2333
2236void ocfs2_dentry_unlock(struct dentry *dentry, int ex) 2334void ocfs2_dentry_unlock(struct dentry *dentry, int ex)
2237{ 2335{
2238 int level = ex ? LKM_EXMODE : LKM_PRMODE; 2336 int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
2239 struct ocfs2_dentry_lock *dl = dentry->d_fsdata; 2337 struct ocfs2_dentry_lock *dl = dentry->d_fsdata;
2240 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); 2338 struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
2241 2339
@@ -2400,7 +2498,7 @@ static int ocfs2_dlm_seq_show(struct seq_file *m, void *v)
2400 lockres->l_blocking); 2498 lockres->l_blocking);
2401 2499
2402 /* Dump the raw LVB */ 2500 /* Dump the raw LVB */
2403 lvb = lockres->l_lksb.lvb; 2501 lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
2404 for(i = 0; i < DLM_LVB_LEN; i++) 2502 for(i = 0; i < DLM_LVB_LEN; i++)
2405 seq_printf(m, "0x%x\t", lvb[i]); 2503 seq_printf(m, "0x%x\t", lvb[i]);
2406 2504
@@ -2504,13 +2602,14 @@ static void ocfs2_dlm_shutdown_debug(struct ocfs2_super *osb)
2504int ocfs2_dlm_init(struct ocfs2_super *osb) 2602int ocfs2_dlm_init(struct ocfs2_super *osb)
2505{ 2603{
2506 int status = 0; 2604 int status = 0;
2507 u32 dlm_key; 2605 struct ocfs2_cluster_connection *conn = NULL;
2508 struct dlm_ctxt *dlm = NULL;
2509 2606
2510 mlog_entry_void(); 2607 mlog_entry_void();
2511 2608
2512 if (ocfs2_mount_local(osb)) 2609 if (ocfs2_mount_local(osb)) {
2610 osb->node_num = 0;
2513 goto local; 2611 goto local;
2612 }
2514 2613
2515 status = ocfs2_dlm_init_debug(osb); 2614 status = ocfs2_dlm_init_debug(osb);
2516 if (status < 0) { 2615 if (status < 0) {
@@ -2527,26 +2626,31 @@ int ocfs2_dlm_init(struct ocfs2_super *osb)
2527 goto bail; 2626 goto bail;
2528 } 2627 }
2529 2628
2530 /* used by the dlm code to make message headers unique, each
2531 * node in this domain must agree on this. */
2532 dlm_key = crc32_le(0, osb->uuid_str, strlen(osb->uuid_str));
2533
2534 /* for now, uuid == domain */ 2629 /* for now, uuid == domain */
2535 dlm = dlm_register_domain(osb->uuid_str, dlm_key, 2630 status = ocfs2_cluster_connect(osb->osb_cluster_stack,
2536 &osb->osb_locking_proto); 2631 osb->uuid_str,
2537 if (IS_ERR(dlm)) { 2632 strlen(osb->uuid_str),
2538 status = PTR_ERR(dlm); 2633 ocfs2_do_node_down, osb,
2634 &conn);
2635 if (status) {
2539 mlog_errno(status); 2636 mlog_errno(status);
2540 goto bail; 2637 goto bail;
2541 } 2638 }
2542 2639
2543 dlm_register_eviction_cb(dlm, &osb->osb_eviction_cb); 2640 status = ocfs2_cluster_this_node(&osb->node_num);
2641 if (status < 0) {
2642 mlog_errno(status);
2643 mlog(ML_ERROR,
2644 "could not find this host's node number\n");
2645 ocfs2_cluster_disconnect(conn, 0);
2646 goto bail;
2647 }
2544 2648
2545local: 2649local:
2546 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb); 2650 ocfs2_super_lock_res_init(&osb->osb_super_lockres, osb);
2547 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb); 2651 ocfs2_rename_lock_res_init(&osb->osb_rename_lockres, osb);
2548 2652
2549 osb->dlm = dlm; 2653 osb->cconn = conn;
2550 2654
2551 status = 0; 2655 status = 0;
2552bail: 2656bail:
@@ -2560,14 +2664,19 @@ bail:
2560 return status; 2664 return status;
2561} 2665}
2562 2666
2563void ocfs2_dlm_shutdown(struct ocfs2_super *osb) 2667void ocfs2_dlm_shutdown(struct ocfs2_super *osb,
2668 int hangup_pending)
2564{ 2669{
2565 mlog_entry_void(); 2670 mlog_entry_void();
2566 2671
2567 dlm_unregister_eviction_cb(&osb->osb_eviction_cb);
2568
2569 ocfs2_drop_osb_locks(osb); 2672 ocfs2_drop_osb_locks(osb);
2570 2673
2674 /*
2675 * Now that we have dropped all locks and ocfs2_dismount_volume()
2676 * has disabled recovery, the DLM won't be talking to us. It's
2677 * safe to tear things down before disconnecting the cluster.
2678 */
2679
2571 if (osb->dc_task) { 2680 if (osb->dc_task) {
2572 kthread_stop(osb->dc_task); 2681 kthread_stop(osb->dc_task);
2573 osb->dc_task = NULL; 2682 osb->dc_task = NULL;
@@ -2576,15 +2685,15 @@ void ocfs2_dlm_shutdown(struct ocfs2_super *osb)
2576 ocfs2_lock_res_free(&osb->osb_super_lockres); 2685 ocfs2_lock_res_free(&osb->osb_super_lockres);
2577 ocfs2_lock_res_free(&osb->osb_rename_lockres); 2686 ocfs2_lock_res_free(&osb->osb_rename_lockres);
2578 2687
2579 dlm_unregister_domain(osb->dlm); 2688 ocfs2_cluster_disconnect(osb->cconn, hangup_pending);
2580 osb->dlm = NULL; 2689 osb->cconn = NULL;
2581 2690
2582 ocfs2_dlm_shutdown_debug(osb); 2691 ocfs2_dlm_shutdown_debug(osb);
2583 2692
2584 mlog_exit_void(); 2693 mlog_exit_void();
2585} 2694}
2586 2695
2587static void ocfs2_unlock_ast(void *opaque, enum dlm_status status) 2696static void ocfs2_unlock_ast(void *opaque, int error)
2588{ 2697{
2589 struct ocfs2_lock_res *lockres = opaque; 2698 struct ocfs2_lock_res *lockres = opaque;
2590 unsigned long flags; 2699 unsigned long flags;
@@ -2595,24 +2704,9 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2595 lockres->l_unlock_action); 2704 lockres->l_unlock_action);
2596 2705
2597 spin_lock_irqsave(&lockres->l_lock, flags); 2706 spin_lock_irqsave(&lockres->l_lock, flags);
2598 /* We tried to cancel a convert request, but it was already 2707 if (error) {
2599 * granted. All we want to do here is clear our unlock 2708 mlog(ML_ERROR, "Dlm passes error %d for lock %s, "
2600 * state. The wake_up call done at the bottom is redundant 2709 "unlock_action %d\n", error, lockres->l_name,
2601 * (ocfs2_prepare_cancel_convert doesn't sleep on this) but doesn't
2602 * hurt anything anyway */
2603 if (status == DLM_CANCELGRANT &&
2604 lockres->l_unlock_action == OCFS2_UNLOCK_CANCEL_CONVERT) {
2605 mlog(0, "Got cancelgrant for %s\n", lockres->l_name);
2606
2607 /* We don't clear the busy flag in this case as it
2608 * should have been cleared by the ast which the dlm
2609 * has called. */
2610 goto complete_unlock;
2611 }
2612
2613 if (status != DLM_NORMAL) {
2614 mlog(ML_ERROR, "Dlm passes status %d for lock %s, "
2615 "unlock_action %d\n", status, lockres->l_name,
2616 lockres->l_unlock_action); 2710 lockres->l_unlock_action);
2617 spin_unlock_irqrestore(&lockres->l_lock, flags); 2711 spin_unlock_irqrestore(&lockres->l_lock, flags);
2618 return; 2712 return;
@@ -2624,14 +2718,13 @@ static void ocfs2_unlock_ast(void *opaque, enum dlm_status status)
2624 lockres->l_action = OCFS2_AST_INVALID; 2718 lockres->l_action = OCFS2_AST_INVALID;
2625 break; 2719 break;
2626 case OCFS2_UNLOCK_DROP_LOCK: 2720 case OCFS2_UNLOCK_DROP_LOCK:
2627 lockres->l_level = LKM_IVMODE; 2721 lockres->l_level = DLM_LOCK_IV;
2628 break; 2722 break;
2629 default: 2723 default:
2630 BUG(); 2724 BUG();
2631 } 2725 }
2632 2726
2633 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY); 2727 lockres_clear_flags(lockres, OCFS2_LOCK_BUSY);
2634complete_unlock:
2635 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID; 2728 lockres->l_unlock_action = OCFS2_UNLOCK_INVALID;
2636 spin_unlock_irqrestore(&lockres->l_lock, flags); 2729 spin_unlock_irqrestore(&lockres->l_lock, flags);
2637 2730
@@ -2643,16 +2736,16 @@ complete_unlock:
2643static int ocfs2_drop_lock(struct ocfs2_super *osb, 2736static int ocfs2_drop_lock(struct ocfs2_super *osb,
2644 struct ocfs2_lock_res *lockres) 2737 struct ocfs2_lock_res *lockres)
2645{ 2738{
2646 enum dlm_status status; 2739 int ret;
2647 unsigned long flags; 2740 unsigned long flags;
2648 int lkm_flags = 0; 2741 u32 lkm_flags = 0;
2649 2742
2650 /* We didn't get anywhere near actually using this lockres. */ 2743 /* We didn't get anywhere near actually using this lockres. */
2651 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) 2744 if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED))
2652 goto out; 2745 goto out;
2653 2746
2654 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) 2747 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB)
2655 lkm_flags |= LKM_VALBLK; 2748 lkm_flags |= DLM_LKF_VALBLK;
2656 2749
2657 spin_lock_irqsave(&lockres->l_lock, flags); 2750 spin_lock_irqsave(&lockres->l_lock, flags);
2658 2751
@@ -2678,7 +2771,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2678 2771
2679 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 2772 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
2680 if (lockres->l_flags & OCFS2_LOCK_ATTACHED && 2773 if (lockres->l_flags & OCFS2_LOCK_ATTACHED &&
2681 lockres->l_level == LKM_EXMODE && 2774 lockres->l_level == DLM_LOCK_EX &&
2682 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH)) 2775 !(lockres->l_flags & OCFS2_LOCK_NEEDS_REFRESH))
2683 lockres->l_ops->set_lvb(lockres); 2776 lockres->l_ops->set_lvb(lockres);
2684 } 2777 }
@@ -2707,15 +2800,15 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb,
2707 2800
2708 mlog(0, "lock %s\n", lockres->l_name); 2801 mlog(0, "lock %s\n", lockres->l_name);
2709 2802
2710 status = dlmunlock(osb->dlm, &lockres->l_lksb, lkm_flags, 2803 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb, lkm_flags,
2711 ocfs2_unlock_ast, lockres); 2804 lockres);
2712 if (status != DLM_NORMAL) { 2805 if (ret) {
2713 ocfs2_log_dlm_error("dlmunlock", status, lockres); 2806 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2714 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags); 2807 mlog(ML_ERROR, "lockres flags: %lu\n", lockres->l_flags);
2715 dlm_print_one_lock(lockres->l_lksb.lockid); 2808 ocfs2_dlm_dump_lksb(&lockres->l_lksb);
2716 BUG(); 2809 BUG();
2717 } 2810 }
2718 mlog(0, "lock %s, successfull return from dlmunlock\n", 2811 mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n",
2719 lockres->l_name); 2812 lockres->l_name);
2720 2813
2721 ocfs2_wait_on_busy_lock(lockres); 2814 ocfs2_wait_on_busy_lock(lockres);
@@ -2806,15 +2899,15 @@ int ocfs2_drop_inode_locks(struct inode *inode)
2806 return status; 2899 return status;
2807} 2900}
2808 2901
2809static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres, 2902static unsigned int ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2810 int new_level) 2903 int new_level)
2811{ 2904{
2812 assert_spin_locked(&lockres->l_lock); 2905 assert_spin_locked(&lockres->l_lock);
2813 2906
2814 BUG_ON(lockres->l_blocking <= LKM_NLMODE); 2907 BUG_ON(lockres->l_blocking <= DLM_LOCK_NL);
2815 2908
2816 if (lockres->l_level <= new_level) { 2909 if (lockres->l_level <= new_level) {
2817 mlog(ML_ERROR, "lockres->l_level (%u) <= new_level (%u)\n", 2910 mlog(ML_ERROR, "lockres->l_level (%d) <= new_level (%d)\n",
2818 lockres->l_level, new_level); 2911 lockres->l_level, new_level);
2819 BUG(); 2912 BUG();
2820 } 2913 }
@@ -2825,33 +2918,33 @@ static void ocfs2_prepare_downconvert(struct ocfs2_lock_res *lockres,
2825 lockres->l_action = OCFS2_AST_DOWNCONVERT; 2918 lockres->l_action = OCFS2_AST_DOWNCONVERT;
2826 lockres->l_requested = new_level; 2919 lockres->l_requested = new_level;
2827 lockres_or_flags(lockres, OCFS2_LOCK_BUSY); 2920 lockres_or_flags(lockres, OCFS2_LOCK_BUSY);
2921 return lockres_set_pending(lockres);
2828} 2922}
2829 2923
2830static int ocfs2_downconvert_lock(struct ocfs2_super *osb, 2924static int ocfs2_downconvert_lock(struct ocfs2_super *osb,
2831 struct ocfs2_lock_res *lockres, 2925 struct ocfs2_lock_res *lockres,
2832 int new_level, 2926 int new_level,
2833 int lvb) 2927 int lvb,
2928 unsigned int generation)
2834{ 2929{
2835 int ret, dlm_flags = LKM_CONVERT; 2930 int ret;
2836 enum dlm_status status; 2931 u32 dlm_flags = DLM_LKF_CONVERT;
2837 2932
2838 mlog_entry_void(); 2933 mlog_entry_void();
2839 2934
2840 if (lvb) 2935 if (lvb)
2841 dlm_flags |= LKM_VALBLK; 2936 dlm_flags |= DLM_LKF_VALBLK;
2842 2937
2843 status = dlmlock(osb->dlm, 2938 ret = ocfs2_dlm_lock(osb->cconn,
2844 new_level, 2939 new_level,
2845 &lockres->l_lksb, 2940 &lockres->l_lksb,
2846 dlm_flags, 2941 dlm_flags,
2847 lockres->l_name, 2942 lockres->l_name,
2848 OCFS2_LOCK_ID_MAX_LEN - 1, 2943 OCFS2_LOCK_ID_MAX_LEN - 1,
2849 ocfs2_locking_ast, 2944 lockres);
2850 lockres, 2945 lockres_clear_pending(lockres, generation, osb);
2851 ocfs2_blocking_ast); 2946 if (ret) {
2852 if (status != DLM_NORMAL) { 2947 ocfs2_log_dlm_error("ocfs2_dlm_lock", ret, lockres);
2853 ocfs2_log_dlm_error("dlmlock", status, lockres);
2854 ret = -EINVAL;
2855 ocfs2_recover_from_dlm_error(lockres, 1); 2948 ocfs2_recover_from_dlm_error(lockres, 1);
2856 goto bail; 2949 goto bail;
2857 } 2950 }
@@ -2862,7 +2955,7 @@ bail:
2862 return ret; 2955 return ret;
2863} 2956}
2864 2957
2865/* returns 1 when the caller should unlock and call dlmunlock */ 2958/* returns 1 when the caller should unlock and call ocfs2_dlm_unlock */
2866static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb, 2959static int ocfs2_prepare_cancel_convert(struct ocfs2_super *osb,
2867 struct ocfs2_lock_res *lockres) 2960 struct ocfs2_lock_res *lockres)
2868{ 2961{
@@ -2898,24 +2991,18 @@ static int ocfs2_cancel_convert(struct ocfs2_super *osb,
2898 struct ocfs2_lock_res *lockres) 2991 struct ocfs2_lock_res *lockres)
2899{ 2992{
2900 int ret; 2993 int ret;
2901 enum dlm_status status;
2902 2994
2903 mlog_entry_void(); 2995 mlog_entry_void();
2904 mlog(0, "lock %s\n", lockres->l_name); 2996 mlog(0, "lock %s\n", lockres->l_name);
2905 2997
2906 ret = 0; 2998 ret = ocfs2_dlm_unlock(osb->cconn, &lockres->l_lksb,
2907 status = dlmunlock(osb->dlm, 2999 DLM_LKF_CANCEL, lockres);
2908 &lockres->l_lksb, 3000 if (ret) {
2909 LKM_CANCEL, 3001 ocfs2_log_dlm_error("ocfs2_dlm_unlock", ret, lockres);
2910 ocfs2_unlock_ast,
2911 lockres);
2912 if (status != DLM_NORMAL) {
2913 ocfs2_log_dlm_error("dlmunlock", status, lockres);
2914 ret = -EINVAL;
2915 ocfs2_recover_from_dlm_error(lockres, 0); 3002 ocfs2_recover_from_dlm_error(lockres, 0);
2916 } 3003 }
2917 3004
2918 mlog(0, "lock %s return from dlmunlock\n", lockres->l_name); 3005 mlog(0, "lock %s return from ocfs2_dlm_unlock\n", lockres->l_name);
2919 3006
2920 mlog_exit(ret); 3007 mlog_exit(ret);
2921 return ret; 3008 return ret;
@@ -2930,6 +3017,7 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2930 int new_level; 3017 int new_level;
2931 int ret = 0; 3018 int ret = 0;
2932 int set_lvb = 0; 3019 int set_lvb = 0;
3020 unsigned int gen;
2933 3021
2934 mlog_entry_void(); 3022 mlog_entry_void();
2935 3023
@@ -2939,6 +3027,32 @@ static int ocfs2_unblock_lock(struct ocfs2_super *osb,
2939 3027
2940recheck: 3028recheck:
2941 if (lockres->l_flags & OCFS2_LOCK_BUSY) { 3029 if (lockres->l_flags & OCFS2_LOCK_BUSY) {
3030 /* XXX
3031 * This is a *big* race. The OCFS2_LOCK_PENDING flag
3032 * exists entirely for one reason - another thread has set
3033 * OCFS2_LOCK_BUSY, but has *NOT* yet called dlm_lock().
3034 *
3035 * If we do ocfs2_cancel_convert() before the other thread
3036 * calls dlm_lock(), our cancel will do nothing. We will
3037 * get no ast, and we will have no way of knowing the
3038 * cancel failed. Meanwhile, the other thread will call
3039 * into dlm_lock() and wait...forever.
3040 *
3041 * Why forever? Because another node has asked for the
3042 * lock first; that's why we're here in unblock_lock().
3043 *
3044 * The solution is OCFS2_LOCK_PENDING. When PENDING is
3045 * set, we just requeue the unblock. Only when the other
3046 * thread has called dlm_lock() and cleared PENDING will
3047 * we then cancel their request.
3048 *
3049 * All callers of dlm_lock() must set OCFS2_DLM_PENDING
3050 * at the same time they set OCFS2_DLM_BUSY. They must
3051 * clear OCFS2_DLM_PENDING after dlm_lock() returns.
3052 */
3053 if (lockres->l_flags & OCFS2_LOCK_PENDING)
3054 goto leave_requeue;
3055
2942 ctl->requeue = 1; 3056 ctl->requeue = 1;
2943 ret = ocfs2_prepare_cancel_convert(osb, lockres); 3057 ret = ocfs2_prepare_cancel_convert(osb, lockres);
2944 spin_unlock_irqrestore(&lockres->l_lock, flags); 3058 spin_unlock_irqrestore(&lockres->l_lock, flags);
@@ -2952,13 +3066,13 @@ recheck:
2952 3066
2953 /* if we're blocking an exclusive and we have *any* holders, 3067 /* if we're blocking an exclusive and we have *any* holders,
2954 * then requeue. */ 3068 * then requeue. */
2955 if ((lockres->l_blocking == LKM_EXMODE) 3069 if ((lockres->l_blocking == DLM_LOCK_EX)
2956 && (lockres->l_ex_holders || lockres->l_ro_holders)) 3070 && (lockres->l_ex_holders || lockres->l_ro_holders))
2957 goto leave_requeue; 3071 goto leave_requeue;
2958 3072
2959 /* If it's a PR we're blocking, then only 3073 /* If it's a PR we're blocking, then only
2960 * requeue if we've got any EX holders */ 3074 * requeue if we've got any EX holders */
2961 if (lockres->l_blocking == LKM_PRMODE && 3075 if (lockres->l_blocking == DLM_LOCK_PR &&
2962 lockres->l_ex_holders) 3076 lockres->l_ex_holders)
2963 goto leave_requeue; 3077 goto leave_requeue;
2964 3078
@@ -3005,7 +3119,7 @@ downconvert:
3005 ctl->requeue = 0; 3119 ctl->requeue = 0;
3006 3120
3007 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) { 3121 if (lockres->l_ops->flags & LOCK_TYPE_USES_LVB) {
3008 if (lockres->l_level == LKM_EXMODE) 3122 if (lockres->l_level == DLM_LOCK_EX)
3009 set_lvb = 1; 3123 set_lvb = 1;
3010 3124
3011 /* 3125 /*
@@ -3018,9 +3132,11 @@ downconvert:
3018 lockres->l_ops->set_lvb(lockres); 3132 lockres->l_ops->set_lvb(lockres);
3019 } 3133 }
3020 3134
3021 ocfs2_prepare_downconvert(lockres, new_level); 3135 gen = ocfs2_prepare_downconvert(lockres, new_level);
3022 spin_unlock_irqrestore(&lockres->l_lock, flags); 3136 spin_unlock_irqrestore(&lockres->l_lock, flags);
3023 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb); 3137 ret = ocfs2_downconvert_lock(osb, lockres, new_level, set_lvb,
3138 gen);
3139
3024leave: 3140leave:
3025 mlog_exit(ret); 3141 mlog_exit(ret);
3026 return ret; 3142 return ret;
@@ -3059,7 +3175,7 @@ static int ocfs2_data_convert_worker(struct ocfs2_lock_res *lockres,
3059 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3175 (unsigned long long)OCFS2_I(inode)->ip_blkno);
3060 } 3176 }
3061 sync_mapping_buffers(mapping); 3177 sync_mapping_buffers(mapping);
3062 if (blocking == LKM_EXMODE) { 3178 if (blocking == DLM_LOCK_EX) {
3063 truncate_inode_pages(mapping, 0); 3179 truncate_inode_pages(mapping, 0);
3064 } else { 3180 } else {
3065 /* We only need to wait on the I/O if we're not also 3181 /* We only need to wait on the I/O if we're not also
@@ -3080,8 +3196,8 @@ static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
3080 struct inode *inode = ocfs2_lock_res_inode(lockres); 3196 struct inode *inode = ocfs2_lock_res_inode(lockres);
3081 int checkpointed = ocfs2_inode_fully_checkpointed(inode); 3197 int checkpointed = ocfs2_inode_fully_checkpointed(inode);
3082 3198
3083 BUG_ON(new_level != LKM_NLMODE && new_level != LKM_PRMODE); 3199 BUG_ON(new_level != DLM_LOCK_NL && new_level != DLM_LOCK_PR);
3084 BUG_ON(lockres->l_level != LKM_EXMODE && !checkpointed); 3200 BUG_ON(lockres->l_level != DLM_LOCK_EX && !checkpointed);
3085 3201
3086 if (checkpointed) 3202 if (checkpointed)
3087 return 1; 3203 return 1;
@@ -3145,7 +3261,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3145 * valid. The downconvert code will retain a PR for this node, 3261 * valid. The downconvert code will retain a PR for this node,
3146 * so there's no further work to do. 3262 * so there's no further work to do.
3147 */ 3263 */
3148 if (blocking == LKM_PRMODE) 3264 if (blocking == DLM_LOCK_PR)
3149 return UNBLOCK_CONTINUE; 3265 return UNBLOCK_CONTINUE;
3150 3266
3151 /* 3267 /*
@@ -3219,6 +3335,45 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
3219 return UNBLOCK_CONTINUE_POST; 3335 return UNBLOCK_CONTINUE_POST;
3220} 3336}
3221 3337
3338/*
3339 * This is the filesystem locking protocol. It provides the lock handling
3340 * hooks for the underlying DLM. It has a maximum version number.
3341 * The version number allows interoperability with systems running at
3342 * the same major number and an equal or smaller minor number.
3343 *
3344 * Whenever the filesystem does new things with locks (adds or removes a
3345 * lock, orders them differently, does different things underneath a lock),
3346 * the version must be changed. The protocol is negotiated when joining
3347 * the dlm domain. A node may join the domain if its major version is
3348 * identical to all other nodes and its minor version is greater than
3349 * or equal to all other nodes. When its minor version is greater than
3350 * the other nodes, it will run at the minor version specified by the
3351 * other nodes.
3352 *
3353 * If a locking change is made that will not be compatible with older
3354 * versions, the major number must be increased and the minor version set
3355 * to zero. If a change merely adds a behavior that can be disabled when
3356 * speaking to older versions, the minor version must be increased. If a
3357 * change adds a fully backwards compatible change (eg, LVB changes that
3358 * are just ignored by older versions), the version does not need to be
3359 * updated.
3360 */
3361static struct ocfs2_locking_protocol lproto = {
3362 .lp_max_version = {
3363 .pv_major = OCFS2_LOCKING_PROTOCOL_MAJOR,
3364 .pv_minor = OCFS2_LOCKING_PROTOCOL_MINOR,
3365 },
3366 .lp_lock_ast = ocfs2_locking_ast,
3367 .lp_blocking_ast = ocfs2_blocking_ast,
3368 .lp_unlock_ast = ocfs2_unlock_ast,
3369};
3370
3371void ocfs2_set_locking_protocol(void)
3372{
3373 ocfs2_stack_glue_set_locking_protocol(&lproto);
3374}
3375
3376
3222static void ocfs2_process_blocked_lock(struct ocfs2_super *osb, 3377static void ocfs2_process_blocked_lock(struct ocfs2_super *osb,
3223 struct ocfs2_lock_res *lockres) 3378 struct ocfs2_lock_res *lockres)
3224{ 3379{
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index e3cf902404b4..2bb01f09c1b1 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -58,7 +58,7 @@ struct ocfs2_meta_lvb {
58#define OCFS2_LOCK_NONBLOCK (0x04) 58#define OCFS2_LOCK_NONBLOCK (0x04)
59 59
60int ocfs2_dlm_init(struct ocfs2_super *osb); 60int ocfs2_dlm_init(struct ocfs2_super *osb);
61void ocfs2_dlm_shutdown(struct ocfs2_super *osb); 61void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res); 62void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res, 63void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
64 enum ocfs2_lock_type type, 64 enum ocfs2_lock_type type,
@@ -114,5 +114,6 @@ void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void); 114struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug); 115void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
116 116
117extern const struct dlm_protocol_version ocfs2_locking_protocol; 117/* To set the locking protocol on module initialization */
118void ocfs2_set_locking_protocol(void);
118#endif /* DLMGLUE_H */ 119#endif /* DLMGLUE_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index ed5d5232e85d..9154c82d3258 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ const struct file_operations ocfs2_fops = {
2242 .open = ocfs2_file_open, 2242 .open = ocfs2_file_open,
2243 .aio_read = ocfs2_file_aio_read, 2243 .aio_read = ocfs2_file_aio_read,
2244 .aio_write = ocfs2_file_aio_write, 2244 .aio_write = ocfs2_file_aio_write,
2245 .ioctl = ocfs2_ioctl, 2245 .unlocked_ioctl = ocfs2_ioctl,
2246#ifdef CONFIG_COMPAT 2246#ifdef CONFIG_COMPAT
2247 .compat_ioctl = ocfs2_compat_ioctl, 2247 .compat_ioctl = ocfs2_compat_ioctl,
2248#endif 2248#endif
@@ -2258,7 +2258,7 @@ const struct file_operations ocfs2_dops = {
2258 .fsync = ocfs2_sync_file, 2258 .fsync = ocfs2_sync_file,
2259 .release = ocfs2_dir_release, 2259 .release = ocfs2_dir_release,
2260 .open = ocfs2_dir_open, 2260 .open = ocfs2_dir_open,
2261 .ioctl = ocfs2_ioctl, 2261 .unlocked_ioctl = ocfs2_ioctl,
2262#ifdef CONFIG_COMPAT 2262#ifdef CONFIG_COMPAT
2263 .compat_ioctl = ocfs2_compat_ioctl, 2263 .compat_ioctl = ocfs2_compat_ioctl,
2264#endif 2264#endif
diff --git a/fs/ocfs2/heartbeat.c b/fs/ocfs2/heartbeat.c
index 0758daf64da0..c6e7213db868 100644
--- a/fs/ocfs2/heartbeat.c
+++ b/fs/ocfs2/heartbeat.c
@@ -28,9 +28,6 @@
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/highmem.h> 30#include <linux/highmem.h>
31#include <linux/kmod.h>
32
33#include <dlm/dlmapi.h>
34 31
35#define MLOG_MASK_PREFIX ML_SUPER 32#define MLOG_MASK_PREFIX ML_SUPER
36#include <cluster/masklog.h> 33#include <cluster/masklog.h>
@@ -48,7 +45,6 @@ static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
48 int bit); 45 int bit);
49static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map, 46static inline void __ocfs2_node_map_clear_bit(struct ocfs2_node_map *map,
50 int bit); 47 int bit);
51static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map);
52 48
53/* special case -1 for now 49/* special case -1 for now
54 * TODO: should *really* make sure the calling func never passes -1!! */ 50 * TODO: should *really* make sure the calling func never passes -1!! */
@@ -62,23 +58,23 @@ static void ocfs2_node_map_init(struct ocfs2_node_map *map)
62void ocfs2_init_node_maps(struct ocfs2_super *osb) 58void ocfs2_init_node_maps(struct ocfs2_super *osb)
63{ 59{
64 spin_lock_init(&osb->node_map_lock); 60 spin_lock_init(&osb->node_map_lock);
65 ocfs2_node_map_init(&osb->recovery_map);
66 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs); 61 ocfs2_node_map_init(&osb->osb_recovering_orphan_dirs);
67} 62}
68 63
69static void ocfs2_do_node_down(int node_num, 64void ocfs2_do_node_down(int node_num, void *data)
70 struct ocfs2_super *osb)
71{ 65{
66 struct ocfs2_super *osb = data;
67
72 BUG_ON(osb->node_num == node_num); 68 BUG_ON(osb->node_num == node_num);
73 69
74 mlog(0, "ocfs2: node down event for %d\n", node_num); 70 mlog(0, "ocfs2: node down event for %d\n", node_num);
75 71
76 if (!osb->dlm) { 72 if (!osb->cconn) {
77 /* 73 /*
78 * No DLM means we're not even ready to participate yet. 74 * No cluster connection means we're not even ready to
79 * We check the slots after the DLM comes up, so we will 75 * participate yet. We check the slots after the cluster
80 * notice the node death then. We can safely ignore it 76 * comes up, so we will notice the node death then. We
81 * here. 77 * can safely ignore it here.
82 */ 78 */
83 return; 79 return;
84 } 80 }
@@ -86,61 +82,6 @@ static void ocfs2_do_node_down(int node_num,
86 ocfs2_recovery_thread(osb, node_num); 82 ocfs2_recovery_thread(osb, node_num);
87} 83}
88 84
89/* Called from the dlm when it's about to evict a node. We may also
90 * get a heartbeat callback later. */
91static void ocfs2_dlm_eviction_cb(int node_num,
92 void *data)
93{
94 struct ocfs2_super *osb = (struct ocfs2_super *) data;
95 struct super_block *sb = osb->sb;
96
97 mlog(ML_NOTICE, "device (%u,%u): dlm has evicted node %d\n",
98 MAJOR(sb->s_dev), MINOR(sb->s_dev), node_num);
99
100 ocfs2_do_node_down(node_num, osb);
101}
102
103void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb)
104{
105 /* Not exactly a heartbeat callback, but leads to essentially
106 * the same path so we set it up here. */
107 dlm_setup_eviction_cb(&osb->osb_eviction_cb,
108 ocfs2_dlm_eviction_cb,
109 osb);
110}
111
112void ocfs2_stop_heartbeat(struct ocfs2_super *osb)
113{
114 int ret;
115 char *argv[5], *envp[3];
116
117 if (ocfs2_mount_local(osb))
118 return;
119
120 if (!osb->uuid_str) {
121 /* This can happen if we don't get far enough in mount... */
122 mlog(0, "No UUID with which to stop heartbeat!\n\n");
123 return;
124 }
125
126 argv[0] = (char *)o2nm_get_hb_ctl_path();
127 argv[1] = "-K";
128 argv[2] = "-u";
129 argv[3] = osb->uuid_str;
130 argv[4] = NULL;
131
132 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
133
134 /* minimal command environment taken from cpu_run_sbin_hotplug */
135 envp[0] = "HOME=/";
136 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
137 envp[2] = NULL;
138
139 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
140 if (ret < 0)
141 mlog_errno(ret);
142}
143
144static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map, 85static inline void __ocfs2_node_map_set_bit(struct ocfs2_node_map *map,
145 int bit) 86 int bit)
146{ 87{
@@ -192,112 +133,3 @@ int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
192 return ret; 133 return ret;
193} 134}
194 135
195static inline int __ocfs2_node_map_is_empty(struct ocfs2_node_map *map)
196{
197 int bit;
198 bit = find_next_bit(map->map, map->num_nodes, 0);
199 if (bit < map->num_nodes)
200 return 0;
201 return 1;
202}
203
204int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
205 struct ocfs2_node_map *map)
206{
207 int ret;
208 BUG_ON(map->num_nodes == 0);
209 spin_lock(&osb->node_map_lock);
210 ret = __ocfs2_node_map_is_empty(map);
211 spin_unlock(&osb->node_map_lock);
212 return ret;
213}
214
215#if 0
216
217static void __ocfs2_node_map_dup(struct ocfs2_node_map *target,
218 struct ocfs2_node_map *from)
219{
220 BUG_ON(from->num_nodes == 0);
221 ocfs2_node_map_init(target);
222 __ocfs2_node_map_set(target, from);
223}
224
225/* returns 1 if bit is the only bit set in target, 0 otherwise */
226int ocfs2_node_map_is_only(struct ocfs2_super *osb,
227 struct ocfs2_node_map *target,
228 int bit)
229{
230 struct ocfs2_node_map temp;
231 int ret;
232
233 spin_lock(&osb->node_map_lock);
234 __ocfs2_node_map_dup(&temp, target);
235 __ocfs2_node_map_clear_bit(&temp, bit);
236 ret = __ocfs2_node_map_is_empty(&temp);
237 spin_unlock(&osb->node_map_lock);
238
239 return ret;
240}
241
242static void __ocfs2_node_map_set(struct ocfs2_node_map *target,
243 struct ocfs2_node_map *from)
244{
245 int num_longs, i;
246
247 BUG_ON(target->num_nodes != from->num_nodes);
248 BUG_ON(target->num_nodes == 0);
249
250 num_longs = BITS_TO_LONGS(target->num_nodes);
251 for (i = 0; i < num_longs; i++)
252 target->map[i] = from->map[i];
253}
254
255#endif /* 0 */
256
257/* Returns whether the recovery bit was actually set - it may not be
258 * if a node is still marked as needing recovery */
259int ocfs2_recovery_map_set(struct ocfs2_super *osb,
260 int num)
261{
262 int set = 0;
263
264 spin_lock(&osb->node_map_lock);
265
266 if (!test_bit(num, osb->recovery_map.map)) {
267 __ocfs2_node_map_set_bit(&osb->recovery_map, num);
268 set = 1;
269 }
270
271 spin_unlock(&osb->node_map_lock);
272
273 return set;
274}
275
276void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
277 int num)
278{
279 ocfs2_node_map_clear_bit(osb, &osb->recovery_map, num);
280}
281
282int ocfs2_node_map_iterate(struct ocfs2_super *osb,
283 struct ocfs2_node_map *map,
284 int idx)
285{
286 int i = idx;
287
288 idx = O2NM_INVALID_NODE_NUM;
289 spin_lock(&osb->node_map_lock);
290 if ((i != O2NM_INVALID_NODE_NUM) &&
291 (i >= 0) &&
292 (i < map->num_nodes)) {
293 while(i < map->num_nodes) {
294 if (test_bit(i, map->map)) {
295 idx = i;
296 break;
297 }
298 i++;
299 }
300 }
301 spin_unlock(&osb->node_map_lock);
302 return idx;
303}
diff --git a/fs/ocfs2/heartbeat.h b/fs/ocfs2/heartbeat.h
index eac63aed7611..74b9c5dda28d 100644
--- a/fs/ocfs2/heartbeat.h
+++ b/fs/ocfs2/heartbeat.h
@@ -28,13 +28,10 @@
28 28
29void ocfs2_init_node_maps(struct ocfs2_super *osb); 29void ocfs2_init_node_maps(struct ocfs2_super *osb);
30 30
31void ocfs2_setup_hb_callbacks(struct ocfs2_super *osb); 31void ocfs2_do_node_down(int node_num, void *data);
32void ocfs2_stop_heartbeat(struct ocfs2_super *osb);
33 32
34/* node map functions - used to keep track of mounted and in-recovery 33/* node map functions - used to keep track of mounted and in-recovery
35 * nodes. */ 34 * nodes. */
36int ocfs2_node_map_is_empty(struct ocfs2_super *osb,
37 struct ocfs2_node_map *map);
38void ocfs2_node_map_set_bit(struct ocfs2_super *osb, 35void ocfs2_node_map_set_bit(struct ocfs2_super *osb,
39 struct ocfs2_node_map *map, 36 struct ocfs2_node_map *map,
40 int bit); 37 int bit);
@@ -44,17 +41,5 @@ void ocfs2_node_map_clear_bit(struct ocfs2_super *osb,
44int ocfs2_node_map_test_bit(struct ocfs2_super *osb, 41int ocfs2_node_map_test_bit(struct ocfs2_super *osb,
45 struct ocfs2_node_map *map, 42 struct ocfs2_node_map *map,
46 int bit); 43 int bit);
47int ocfs2_node_map_iterate(struct ocfs2_super *osb,
48 struct ocfs2_node_map *map,
49 int idx);
50static inline int ocfs2_node_map_first_set_bit(struct ocfs2_super *osb,
51 struct ocfs2_node_map *map)
52{
53 return ocfs2_node_map_iterate(osb, map, 0);
54}
55int ocfs2_recovery_map_set(struct ocfs2_super *osb,
56 int num);
57void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
58 int num);
59 44
60#endif /* OCFS2_HEARTBEAT_H */ 45#endif /* OCFS2_HEARTBEAT_H */
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 5177fba5162b..7b142f0ce995 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/fs.h> 8#include <linux/fs.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/smp_lock.h>
10 11
11#define MLOG_MASK_PREFIX ML_INODE 12#define MLOG_MASK_PREFIX ML_INODE
12#include <cluster/masklog.h> 13#include <cluster/masklog.h>
@@ -59,10 +60,6 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
59 goto bail; 60 goto bail;
60 } 61 }
61 62
62 status = -EROFS;
63 if (IS_RDONLY(inode))
64 goto bail_unlock;
65
66 status = -EACCES; 63 status = -EACCES;
67 if (!is_owner_or_cap(inode)) 64 if (!is_owner_or_cap(inode))
68 goto bail_unlock; 65 goto bail_unlock;
@@ -112,9 +109,9 @@ bail:
112 return status; 109 return status;
113} 110}
114 111
115int ocfs2_ioctl(struct inode * inode, struct file * filp, 112long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
116 unsigned int cmd, unsigned long arg)
117{ 113{
114 struct inode *inode = filp->f_path.dentry->d_inode;
118 unsigned int flags; 115 unsigned int flags;
119 int new_clusters; 116 int new_clusters;
120 int status; 117 int status;
@@ -133,8 +130,13 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
133 if (get_user(flags, (int __user *) arg)) 130 if (get_user(flags, (int __user *) arg))
134 return -EFAULT; 131 return -EFAULT;
135 132
136 return ocfs2_set_inode_attr(inode, flags, 133 status = mnt_want_write(filp->f_path.mnt);
134 if (status)
135 return status;
136 status = ocfs2_set_inode_attr(inode, flags,
137 OCFS2_FL_MODIFIABLE); 137 OCFS2_FL_MODIFIABLE);
138 mnt_drop_write(filp->f_path.mnt);
139 return status;
138 case OCFS2_IOC_RESVSP: 140 case OCFS2_IOC_RESVSP:
139 case OCFS2_IOC_RESVSP64: 141 case OCFS2_IOC_RESVSP64:
140 case OCFS2_IOC_UNRESVSP: 142 case OCFS2_IOC_UNRESVSP:
@@ -168,9 +170,6 @@ int ocfs2_ioctl(struct inode * inode, struct file * filp,
168#ifdef CONFIG_COMPAT 170#ifdef CONFIG_COMPAT
169long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) 171long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
170{ 172{
171 struct inode *inode = file->f_path.dentry->d_inode;
172 int ret;
173
174 switch (cmd) { 173 switch (cmd) {
175 case OCFS2_IOC32_GETFLAGS: 174 case OCFS2_IOC32_GETFLAGS:
176 cmd = OCFS2_IOC_GETFLAGS; 175 cmd = OCFS2_IOC_GETFLAGS;
@@ -190,9 +189,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg)
190 return -ENOIOCTLCMD; 189 return -ENOIOCTLCMD;
191 } 190 }
192 191
193 lock_kernel(); 192 return ocfs2_ioctl(file, cmd, arg);
194 ret = ocfs2_ioctl(inode, file, cmd, arg);
195 unlock_kernel();
196 return ret;
197} 193}
198#endif 194#endif
diff --git a/fs/ocfs2/ioctl.h b/fs/ocfs2/ioctl.h
index 4d6c4f430d0d..cf9a5ee30fef 100644
--- a/fs/ocfs2/ioctl.h
+++ b/fs/ocfs2/ioctl.h
@@ -10,8 +10,7 @@
10#ifndef OCFS2_IOCTL_H 10#ifndef OCFS2_IOCTL_H
11#define OCFS2_IOCTL_H 11#define OCFS2_IOCTL_H
12 12
13int ocfs2_ioctl(struct inode * inode, struct file * filp, 13long ocfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
14 unsigned int cmd, unsigned long arg);
15long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg); 14long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg);
16 15
17#endif /* OCFS2_IOCTL_H */ 16#endif /* OCFS2_IOCTL_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index f31c7e8c19c3..9698338adc39 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -64,6 +64,137 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
64 int slot); 64 int slot);
65static int ocfs2_commit_thread(void *arg); 65static int ocfs2_commit_thread(void *arg);
66 66
67
68/*
69 * The recovery_list is a simple linked list of node numbers to recover.
70 * It is protected by the recovery_lock.
71 */
72
73struct ocfs2_recovery_map {
74 unsigned int rm_used;
75 unsigned int *rm_entries;
76};
77
78int ocfs2_recovery_init(struct ocfs2_super *osb)
79{
80 struct ocfs2_recovery_map *rm;
81
82 mutex_init(&osb->recovery_lock);
83 osb->disable_recovery = 0;
84 osb->recovery_thread_task = NULL;
85 init_waitqueue_head(&osb->recovery_event);
86
87 rm = kzalloc(sizeof(struct ocfs2_recovery_map) +
88 osb->max_slots * sizeof(unsigned int),
89 GFP_KERNEL);
90 if (!rm) {
91 mlog_errno(-ENOMEM);
92 return -ENOMEM;
93 }
94
95 rm->rm_entries = (unsigned int *)((char *)rm +
96 sizeof(struct ocfs2_recovery_map));
97 osb->recovery_map = rm;
98
99 return 0;
100}
101
102/* we can't grab the goofy sem lock from inside wait_event, so we use
103 * memory barriers to make sure that we'll see the null task before
104 * being woken up */
105static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
106{
107 mb();
108 return osb->recovery_thread_task != NULL;
109}
110
111void ocfs2_recovery_exit(struct ocfs2_super *osb)
112{
113 struct ocfs2_recovery_map *rm;
114
115 /* disable any new recovery threads and wait for any currently
116 * running ones to exit. Do this before setting the vol_state. */
117 mutex_lock(&osb->recovery_lock);
118 osb->disable_recovery = 1;
119 mutex_unlock(&osb->recovery_lock);
120 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
121
122 /* At this point, we know that no more recovery threads can be
123 * launched, so wait for any recovery completion work to
124 * complete. */
125 flush_workqueue(ocfs2_wq);
126
127 /*
128 * Now that recovery is shut down, and the osb is about to be
129 * freed, the osb_lock is not taken here.
130 */
131 rm = osb->recovery_map;
132 /* XXX: Should we bug if there are dirty entries? */
133
134 kfree(rm);
135}
136
137static int __ocfs2_recovery_map_test(struct ocfs2_super *osb,
138 unsigned int node_num)
139{
140 int i;
141 struct ocfs2_recovery_map *rm = osb->recovery_map;
142
143 assert_spin_locked(&osb->osb_lock);
144
145 for (i = 0; i < rm->rm_used; i++) {
146 if (rm->rm_entries[i] == node_num)
147 return 1;
148 }
149
150 return 0;
151}
152
153/* Behaves like test-and-set. Returns the previous value */
154static int ocfs2_recovery_map_set(struct ocfs2_super *osb,
155 unsigned int node_num)
156{
157 struct ocfs2_recovery_map *rm = osb->recovery_map;
158
159 spin_lock(&osb->osb_lock);
160 if (__ocfs2_recovery_map_test(osb, node_num)) {
161 spin_unlock(&osb->osb_lock);
162 return 1;
163 }
164
165 /* XXX: Can this be exploited? Not from o2dlm... */
166 BUG_ON(rm->rm_used >= osb->max_slots);
167
168 rm->rm_entries[rm->rm_used] = node_num;
169 rm->rm_used++;
170 spin_unlock(&osb->osb_lock);
171
172 return 0;
173}
174
175static void ocfs2_recovery_map_clear(struct ocfs2_super *osb,
176 unsigned int node_num)
177{
178 int i;
179 struct ocfs2_recovery_map *rm = osb->recovery_map;
180
181 spin_lock(&osb->osb_lock);
182
183 for (i = 0; i < rm->rm_used; i++) {
184 if (rm->rm_entries[i] == node_num)
185 break;
186 }
187
188 if (i < rm->rm_used) {
189 /* XXX: be careful with the pointer math */
190 memmove(&(rm->rm_entries[i]), &(rm->rm_entries[i + 1]),
191 (rm->rm_used - i - 1) * sizeof(unsigned int));
192 rm->rm_used--;
193 }
194
195 spin_unlock(&osb->osb_lock);
196}
197
67static int ocfs2_commit_cache(struct ocfs2_super *osb) 198static int ocfs2_commit_cache(struct ocfs2_super *osb)
68{ 199{
69 int status = 0; 200 int status = 0;
@@ -586,8 +717,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local)
586 717
587 mlog_entry_void(); 718 mlog_entry_void();
588 719
589 if (!journal) 720 BUG_ON(!journal);
590 BUG();
591 721
592 osb = journal->j_osb; 722 osb = journal->j_osb;
593 723
@@ -650,6 +780,23 @@ bail:
650 return status; 780 return status;
651} 781}
652 782
783static int ocfs2_recovery_completed(struct ocfs2_super *osb)
784{
785 int empty;
786 struct ocfs2_recovery_map *rm = osb->recovery_map;
787
788 spin_lock(&osb->osb_lock);
789 empty = (rm->rm_used == 0);
790 spin_unlock(&osb->osb_lock);
791
792 return empty;
793}
794
795void ocfs2_wait_for_recovery(struct ocfs2_super *osb)
796{
797 wait_event(osb->recovery_event, ocfs2_recovery_completed(osb));
798}
799
653/* 800/*
654 * JBD Might read a cached version of another nodes journal file. We 801 * JBD Might read a cached version of another nodes journal file. We
655 * don't want this as this file changes often and we get no 802 * don't want this as this file changes often and we get no
@@ -848,6 +995,7 @@ static int __ocfs2_recovery_thread(void *arg)
848{ 995{
849 int status, node_num; 996 int status, node_num;
850 struct ocfs2_super *osb = arg; 997 struct ocfs2_super *osb = arg;
998 struct ocfs2_recovery_map *rm = osb->recovery_map;
851 999
852 mlog_entry_void(); 1000 mlog_entry_void();
853 1001
@@ -863,26 +1011,29 @@ restart:
863 goto bail; 1011 goto bail;
864 } 1012 }
865 1013
866 while(!ocfs2_node_map_is_empty(osb, &osb->recovery_map)) { 1014 spin_lock(&osb->osb_lock);
867 node_num = ocfs2_node_map_first_set_bit(osb, 1015 while (rm->rm_used) {
868 &osb->recovery_map); 1016 /* It's always safe to remove entry zero, as we won't
869 if (node_num == O2NM_INVALID_NODE_NUM) { 1017 * clear it until ocfs2_recover_node() has succeeded. */
870 mlog(0, "Out of nodes to recover.\n"); 1018 node_num = rm->rm_entries[0];
871 break; 1019 spin_unlock(&osb->osb_lock);
872 }
873 1020
874 status = ocfs2_recover_node(osb, node_num); 1021 status = ocfs2_recover_node(osb, node_num);
875 if (status < 0) { 1022 if (!status) {
1023 ocfs2_recovery_map_clear(osb, node_num);
1024 } else {
876 mlog(ML_ERROR, 1025 mlog(ML_ERROR,
877 "Error %d recovering node %d on device (%u,%u)!\n", 1026 "Error %d recovering node %d on device (%u,%u)!\n",
878 status, node_num, 1027 status, node_num,
879 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1028 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
880 mlog(ML_ERROR, "Volume requires unmount.\n"); 1029 mlog(ML_ERROR, "Volume requires unmount.\n");
881 continue;
882 } 1030 }
883 1031
884 ocfs2_recovery_map_clear(osb, node_num); 1032 spin_lock(&osb->osb_lock);
885 } 1033 }
1034 spin_unlock(&osb->osb_lock);
1035 mlog(0, "All nodes recovered\n");
1036
886 ocfs2_super_unlock(osb, 1); 1037 ocfs2_super_unlock(osb, 1);
887 1038
888 /* We always run recovery on our own orphan dir - the dead 1039 /* We always run recovery on our own orphan dir - the dead
@@ -893,8 +1044,7 @@ restart:
893 1044
894bail: 1045bail:
895 mutex_lock(&osb->recovery_lock); 1046 mutex_lock(&osb->recovery_lock);
896 if (!status && 1047 if (!status && !ocfs2_recovery_completed(osb)) {
897 !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
898 mutex_unlock(&osb->recovery_lock); 1048 mutex_unlock(&osb->recovery_lock);
899 goto restart; 1049 goto restart;
900 } 1050 }
@@ -924,8 +1074,8 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
924 1074
925 /* People waiting on recovery will wait on 1075 /* People waiting on recovery will wait on
926 * the recovery map to empty. */ 1076 * the recovery map to empty. */
927 if (!ocfs2_recovery_map_set(osb, node_num)) 1077 if (ocfs2_recovery_map_set(osb, node_num))
928 mlog(0, "node %d already be in recovery.\n", node_num); 1078 mlog(0, "node %d already in recovery map.\n", node_num);
929 1079
930 mlog(0, "starting recovery thread...\n"); 1080 mlog(0, "starting recovery thread...\n");
931 1081
@@ -1079,7 +1229,6 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1079{ 1229{
1080 int status = 0; 1230 int status = 0;
1081 int slot_num; 1231 int slot_num;
1082 struct ocfs2_slot_info *si = osb->slot_info;
1083 struct ocfs2_dinode *la_copy = NULL; 1232 struct ocfs2_dinode *la_copy = NULL;
1084 struct ocfs2_dinode *tl_copy = NULL; 1233 struct ocfs2_dinode *tl_copy = NULL;
1085 1234
@@ -1092,8 +1241,8 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1092 * case we should've called ocfs2_journal_load instead. */ 1241 * case we should've called ocfs2_journal_load instead. */
1093 BUG_ON(osb->node_num == node_num); 1242 BUG_ON(osb->node_num == node_num);
1094 1243
1095 slot_num = ocfs2_node_num_to_slot(si, node_num); 1244 slot_num = ocfs2_node_num_to_slot(osb, node_num);
1096 if (slot_num == OCFS2_INVALID_SLOT) { 1245 if (slot_num == -ENOENT) {
1097 status = 0; 1246 status = 0;
1098 mlog(0, "no slot for this node, so no recovery required.\n"); 1247 mlog(0, "no slot for this node, so no recovery required.\n");
1099 goto done; 1248 goto done;
@@ -1123,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
1123 1272
1124 /* Likewise, this would be a strange but ultimately not so 1273 /* Likewise, this would be a strange but ultimately not so
1125 * harmful place to get an error... */ 1274 * harmful place to get an error... */
1126 ocfs2_clear_slot(si, slot_num); 1275 status = ocfs2_clear_slot(osb, slot_num);
1127 status = ocfs2_update_disk_slots(osb, si);
1128 if (status < 0) 1276 if (status < 0)
1129 mlog_errno(status); 1277 mlog_errno(status);
1130 1278
@@ -1184,23 +1332,24 @@ bail:
1184 * slot info struct has been updated from disk. */ 1332 * slot info struct has been updated from disk. */
1185int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) 1333int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1186{ 1334{
1187 int status, i, node_num; 1335 unsigned int node_num;
1188 struct ocfs2_slot_info *si = osb->slot_info; 1336 int status, i;
1189 1337
1190 /* This is called with the super block cluster lock, so we 1338 /* This is called with the super block cluster lock, so we
1191 * know that the slot map can't change underneath us. */ 1339 * know that the slot map can't change underneath us. */
1192 1340
1193 spin_lock(&si->si_lock); 1341 spin_lock(&osb->osb_lock);
1194 for(i = 0; i < si->si_num_slots; i++) { 1342 for (i = 0; i < osb->max_slots; i++) {
1195 if (i == osb->slot_num) 1343 if (i == osb->slot_num)
1196 continue; 1344 continue;
1197 if (ocfs2_is_empty_slot(si, i)) 1345
1346 status = ocfs2_slot_to_node_num_locked(osb, i, &node_num);
1347 if (status == -ENOENT)
1198 continue; 1348 continue;
1199 1349
1200 node_num = si->si_global_node_nums[i]; 1350 if (__ocfs2_recovery_map_test(osb, node_num))
1201 if (ocfs2_node_map_test_bit(osb, &osb->recovery_map, node_num))
1202 continue; 1351 continue;
1203 spin_unlock(&si->si_lock); 1352 spin_unlock(&osb->osb_lock);
1204 1353
1205 /* Ok, we have a slot occupied by another node which 1354 /* Ok, we have a slot occupied by another node which
1206 * is not in the recovery map. We trylock his journal 1355 * is not in the recovery map. We trylock his journal
@@ -1216,9 +1365,9 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb)
1216 goto bail; 1365 goto bail;
1217 } 1366 }
1218 1367
1219 spin_lock(&si->si_lock); 1368 spin_lock(&osb->osb_lock);
1220 } 1369 }
1221 spin_unlock(&si->si_lock); 1370 spin_unlock(&osb->osb_lock);
1222 1371
1223 status = 0; 1372 status = 0;
1224bail: 1373bail:
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 220f3e818e78..db82be2532ed 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -134,6 +134,10 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb,
134 134
135/* Exported only for the journal struct init code in super.c. Do not call. */ 135/* Exported only for the journal struct init code in super.c. Do not call. */
136void ocfs2_complete_recovery(struct work_struct *work); 136void ocfs2_complete_recovery(struct work_struct *work);
137void ocfs2_wait_for_recovery(struct ocfs2_super *osb);
138
139int ocfs2_recovery_init(struct ocfs2_super *osb);
140void ocfs2_recovery_exit(struct ocfs2_super *osb);
137 141
138/* 142/*
139 * Journal Control: 143 * Journal Control:
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index ab83fd562429..ce0dc147602a 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -447,6 +447,8 @@ out_mutex:
447 iput(main_bm_inode); 447 iput(main_bm_inode);
448 448
449out: 449out:
450 if (!status)
451 ocfs2_init_inode_steal_slot(osb);
450 mlog_exit(status); 452 mlog_exit(status);
451 return status; 453 return status;
452} 454}
@@ -523,6 +525,8 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
523 } 525 }
524 526
525 ac->ac_inode = local_alloc_inode; 527 ac->ac_inode = local_alloc_inode;
528 /* We should never use localalloc from another slot */
529 ac->ac_alloc_slot = osb->slot_num;
526 ac->ac_which = OCFS2_AC_USE_LOCAL; 530 ac->ac_which = OCFS2_AC_USE_LOCAL;
527 get_bh(osb->local_alloc_bh); 531 get_bh(osb->local_alloc_bh);
528 ac->ac_bh = osb->local_alloc_bh; 532 ac->ac_bh = osb->local_alloc_bh;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index ae9ad9587516..d5d808fe0140 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -424,7 +424,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation); 424 fe->i_fs_generation = cpu_to_le32(osb->fs_generation);
425 fe->i_blkno = cpu_to_le64(fe_blkno); 425 fe->i_blkno = cpu_to_le64(fe_blkno);
426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); 426 fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
427 fe->i_suballoc_slot = cpu_to_le16(osb->slot_num); 427 fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
428 fe->i_uid = cpu_to_le32(current->fsuid); 428 fe->i_uid = cpu_to_le32(current->fsuid);
429 if (dir->i_mode & S_ISGID) { 429 if (dir->i_mode & S_ISGID) {
430 fe->i_gid = cpu_to_le32(dir->i_gid); 430 fe->i_gid = cpu_to_le32(dir->i_gid);
@@ -997,7 +997,7 @@ static int ocfs2_rename(struct inode *old_dir,
997 * 997 *
998 * And that's why, just like the VFS, we need a file system 998 * And that's why, just like the VFS, we need a file system
999 * rename lock. */ 999 * rename lock. */
1000 if (old_dentry != new_dentry) { 1000 if (old_dir != new_dir && S_ISDIR(old_inode->i_mode)) {
1001 status = ocfs2_rename_lock(osb); 1001 status = ocfs2_rename_lock(osb);
1002 if (status < 0) { 1002 if (status < 0) {
1003 mlog_errno(status); 1003 mlog_errno(status);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 6546cef212e3..31692379c170 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -36,11 +36,8 @@
36#include <linux/mutex.h> 36#include <linux/mutex.h>
37#include <linux/jbd.h> 37#include <linux/jbd.h>
38 38
39#include "cluster/nodemanager.h" 39/* For union ocfs2_dlm_lksb */
40#include "cluster/heartbeat.h" 40#include "stackglue.h"
41#include "cluster/tcp.h"
42
43#include "dlm/dlmapi.h"
44 41
45#include "ocfs2_fs.h" 42#include "ocfs2_fs.h"
46#include "ocfs2_lockid.h" 43#include "ocfs2_lockid.h"
@@ -101,6 +98,9 @@ enum ocfs2_unlock_action {
101 * dropped. */ 98 * dropped. */
102#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */ 99#define OCFS2_LOCK_QUEUED (0x00000100) /* queued for downconvert */
103#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */ 100#define OCFS2_LOCK_NOCACHE (0x00000200) /* don't use a holder count */
101#define OCFS2_LOCK_PENDING (0x00000400) /* This lockres is pending a
102 call to dlm_lock. Only
103 exists with BUSY set. */
104 104
105struct ocfs2_lock_res_ops; 105struct ocfs2_lock_res_ops;
106 106
@@ -120,13 +120,14 @@ struct ocfs2_lock_res {
120 int l_level; 120 int l_level;
121 unsigned int l_ro_holders; 121 unsigned int l_ro_holders;
122 unsigned int l_ex_holders; 122 unsigned int l_ex_holders;
123 struct dlm_lockstatus l_lksb; 123 union ocfs2_dlm_lksb l_lksb;
124 124
125 /* used from AST/BAST funcs. */ 125 /* used from AST/BAST funcs. */
126 enum ocfs2_ast_action l_action; 126 enum ocfs2_ast_action l_action;
127 enum ocfs2_unlock_action l_unlock_action; 127 enum ocfs2_unlock_action l_unlock_action;
128 int l_requested; 128 int l_requested;
129 int l_blocking; 129 int l_blocking;
130 unsigned int l_pending_gen;
130 131
131 wait_queue_head_t l_event; 132 wait_queue_head_t l_event;
132 133
@@ -179,6 +180,8 @@ enum ocfs2_mount_options
179#define OCFS2_DEFAULT_ATIME_QUANTUM 60 180#define OCFS2_DEFAULT_ATIME_QUANTUM 60
180 181
181struct ocfs2_journal; 182struct ocfs2_journal;
183struct ocfs2_slot_info;
184struct ocfs2_recovery_map;
182struct ocfs2_super 185struct ocfs2_super
183{ 186{
184 struct task_struct *commit_task; 187 struct task_struct *commit_task;
@@ -190,7 +193,6 @@ struct ocfs2_super
190 struct ocfs2_slot_info *slot_info; 193 struct ocfs2_slot_info *slot_info;
191 194
192 spinlock_t node_map_lock; 195 spinlock_t node_map_lock;
193 struct ocfs2_node_map recovery_map;
194 196
195 u64 root_blkno; 197 u64 root_blkno;
196 u64 system_dir_blkno; 198 u64 system_dir_blkno;
@@ -206,25 +208,29 @@ struct ocfs2_super
206 u32 s_feature_incompat; 208 u32 s_feature_incompat;
207 u32 s_feature_ro_compat; 209 u32 s_feature_ro_compat;
208 210
209 /* Protects s_next_generaion, osb_flags. Could protect more on 211 /* Protects s_next_generation, osb_flags and s_inode_steal_slot.
210 * osb as it's very short lived. */ 212 * Could protect more on osb as it's very short lived.
213 */
211 spinlock_t osb_lock; 214 spinlock_t osb_lock;
212 u32 s_next_generation; 215 u32 s_next_generation;
213 unsigned long osb_flags; 216 unsigned long osb_flags;
217 s16 s_inode_steal_slot;
218 atomic_t s_num_inodes_stolen;
214 219
215 unsigned long s_mount_opt; 220 unsigned long s_mount_opt;
216 unsigned int s_atime_quantum; 221 unsigned int s_atime_quantum;
217 222
218 u16 max_slots; 223 unsigned int max_slots;
219 s16 node_num; 224 unsigned int node_num;
220 s16 slot_num; 225 int slot_num;
221 s16 preferred_slot; 226 int preferred_slot;
222 int s_sectsize_bits; 227 int s_sectsize_bits;
223 int s_clustersize; 228 int s_clustersize;
224 int s_clustersize_bits; 229 int s_clustersize_bits;
225 230
226 atomic_t vol_state; 231 atomic_t vol_state;
227 struct mutex recovery_lock; 232 struct mutex recovery_lock;
233 struct ocfs2_recovery_map *recovery_map;
228 struct task_struct *recovery_thread_task; 234 struct task_struct *recovery_thread_task;
229 int disable_recovery; 235 int disable_recovery;
230 wait_queue_head_t checkpoint_event; 236 wait_queue_head_t checkpoint_event;
@@ -245,12 +251,11 @@ struct ocfs2_super
245 struct ocfs2_alloc_stats alloc_stats; 251 struct ocfs2_alloc_stats alloc_stats;
246 char dev_str[20]; /* "major,minor" of the device */ 252 char dev_str[20]; /* "major,minor" of the device */
247 253
248 struct dlm_ctxt *dlm; 254 char osb_cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
255 struct ocfs2_cluster_connection *cconn;
249 struct ocfs2_lock_res osb_super_lockres; 256 struct ocfs2_lock_res osb_super_lockres;
250 struct ocfs2_lock_res osb_rename_lockres; 257 struct ocfs2_lock_res osb_rename_lockres;
251 struct dlm_eviction_cb osb_eviction_cb;
252 struct ocfs2_dlm_debug *osb_dlm_debug; 258 struct ocfs2_dlm_debug *osb_dlm_debug;
253 struct dlm_protocol_version osb_locking_proto;
254 259
255 struct dentry *osb_debug_root; 260 struct dentry *osb_debug_root;
256 261
@@ -367,11 +372,24 @@ static inline int ocfs2_is_soft_readonly(struct ocfs2_super *osb)
367 return ret; 372 return ret;
368} 373}
369 374
375static inline int ocfs2_userspace_stack(struct ocfs2_super *osb)
376{
377 return (osb->s_feature_incompat &
378 OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK);
379}
380
370static inline int ocfs2_mount_local(struct ocfs2_super *osb) 381static inline int ocfs2_mount_local(struct ocfs2_super *osb)
371{ 382{
372 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT); 383 return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
373} 384}
374 385
386static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
387{
388 return (osb->s_feature_incompat &
389 OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP);
390}
391
392
375#define OCFS2_IS_VALID_DINODE(ptr) \ 393#define OCFS2_IS_VALID_DINODE(ptr) \
376 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) 394 (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
377 395
@@ -522,6 +540,33 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
522 return pages_per_cluster; 540 return pages_per_cluster;
523} 541}
524 542
543static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb)
544{
545 spin_lock(&osb->osb_lock);
546 osb->s_inode_steal_slot = OCFS2_INVALID_SLOT;
547 spin_unlock(&osb->osb_lock);
548 atomic_set(&osb->s_num_inodes_stolen, 0);
549}
550
551static inline void ocfs2_set_inode_steal_slot(struct ocfs2_super *osb,
552 s16 slot)
553{
554 spin_lock(&osb->osb_lock);
555 osb->s_inode_steal_slot = slot;
556 spin_unlock(&osb->osb_lock);
557}
558
559static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
560{
561 s16 slot;
562
563 spin_lock(&osb->osb_lock);
564 slot = osb->s_inode_steal_slot;
565 spin_unlock(&osb->osb_lock);
566
567 return slot;
568}
569
525#define ocfs2_set_bit ext2_set_bit 570#define ocfs2_set_bit ext2_set_bit
526#define ocfs2_clear_bit ext2_clear_bit 571#define ocfs2_clear_bit ext2_clear_bit
527#define ocfs2_test_bit ext2_test_bit 572#define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 3633edd3982f..52c426665154 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -88,7 +88,9 @@
88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB 88#define OCFS2_FEATURE_COMPAT_SUPP OCFS2_FEATURE_COMPAT_BACKUP_SB
89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \ 89#define OCFS2_FEATURE_INCOMPAT_SUPP (OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ 90 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \
91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 91 | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
92 | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
93 | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK)
92#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 94#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
93 95
94/* 96/*
@@ -125,6 +127,21 @@
125/* Support for data packed into inode blocks */ 127/* Support for data packed into inode blocks */
126#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 128#define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040
127 129
130/* Support for the extended slot map */
131#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100
132
133
134/*
135 * Support for alternate, userspace cluster stacks. If set, the superblock
136 * field s_cluster_info contains a tag for the alternate stack in use as
137 * well as the name of the cluster being joined.
138 * mount.ocfs2 must pass in a matching stack name.
139 *
140 * If not set, the classic stack will be used. This is compatible with
141 * all older versions.
142 */
143#define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080
144
128/* 145/*
129 * backup superblock flag is used to indicate that this volume 146 * backup superblock flag is used to indicate that this volume
130 * has backup superblocks. 147 * has backup superblocks.
@@ -267,6 +284,10 @@ struct ocfs2_new_group_input {
267#define OCFS2_VOL_UUID_LEN 16 284#define OCFS2_VOL_UUID_LEN 16
268#define OCFS2_MAX_VOL_LABEL_LEN 64 285#define OCFS2_MAX_VOL_LABEL_LEN 64
269 286
287/* The alternate, userspace stack fields */
288#define OCFS2_STACK_LABEL_LEN 4
289#define OCFS2_CLUSTER_NAME_LEN 16
290
270/* Journal limits (in bytes) */ 291/* Journal limits (in bytes) */
271#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024) 292#define OCFS2_MIN_JOURNAL_SIZE (4 * 1024 * 1024)
272 293
@@ -475,6 +496,47 @@ struct ocfs2_extent_block
475}; 496};
476 497
477/* 498/*
499 * On disk slot map for OCFS2. This defines the contents of the "slot_map"
500 * system file. A slot is valid if it contains a node number >= 0. The
501 * value -1 (0xFFFF) is OCFS2_INVALID_SLOT. This marks a slot empty.
502 */
503struct ocfs2_slot_map {
504/*00*/ __le16 sm_slots[0];
505/*
506 * Actual on-disk size is one block. OCFS2_MAX_SLOTS is 255,
507 * 255 * sizeof(__le16) == 510B, within the 512B block minimum blocksize.
508 */
509};
510
511struct ocfs2_extended_slot {
512/*00*/ __u8 es_valid;
513 __u8 es_reserved1[3];
514 __le32 es_node_num;
515/*08*/
516};
517
518/*
519 * The extended slot map, used when OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP
520 * is set. It separates out the valid marker from the node number, and
521 * has room to grow. Unlike the old slot map, this format is defined by
522 * i_size.
523 */
524struct ocfs2_slot_map_extended {
525/*00*/ struct ocfs2_extended_slot se_slots[0];
526/*
527 * Actual size is i_size of the slot_map system file. It should
528 * match s_max_slots * sizeof(struct ocfs2_extended_slot)
529 */
530};
531
532struct ocfs2_cluster_info {
533/*00*/ __u8 ci_stack[OCFS2_STACK_LABEL_LEN];
534 __le32 ci_reserved;
535/*08*/ __u8 ci_cluster[OCFS2_CLUSTER_NAME_LEN];
536/*18*/
537};
538
539/*
478 * On disk superblock for OCFS2 540 * On disk superblock for OCFS2
479 * Note that it is contained inside an ocfs2_dinode, so all offsets 541 * Note that it is contained inside an ocfs2_dinode, so all offsets
480 * are relative to the start of ocfs2_dinode.id2. 542 * are relative to the start of ocfs2_dinode.id2.
@@ -506,7 +568,20 @@ struct ocfs2_super_block {
506 * group header */ 568 * group header */
507/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ 569/*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */
508/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */ 570/*90*/ __u8 s_uuid[OCFS2_VOL_UUID_LEN]; /* 128-bit uuid */
509/*A0*/ 571/*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace
572 stack. Only valid
573 with INCOMPAT flag. */
574/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */
575/*140*/
576
577 /*
578 * NOTE: As stated above, all offsets are relative to
579 * ocfs2_dinode.id2, which is at 0xC0 in the inode.
580 * 0xC0 + 0x140 = 0x200 or 512 bytes. A superblock must fit within
581 * our smallest blocksize, which is 512 bytes. To ensure this,
582 * we reserve the space in s_reserved2. Anything past s_reserved2
583 * will not be available on the smallest blocksize.
584 */
510}; 585};
511 586
512/* 587/*
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 86f3e3799c2b..82c200f7a8f1 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -100,7 +100,7 @@ static char *ocfs2_lock_type_strings[] = {
100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) 100static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
101{ 101{
102#ifdef __KERNEL__ 102#ifdef __KERNEL__
103 mlog_bug_on_msg(type >= OCFS2_NUM_LOCK_TYPES, "%d\n", type); 103 BUG_ON(type >= OCFS2_NUM_LOCK_TYPES);
104#endif 104#endif
105 return ocfs2_lock_type_strings[type]; 105 return ocfs2_lock_type_strings[type];
106} 106}
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 3a50ce555e64..bb5ff8939bf1 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -42,81 +42,244 @@
42 42
43#include "buffer_head_io.h" 43#include "buffer_head_io.h"
44 44
45static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 45
46 s16 global); 46struct ocfs2_slot {
47static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 47 int sl_valid;
48 s16 slot_num, 48 unsigned int sl_node_num;
49 s16 node_num); 49};
50 50
51/* post the slot information on disk into our slot_info struct. */ 51struct ocfs2_slot_info {
52void ocfs2_update_slot_info(struct ocfs2_slot_info *si) 52 int si_extended;
53 int si_slots_per_block;
54 struct inode *si_inode;
55 unsigned int si_blocks;
56 struct buffer_head **si_bh;
57 unsigned int si_num_slots;
58 struct ocfs2_slot *si_slots;
59};
60
61
62static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
63 unsigned int node_num);
64
65static void ocfs2_invalidate_slot(struct ocfs2_slot_info *si,
66 int slot_num)
67{
68 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
69 si->si_slots[slot_num].sl_valid = 0;
70}
71
72static void ocfs2_set_slot(struct ocfs2_slot_info *si,
73 int slot_num, unsigned int node_num)
74{
75 BUG_ON((slot_num < 0) || (slot_num >= si->si_num_slots));
76
77 si->si_slots[slot_num].sl_valid = 1;
78 si->si_slots[slot_num].sl_node_num = node_num;
79}
80
81/* This version is for the extended slot map */
82static void ocfs2_update_slot_info_extended(struct ocfs2_slot_info *si)
83{
84 int b, i, slotno;
85 struct ocfs2_slot_map_extended *se;
86
87 slotno = 0;
88 for (b = 0; b < si->si_blocks; b++) {
89 se = (struct ocfs2_slot_map_extended *)si->si_bh[b]->b_data;
90 for (i = 0;
91 (i < si->si_slots_per_block) &&
92 (slotno < si->si_num_slots);
93 i++, slotno++) {
94 if (se->se_slots[i].es_valid)
95 ocfs2_set_slot(si, slotno,
96 le32_to_cpu(se->se_slots[i].es_node_num));
97 else
98 ocfs2_invalidate_slot(si, slotno);
99 }
100 }
101}
102
103/*
104 * Post the slot information on disk into our slot_info struct.
105 * Must be protected by osb_lock.
106 */
107static void ocfs2_update_slot_info_old(struct ocfs2_slot_info *si)
53{ 108{
54 int i; 109 int i;
55 __le16 *disk_info; 110 struct ocfs2_slot_map *sm;
56 111
57 /* we don't read the slot block here as ocfs2_super_lock 112 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
58 * should've made sure we have the most recent copy. */
59 spin_lock(&si->si_lock);
60 disk_info = (__le16 *) si->si_bh->b_data;
61 113
62 for (i = 0; i < si->si_size; i++) 114 for (i = 0; i < si->si_num_slots; i++) {
63 si->si_global_node_nums[i] = le16_to_cpu(disk_info[i]); 115 if (le16_to_cpu(sm->sm_slots[i]) == (u16)OCFS2_INVALID_SLOT)
116 ocfs2_invalidate_slot(si, i);
117 else
118 ocfs2_set_slot(si, i, le16_to_cpu(sm->sm_slots[i]));
119 }
120}
64 121
65 spin_unlock(&si->si_lock); 122static void ocfs2_update_slot_info(struct ocfs2_slot_info *si)
123{
124 /*
125 * The slot data will have been refreshed when ocfs2_super_lock
126 * was taken.
127 */
128 if (si->si_extended)
129 ocfs2_update_slot_info_extended(si);
130 else
131 ocfs2_update_slot_info_old(si);
132}
133
134int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
135{
136 int ret;
137 struct ocfs2_slot_info *si = osb->slot_info;
138
139 if (si == NULL)
140 return 0;
141
142 BUG_ON(si->si_blocks == 0);
143 BUG_ON(si->si_bh == NULL);
144
145 mlog(0, "Refreshing slot map, reading %u block(s)\n",
146 si->si_blocks);
147
148 /*
149 * We pass -1 as blocknr because we expect all of si->si_bh to
150 * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If
151 * this is not true, the read of -1 (UINT64_MAX) will fail.
152 */
153 ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0,
154 si->si_inode);
155 if (ret == 0) {
156 spin_lock(&osb->osb_lock);
157 ocfs2_update_slot_info(si);
158 spin_unlock(&osb->osb_lock);
159 }
160
161 return ret;
66} 162}
67 163
68/* post our slot info stuff into its destination bh and write it 164/* post our slot info stuff into its destination bh and write it
69 * out. */ 165 * out. */
70int ocfs2_update_disk_slots(struct ocfs2_super *osb, 166static void ocfs2_update_disk_slot_extended(struct ocfs2_slot_info *si,
71 struct ocfs2_slot_info *si) 167 int slot_num,
168 struct buffer_head **bh)
72{ 169{
73 int status, i; 170 int blkind = slot_num / si->si_slots_per_block;
74 __le16 *disk_info = (__le16 *) si->si_bh->b_data; 171 int slotno = slot_num % si->si_slots_per_block;
172 struct ocfs2_slot_map_extended *se;
173
174 BUG_ON(blkind >= si->si_blocks);
175
176 se = (struct ocfs2_slot_map_extended *)si->si_bh[blkind]->b_data;
177 se->se_slots[slotno].es_valid = si->si_slots[slot_num].sl_valid;
178 if (si->si_slots[slot_num].sl_valid)
179 se->se_slots[slotno].es_node_num =
180 cpu_to_le32(si->si_slots[slot_num].sl_node_num);
181 *bh = si->si_bh[blkind];
182}
75 183
76 spin_lock(&si->si_lock); 184static void ocfs2_update_disk_slot_old(struct ocfs2_slot_info *si,
77 for (i = 0; i < si->si_size; i++) 185 int slot_num,
78 disk_info[i] = cpu_to_le16(si->si_global_node_nums[i]); 186 struct buffer_head **bh)
79 spin_unlock(&si->si_lock); 187{
188 int i;
189 struct ocfs2_slot_map *sm;
190
191 sm = (struct ocfs2_slot_map *)si->si_bh[0]->b_data;
192 for (i = 0; i < si->si_num_slots; i++) {
193 if (si->si_slots[i].sl_valid)
194 sm->sm_slots[i] =
195 cpu_to_le16(si->si_slots[i].sl_node_num);
196 else
197 sm->sm_slots[i] = cpu_to_le16(OCFS2_INVALID_SLOT);
198 }
199 *bh = si->si_bh[0];
200}
201
202static int ocfs2_update_disk_slot(struct ocfs2_super *osb,
203 struct ocfs2_slot_info *si,
204 int slot_num)
205{
206 int status;
207 struct buffer_head *bh;
208
209 spin_lock(&osb->osb_lock);
210 if (si->si_extended)
211 ocfs2_update_disk_slot_extended(si, slot_num, &bh);
212 else
213 ocfs2_update_disk_slot_old(si, slot_num, &bh);
214 spin_unlock(&osb->osb_lock);
80 215
81 status = ocfs2_write_block(osb, si->si_bh, si->si_inode); 216 status = ocfs2_write_block(osb, bh, si->si_inode);
82 if (status < 0) 217 if (status < 0)
83 mlog_errno(status); 218 mlog_errno(status);
84 219
85 return status; 220 return status;
86} 221}
87 222
88/* try to find global node in the slot info. Returns 223/*
89 * OCFS2_INVALID_SLOT if nothing is found. */ 224 * Calculate how many bytes are needed by the slot map. Returns
90static s16 __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 225 * an error if the slot map file is too small.
91 s16 global) 226 */
227static int ocfs2_slot_map_physical_size(struct ocfs2_super *osb,
228 struct inode *inode,
229 unsigned long long *bytes)
92{ 230{
93 int i; 231 unsigned long long bytes_needed;
94 s16 ret = OCFS2_INVALID_SLOT; 232
233 if (ocfs2_uses_extended_slot_map(osb)) {
234 bytes_needed = osb->max_slots *
235 sizeof(struct ocfs2_extended_slot);
236 } else {
237 bytes_needed = osb->max_slots * sizeof(__le16);
238 }
239 if (bytes_needed > i_size_read(inode)) {
240 mlog(ML_ERROR,
241 "Slot map file is too small! (size %llu, needed %llu)\n",
242 i_size_read(inode), bytes_needed);
243 return -ENOSPC;
244 }
245
246 *bytes = bytes_needed;
247 return 0;
248}
249
250/* try to find global node in the slot info. Returns -ENOENT
251 * if nothing is found. */
252static int __ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
253 unsigned int node_num)
254{
255 int i, ret = -ENOENT;
95 256
96 for(i = 0; i < si->si_num_slots; i++) { 257 for(i = 0; i < si->si_num_slots; i++) {
97 if (global == si->si_global_node_nums[i]) { 258 if (si->si_slots[i].sl_valid &&
98 ret = (s16) i; 259 (node_num == si->si_slots[i].sl_node_num)) {
260 ret = i;
99 break; 261 break;
100 } 262 }
101 } 263 }
264
102 return ret; 265 return ret;
103} 266}
104 267
105static s16 __ocfs2_find_empty_slot(struct ocfs2_slot_info *si, s16 preferred) 268static int __ocfs2_find_empty_slot(struct ocfs2_slot_info *si,
269 int preferred)
106{ 270{
107 int i; 271 int i, ret = -ENOSPC;
108 s16 ret = OCFS2_INVALID_SLOT;
109 272
110 if (preferred >= 0 && preferred < si->si_num_slots) { 273 if ((preferred >= 0) && (preferred < si->si_num_slots)) {
111 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[preferred]) { 274 if (!si->si_slots[preferred].sl_valid) {
112 ret = preferred; 275 ret = preferred;
113 goto out; 276 goto out;
114 } 277 }
115 } 278 }
116 279
117 for(i = 0; i < si->si_num_slots; i++) { 280 for(i = 0; i < si->si_num_slots; i++) {
118 if (OCFS2_INVALID_SLOT == si->si_global_node_nums[i]) { 281 if (!si->si_slots[i].sl_valid) {
119 ret = (s16) i; 282 ret = i;
120 break; 283 break;
121 } 284 }
122 } 285 }
@@ -124,58 +287,155 @@ out:
124 return ret; 287 return ret;
125} 288}
126 289
127s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si, 290int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num)
128 s16 global)
129{ 291{
130 s16 ret; 292 int slot;
293 struct ocfs2_slot_info *si = osb->slot_info;
131 294
132 spin_lock(&si->si_lock); 295 spin_lock(&osb->osb_lock);
133 ret = __ocfs2_node_num_to_slot(si, global); 296 slot = __ocfs2_node_num_to_slot(si, node_num);
134 spin_unlock(&si->si_lock); 297 spin_unlock(&osb->osb_lock);
135 return ret; 298
299 return slot;
300}
301
302int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
303 unsigned int *node_num)
304{
305 struct ocfs2_slot_info *si = osb->slot_info;
306
307 assert_spin_locked(&osb->osb_lock);
308
309 BUG_ON(slot_num < 0);
310 BUG_ON(slot_num > osb->max_slots);
311
312 if (!si->si_slots[slot_num].sl_valid)
313 return -ENOENT;
314
315 *node_num = si->si_slots[slot_num].sl_node_num;
316 return 0;
136} 317}
137 318
138static void __ocfs2_fill_slot(struct ocfs2_slot_info *si, 319static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
139 s16 slot_num,
140 s16 node_num)
141{ 320{
142 BUG_ON(slot_num == OCFS2_INVALID_SLOT); 321 unsigned int i;
143 BUG_ON(slot_num >= si->si_num_slots); 322
144 BUG_ON((node_num != O2NM_INVALID_NODE_NUM) && 323 if (si == NULL)
145 (node_num >= O2NM_MAX_NODES)); 324 return;
325
326 if (si->si_inode)
327 iput(si->si_inode);
328 if (si->si_bh) {
329 for (i = 0; i < si->si_blocks; i++) {
330 if (si->si_bh[i]) {
331 brelse(si->si_bh[i]);
332 si->si_bh[i] = NULL;
333 }
334 }
335 kfree(si->si_bh);
336 }
146 337
147 si->si_global_node_nums[slot_num] = node_num; 338 kfree(si);
148} 339}
149 340
150void ocfs2_clear_slot(struct ocfs2_slot_info *si, 341int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num)
151 s16 slot_num)
152{ 342{
153 spin_lock(&si->si_lock); 343 struct ocfs2_slot_info *si = osb->slot_info;
154 __ocfs2_fill_slot(si, slot_num, OCFS2_INVALID_SLOT); 344
155 spin_unlock(&si->si_lock); 345 if (si == NULL)
346 return 0;
347
348 spin_lock(&osb->osb_lock);
349 ocfs2_invalidate_slot(si, slot_num);
350 spin_unlock(&osb->osb_lock);
351
352 return ocfs2_update_disk_slot(osb, osb->slot_info, slot_num);
156} 353}
157 354
158int ocfs2_init_slot_info(struct ocfs2_super *osb) 355static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
356 struct ocfs2_slot_info *si)
159{ 357{
160 int status, i; 358 int status = 0;
161 u64 blkno; 359 u64 blkno;
360 unsigned long long blocks, bytes;
361 unsigned int i;
362 struct buffer_head *bh;
363
364 status = ocfs2_slot_map_physical_size(osb, si->si_inode, &bytes);
365 if (status)
366 goto bail;
367
368 blocks = ocfs2_blocks_for_bytes(si->si_inode->i_sb, bytes);
369 BUG_ON(blocks > UINT_MAX);
370 si->si_blocks = blocks;
371 if (!si->si_blocks)
372 goto bail;
373
374 if (si->si_extended)
375 si->si_slots_per_block =
376 (osb->sb->s_blocksize /
377 sizeof(struct ocfs2_extended_slot));
378 else
379 si->si_slots_per_block = osb->sb->s_blocksize / sizeof(__le16);
380
381 /* The size checks above should ensure this */
382 BUG_ON((osb->max_slots / si->si_slots_per_block) > blocks);
383
384 mlog(0, "Slot map needs %u buffers for %llu bytes\n",
385 si->si_blocks, bytes);
386
387 si->si_bh = kzalloc(sizeof(struct buffer_head *) * si->si_blocks,
388 GFP_KERNEL);
389 if (!si->si_bh) {
390 status = -ENOMEM;
391 mlog_errno(status);
392 goto bail;
393 }
394
395 for (i = 0; i < si->si_blocks; i++) {
396 status = ocfs2_extent_map_get_blocks(si->si_inode, i,
397 &blkno, NULL, NULL);
398 if (status < 0) {
399 mlog_errno(status);
400 goto bail;
401 }
402
403 mlog(0, "Reading slot map block %u at %llu\n", i,
404 (unsigned long long)blkno);
405
406 bh = NULL; /* Acquire a fresh bh */
407 status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode);
408 if (status < 0) {
409 mlog_errno(status);
410 goto bail;
411 }
412
413 si->si_bh[i] = bh;
414 }
415
416bail:
417 return status;
418}
419
420int ocfs2_init_slot_info(struct ocfs2_super *osb)
421{
422 int status;
162 struct inode *inode = NULL; 423 struct inode *inode = NULL;
163 struct buffer_head *bh = NULL;
164 struct ocfs2_slot_info *si; 424 struct ocfs2_slot_info *si;
165 425
166 si = kzalloc(sizeof(struct ocfs2_slot_info), GFP_KERNEL); 426 si = kzalloc(sizeof(struct ocfs2_slot_info) +
427 (sizeof(struct ocfs2_slot) * osb->max_slots),
428 GFP_KERNEL);
167 if (!si) { 429 if (!si) {
168 status = -ENOMEM; 430 status = -ENOMEM;
169 mlog_errno(status); 431 mlog_errno(status);
170 goto bail; 432 goto bail;
171 } 433 }
172 434
173 spin_lock_init(&si->si_lock); 435 si->si_extended = ocfs2_uses_extended_slot_map(osb);
174 si->si_num_slots = osb->max_slots; 436 si->si_num_slots = osb->max_slots;
175 si->si_size = OCFS2_MAX_SLOTS; 437 si->si_slots = (struct ocfs2_slot *)((char *)si +
176 438 sizeof(struct ocfs2_slot_info));
177 for(i = 0; i < si->si_num_slots; i++)
178 si->si_global_node_nums[i] = OCFS2_INVALID_SLOT;
179 439
180 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE, 440 inode = ocfs2_get_system_file_inode(osb, SLOT_MAP_SYSTEM_INODE,
181 OCFS2_INVALID_SLOT); 441 OCFS2_INVALID_SLOT);
@@ -185,61 +445,53 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
185 goto bail; 445 goto bail;
186 } 446 }
187 447
188 status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL); 448 si->si_inode = inode;
189 if (status < 0) { 449 status = ocfs2_map_slot_buffers(osb, si);
190 mlog_errno(status);
191 goto bail;
192 }
193
194 status = ocfs2_read_block(osb, blkno, &bh, 0, inode);
195 if (status < 0) { 450 if (status < 0) {
196 mlog_errno(status); 451 mlog_errno(status);
197 goto bail; 452 goto bail;
198 } 453 }
199 454
200 si->si_inode = inode; 455 osb->slot_info = (struct ocfs2_slot_info *)si;
201 si->si_bh = bh;
202 osb->slot_info = si;
203bail: 456bail:
204 if (status < 0 && si) 457 if (status < 0 && si)
205 ocfs2_free_slot_info(si); 458 __ocfs2_free_slot_info(si);
206 459
207 return status; 460 return status;
208} 461}
209 462
210void ocfs2_free_slot_info(struct ocfs2_slot_info *si) 463void ocfs2_free_slot_info(struct ocfs2_super *osb)
211{ 464{
212 if (si->si_inode) 465 struct ocfs2_slot_info *si = osb->slot_info;
213 iput(si->si_inode); 466
214 if (si->si_bh) 467 osb->slot_info = NULL;
215 brelse(si->si_bh); 468 __ocfs2_free_slot_info(si);
216 kfree(si);
217} 469}
218 470
219int ocfs2_find_slot(struct ocfs2_super *osb) 471int ocfs2_find_slot(struct ocfs2_super *osb)
220{ 472{
221 int status; 473 int status;
222 s16 slot; 474 int slot;
223 struct ocfs2_slot_info *si; 475 struct ocfs2_slot_info *si;
224 476
225 mlog_entry_void(); 477 mlog_entry_void();
226 478
227 si = osb->slot_info; 479 si = osb->slot_info;
228 480
481 spin_lock(&osb->osb_lock);
229 ocfs2_update_slot_info(si); 482 ocfs2_update_slot_info(si);
230 483
231 spin_lock(&si->si_lock);
232 /* search for ourselves first and take the slot if it already 484 /* search for ourselves first and take the slot if it already
233 * exists. Perhaps we need to mark this in a variable for our 485 * exists. Perhaps we need to mark this in a variable for our
234 * own journal recovery? Possibly not, though we certainly 486 * own journal recovery? Possibly not, though we certainly
235 * need to warn to the user */ 487 * need to warn to the user */
236 slot = __ocfs2_node_num_to_slot(si, osb->node_num); 488 slot = __ocfs2_node_num_to_slot(si, osb->node_num);
237 if (slot == OCFS2_INVALID_SLOT) { 489 if (slot < 0) {
238 /* if no slot yet, then just take 1st available 490 /* if no slot yet, then just take 1st available
239 * one. */ 491 * one. */
240 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot); 492 slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
241 if (slot == OCFS2_INVALID_SLOT) { 493 if (slot < 0) {
242 spin_unlock(&si->si_lock); 494 spin_unlock(&osb->osb_lock);
243 mlog(ML_ERROR, "no free slots available!\n"); 495 mlog(ML_ERROR, "no free slots available!\n");
244 status = -EINVAL; 496 status = -EINVAL;
245 goto bail; 497 goto bail;
@@ -248,13 +500,13 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
248 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 500 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n",
249 slot); 501 slot);
250 502
251 __ocfs2_fill_slot(si, slot, osb->node_num); 503 ocfs2_set_slot(si, slot, osb->node_num);
252 osb->slot_num = slot; 504 osb->slot_num = slot;
253 spin_unlock(&si->si_lock); 505 spin_unlock(&osb->osb_lock);
254 506
255 mlog(0, "taking node slot %d\n", osb->slot_num); 507 mlog(0, "taking node slot %d\n", osb->slot_num);
256 508
257 status = ocfs2_update_disk_slots(osb, si); 509 status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
258 if (status < 0) 510 if (status < 0)
259 mlog_errno(status); 511 mlog_errno(status);
260 512
@@ -265,27 +517,27 @@ bail:
265 517
266void ocfs2_put_slot(struct ocfs2_super *osb) 518void ocfs2_put_slot(struct ocfs2_super *osb)
267{ 519{
268 int status; 520 int status, slot_num;
269 struct ocfs2_slot_info *si = osb->slot_info; 521 struct ocfs2_slot_info *si = osb->slot_info;
270 522
271 if (!si) 523 if (!si)
272 return; 524 return;
273 525
526 spin_lock(&osb->osb_lock);
274 ocfs2_update_slot_info(si); 527 ocfs2_update_slot_info(si);
275 528
276 spin_lock(&si->si_lock); 529 slot_num = osb->slot_num;
277 __ocfs2_fill_slot(si, osb->slot_num, OCFS2_INVALID_SLOT); 530 ocfs2_invalidate_slot(si, osb->slot_num);
278 osb->slot_num = OCFS2_INVALID_SLOT; 531 osb->slot_num = OCFS2_INVALID_SLOT;
279 spin_unlock(&si->si_lock); 532 spin_unlock(&osb->osb_lock);
280 533
281 status = ocfs2_update_disk_slots(osb, si); 534 status = ocfs2_update_disk_slot(osb, si, slot_num);
282 if (status < 0) { 535 if (status < 0) {
283 mlog_errno(status); 536 mlog_errno(status);
284 goto bail; 537 goto bail;
285 } 538 }
286 539
287bail: 540bail:
288 osb->slot_info = NULL; 541 ocfs2_free_slot_info(osb);
289 ocfs2_free_slot_info(si);
290} 542}
291 543
diff --git a/fs/ocfs2/slot_map.h b/fs/ocfs2/slot_map.h
index 1025872aaade..601c95fd7003 100644
--- a/fs/ocfs2/slot_map.h
+++ b/fs/ocfs2/slot_map.h
@@ -27,38 +27,18 @@
27#ifndef SLOTMAP_H 27#ifndef SLOTMAP_H
28#define SLOTMAP_H 28#define SLOTMAP_H
29 29
30struct ocfs2_slot_info {
31 spinlock_t si_lock;
32
33 struct inode *si_inode;
34 struct buffer_head *si_bh;
35 unsigned int si_num_slots;
36 unsigned int si_size;
37 s16 si_global_node_nums[OCFS2_MAX_SLOTS];
38};
39
40int ocfs2_init_slot_info(struct ocfs2_super *osb); 30int ocfs2_init_slot_info(struct ocfs2_super *osb);
41void ocfs2_free_slot_info(struct ocfs2_slot_info *si); 31void ocfs2_free_slot_info(struct ocfs2_super *osb);
42 32
43int ocfs2_find_slot(struct ocfs2_super *osb); 33int ocfs2_find_slot(struct ocfs2_super *osb);
44void ocfs2_put_slot(struct ocfs2_super *osb); 34void ocfs2_put_slot(struct ocfs2_super *osb);
45 35
46void ocfs2_update_slot_info(struct ocfs2_slot_info *si); 36int ocfs2_refresh_slot_info(struct ocfs2_super *osb);
47int ocfs2_update_disk_slots(struct ocfs2_super *osb,
48 struct ocfs2_slot_info *si);
49
50s16 ocfs2_node_num_to_slot(struct ocfs2_slot_info *si,
51 s16 global);
52void ocfs2_clear_slot(struct ocfs2_slot_info *si,
53 s16 slot_num);
54 37
55static inline int ocfs2_is_empty_slot(struct ocfs2_slot_info *si, 38int ocfs2_node_num_to_slot(struct ocfs2_super *osb, unsigned int node_num);
56 int slot_num) 39int ocfs2_slot_to_node_num_locked(struct ocfs2_super *osb, int slot_num,
57{ 40 unsigned int *node_num);
58 BUG_ON(slot_num == OCFS2_INVALID_SLOT);
59 assert_spin_locked(&si->si_lock);
60 41
61 return si->si_global_node_nums[slot_num] == OCFS2_INVALID_SLOT; 42int ocfs2_clear_slot(struct ocfs2_super *osb, int slot_num);
62}
63 43
64#endif 44#endif
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
new file mode 100644
index 000000000000..ac1d74c63bf5
--- /dev/null
+++ b/fs/ocfs2/stack_o2cb.c
@@ -0,0 +1,420 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_o2cb.c
5 *
6 * Code which interfaces ocfs2 with the o2cb stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/crc32.h>
21#include <linux/module.h>
22
23/* Needed for AOP_TRUNCATED_PAGE in mlog_errno() */
24#include <linux/fs.h>
25
26#include "cluster/masklog.h"
27#include "cluster/nodemanager.h"
28#include "cluster/heartbeat.h"
29
30#include "stackglue.h"
31
32struct o2dlm_private {
33 struct dlm_eviction_cb op_eviction_cb;
34};
35
36static struct ocfs2_stack_plugin o2cb_stack;
37
38/* These should be identical */
39#if (DLM_LOCK_IV != LKM_IVMODE)
40# error Lock modes do not match
41#endif
42#if (DLM_LOCK_NL != LKM_NLMODE)
43# error Lock modes do not match
44#endif
45#if (DLM_LOCK_CR != LKM_CRMODE)
46# error Lock modes do not match
47#endif
48#if (DLM_LOCK_CW != LKM_CWMODE)
49# error Lock modes do not match
50#endif
51#if (DLM_LOCK_PR != LKM_PRMODE)
52# error Lock modes do not match
53#endif
54#if (DLM_LOCK_PW != LKM_PWMODE)
55# error Lock modes do not match
56#endif
57#if (DLM_LOCK_EX != LKM_EXMODE)
58# error Lock modes do not match
59#endif
60static inline int mode_to_o2dlm(int mode)
61{
62 BUG_ON(mode > LKM_MAXMODE);
63
64 return mode;
65}
66
67#define map_flag(_generic, _o2dlm) \
68 if (flags & (_generic)) { \
69 flags &= ~(_generic); \
70 o2dlm_flags |= (_o2dlm); \
71 }
72static int flags_to_o2dlm(u32 flags)
73{
74 int o2dlm_flags = 0;
75
76 map_flag(DLM_LKF_NOQUEUE, LKM_NOQUEUE);
77 map_flag(DLM_LKF_CANCEL, LKM_CANCEL);
78 map_flag(DLM_LKF_CONVERT, LKM_CONVERT);
79 map_flag(DLM_LKF_VALBLK, LKM_VALBLK);
80 map_flag(DLM_LKF_IVVALBLK, LKM_INVVALBLK);
81 map_flag(DLM_LKF_ORPHAN, LKM_ORPHAN);
82 map_flag(DLM_LKF_FORCEUNLOCK, LKM_FORCE);
83 map_flag(DLM_LKF_TIMEOUT, LKM_TIMEOUT);
84 map_flag(DLM_LKF_LOCAL, LKM_LOCAL);
85
86 /* map_flag() should have cleared every flag passed in */
87 BUG_ON(flags != 0);
88
89 return o2dlm_flags;
90}
91#undef map_flag
92
93/*
94 * Map an o2dlm status to standard errno values.
95 *
96 * o2dlm only uses a handful of these, and returns even fewer to the
97 * caller. Still, we try to assign sane values to each error.
98 *
99 * The following value pairs have special meanings to dlmglue, thus
100 * the right hand side needs to stay unique - never duplicate the
101 * mapping elsewhere in the table!
102 *
103 * DLM_NORMAL: 0
104 * DLM_NOTQUEUED: -EAGAIN
105 * DLM_CANCELGRANT: -EBUSY
106 * DLM_CANCEL: -DLM_ECANCEL
107 */
108/* Keep in sync with dlmapi.h */
109static int status_map[] = {
110 [DLM_NORMAL] = 0, /* Success */
111 [DLM_GRANTED] = -EINVAL,
112 [DLM_DENIED] = -EACCES,
113 [DLM_DENIED_NOLOCKS] = -EACCES,
114 [DLM_WORKING] = -EACCES,
115 [DLM_BLOCKED] = -EINVAL,
116 [DLM_BLOCKED_ORPHAN] = -EINVAL,
117 [DLM_DENIED_GRACE_PERIOD] = -EACCES,
118 [DLM_SYSERR] = -ENOMEM, /* It is what it is */
119 [DLM_NOSUPPORT] = -EPROTO,
120 [DLM_CANCELGRANT] = -EBUSY, /* Cancel after grant */
121 [DLM_IVLOCKID] = -EINVAL,
122 [DLM_SYNC] = -EINVAL,
123 [DLM_BADTYPE] = -EINVAL,
124 [DLM_BADRESOURCE] = -EINVAL,
125 [DLM_MAXHANDLES] = -ENOMEM,
126 [DLM_NOCLINFO] = -EINVAL,
127 [DLM_NOLOCKMGR] = -EINVAL,
128 [DLM_NOPURGED] = -EINVAL,
129 [DLM_BADARGS] = -EINVAL,
130 [DLM_VOID] = -EINVAL,
131 [DLM_NOTQUEUED] = -EAGAIN, /* Trylock failed */
132 [DLM_IVBUFLEN] = -EINVAL,
133 [DLM_CVTUNGRANT] = -EPERM,
134 [DLM_BADPARAM] = -EINVAL,
135 [DLM_VALNOTVALID] = -EINVAL,
136 [DLM_REJECTED] = -EPERM,
137 [DLM_ABORT] = -EINVAL,
138 [DLM_CANCEL] = -DLM_ECANCEL, /* Successful cancel */
139 [DLM_IVRESHANDLE] = -EINVAL,
140 [DLM_DEADLOCK] = -EDEADLK,
141 [DLM_DENIED_NOASTS] = -EINVAL,
142 [DLM_FORWARD] = -EINVAL,
143 [DLM_TIMEOUT] = -ETIMEDOUT,
144 [DLM_IVGROUPID] = -EINVAL,
145 [DLM_VERS_CONFLICT] = -EOPNOTSUPP,
146 [DLM_BAD_DEVICE_PATH] = -ENOENT,
147 [DLM_NO_DEVICE_PERMISSION] = -EPERM,
148 [DLM_NO_CONTROL_DEVICE] = -ENOENT,
149 [DLM_RECOVERING] = -ENOTCONN,
150 [DLM_MIGRATING] = -ERESTART,
151 [DLM_MAXSTATS] = -EINVAL,
152};
153
154static int dlm_status_to_errno(enum dlm_status status)
155{
156 BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0])));
157
158 return status_map[status];
159}
160
161static void o2dlm_lock_ast_wrapper(void *astarg)
162{
163 BUG_ON(o2cb_stack.sp_proto == NULL);
164
165 o2cb_stack.sp_proto->lp_lock_ast(astarg);
166}
167
168static void o2dlm_blocking_ast_wrapper(void *astarg, int level)
169{
170 BUG_ON(o2cb_stack.sp_proto == NULL);
171
172 o2cb_stack.sp_proto->lp_blocking_ast(astarg, level);
173}
174
175static void o2dlm_unlock_ast_wrapper(void *astarg, enum dlm_status status)
176{
177 int error = dlm_status_to_errno(status);
178
179 BUG_ON(o2cb_stack.sp_proto == NULL);
180
181 /*
182 * In o2dlm, you can get both the lock_ast() for the lock being
183 * granted and the unlock_ast() for the CANCEL failing. A
184 * successful cancel sends DLM_NORMAL here. If the
185 * lock grant happened before the cancel arrived, you get
186 * DLM_CANCELGRANT.
187 *
188 * There's no need for the double-ast. If we see DLM_CANCELGRANT,
189 * we just ignore it. We expect the lock_ast() to handle the
190 * granted lock.
191 */
192 if (status == DLM_CANCELGRANT)
193 return;
194
195 o2cb_stack.sp_proto->lp_unlock_ast(astarg, error);
196}
197
198static int o2cb_dlm_lock(struct ocfs2_cluster_connection *conn,
199 int mode,
200 union ocfs2_dlm_lksb *lksb,
201 u32 flags,
202 void *name,
203 unsigned int namelen,
204 void *astarg)
205{
206 enum dlm_status status;
207 int o2dlm_mode = mode_to_o2dlm(mode);
208 int o2dlm_flags = flags_to_o2dlm(flags);
209 int ret;
210
211 status = dlmlock(conn->cc_lockspace, o2dlm_mode, &lksb->lksb_o2dlm,
212 o2dlm_flags, name, namelen,
213 o2dlm_lock_ast_wrapper, astarg,
214 o2dlm_blocking_ast_wrapper);
215 ret = dlm_status_to_errno(status);
216 return ret;
217}
218
219static int o2cb_dlm_unlock(struct ocfs2_cluster_connection *conn,
220 union ocfs2_dlm_lksb *lksb,
221 u32 flags,
222 void *astarg)
223{
224 enum dlm_status status;
225 int o2dlm_flags = flags_to_o2dlm(flags);
226 int ret;
227
228 status = dlmunlock(conn->cc_lockspace, &lksb->lksb_o2dlm,
229 o2dlm_flags, o2dlm_unlock_ast_wrapper, astarg);
230 ret = dlm_status_to_errno(status);
231 return ret;
232}
233
234static int o2cb_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
235{
236 return dlm_status_to_errno(lksb->lksb_o2dlm.status);
237}
238
239static void *o2cb_dlm_lvb(union ocfs2_dlm_lksb *lksb)
240{
241 return (void *)(lksb->lksb_o2dlm.lvb);
242}
243
244static void o2cb_dump_lksb(union ocfs2_dlm_lksb *lksb)
245{
246 dlm_print_one_lock(lksb->lksb_o2dlm.lockid);
247}
248
249/*
250 * Called from the dlm when it's about to evict a node. This is how the
251 * classic stack signals node death.
252 */
253static void o2dlm_eviction_cb(int node_num, void *data)
254{
255 struct ocfs2_cluster_connection *conn = data;
256
257 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n",
258 node_num, conn->cc_namelen, conn->cc_name);
259
260 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
261}
262
263static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
264{
265 int rc = 0;
266 u32 dlm_key;
267 struct dlm_ctxt *dlm;
268 struct o2dlm_private *priv;
269 struct dlm_protocol_version dlm_version;
270
271 BUG_ON(conn == NULL);
272 BUG_ON(o2cb_stack.sp_proto == NULL);
273
274 /* for now we only have one cluster/node, make sure we see it
275 * in the heartbeat universe */
276 if (!o2hb_check_local_node_heartbeating()) {
277 rc = -EINVAL;
278 goto out;
279 }
280
281 priv = kzalloc(sizeof(struct o2dlm_private), GFP_KERNEL);
282 if (!priv) {
283 rc = -ENOMEM;
284 goto out_free;
285 }
286
287 /* This just fills the structure in. It is safe to pass conn. */
288 dlm_setup_eviction_cb(&priv->op_eviction_cb, o2dlm_eviction_cb,
289 conn);
290
291 conn->cc_private = priv;
292
293 /* used by the dlm code to make message headers unique, each
294 * node in this domain must agree on this. */
295 dlm_key = crc32_le(0, conn->cc_name, conn->cc_namelen);
296 dlm_version.pv_major = conn->cc_version.pv_major;
297 dlm_version.pv_minor = conn->cc_version.pv_minor;
298
299 dlm = dlm_register_domain(conn->cc_name, dlm_key, &dlm_version);
300 if (IS_ERR(dlm)) {
301 rc = PTR_ERR(dlm);
302 mlog_errno(rc);
303 goto out_free;
304 }
305
306 conn->cc_version.pv_major = dlm_version.pv_major;
307 conn->cc_version.pv_minor = dlm_version.pv_minor;
308 conn->cc_lockspace = dlm;
309
310 dlm_register_eviction_cb(dlm, &priv->op_eviction_cb);
311
312out_free:
313 if (rc && conn->cc_private)
314 kfree(conn->cc_private);
315
316out:
317 return rc;
318}
319
320static int o2cb_cluster_disconnect(struct ocfs2_cluster_connection *conn,
321 int hangup_pending)
322{
323 struct dlm_ctxt *dlm = conn->cc_lockspace;
324 struct o2dlm_private *priv = conn->cc_private;
325
326 dlm_unregister_eviction_cb(&priv->op_eviction_cb);
327 conn->cc_private = NULL;
328 kfree(priv);
329
330 dlm_unregister_domain(dlm);
331 conn->cc_lockspace = NULL;
332
333 return 0;
334}
335
336static void o2hb_stop(const char *group)
337{
338 int ret;
339 char *argv[5], *envp[3];
340
341 argv[0] = (char *)o2nm_get_hb_ctl_path();
342 argv[1] = "-K";
343 argv[2] = "-u";
344 argv[3] = (char *)group;
345 argv[4] = NULL;
346
347 mlog(0, "Run: %s %s %s %s\n", argv[0], argv[1], argv[2], argv[3]);
348
349 /* minimal command environment taken from cpu_run_sbin_hotplug */
350 envp[0] = "HOME=/";
351 envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
352 envp[2] = NULL;
353
354 ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
355 if (ret < 0)
356 mlog_errno(ret);
357}
358
359/*
360 * Hangup is a hack for tools compatibility. Older ocfs2-tools software
361 * expects the filesystem to call "ocfs2_hb_ctl" during unmount. This
362 * happens regardless of whether the DLM got started, so we can't do it
363 * in ocfs2_cluster_disconnect(). We bring the o2hb_stop() function into
364 * the glue and provide a "hangup" API for super.c to call.
365 *
366 * Other stacks will eventually provide a NULL ->hangup() pointer.
367 */
/* ->hangup(): stop heartbeat for @group; @grouplen is not used here. */
static void o2cb_cluster_hangup(const char *group, int grouplen)
{
	o2hb_stop(group);
}
372
373static int o2cb_cluster_this_node(unsigned int *node)
374{
375 int node_num;
376
377 node_num = o2nm_this_node();
378 if (node_num == O2NM_INVALID_NODE_NUM)
379 return -ENOENT;
380
381 if (node_num >= O2NM_MAX_NODES)
382 return -EOVERFLOW;
383
384 *node = node_num;
385 return 0;
386}
387
388struct ocfs2_stack_operations o2cb_stack_ops = {
389 .connect = o2cb_cluster_connect,
390 .disconnect = o2cb_cluster_disconnect,
391 .hangup = o2cb_cluster_hangup,
392 .this_node = o2cb_cluster_this_node,
393 .dlm_lock = o2cb_dlm_lock,
394 .dlm_unlock = o2cb_dlm_unlock,
395 .lock_status = o2cb_dlm_lock_status,
396 .lock_lvb = o2cb_dlm_lvb,
397 .dump_lksb = o2cb_dump_lksb,
398};
399
400static struct ocfs2_stack_plugin o2cb_stack = {
401 .sp_name = "o2cb",
402 .sp_ops = &o2cb_stack_ops,
403 .sp_owner = THIS_MODULE,
404};
405
406static int __init o2cb_stack_init(void)
407{
408 return ocfs2_stack_glue_register(&o2cb_stack);
409}
410
411static void __exit o2cb_stack_exit(void)
412{
413 ocfs2_stack_glue_unregister(&o2cb_stack);
414}
415
416MODULE_AUTHOR("Oracle");
417MODULE_DESCRIPTION("ocfs2 driver for the classic o2cb stack");
418MODULE_LICENSE("GPL");
419module_init(o2cb_stack_init);
420module_exit(o2cb_stack_exit);
diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c
new file mode 100644
index 000000000000..7428663f9cbb
--- /dev/null
+++ b/fs/ocfs2/stack_user.c
@@ -0,0 +1,883 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stack_user.c
5 *
6 * Code which interfaces ocfs2 with fs/dlm and a userspace stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20#include <linux/module.h>
21#include <linux/fs.h>
22#include <linux/miscdevice.h>
23#include <linux/mutex.h>
24#include <linux/reboot.h>
25#include <asm/uaccess.h>
26
27#include "ocfs2.h" /* For struct ocfs2_lock_res */
28#include "stackglue.h"
29
30
31/*
32 * The control protocol starts with a handshake. Until the handshake
33 * is complete, the control device will fail all write(2)s.
34 *
35 * The handshake is simple. First, the client reads until EOF. Each line
36 * of output is a supported protocol tag. All protocol tags are a single
37 * character followed by a two hex digit version number. Currently the
38 * only thing supported is T01, for "Text-based version 0x01". Next, the
39 * client writes the version they would like to use, including the newline.
40 * Thus, the protocol tag is 'T01\n'. If the version tag written is
41 * unknown, -EINVAL is returned. Once the negotiation is complete, the
42 * client can start sending messages.
43 *
44 * The T01 protocol has three messages. First is the "SETN" message.
45 * It has the following syntax:
46 *
47 * SETN<space><8-char-hex-nodenum><newline>
48 *
49 * This is 14 characters.
50 *
51 * The "SETN" message must be the first message following the protocol.
52 * It tells ocfs2_control the local node number.
53 *
54 * Next comes the "SETV" message. It has the following syntax:
55 *
56 * SETV<space><2-char-hex-major><space><2-char-hex-minor><newline>
57 *
58 * This is 11 characters.
59 *
60 * The "SETV" message sets the filesystem locking protocol version as
61 * negotiated by the client. The client negotiates based on the maximum
62 * version advertised in /sys/fs/ocfs2/max_locking_protocol. The major
63 * number from the "SETV" message must match
64 * user_stack.sp_proto->lp_max_version.pv_major, and the minor number
65 * must be less than or equal to ...->lp_max_version.pv_minor.
66 *
67 * Once this information has been set, mounts will be allowed. From this
68 * point on, the "DOWN" message can be sent for node down notification.
69 * It has the following syntax:
70 *
71 * DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline>
72 *
73 * eg:
74 *
75 * DOWN 632A924FDD844190BDA93C0DF6B94899 00000001\n
76 *
77 * This is 47 characters.
78 */
79
80/*
81 * Whether or not the client has done the handshake.
82 * For now, we have just one protocol version.
83 */
84#define OCFS2_CONTROL_PROTO "T01\n"
85#define OCFS2_CONTROL_PROTO_LEN 4
86
87/* Handshake states */
88#define OCFS2_CONTROL_HANDSHAKE_INVALID (0)
89#define OCFS2_CONTROL_HANDSHAKE_READ (1)
90#define OCFS2_CONTROL_HANDSHAKE_PROTOCOL (2)
91#define OCFS2_CONTROL_HANDSHAKE_VALID (3)
92
93/* Messages */
94#define OCFS2_CONTROL_MESSAGE_OP_LEN 4
95#define OCFS2_CONTROL_MESSAGE_SETNODE_OP "SETN"
96#define OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN 14
97#define OCFS2_CONTROL_MESSAGE_SETVERSION_OP "SETV"
98#define OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN 11
99#define OCFS2_CONTROL_MESSAGE_DOWN_OP "DOWN"
100#define OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN 47
101#define OCFS2_TEXT_UUID_LEN 32
102#define OCFS2_CONTROL_MESSAGE_VERNUM_LEN 2
103#define OCFS2_CONTROL_MESSAGE_NODENUM_LEN 8
104
105/*
106 * ocfs2_live_connection is refcounted because the filesystem and
107 * miscdevice sides can detach in different order. Let's just be safe.
108 */
109struct ocfs2_live_connection {
110 struct list_head oc_list;
111 struct ocfs2_cluster_connection *oc_conn;
112};
113
114struct ocfs2_control_private {
115 struct list_head op_list;
116 int op_state;
117 int op_this_node;
118 struct ocfs2_protocol_version op_proto;
119};
120
121/* SETN<space><8-char-hex-nodenum><newline> */
122struct ocfs2_control_message_setn {
123 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
124 char space;
125 char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
126 char newline;
127};
128
129/* SETV<space><2-char-hex-major><space><2-char-hex-minor><newline> */
130struct ocfs2_control_message_setv {
131 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
132 char space1;
133 char major[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
134 char space2;
135 char minor[OCFS2_CONTROL_MESSAGE_VERNUM_LEN];
136 char newline;
137};
138
139/* DOWN<space><32-char-cap-hex-uuid><space><8-char-hex-nodenum><newline> */
140struct ocfs2_control_message_down {
141 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
142 char space1;
143 char uuid[OCFS2_TEXT_UUID_LEN];
144 char space2;
145 char nodestr[OCFS2_CONTROL_MESSAGE_NODENUM_LEN];
146 char newline;
147};
148
149union ocfs2_control_message {
150 char tag[OCFS2_CONTROL_MESSAGE_OP_LEN];
151 struct ocfs2_control_message_setn u_setn;
152 struct ocfs2_control_message_setv u_setv;
153 struct ocfs2_control_message_down u_down;
154};
155
156static struct ocfs2_stack_plugin user_stack;
157
158static atomic_t ocfs2_control_opened;
159static int ocfs2_control_this_node = -1;
160static struct ocfs2_protocol_version running_proto;
161
162static LIST_HEAD(ocfs2_live_connection_list);
163static LIST_HEAD(ocfs2_control_private_list);
164static DEFINE_MUTEX(ocfs2_control_lock);
165
166static inline void ocfs2_control_set_handshake_state(struct file *file,
167 int state)
168{
169 struct ocfs2_control_private *p = file->private_data;
170 p->op_state = state;
171}
172
173static inline int ocfs2_control_get_handshake_state(struct file *file)
174{
175 struct ocfs2_control_private *p = file->private_data;
176 return p->op_state;
177}
178
179static struct ocfs2_live_connection *ocfs2_connection_find(const char *name)
180{
181 size_t len = strlen(name);
182 struct ocfs2_live_connection *c;
183
184 BUG_ON(!mutex_is_locked(&ocfs2_control_lock));
185
186 list_for_each_entry(c, &ocfs2_live_connection_list, oc_list) {
187 if ((c->oc_conn->cc_namelen == len) &&
188 !strncmp(c->oc_conn->cc_name, name, len))
189 return c;
190 }
191
192 return c;
193}
194
195/*
196 * ocfs2_live_connection structures are created underneath the ocfs2
197 * mount path. Since the VFS prevents multiple calls to
198 * fill_super(), we can't get dupes here.
199 */
200static int ocfs2_live_connection_new(struct ocfs2_cluster_connection *conn,
201 struct ocfs2_live_connection **c_ret)
202{
203 int rc = 0;
204 struct ocfs2_live_connection *c;
205
206 c = kzalloc(sizeof(struct ocfs2_live_connection), GFP_KERNEL);
207 if (!c)
208 return -ENOMEM;
209
210 mutex_lock(&ocfs2_control_lock);
211 c->oc_conn = conn;
212
213 if (atomic_read(&ocfs2_control_opened))
214 list_add(&c->oc_list, &ocfs2_live_connection_list);
215 else {
216 printk(KERN_ERR
217 "ocfs2: Userspace control daemon is not present\n");
218 rc = -ESRCH;
219 }
220
221 mutex_unlock(&ocfs2_control_lock);
222
223 if (!rc)
224 *c_ret = c;
225 else
226 kfree(c);
227
228 return rc;
229}
230
231/*
232 * This function disconnects the cluster connection from ocfs2_control.
233 * Afterwards, userspace can't affect the cluster connection.
234 */
235static void ocfs2_live_connection_drop(struct ocfs2_live_connection *c)
236{
237 mutex_lock(&ocfs2_control_lock);
238 list_del_init(&c->oc_list);
239 c->oc_conn = NULL;
240 mutex_unlock(&ocfs2_control_lock);
241
242 kfree(c);
243}
244
245static int ocfs2_control_cfu(void *target, size_t target_len,
246 const char __user *buf, size_t count)
247{
248 /* The T01 expects write(2) calls to have exactly one command */
249 if ((count != target_len) ||
250 (count > sizeof(union ocfs2_control_message)))
251 return -EINVAL;
252
253 if (copy_from_user(target, buf, target_len))
254 return -EFAULT;
255
256 return 0;
257}
258
259static ssize_t ocfs2_control_validate_protocol(struct file *file,
260 const char __user *buf,
261 size_t count)
262{
263 ssize_t ret;
264 char kbuf[OCFS2_CONTROL_PROTO_LEN];
265
266 ret = ocfs2_control_cfu(kbuf, OCFS2_CONTROL_PROTO_LEN,
267 buf, count);
268 if (ret)
269 return ret;
270
271 if (strncmp(kbuf, OCFS2_CONTROL_PROTO, OCFS2_CONTROL_PROTO_LEN))
272 return -EINVAL;
273
274 ocfs2_control_set_handshake_state(file,
275 OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
276
277 return count;
278}
279
280static void ocfs2_control_send_down(const char *uuid,
281 int nodenum)
282{
283 struct ocfs2_live_connection *c;
284
285 mutex_lock(&ocfs2_control_lock);
286
287 c = ocfs2_connection_find(uuid);
288 if (c) {
289 BUG_ON(c->oc_conn == NULL);
290 c->oc_conn->cc_recovery_handler(nodenum,
291 c->oc_conn->cc_recovery_data);
292 }
293
294 mutex_unlock(&ocfs2_control_lock);
295}
296
297/*
298 * Called whenever configuration elements are sent to /dev/ocfs2_control.
299 * If all configuration elements are present, try to set the global
300 * values. If there is a problem, return an error. Skip any missing
301 * elements, and only bump ocfs2_control_opened when we have all elements
302 * and are successful.
303 */
304static int ocfs2_control_install_private(struct file *file)
305{
306 int rc = 0;
307 int set_p = 1;
308 struct ocfs2_control_private *p = file->private_data;
309
310 BUG_ON(p->op_state != OCFS2_CONTROL_HANDSHAKE_PROTOCOL);
311
312 mutex_lock(&ocfs2_control_lock);
313
314 if (p->op_this_node < 0) {
315 set_p = 0;
316 } else if ((ocfs2_control_this_node >= 0) &&
317 (ocfs2_control_this_node != p->op_this_node)) {
318 rc = -EINVAL;
319 goto out_unlock;
320 }
321
322 if (!p->op_proto.pv_major) {
323 set_p = 0;
324 } else if (!list_empty(&ocfs2_live_connection_list) &&
325 ((running_proto.pv_major != p->op_proto.pv_major) ||
326 (running_proto.pv_minor != p->op_proto.pv_minor))) {
327 rc = -EINVAL;
328 goto out_unlock;
329 }
330
331 if (set_p) {
332 ocfs2_control_this_node = p->op_this_node;
333 running_proto.pv_major = p->op_proto.pv_major;
334 running_proto.pv_minor = p->op_proto.pv_minor;
335 }
336
337out_unlock:
338 mutex_unlock(&ocfs2_control_lock);
339
340 if (!rc && set_p) {
341 /* We set the global values successfully */
342 atomic_inc(&ocfs2_control_opened);
343 ocfs2_control_set_handshake_state(file,
344 OCFS2_CONTROL_HANDSHAKE_VALID);
345 }
346
347 return rc;
348}
349
350static int ocfs2_control_get_this_node(void)
351{
352 int rc;
353
354 mutex_lock(&ocfs2_control_lock);
355 if (ocfs2_control_this_node < 0)
356 rc = -EINVAL;
357 else
358 rc = ocfs2_control_this_node;
359 mutex_unlock(&ocfs2_control_lock);
360
361 return rc;
362}
363
364static int ocfs2_control_do_setnode_msg(struct file *file,
365 struct ocfs2_control_message_setn *msg)
366{
367 long nodenum;
368 char *ptr = NULL;
369 struct ocfs2_control_private *p = file->private_data;
370
371 if (ocfs2_control_get_handshake_state(file) !=
372 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
373 return -EINVAL;
374
375 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
376 OCFS2_CONTROL_MESSAGE_OP_LEN))
377 return -EINVAL;
378
379 if ((msg->space != ' ') || (msg->newline != '\n'))
380 return -EINVAL;
381 msg->space = msg->newline = '\0';
382
383 nodenum = simple_strtol(msg->nodestr, &ptr, 16);
384 if (!ptr || *ptr)
385 return -EINVAL;
386
387 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
388 (nodenum > INT_MAX) || (nodenum < 0))
389 return -ERANGE;
390 p->op_this_node = nodenum;
391
392 return ocfs2_control_install_private(file);
393}
394
395static int ocfs2_control_do_setversion_msg(struct file *file,
396 struct ocfs2_control_message_setv *msg)
397 {
398 long major, minor;
399 char *ptr = NULL;
400 struct ocfs2_control_private *p = file->private_data;
401 struct ocfs2_protocol_version *max =
402 &user_stack.sp_proto->lp_max_version;
403
404 if (ocfs2_control_get_handshake_state(file) !=
405 OCFS2_CONTROL_HANDSHAKE_PROTOCOL)
406 return -EINVAL;
407
408 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
409 OCFS2_CONTROL_MESSAGE_OP_LEN))
410 return -EINVAL;
411
412 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
413 (msg->newline != '\n'))
414 return -EINVAL;
415 msg->space1 = msg->space2 = msg->newline = '\0';
416
417 major = simple_strtol(msg->major, &ptr, 16);
418 if (!ptr || *ptr)
419 return -EINVAL;
420 minor = simple_strtol(msg->minor, &ptr, 16);
421 if (!ptr || *ptr)
422 return -EINVAL;
423
424 /*
425 * The major must be between 1 and 255, inclusive. The minor
426 * must be between 0 and 255, inclusive. The version passed in
427 * must be within the maximum version supported by the filesystem.
428 */
429 if ((major == LONG_MIN) || (major == LONG_MAX) ||
430 (major > (u8)-1) || (major < 1))
431 return -ERANGE;
432 if ((minor == LONG_MIN) || (minor == LONG_MAX) ||
433 (minor > (u8)-1) || (minor < 0))
434 return -ERANGE;
435 if ((major != max->pv_major) ||
436 (minor > max->pv_minor))
437 return -EINVAL;
438
439 p->op_proto.pv_major = major;
440 p->op_proto.pv_minor = minor;
441
442 return ocfs2_control_install_private(file);
443}
444
445static int ocfs2_control_do_down_msg(struct file *file,
446 struct ocfs2_control_message_down *msg)
447{
448 long nodenum;
449 char *p = NULL;
450
451 if (ocfs2_control_get_handshake_state(file) !=
452 OCFS2_CONTROL_HANDSHAKE_VALID)
453 return -EINVAL;
454
455 if (strncmp(msg->tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
456 OCFS2_CONTROL_MESSAGE_OP_LEN))
457 return -EINVAL;
458
459 if ((msg->space1 != ' ') || (msg->space2 != ' ') ||
460 (msg->newline != '\n'))
461 return -EINVAL;
462 msg->space1 = msg->space2 = msg->newline = '\0';
463
464 nodenum = simple_strtol(msg->nodestr, &p, 16);
465 if (!p || *p)
466 return -EINVAL;
467
468 if ((nodenum == LONG_MIN) || (nodenum == LONG_MAX) ||
469 (nodenum > INT_MAX) || (nodenum < 0))
470 return -ERANGE;
471
472 ocfs2_control_send_down(msg->uuid, nodenum);
473
474 return 0;
475}
476
477static ssize_t ocfs2_control_message(struct file *file,
478 const char __user *buf,
479 size_t count)
480{
481 ssize_t ret;
482 union ocfs2_control_message msg;
483
484 /* Try to catch padding issues */
485 WARN_ON(offsetof(struct ocfs2_control_message_down, uuid) !=
486 (sizeof(msg.u_down.tag) + sizeof(msg.u_down.space1)));
487
488 memset(&msg, 0, sizeof(union ocfs2_control_message));
489 ret = ocfs2_control_cfu(&msg, count, buf, count);
490 if (ret)
491 goto out;
492
493 if ((count == OCFS2_CONTROL_MESSAGE_SETNODE_TOTAL_LEN) &&
494 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETNODE_OP,
495 OCFS2_CONTROL_MESSAGE_OP_LEN))
496 ret = ocfs2_control_do_setnode_msg(file, &msg.u_setn);
497 else if ((count == OCFS2_CONTROL_MESSAGE_SETVERSION_TOTAL_LEN) &&
498 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_SETVERSION_OP,
499 OCFS2_CONTROL_MESSAGE_OP_LEN))
500 ret = ocfs2_control_do_setversion_msg(file, &msg.u_setv);
501 else if ((count == OCFS2_CONTROL_MESSAGE_DOWN_TOTAL_LEN) &&
502 !strncmp(msg.tag, OCFS2_CONTROL_MESSAGE_DOWN_OP,
503 OCFS2_CONTROL_MESSAGE_OP_LEN))
504 ret = ocfs2_control_do_down_msg(file, &msg.u_down);
505 else
506 ret = -EINVAL;
507
508out:
509 return ret ? ret : count;
510}
511
512static ssize_t ocfs2_control_write(struct file *file,
513 const char __user *buf,
514 size_t count,
515 loff_t *ppos)
516{
517 ssize_t ret;
518
519 switch (ocfs2_control_get_handshake_state(file)) {
520 case OCFS2_CONTROL_HANDSHAKE_INVALID:
521 ret = -EINVAL;
522 break;
523
524 case OCFS2_CONTROL_HANDSHAKE_READ:
525 ret = ocfs2_control_validate_protocol(file, buf,
526 count);
527 break;
528
529 case OCFS2_CONTROL_HANDSHAKE_PROTOCOL:
530 case OCFS2_CONTROL_HANDSHAKE_VALID:
531 ret = ocfs2_control_message(file, buf, count);
532 break;
533
534 default:
535 BUG();
536 ret = -EIO;
537 break;
538 }
539
540 return ret;
541}
542
543/*
544 * This is a naive version. If we ever have a new protocol, we'll expand
545 * it. Probably using seq_file.
546 */
547static ssize_t ocfs2_control_read(struct file *file,
548 char __user *buf,
549 size_t count,
550 loff_t *ppos)
551{
552 char *proto_string = OCFS2_CONTROL_PROTO;
553 size_t to_write = 0;
554
555 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
556 return 0;
557
558 to_write = OCFS2_CONTROL_PROTO_LEN - *ppos;
559 if (to_write > count)
560 to_write = count;
561 if (copy_to_user(buf, proto_string + *ppos, to_write))
562 return -EFAULT;
563
564 *ppos += to_write;
565
566 /* Have we read the whole protocol list? */
567 if (*ppos >= OCFS2_CONTROL_PROTO_LEN)
568 ocfs2_control_set_handshake_state(file,
569 OCFS2_CONTROL_HANDSHAKE_READ);
570
571 return to_write;
572}
573
574static int ocfs2_control_release(struct inode *inode, struct file *file)
575{
576 struct ocfs2_control_private *p = file->private_data;
577
578 mutex_lock(&ocfs2_control_lock);
579
580 if (ocfs2_control_get_handshake_state(file) !=
581 OCFS2_CONTROL_HANDSHAKE_VALID)
582 goto out;
583
584 if (atomic_dec_and_test(&ocfs2_control_opened)) {
585 if (!list_empty(&ocfs2_live_connection_list)) {
586 /* XXX: Do bad things! */
587 printk(KERN_ERR
588 "ocfs2: Unexpected release of ocfs2_control!\n"
589 " Loss of cluster connection requires "
590 "an emergency restart!\n");
591 emergency_restart();
592 }
593 /*
594 * Last valid close clears the node number and resets
595 * the locking protocol version
596 */
597 ocfs2_control_this_node = -1;
598 running_proto.pv_major = 0;
599 running_proto.pv_major = 0;
600 }
601
602out:
603 list_del_init(&p->op_list);
604 file->private_data = NULL;
605
606 mutex_unlock(&ocfs2_control_lock);
607
608 kfree(p);
609
610 return 0;
611}
612
613static int ocfs2_control_open(struct inode *inode, struct file *file)
614{
615 struct ocfs2_control_private *p;
616
617 p = kzalloc(sizeof(struct ocfs2_control_private), GFP_KERNEL);
618 if (!p)
619 return -ENOMEM;
620 p->op_this_node = -1;
621
622 mutex_lock(&ocfs2_control_lock);
623 file->private_data = p;
624 list_add(&p->op_list, &ocfs2_control_private_list);
625 mutex_unlock(&ocfs2_control_lock);
626
627 return 0;
628}
629
630static const struct file_operations ocfs2_control_fops = {
631 .open = ocfs2_control_open,
632 .release = ocfs2_control_release,
633 .read = ocfs2_control_read,
634 .write = ocfs2_control_write,
635 .owner = THIS_MODULE,
636};
637
638struct miscdevice ocfs2_control_device = {
639 .minor = MISC_DYNAMIC_MINOR,
640 .name = "ocfs2_control",
641 .fops = &ocfs2_control_fops,
642};
643
644static int ocfs2_control_init(void)
645{
646 int rc;
647
648 atomic_set(&ocfs2_control_opened, 0);
649
650 rc = misc_register(&ocfs2_control_device);
651 if (rc)
652 printk(KERN_ERR
653 "ocfs2: Unable to register ocfs2_control device "
654 "(errno %d)\n",
655 -rc);
656
657 return rc;
658}
659
660static void ocfs2_control_exit(void)
661{
662 int rc;
663
664 rc = misc_deregister(&ocfs2_control_device);
665 if (rc)
666 printk(KERN_ERR
667 "ocfs2: Unable to deregister ocfs2_control device "
668 "(errno %d)\n",
669 -rc);
670}
671
672static struct dlm_lksb *fsdlm_astarg_to_lksb(void *astarg)
673{
674 struct ocfs2_lock_res *res = astarg;
675 return &res->l_lksb.lksb_fsdlm;
676}
677
678static void fsdlm_lock_ast_wrapper(void *astarg)
679{
680 struct dlm_lksb *lksb = fsdlm_astarg_to_lksb(astarg);
681 int status = lksb->sb_status;
682
683 BUG_ON(user_stack.sp_proto == NULL);
684
685 /*
686 * For now we're punting on the issue of other non-standard errors
687 * where we can't tell if the unlock_ast or lock_ast should be called.
688 * The main "other error" that's possible is EINVAL which means the
689 * function was called with invalid args, which shouldn't be possible
690 * since the caller here is under our control. Other non-standard
691 * errors probably fall into the same category, or otherwise are fatal
692 * which means we can't carry on anyway.
693 */
694
695 if (status == -DLM_EUNLOCK || status == -DLM_ECANCEL)
696 user_stack.sp_proto->lp_unlock_ast(astarg, 0);
697 else
698 user_stack.sp_proto->lp_lock_ast(astarg);
699}
700
701static void fsdlm_blocking_ast_wrapper(void *astarg, int level)
702{
703 BUG_ON(user_stack.sp_proto == NULL);
704
705 user_stack.sp_proto->lp_blocking_ast(astarg, level);
706}
707
708static int user_dlm_lock(struct ocfs2_cluster_connection *conn,
709 int mode,
710 union ocfs2_dlm_lksb *lksb,
711 u32 flags,
712 void *name,
713 unsigned int namelen,
714 void *astarg)
715{
716 int ret;
717
718 if (!lksb->lksb_fsdlm.sb_lvbptr)
719 lksb->lksb_fsdlm.sb_lvbptr = (char *)lksb +
720 sizeof(struct dlm_lksb);
721
722 ret = dlm_lock(conn->cc_lockspace, mode, &lksb->lksb_fsdlm,
723 flags|DLM_LKF_NODLCKWT, name, namelen, 0,
724 fsdlm_lock_ast_wrapper, astarg,
725 fsdlm_blocking_ast_wrapper);
726 return ret;
727}
728
729static int user_dlm_unlock(struct ocfs2_cluster_connection *conn,
730 union ocfs2_dlm_lksb *lksb,
731 u32 flags,
732 void *astarg)
733{
734 int ret;
735
736 ret = dlm_unlock(conn->cc_lockspace, lksb->lksb_fsdlm.sb_lkid,
737 flags, &lksb->lksb_fsdlm, astarg);
738 return ret;
739}
740
741static int user_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
742{
743 return lksb->lksb_fsdlm.sb_status;
744}
745
746static void *user_dlm_lvb(union ocfs2_dlm_lksb *lksb)
747{
748 return (void *)(lksb->lksb_fsdlm.sb_lvbptr);
749}
750
/* ->dump_lksb() for the user stack: intentionally does nothing. */
static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
{
	return;
}
754
755/*
756 * Compare a requested locking protocol version against the current one.
757 *
758 * If the major numbers are different, they are incompatible.
759 * If the current minor is greater than the request, they are incompatible.
760 * If the current minor is less than or equal to the request, they are
761 * compatible, and the requester should run at the current minor version.
762 */
763static int fs_protocol_compare(struct ocfs2_protocol_version *existing,
764 struct ocfs2_protocol_version *request)
765{
766 if (existing->pv_major != request->pv_major)
767 return 1;
768
769 if (existing->pv_minor > request->pv_minor)
770 return 1;
771
772 if (existing->pv_minor < request->pv_minor)
773 request->pv_minor = existing->pv_minor;
774
775 return 0;
776}
777
778static int user_cluster_connect(struct ocfs2_cluster_connection *conn)
779{
780 dlm_lockspace_t *fsdlm;
781 struct ocfs2_live_connection *control;
782 int rc = 0;
783
784 BUG_ON(conn == NULL);
785
786 rc = ocfs2_live_connection_new(conn, &control);
787 if (rc)
788 goto out;
789
790 /*
791 * running_proto must have been set before we allowed any mounts
792 * to proceed.
793 */
794 if (fs_protocol_compare(&running_proto, &conn->cc_version)) {
795 printk(KERN_ERR
796 "Unable to mount with fs locking protocol version "
797 "%u.%u because the userspace control daemon has "
798 "negotiated %u.%u\n",
799 conn->cc_version.pv_major, conn->cc_version.pv_minor,
800 running_proto.pv_major, running_proto.pv_minor);
801 rc = -EPROTO;
802 ocfs2_live_connection_drop(control);
803 goto out;
804 }
805
806 rc = dlm_new_lockspace(conn->cc_name, strlen(conn->cc_name),
807 &fsdlm, DLM_LSFL_FS, DLM_LVB_LEN);
808 if (rc) {
809 ocfs2_live_connection_drop(control);
810 goto out;
811 }
812
813 conn->cc_private = control;
814 conn->cc_lockspace = fsdlm;
815out:
816 return rc;
817}
818
819static int user_cluster_disconnect(struct ocfs2_cluster_connection *conn,
820 int hangup_pending)
821{
822 dlm_release_lockspace(conn->cc_lockspace, 2);
823 conn->cc_lockspace = NULL;
824 ocfs2_live_connection_drop(conn->cc_private);
825 conn->cc_private = NULL;
826 return 0;
827}
828
/*
 * ->this_node() for the user stack.  Stores the node number obtained
 * from ocfs2_control_get_this_node() in *this_node and returns 0, or
 * passes a negative result through unchanged.
 */
static int user_cluster_this_node(unsigned int *this_node)
{
	int node = ocfs2_control_get_this_node();

	if (node >= 0) {
		*this_node = node;
		return 0;
	}

	return node;
}
840
841static struct ocfs2_stack_operations user_stack_ops = {
842 .connect = user_cluster_connect,
843 .disconnect = user_cluster_disconnect,
844 .this_node = user_cluster_this_node,
845 .dlm_lock = user_dlm_lock,
846 .dlm_unlock = user_dlm_unlock,
847 .lock_status = user_dlm_lock_status,
848 .lock_lvb = user_dlm_lvb,
849 .dump_lksb = user_dlm_dump_lksb,
850};
851
852static struct ocfs2_stack_plugin user_stack = {
853 .sp_name = "user",
854 .sp_ops = &user_stack_ops,
855 .sp_owner = THIS_MODULE,
856};
857
858
859static int __init user_stack_init(void)
860{
861 int rc;
862
863 rc = ocfs2_control_init();
864 if (!rc) {
865 rc = ocfs2_stack_glue_register(&user_stack);
866 if (rc)
867 ocfs2_control_exit();
868 }
869
870 return rc;
871}
872
873static void __exit user_stack_exit(void)
874{
875 ocfs2_stack_glue_unregister(&user_stack);
876 ocfs2_control_exit();
877}
878
879MODULE_AUTHOR("Oracle");
880MODULE_DESCRIPTION("ocfs2 driver for userspace cluster stacks");
881MODULE_LICENSE("GPL");
882module_init(user_stack_init);
883module_exit(user_stack_exit);
diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c
new file mode 100644
index 000000000000..119f60cea9cc
--- /dev/null
+++ b/fs/ocfs2/stackglue.c
@@ -0,0 +1,568 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.c
5 *
6 * Code which implements an OCFS2 specific interface to underlying
7 * cluster stacks.
8 *
9 * Copyright (C) 2007 Oracle. All rights reserved.
10 *
11 * This program is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU General Public
13 * License as published by the Free Software Foundation, version 2.
14 *
15 * This program is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * General Public License for more details.
19 */
20
21#include <linux/list.h>
22#include <linux/spinlock.h>
23#include <linux/module.h>
24#include <linux/slab.h>
25#include <linux/kmod.h>
26#include <linux/fs.h>
27#include <linux/kobject.h>
28#include <linux/sysfs.h>
29
30#include "ocfs2_fs.h"
31
32#include "stackglue.h"
33
34#define OCFS2_STACK_PLUGIN_O2CB "o2cb"
35#define OCFS2_STACK_PLUGIN_USER "user"
36
37static struct ocfs2_locking_protocol *lproto;
38static DEFINE_SPINLOCK(ocfs2_stack_lock);
39static LIST_HEAD(ocfs2_stack_list);
40static char cluster_stack_name[OCFS2_STACK_LABEL_LEN + 1];
41
42/*
43 * The stack currently in use. If not null, active_stack->sp_count > 0,
44 * the module is pinned, and the locking protocol cannot be changed.
45 */
46static struct ocfs2_stack_plugin *active_stack;
47
48static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name)
49{
50 struct ocfs2_stack_plugin *p;
51
52 assert_spin_locked(&ocfs2_stack_lock);
53
54 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
55 if (!strcmp(p->sp_name, name))
56 return p;
57 }
58
59 return NULL;
60}
61
62static int ocfs2_stack_driver_request(const char *stack_name,
63 const char *plugin_name)
64{
65 int rc;
66 struct ocfs2_stack_plugin *p;
67
68 spin_lock(&ocfs2_stack_lock);
69
70 /*
71 * If the stack passed by the filesystem isn't the selected one,
72 * we can't continue.
73 */
74 if (strcmp(stack_name, cluster_stack_name)) {
75 rc = -EBUSY;
76 goto out;
77 }
78
79 if (active_stack) {
80 /*
81 * If the active stack isn't the one we want, it cannot
82 * be selected right now.
83 */
84 if (!strcmp(active_stack->sp_name, plugin_name))
85 rc = 0;
86 else
87 rc = -EBUSY;
88 goto out;
89 }
90
91 p = ocfs2_stack_lookup(plugin_name);
92 if (!p || !try_module_get(p->sp_owner)) {
93 rc = -ENOENT;
94 goto out;
95 }
96
97 /* Ok, the stack is pinned */
98 p->sp_count++;
99 active_stack = p;
100
101 rc = 0;
102
103out:
104 spin_unlock(&ocfs2_stack_lock);
105 return rc;
106}
107
108/*
109 * This function looks up the appropriate stack and makes it active. If
110 * there is no stack, it tries to load it. It will fail if the stack still
111 * cannot be found. It will also fail if a different stack is in use.
112 */
113static int ocfs2_stack_driver_get(const char *stack_name)
114{
115 int rc;
116 char *plugin_name = OCFS2_STACK_PLUGIN_O2CB;
117
118 /*
119 * Classic stack does not pass in a stack name. This is
120 * compatible with older tools as well.
121 */
122 if (!stack_name || !*stack_name)
123 stack_name = OCFS2_STACK_PLUGIN_O2CB;
124
125 if (strlen(stack_name) != OCFS2_STACK_LABEL_LEN) {
126 printk(KERN_ERR
127 "ocfs2 passed an invalid cluster stack label: \"%s\"\n",
128 stack_name);
129 return -EINVAL;
130 }
131
132 /* Anything that isn't the classic stack is a user stack */
133 if (strcmp(stack_name, OCFS2_STACK_PLUGIN_O2CB))
134 plugin_name = OCFS2_STACK_PLUGIN_USER;
135
136 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
137 if (rc == -ENOENT) {
138 request_module("ocfs2_stack_%s", plugin_name);
139 rc = ocfs2_stack_driver_request(stack_name, plugin_name);
140 }
141
142 if (rc == -ENOENT) {
143 printk(KERN_ERR
144 "ocfs2: Cluster stack driver \"%s\" cannot be found\n",
145 plugin_name);
146 } else if (rc == -EBUSY) {
147 printk(KERN_ERR
148 "ocfs2: A different cluster stack is in use\n");
149 }
150
151 return rc;
152}
153
154static void ocfs2_stack_driver_put(void)
155{
156 spin_lock(&ocfs2_stack_lock);
157 BUG_ON(active_stack == NULL);
158 BUG_ON(active_stack->sp_count == 0);
159
160 active_stack->sp_count--;
161 if (!active_stack->sp_count) {
162 module_put(active_stack->sp_owner);
163 active_stack = NULL;
164 }
165 spin_unlock(&ocfs2_stack_lock);
166}
167
168int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin)
169{
170 int rc;
171
172 spin_lock(&ocfs2_stack_lock);
173 if (!ocfs2_stack_lookup(plugin->sp_name)) {
174 plugin->sp_count = 0;
175 plugin->sp_proto = lproto;
176 list_add(&plugin->sp_list, &ocfs2_stack_list);
177 printk(KERN_INFO "ocfs2: Registered cluster interface %s\n",
178 plugin->sp_name);
179 rc = 0;
180 } else {
181 printk(KERN_ERR "ocfs2: Stack \"%s\" already registered\n",
182 plugin->sp_name);
183 rc = -EEXIST;
184 }
185 spin_unlock(&ocfs2_stack_lock);
186
187 return rc;
188}
189EXPORT_SYMBOL_GPL(ocfs2_stack_glue_register);
190
191void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin)
192{
193 struct ocfs2_stack_plugin *p;
194
195 spin_lock(&ocfs2_stack_lock);
196 p = ocfs2_stack_lookup(plugin->sp_name);
197 if (p) {
198 BUG_ON(p != plugin);
199 BUG_ON(plugin == active_stack);
200 BUG_ON(plugin->sp_count != 0);
201 list_del_init(&plugin->sp_list);
202 printk(KERN_INFO "ocfs2: Unregistered cluster interface %s\n",
203 plugin->sp_name);
204 } else {
205 printk(KERN_ERR "Stack \"%s\" is not registered\n",
206 plugin->sp_name);
207 }
208 spin_unlock(&ocfs2_stack_lock);
209}
210EXPORT_SYMBOL_GPL(ocfs2_stack_glue_unregister);
211
212void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto)
213{
214 struct ocfs2_stack_plugin *p;
215
216 BUG_ON(proto == NULL);
217
218 spin_lock(&ocfs2_stack_lock);
219 BUG_ON(active_stack != NULL);
220
221 lproto = proto;
222 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
223 p->sp_proto = lproto;
224 }
225
226 spin_unlock(&ocfs2_stack_lock);
227}
228EXPORT_SYMBOL_GPL(ocfs2_stack_glue_set_locking_protocol);
229
230
231/*
232 * The ocfs2_dlm_lock() and ocfs2_dlm_unlock() functions take
233 * "struct ocfs2_lock_res *astarg" instead of "void *astarg" because the
234 * underlying stack plugins need to pilfer the lksb off of the lock_res.
235 * If some other structure needs to be passed as an astarg, the plugins
236 * will need to be given a different avenue to the lksb.
237 */
238int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
239 int mode,
240 union ocfs2_dlm_lksb *lksb,
241 u32 flags,
242 void *name,
243 unsigned int namelen,
244 struct ocfs2_lock_res *astarg)
245{
246 BUG_ON(lproto == NULL);
247
248 return active_stack->sp_ops->dlm_lock(conn, mode, lksb, flags,
249 name, namelen, astarg);
250}
251EXPORT_SYMBOL_GPL(ocfs2_dlm_lock);
252
253int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
254 union ocfs2_dlm_lksb *lksb,
255 u32 flags,
256 struct ocfs2_lock_res *astarg)
257{
258 BUG_ON(lproto == NULL);
259
260 return active_stack->sp_ops->dlm_unlock(conn, lksb, flags, astarg);
261}
262EXPORT_SYMBOL_GPL(ocfs2_dlm_unlock);
263
264int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb)
265{
266 return active_stack->sp_ops->lock_status(lksb);
267}
268EXPORT_SYMBOL_GPL(ocfs2_dlm_lock_status);
269
270/*
271 * Why don't we cast to ocfs2_meta_lvb? The "clean" answer is that we
272 * don't cast at the glue level. The real answer is that the header
273 * ordering is nigh impossible.
274 */
275void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb)
276{
277 return active_stack->sp_ops->lock_lvb(lksb);
278}
279EXPORT_SYMBOL_GPL(ocfs2_dlm_lvb);
280
281void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb)
282{
283 active_stack->sp_ops->dump_lksb(lksb);
284}
285EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb);
286
287int ocfs2_cluster_connect(const char *stack_name,
288 const char *group,
289 int grouplen,
290 void (*recovery_handler)(int node_num,
291 void *recovery_data),
292 void *recovery_data,
293 struct ocfs2_cluster_connection **conn)
294{
295 int rc = 0;
296 struct ocfs2_cluster_connection *new_conn;
297
298 BUG_ON(group == NULL);
299 BUG_ON(conn == NULL);
300 BUG_ON(recovery_handler == NULL);
301
302 if (grouplen > GROUP_NAME_MAX) {
303 rc = -EINVAL;
304 goto out;
305 }
306
307 new_conn = kzalloc(sizeof(struct ocfs2_cluster_connection),
308 GFP_KERNEL);
309 if (!new_conn) {
310 rc = -ENOMEM;
311 goto out;
312 }
313
314 memcpy(new_conn->cc_name, group, grouplen);
315 new_conn->cc_namelen = grouplen;
316 new_conn->cc_recovery_handler = recovery_handler;
317 new_conn->cc_recovery_data = recovery_data;
318
319 /* Start the new connection at our maximum compatibility level */
320 new_conn->cc_version = lproto->lp_max_version;
321
322 /* This will pin the stack driver if successful */
323 rc = ocfs2_stack_driver_get(stack_name);
324 if (rc)
325 goto out_free;
326
327 rc = active_stack->sp_ops->connect(new_conn);
328 if (rc) {
329 ocfs2_stack_driver_put();
330 goto out_free;
331 }
332
333 *conn = new_conn;
334
335out_free:
336 if (rc)
337 kfree(new_conn);
338
339out:
340 return rc;
341}
342EXPORT_SYMBOL_GPL(ocfs2_cluster_connect);
343
344/* If hangup_pending is 0, the stack driver will be dropped */
345int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
346 int hangup_pending)
347{
348 int ret;
349
350 BUG_ON(conn == NULL);
351
352 ret = active_stack->sp_ops->disconnect(conn, hangup_pending);
353
354 /* XXX Should we free it anyway? */
355 if (!ret) {
356 kfree(conn);
357 if (!hangup_pending)
358 ocfs2_stack_driver_put();
359 }
360
361 return ret;
362}
363EXPORT_SYMBOL_GPL(ocfs2_cluster_disconnect);
364
365void ocfs2_cluster_hangup(const char *group, int grouplen)
366{
367 BUG_ON(group == NULL);
368 BUG_ON(group[grouplen] != '\0');
369
370 if (active_stack->sp_ops->hangup)
371 active_stack->sp_ops->hangup(group, grouplen);
372
373 /* cluster_disconnect() was called with hangup_pending==1 */
374 ocfs2_stack_driver_put();
375}
376EXPORT_SYMBOL_GPL(ocfs2_cluster_hangup);
377
378int ocfs2_cluster_this_node(unsigned int *node)
379{
380 return active_stack->sp_ops->this_node(node);
381}
382EXPORT_SYMBOL_GPL(ocfs2_cluster_this_node);
383
384
385/*
386 * Sysfs bits
387 */
388
389static ssize_t ocfs2_max_locking_protocol_show(struct kobject *kobj,
390 struct kobj_attribute *attr,
391 char *buf)
392{
393 ssize_t ret = 0;
394
395 spin_lock(&ocfs2_stack_lock);
396 if (lproto)
397 ret = snprintf(buf, PAGE_SIZE, "%u.%u\n",
398 lproto->lp_max_version.pv_major,
399 lproto->lp_max_version.pv_minor);
400 spin_unlock(&ocfs2_stack_lock);
401
402 return ret;
403}
404
405static struct kobj_attribute ocfs2_attr_max_locking_protocol =
406 __ATTR(max_locking_protocol, S_IFREG | S_IRUGO,
407 ocfs2_max_locking_protocol_show, NULL);
408
409static ssize_t ocfs2_loaded_cluster_plugins_show(struct kobject *kobj,
410 struct kobj_attribute *attr,
411 char *buf)
412{
413 ssize_t ret = 0, total = 0, remain = PAGE_SIZE;
414 struct ocfs2_stack_plugin *p;
415
416 spin_lock(&ocfs2_stack_lock);
417 list_for_each_entry(p, &ocfs2_stack_list, sp_list) {
418 ret = snprintf(buf, remain, "%s\n",
419 p->sp_name);
420 if (ret < 0) {
421 total = ret;
422 break;
423 }
424 if (ret == remain) {
425 /* snprintf() didn't fit */
426 total = -E2BIG;
427 break;
428 }
429 total += ret;
430 remain -= ret;
431 }
432 spin_unlock(&ocfs2_stack_lock);
433
434 return total;
435}
436
437static struct kobj_attribute ocfs2_attr_loaded_cluster_plugins =
438 __ATTR(loaded_cluster_plugins, S_IFREG | S_IRUGO,
439 ocfs2_loaded_cluster_plugins_show, NULL);
440
441static ssize_t ocfs2_active_cluster_plugin_show(struct kobject *kobj,
442 struct kobj_attribute *attr,
443 char *buf)
444{
445 ssize_t ret = 0;
446
447 spin_lock(&ocfs2_stack_lock);
448 if (active_stack) {
449 ret = snprintf(buf, PAGE_SIZE, "%s\n",
450 active_stack->sp_name);
451 if (ret == PAGE_SIZE)
452 ret = -E2BIG;
453 }
454 spin_unlock(&ocfs2_stack_lock);
455
456 return ret;
457}
458
459static struct kobj_attribute ocfs2_attr_active_cluster_plugin =
460 __ATTR(active_cluster_plugin, S_IFREG | S_IRUGO,
461 ocfs2_active_cluster_plugin_show, NULL);
462
463static ssize_t ocfs2_cluster_stack_show(struct kobject *kobj,
464 struct kobj_attribute *attr,
465 char *buf)
466{
467 ssize_t ret;
468 spin_lock(&ocfs2_stack_lock);
469 ret = snprintf(buf, PAGE_SIZE, "%s\n", cluster_stack_name);
470 spin_unlock(&ocfs2_stack_lock);
471
472 return ret;
473}
474
475static ssize_t ocfs2_cluster_stack_store(struct kobject *kobj,
476 struct kobj_attribute *attr,
477 const char *buf, size_t count)
478{
479 size_t len = count;
480 ssize_t ret;
481
482 if (len == 0)
483 return len;
484
485 if (buf[len - 1] == '\n')
486 len--;
487
488 if ((len != OCFS2_STACK_LABEL_LEN) ||
489 (strnlen(buf, len) != len))
490 return -EINVAL;
491
492 spin_lock(&ocfs2_stack_lock);
493 if (active_stack) {
494 if (!strncmp(buf, cluster_stack_name, len))
495 ret = count;
496 else
497 ret = -EBUSY;
498 } else {
499 memcpy(cluster_stack_name, buf, len);
500 ret = count;
501 }
502 spin_unlock(&ocfs2_stack_lock);
503
504 return ret;
505}
506
507
508static struct kobj_attribute ocfs2_attr_cluster_stack =
509 __ATTR(cluster_stack, S_IFREG | S_IRUGO | S_IWUSR,
510 ocfs2_cluster_stack_show,
511 ocfs2_cluster_stack_store);
512
513static struct attribute *ocfs2_attrs[] = {
514 &ocfs2_attr_max_locking_protocol.attr,
515 &ocfs2_attr_loaded_cluster_plugins.attr,
516 &ocfs2_attr_active_cluster_plugin.attr,
517 &ocfs2_attr_cluster_stack.attr,
518 NULL,
519};
520
521static struct attribute_group ocfs2_attr_group = {
522 .attrs = ocfs2_attrs,
523};
524
525static struct kset *ocfs2_kset;
526
527static void ocfs2_sysfs_exit(void)
528{
529 kset_unregister(ocfs2_kset);
530}
531
532static int ocfs2_sysfs_init(void)
533{
534 int ret;
535
536 ocfs2_kset = kset_create_and_add("ocfs2", NULL, fs_kobj);
537 if (!ocfs2_kset)
538 return -ENOMEM;
539
540 ret = sysfs_create_group(&ocfs2_kset->kobj, &ocfs2_attr_group);
541 if (ret)
542 goto error;
543
544 return 0;
545
546error:
547 kset_unregister(ocfs2_kset);
548 return ret;
549}
550
551static int __init ocfs2_stack_glue_init(void)
552{
553 strcpy(cluster_stack_name, OCFS2_STACK_PLUGIN_O2CB);
554
555 return ocfs2_sysfs_init();
556}
557
558static void __exit ocfs2_stack_glue_exit(void)
559{
560 lproto = NULL;
561 ocfs2_sysfs_exit();
562}
563
564MODULE_AUTHOR("Oracle");
565MODULE_DESCRIPTION("ocfs2 cluter stack glue layer");
566MODULE_LICENSE("GPL");
567module_init(ocfs2_stack_glue_init);
568module_exit(ocfs2_stack_glue_exit);
diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h
new file mode 100644
index 000000000000..005e4f170e0f
--- /dev/null
+++ b/fs/ocfs2/stackglue.h
@@ -0,0 +1,261 @@
1/* -*- mode: c; c-basic-offset: 8; -*-
2 * vim: noexpandtab sw=8 ts=8 sts=0:
3 *
4 * stackglue.h
5 *
6 * Glue to the underlying cluster stack.
7 *
8 * Copyright (C) 2007 Oracle. All rights reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public
12 * License as published by the Free Software Foundation, version 2.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * General Public License for more details.
18 */
19
20
21#ifndef STACKGLUE_H
22#define STACKGLUE_H
23
24#include <linux/types.h>
25#include <linux/list.h>
26#include <linux/dlmconstants.h>
27
28#include "dlm/dlmapi.h"
29#include <linux/dlm.h>
30
31/*
32 * dlmconstants.h does not have a LOCAL flag. We hope to remove it
33 * some day, but right now we need it. Let's fake it. This value is larger
34 * than any flag in dlmconstants.h.
35 */
36#define DLM_LKF_LOCAL 0x00100000
37
38/*
39 * This shadows DLM_LOCKSPACE_LEN in fs/dlm/dlm_internal.h. That probably
40 * wants to be in a public header.
41 */
42#define GROUP_NAME_MAX 64
43
44
45/*
46 * ocfs2_protocol_version changes when ocfs2 does something different in
47 * its inter-node behavior. See dlmglue.c for more information.
48 */
49struct ocfs2_protocol_version {
50 u8 pv_major;
51 u8 pv_minor;
52};
53
54/*
55 * The ocfs2_locking_protocol defines the handlers called on ocfs2's behalf.
56 */
57struct ocfs2_locking_protocol {
58 struct ocfs2_protocol_version lp_max_version;
59 void (*lp_lock_ast)(void *astarg);
60 void (*lp_blocking_ast)(void *astarg, int level);
61 void (*lp_unlock_ast)(void *astarg, int error);
62};
63
64
65/*
66 * The dlm_lockstatus struct includes lvb space, but the dlm_lksb struct only
67 * has a pointer to separately allocated lvb space. This struct exists only to
68 * include in the lksb union to make space for a combined dlm_lksb and lvb.
69 */
70struct fsdlm_lksb_plus_lvb {
71 struct dlm_lksb lksb;
72 char lvb[DLM_LVB_LEN];
73};
74
75/*
76 * A union of all lock status structures. We define it here so that the
77 * size of the union is known. Lock status structures are embedded in
78 * ocfs2 inodes.
79 */
80union ocfs2_dlm_lksb {
81 struct dlm_lockstatus lksb_o2dlm;
82 struct dlm_lksb lksb_fsdlm;
83 struct fsdlm_lksb_plus_lvb padding;
84};
85
86/*
87 * A cluster connection. Mostly opaque to ocfs2, the connection holds
88 * state for the underlying stack. ocfs2 does use cc_version to determine
89 * locking compatibility.
90 */
91struct ocfs2_cluster_connection {
92 char cc_name[GROUP_NAME_MAX];
93 int cc_namelen;
94 struct ocfs2_protocol_version cc_version;
95 void (*cc_recovery_handler)(int node_num, void *recovery_data);
96 void *cc_recovery_data;
97 void *cc_lockspace;
98 void *cc_private;
99};
100
101/*
102 * Each cluster stack implements the stack operations structure. Not used
103 * in the ocfs2 code, the stackglue code translates generic cluster calls
104 * into stack operations.
105 */
106struct ocfs2_stack_operations {
107 /*
108 * The fs code calls ocfs2_cluster_connect() to attach a new
109 * filesystem to the cluster stack. The ->connect() op is passed
110 * an ocfs2_cluster_connection with the name and recovery field
111 * filled in.
112 *
113 * The stack must set up any notification mechanisms and create
114 * the filesystem lockspace in the DLM. The lockspace should be
115 * stored on cc_lockspace. Any other information can be stored on
116 * cc_private.
117 *
118 * ->connect() must not return until it is guaranteed that
119 *
120 * - Node down notifications for the filesystem will be received
121 * and passed to conn->cc_recovery_handler().
122 * - Locking requests for the filesystem will be processed.
123 */
124 int (*connect)(struct ocfs2_cluster_connection *conn);
125
126 /*
127 * The fs code calls ocfs2_cluster_disconnect() when a filesystem
128 * no longer needs cluster services. All DLM locks have been
129 * dropped, and recovery notification is being ignored by the
130 * fs code. The stack must disengage from the DLM and discontinue
131 * recovery notification.
132 *
133 * Once ->disconnect() has returned, the connection structure will
134 * be freed. Thus, a stack must not return from ->disconnect()
135 * until it will no longer reference the conn pointer.
136 *
137 * If hangup_pending is zero, ocfs2_cluster_disconnect() will also
138 * be dropping the reference on the module.
139 */
140 int (*disconnect)(struct ocfs2_cluster_connection *conn,
141 int hangup_pending);
142
143 /*
144 * ocfs2_cluster_hangup() exists for compatibility with older
145 * ocfs2 tools. Only the classic stack really needs it. As such
146 * ->hangup() is not required of all stacks. See the comment by
147 * ocfs2_cluster_hangup() for more details.
148 *
149 * Note that ocfs2_cluster_hangup() can only be called if
150 * hangup_pending was passed to ocfs2_cluster_disconnect().
151 */
152 void (*hangup)(const char *group, int grouplen);
153
154 /*
155 * ->this_node() returns the cluster's unique identifier for the
156 * local node.
157 */
158 int (*this_node)(unsigned int *node);
159
160 /*
161 * Call the underlying dlm lock function. The ->dlm_lock()
162 * callback should convert the flags and mode as appropriate.
163 *
164 * ast and bast functions are not part of the call because the
165 * stack will likely want to wrap ast and bast calls before passing
166 * them to stack->sp_proto.
167 */
168 int (*dlm_lock)(struct ocfs2_cluster_connection *conn,
169 int mode,
170 union ocfs2_dlm_lksb *lksb,
171 u32 flags,
172 void *name,
173 unsigned int namelen,
174 void *astarg);
175
176 /*
177 * Call the underlying dlm unlock function. The ->dlm_unlock()
178 * function should convert the flags as appropriate.
179 *
180 * The unlock ast is not passed, as the stack will want to wrap
181 * it before calling stack->sp_proto->lp_unlock_ast().
182 */
183 int (*dlm_unlock)(struct ocfs2_cluster_connection *conn,
184 union ocfs2_dlm_lksb *lksb,
185 u32 flags,
186 void *astarg);
187
188 /*
189 * Return the status of the current lock status block. The fs
190 * code should never dereference the union. The ->lock_status()
191 * callback pulls out the stack-specific lksb, converts the status
192 * to a proper errno, and returns it.
193 */
194 int (*lock_status)(union ocfs2_dlm_lksb *lksb);
195
196 /*
197 * Pull the lvb pointer off of the stack-specific lksb.
198 */
199 void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb);
200
201 /*
202 * This is an optional debugging hook. If provided, the
203 * stack can dump debugging information about this lock.
204 */
205 void (*dump_lksb)(union ocfs2_dlm_lksb *lksb);
206};
207
208/*
209 * Each stack plugin must describe itself by registering a
210 * ocfs2_stack_plugin structure. This is only seen by stackglue and the
211 * stack driver.
212 */
213struct ocfs2_stack_plugin {
214 char *sp_name;
215 struct ocfs2_stack_operations *sp_ops;
216 struct module *sp_owner;
217
218 /* These are managed by the stackglue code. */
219 struct list_head sp_list;
220 unsigned int sp_count;
221 struct ocfs2_locking_protocol *sp_proto;
222};
223
224
225/* Used by the filesystem */
226int ocfs2_cluster_connect(const char *stack_name,
227 const char *group,
228 int grouplen,
229 void (*recovery_handler)(int node_num,
230 void *recovery_data),
231 void *recovery_data,
232 struct ocfs2_cluster_connection **conn);
233int ocfs2_cluster_disconnect(struct ocfs2_cluster_connection *conn,
234 int hangup_pending);
235void ocfs2_cluster_hangup(const char *group, int grouplen);
236int ocfs2_cluster_this_node(unsigned int *node);
237
238struct ocfs2_lock_res;
239int ocfs2_dlm_lock(struct ocfs2_cluster_connection *conn,
240 int mode,
241 union ocfs2_dlm_lksb *lksb,
242 u32 flags,
243 void *name,
244 unsigned int namelen,
245 struct ocfs2_lock_res *astarg);
246int ocfs2_dlm_unlock(struct ocfs2_cluster_connection *conn,
247 union ocfs2_dlm_lksb *lksb,
248 u32 flags,
249 struct ocfs2_lock_res *astarg);
250
251int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb);
252void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb);
253void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb);
254
255void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto);
256
257
258/* Used by stack plugins */
259int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin);
260void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin);
261#endif /* STACKGLUE_H */
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 72c198a004df..d2d278fb9819 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -46,6 +46,11 @@
46 46
47#include "buffer_head_io.h" 47#include "buffer_head_io.h"
48 48
49#define NOT_ALLOC_NEW_GROUP 0
50#define ALLOC_NEW_GROUP 1
51
52#define OCFS2_MAX_INODES_TO_STEAL 1024
53
49static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg); 54static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg);
50static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe); 55static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe);
51static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl); 56static inline u16 ocfs2_find_victim_chain(struct ocfs2_chain_list *cl);
@@ -106,7 +111,7 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode,
106 u64 *bg_blkno, 111 u64 *bg_blkno,
107 u16 *bg_bit_off); 112 u16 *bg_bit_off);
108 113
109void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) 114static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
110{ 115{
111 struct inode *inode = ac->ac_inode; 116 struct inode *inode = ac->ac_inode;
112 117
@@ -117,9 +122,17 @@ void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
117 mutex_unlock(&inode->i_mutex); 122 mutex_unlock(&inode->i_mutex);
118 123
119 iput(inode); 124 iput(inode);
125 ac->ac_inode = NULL;
120 } 126 }
121 if (ac->ac_bh) 127 if (ac->ac_bh) {
122 brelse(ac->ac_bh); 128 brelse(ac->ac_bh);
129 ac->ac_bh = NULL;
130 }
131}
132
133void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac)
134{
135 ocfs2_free_ac_resource(ac);
123 kfree(ac); 136 kfree(ac);
124} 137}
125 138
@@ -391,7 +404,8 @@ bail:
391static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, 404static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
392 struct ocfs2_alloc_context *ac, 405 struct ocfs2_alloc_context *ac,
393 int type, 406 int type,
394 u32 slot) 407 u32 slot,
408 int alloc_new_group)
395{ 409{
396 int status; 410 int status;
397 u32 bits_wanted = ac->ac_bits_wanted; 411 u32 bits_wanted = ac->ac_bits_wanted;
@@ -420,6 +434,7 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
420 } 434 }
421 435
422 ac->ac_inode = alloc_inode; 436 ac->ac_inode = alloc_inode;
437 ac->ac_alloc_slot = slot;
423 438
424 fe = (struct ocfs2_dinode *) bh->b_data; 439 fe = (struct ocfs2_dinode *) bh->b_data;
425 if (!OCFS2_IS_VALID_DINODE(fe)) { 440 if (!OCFS2_IS_VALID_DINODE(fe)) {
@@ -446,6 +461,14 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
446 goto bail; 461 goto bail;
447 } 462 }
448 463
464 if (alloc_new_group != ALLOC_NEW_GROUP) {
465 mlog(0, "Alloc File %u Full: wanted=%u, free_bits=%u, "
466 "and we don't alloc a new group for it.\n",
467 slot, bits_wanted, free_bits);
468 status = -ENOSPC;
469 goto bail;
470 }
471
449 status = ocfs2_block_group_alloc(osb, alloc_inode, bh); 472 status = ocfs2_block_group_alloc(osb, alloc_inode, bh);
450 if (status < 0) { 473 if (status < 0) {
451 if (status != -ENOSPC) 474 if (status != -ENOSPC)
@@ -490,7 +513,8 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb,
490 (*ac)->ac_group_search = ocfs2_block_group_search; 513 (*ac)->ac_group_search = ocfs2_block_group_search;
491 514
492 status = ocfs2_reserve_suballoc_bits(osb, (*ac), 515 status = ocfs2_reserve_suballoc_bits(osb, (*ac),
493 EXTENT_ALLOC_SYSTEM_INODE, slot); 516 EXTENT_ALLOC_SYSTEM_INODE,
517 slot, ALLOC_NEW_GROUP);
494 if (status < 0) { 518 if (status < 0) {
495 if (status != -ENOSPC) 519 if (status != -ENOSPC)
496 mlog_errno(status); 520 mlog_errno(status);
@@ -508,10 +532,42 @@ bail:
508 return status; 532 return status;
509} 533}
510 534
535static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb,
536 struct ocfs2_alloc_context *ac)
537{
538 int i, status = -ENOSPC;
539 s16 slot = ocfs2_get_inode_steal_slot(osb);
540
541 /* Start to steal inodes from the first slot after ours. */
542 if (slot == OCFS2_INVALID_SLOT)
543 slot = osb->slot_num + 1;
544
545 for (i = 0; i < osb->max_slots; i++, slot++) {
546 if (slot == osb->max_slots)
547 slot = 0;
548
549 if (slot == osb->slot_num)
550 continue;
551
552 status = ocfs2_reserve_suballoc_bits(osb, ac,
553 INODE_ALLOC_SYSTEM_INODE,
554 slot, NOT_ALLOC_NEW_GROUP);
555 if (status >= 0) {
556 ocfs2_set_inode_steal_slot(osb, slot);
557 break;
558 }
559
560 ocfs2_free_ac_resource(ac);
561 }
562
563 return status;
564}
565
511int ocfs2_reserve_new_inode(struct ocfs2_super *osb, 566int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
512 struct ocfs2_alloc_context **ac) 567 struct ocfs2_alloc_context **ac)
513{ 568{
514 int status; 569 int status;
570 s16 slot = ocfs2_get_inode_steal_slot(osb);
515 571
516 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL); 572 *ac = kzalloc(sizeof(struct ocfs2_alloc_context), GFP_KERNEL);
517 if (!(*ac)) { 573 if (!(*ac)) {
@@ -525,9 +581,43 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb,
525 581
526 (*ac)->ac_group_search = ocfs2_block_group_search; 582 (*ac)->ac_group_search = ocfs2_block_group_search;
527 583
584 /*
585 * slot is set when we successfully steal inode from other nodes.
586 * It is reset in 3 places:
587 * 1. when we flush the truncate log
588 * 2. when we complete local alloc recovery.
589 * 3. when we successfully allocate from our own slot.
590 * After it is set, we will go on stealing inodes until we find the
591 * need to check our slots to see whether there is some space for us.
592 */
593 if (slot != OCFS2_INVALID_SLOT &&
594 atomic_read(&osb->s_num_inodes_stolen) < OCFS2_MAX_INODES_TO_STEAL)
595 goto inode_steal;
596
597 atomic_set(&osb->s_num_inodes_stolen, 0);
528 status = ocfs2_reserve_suballoc_bits(osb, *ac, 598 status = ocfs2_reserve_suballoc_bits(osb, *ac,
529 INODE_ALLOC_SYSTEM_INODE, 599 INODE_ALLOC_SYSTEM_INODE,
530 osb->slot_num); 600 osb->slot_num, ALLOC_NEW_GROUP);
601 if (status >= 0) {
602 status = 0;
603
604 /*
605 * Some inodes must be freed by us, so try to allocate
606 * from our own next time.
607 */
608 if (slot != OCFS2_INVALID_SLOT)
609 ocfs2_init_inode_steal_slot(osb);
610 goto bail;
611 } else if (status < 0 && status != -ENOSPC) {
612 mlog_errno(status);
613 goto bail;
614 }
615
616 ocfs2_free_ac_resource(*ac);
617
618inode_steal:
619 status = ocfs2_steal_inode_from_other_nodes(osb, *ac);
620 atomic_inc(&osb->s_num_inodes_stolen);
531 if (status < 0) { 621 if (status < 0) {
532 if (status != -ENOSPC) 622 if (status != -ENOSPC)
533 mlog_errno(status); 623 mlog_errno(status);
@@ -557,7 +647,8 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb,
557 647
558 status = ocfs2_reserve_suballoc_bits(osb, ac, 648 status = ocfs2_reserve_suballoc_bits(osb, ac,
559 GLOBAL_BITMAP_SYSTEM_INODE, 649 GLOBAL_BITMAP_SYSTEM_INODE,
560 OCFS2_INVALID_SLOT); 650 OCFS2_INVALID_SLOT,
651 ALLOC_NEW_GROUP);
561 if (status < 0 && status != -ENOSPC) { 652 if (status < 0 && status != -ENOSPC) {
562 mlog_errno(status); 653 mlog_errno(status);
563 goto bail; 654 goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 8799033bb459..544c600662bd 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -36,6 +36,7 @@ typedef int (group_search_t)(struct inode *,
36struct ocfs2_alloc_context { 36struct ocfs2_alloc_context {
37 struct inode *ac_inode; /* which bitmap are we allocating from? */ 37 struct inode *ac_inode; /* which bitmap are we allocating from? */
38 struct buffer_head *ac_bh; /* file entry bh */ 38 struct buffer_head *ac_bh; /* file entry bh */
39 u32 ac_alloc_slot; /* which slot are we allocating from? */
39 u32 ac_bits_wanted; 40 u32 ac_bits_wanted;
40 u32 ac_bits_given; 41 u32 ac_bits_given;
41#define OCFS2_AC_USE_LOCAL 1 42#define OCFS2_AC_USE_LOCAL 1
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index bec75aff3d9f..df63ba20ae90 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -40,8 +40,7 @@
40#include <linux/crc32.h> 40#include <linux/crc32.h>
41#include <linux/debugfs.h> 41#include <linux/debugfs.h>
42#include <linux/mount.h> 42#include <linux/mount.h>
43 43#include <linux/seq_file.h>
44#include <cluster/nodemanager.h>
45 44
46#define MLOG_MASK_PREFIX ML_SUPER 45#define MLOG_MASK_PREFIX ML_SUPER
47#include <cluster/masklog.h> 46#include <cluster/masklog.h>
@@ -88,6 +87,7 @@ struct mount_options
88 unsigned int atime_quantum; 87 unsigned int atime_quantum;
89 signed short slot; 88 signed short slot;
90 unsigned int localalloc_opt; 89 unsigned int localalloc_opt;
90 char cluster_stack[OCFS2_STACK_LABEL_LEN + 1];
91}; 91};
92 92
93static int ocfs2_parse_options(struct super_block *sb, char *options, 93static int ocfs2_parse_options(struct super_block *sb, char *options,
@@ -109,7 +109,6 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait);
109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb); 109static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb);
110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb); 110static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb);
111static void ocfs2_release_system_inodes(struct ocfs2_super *osb); 111static void ocfs2_release_system_inodes(struct ocfs2_super *osb);
112static int ocfs2_fill_local_node_info(struct ocfs2_super *osb);
113static int ocfs2_check_volume(struct ocfs2_super *osb); 112static int ocfs2_check_volume(struct ocfs2_super *osb);
114static int ocfs2_verify_volume(struct ocfs2_dinode *di, 113static int ocfs2_verify_volume(struct ocfs2_dinode *di,
115 struct buffer_head *bh, 114 struct buffer_head *bh,
@@ -154,6 +153,7 @@ enum {
154 Opt_commit, 153 Opt_commit,
155 Opt_localalloc, 154 Opt_localalloc,
156 Opt_localflocks, 155 Opt_localflocks,
156 Opt_stack,
157 Opt_err, 157 Opt_err,
158}; 158};
159 159
@@ -172,6 +172,7 @@ static match_table_t tokens = {
172 {Opt_commit, "commit=%u"}, 172 {Opt_commit, "commit=%u"},
173 {Opt_localalloc, "localalloc=%d"}, 173 {Opt_localalloc, "localalloc=%d"},
174 {Opt_localflocks, "localflocks"}, 174 {Opt_localflocks, "localflocks"},
175 {Opt_stack, "cluster_stack=%s"},
175 {Opt_err, NULL} 176 {Opt_err, NULL}
176}; 177};
177 178
@@ -551,8 +552,17 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
551 } 552 }
552 } 553 }
553 554
555 if (ocfs2_userspace_stack(osb)) {
556 if (osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL) {
557 mlog(ML_ERROR, "Userspace stack expected, but "
558 "o2cb heartbeat arguments passed to mount\n");
559 return -EINVAL;
560 }
561 }
562
554 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) { 563 if (!(osb->s_mount_opt & OCFS2_MOUNT_HB_LOCAL)) {
555 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb)) { 564 if (!ocfs2_mount_local(osb) && !ocfs2_is_hard_readonly(osb) &&
565 !ocfs2_userspace_stack(osb)) {
556 mlog(ML_ERROR, "Heartbeat has to be started to mount " 566 mlog(ML_ERROR, "Heartbeat has to be started to mount "
557 "a read-write clustered device.\n"); 567 "a read-write clustered device.\n");
558 return -EINVAL; 568 return -EINVAL;
@@ -562,6 +572,35 @@ static int ocfs2_verify_heartbeat(struct ocfs2_super *osb)
562 return 0; 572 return 0;
563} 573}
564 574
575/*
576 * If we're using a userspace stack, mount should have passed
577 * a name that matches the disk. If not, mount should not
578 * have passed a stack.
579 */
580static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
581 struct mount_options *mopt)
582{
583 if (!ocfs2_userspace_stack(osb) && mopt->cluster_stack[0]) {
584 mlog(ML_ERROR,
585 "cluster stack passed to mount, but this filesystem "
586 "does not support it\n");
587 return -EINVAL;
588 }
589
590 if (ocfs2_userspace_stack(osb) &&
591 strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
592 OCFS2_STACK_LABEL_LEN)) {
593 mlog(ML_ERROR,
594 "cluster stack passed to mount (\"%s\") does not "
595 "match the filesystem (\"%s\")\n",
596 mopt->cluster_stack,
597 osb->osb_cluster_stack);
598 return -EINVAL;
599 }
600
601 return 0;
602}
603
565static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 604static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
566{ 605{
567 struct dentry *root; 606 struct dentry *root;
@@ -579,15 +618,6 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
579 goto read_super_error; 618 goto read_super_error;
580 } 619 }
581 620
582 /* for now we only have one cluster/node, make sure we see it
583 * in the heartbeat universe */
584 if (parsed_options.mount_opt & OCFS2_MOUNT_HB_LOCAL) {
585 if (!o2hb_check_local_node_heartbeating()) {
586 status = -EINVAL;
587 goto read_super_error;
588 }
589 }
590
591 /* probe for superblock */ 621 /* probe for superblock */
592 status = ocfs2_sb_probe(sb, &bh, &sector_size); 622 status = ocfs2_sb_probe(sb, &bh, &sector_size);
593 if (status < 0) { 623 if (status < 0) {
@@ -609,6 +639,10 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
609 osb->osb_commit_interval = parsed_options.commit_interval; 639 osb->osb_commit_interval = parsed_options.commit_interval;
610 osb->local_alloc_size = parsed_options.localalloc_opt; 640 osb->local_alloc_size = parsed_options.localalloc_opt;
611 641
642 status = ocfs2_verify_userspace_stack(osb, &parsed_options);
643 if (status)
644 goto read_super_error;
645
612 sb->s_magic = OCFS2_SUPER_MAGIC; 646 sb->s_magic = OCFS2_SUPER_MAGIC;
613 647
614 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 648 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -694,7 +728,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
694 if (ocfs2_mount_local(osb)) 728 if (ocfs2_mount_local(osb))
695 snprintf(nodestr, sizeof(nodestr), "local"); 729 snprintf(nodestr, sizeof(nodestr), "local");
696 else 730 else
697 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 731 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
698 732
699 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) " 733 printk(KERN_INFO "ocfs2: Mounting device (%s) on (node %s, slot %d) "
700 "with %s data mode.\n", 734 "with %s data mode.\n",
@@ -763,6 +797,7 @@ static int ocfs2_parse_options(struct super_block *sb,
763 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 797 mopt->atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
764 mopt->slot = OCFS2_INVALID_SLOT; 798 mopt->slot = OCFS2_INVALID_SLOT;
765 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; 799 mopt->localalloc_opt = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE;
800 mopt->cluster_stack[0] = '\0';
766 801
767 if (!options) { 802 if (!options) {
768 status = 1; 803 status = 1;
@@ -864,6 +899,25 @@ static int ocfs2_parse_options(struct super_block *sb,
864 if (!is_remount) 899 if (!is_remount)
865 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS; 900 mopt->mount_opt |= OCFS2_MOUNT_LOCALFLOCKS;
866 break; 901 break;
902 case Opt_stack:
903 /* Check both that the option we were passed
904 * is of the right length and that it is a proper
905 * string of the right length.
906 */
907 if (((args[0].to - args[0].from) !=
908 OCFS2_STACK_LABEL_LEN) ||
909 (strnlen(args[0].from,
910 OCFS2_STACK_LABEL_LEN) !=
911 OCFS2_STACK_LABEL_LEN)) {
912 mlog(ML_ERROR,
913 "Invalid cluster_stack option\n");
914 status = 0;
915 goto bail;
916 }
917 memcpy(mopt->cluster_stack, args[0].from,
918 OCFS2_STACK_LABEL_LEN);
919 mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
920 break;
867 default: 921 default:
868 mlog(ML_ERROR, 922 mlog(ML_ERROR,
869 "Unrecognized mount option \"%s\" " 923 "Unrecognized mount option \"%s\" "
@@ -922,6 +976,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
922 if (opts & OCFS2_MOUNT_LOCALFLOCKS) 976 if (opts & OCFS2_MOUNT_LOCALFLOCKS)
923 seq_printf(s, ",localflocks,"); 977 seq_printf(s, ",localflocks,");
924 978
979 if (osb->osb_cluster_stack[0])
980 seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
981 osb->osb_cluster_stack);
982
925 return 0; 983 return 0;
926} 984}
927 985
@@ -957,6 +1015,8 @@ static int __init ocfs2_init(void)
957 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); 1015 mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
958 } 1016 }
959 1017
1018 ocfs2_set_locking_protocol();
1019
960leave: 1020leave:
961 if (status < 0) { 1021 if (status < 0) {
962 ocfs2_free_mem_caches(); 1022 ocfs2_free_mem_caches();
@@ -1132,31 +1192,6 @@ static int ocfs2_get_sector(struct super_block *sb,
1132 return 0; 1192 return 0;
1133} 1193}
1134 1194
1135/* ocfs2 1.0 only allows one cluster and node identity per kernel image. */
1136static int ocfs2_fill_local_node_info(struct ocfs2_super *osb)
1137{
1138 int status;
1139
1140 /* XXX hold a ref on the node while mounte? easy enough, if
1141 * desirable. */
1142 if (ocfs2_mount_local(osb))
1143 osb->node_num = 0;
1144 else
1145 osb->node_num = o2nm_this_node();
1146
1147 if (osb->node_num == O2NM_MAX_NODES) {
1148 mlog(ML_ERROR, "could not find this host's node number\n");
1149 status = -ENOENT;
1150 goto bail;
1151 }
1152
1153 mlog(0, "I am node %d\n", osb->node_num);
1154
1155 status = 0;
1156bail:
1157 return status;
1158}
1159
1160static int ocfs2_mount_volume(struct super_block *sb) 1195static int ocfs2_mount_volume(struct super_block *sb)
1161{ 1196{
1162 int status = 0; 1197 int status = 0;
@@ -1168,12 +1203,6 @@ static int ocfs2_mount_volume(struct super_block *sb)
1168 if (ocfs2_is_hard_readonly(osb)) 1203 if (ocfs2_is_hard_readonly(osb))
1169 goto leave; 1204 goto leave;
1170 1205
1171 status = ocfs2_fill_local_node_info(osb);
1172 if (status < 0) {
1173 mlog_errno(status);
1174 goto leave;
1175 }
1176
1177 status = ocfs2_dlm_init(osb); 1206 status = ocfs2_dlm_init(osb);
1178 if (status < 0) { 1207 if (status < 0) {
1179 mlog_errno(status); 1208 mlog_errno(status);
@@ -1224,18 +1253,9 @@ leave:
1224 return status; 1253 return status;
1225} 1254}
1226 1255
1227/* we can't grab the goofy sem lock from inside wait_event, so we use
1228 * memory barriers to make sure that we'll see the null task before
1229 * being woken up */
1230static int ocfs2_recovery_thread_running(struct ocfs2_super *osb)
1231{
1232 mb();
1233 return osb->recovery_thread_task != NULL;
1234}
1235
1236static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) 1256static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1237{ 1257{
1238 int tmp; 1258 int tmp, hangup_needed = 0;
1239 struct ocfs2_super *osb = NULL; 1259 struct ocfs2_super *osb = NULL;
1240 char nodestr[8]; 1260 char nodestr[8];
1241 1261
@@ -1249,25 +1269,16 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1249 1269
1250 ocfs2_truncate_log_shutdown(osb); 1270 ocfs2_truncate_log_shutdown(osb);
1251 1271
1252 /* disable any new recovery threads and wait for any currently 1272 /* This will disable recovery and flush any recovery work. */
1253 * running ones to exit. Do this before setting the vol_state. */ 1273 ocfs2_recovery_exit(osb);
1254 mutex_lock(&osb->recovery_lock);
1255 osb->disable_recovery = 1;
1256 mutex_unlock(&osb->recovery_lock);
1257 wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
1258
1259 /* At this point, we know that no more recovery threads can be
1260 * launched, so wait for any recovery completion work to
1261 * complete. */
1262 flush_workqueue(ocfs2_wq);
1263 1274
1264 ocfs2_journal_shutdown(osb); 1275 ocfs2_journal_shutdown(osb);
1265 1276
1266 ocfs2_sync_blockdev(sb); 1277 ocfs2_sync_blockdev(sb);
1267 1278
1268 /* No dlm means we've failed during mount, so skip all the 1279 /* No cluster connection means we've failed during mount, so skip
1269 * steps which depended on that to complete. */ 1280 * all the steps which depended on that to complete. */
1270 if (osb->dlm) { 1281 if (osb->cconn) {
1271 tmp = ocfs2_super_lock(osb, 1); 1282 tmp = ocfs2_super_lock(osb, 1);
1272 if (tmp < 0) { 1283 if (tmp < 0) {
1273 mlog_errno(tmp); 1284 mlog_errno(tmp);
@@ -1278,25 +1289,34 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
1278 if (osb->slot_num != OCFS2_INVALID_SLOT) 1289 if (osb->slot_num != OCFS2_INVALID_SLOT)
1279 ocfs2_put_slot(osb); 1290 ocfs2_put_slot(osb);
1280 1291
1281 if (osb->dlm) 1292 if (osb->cconn)
1282 ocfs2_super_unlock(osb, 1); 1293 ocfs2_super_unlock(osb, 1);
1283 1294
1284 ocfs2_release_system_inodes(osb); 1295 ocfs2_release_system_inodes(osb);
1285 1296
1286 if (osb->dlm) 1297 /*
1287 ocfs2_dlm_shutdown(osb); 1298 * If we're dismounting due to mount error, mount.ocfs2 will clean
1299 * up heartbeat. If we're a local mount, there is no heartbeat.
1300 * If we failed before we got a uuid_str yet, we can't stop
1301 * heartbeat. Otherwise, do it.
1302 */
1303 if (!mnt_err && !ocfs2_mount_local(osb) && osb->uuid_str)
1304 hangup_needed = 1;
1305
1306 if (osb->cconn)
1307 ocfs2_dlm_shutdown(osb, hangup_needed);
1288 1308
1289 debugfs_remove(osb->osb_debug_root); 1309 debugfs_remove(osb->osb_debug_root);
1290 1310
1291 if (!mnt_err) 1311 if (hangup_needed)
1292 ocfs2_stop_heartbeat(osb); 1312 ocfs2_cluster_hangup(osb->uuid_str, strlen(osb->uuid_str));
1293 1313
1294 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED); 1314 atomic_set(&osb->vol_state, VOLUME_DISMOUNTED);
1295 1315
1296 if (ocfs2_mount_local(osb)) 1316 if (ocfs2_mount_local(osb))
1297 snprintf(nodestr, sizeof(nodestr), "local"); 1317 snprintf(nodestr, sizeof(nodestr), "local");
1298 else 1318 else
1299 snprintf(nodestr, sizeof(nodestr), "%d", osb->node_num); 1319 snprintf(nodestr, sizeof(nodestr), "%u", osb->node_num);
1300 1320
1301 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n", 1321 printk(KERN_INFO "ocfs2: Unmounting device (%s) on (node %s)\n",
1302 osb->dev_str, nodestr); 1322 osb->dev_str, nodestr);
@@ -1355,7 +1375,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1355 sb->s_fs_info = osb; 1375 sb->s_fs_info = osb;
1356 sb->s_op = &ocfs2_sops; 1376 sb->s_op = &ocfs2_sops;
1357 sb->s_export_op = &ocfs2_export_ops; 1377 sb->s_export_op = &ocfs2_export_ops;
1358 osb->osb_locking_proto = ocfs2_locking_protocol;
1359 sb->s_time_gran = 1; 1378 sb->s_time_gran = 1;
1360 sb->s_flags |= MS_NOATIME; 1379 sb->s_flags |= MS_NOATIME;
1361 /* this is needed to support O_LARGEFILE */ 1380 /* this is needed to support O_LARGEFILE */
@@ -1368,7 +1387,6 @@ static int ocfs2_initialize_super(struct super_block *sb,
1368 osb->s_sectsize_bits = blksize_bits(sector_size); 1387 osb->s_sectsize_bits = blksize_bits(sector_size);
1369 BUG_ON(!osb->s_sectsize_bits); 1388 BUG_ON(!osb->s_sectsize_bits);
1370 1389
1371 init_waitqueue_head(&osb->recovery_event);
1372 spin_lock_init(&osb->dc_task_lock); 1390 spin_lock_init(&osb->dc_task_lock);
1373 init_waitqueue_head(&osb->dc_event); 1391 init_waitqueue_head(&osb->dc_event);
1374 osb->dc_work_sequence = 0; 1392 osb->dc_work_sequence = 0;
@@ -1376,6 +1394,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
1376 INIT_LIST_HEAD(&osb->blocked_lock_list); 1394 INIT_LIST_HEAD(&osb->blocked_lock_list);
1377 osb->blocked_lock_count = 0; 1395 osb->blocked_lock_count = 0;
1378 spin_lock_init(&osb->osb_lock); 1396 spin_lock_init(&osb->osb_lock);
1397 ocfs2_init_inode_steal_slot(osb);
1379 1398
1380 atomic_set(&osb->alloc_stats.moves, 0); 1399 atomic_set(&osb->alloc_stats.moves, 0);
1381 atomic_set(&osb->alloc_stats.local_data, 0); 1400 atomic_set(&osb->alloc_stats.local_data, 0);
@@ -1388,24 +1407,23 @@ static int ocfs2_initialize_super(struct super_block *sb,
1388 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", 1407 snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
1389 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1408 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
1390 1409
1391 mutex_init(&osb->recovery_lock); 1410 status = ocfs2_recovery_init(osb);
1392 1411 if (status) {
1393 osb->disable_recovery = 0; 1412 mlog(ML_ERROR, "Unable to initialize recovery state\n");
1394 osb->recovery_thread_task = NULL; 1413 mlog_errno(status);
1414 goto bail;
1415 }
1395 1416
1396 init_waitqueue_head(&osb->checkpoint_event); 1417 init_waitqueue_head(&osb->checkpoint_event);
1397 atomic_set(&osb->needs_checkpoint, 0); 1418 atomic_set(&osb->needs_checkpoint, 0);
1398 1419
1399 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM; 1420 osb->s_atime_quantum = OCFS2_DEFAULT_ATIME_QUANTUM;
1400 1421
1401 osb->node_num = O2NM_INVALID_NODE_NUM;
1402 osb->slot_num = OCFS2_INVALID_SLOT; 1422 osb->slot_num = OCFS2_INVALID_SLOT;
1403 1423
1404 osb->local_alloc_state = OCFS2_LA_UNUSED; 1424 osb->local_alloc_state = OCFS2_LA_UNUSED;
1405 osb->local_alloc_bh = NULL; 1425 osb->local_alloc_bh = NULL;
1406 1426
1407 ocfs2_setup_hb_callbacks(osb);
1408
1409 init_waitqueue_head(&osb->osb_mount_event); 1427 init_waitqueue_head(&osb->osb_mount_event);
1410 1428
1411 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL); 1429 osb->vol_label = kmalloc(OCFS2_MAX_VOL_LABEL_LEN, GFP_KERNEL);
@@ -1455,6 +1473,25 @@ static int ocfs2_initialize_super(struct super_block *sb,
1455 goto bail; 1473 goto bail;
1456 } 1474 }
1457 1475
1476 if (ocfs2_userspace_stack(osb)) {
1477 memcpy(osb->osb_cluster_stack,
1478 OCFS2_RAW_SB(di)->s_cluster_info.ci_stack,
1479 OCFS2_STACK_LABEL_LEN);
1480 osb->osb_cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0';
1481 if (strlen(osb->osb_cluster_stack) != OCFS2_STACK_LABEL_LEN) {
1482 mlog(ML_ERROR,
1483 "couldn't mount because of an invalid "
1484 "cluster stack label (%s) \n",
1485 osb->osb_cluster_stack);
1486 status = -EINVAL;
1487 goto bail;
1488 }
1489 } else {
1490 /* The empty string is identical with classic tools that
1491 * don't know about s_cluster_info. */
1492 osb->osb_cluster_stack[0] = '\0';
1493 }
1494
1458 get_random_bytes(&osb->s_next_generation, sizeof(u32)); 1495 get_random_bytes(&osb->s_next_generation, sizeof(u32));
1459 1496
1460 /* FIXME 1497 /* FIXME
@@ -1724,8 +1761,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb)
1724 1761
1725 /* This function assumes that the caller has the main osb resource */ 1762 /* This function assumes that the caller has the main osb resource */
1726 1763
1727 if (osb->slot_info) 1764 ocfs2_free_slot_info(osb);
1728 ocfs2_free_slot_info(osb->slot_info);
1729 1765
1730 kfree(osb->osb_orphan_wipes); 1766 kfree(osb->osb_orphan_wipes);
1731 /* FIXME 1767 /* FIXME
diff --git a/fs/open.c b/fs/open.c
index 3fa4e4ffce4c..b70e7666bb2c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -244,21 +244,21 @@ static long do_sys_truncate(const char __user * path, loff_t length)
244 if (!S_ISREG(inode->i_mode)) 244 if (!S_ISREG(inode->i_mode))
245 goto dput_and_out; 245 goto dput_and_out;
246 246
247 error = vfs_permission(&nd, MAY_WRITE); 247 error = mnt_want_write(nd.path.mnt);
248 if (error) 248 if (error)
249 goto dput_and_out; 249 goto dput_and_out;
250 250
251 error = -EROFS; 251 error = vfs_permission(&nd, MAY_WRITE);
252 if (IS_RDONLY(inode)) 252 if (error)
253 goto dput_and_out; 253 goto mnt_drop_write_and_out;
254 254
255 error = -EPERM; 255 error = -EPERM;
256 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 256 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
257 goto dput_and_out; 257 goto mnt_drop_write_and_out;
258 258
259 error = get_write_access(inode); 259 error = get_write_access(inode);
260 if (error) 260 if (error)
261 goto dput_and_out; 261 goto mnt_drop_write_and_out;
262 262
263 /* 263 /*
264 * Make sure that there are no leases. get_write_access() protects 264 * Make sure that there are no leases. get_write_access() protects
@@ -276,6 +276,8 @@ static long do_sys_truncate(const char __user * path, loff_t length)
276 276
277put_write_and_out: 277put_write_and_out:
278 put_write_access(inode); 278 put_write_access(inode);
279mnt_drop_write_and_out:
280 mnt_drop_write(nd.path.mnt);
279dput_and_out: 281dput_and_out:
280 path_put(&nd.path); 282 path_put(&nd.path);
281out: 283out:
@@ -457,8 +459,17 @@ asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode)
457 if(res || !(mode & S_IWOTH) || 459 if(res || !(mode & S_IWOTH) ||
458 special_file(nd.path.dentry->d_inode->i_mode)) 460 special_file(nd.path.dentry->d_inode->i_mode))
459 goto out_path_release; 461 goto out_path_release;
460 462 /*
461 if(IS_RDONLY(nd.path.dentry->d_inode)) 463 * This is a rare case where using __mnt_is_readonly()
464 * is OK without a mnt_want/drop_write() pair. Since
465 * no actual write to the fs is performed here, we do
466 * not need to telegraph to that to anyone.
467 *
468 * By doing this, we accept that this access is
469 * inherently racy and know that the fs may change
470 * state before we even see this result.
471 */
472 if (__mnt_is_readonly(nd.path.mnt))
462 res = -EROFS; 473 res = -EROFS;
463 474
464out_path_release: 475out_path_release:
@@ -567,12 +578,12 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
567 578
568 audit_inode(NULL, dentry); 579 audit_inode(NULL, dentry);
569 580
570 err = -EROFS; 581 err = mnt_want_write(file->f_path.mnt);
571 if (IS_RDONLY(inode)) 582 if (err)
572 goto out_putf; 583 goto out_putf;
573 err = -EPERM; 584 err = -EPERM;
574 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 585 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
575 goto out_putf; 586 goto out_drop_write;
576 mutex_lock(&inode->i_mutex); 587 mutex_lock(&inode->i_mutex);
577 if (mode == (mode_t) -1) 588 if (mode == (mode_t) -1)
578 mode = inode->i_mode; 589 mode = inode->i_mode;
@@ -581,6 +592,8 @@ asmlinkage long sys_fchmod(unsigned int fd, mode_t mode)
581 err = notify_change(dentry, &newattrs); 592 err = notify_change(dentry, &newattrs);
582 mutex_unlock(&inode->i_mutex); 593 mutex_unlock(&inode->i_mutex);
583 594
595out_drop_write:
596 mnt_drop_write(file->f_path.mnt);
584out_putf: 597out_putf:
585 fput(file); 598 fput(file);
586out: 599out:
@@ -600,13 +613,13 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
600 goto out; 613 goto out;
601 inode = nd.path.dentry->d_inode; 614 inode = nd.path.dentry->d_inode;
602 615
603 error = -EROFS; 616 error = mnt_want_write(nd.path.mnt);
604 if (IS_RDONLY(inode)) 617 if (error)
605 goto dput_and_out; 618 goto dput_and_out;
606 619
607 error = -EPERM; 620 error = -EPERM;
608 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 621 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
609 goto dput_and_out; 622 goto out_drop_write;
610 623
611 mutex_lock(&inode->i_mutex); 624 mutex_lock(&inode->i_mutex);
612 if (mode == (mode_t) -1) 625 if (mode == (mode_t) -1)
@@ -616,6 +629,8 @@ asmlinkage long sys_fchmodat(int dfd, const char __user *filename,
616 error = notify_change(nd.path.dentry, &newattrs); 629 error = notify_change(nd.path.dentry, &newattrs);
617 mutex_unlock(&inode->i_mutex); 630 mutex_unlock(&inode->i_mutex);
618 631
632out_drop_write:
633 mnt_drop_write(nd.path.mnt);
619dput_and_out: 634dput_and_out:
620 path_put(&nd.path); 635 path_put(&nd.path);
621out: 636out:
@@ -638,9 +653,6 @@ static int chown_common(struct dentry * dentry, uid_t user, gid_t group)
638 printk(KERN_ERR "chown_common: NULL inode\n"); 653 printk(KERN_ERR "chown_common: NULL inode\n");
639 goto out; 654 goto out;
640 } 655 }
641 error = -EROFS;
642 if (IS_RDONLY(inode))
643 goto out;
644 error = -EPERM; 656 error = -EPERM;
645 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 657 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
646 goto out; 658 goto out;
@@ -671,7 +683,12 @@ asmlinkage long sys_chown(const char __user * filename, uid_t user, gid_t group)
671 error = user_path_walk(filename, &nd); 683 error = user_path_walk(filename, &nd);
672 if (error) 684 if (error)
673 goto out; 685 goto out;
686 error = mnt_want_write(nd.path.mnt);
687 if (error)
688 goto out_release;
674 error = chown_common(nd.path.dentry, user, group); 689 error = chown_common(nd.path.dentry, user, group);
690 mnt_drop_write(nd.path.mnt);
691out_release:
675 path_put(&nd.path); 692 path_put(&nd.path);
676out: 693out:
677 return error; 694 return error;
@@ -691,7 +708,12 @@ asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
691 error = __user_walk_fd(dfd, filename, follow, &nd); 708 error = __user_walk_fd(dfd, filename, follow, &nd);
692 if (error) 709 if (error)
693 goto out; 710 goto out;
711 error = mnt_want_write(nd.path.mnt);
712 if (error)
713 goto out_release;
694 error = chown_common(nd.path.dentry, user, group); 714 error = chown_common(nd.path.dentry, user, group);
715 mnt_drop_write(nd.path.mnt);
716out_release:
695 path_put(&nd.path); 717 path_put(&nd.path);
696out: 718out:
697 return error; 719 return error;
@@ -705,7 +727,12 @@ asmlinkage long sys_lchown(const char __user * filename, uid_t user, gid_t group
705 error = user_path_walk_link(filename, &nd); 727 error = user_path_walk_link(filename, &nd);
706 if (error) 728 if (error)
707 goto out; 729 goto out;
730 error = mnt_want_write(nd.path.mnt);
731 if (error)
732 goto out_release;
708 error = chown_common(nd.path.dentry, user, group); 733 error = chown_common(nd.path.dentry, user, group);
734 mnt_drop_write(nd.path.mnt);
735out_release:
709 path_put(&nd.path); 736 path_put(&nd.path);
710out: 737out:
711 return error; 738 return error;
@@ -722,14 +749,48 @@ asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group)
722 if (!file) 749 if (!file)
723 goto out; 750 goto out;
724 751
752 error = mnt_want_write(file->f_path.mnt);
753 if (error)
754 goto out_fput;
725 dentry = file->f_path.dentry; 755 dentry = file->f_path.dentry;
726 audit_inode(NULL, dentry); 756 audit_inode(NULL, dentry);
727 error = chown_common(dentry, user, group); 757 error = chown_common(dentry, user, group);
758 mnt_drop_write(file->f_path.mnt);
759out_fput:
728 fput(file); 760 fput(file);
729out: 761out:
730 return error; 762 return error;
731} 763}
732 764
765/*
766 * You have to be very careful that these write
767 * counts get cleaned up in error cases and
768 * upon __fput(). This should probably never
769 * be called outside of __dentry_open().
770 */
771static inline int __get_file_write_access(struct inode *inode,
772 struct vfsmount *mnt)
773{
774 int error;
775 error = get_write_access(inode);
776 if (error)
777 return error;
778 /*
779 * Do not take mount writer counts on
780 * special files since no writes to
781 * the mount itself will occur.
782 */
783 if (!special_file(inode->i_mode)) {
784 /*
785 * Balanced in __fput()
786 */
787 error = mnt_want_write(mnt);
788 if (error)
789 put_write_access(inode);
790 }
791 return error;
792}
793
733static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, 794static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
734 int flags, struct file *f, 795 int flags, struct file *f,
735 int (*open)(struct inode *, struct file *)) 796 int (*open)(struct inode *, struct file *))
@@ -742,9 +803,11 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
742 FMODE_PREAD | FMODE_PWRITE; 803 FMODE_PREAD | FMODE_PWRITE;
743 inode = dentry->d_inode; 804 inode = dentry->d_inode;
744 if (f->f_mode & FMODE_WRITE) { 805 if (f->f_mode & FMODE_WRITE) {
745 error = get_write_access(inode); 806 error = __get_file_write_access(inode, mnt);
746 if (error) 807 if (error)
747 goto cleanup_file; 808 goto cleanup_file;
809 if (!special_file(inode->i_mode))
810 file_take_write(f);
748 } 811 }
749 812
750 f->f_mapping = inode->i_mapping; 813 f->f_mapping = inode->i_mapping;
@@ -784,8 +847,19 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt,
784 847
785cleanup_all: 848cleanup_all:
786 fops_put(f->f_op); 849 fops_put(f->f_op);
787 if (f->f_mode & FMODE_WRITE) 850 if (f->f_mode & FMODE_WRITE) {
788 put_write_access(inode); 851 put_write_access(inode);
852 if (!special_file(inode->i_mode)) {
853 /*
854 * We don't consider this a real
855 * mnt_want/drop_write() pair
856 * because it all happenend right
857 * here, so just reset the state.
858 */
859 file_reset_write(f);
860 mnt_drop_write(mnt);
861 }
862 }
789 file_kill(f); 863 file_kill(f);
790 f->f_path.dentry = NULL; 864 f->f_path.dentry = NULL;
791 f->f_path.mnt = NULL; 865 f->f_path.mnt = NULL;
@@ -796,43 +870,6 @@ cleanup_file:
796 return ERR_PTR(error); 870 return ERR_PTR(error);
797} 871}
798 872
799/*
800 * Note that while the flag value (low two bits) for sys_open means:
801 * 00 - read-only
802 * 01 - write-only
803 * 10 - read-write
804 * 11 - special
805 * it is changed into
806 * 00 - no permissions needed
807 * 01 - read-permission
808 * 10 - write-permission
809 * 11 - read-write
810 * for the internal routines (ie open_namei()/follow_link() etc). 00 is
811 * used by symlinks.
812 */
813static struct file *do_filp_open(int dfd, const char *filename, int flags,
814 int mode)
815{
816 int namei_flags, error;
817 struct nameidata nd;
818
819 namei_flags = flags;
820 if ((namei_flags+1) & O_ACCMODE)
821 namei_flags++;
822
823 error = open_namei(dfd, filename, namei_flags, mode, &nd);
824 if (!error)
825 return nameidata_to_filp(&nd, flags);
826
827 return ERR_PTR(error);
828}
829
830struct file *filp_open(const char *filename, int flags, int mode)
831{
832 return do_filp_open(AT_FDCWD, filename, flags, mode);
833}
834EXPORT_SYMBOL(filp_open);
835
836/** 873/**
837 * lookup_instantiate_filp - instantiates the open intent filp 874 * lookup_instantiate_filp - instantiates the open intent filp
838 * @nd: pointer to nameidata 875 * @nd: pointer to nameidata
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 03f808c5b79d..6149e4b58c88 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -473,6 +473,10 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev)
473 return 0; 473 return 0;
474 if (IS_ERR(state)) /* I/O error reading the partition table */ 474 if (IS_ERR(state)) /* I/O error reading the partition table */
475 return -EIO; 475 return -EIO;
476
477 /* tell userspace that the media / partition table may have changed */
478 kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE);
479
476 for (p = 1; p < state->limit; p++) { 480 for (p = 1; p < state->limit; p++) {
477 sector_t size = state->parts[p].size; 481 sector_t size = state->parts[p].size;
478 sector_t from = state->parts[p].from; 482 sector_t from = state->parts[p].from;
diff --git a/fs/pipe.c b/fs/pipe.c
index 8be381bbcb54..f73492b6817e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -988,7 +988,10 @@ struct file *create_write_pipe(void)
988 return f; 988 return f;
989 989
990 err_dentry: 990 err_dentry:
991 free_pipe_info(inode);
991 dput(dentry); 992 dput(dentry);
993 return ERR_PTR(err);
994
992 err_inode: 995 err_inode:
993 free_pipe_info(inode); 996 free_pipe_info(inode);
994 iput(inode); 997 iput(inode);
diff --git a/fs/pnode.c b/fs/pnode.c
index 1d8f5447f3f7..8d5f392ec3d3 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -9,6 +9,7 @@
9#include <linux/mnt_namespace.h> 9#include <linux/mnt_namespace.h>
10#include <linux/mount.h> 10#include <linux/mount.h>
11#include <linux/fs.h> 11#include <linux/fs.h>
12#include "internal.h"
12#include "pnode.h" 13#include "pnode.h"
13 14
14/* return the next shared peer mount of @p */ 15/* return the next shared peer mount of @p */
@@ -27,6 +28,57 @@ static inline struct vfsmount *next_slave(struct vfsmount *p)
27 return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); 28 return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave);
28} 29}
29 30
31/*
32 * Return true if path is reachable from root
33 *
34 * namespace_sem is held, and mnt is attached
35 */
36static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry,
37 const struct path *root)
38{
39 while (mnt != root->mnt && mnt->mnt_parent != mnt) {
40 dentry = mnt->mnt_mountpoint;
41 mnt = mnt->mnt_parent;
42 }
43 return mnt == root->mnt && is_subdir(dentry, root->dentry);
44}
45
46static struct vfsmount *get_peer_under_root(struct vfsmount *mnt,
47 struct mnt_namespace *ns,
48 const struct path *root)
49{
50 struct vfsmount *m = mnt;
51
52 do {
53 /* Check the namespace first for optimization */
54 if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root))
55 return m;
56
57 m = next_peer(m);
58 } while (m != mnt);
59
60 return NULL;
61}
62
63/*
64 * Get ID of closest dominating peer group having a representative
65 * under the given root.
66 *
67 * Caller must hold namespace_sem
68 */
69int get_dominating_id(struct vfsmount *mnt, const struct path *root)
70{
71 struct vfsmount *m;
72
73 for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) {
74 struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root);
75 if (d)
76 return d->mnt_group_id;
77 }
78
79 return 0;
80}
81
30static int do_make_slave(struct vfsmount *mnt) 82static int do_make_slave(struct vfsmount *mnt)
31{ 83{
32 struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; 84 struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master;
@@ -45,7 +97,11 @@ static int do_make_slave(struct vfsmount *mnt)
45 if (peer_mnt == mnt) 97 if (peer_mnt == mnt)
46 peer_mnt = NULL; 98 peer_mnt = NULL;
47 } 99 }
100 if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share))
101 mnt_release_group_id(mnt);
102
48 list_del_init(&mnt->mnt_share); 103 list_del_init(&mnt->mnt_share);
104 mnt->mnt_group_id = 0;
49 105
50 if (peer_mnt) 106 if (peer_mnt)
51 master = peer_mnt; 107 master = peer_mnt;
@@ -67,7 +123,6 @@ static int do_make_slave(struct vfsmount *mnt)
67 } 123 }
68 mnt->mnt_master = master; 124 mnt->mnt_master = master;
69 CLEAR_MNT_SHARED(mnt); 125 CLEAR_MNT_SHARED(mnt);
70 INIT_LIST_HEAD(&mnt->mnt_slave_list);
71 return 0; 126 return 0;
72} 127}
73 128
@@ -211,8 +266,7 @@ int propagate_mnt(struct vfsmount *dest_mnt, struct dentry *dest_dentry,
211out: 266out:
212 spin_lock(&vfsmount_lock); 267 spin_lock(&vfsmount_lock);
213 while (!list_empty(&tmp_list)) { 268 while (!list_empty(&tmp_list)) {
214 child = list_entry(tmp_list.next, struct vfsmount, mnt_hash); 269 child = list_first_entry(&tmp_list, struct vfsmount, mnt_hash);
215 list_del_init(&child->mnt_hash);
216 umount_tree(child, 0, &umount_list); 270 umount_tree(child, 0, &umount_list);
217 } 271 }
218 spin_unlock(&vfsmount_lock); 272 spin_unlock(&vfsmount_lock);
diff --git a/fs/pnode.h b/fs/pnode.h
index f249be2fee7a..958665d662af 100644
--- a/fs/pnode.h
+++ b/fs/pnode.h
@@ -35,4 +35,6 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *,
35 struct list_head *); 35 struct list_head *);
36int propagate_umount(struct list_head *); 36int propagate_umount(struct list_head *);
37int propagate_mount_busy(struct vfsmount *, int); 37int propagate_mount_busy(struct vfsmount *, int);
38void mnt_release_group_id(struct vfsmount *);
39int get_dominating_id(struct vfsmount *mnt, const struct path *root);
38#endif /* _LINUX_PNODE_H */ 40#endif /* _LINUX_PNODE_H */
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 81d7d145292a..c5e412a00b17 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -502,17 +502,14 @@ static const struct inode_operations proc_def_inode_operations = {
502 .setattr = proc_setattr, 502 .setattr = proc_setattr,
503}; 503};
504 504
505extern const struct seq_operations mounts_op; 505static int mounts_open_common(struct inode *inode, struct file *file,
506struct proc_mounts { 506 const struct seq_operations *op)
507 struct seq_file m;
508 int event;
509};
510
511static int mounts_open(struct inode *inode, struct file *file)
512{ 507{
513 struct task_struct *task = get_proc_task(inode); 508 struct task_struct *task = get_proc_task(inode);
514 struct nsproxy *nsp; 509 struct nsproxy *nsp;
515 struct mnt_namespace *ns = NULL; 510 struct mnt_namespace *ns = NULL;
511 struct fs_struct *fs = NULL;
512 struct path root;
516 struct proc_mounts *p; 513 struct proc_mounts *p;
517 int ret = -EINVAL; 514 int ret = -EINVAL;
518 515
@@ -525,40 +522,61 @@ static int mounts_open(struct inode *inode, struct file *file)
525 get_mnt_ns(ns); 522 get_mnt_ns(ns);
526 } 523 }
527 rcu_read_unlock(); 524 rcu_read_unlock();
528 525 if (ns)
526 fs = get_fs_struct(task);
529 put_task_struct(task); 527 put_task_struct(task);
530 } 528 }
531 529
532 if (ns) { 530 if (!ns)
533 ret = -ENOMEM; 531 goto err;
534 p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); 532 if (!fs)
535 if (p) { 533 goto err_put_ns;
536 file->private_data = &p->m; 534
537 ret = seq_open(file, &mounts_op); 535 read_lock(&fs->lock);
538 if (!ret) { 536 root = fs->root;
539 p->m.private = ns; 537 path_get(&root);
540 p->event = ns->event; 538 read_unlock(&fs->lock);
541 return 0; 539 put_fs_struct(fs);
542 } 540
543 kfree(p); 541 ret = -ENOMEM;
544 } 542 p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL);
545 put_mnt_ns(ns); 543 if (!p)
546 } 544 goto err_put_path;
545
546 file->private_data = &p->m;
547 ret = seq_open(file, op);
548 if (ret)
549 goto err_free;
550
551 p->m.private = p;
552 p->ns = ns;
553 p->root = root;
554 p->event = ns->event;
555
556 return 0;
557
558 err_free:
559 kfree(p);
560 err_put_path:
561 path_put(&root);
562 err_put_ns:
563 put_mnt_ns(ns);
564 err:
547 return ret; 565 return ret;
548} 566}
549 567
550static int mounts_release(struct inode *inode, struct file *file) 568static int mounts_release(struct inode *inode, struct file *file)
551{ 569{
552 struct seq_file *m = file->private_data; 570 struct proc_mounts *p = file->private_data;
553 struct mnt_namespace *ns = m->private; 571 path_put(&p->root);
554 put_mnt_ns(ns); 572 put_mnt_ns(p->ns);
555 return seq_release(inode, file); 573 return seq_release(inode, file);
556} 574}
557 575
558static unsigned mounts_poll(struct file *file, poll_table *wait) 576static unsigned mounts_poll(struct file *file, poll_table *wait)
559{ 577{
560 struct proc_mounts *p = file->private_data; 578 struct proc_mounts *p = file->private_data;
561 struct mnt_namespace *ns = p->m.private; 579 struct mnt_namespace *ns = p->ns;
562 unsigned res = 0; 580 unsigned res = 0;
563 581
564 poll_wait(file, &ns->poll, wait); 582 poll_wait(file, &ns->poll, wait);
@@ -573,6 +591,11 @@ static unsigned mounts_poll(struct file *file, poll_table *wait)
573 return res; 591 return res;
574} 592}
575 593
594static int mounts_open(struct inode *inode, struct file *file)
595{
596 return mounts_open_common(inode, file, &mounts_op);
597}
598
576static const struct file_operations proc_mounts_operations = { 599static const struct file_operations proc_mounts_operations = {
577 .open = mounts_open, 600 .open = mounts_open,
578 .read = seq_read, 601 .read = seq_read,
@@ -581,38 +604,22 @@ static const struct file_operations proc_mounts_operations = {
581 .poll = mounts_poll, 604 .poll = mounts_poll,
582}; 605};
583 606
584extern const struct seq_operations mountstats_op; 607static int mountinfo_open(struct inode *inode, struct file *file)
585static int mountstats_open(struct inode *inode, struct file *file)
586{ 608{
587 int ret = seq_open(file, &mountstats_op); 609 return mounts_open_common(inode, file, &mountinfo_op);
588 610}
589 if (!ret) {
590 struct seq_file *m = file->private_data;
591 struct nsproxy *nsp;
592 struct mnt_namespace *mnt_ns = NULL;
593 struct task_struct *task = get_proc_task(inode);
594
595 if (task) {
596 rcu_read_lock();
597 nsp = task_nsproxy(task);
598 if (nsp) {
599 mnt_ns = nsp->mnt_ns;
600 if (mnt_ns)
601 get_mnt_ns(mnt_ns);
602 }
603 rcu_read_unlock();
604 611
605 put_task_struct(task); 612static const struct file_operations proc_mountinfo_operations = {
606 } 613 .open = mountinfo_open,
614 .read = seq_read,
615 .llseek = seq_lseek,
616 .release = mounts_release,
617 .poll = mounts_poll,
618};
607 619
608 if (mnt_ns) 620static int mountstats_open(struct inode *inode, struct file *file)
609 m->private = mnt_ns; 621{
610 else { 622 return mounts_open_common(inode, file, &mountstats_op);
611 seq_release(inode, file);
612 ret = -EINVAL;
613 }
614 }
615 return ret;
616} 623}
617 624
618static const struct file_operations proc_mountstats_operations = { 625static const struct file_operations proc_mountstats_operations = {
@@ -1626,7 +1633,6 @@ static int proc_readfd_common(struct file * filp, void * dirent,
1626 unsigned int fd, ino; 1633 unsigned int fd, ino;
1627 int retval; 1634 int retval;
1628 struct files_struct * files; 1635 struct files_struct * files;
1629 struct fdtable *fdt;
1630 1636
1631 retval = -ENOENT; 1637 retval = -ENOENT;
1632 if (!p) 1638 if (!p)
@@ -1649,9 +1655,8 @@ static int proc_readfd_common(struct file * filp, void * dirent,
1649 if (!files) 1655 if (!files)
1650 goto out; 1656 goto out;
1651 rcu_read_lock(); 1657 rcu_read_lock();
1652 fdt = files_fdtable(files);
1653 for (fd = filp->f_pos-2; 1658 for (fd = filp->f_pos-2;
1654 fd < fdt->max_fds; 1659 fd < files_fdtable(files)->max_fds;
1655 fd++, filp->f_pos++) { 1660 fd++, filp->f_pos++) {
1656 char name[PROC_NUMBUF]; 1661 char name[PROC_NUMBUF];
1657 int len; 1662 int len;
@@ -2311,6 +2316,7 @@ static const struct pid_entry tgid_base_stuff[] = {
2311 LNK("root", root), 2316 LNK("root", root),
2312 LNK("exe", exe), 2317 LNK("exe", exe),
2313 REG("mounts", S_IRUGO, mounts), 2318 REG("mounts", S_IRUGO, mounts),
2319 REG("mountinfo", S_IRUGO, mountinfo),
2314 REG("mountstats", S_IRUSR, mountstats), 2320 REG("mountstats", S_IRUSR, mountstats),
2315#ifdef CONFIG_PROC_PAGE_MONITOR 2321#ifdef CONFIG_PROC_PAGE_MONITOR
2316 REG("clear_refs", S_IWUSR, clear_refs), 2322 REG("clear_refs", S_IWUSR, clear_refs),
@@ -2643,6 +2649,7 @@ static const struct pid_entry tid_base_stuff[] = {
2643 LNK("root", root), 2649 LNK("root", root),
2644 LNK("exe", exe), 2650 LNK("exe", exe),
2645 REG("mounts", S_IRUGO, mounts), 2651 REG("mounts", S_IRUGO, mounts),
2652 REG("mountinfo", S_IRUGO, mountinfo),
2646#ifdef CONFIG_PROC_PAGE_MONITOR 2653#ifdef CONFIG_PROC_PAGE_MONITOR
2647 REG("clear_refs", S_IWUSR, clear_refs), 2654 REG("clear_refs", S_IWUSR, clear_refs),
2648 REG("smaps", S_IRUGO, smaps), 2655 REG("smaps", S_IRUGO, smaps),
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 4caa5f774fb7..13cd7835d0df 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -44,7 +44,9 @@ int seq_open_net(struct inode *ino, struct file *f,
44 put_net(net); 44 put_net(net);
45 return -ENOMEM; 45 return -ENOMEM;
46 } 46 }
47#ifdef CONFIG_NET_NS
47 p->net = net; 48 p->net = net;
49#endif
48 return 0; 50 return 0;
49} 51}
50EXPORT_SYMBOL_GPL(seq_open_net); 52EXPORT_SYMBOL_GPL(seq_open_net);
@@ -52,12 +54,10 @@ EXPORT_SYMBOL_GPL(seq_open_net);
52int seq_release_net(struct inode *ino, struct file *f) 54int seq_release_net(struct inode *ino, struct file *f)
53{ 55{
54 struct seq_file *seq; 56 struct seq_file *seq;
55 struct seq_net_private *p;
56 57
57 seq = f->private_data; 58 seq = f->private_data;
58 p = seq->private;
59 59
60 put_net(p->net); 60 put_net(seq_file_net(seq));
61 seq_release_private(ino, f); 61 seq_release_private(ino, f);
62 return 0; 62 return 0;
63} 63}
diff --git a/fs/read_write.c b/fs/read_write.c
index 49a98718ecdf..f0d1240a5c69 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -33,7 +33,7 @@ EXPORT_SYMBOL(generic_ro_fops);
33 33
34loff_t generic_file_llseek(struct file *file, loff_t offset, int origin) 34loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
35{ 35{
36 long long retval; 36 loff_t retval;
37 struct inode *inode = file->f_mapping->host; 37 struct inode *inode = file->f_mapping->host;
38 38
39 mutex_lock(&inode->i_mutex); 39 mutex_lock(&inode->i_mutex);
@@ -60,7 +60,7 @@ EXPORT_SYMBOL(generic_file_llseek);
60 60
61loff_t remote_llseek(struct file *file, loff_t offset, int origin) 61loff_t remote_llseek(struct file *file, loff_t offset, int origin)
62{ 62{
63 long long retval; 63 loff_t retval;
64 64
65 lock_kernel(); 65 lock_kernel();
66 switch (origin) { 66 switch (origin) {
@@ -91,7 +91,7 @@ EXPORT_SYMBOL(no_llseek);
91 91
92loff_t default_llseek(struct file *file, loff_t offset, int origin) 92loff_t default_llseek(struct file *file, loff_t offset, int origin)
93{ 93{
94 long long retval; 94 loff_t retval;
95 95
96 lock_kernel(); 96 lock_kernel();
97 switch (origin) { 97 switch (origin) {
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index e0f0f098a523..74363a7aacbc 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -4,6 +4,7 @@
4 4
5#include <linux/capability.h> 5#include <linux/capability.h>
6#include <linux/fs.h> 6#include <linux/fs.h>
7#include <linux/mount.h>
7#include <linux/reiserfs_fs.h> 8#include <linux/reiserfs_fs.h>
8#include <linux/time.h> 9#include <linux/time.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
@@ -25,6 +26,7 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
25 unsigned long arg) 26 unsigned long arg)
26{ 27{
27 unsigned int flags; 28 unsigned int flags;
29 int err = 0;
28 30
29 switch (cmd) { 31 switch (cmd) {
30 case REISERFS_IOC_UNPACK: 32 case REISERFS_IOC_UNPACK:
@@ -48,50 +50,67 @@ int reiserfs_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
48 if (!reiserfs_attrs(inode->i_sb)) 50 if (!reiserfs_attrs(inode->i_sb))
49 return -ENOTTY; 51 return -ENOTTY;
50 52
51 if (IS_RDONLY(inode)) 53 err = mnt_want_write(filp->f_path.mnt);
52 return -EROFS; 54 if (err)
55 return err;
53 56
54 if (!is_owner_or_cap(inode)) 57 if (!is_owner_or_cap(inode)) {
55 return -EPERM; 58 err = -EPERM;
56 59 goto setflags_out;
57 if (get_user(flags, (int __user *)arg)) 60 }
58 return -EFAULT; 61 if (get_user(flags, (int __user *)arg)) {
59 62 err = -EFAULT;
60 /* Is it quota file? Do not allow user to mess with it. */ 63 goto setflags_out;
61 if (IS_NOQUOTA(inode)) 64 }
62 return -EPERM; 65 /*
66 * Is it quota file? Do not allow user to mess with it
67 */
68 if (IS_NOQUOTA(inode)) {
69 err = -EPERM;
70 goto setflags_out;
71 }
63 if (((flags ^ REISERFS_I(inode)-> 72 if (((flags ^ REISERFS_I(inode)->
64 i_attrs) & (REISERFS_IMMUTABLE_FL | 73 i_attrs) & (REISERFS_IMMUTABLE_FL |
65 REISERFS_APPEND_FL)) 74 REISERFS_APPEND_FL))
66 && !capable(CAP_LINUX_IMMUTABLE)) 75 && !capable(CAP_LINUX_IMMUTABLE)) {
67 return -EPERM; 76 err = -EPERM;
68 77 goto setflags_out;
78 }
69 if ((flags & REISERFS_NOTAIL_FL) && 79 if ((flags & REISERFS_NOTAIL_FL) &&
70 S_ISREG(inode->i_mode)) { 80 S_ISREG(inode->i_mode)) {
71 int result; 81 int result;
72 82
73 result = reiserfs_unpack(inode, filp); 83 result = reiserfs_unpack(inode, filp);
74 if (result) 84 if (result) {
75 return result; 85 err = result;
86 goto setflags_out;
87 }
76 } 88 }
77 sd_attrs_to_i_attrs(flags, inode); 89 sd_attrs_to_i_attrs(flags, inode);
78 REISERFS_I(inode)->i_attrs = flags; 90 REISERFS_I(inode)->i_attrs = flags;
79 inode->i_ctime = CURRENT_TIME_SEC; 91 inode->i_ctime = CURRENT_TIME_SEC;
80 mark_inode_dirty(inode); 92 mark_inode_dirty(inode);
81 return 0; 93setflags_out:
94 mnt_drop_write(filp->f_path.mnt);
95 return err;
82 } 96 }
83 case REISERFS_IOC_GETVERSION: 97 case REISERFS_IOC_GETVERSION:
84 return put_user(inode->i_generation, (int __user *)arg); 98 return put_user(inode->i_generation, (int __user *)arg);
85 case REISERFS_IOC_SETVERSION: 99 case REISERFS_IOC_SETVERSION:
86 if (!is_owner_or_cap(inode)) 100 if (!is_owner_or_cap(inode))
87 return -EPERM; 101 return -EPERM;
88 if (IS_RDONLY(inode)) 102 err = mnt_want_write(filp->f_path.mnt);
89 return -EROFS; 103 if (err)
90 if (get_user(inode->i_generation, (int __user *)arg)) 104 return err;
91 return -EFAULT; 105 if (get_user(inode->i_generation, (int __user *)arg)) {
106 err = -EFAULT;
107 goto setversion_out;
108 }
92 inode->i_ctime = CURRENT_TIME_SEC; 109 inode->i_ctime = CURRENT_TIME_SEC;
93 mark_inode_dirty(inode); 110 mark_inode_dirty(inode);
94 return 0; 111setversion_out:
112 mnt_drop_write(filp->f_path.mnt);
113 return err;
95 default: 114 default:
96 return -ENOTTY; 115 return -ENOTTY;
97 } 116 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index bb05a3e51b93..060eb3f598e7 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -38,7 +38,7 @@
38#include <asm/system.h> 38#include <asm/system.h>
39 39
40#include <linux/time.h> 40#include <linux/time.h>
41#include <asm/semaphore.h> 41#include <linux/semaphore.h>
42 42
43#include <linux/vmalloc.h> 43#include <linux/vmalloc.h>
44#include <linux/reiserfs_fs.h> 44#include <linux/reiserfs_fs.h>
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 344b9b96cc56..d7c4935c1034 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -44,7 +44,6 @@
44#include <net/checksum.h> 44#include <net/checksum.h>
45#include <linux/smp_lock.h> 45#include <linux/smp_lock.h>
46#include <linux/stat.h> 46#include <linux/stat.h>
47#include <asm/semaphore.h>
48 47
49#define FL_READONLY 128 48#define FL_READONLY 128
50#define FL_DIR_SEM_HELD 256 49#define FL_DIR_SEM_HELD 256
diff --git a/fs/select.c b/fs/select.c
index 5633fe980781..00f58c5c7e05 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -260,7 +260,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout)
260 wait = NULL; 260 wait = NULL;
261 if (retval || !*timeout || signal_pending(current)) 261 if (retval || !*timeout || signal_pending(current))
262 break; 262 break;
263 if(table.error) { 263 if (table.error) {
264 retval = table.error; 264 retval = table.error;
265 break; 265 break;
266 } 266 }
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 853770274f20..3f54dbd6c49b 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -25,6 +25,7 @@
25 * into the buffer. In case of error ->start() and ->next() return 25 * into the buffer. In case of error ->start() and ->next() return
26 * ERR_PTR(error). In the end of sequence they return %NULL. ->show() 26 * ERR_PTR(error). In the end of sequence they return %NULL. ->show()
27 * returns 0 in case of success and negative number in case of error. 27 * returns 0 in case of success and negative number in case of error.
28 * Returning SEQ_SKIP means "discard this element and move on".
28 */ 29 */
29int seq_open(struct file *file, const struct seq_operations *op) 30int seq_open(struct file *file, const struct seq_operations *op)
30{ 31{
@@ -114,8 +115,10 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos)
114 if (!p || IS_ERR(p)) 115 if (!p || IS_ERR(p))
115 break; 116 break;
116 err = m->op->show(m, p); 117 err = m->op->show(m, p);
117 if (err) 118 if (err < 0)
118 break; 119 break;
120 if (unlikely(err))
121 m->count = 0;
119 if (m->count < m->size) 122 if (m->count < m->size)
120 goto Fill; 123 goto Fill;
121 m->op->stop(m, p); 124 m->op->stop(m, p);
@@ -140,9 +143,10 @@ Fill:
140 break; 143 break;
141 } 144 }
142 err = m->op->show(m, p); 145 err = m->op->show(m, p);
143 if (err || m->count == m->size) { 146 if (m->count == m->size || err) {
144 m->count = offs; 147 m->count = offs;
145 break; 148 if (likely(err <= 0))
149 break;
146 } 150 }
147 pos = next; 151 pos = next;
148 } 152 }
@@ -199,8 +203,12 @@ static int traverse(struct seq_file *m, loff_t offset)
199 if (IS_ERR(p)) 203 if (IS_ERR(p))
200 break; 204 break;
201 error = m->op->show(m, p); 205 error = m->op->show(m, p);
202 if (error) 206 if (error < 0)
203 break; 207 break;
208 if (unlikely(error)) {
209 error = 0;
210 m->count = 0;
211 }
204 if (m->count == m->size) 212 if (m->count == m->size)
205 goto Eoverflow; 213 goto Eoverflow;
206 if (pos + m->count > offset) { 214 if (pos + m->count > offset) {
@@ -239,7 +247,7 @@ Eoverflow:
239loff_t seq_lseek(struct file *file, loff_t offset, int origin) 247loff_t seq_lseek(struct file *file, loff_t offset, int origin)
240{ 248{
241 struct seq_file *m = (struct seq_file *)file->private_data; 249 struct seq_file *m = (struct seq_file *)file->private_data;
242 long long retval = -EINVAL; 250 loff_t retval = -EINVAL;
243 251
244 mutex_lock(&m->lock); 252 mutex_lock(&m->lock);
245 m->version = file->f_version; 253 m->version = file->f_version;
@@ -342,28 +350,40 @@ int seq_printf(struct seq_file *m, const char *f, ...)
342} 350}
343EXPORT_SYMBOL(seq_printf); 351EXPORT_SYMBOL(seq_printf);
344 352
353static char *mangle_path(char *s, char *p, char *esc)
354{
355 while (s <= p) {
356 char c = *p++;
357 if (!c) {
358 return s;
359 } else if (!strchr(esc, c)) {
360 *s++ = c;
361 } else if (s + 4 > p) {
362 break;
363 } else {
364 *s++ = '\\';
365 *s++ = '0' + ((c & 0300) >> 6);
366 *s++ = '0' + ((c & 070) >> 3);
367 *s++ = '0' + (c & 07);
368 }
369 }
370 return NULL;
371}
372
373/*
374 * return the absolute path of 'dentry' residing in mount 'mnt'.
375 */
345int seq_path(struct seq_file *m, struct path *path, char *esc) 376int seq_path(struct seq_file *m, struct path *path, char *esc)
346{ 377{
347 if (m->count < m->size) { 378 if (m->count < m->size) {
348 char *s = m->buf + m->count; 379 char *s = m->buf + m->count;
349 char *p = d_path(path, s, m->size - m->count); 380 char *p = d_path(path, s, m->size - m->count);
350 if (!IS_ERR(p)) { 381 if (!IS_ERR(p)) {
351 while (s <= p) { 382 s = mangle_path(s, p, esc);
352 char c = *p++; 383 if (s) {
353 if (!c) { 384 p = m->buf + m->count;
354 p = m->buf + m->count; 385 m->count = s - m->buf;
355 m->count = s - m->buf; 386 return s - p;
356 return s - p;
357 } else if (!strchr(esc, c)) {
358 *s++ = c;
359 } else if (s + 4 > p) {
360 break;
361 } else {
362 *s++ = '\\';
363 *s++ = '0' + ((c & 0300) >> 6);
364 *s++ = '0' + ((c & 070) >> 3);
365 *s++ = '0' + (c & 07);
366 }
367 } 387 }
368 } 388 }
369 } 389 }
@@ -372,6 +392,57 @@ int seq_path(struct seq_file *m, struct path *path, char *esc)
372} 392}
373EXPORT_SYMBOL(seq_path); 393EXPORT_SYMBOL(seq_path);
374 394
395/*
396 * Same as seq_path, but relative to supplied root.
397 *
398 * root may be changed, see __d_path().
399 */
400int seq_path_root(struct seq_file *m, struct path *path, struct path *root,
401 char *esc)
402{
403 int err = -ENAMETOOLONG;
404 if (m->count < m->size) {
405 char *s = m->buf + m->count;
406 char *p;
407
408 spin_lock(&dcache_lock);
409 p = __d_path(path, root, s, m->size - m->count);
410 spin_unlock(&dcache_lock);
411 err = PTR_ERR(p);
412 if (!IS_ERR(p)) {
413 s = mangle_path(s, p, esc);
414 if (s) {
415 p = m->buf + m->count;
416 m->count = s - m->buf;
417 return 0;
418 }
419 }
420 }
421 m->count = m->size;
422 return err;
423}
424
425/*
426 * returns the path of the 'dentry' from the root of its filesystem.
427 */
428int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
429{
430 if (m->count < m->size) {
431 char *s = m->buf + m->count;
432 char *p = dentry_path(dentry, s, m->size - m->count);
433 if (!IS_ERR(p)) {
434 s = mangle_path(s, p, esc);
435 if (s) {
436 p = m->buf + m->count;
437 m->count = s - m->buf;
438 return s - p;
439 }
440 }
441 }
442 m->count = m->size;
443 return -1;
444}
445
375static void *single_start(struct seq_file *p, loff_t *pos) 446static void *single_start(struct seq_file *p, loff_t *pos)
376{ 447{
377 return NULL + (*pos == 0); 448 return NULL + (*pos == 0);
diff --git a/fs/super.c b/fs/super.c
index 09008dbd264e..4798350b2bc9 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -37,7 +37,9 @@
37#include <linux/idr.h> 37#include <linux/idr.h>
38#include <linux/kobject.h> 38#include <linux/kobject.h>
39#include <linux/mutex.h> 39#include <linux/mutex.h>
40#include <linux/file.h>
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
42#include "internal.h"
41 43
42 44
43LIST_HEAD(super_blocks); 45LIST_HEAD(super_blocks);
@@ -567,10 +569,29 @@ static void mark_files_ro(struct super_block *sb)
567{ 569{
568 struct file *f; 570 struct file *f;
569 571
572retry:
570 file_list_lock(); 573 file_list_lock();
571 list_for_each_entry(f, &sb->s_files, f_u.fu_list) { 574 list_for_each_entry(f, &sb->s_files, f_u.fu_list) {
572 if (S_ISREG(f->f_path.dentry->d_inode->i_mode) && file_count(f)) 575 struct vfsmount *mnt;
573 f->f_mode &= ~FMODE_WRITE; 576 if (!S_ISREG(f->f_path.dentry->d_inode->i_mode))
577 continue;
578 if (!file_count(f))
579 continue;
580 if (!(f->f_mode & FMODE_WRITE))
581 continue;
582 f->f_mode &= ~FMODE_WRITE;
583 if (file_check_writeable(f) != 0)
584 continue;
585 file_release_write(f);
586 mnt = mntget(f->f_path.mnt);
587 file_list_unlock();
588 /*
589 * This can sleep, so we can't hold
590 * the file_list_lock() spinlock.
591 */
592 mnt_drop_write(mnt);
593 mntput(mnt);
594 goto retry;
574 } 595 }
575 file_list_unlock(); 596 file_list_unlock();
576} 597}
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 4948d9bc405d..a1c3a1fab7f0 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -20,6 +20,7 @@
20#include <linux/idr.h> 20#include <linux/idr.h>
21#include <linux/completion.h> 21#include <linux/completion.h>
22#include <linux/mutex.h> 22#include <linux/mutex.h>
23#include <linux/slab.h>
23#include "sysfs.h" 24#include "sysfs.h"
24 25
25DEFINE_MUTEX(sysfs_mutex); 26DEFINE_MUTEX(sysfs_mutex);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index baa663e69388..ade9a7e6a757 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -13,6 +13,7 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kobject.h> 14#include <linux/kobject.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/slab.h>
16#include <linux/namei.h> 17#include <linux/namei.h>
17#include <linux/poll.h> 18#include <linux/poll.h>
18#include <linux/list.h> 19#include <linux/list.h>
@@ -128,7 +129,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos)
128 ssize_t retval = 0; 129 ssize_t retval = 0;
129 130
130 mutex_lock(&buffer->mutex); 131 mutex_lock(&buffer->mutex);
131 if (buffer->needs_read_fill) { 132 if (buffer->needs_read_fill || *ppos == 0) {
132 retval = fill_read_buffer(file->f_path.dentry,buffer); 133 retval = fill_read_buffer(file->f_path.dentry,buffer);
133 if (retval) 134 if (retval)
134 goto out; 135 goto out;
@@ -409,8 +410,7 @@ static int sysfs_release(struct inode *inode, struct file *filp)
409 * return POLLERR|POLLPRI, and select will return the fd whether 410 * return POLLERR|POLLPRI, and select will return the fd whether
410 * it is waiting for read, write, or exceptions. 411 * it is waiting for read, write, or exceptions.
411 * Once poll/select indicates that the value has changed, you 412 * Once poll/select indicates that the value has changed, you
412 * need to close and re-open the file, as simply seeking and reading 413 * need to close and re-open the file, or seek to 0 and read again.
413 * again will not get new data, or reset the state of 'poll'.
414 * Reminder: this only works for attributes which actively support 414 * Reminder: this only works for attributes which actively support
415 * it, and it is not possible to test an attribute from userspace 415 * it, and it is not possible to test an attribute from userspace
416 * to see if it supports poll (Neither 'poll' nor 'select' return 416 * to see if it supports poll (Neither 'poll' nor 'select' return
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c
index 5f66c4466151..817f5966edca 100644
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -87,7 +87,14 @@ int sysfs_create_link(struct kobject * kobj, struct kobject * target, const char
87 87
88void sysfs_remove_link(struct kobject * kobj, const char * name) 88void sysfs_remove_link(struct kobject * kobj, const char * name)
89{ 89{
90 sysfs_hash_and_remove(kobj->sd, name); 90 struct sysfs_dirent *parent_sd = NULL;
91
92 if (!kobj)
93 parent_sd = &sysfs_root;
94 else
95 parent_sd = kobj->sd;
96
97 sysfs_hash_and_remove(parent_sd, name);
91} 98}
92 99
93static int sysfs_get_target_path(struct sysfs_dirent *parent_sd, 100static int sysfs_get_target_path(struct sysfs_dirent *parent_sd,
diff --git a/fs/udf/Makefile b/fs/udf/Makefile
index be845e7540ef..0d4503f7446d 100644
--- a/fs/udf/Makefile
+++ b/fs/udf/Makefile
@@ -6,4 +6,4 @@ obj-$(CONFIG_UDF_FS) += udf.o
6 6
7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \ 7udf-objs := balloc.o dir.o file.o ialloc.o inode.o lowlevel.o namei.o \
8 partition.o super.o truncate.o symlink.o fsync.o \ 8 partition.o super.o truncate.o symlink.o fsync.o \
9 crc.o directory.o misc.o udftime.o unicode.o 9 directory.o misc.o udftime.o unicode.o
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index f855dcbbdfb8..1b809bd494bd 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -149,8 +149,7 @@ static bool udf_add_free_space(struct udf_sb_info *sbi,
149 return false; 149 return false;
150 150
151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data; 151 lvid = (struct logicalVolIntegrityDesc *)sbi->s_lvid_bh->b_data;
152 lvid->freeSpaceTable[partition] = cpu_to_le32(le32_to_cpu( 152 le32_add_cpu(&lvid->freeSpaceTable[partition], cnt);
153 lvid->freeSpaceTable[partition]) + cnt);
154 return true; 153 return true;
155} 154}
156 155
@@ -589,10 +588,8 @@ static void udf_table_free_blocks(struct super_block *sb,
589 sptr = oepos.bh->b_data + epos.offset; 588 sptr = oepos.bh->b_data + epos.offset;
590 aed = (struct allocExtDesc *) 589 aed = (struct allocExtDesc *)
591 oepos.bh->b_data; 590 oepos.bh->b_data;
592 aed->lengthAllocDescs = 591 le32_add_cpu(&aed->lengthAllocDescs,
593 cpu_to_le32(le32_to_cpu( 592 adsize);
594 aed->lengthAllocDescs) +
595 adsize);
596 } else { 593 } else {
597 sptr = iinfo->i_ext.i_data + 594 sptr = iinfo->i_ext.i_data +
598 epos.offset; 595 epos.offset;
@@ -645,9 +642,7 @@ static void udf_table_free_blocks(struct super_block *sb,
645 mark_inode_dirty(table); 642 mark_inode_dirty(table);
646 } else { 643 } else {
647 aed = (struct allocExtDesc *)epos.bh->b_data; 644 aed = (struct allocExtDesc *)epos.bh->b_data;
648 aed->lengthAllocDescs = 645 le32_add_cpu(&aed->lengthAllocDescs, adsize);
649 cpu_to_le32(le32_to_cpu(
650 aed->lengthAllocDescs) + adsize);
651 udf_update_tag(epos.bh->b_data, epos.offset); 646 udf_update_tag(epos.bh->b_data, epos.offset);
652 mark_buffer_dirty(epos.bh); 647 mark_buffer_dirty(epos.bh);
653 } 648 }
diff --git a/fs/udf/crc.c b/fs/udf/crc.c
deleted file mode 100644
index b1661296e786..000000000000
--- a/fs/udf/crc.c
+++ /dev/null
@@ -1,172 +0,0 @@
1/*
2 * crc.c
3 *
4 * PURPOSE
5 * Routines to generate, calculate, and test a 16-bit CRC.
6 *
7 * DESCRIPTION
8 * The CRC code was devised by Don P. Mitchell of AT&T Bell Laboratories
9 * and Ned W. Rhodes of Software Systems Group. It has been published in
10 * "Design and Validation of Computer Protocols", Prentice Hall,
11 * Englewood Cliffs, NJ, 1991, Chapter 3, ISBN 0-13-539925-4.
12 *
13 * Copyright is held by AT&T.
14 *
15 * AT&T gives permission for the free use of the CRC source code.
16 *
17 * COPYRIGHT
18 * This file is distributed under the terms of the GNU General Public
19 * License (GPL). Copies of the GPL can be obtained from:
20 * ftp://prep.ai.mit.edu/pub/gnu/GPL
21 * Each contributing author retains all rights to their own work.
22 */
23
24#include "udfdecl.h"
25
26static uint16_t crc_table[256] = {
27 0x0000U, 0x1021U, 0x2042U, 0x3063U, 0x4084U, 0x50a5U, 0x60c6U, 0x70e7U,
28 0x8108U, 0x9129U, 0xa14aU, 0xb16bU, 0xc18cU, 0xd1adU, 0xe1ceU, 0xf1efU,
29 0x1231U, 0x0210U, 0x3273U, 0x2252U, 0x52b5U, 0x4294U, 0x72f7U, 0x62d6U,
30 0x9339U, 0x8318U, 0xb37bU, 0xa35aU, 0xd3bdU, 0xc39cU, 0xf3ffU, 0xe3deU,
31 0x2462U, 0x3443U, 0x0420U, 0x1401U, 0x64e6U, 0x74c7U, 0x44a4U, 0x5485U,
32 0xa56aU, 0xb54bU, 0x8528U, 0x9509U, 0xe5eeU, 0xf5cfU, 0xc5acU, 0xd58dU,
33 0x3653U, 0x2672U, 0x1611U, 0x0630U, 0x76d7U, 0x66f6U, 0x5695U, 0x46b4U,
34 0xb75bU, 0xa77aU, 0x9719U, 0x8738U, 0xf7dfU, 0xe7feU, 0xd79dU, 0xc7bcU,
35 0x48c4U, 0x58e5U, 0x6886U, 0x78a7U, 0x0840U, 0x1861U, 0x2802U, 0x3823U,
36 0xc9ccU, 0xd9edU, 0xe98eU, 0xf9afU, 0x8948U, 0x9969U, 0xa90aU, 0xb92bU,
37 0x5af5U, 0x4ad4U, 0x7ab7U, 0x6a96U, 0x1a71U, 0x0a50U, 0x3a33U, 0x2a12U,
38 0xdbfdU, 0xcbdcU, 0xfbbfU, 0xeb9eU, 0x9b79U, 0x8b58U, 0xbb3bU, 0xab1aU,
39 0x6ca6U, 0x7c87U, 0x4ce4U, 0x5cc5U, 0x2c22U, 0x3c03U, 0x0c60U, 0x1c41U,
40 0xedaeU, 0xfd8fU, 0xcdecU, 0xddcdU, 0xad2aU, 0xbd0bU, 0x8d68U, 0x9d49U,
41 0x7e97U, 0x6eb6U, 0x5ed5U, 0x4ef4U, 0x3e13U, 0x2e32U, 0x1e51U, 0x0e70U,
42 0xff9fU, 0xefbeU, 0xdfddU, 0xcffcU, 0xbf1bU, 0xaf3aU, 0x9f59U, 0x8f78U,
43 0x9188U, 0x81a9U, 0xb1caU, 0xa1ebU, 0xd10cU, 0xc12dU, 0xf14eU, 0xe16fU,
44 0x1080U, 0x00a1U, 0x30c2U, 0x20e3U, 0x5004U, 0x4025U, 0x7046U, 0x6067U,
45 0x83b9U, 0x9398U, 0xa3fbU, 0xb3daU, 0xc33dU, 0xd31cU, 0xe37fU, 0xf35eU,
46 0x02b1U, 0x1290U, 0x22f3U, 0x32d2U, 0x4235U, 0x5214U, 0x6277U, 0x7256U,
47 0xb5eaU, 0xa5cbU, 0x95a8U, 0x8589U, 0xf56eU, 0xe54fU, 0xd52cU, 0xc50dU,
48 0x34e2U, 0x24c3U, 0x14a0U, 0x0481U, 0x7466U, 0x6447U, 0x5424U, 0x4405U,
49 0xa7dbU, 0xb7faU, 0x8799U, 0x97b8U, 0xe75fU, 0xf77eU, 0xc71dU, 0xd73cU,
50 0x26d3U, 0x36f2U, 0x0691U, 0x16b0U, 0x6657U, 0x7676U, 0x4615U, 0x5634U,
51 0xd94cU, 0xc96dU, 0xf90eU, 0xe92fU, 0x99c8U, 0x89e9U, 0xb98aU, 0xa9abU,
52 0x5844U, 0x4865U, 0x7806U, 0x6827U, 0x18c0U, 0x08e1U, 0x3882U, 0x28a3U,
53 0xcb7dU, 0xdb5cU, 0xeb3fU, 0xfb1eU, 0x8bf9U, 0x9bd8U, 0xabbbU, 0xbb9aU,
54 0x4a75U, 0x5a54U, 0x6a37U, 0x7a16U, 0x0af1U, 0x1ad0U, 0x2ab3U, 0x3a92U,
55 0xfd2eU, 0xed0fU, 0xdd6cU, 0xcd4dU, 0xbdaaU, 0xad8bU, 0x9de8U, 0x8dc9U,
56 0x7c26U, 0x6c07U, 0x5c64U, 0x4c45U, 0x3ca2U, 0x2c83U, 0x1ce0U, 0x0cc1U,
57 0xef1fU, 0xff3eU, 0xcf5dU, 0xdf7cU, 0xaf9bU, 0xbfbaU, 0x8fd9U, 0x9ff8U,
58 0x6e17U, 0x7e36U, 0x4e55U, 0x5e74U, 0x2e93U, 0x3eb2U, 0x0ed1U, 0x1ef0U
59};
60
61/*
62 * udf_crc
63 *
64 * PURPOSE
65 * Calculate a 16-bit CRC checksum using ITU-T V.41 polynomial.
66 *
67 * DESCRIPTION
68 * The OSTA-UDF(tm) 1.50 standard states that using CRCs is mandatory.
69 * The polynomial used is: x^16 + x^12 + x^15 + 1
70 *
71 * PRE-CONDITIONS
72 * data Pointer to the data block.
73 * size Size of the data block.
74 *
75 * POST-CONDITIONS
76 * <return> CRC of the data block.
77 *
78 * HISTORY
79 * July 21, 1997 - Andrew E. Mileski
80 * Adapted from OSTA-UDF(tm) 1.50 standard.
81 */
82uint16_t udf_crc(uint8_t *data, uint32_t size, uint16_t crc)
83{
84 while (size--)
85 crc = crc_table[(crc >> 8 ^ *(data++)) & 0xffU] ^ (crc << 8);
86
87 return crc;
88}
89
90/****************************************************************************/
91#if defined(TEST)
92
93/*
94 * PURPOSE
95 * Test udf_crc()
96 *
97 * HISTORY
98 * July 21, 1997 - Andrew E. Mileski
99 * Adapted from OSTA-UDF(tm) 1.50 standard.
100 */
101
102unsigned char bytes[] = { 0x70U, 0x6AU, 0x77U };
103
104int main(void)
105{
106 unsigned short x;
107
108 x = udf_crc(bytes, sizeof bytes);
109 printf("udf_crc: calculated = %4.4x, correct = %4.4x\n", x, 0x3299U);
110
111 return 0;
112}
113
114#endif /* defined(TEST) */
115
116/****************************************************************************/
117#if defined(GENERATE)
118
119/*
120 * PURPOSE
121 * Generate a table for fast 16-bit CRC calculations (any polynomial).
122 *
123 * DESCRIPTION
124 * The ITU-T V.41 polynomial is 010041.
125 *
126 * HISTORY
127 * July 21, 1997 - Andrew E. Mileski
128 * Adapted from OSTA-UDF(tm) 1.50 standard.
129 */
130
131#include <stdio.h>
132
133int main(int argc, char **argv)
134{
135 unsigned long crc, poly;
136 int n, i;
137
138 /* Get the polynomial */
139 sscanf(argv[1], "%lo", &poly);
140 if (poly & 0xffff0000U) {
141 fprintf(stderr, "polynomial is too large\en");
142 exit(1);
143 }
144
145 printf("/* CRC 0%o */\n", poly);
146
147 /* Create a table */
148 printf("static unsigned short crc_table[256] = {\n");
149 for (n = 0; n < 256; n++) {
150 if (n % 8 == 0)
151 printf("\t");
152 crc = n << 8;
153 for (i = 0; i < 8; i++) {
154 if (crc & 0x8000U)
155 crc = (crc << 1) ^ poly;
156 else
157 crc <<= 1;
158 crc &= 0xFFFFU;
159 }
160 if (n == 255)
161 printf("0x%04xU ", crc);
162 else
163 printf("0x%04xU, ", crc);
164 if (n % 8 == 7)
165 printf("\n");
166 }
167 printf("};\n");
168
169 return 0;
170}
171
172#endif /* defined(GENERATE) */
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 8d8643ada199..62dc270c69d1 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -39,13 +39,13 @@
39static int do_udf_readdir(struct inode *dir, struct file *filp, 39static int do_udf_readdir(struct inode *dir, struct file *filp,
40 filldir_t filldir, void *dirent) 40 filldir_t filldir, void *dirent)
41{ 41{
42 struct udf_fileident_bh fibh; 42 struct udf_fileident_bh fibh = { .sbh = NULL, .ebh = NULL};
43 struct fileIdentDesc *fi = NULL; 43 struct fileIdentDesc *fi = NULL;
44 struct fileIdentDesc cfi; 44 struct fileIdentDesc cfi;
45 int block, iblock; 45 int block, iblock;
46 loff_t nf_pos = (filp->f_pos - 1) << 2; 46 loff_t nf_pos = (filp->f_pos - 1) << 2;
47 int flen; 47 int flen;
48 char fname[UDF_NAME_LEN]; 48 char *fname = NULL;
49 char *nameptr; 49 char *nameptr;
50 uint16_t liu; 50 uint16_t liu;
51 uint8_t lfi; 51 uint8_t lfi;
@@ -54,23 +54,32 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
54 kernel_lb_addr eloc; 54 kernel_lb_addr eloc;
55 uint32_t elen; 55 uint32_t elen;
56 sector_t offset; 56 sector_t offset;
57 int i, num; 57 int i, num, ret = 0;
58 unsigned int dt_type; 58 unsigned int dt_type;
59 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
60 struct udf_inode_info *iinfo; 60 struct udf_inode_info *iinfo;
61 61
62 if (nf_pos >= size) 62 if (nf_pos >= size)
63 return 0; 63 goto out;
64
65 fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
66 if (!fname) {
67 ret = -ENOMEM;
68 goto out;
69 }
64 70
65 if (nf_pos == 0) 71 if (nf_pos == 0)
66 nf_pos = udf_ext0_offset(dir); 72 nf_pos = udf_ext0_offset(dir);
67 73
68 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 74 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1);
69 iinfo = UDF_I(dir); 75 iinfo = UDF_I(dir);
70 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) { 76 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
71 fibh.sbh = fibh.ebh = NULL; 77 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits,
72 } else if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 78 &epos, &eloc, &elen, &offset)
73 &epos, &eloc, &elen, &offset) == (EXT_RECORDED_ALLOCATED >> 30)) { 79 != (EXT_RECORDED_ALLOCATED >> 30)) {
80 ret = -ENOENT;
81 goto out;
82 }
74 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 83 block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
75 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 84 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
76 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 85 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -83,8 +92,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
83 } 92 }
84 93
85 if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { 94 if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) {
86 brelse(epos.bh); 95 ret = -EIO;
87 return -EIO; 96 goto out;
88 } 97 }
89 98
90 if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { 99 if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) {
@@ -105,9 +114,6 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
105 brelse(bha[i]); 114 brelse(bha[i]);
106 } 115 }
107 } 116 }
108 } else {
109 brelse(epos.bh);
110 return -ENOENT;
111 } 117 }
112 118
113 while (nf_pos < size) { 119 while (nf_pos < size) {
@@ -115,13 +121,8 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
115 121
116 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc, 122 fi = udf_fileident_read(dir, &nf_pos, &fibh, &cfi, &epos, &eloc,
117 &elen, &offset); 123 &elen, &offset);
118 if (!fi) { 124 if (!fi)
119 if (fibh.sbh != fibh.ebh) 125 goto out;
120 brelse(fibh.ebh);
121 brelse(fibh.sbh);
122 brelse(epos.bh);
123 return 0;
124 }
125 126
126 liu = le16_to_cpu(cfi.lengthOfImpUse); 127 liu = le16_to_cpu(cfi.lengthOfImpUse);
127 lfi = cfi.lengthFileIdent; 128 lfi = cfi.lengthFileIdent;
@@ -167,53 +168,23 @@ static int do_udf_readdir(struct inode *dir, struct file *filp,
167 dt_type = DT_UNKNOWN; 168 dt_type = DT_UNKNOWN;
168 } 169 }
169 170
170 if (flen) { 171 if (flen && filldir(dirent, fname, flen, filp->f_pos,
171 if (filldir(dirent, fname, flen, filp->f_pos, iblock, dt_type) < 0) { 172 iblock, dt_type) < 0)
172 if (fibh.sbh != fibh.ebh) 173 goto out;
173 brelse(fibh.ebh);
174 brelse(fibh.sbh);
175 brelse(epos.bh);
176 return 0;
177 }
178 }
179 } /* end while */ 174 } /* end while */
180 175
181 filp->f_pos = (nf_pos >> 2) + 1; 176 filp->f_pos = (nf_pos >> 2) + 1;
182 177
178out:
183 if (fibh.sbh != fibh.ebh) 179 if (fibh.sbh != fibh.ebh)
184 brelse(fibh.ebh); 180 brelse(fibh.ebh);
185 brelse(fibh.sbh); 181 brelse(fibh.sbh);
186 brelse(epos.bh); 182 brelse(epos.bh);
183 kfree(fname);
187 184
188 return 0; 185 return ret;
189} 186}
190 187
191/*
192 * udf_readdir
193 *
194 * PURPOSE
195 * Read a directory entry.
196 *
197 * DESCRIPTION
198 * Optional - sys_getdents() will return -ENOTDIR if this routine is not
199 * available.
200 *
201 * Refer to sys_getdents() in fs/readdir.c
202 * sys_getdents() -> .
203 *
204 * PRE-CONDITIONS
205 * filp Pointer to directory file.
206 * buf Pointer to directory entry buffer.
207 * filldir Pointer to filldir function.
208 *
209 * POST-CONDITIONS
210 * <return> >=0 on success.
211 *
212 * HISTORY
213 * July 1, 1997 - Andrew E. Mileski
214 * Written, tested, and released.
215 */
216
217static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir) 188static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
218{ 189{
219 struct inode *dir = filp->f_path.dentry->d_inode; 190 struct inode *dir = filp->f_path.dentry->d_inode;
diff --git a/fs/udf/ecma_167.h b/fs/udf/ecma_167.h
index 56387711589b..a0974df82b31 100644
--- a/fs/udf/ecma_167.h
+++ b/fs/udf/ecma_167.h
@@ -70,19 +70,6 @@ typedef struct {
70 uint8_t microseconds; 70 uint8_t microseconds;
71} __attribute__ ((packed)) timestamp; 71} __attribute__ ((packed)) timestamp;
72 72
73typedef struct {
74 uint16_t typeAndTimezone;
75 int16_t year;
76 uint8_t month;
77 uint8_t day;
78 uint8_t hour;
79 uint8_t minute;
80 uint8_t second;
81 uint8_t centiseconds;
82 uint8_t hundredsOfMicroseconds;
83 uint8_t microseconds;
84} __attribute__ ((packed)) kernel_timestamp;
85
86/* Type and Time Zone (ECMA 167r3 1/7.3.1) */ 73/* Type and Time Zone (ECMA 167r3 1/7.3.1) */
87#define TIMESTAMP_TYPE_MASK 0xF000 74#define TIMESTAMP_TYPE_MASK 0xF000
88#define TIMESTAMP_TYPE_CUT 0x0000 75#define TIMESTAMP_TYPE_CUT 0x0000
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 97c71ae7c689..0ed6e146a0d9 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -27,7 +27,6 @@
27 27
28#include "udfdecl.h" 28#include "udfdecl.h"
29#include <linux/fs.h> 29#include <linux/fs.h>
30#include <linux/udf_fs.h>
31#include <asm/uaccess.h> 30#include <asm/uaccess.h>
32#include <linux/kernel.h> 31#include <linux/kernel.h>
33#include <linux/string.h> /* memset */ 32#include <linux/string.h> /* memset */
@@ -144,40 +143,6 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
144 return retval; 143 return retval;
145} 144}
146 145
147/*
148 * udf_ioctl
149 *
150 * PURPOSE
151 * Issue an ioctl.
152 *
153 * DESCRIPTION
154 * Optional - sys_ioctl() will return -ENOTTY if this routine is not
155 * available, and the ioctl cannot be handled without filesystem help.
156 *
157 * sys_ioctl() handles these ioctls that apply only to regular files:
158 * FIBMAP [requires udf_block_map()], FIGETBSZ, FIONREAD
159 * These ioctls are also handled by sys_ioctl():
160 * FIOCLEX, FIONCLEX, FIONBIO, FIOASYNC
161 * All other ioctls are passed to the filesystem.
162 *
163 * Refer to sys_ioctl() in fs/ioctl.c
164 * sys_ioctl() -> .
165 *
166 * PRE-CONDITIONS
167 * inode Pointer to inode that ioctl was issued on.
168 * filp Pointer to file that ioctl was issued on.
169 * cmd The ioctl command.
170 * arg The ioctl argument [can be interpreted as a
171 * user-space pointer if desired].
172 *
173 * POST-CONDITIONS
174 * <return> Success (>=0) or an error code (<=0) that
175 * sys_ioctl() will return.
176 *
177 * HISTORY
178 * July 1, 1997 - Andrew E. Mileski
179 * Written, tested, and released.
180 */
181int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, 146int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
182 unsigned long arg) 147 unsigned long arg)
183{ 148{
@@ -225,18 +190,6 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
225 return result; 190 return result;
226} 191}
227 192
228/*
229 * udf_release_file
230 *
231 * PURPOSE
232 * Called when all references to the file are closed
233 *
234 * DESCRIPTION
235 * Discard prealloced blocks
236 *
237 * HISTORY
238 *
239 */
240static int udf_release_file(struct inode *inode, struct file *filp) 193static int udf_release_file(struct inode *inode, struct file *filp)
241{ 194{
242 if (filp->f_mode & FMODE_WRITE) { 195 if (filp->f_mode & FMODE_WRITE) {
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 84360315aca2..eb9cfa23dc3d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -21,7 +21,6 @@
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h> 23#include <linux/quotaops.h>
24#include <linux/udf_fs.h>
25#include <linux/sched.h> 24#include <linux/sched.h>
26#include <linux/slab.h> 25#include <linux/slab.h>
27 26
@@ -47,11 +46,9 @@ void udf_free_inode(struct inode *inode)
47 struct logicalVolIntegrityDescImpUse *lvidiu = 46 struct logicalVolIntegrityDescImpUse *lvidiu =
48 udf_sb_lvidiu(sbi); 47 udf_sb_lvidiu(sbi);
49 if (S_ISDIR(inode->i_mode)) 48 if (S_ISDIR(inode->i_mode))
50 lvidiu->numDirs = 49 le32_add_cpu(&lvidiu->numDirs, -1);
51 cpu_to_le32(le32_to_cpu(lvidiu->numDirs) - 1);
52 else 50 else
53 lvidiu->numFiles = 51 le32_add_cpu(&lvidiu->numFiles, -1);
54 cpu_to_le32(le32_to_cpu(lvidiu->numFiles) - 1);
55 52
56 mark_buffer_dirty(sbi->s_lvid_bh); 53 mark_buffer_dirty(sbi->s_lvid_bh);
57 } 54 }
@@ -105,11 +102,9 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
105 lvhd = (struct logicalVolHeaderDesc *) 102 lvhd = (struct logicalVolHeaderDesc *)
106 (lvid->logicalVolContentsUse); 103 (lvid->logicalVolContentsUse);
107 if (S_ISDIR(mode)) 104 if (S_ISDIR(mode))
108 lvidiu->numDirs = 105 le32_add_cpu(&lvidiu->numDirs, 1);
109 cpu_to_le32(le32_to_cpu(lvidiu->numDirs) + 1);
110 else 106 else
111 lvidiu->numFiles = 107 le32_add_cpu(&lvidiu->numFiles, 1);
112 cpu_to_le32(le32_to_cpu(lvidiu->numFiles) + 1);
113 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID); 108 iinfo->i_unique = uniqueID = le64_to_cpu(lvhd->uniqueID);
114 if (!(++uniqueID & 0x00000000FFFFFFFFUL)) 109 if (!(++uniqueID & 0x00000000FFFFFFFFUL))
115 uniqueID += 16; 110 uniqueID += 16;
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 24cfa55d0fdc..6e74b117aaf0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -37,6 +37,7 @@
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/crc-itu-t.h>
40 41
41#include "udf_i.h" 42#include "udf_i.h"
42#include "udf_sb.h" 43#include "udf_sb.h"
@@ -66,22 +67,7 @@ static void udf_update_extents(struct inode *,
66 struct extent_position *); 67 struct extent_position *);
67static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int); 68static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
68 69
69/* 70
70 * udf_delete_inode
71 *
72 * PURPOSE
73 * Clean-up before the specified inode is destroyed.
74 *
75 * DESCRIPTION
76 * This routine is called when the kernel destroys an inode structure
77 * ie. when iput() finds i_count == 0.
78 *
79 * HISTORY
80 * July 1, 1997 - Andrew E. Mileski
81 * Written, tested, and released.
82 *
83 * Called at the last iput() if i_nlink is zero.
84 */
85void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
86{ 72{
87 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
@@ -323,9 +309,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
323 309
324 lock_kernel(); 310 lock_kernel();
325 311
326 if (block < 0)
327 goto abort_negative;
328
329 iinfo = UDF_I(inode); 312 iinfo = UDF_I(inode);
330 if (block == iinfo->i_next_alloc_block + 1) { 313 if (block == iinfo->i_next_alloc_block + 1) {
331 iinfo->i_next_alloc_block++; 314 iinfo->i_next_alloc_block++;
@@ -347,10 +330,6 @@ static int udf_get_block(struct inode *inode, sector_t block,
347abort: 330abort:
348 unlock_kernel(); 331 unlock_kernel();
349 return err; 332 return err;
350
351abort_negative:
352 udf_warning(inode->i_sb, "udf_get_block", "block < 0");
353 goto abort;
354} 333}
355 334
356static struct buffer_head *udf_getblk(struct inode *inode, long block, 335static struct buffer_head *udf_getblk(struct inode *inode, long block,
@@ -1116,42 +1095,36 @@ static void __udf_read_inode(struct inode *inode)
1116 fe = (struct fileEntry *)bh->b_data; 1095 fe = (struct fileEntry *)bh->b_data;
1117 1096
1118 if (fe->icbTag.strategyType == cpu_to_le16(4096)) { 1097 if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
1119 struct buffer_head *ibh = NULL, *nbh = NULL; 1098 struct buffer_head *ibh;
1120 struct indirectEntry *ie;
1121 1099
1122 ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1, 1100 ibh = udf_read_ptagged(inode->i_sb, iinfo->i_location, 1,
1123 &ident); 1101 &ident);
1124 if (ident == TAG_IDENT_IE) { 1102 if (ident == TAG_IDENT_IE && ibh) {
1125 if (ibh) { 1103 struct buffer_head *nbh = NULL;
1126 kernel_lb_addr loc; 1104 kernel_lb_addr loc;
1127 ie = (struct indirectEntry *)ibh->b_data; 1105 struct indirectEntry *ie;
1128 1106
1129 loc = lelb_to_cpu(ie->indirectICB.extLocation); 1107 ie = (struct indirectEntry *)ibh->b_data;
1130 1108 loc = lelb_to_cpu(ie->indirectICB.extLocation);
1131 if (ie->indirectICB.extLength && 1109
1132 (nbh = udf_read_ptagged(inode->i_sb, loc, 0, 1110 if (ie->indirectICB.extLength &&
1133 &ident))) { 1111 (nbh = udf_read_ptagged(inode->i_sb, loc, 0,
1134 if (ident == TAG_IDENT_FE || 1112 &ident))) {
1135 ident == TAG_IDENT_EFE) { 1113 if (ident == TAG_IDENT_FE ||
1136 memcpy(&iinfo->i_location, 1114 ident == TAG_IDENT_EFE) {
1137 &loc, 1115 memcpy(&iinfo->i_location,
1138 sizeof(kernel_lb_addr)); 1116 &loc,
1139 brelse(bh); 1117 sizeof(kernel_lb_addr));
1140 brelse(ibh); 1118 brelse(bh);
1141 brelse(nbh);
1142 __udf_read_inode(inode);
1143 return;
1144 } else {
1145 brelse(nbh);
1146 brelse(ibh);
1147 }
1148 } else {
1149 brelse(ibh); 1119 brelse(ibh);
1120 brelse(nbh);
1121 __udf_read_inode(inode);
1122 return;
1150 } 1123 }
1124 brelse(nbh);
1151 } 1125 }
1152 } else {
1153 brelse(ibh);
1154 } 1126 }
1127 brelse(ibh);
1155 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { 1128 } else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
1156 printk(KERN_ERR "udf: unsupported strategy type: %d\n", 1129 printk(KERN_ERR "udf: unsupported strategy type: %d\n",
1157 le16_to_cpu(fe->icbTag.strategyType)); 1130 le16_to_cpu(fe->icbTag.strategyType));
@@ -1168,8 +1141,6 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1168{ 1141{
1169 struct fileEntry *fe; 1142 struct fileEntry *fe;
1170 struct extendedFileEntry *efe; 1143 struct extendedFileEntry *efe;
1171 time_t convtime;
1172 long convtime_usec;
1173 int offset; 1144 int offset;
1174 struct udf_sb_info *sbi = UDF_SB(inode->i_sb); 1145 struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
1175 struct udf_inode_info *iinfo = UDF_I(inode); 1146 struct udf_inode_info *iinfo = UDF_I(inode);
@@ -1257,29 +1228,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1257 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) << 1228 inode->i_blocks = le64_to_cpu(fe->logicalBlocksRecorded) <<
1258 (inode->i_sb->s_blocksize_bits - 9); 1229 (inode->i_sb->s_blocksize_bits - 9);
1259 1230
1260 if (udf_stamp_to_time(&convtime, &convtime_usec, 1231 if (!udf_disk_stamp_to_time(&inode->i_atime, fe->accessTime))
1261 lets_to_cpu(fe->accessTime))) {
1262 inode->i_atime.tv_sec = convtime;
1263 inode->i_atime.tv_nsec = convtime_usec * 1000;
1264 } else {
1265 inode->i_atime = sbi->s_record_time; 1232 inode->i_atime = sbi->s_record_time;
1266 }
1267 1233
1268 if (udf_stamp_to_time(&convtime, &convtime_usec, 1234 if (!udf_disk_stamp_to_time(&inode->i_mtime,
1269 lets_to_cpu(fe->modificationTime))) { 1235 fe->modificationTime))
1270 inode->i_mtime.tv_sec = convtime;
1271 inode->i_mtime.tv_nsec = convtime_usec * 1000;
1272 } else {
1273 inode->i_mtime = sbi->s_record_time; 1236 inode->i_mtime = sbi->s_record_time;
1274 }
1275 1237
1276 if (udf_stamp_to_time(&convtime, &convtime_usec, 1238 if (!udf_disk_stamp_to_time(&inode->i_ctime, fe->attrTime))
1277 lets_to_cpu(fe->attrTime))) {
1278 inode->i_ctime.tv_sec = convtime;
1279 inode->i_ctime.tv_nsec = convtime_usec * 1000;
1280 } else {
1281 inode->i_ctime = sbi->s_record_time; 1239 inode->i_ctime = sbi->s_record_time;
1282 }
1283 1240
1284 iinfo->i_unique = le64_to_cpu(fe->uniqueID); 1241 iinfo->i_unique = le64_to_cpu(fe->uniqueID);
1285 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr); 1242 iinfo->i_lenEAttr = le32_to_cpu(fe->lengthExtendedAttr);
@@ -1289,37 +1246,18 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1289 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) << 1246 inode->i_blocks = le64_to_cpu(efe->logicalBlocksRecorded) <<
1290 (inode->i_sb->s_blocksize_bits - 9); 1247 (inode->i_sb->s_blocksize_bits - 9);
1291 1248
1292 if (udf_stamp_to_time(&convtime, &convtime_usec, 1249 if (!udf_disk_stamp_to_time(&inode->i_atime, efe->accessTime))
1293 lets_to_cpu(efe->accessTime))) {
1294 inode->i_atime.tv_sec = convtime;
1295 inode->i_atime.tv_nsec = convtime_usec * 1000;
1296 } else {
1297 inode->i_atime = sbi->s_record_time; 1250 inode->i_atime = sbi->s_record_time;
1298 }
1299 1251
1300 if (udf_stamp_to_time(&convtime, &convtime_usec, 1252 if (!udf_disk_stamp_to_time(&inode->i_mtime,
1301 lets_to_cpu(efe->modificationTime))) { 1253 efe->modificationTime))
1302 inode->i_mtime.tv_sec = convtime;
1303 inode->i_mtime.tv_nsec = convtime_usec * 1000;
1304 } else {
1305 inode->i_mtime = sbi->s_record_time; 1254 inode->i_mtime = sbi->s_record_time;
1306 }
1307 1255
1308 if (udf_stamp_to_time(&convtime, &convtime_usec, 1256 if (!udf_disk_stamp_to_time(&iinfo->i_crtime, efe->createTime))
1309 lets_to_cpu(efe->createTime))) {
1310 iinfo->i_crtime.tv_sec = convtime;
1311 iinfo->i_crtime.tv_nsec = convtime_usec * 1000;
1312 } else {
1313 iinfo->i_crtime = sbi->s_record_time; 1257 iinfo->i_crtime = sbi->s_record_time;
1314 }
1315 1258
1316 if (udf_stamp_to_time(&convtime, &convtime_usec, 1259 if (!udf_disk_stamp_to_time(&inode->i_ctime, efe->attrTime))
1317 lets_to_cpu(efe->attrTime))) {
1318 inode->i_ctime.tv_sec = convtime;
1319 inode->i_ctime.tv_nsec = convtime_usec * 1000;
1320 } else {
1321 inode->i_ctime = sbi->s_record_time; 1260 inode->i_ctime = sbi->s_record_time;
1322 }
1323 1261
1324 iinfo->i_unique = le64_to_cpu(efe->uniqueID); 1262 iinfo->i_unique = le64_to_cpu(efe->uniqueID);
1325 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr); 1263 iinfo->i_lenEAttr = le32_to_cpu(efe->lengthExtendedAttr);
@@ -1338,6 +1276,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1338 case ICBTAG_FILE_TYPE_REALTIME: 1276 case ICBTAG_FILE_TYPE_REALTIME:
1339 case ICBTAG_FILE_TYPE_REGULAR: 1277 case ICBTAG_FILE_TYPE_REGULAR:
1340 case ICBTAG_FILE_TYPE_UNDEF: 1278 case ICBTAG_FILE_TYPE_UNDEF:
1279 case ICBTAG_FILE_TYPE_VAT20:
1341 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 1280 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
1342 inode->i_data.a_ops = &udf_adinicb_aops; 1281 inode->i_data.a_ops = &udf_adinicb_aops;
1343 else 1282 else
@@ -1363,6 +1302,15 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
1363 inode->i_op = &page_symlink_inode_operations; 1302 inode->i_op = &page_symlink_inode_operations;
1364 inode->i_mode = S_IFLNK | S_IRWXUGO; 1303 inode->i_mode = S_IFLNK | S_IRWXUGO;
1365 break; 1304 break;
1305 case ICBTAG_FILE_TYPE_MAIN:
1306 udf_debug("METADATA FILE-----\n");
1307 break;
1308 case ICBTAG_FILE_TYPE_MIRROR:
1309 udf_debug("METADATA MIRROR FILE-----\n");
1310 break;
1311 case ICBTAG_FILE_TYPE_BITMAP:
1312 udf_debug("METADATA BITMAP FILE-----\n");
1313 break;
1366 default: 1314 default:
1367 printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown " 1315 printk(KERN_ERR "udf: udf_fill_inode(ino %ld) failed unknown "
1368 "file type=%d\n", inode->i_ino, 1316 "file type=%d\n", inode->i_ino,
@@ -1416,21 +1364,6 @@ static mode_t udf_convert_permissions(struct fileEntry *fe)
1416 return mode; 1364 return mode;
1417} 1365}
1418 1366
1419/*
1420 * udf_write_inode
1421 *
1422 * PURPOSE
1423 * Write out the specified inode.
1424 *
1425 * DESCRIPTION
1426 * This routine is called whenever an inode is synced.
1427 * Currently this routine is just a placeholder.
1428 *
1429 * HISTORY
1430 * July 1, 1997 - Andrew E. Mileski
1431 * Written, tested, and released.
1432 */
1433
1434int udf_write_inode(struct inode *inode, int sync) 1367int udf_write_inode(struct inode *inode, int sync)
1435{ 1368{
1436 int ret; 1369 int ret;
@@ -1455,7 +1388,6 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1455 uint32_t udfperms; 1388 uint32_t udfperms;
1456 uint16_t icbflags; 1389 uint16_t icbflags;
1457 uint16_t crclen; 1390 uint16_t crclen;
1458 kernel_timestamp cpu_time;
1459 int err = 0; 1391 int err = 0;
1460 struct udf_sb_info *sbi = UDF_SB(inode->i_sb); 1392 struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
1461 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; 1393 unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
@@ -1488,9 +1420,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1488 iinfo->i_location. 1420 iinfo->i_location.
1489 logicalBlockNum); 1421 logicalBlockNum);
1490 use->descTag.descCRCLength = cpu_to_le16(crclen); 1422 use->descTag.descCRCLength = cpu_to_le16(crclen);
1491 use->descTag.descCRC = cpu_to_le16(udf_crc((char *)use + 1423 use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use +
1492 sizeof(tag), crclen, 1424 sizeof(tag),
1493 0)); 1425 crclen));
1494 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); 1426 use->descTag.tagChecksum = udf_tag_checksum(&use->descTag);
1495 1427
1496 mark_buffer_dirty(bh); 1428 mark_buffer_dirty(bh);
@@ -1558,12 +1490,9 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1558 (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >> 1490 (inode->i_blocks + (1 << (blocksize_bits - 9)) - 1) >>
1559 (blocksize_bits - 9)); 1491 (blocksize_bits - 9));
1560 1492
1561 if (udf_time_to_stamp(&cpu_time, inode->i_atime)) 1493 udf_time_to_disk_stamp(&fe->accessTime, inode->i_atime);
1562 fe->accessTime = cpu_to_lets(cpu_time); 1494 udf_time_to_disk_stamp(&fe->modificationTime, inode->i_mtime);
1563 if (udf_time_to_stamp(&cpu_time, inode->i_mtime)) 1495 udf_time_to_disk_stamp(&fe->attrTime, inode->i_ctime);
1564 fe->modificationTime = cpu_to_lets(cpu_time);
1565 if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
1566 fe->attrTime = cpu_to_lets(cpu_time);
1567 memset(&(fe->impIdent), 0, sizeof(regid)); 1496 memset(&(fe->impIdent), 0, sizeof(regid));
1568 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER); 1497 strcpy(fe->impIdent.ident, UDF_ID_DEVELOPER);
1569 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1498 fe->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
@@ -1598,14 +1527,10 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1598 iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec)) 1527 iinfo->i_crtime.tv_nsec > inode->i_ctime.tv_nsec))
1599 iinfo->i_crtime = inode->i_ctime; 1528 iinfo->i_crtime = inode->i_ctime;
1600 1529
1601 if (udf_time_to_stamp(&cpu_time, inode->i_atime)) 1530 udf_time_to_disk_stamp(&efe->accessTime, inode->i_atime);
1602 efe->accessTime = cpu_to_lets(cpu_time); 1531 udf_time_to_disk_stamp(&efe->modificationTime, inode->i_mtime);
1603 if (udf_time_to_stamp(&cpu_time, inode->i_mtime)) 1532 udf_time_to_disk_stamp(&efe->createTime, iinfo->i_crtime);
1604 efe->modificationTime = cpu_to_lets(cpu_time); 1533 udf_time_to_disk_stamp(&efe->attrTime, inode->i_ctime);
1605 if (udf_time_to_stamp(&cpu_time, iinfo->i_crtime))
1606 efe->createTime = cpu_to_lets(cpu_time);
1607 if (udf_time_to_stamp(&cpu_time, inode->i_ctime))
1608 efe->attrTime = cpu_to_lets(cpu_time);
1609 1534
1610 memset(&(efe->impIdent), 0, sizeof(regid)); 1535 memset(&(efe->impIdent), 0, sizeof(regid));
1611 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER); 1536 strcpy(efe->impIdent.ident, UDF_ID_DEVELOPER);
@@ -1660,8 +1585,8 @@ static int udf_update_inode(struct inode *inode, int do_sync)
1660 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc - 1585 crclen += iinfo->i_lenEAttr + iinfo->i_lenAlloc -
1661 sizeof(tag); 1586 sizeof(tag);
1662 fe->descTag.descCRCLength = cpu_to_le16(crclen); 1587 fe->descTag.descCRCLength = cpu_to_le16(crclen);
1663 fe->descTag.descCRC = cpu_to_le16(udf_crc((char *)fe + sizeof(tag), 1588 fe->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)fe + sizeof(tag),
1664 crclen, 0)); 1589 crclen));
1665 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); 1590 fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag);
1666 1591
1667 /* write the data blocks */ 1592 /* write the data blocks */
@@ -1778,9 +1703,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1778 1703
1779 if (epos->bh) { 1704 if (epos->bh) {
1780 aed = (struct allocExtDesc *)epos->bh->b_data; 1705 aed = (struct allocExtDesc *)epos->bh->b_data;
1781 aed->lengthAllocDescs = 1706 le32_add_cpu(&aed->lengthAllocDescs, adsize);
1782 cpu_to_le32(le32_to_cpu(
1783 aed->lengthAllocDescs) + adsize);
1784 } else { 1707 } else {
1785 iinfo->i_lenAlloc += adsize; 1708 iinfo->i_lenAlloc += adsize;
1786 mark_inode_dirty(inode); 1709 mark_inode_dirty(inode);
@@ -1830,9 +1753,7 @@ int8_t udf_add_aext(struct inode *inode, struct extent_position *epos,
1830 mark_inode_dirty(inode); 1753 mark_inode_dirty(inode);
1831 } else { 1754 } else {
1832 aed = (struct allocExtDesc *)epos->bh->b_data; 1755 aed = (struct allocExtDesc *)epos->bh->b_data;
1833 aed->lengthAllocDescs = 1756 le32_add_cpu(&aed->lengthAllocDescs, adsize);
1834 cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) +
1835 adsize);
1836 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || 1757 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
1837 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) 1758 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
1838 udf_update_tag(epos->bh->b_data, 1759 udf_update_tag(epos->bh->b_data,
@@ -2046,9 +1967,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2046 mark_inode_dirty(inode); 1967 mark_inode_dirty(inode);
2047 } else { 1968 } else {
2048 aed = (struct allocExtDesc *)oepos.bh->b_data; 1969 aed = (struct allocExtDesc *)oepos.bh->b_data;
2049 aed->lengthAllocDescs = 1970 le32_add_cpu(&aed->lengthAllocDescs, -(2 * adsize));
2050 cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
2051 (2 * adsize));
2052 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || 1971 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
2053 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) 1972 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
2054 udf_update_tag(oepos.bh->b_data, 1973 udf_update_tag(oepos.bh->b_data,
@@ -2065,9 +1984,7 @@ int8_t udf_delete_aext(struct inode *inode, struct extent_position epos,
2065 mark_inode_dirty(inode); 1984 mark_inode_dirty(inode);
2066 } else { 1985 } else {
2067 aed = (struct allocExtDesc *)oepos.bh->b_data; 1986 aed = (struct allocExtDesc *)oepos.bh->b_data;
2068 aed->lengthAllocDescs = 1987 le32_add_cpu(&aed->lengthAllocDescs, -adsize);
2069 cpu_to_le32(le32_to_cpu(aed->lengthAllocDescs) -
2070 adsize);
2071 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || 1988 if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
2072 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) 1989 UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
2073 udf_update_tag(oepos.bh->b_data, 1990 udf_update_tag(oepos.bh->b_data,
@@ -2095,11 +2012,6 @@ int8_t inode_bmap(struct inode *inode, sector_t block,
2095 int8_t etype; 2012 int8_t etype;
2096 struct udf_inode_info *iinfo; 2013 struct udf_inode_info *iinfo;
2097 2014
2098 if (block < 0) {
2099 printk(KERN_ERR "udf: inode_bmap: block < 0\n");
2100 return -1;
2101 }
2102
2103 iinfo = UDF_I(inode); 2015 iinfo = UDF_I(inode);
2104 pos->offset = 0; 2016 pos->offset = 0;
2105 pos->block = iinfo->i_location; 2017 pos->block = iinfo->i_location;
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c
index 579bae71e67e..703843f30ffd 100644
--- a/fs/udf/lowlevel.c
+++ b/fs/udf/lowlevel.c
@@ -23,7 +23,6 @@
23#include <linux/cdrom.h> 23#include <linux/cdrom.h>
24#include <asm/uaccess.h> 24#include <asm/uaccess.h>
25 25
26#include <linux/udf_fs.h>
27#include "udf_sb.h" 26#include "udf_sb.h"
28 27
29unsigned int udf_get_last_session(struct super_block *sb) 28unsigned int udf_get_last_session(struct super_block *sb)
diff --git a/fs/udf/misc.c b/fs/udf/misc.c
index a1d6da0caf71..84bf0fd4a4f1 100644
--- a/fs/udf/misc.c
+++ b/fs/udf/misc.c
@@ -23,8 +23,8 @@
23 23
24#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/string.h> 25#include <linux/string.h>
26#include <linux/udf_fs.h>
27#include <linux/buffer_head.h> 26#include <linux/buffer_head.h>
27#include <linux/crc-itu-t.h>
28 28
29#include "udf_i.h" 29#include "udf_i.h"
30#include "udf_sb.h" 30#include "udf_sb.h"
@@ -136,8 +136,8 @@ struct genericFormat *udf_add_extendedattr(struct inode *inode, uint32_t size,
136 /* rewrite CRC + checksum of eahd */ 136 /* rewrite CRC + checksum of eahd */
137 crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag); 137 crclen = sizeof(struct extendedAttrHeaderDesc) - sizeof(tag);
138 eahd->descTag.descCRCLength = cpu_to_le16(crclen); 138 eahd->descTag.descCRCLength = cpu_to_le16(crclen);
139 eahd->descTag.descCRC = cpu_to_le16(udf_crc((char *)eahd + 139 eahd->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)eahd +
140 sizeof(tag), crclen, 0)); 140 sizeof(tag), crclen));
141 eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag); 141 eahd->descTag.tagChecksum = udf_tag_checksum(&eahd->descTag);
142 iinfo->i_lenEAttr += size; 142 iinfo->i_lenEAttr += size;
143 return (struct genericFormat *)&ea[offset]; 143 return (struct genericFormat *)&ea[offset];
@@ -204,16 +204,15 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
204{ 204{
205 tag *tag_p; 205 tag *tag_p;
206 struct buffer_head *bh = NULL; 206 struct buffer_head *bh = NULL;
207 struct udf_sb_info *sbi = UDF_SB(sb);
208 207
209 /* Read the block */ 208 /* Read the block */
210 if (block == 0xFFFFFFFF) 209 if (block == 0xFFFFFFFF)
211 return NULL; 210 return NULL;
212 211
213 bh = udf_tread(sb, block + sbi->s_session); 212 bh = udf_tread(sb, block);
214 if (!bh) { 213 if (!bh) {
215 udf_debug("block=%d, location=%d: read failed\n", 214 udf_debug("block=%d, location=%d: read failed\n",
216 block + sbi->s_session, location); 215 block, location);
217 return NULL; 216 return NULL;
218 } 217 }
219 218
@@ -223,8 +222,7 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
223 222
224 if (location != le32_to_cpu(tag_p->tagLocation)) { 223 if (location != le32_to_cpu(tag_p->tagLocation)) {
225 udf_debug("location mismatch block %u, tag %u != %u\n", 224 udf_debug("location mismatch block %u, tag %u != %u\n",
226 block + sbi->s_session, 225 block, le32_to_cpu(tag_p->tagLocation), location);
227 le32_to_cpu(tag_p->tagLocation), location);
228 goto error_out; 226 goto error_out;
229 } 227 }
230 228
@@ -244,13 +242,13 @@ struct buffer_head *udf_read_tagged(struct super_block *sb, uint32_t block,
244 242
245 /* Verify the descriptor CRC */ 243 /* Verify the descriptor CRC */
246 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize || 244 if (le16_to_cpu(tag_p->descCRCLength) + sizeof(tag) > sb->s_blocksize ||
247 le16_to_cpu(tag_p->descCRC) == udf_crc(bh->b_data + sizeof(tag), 245 le16_to_cpu(tag_p->descCRC) == crc_itu_t(0,
248 le16_to_cpu(tag_p->descCRCLength), 0)) 246 bh->b_data + sizeof(tag),
247 le16_to_cpu(tag_p->descCRCLength)))
249 return bh; 248 return bh;
250 249
251 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", 250 udf_debug("Crc failure block %d: crc = %d, crclen = %d\n", block,
252 block + sbi->s_session, le16_to_cpu(tag_p->descCRC), 251 le16_to_cpu(tag_p->descCRC), le16_to_cpu(tag_p->descCRCLength));
253 le16_to_cpu(tag_p->descCRCLength));
254 252
255error_out: 253error_out:
256 brelse(bh); 254 brelse(bh);
@@ -270,7 +268,7 @@ void udf_update_tag(char *data, int length)
270 length -= sizeof(tag); 268 length -= sizeof(tag);
271 269
272 tptr->descCRCLength = cpu_to_le16(length); 270 tptr->descCRCLength = cpu_to_le16(length);
273 tptr->descCRC = cpu_to_le16(udf_crc(data + sizeof(tag), length, 0)); 271 tptr->descCRC = cpu_to_le16(crc_itu_t(0, data + sizeof(tag), length));
274 tptr->tagChecksum = udf_tag_checksum(tptr); 272 tptr->tagChecksum = udf_tag_checksum(tptr);
275} 273}
276 274
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 112a5fb0b27b..ba5537d4bc15 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -31,6 +31,7 @@
31#include <linux/smp_lock.h> 31#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 32#include <linux/buffer_head.h>
33#include <linux/sched.h> 33#include <linux/sched.h>
34#include <linux/crc-itu-t.h>
34 35
35static inline int udf_match(int len1, const char *name1, int len2, 36static inline int udf_match(int len1, const char *name1, int len2,
36 const char *name2) 37 const char *name2)
@@ -97,25 +98,23 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi,
97 memset(fibh->ebh->b_data, 0x00, padlen + offset); 98 memset(fibh->ebh->b_data, 0x00, padlen + offset);
98 } 99 }
99 100
100 crc = udf_crc((uint8_t *)cfi + sizeof(tag), 101 crc = crc_itu_t(0, (uint8_t *)cfi + sizeof(tag),
101 sizeof(struct fileIdentDesc) - sizeof(tag), 0); 102 sizeof(struct fileIdentDesc) - sizeof(tag));
102 103
103 if (fibh->sbh == fibh->ebh) { 104 if (fibh->sbh == fibh->ebh) {
104 crc = udf_crc((uint8_t *)sfi->impUse, 105 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
105 crclen + sizeof(tag) - 106 crclen + sizeof(tag) -
106 sizeof(struct fileIdentDesc), crc); 107 sizeof(struct fileIdentDesc));
107 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) { 108 } else if (sizeof(struct fileIdentDesc) >= -fibh->soffset) {
108 crc = udf_crc(fibh->ebh->b_data + 109 crc = crc_itu_t(crc, fibh->ebh->b_data +
109 sizeof(struct fileIdentDesc) + 110 sizeof(struct fileIdentDesc) +
110 fibh->soffset, 111 fibh->soffset,
111 crclen + sizeof(tag) - 112 crclen + sizeof(tag) -
112 sizeof(struct fileIdentDesc), 113 sizeof(struct fileIdentDesc));
113 crc);
114 } else { 114 } else {
115 crc = udf_crc((uint8_t *)sfi->impUse, 115 crc = crc_itu_t(crc, (uint8_t *)sfi->impUse,
116 -fibh->soffset - sizeof(struct fileIdentDesc), 116 -fibh->soffset - sizeof(struct fileIdentDesc));
117 crc); 117 crc = crc_itu_t(crc, fibh->ebh->b_data, fibh->eoffset);
118 crc = udf_crc(fibh->ebh->b_data, fibh->eoffset, crc);
119 } 118 }
120 119
121 cfi->descTag.descCRC = cpu_to_le16(crc); 120 cfi->descTag.descCRC = cpu_to_le16(crc);
@@ -149,7 +148,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
149 struct fileIdentDesc *fi = NULL; 148 struct fileIdentDesc *fi = NULL;
150 loff_t f_pos; 149 loff_t f_pos;
151 int block, flen; 150 int block, flen;
152 char fname[UDF_NAME_LEN]; 151 char *fname = NULL;
153 char *nameptr; 152 char *nameptr;
154 uint8_t lfi; 153 uint8_t lfi;
155 uint16_t liu; 154 uint16_t liu;
@@ -163,12 +162,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
163 size = udf_ext0_offset(dir) + dir->i_size; 162 size = udf_ext0_offset(dir) + dir->i_size;
164 f_pos = udf_ext0_offset(dir); 163 f_pos = udf_ext0_offset(dir);
165 164
165 fibh->sbh = fibh->ebh = NULL;
166 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); 166 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
167 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 167 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
168 fibh->sbh = fibh->ebh = NULL; 168 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
169 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, 169 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
170 &epos, &eloc, &elen, &offset) == 170 goto out_err;
171 (EXT_RECORDED_ALLOCATED >> 30)) {
172 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 171 block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
173 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 172 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
174 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 173 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -179,25 +178,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
179 offset = 0; 178 offset = 0;
180 179
181 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); 180 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
182 if (!fibh->sbh) { 181 if (!fibh->sbh)
183 brelse(epos.bh); 182 goto out_err;
184 return NULL;
185 }
186 } else {
187 brelse(epos.bh);
188 return NULL;
189 } 183 }
190 184
185 fname = kmalloc(UDF_NAME_LEN, GFP_NOFS);
186 if (!fname)
187 goto out_err;
188
191 while (f_pos < size) { 189 while (f_pos < size) {
192 fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, 190 fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc,
193 &elen, &offset); 191 &elen, &offset);
194 if (!fi) { 192 if (!fi)
195 if (fibh->sbh != fibh->ebh) 193 goto out_err;
196 brelse(fibh->ebh);
197 brelse(fibh->sbh);
198 brelse(epos.bh);
199 return NULL;
200 }
201 194
202 liu = le16_to_cpu(cfi->lengthOfImpUse); 195 liu = le16_to_cpu(cfi->lengthOfImpUse);
203 lfi = cfi->lengthFileIdent; 196 lfi = cfi->lengthFileIdent;
@@ -237,53 +230,22 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
237 230
238 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 231 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
239 if (flen && udf_match(flen, fname, dentry->d_name.len, 232 if (flen && udf_match(flen, fname, dentry->d_name.len,
240 dentry->d_name.name)) { 233 dentry->d_name.name))
241 brelse(epos.bh); 234 goto out_ok;
242 return fi;
243 }
244 } 235 }
245 236
237out_err:
238 fi = NULL;
246 if (fibh->sbh != fibh->ebh) 239 if (fibh->sbh != fibh->ebh)
247 brelse(fibh->ebh); 240 brelse(fibh->ebh);
248 brelse(fibh->sbh); 241 brelse(fibh->sbh);
242out_ok:
249 brelse(epos.bh); 243 brelse(epos.bh);
244 kfree(fname);
250 245
251 return NULL; 246 return fi;
252} 247}
253 248
254/*
255 * udf_lookup
256 *
257 * PURPOSE
258 * Look-up the inode for a given name.
259 *
260 * DESCRIPTION
261 * Required - lookup_dentry() will return -ENOTDIR if this routine is not
262 * available for a directory. The filesystem is useless if this routine is
263 * not available for at least the filesystem's root directory.
264 *
265 * This routine is passed an incomplete dentry - it must be completed by
266 * calling d_add(dentry, inode). If the name does not exist, then the
267 * specified inode must be set to null. An error should only be returned
268 * when the lookup fails for a reason other than the name not existing.
269 * Note that the directory inode semaphore is held during the call.
270 *
271 * Refer to lookup_dentry() in fs/namei.c
272 * lookup_dentry() -> lookup() -> real_lookup() -> .
273 *
274 * PRE-CONDITIONS
275 * dir Pointer to inode of parent directory.
276 * dentry Pointer to dentry to complete.
277 * nd Pointer to lookup nameidata
278 *
279 * POST-CONDITIONS
280 * <return> Zero on success.
281 *
282 * HISTORY
283 * July 1, 1997 - Andrew E. Mileski
284 * Written, tested, and released.
285 */
286
287static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, 249static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
288 struct nameidata *nd) 250 struct nameidata *nd)
289{ 251{
@@ -336,11 +298,9 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
336{ 298{
337 struct super_block *sb = dir->i_sb; 299 struct super_block *sb = dir->i_sb;
338 struct fileIdentDesc *fi = NULL; 300 struct fileIdentDesc *fi = NULL;
339 char name[UDF_NAME_LEN], fname[UDF_NAME_LEN]; 301 char *name = NULL;
340 int namelen; 302 int namelen;
341 loff_t f_pos; 303 loff_t f_pos;
342 int flen;
343 char *nameptr;
344 loff_t size = udf_ext0_offset(dir) + dir->i_size; 304 loff_t size = udf_ext0_offset(dir) + dir->i_size;
345 int nfidlen; 305 int nfidlen;
346 uint8_t lfi; 306 uint8_t lfi;
@@ -352,16 +312,23 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
352 struct extent_position epos = {}; 312 struct extent_position epos = {};
353 struct udf_inode_info *dinfo; 313 struct udf_inode_info *dinfo;
354 314
315 fibh->sbh = fibh->ebh = NULL;
316 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
317 if (!name) {
318 *err = -ENOMEM;
319 goto out_err;
320 }
321
355 if (dentry) { 322 if (dentry) {
356 if (!dentry->d_name.len) { 323 if (!dentry->d_name.len) {
357 *err = -EINVAL; 324 *err = -EINVAL;
358 return NULL; 325 goto out_err;
359 } 326 }
360 namelen = udf_put_filename(sb, dentry->d_name.name, name, 327 namelen = udf_put_filename(sb, dentry->d_name.name, name,
361 dentry->d_name.len); 328 dentry->d_name.len);
362 if (!namelen) { 329 if (!namelen) {
363 *err = -ENAMETOOLONG; 330 *err = -ENAMETOOLONG;
364 return NULL; 331 goto out_err;
365 } 332 }
366 } else { 333 } else {
367 namelen = 0; 334 namelen = 0;
@@ -373,11 +340,14 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
373 340
374 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); 341 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1);
375 dinfo = UDF_I(dir); 342 dinfo = UDF_I(dir);
376 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 343 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
377 fibh->sbh = fibh->ebh = NULL; 344 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos,
378 else if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, 345 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) {
379 &epos, &eloc, &elen, &offset) == 346 block = udf_get_lb_pblock(dir->i_sb,
380 (EXT_RECORDED_ALLOCATED >> 30)) { 347 dinfo->i_location, 0);
348 fibh->soffset = fibh->eoffset = sb->s_blocksize;
349 goto add;
350 }
381 block = udf_get_lb_pblock(dir->i_sb, eloc, offset); 351 block = udf_get_lb_pblock(dir->i_sb, eloc, offset);
382 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 352 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) {
383 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 353 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
@@ -389,17 +359,11 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
389 359
390 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); 360 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block);
391 if (!fibh->sbh) { 361 if (!fibh->sbh) {
392 brelse(epos.bh);
393 *err = -EIO; 362 *err = -EIO;
394 return NULL; 363 goto out_err;
395 } 364 }
396 365
397 block = dinfo->i_location.logicalBlockNum; 366 block = dinfo->i_location.logicalBlockNum;
398 } else {
399 block = udf_get_lb_pblock(dir->i_sb, dinfo->i_location, 0);
400 fibh->sbh = fibh->ebh = NULL;
401 fibh->soffset = fibh->eoffset = sb->s_blocksize;
402 goto add;
403 } 367 }
404 368
405 while (f_pos < size) { 369 while (f_pos < size) {
@@ -407,41 +371,16 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
407 &elen, &offset); 371 &elen, &offset);
408 372
409 if (!fi) { 373 if (!fi) {
410 if (fibh->sbh != fibh->ebh)
411 brelse(fibh->ebh);
412 brelse(fibh->sbh);
413 brelse(epos.bh);
414 *err = -EIO; 374 *err = -EIO;
415 return NULL; 375 goto out_err;
416 } 376 }
417 377
418 liu = le16_to_cpu(cfi->lengthOfImpUse); 378 liu = le16_to_cpu(cfi->lengthOfImpUse);
419 lfi = cfi->lengthFileIdent; 379 lfi = cfi->lengthFileIdent;
420 380
421 if (fibh->sbh == fibh->ebh)
422 nameptr = fi->fileIdent + liu;
423 else {
424 int poffset; /* Unpaded ending offset */
425
426 poffset = fibh->soffset + sizeof(struct fileIdentDesc) +
427 liu + lfi;
428
429 if (poffset >= lfi)
430 nameptr = (char *)(fibh->ebh->b_data +
431 poffset - lfi);
432 else {
433 nameptr = fname;
434 memcpy(nameptr, fi->fileIdent + liu,
435 lfi - poffset);
436 memcpy(nameptr + lfi - poffset,
437 fibh->ebh->b_data, poffset);
438 }
439 }
440
441 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { 381 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
442 if (((sizeof(struct fileIdentDesc) + 382 if (((sizeof(struct fileIdentDesc) +
443 liu + lfi + 3) & ~3) == nfidlen) { 383 liu + lfi + 3) & ~3) == nfidlen) {
444 brelse(epos.bh);
445 cfi->descTag.tagSerialNum = cpu_to_le16(1); 384 cfi->descTag.tagSerialNum = cpu_to_le16(1);
446 cfi->fileVersionNum = cpu_to_le16(1); 385 cfi->fileVersionNum = cpu_to_le16(1);
447 cfi->fileCharacteristics = 0; 386 cfi->fileCharacteristics = 0;
@@ -449,27 +388,13 @@ static struct fileIdentDesc *udf_add_entry(struct inode *dir,
449 cfi->lengthOfImpUse = cpu_to_le16(0); 388 cfi->lengthOfImpUse = cpu_to_le16(0);
450 if (!udf_write_fi(dir, cfi, fi, fibh, NULL, 389 if (!udf_write_fi(dir, cfi, fi, fibh, NULL,
451 name)) 390 name))
452 return fi; 391 goto out_ok;
453 else { 392 else {
454 *err = -EIO; 393 *err = -EIO;
455 return NULL; 394 goto out_err;
456 } 395 }
457 } 396 }
458 } 397 }
459
460 if (!lfi || !dentry)
461 continue;
462
463 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi);
464 if (flen && udf_match(flen, fname, dentry->d_name.len,
465 dentry->d_name.name)) {
466 if (fibh->sbh != fibh->ebh)
467 brelse(fibh->ebh);
468 brelse(fibh->sbh);
469 brelse(epos.bh);
470 *err = -EEXIST;
471 return NULL;
472 }
473 } 398 }
474 399
475add: 400add:
@@ -496,7 +421,7 @@ add:
496 fibh->sbh = fibh->ebh = 421 fibh->sbh = fibh->ebh =
497 udf_expand_dir_adinicb(dir, &block, err); 422 udf_expand_dir_adinicb(dir, &block, err);
498 if (!fibh->sbh) 423 if (!fibh->sbh)
499 return NULL; 424 goto out_err;
500 epos.block = dinfo->i_location; 425 epos.block = dinfo->i_location;
501 epos.offset = udf_file_entry_alloc_offset(dir); 426 epos.offset = udf_file_entry_alloc_offset(dir);
502 /* Load extent udf_expand_dir_adinicb() has created */ 427 /* Load extent udf_expand_dir_adinicb() has created */
@@ -537,11 +462,8 @@ add:
537 dir->i_sb->s_blocksize_bits); 462 dir->i_sb->s_blocksize_bits);
538 fibh->ebh = udf_bread(dir, 463 fibh->ebh = udf_bread(dir,
539 f_pos >> dir->i_sb->s_blocksize_bits, 1, err); 464 f_pos >> dir->i_sb->s_blocksize_bits, 1, err);
540 if (!fibh->ebh) { 465 if (!fibh->ebh)
541 brelse(epos.bh); 466 goto out_err;
542 brelse(fibh->sbh);
543 return NULL;
544 }
545 467
546 if (!fibh->soffset) { 468 if (!fibh->soffset) {
547 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) == 469 if (udf_next_aext(dir, &epos, &eloc, &elen, 1) ==
@@ -572,20 +494,25 @@ add:
572 cfi->lengthFileIdent = namelen; 494 cfi->lengthFileIdent = namelen;
573 cfi->lengthOfImpUse = cpu_to_le16(0); 495 cfi->lengthOfImpUse = cpu_to_le16(0);
574 if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) { 496 if (!udf_write_fi(dir, cfi, fi, fibh, NULL, name)) {
575 brelse(epos.bh);
576 dir->i_size += nfidlen; 497 dir->i_size += nfidlen;
577 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) 498 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
578 dinfo->i_lenAlloc += nfidlen; 499 dinfo->i_lenAlloc += nfidlen;
579 mark_inode_dirty(dir); 500 mark_inode_dirty(dir);
580 return fi; 501 goto out_ok;
581 } else { 502 } else {
582 brelse(epos.bh);
583 if (fibh->sbh != fibh->ebh)
584 brelse(fibh->ebh);
585 brelse(fibh->sbh);
586 *err = -EIO; 503 *err = -EIO;
587 return NULL; 504 goto out_err;
588 } 505 }
506
507out_err:
508 fi = NULL;
509 if (fibh->sbh != fibh->ebh)
510 brelse(fibh->ebh);
511 brelse(fibh->sbh);
512out_ok:
513 brelse(epos.bh);
514 kfree(name);
515 return fi;
589} 516}
590 517
591static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, 518static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
@@ -940,7 +867,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
940 char *ea; 867 char *ea;
941 int err; 868 int err;
942 int block; 869 int block;
943 char name[UDF_NAME_LEN]; 870 char *name = NULL;
944 int namelen; 871 int namelen;
945 struct buffer_head *bh; 872 struct buffer_head *bh;
946 struct udf_inode_info *iinfo; 873 struct udf_inode_info *iinfo;
@@ -950,6 +877,12 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
950 if (!inode) 877 if (!inode)
951 goto out; 878 goto out;
952 879
880 name = kmalloc(UDF_NAME_LEN, GFP_NOFS);
881 if (!name) {
882 err = -ENOMEM;
883 goto out_no_entry;
884 }
885
953 iinfo = UDF_I(inode); 886 iinfo = UDF_I(inode);
954 inode->i_mode = S_IFLNK | S_IRWXUGO; 887 inode->i_mode = S_IFLNK | S_IRWXUGO;
955 inode->i_data.a_ops = &udf_symlink_aops; 888 inode->i_data.a_ops = &udf_symlink_aops;
@@ -1089,6 +1022,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
1089 err = 0; 1022 err = 0;
1090 1023
1091out: 1024out:
1025 kfree(name);
1092 unlock_kernel(); 1026 unlock_kernel();
1093 return err; 1027 return err;
1094 1028
diff --git a/fs/udf/partition.c b/fs/udf/partition.c
index fc533345ab89..63610f026ae1 100644
--- a/fs/udf/partition.c
+++ b/fs/udf/partition.c
@@ -24,7 +24,6 @@
24 24
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/string.h> 26#include <linux/string.h>
27#include <linux/udf_fs.h>
28#include <linux/slab.h> 27#include <linux/slab.h>
29#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
30 29
@@ -55,11 +54,10 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
55 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
56 struct udf_part_map *map; 55 struct udf_part_map *map;
57 struct udf_virtual_data *vdata; 56 struct udf_virtual_data *vdata;
58 struct udf_inode_info *iinfo; 57 struct udf_inode_info *iinfo = UDF_I(sbi->s_vat_inode);
59 58
60 map = &sbi->s_partmaps[partition]; 59 map = &sbi->s_partmaps[partition];
61 vdata = &map->s_type_specific.s_virtual; 60 vdata = &map->s_type_specific.s_virtual;
62 index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
63 61
64 if (block > vdata->s_num_entries) { 62 if (block > vdata->s_num_entries) {
65 udf_debug("Trying to access block beyond end of VAT " 63 udf_debug("Trying to access block beyond end of VAT "
@@ -67,6 +65,12 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
67 return 0xFFFFFFFF; 65 return 0xFFFFFFFF;
68 } 66 }
69 67
68 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
69 loc = le32_to_cpu(((__le32 *)(iinfo->i_ext.i_data +
70 vdata->s_start_offset))[block]);
71 goto translate;
72 }
73 index = (sb->s_blocksize - vdata->s_start_offset) / sizeof(uint32_t);
70 if (block >= index) { 74 if (block >= index) {
71 block -= index; 75 block -= index;
72 newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t))); 76 newblock = 1 + (block / (sb->s_blocksize / sizeof(uint32_t)));
@@ -89,7 +93,7 @@ uint32_t udf_get_pblock_virt15(struct super_block *sb, uint32_t block,
89 93
90 brelse(bh); 94 brelse(bh);
91 95
92 iinfo = UDF_I(sbi->s_vat_inode); 96translate:
93 if (iinfo->i_location.partitionReferenceNum == partition) { 97 if (iinfo->i_location.partitionReferenceNum == partition) {
94 udf_debug("recursive call to udf_get_pblock!\n"); 98 udf_debug("recursive call to udf_get_pblock!\n");
95 return 0xFFFFFFFF; 99 return 0xFFFFFFFF;
@@ -263,3 +267,58 @@ int udf_relocate_blocks(struct super_block *sb, long old_block, long *new_block)
263 267
264 return 0; 268 return 0;
265} 269}
270
271static uint32_t udf_try_read_meta(struct inode *inode, uint32_t block,
272 uint16_t partition, uint32_t offset)
273{
274 struct super_block *sb = inode->i_sb;
275 struct udf_part_map *map;
276 kernel_lb_addr eloc;
277 uint32_t elen;
278 sector_t ext_offset;
279 struct extent_position epos = {};
280 uint32_t phyblock;
281
282 if (inode_bmap(inode, block, &epos, &eloc, &elen, &ext_offset) !=
283 (EXT_RECORDED_ALLOCATED >> 30))
284 phyblock = 0xFFFFFFFF;
285 else {
286 map = &UDF_SB(sb)->s_partmaps[partition];
287 /* map to sparable/physical partition desc */
288 phyblock = udf_get_pblock(sb, eloc.logicalBlockNum,
289 map->s_partition_num, ext_offset + offset);
290 }
291
292 brelse(epos.bh);
293 return phyblock;
294}
295
296uint32_t udf_get_pblock_meta25(struct super_block *sb, uint32_t block,
297 uint16_t partition, uint32_t offset)
298{
299 struct udf_sb_info *sbi = UDF_SB(sb);
300 struct udf_part_map *map;
301 struct udf_meta_data *mdata;
302 uint32_t retblk;
303 struct inode *inode;
304
305 udf_debug("READING from METADATA\n");
306
307 map = &sbi->s_partmaps[partition];
308 mdata = &map->s_type_specific.s_metadata;
309 inode = mdata->s_metadata_fe ? : mdata->s_mirror_fe;
310
311 /* We shouldn't mount such media... */
312 BUG_ON(!inode);
313 retblk = udf_try_read_meta(inode, block, partition, offset);
314 if (retblk == 0xFFFFFFFF) {
315 udf_warning(sb, __func__, "error reading from METADATA, "
316 "trying to read from MIRROR");
317 inode = mdata->s_mirror_fe;
318 if (!inode)
319 return 0xFFFFFFFF;
320 retblk = udf_try_read_meta(inode, block, partition, offset);
321 }
322
323 return retblk;
324}
diff --git a/fs/udf/super.c b/fs/udf/super.c
index f3ac4abfc946..b564fc140fe4 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -55,9 +55,10 @@
55#include <linux/errno.h> 55#include <linux/errno.h>
56#include <linux/mount.h> 56#include <linux/mount.h>
57#include <linux/seq_file.h> 57#include <linux/seq_file.h>
58#include <linux/bitmap.h>
59#include <linux/crc-itu-t.h>
58#include <asm/byteorder.h> 60#include <asm/byteorder.h>
59 61
60#include <linux/udf_fs.h>
61#include "udf_sb.h" 62#include "udf_sb.h"
62#include "udf_i.h" 63#include "udf_i.h"
63 64
@@ -84,22 +85,19 @@ static void udf_write_super(struct super_block *);
84static int udf_remount_fs(struct super_block *, int *, char *); 85static int udf_remount_fs(struct super_block *, int *, char *);
85static int udf_check_valid(struct super_block *, int, int); 86static int udf_check_valid(struct super_block *, int, int);
86static int udf_vrs(struct super_block *sb, int silent); 87static int udf_vrs(struct super_block *sb, int silent);
87static int udf_load_partition(struct super_block *, kernel_lb_addr *);
88static int udf_load_logicalvol(struct super_block *, struct buffer_head *,
89 kernel_lb_addr *);
90static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad); 88static void udf_load_logicalvolint(struct super_block *, kernel_extent_ad);
91static void udf_find_anchor(struct super_block *); 89static void udf_find_anchor(struct super_block *);
92static int udf_find_fileset(struct super_block *, kernel_lb_addr *, 90static int udf_find_fileset(struct super_block *, kernel_lb_addr *,
93 kernel_lb_addr *); 91 kernel_lb_addr *);
94static void udf_load_pvoldesc(struct super_block *, struct buffer_head *);
95static void udf_load_fileset(struct super_block *, struct buffer_head *, 92static void udf_load_fileset(struct super_block *, struct buffer_head *,
96 kernel_lb_addr *); 93 kernel_lb_addr *);
97static int udf_load_partdesc(struct super_block *, struct buffer_head *);
98static void udf_open_lvid(struct super_block *); 94static void udf_open_lvid(struct super_block *);
99static void udf_close_lvid(struct super_block *); 95static void udf_close_lvid(struct super_block *);
100static unsigned int udf_count_free(struct super_block *); 96static unsigned int udf_count_free(struct super_block *);
101static int udf_statfs(struct dentry *, struct kstatfs *); 97static int udf_statfs(struct dentry *, struct kstatfs *);
102static int udf_show_options(struct seq_file *, struct vfsmount *); 98static int udf_show_options(struct seq_file *, struct vfsmount *);
99static void udf_error(struct super_block *sb, const char *function,
100 const char *fmt, ...);
103 101
104struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi) 102struct logicalVolIntegrityDescImpUse *udf_sb_lvidiu(struct udf_sb_info *sbi)
105{ 103{
@@ -587,48 +585,10 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
587 return 0; 585 return 0;
588} 586}
589 587
590/*
591 * udf_set_blocksize
592 *
593 * PURPOSE
594 * Set the block size to be used in all transfers.
595 *
596 * DESCRIPTION
597 * To allow room for a DMA transfer, it is best to guess big when unsure.
598 * This routine picks 2048 bytes as the blocksize when guessing. This
599 * should be adequate until devices with larger block sizes become common.
600 *
601 * Note that the Linux kernel can currently only deal with blocksizes of
602 * 512, 1024, 2048, 4096, and 8192 bytes.
603 *
604 * PRE-CONDITIONS
605 * sb Pointer to _locked_ superblock.
606 *
607 * POST-CONDITIONS
608 * sb->s_blocksize Blocksize.
609 * sb->s_blocksize_bits log2 of blocksize.
610 * <return> 0 Blocksize is valid.
611 * <return> 1 Blocksize is invalid.
612 *
613 * HISTORY
614 * July 1, 1997 - Andrew E. Mileski
615 * Written, tested, and released.
616 */
617static int udf_set_blocksize(struct super_block *sb, int bsize)
618{
619 if (!sb_min_blocksize(sb, bsize)) {
620 udf_debug("Bad block size (%d)\n", bsize);
621 printk(KERN_ERR "udf: bad block size (%d)\n", bsize);
622 return 0;
623 }
624
625 return sb->s_blocksize;
626}
627
628static int udf_vrs(struct super_block *sb, int silent) 588static int udf_vrs(struct super_block *sb, int silent)
629{ 589{
630 struct volStructDesc *vsd = NULL; 590 struct volStructDesc *vsd = NULL;
631 int sector = 32768; 591 loff_t sector = 32768;
632 int sectorsize; 592 int sectorsize;
633 struct buffer_head *bh = NULL; 593 struct buffer_head *bh = NULL;
634 int iso9660 = 0; 594 int iso9660 = 0;
@@ -649,7 +609,8 @@ static int udf_vrs(struct super_block *sb, int silent)
649 sector += (sbi->s_session << sb->s_blocksize_bits); 609 sector += (sbi->s_session << sb->s_blocksize_bits);
650 610
651 udf_debug("Starting at sector %u (%ld byte sectors)\n", 611 udf_debug("Starting at sector %u (%ld byte sectors)\n",
652 (sector >> sb->s_blocksize_bits), sb->s_blocksize); 612 (unsigned int)(sector >> sb->s_blocksize_bits),
613 sb->s_blocksize);
653 /* Process the sequence (if applicable) */ 614 /* Process the sequence (if applicable) */
654 for (; !nsr02 && !nsr03; sector += sectorsize) { 615 for (; !nsr02 && !nsr03; sector += sectorsize) {
655 /* Read a block */ 616 /* Read a block */
@@ -719,162 +680,140 @@ static int udf_vrs(struct super_block *sb, int silent)
719} 680}
720 681
721/* 682/*
722 * udf_find_anchor 683 * Check whether there is an anchor block in the given block
723 *
724 * PURPOSE
725 * Find an anchor volume descriptor.
726 *
727 * PRE-CONDITIONS
728 * sb Pointer to _locked_ superblock.
729 * lastblock Last block on media.
730 *
731 * POST-CONDITIONS
732 * <return> 1 if not found, 0 if ok
733 *
734 * HISTORY
735 * July 1, 1997 - Andrew E. Mileski
736 * Written, tested, and released.
737 */ 684 */
738static void udf_find_anchor(struct super_block *sb) 685static int udf_check_anchor_block(struct super_block *sb, sector_t block,
686 bool varconv)
739{ 687{
740 int lastblock;
741 struct buffer_head *bh = NULL; 688 struct buffer_head *bh = NULL;
689 tag *t;
742 uint16_t ident; 690 uint16_t ident;
743 uint32_t location; 691 uint32_t location;
744 int i;
745 struct udf_sb_info *sbi;
746 692
747 sbi = UDF_SB(sb); 693 if (varconv) {
748 lastblock = sbi->s_last_block; 694 if (udf_fixed_to_variable(block) >=
695 sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits)
696 return 0;
697 bh = sb_bread(sb, udf_fixed_to_variable(block));
698 }
699 else
700 bh = sb_bread(sb, block);
749 701
750 if (lastblock) { 702 if (!bh)
751 int varlastblock = udf_variable_to_fixed(lastblock); 703 return 0;
752 int last[] = { lastblock, lastblock - 2,
753 lastblock - 150, lastblock - 152,
754 varlastblock, varlastblock - 2,
755 varlastblock - 150, varlastblock - 152 };
756
757 lastblock = 0;
758
759 /* Search for an anchor volume descriptor pointer */
760
761 /* according to spec, anchor is in either:
762 * block 256
763 * lastblock-256
764 * lastblock
765 * however, if the disc isn't closed, it could be 512 */
766
767 for (i = 0; !lastblock && i < ARRAY_SIZE(last); i++) {
768 ident = location = 0;
769 if (last[i] >= 0) {
770 bh = sb_bread(sb, last[i]);
771 if (bh) {
772 tag *t = (tag *)bh->b_data;
773 ident = le16_to_cpu(t->tagIdent);
774 location = le32_to_cpu(t->tagLocation);
775 brelse(bh);
776 }
777 }
778 704
779 if (ident == TAG_IDENT_AVDP) { 705 t = (tag *)bh->b_data;
780 if (location == last[i] - sbi->s_session) { 706 ident = le16_to_cpu(t->tagIdent);
781 lastblock = last[i] - sbi->s_session; 707 location = le32_to_cpu(t->tagLocation);
782 sbi->s_anchor[0] = lastblock; 708 brelse(bh);
783 sbi->s_anchor[1] = lastblock - 256; 709 if (ident != TAG_IDENT_AVDP)
784 } else if (location == 710 return 0;
785 udf_variable_to_fixed(last[i]) - 711 return location == block;
786 sbi->s_session) { 712}
787 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
788 lastblock =
789 udf_variable_to_fixed(last[i]) -
790 sbi->s_session;
791 sbi->s_anchor[0] = lastblock;
792 sbi->s_anchor[1] = lastblock - 256 -
793 sbi->s_session;
794 } else {
795 udf_debug("Anchor found at block %d, "
796 "location mismatch %d.\n",
797 last[i], location);
798 }
799 } else if (ident == TAG_IDENT_FE ||
800 ident == TAG_IDENT_EFE) {
801 lastblock = last[i];
802 sbi->s_anchor[3] = 512;
803 } else {
804 ident = location = 0;
805 if (last[i] >= 256) {
806 bh = sb_bread(sb, last[i] - 256);
807 if (bh) {
808 tag *t = (tag *)bh->b_data;
809 ident = le16_to_cpu(
810 t->tagIdent);
811 location = le32_to_cpu(
812 t->tagLocation);
813 brelse(bh);
814 }
815 }
816 713
817 if (ident == TAG_IDENT_AVDP && 714/* Search for an anchor volume descriptor pointer */
818 location == last[i] - 256 - 715static sector_t udf_scan_anchors(struct super_block *sb, bool varconv,
819 sbi->s_session) { 716 sector_t lastblock)
820 lastblock = last[i]; 717{
821 sbi->s_anchor[1] = last[i] - 256; 718 sector_t last[6];
822 } else { 719 int i;
823 ident = location = 0; 720 struct udf_sb_info *sbi = UDF_SB(sb);
824 if (last[i] >= 312 + sbi->s_session) {
825 bh = sb_bread(sb,
826 last[i] - 312 -
827 sbi->s_session);
828 if (bh) {
829 tag *t = (tag *)
830 bh->b_data;
831 ident = le16_to_cpu(
832 t->tagIdent);
833 location = le32_to_cpu(
834 t->tagLocation);
835 brelse(bh);
836 }
837 }
838 721
839 if (ident == TAG_IDENT_AVDP && 722 last[0] = lastblock;
840 location == udf_variable_to_fixed(last[i]) - 256) { 723 last[1] = last[0] - 1;
841 UDF_SET_FLAG(sb, 724 last[2] = last[0] + 1;
842 UDF_FLAG_VARCONV); 725 last[3] = last[0] - 2;
843 lastblock = udf_variable_to_fixed(last[i]); 726 last[4] = last[0] - 150;
844 sbi->s_anchor[1] = lastblock - 256; 727 last[5] = last[0] - 152;
845 } 728
846 } 729 /* according to spec, anchor is in either:
847 } 730 * block 256
731 * lastblock-256
732 * lastblock
733 * however, if the disc isn't closed, it could be 512 */
734
735 for (i = 0; i < ARRAY_SIZE(last); i++) {
736 if (last[i] < 0)
737 continue;
738 if (last[i] >= sb->s_bdev->bd_inode->i_size >>
739 sb->s_blocksize_bits)
740 continue;
741
742 if (udf_check_anchor_block(sb, last[i], varconv)) {
743 sbi->s_anchor[0] = last[i];
744 sbi->s_anchor[1] = last[i] - 256;
745 return last[i];
848 } 746 }
849 }
850 747
851 if (!lastblock) { 748 if (last[i] < 256)
852 /* We haven't found the lastblock. check 312 */ 749 continue;
853 bh = sb_bread(sb, 312 + sbi->s_session);
854 if (bh) {
855 tag *t = (tag *)bh->b_data;
856 ident = le16_to_cpu(t->tagIdent);
857 location = le32_to_cpu(t->tagLocation);
858 brelse(bh);
859 750
860 if (ident == TAG_IDENT_AVDP && location == 256) 751 if (udf_check_anchor_block(sb, last[i] - 256, varconv)) {
861 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV); 752 sbi->s_anchor[1] = last[i] - 256;
753 return last[i];
862 } 754 }
863 } 755 }
864 756
757 if (udf_check_anchor_block(sb, sbi->s_session + 256, varconv)) {
758 sbi->s_anchor[0] = sbi->s_session + 256;
759 return last[0];
760 }
761 if (udf_check_anchor_block(sb, sbi->s_session + 512, varconv)) {
762 sbi->s_anchor[0] = sbi->s_session + 512;
763 return last[0];
764 }
765 return 0;
766}
767
768/*
769 * Find an anchor volume descriptor. The function expects sbi->s_lastblock to
770 * be the last block on the media.
771 *
772 * Return 1 if not found, 0 if ok
773 *
774 */
775static void udf_find_anchor(struct super_block *sb)
776{
777 sector_t lastblock;
778 struct buffer_head *bh = NULL;
779 uint16_t ident;
780 int i;
781 struct udf_sb_info *sbi = UDF_SB(sb);
782
783 lastblock = udf_scan_anchors(sb, 0, sbi->s_last_block);
784 if (lastblock)
785 goto check_anchor;
786
787 /* No anchor found? Try VARCONV conversion of block numbers */
788 /* Firstly, we try to not convert number of the last block */
789 lastblock = udf_scan_anchors(sb, 1,
790 udf_variable_to_fixed(sbi->s_last_block));
791 if (lastblock) {
792 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
793 goto check_anchor;
794 }
795
796 /* Secondly, we try with converted number of the last block */
797 lastblock = udf_scan_anchors(sb, 1, sbi->s_last_block);
798 if (lastblock)
799 UDF_SET_FLAG(sb, UDF_FLAG_VARCONV);
800
801check_anchor:
802 /*
803 * Check located anchors and the anchor block supplied via
804 * mount options
805 */
865 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 806 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
866 if (sbi->s_anchor[i]) { 807 if (!sbi->s_anchor[i])
867 bh = udf_read_tagged(sb, sbi->s_anchor[i], 808 continue;
868 sbi->s_anchor[i], &ident); 809 bh = udf_read_tagged(sb, sbi->s_anchor[i],
869 if (!bh) 810 sbi->s_anchor[i], &ident);
811 if (!bh)
812 sbi->s_anchor[i] = 0;
813 else {
814 brelse(bh);
815 if (ident != TAG_IDENT_AVDP)
870 sbi->s_anchor[i] = 0; 816 sbi->s_anchor[i] = 0;
871 else {
872 brelse(bh);
873 if ((ident != TAG_IDENT_AVDP) &&
874 (i || (ident != TAG_IDENT_FE &&
875 ident != TAG_IDENT_EFE)))
876 sbi->s_anchor[i] = 0;
877 }
878 } 817 }
879 } 818 }
880 819
@@ -971,27 +910,30 @@ static int udf_find_fileset(struct super_block *sb,
971 return 1; 910 return 1;
972} 911}
973 912
974static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh) 913static int udf_load_pvoldesc(struct super_block *sb, sector_t block)
975{ 914{
976 struct primaryVolDesc *pvoldesc; 915 struct primaryVolDesc *pvoldesc;
977 time_t recording;
978 long recording_usec;
979 struct ustr instr; 916 struct ustr instr;
980 struct ustr outstr; 917 struct ustr outstr;
918 struct buffer_head *bh;
919 uint16_t ident;
920
921 bh = udf_read_tagged(sb, block, block, &ident);
922 if (!bh)
923 return 1;
924 BUG_ON(ident != TAG_IDENT_PVD);
981 925
982 pvoldesc = (struct primaryVolDesc *)bh->b_data; 926 pvoldesc = (struct primaryVolDesc *)bh->b_data;
983 927
984 if (udf_stamp_to_time(&recording, &recording_usec, 928 if (udf_disk_stamp_to_time(&UDF_SB(sb)->s_record_time,
985 lets_to_cpu(pvoldesc->recordingDateAndTime))) { 929 pvoldesc->recordingDateAndTime)) {
986 kernel_timestamp ts; 930#ifdef UDFFS_DEBUG
987 ts = lets_to_cpu(pvoldesc->recordingDateAndTime); 931 timestamp *ts = &pvoldesc->recordingDateAndTime;
988 udf_debug("recording time %ld/%ld, %04u/%02u/%02u" 932 udf_debug("recording time %04u/%02u/%02u"
989 " %02u:%02u (%x)\n", 933 " %02u:%02u (%x)\n",
990 recording, recording_usec, 934 le16_to_cpu(ts->year), ts->month, ts->day, ts->hour,
991 ts.year, ts.month, ts.day, ts.hour, 935 ts->minute, le16_to_cpu(ts->typeAndTimezone));
992 ts.minute, ts.typeAndTimezone); 936#endif
993 UDF_SB(sb)->s_record_time.tv_sec = recording;
994 UDF_SB(sb)->s_record_time.tv_nsec = recording_usec * 1000;
995 } 937 }
996 938
997 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32)) 939 if (!udf_build_ustr(&instr, pvoldesc->volIdent, 32))
@@ -1005,6 +947,104 @@ static void udf_load_pvoldesc(struct super_block *sb, struct buffer_head *bh)
1005 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128)) 947 if (!udf_build_ustr(&instr, pvoldesc->volSetIdent, 128))
1006 if (udf_CS0toUTF8(&outstr, &instr)) 948 if (udf_CS0toUTF8(&outstr, &instr))
1007 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name); 949 udf_debug("volSetIdent[] = '%s'\n", outstr.u_name);
950
951 brelse(bh);
952 return 0;
953}
954
955static int udf_load_metadata_files(struct super_block *sb, int partition)
956{
957 struct udf_sb_info *sbi = UDF_SB(sb);
958 struct udf_part_map *map;
959 struct udf_meta_data *mdata;
960 kernel_lb_addr addr;
961 int fe_error = 0;
962
963 map = &sbi->s_partmaps[partition];
964 mdata = &map->s_type_specific.s_metadata;
965
966 /* metadata address */
967 addr.logicalBlockNum = mdata->s_meta_file_loc;
968 addr.partitionReferenceNum = map->s_partition_num;
969
970 udf_debug("Metadata file location: block = %d part = %d\n",
971 addr.logicalBlockNum, addr.partitionReferenceNum);
972
973 mdata->s_metadata_fe = udf_iget(sb, addr);
974
975 if (mdata->s_metadata_fe == NULL) {
976 udf_warning(sb, __func__, "metadata inode efe not found, "
977 "will try mirror inode.");
978 fe_error = 1;
979 } else if (UDF_I(mdata->s_metadata_fe)->i_alloc_type !=
980 ICBTAG_FLAG_AD_SHORT) {
981 udf_warning(sb, __func__, "metadata inode efe does not have "
982 "short allocation descriptors!");
983 fe_error = 1;
984 iput(mdata->s_metadata_fe);
985 mdata->s_metadata_fe = NULL;
986 }
987
988 /* mirror file entry */
989 addr.logicalBlockNum = mdata->s_mirror_file_loc;
990 addr.partitionReferenceNum = map->s_partition_num;
991
992 udf_debug("Mirror metadata file location: block = %d part = %d\n",
993 addr.logicalBlockNum, addr.partitionReferenceNum);
994
995 mdata->s_mirror_fe = udf_iget(sb, addr);
996
997 if (mdata->s_mirror_fe == NULL) {
998 if (fe_error) {
999 udf_error(sb, __func__, "mirror inode efe not found "
1000 "and metadata inode is missing too, exiting...");
1001 goto error_exit;
1002 } else
1003 udf_warning(sb, __func__, "mirror inode efe not found,"
1004 " but metadata inode is OK");
1005 } else if (UDF_I(mdata->s_mirror_fe)->i_alloc_type !=
1006 ICBTAG_FLAG_AD_SHORT) {
1007 udf_warning(sb, __func__, "mirror inode efe does not have "
1008 "short allocation descriptors!");
1009 iput(mdata->s_mirror_fe);
1010 mdata->s_mirror_fe = NULL;
1011 if (fe_error)
1012 goto error_exit;
1013 }
1014
1015 /*
1016 * bitmap file entry
1017 * Note:
1018 * Load only if bitmap file location differs from 0xFFFFFFFF (DCN-5102)
1019 */
1020 if (mdata->s_bitmap_file_loc != 0xFFFFFFFF) {
1021 addr.logicalBlockNum = mdata->s_bitmap_file_loc;
1022 addr.partitionReferenceNum = map->s_partition_num;
1023
1024 udf_debug("Bitmap file location: block = %d part = %d\n",
1025 addr.logicalBlockNum, addr.partitionReferenceNum);
1026
1027 mdata->s_bitmap_fe = udf_iget(sb, addr);
1028
1029 if (mdata->s_bitmap_fe == NULL) {
1030 if (sb->s_flags & MS_RDONLY)
1031 udf_warning(sb, __func__, "bitmap inode efe "
1032 "not found but it's ok since the disc"
1033 " is mounted read-only");
1034 else {
1035 udf_error(sb, __func__, "bitmap inode efe not "
1036 "found and attempted read-write mount");
1037 goto error_exit;
1038 }
1039 }
1040 }
1041
1042 udf_debug("udf_load_metadata_files Ok\n");
1043
1044 return 0;
1045
1046error_exit:
1047 return 1;
1008} 1048}
1009 1049
1010static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh, 1050static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
@@ -1025,10 +1065,9 @@ static void udf_load_fileset(struct super_block *sb, struct buffer_head *bh,
1025int udf_compute_nr_groups(struct super_block *sb, u32 partition) 1065int udf_compute_nr_groups(struct super_block *sb, u32 partition)
1026{ 1066{
1027 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition]; 1067 struct udf_part_map *map = &UDF_SB(sb)->s_partmaps[partition];
1028 return (map->s_partition_len + 1068 return DIV_ROUND_UP(map->s_partition_len +
1029 (sizeof(struct spaceBitmapDesc) << 3) + 1069 (sizeof(struct spaceBitmapDesc) << 3),
1030 (sb->s_blocksize * 8) - 1) / 1070 sb->s_blocksize * 8);
1031 (sb->s_blocksize * 8);
1032} 1071}
1033 1072
1034static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) 1073static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
@@ -1059,134 +1098,241 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index)
1059 return bitmap; 1098 return bitmap;
1060} 1099}
1061 1100
1062static int udf_load_partdesc(struct super_block *sb, struct buffer_head *bh) 1101static int udf_fill_partdesc_info(struct super_block *sb,
1102 struct partitionDesc *p, int p_index)
1103{
1104 struct udf_part_map *map;
1105 struct udf_sb_info *sbi = UDF_SB(sb);
1106 struct partitionHeaderDesc *phd;
1107
1108 map = &sbi->s_partmaps[p_index];
1109
1110 map->s_partition_len = le32_to_cpu(p->partitionLength); /* blocks */
1111 map->s_partition_root = le32_to_cpu(p->partitionStartingLocation);
1112
1113 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
1114 map->s_partition_flags |= UDF_PART_FLAG_READ_ONLY;
1115 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
1116 map->s_partition_flags |= UDF_PART_FLAG_WRITE_ONCE;
1117 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
1118 map->s_partition_flags |= UDF_PART_FLAG_REWRITABLE;
1119 if (p->accessType == cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
1120 map->s_partition_flags |= UDF_PART_FLAG_OVERWRITABLE;
1121
1122 udf_debug("Partition (%d type %x) starts at physical %d, "
1123 "block length %d\n", p_index,
1124 map->s_partition_type, map->s_partition_root,
1125 map->s_partition_len);
1126
1127 if (strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR02) &&
1128 strcmp(p->partitionContents.ident, PD_PARTITION_CONTENTS_NSR03))
1129 return 0;
1130
1131 phd = (struct partitionHeaderDesc *)p->partitionContentsUse;
1132 if (phd->unallocSpaceTable.extLength) {
1133 kernel_lb_addr loc = {
1134 .logicalBlockNum = le32_to_cpu(
1135 phd->unallocSpaceTable.extPosition),
1136 .partitionReferenceNum = p_index,
1137 };
1138
1139 map->s_uspace.s_table = udf_iget(sb, loc);
1140 if (!map->s_uspace.s_table) {
1141 udf_debug("cannot load unallocSpaceTable (part %d)\n",
1142 p_index);
1143 return 1;
1144 }
1145 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
1146 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
1147 p_index, map->s_uspace.s_table->i_ino);
1148 }
1149
1150 if (phd->unallocSpaceBitmap.extLength) {
1151 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
1152 if (!bitmap)
1153 return 1;
1154 map->s_uspace.s_bitmap = bitmap;
1155 bitmap->s_extLength = le32_to_cpu(
1156 phd->unallocSpaceBitmap.extLength);
1157 bitmap->s_extPosition = le32_to_cpu(
1158 phd->unallocSpaceBitmap.extPosition);
1159 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
1160 udf_debug("unallocSpaceBitmap (part %d) @ %d\n", p_index,
1161 bitmap->s_extPosition);
1162 }
1163
1164 if (phd->partitionIntegrityTable.extLength)
1165 udf_debug("partitionIntegrityTable (part %d)\n", p_index);
1166
1167 if (phd->freedSpaceTable.extLength) {
1168 kernel_lb_addr loc = {
1169 .logicalBlockNum = le32_to_cpu(
1170 phd->freedSpaceTable.extPosition),
1171 .partitionReferenceNum = p_index,
1172 };
1173
1174 map->s_fspace.s_table = udf_iget(sb, loc);
1175 if (!map->s_fspace.s_table) {
1176 udf_debug("cannot load freedSpaceTable (part %d)\n",
1177 p_index);
1178 return 1;
1179 }
1180
1181 map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
1182 udf_debug("freedSpaceTable (part %d) @ %ld\n",
1183 p_index, map->s_fspace.s_table->i_ino);
1184 }
1185
1186 if (phd->freedSpaceBitmap.extLength) {
1187 struct udf_bitmap *bitmap = udf_sb_alloc_bitmap(sb, p_index);
1188 if (!bitmap)
1189 return 1;
1190 map->s_fspace.s_bitmap = bitmap;
1191 bitmap->s_extLength = le32_to_cpu(
1192 phd->freedSpaceBitmap.extLength);
1193 bitmap->s_extPosition = le32_to_cpu(
1194 phd->freedSpaceBitmap.extPosition);
1195 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
1196 udf_debug("freedSpaceBitmap (part %d) @ %d\n", p_index,
1197 bitmap->s_extPosition);
1198 }
1199 return 0;
1200}
1201
1202static int udf_load_vat(struct super_block *sb, int p_index, int type1_index)
1203{
1204 struct udf_sb_info *sbi = UDF_SB(sb);
1205 struct udf_part_map *map = &sbi->s_partmaps[p_index];
1206 kernel_lb_addr ino;
1207 struct buffer_head *bh = NULL;
1208 struct udf_inode_info *vati;
1209 uint32_t pos;
1210 struct virtualAllocationTable20 *vat20;
1211
1212 /* VAT file entry is in the last recorded block */
1213 ino.partitionReferenceNum = type1_index;
1214 ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root;
1215 sbi->s_vat_inode = udf_iget(sb, ino);
1216 if (!sbi->s_vat_inode)
1217 return 1;
1218
1219 if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
1220 map->s_type_specific.s_virtual.s_start_offset = 0;
1221 map->s_type_specific.s_virtual.s_num_entries =
1222 (sbi->s_vat_inode->i_size - 36) >> 2;
1223 } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
1224 vati = UDF_I(sbi->s_vat_inode);
1225 if (vati->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
1226 pos = udf_block_map(sbi->s_vat_inode, 0);
1227 bh = sb_bread(sb, pos);
1228 if (!bh)
1229 return 1;
1230 vat20 = (struct virtualAllocationTable20 *)bh->b_data;
1231 } else {
1232 vat20 = (struct virtualAllocationTable20 *)
1233 vati->i_ext.i_data;
1234 }
1235
1236 map->s_type_specific.s_virtual.s_start_offset =
1237 le16_to_cpu(vat20->lengthHeader);
1238 map->s_type_specific.s_virtual.s_num_entries =
1239 (sbi->s_vat_inode->i_size -
1240 map->s_type_specific.s_virtual.
1241 s_start_offset) >> 2;
1242 brelse(bh);
1243 }
1244 return 0;
1245}
1246
1247static int udf_load_partdesc(struct super_block *sb, sector_t block)
1063{ 1248{
1249 struct buffer_head *bh;
1064 struct partitionDesc *p; 1250 struct partitionDesc *p;
1065 int i;
1066 struct udf_part_map *map; 1251 struct udf_part_map *map;
1067 struct udf_sb_info *sbi; 1252 struct udf_sb_info *sbi = UDF_SB(sb);
1253 int i, type1_idx;
1254 uint16_t partitionNumber;
1255 uint16_t ident;
1256 int ret = 0;
1257
1258 bh = udf_read_tagged(sb, block, block, &ident);
1259 if (!bh)
1260 return 1;
1261 if (ident != TAG_IDENT_PD)
1262 goto out_bh;
1068 1263
1069 p = (struct partitionDesc *)bh->b_data; 1264 p = (struct partitionDesc *)bh->b_data;
1070 sbi = UDF_SB(sb); 1265 partitionNumber = le16_to_cpu(p->partitionNumber);
1071 1266
1267 /* First scan for TYPE1, SPARABLE and METADATA partitions */
1072 for (i = 0; i < sbi->s_partitions; i++) { 1268 for (i = 0; i < sbi->s_partitions; i++) {
1073 map = &sbi->s_partmaps[i]; 1269 map = &sbi->s_partmaps[i];
1074 udf_debug("Searching map: (%d == %d)\n", 1270 udf_debug("Searching map: (%d == %d)\n",
1075 map->s_partition_num, 1271 map->s_partition_num, partitionNumber);
1076 le16_to_cpu(p->partitionNumber)); 1272 if (map->s_partition_num == partitionNumber &&
1077 if (map->s_partition_num == 1273 (map->s_partition_type == UDF_TYPE1_MAP15 ||
1078 le16_to_cpu(p->partitionNumber)) { 1274 map->s_partition_type == UDF_SPARABLE_MAP15))
1079 map->s_partition_len =
1080 le32_to_cpu(p->partitionLength); /* blocks */
1081 map->s_partition_root =
1082 le32_to_cpu(p->partitionStartingLocation);
1083 if (p->accessType ==
1084 cpu_to_le32(PD_ACCESS_TYPE_READ_ONLY))
1085 map->s_partition_flags |=
1086 UDF_PART_FLAG_READ_ONLY;
1087 if (p->accessType ==
1088 cpu_to_le32(PD_ACCESS_TYPE_WRITE_ONCE))
1089 map->s_partition_flags |=
1090 UDF_PART_FLAG_WRITE_ONCE;
1091 if (p->accessType ==
1092 cpu_to_le32(PD_ACCESS_TYPE_REWRITABLE))
1093 map->s_partition_flags |=
1094 UDF_PART_FLAG_REWRITABLE;
1095 if (p->accessType ==
1096 cpu_to_le32(PD_ACCESS_TYPE_OVERWRITABLE))
1097 map->s_partition_flags |=
1098 UDF_PART_FLAG_OVERWRITABLE;
1099
1100 if (!strcmp(p->partitionContents.ident,
1101 PD_PARTITION_CONTENTS_NSR02) ||
1102 !strcmp(p->partitionContents.ident,
1103 PD_PARTITION_CONTENTS_NSR03)) {
1104 struct partitionHeaderDesc *phd;
1105
1106 phd = (struct partitionHeaderDesc *)
1107 (p->partitionContentsUse);
1108 if (phd->unallocSpaceTable.extLength) {
1109 kernel_lb_addr loc = {
1110 .logicalBlockNum = le32_to_cpu(phd->unallocSpaceTable.extPosition),
1111 .partitionReferenceNum = i,
1112 };
1113
1114 map->s_uspace.s_table =
1115 udf_iget(sb, loc);
1116 if (!map->s_uspace.s_table) {
1117 udf_debug("cannot load unallocSpaceTable (part %d)\n", i);
1118 return 1;
1119 }
1120 map->s_partition_flags |=
1121 UDF_PART_FLAG_UNALLOC_TABLE;
1122 udf_debug("unallocSpaceTable (part %d) @ %ld\n",
1123 i, map->s_uspace.s_table->i_ino);
1124 }
1125 if (phd->unallocSpaceBitmap.extLength) {
1126 struct udf_bitmap *bitmap =
1127 udf_sb_alloc_bitmap(sb, i);
1128 map->s_uspace.s_bitmap = bitmap;
1129 if (bitmap != NULL) {
1130 bitmap->s_extLength =
1131 le32_to_cpu(phd->unallocSpaceBitmap.extLength);
1132 bitmap->s_extPosition =
1133 le32_to_cpu(phd->unallocSpaceBitmap.extPosition);
1134 map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_BITMAP;
1135 udf_debug("unallocSpaceBitmap (part %d) @ %d\n",
1136 i, bitmap->s_extPosition);
1137 }
1138 }
1139 if (phd->partitionIntegrityTable.extLength)
1140 udf_debug("partitionIntegrityTable (part %d)\n", i);
1141 if (phd->freedSpaceTable.extLength) {
1142 kernel_lb_addr loc = {
1143 .logicalBlockNum = le32_to_cpu(phd->freedSpaceTable.extPosition),
1144 .partitionReferenceNum = i,
1145 };
1146
1147 map->s_fspace.s_table =
1148 udf_iget(sb, loc);
1149 if (!map->s_fspace.s_table) {
1150 udf_debug("cannot load freedSpaceTable (part %d)\n", i);
1151 return 1;
1152 }
1153 map->s_partition_flags |=
1154 UDF_PART_FLAG_FREED_TABLE;
1155 udf_debug("freedSpaceTable (part %d) @ %ld\n",
1156 i, map->s_fspace.s_table->i_ino);
1157 }
1158 if (phd->freedSpaceBitmap.extLength) {
1159 struct udf_bitmap *bitmap =
1160 udf_sb_alloc_bitmap(sb, i);
1161 map->s_fspace.s_bitmap = bitmap;
1162 if (bitmap != NULL) {
1163 bitmap->s_extLength =
1164 le32_to_cpu(phd->freedSpaceBitmap.extLength);
1165 bitmap->s_extPosition =
1166 le32_to_cpu(phd->freedSpaceBitmap.extPosition);
1167 map->s_partition_flags |= UDF_PART_FLAG_FREED_BITMAP;
1168 udf_debug("freedSpaceBitmap (part %d) @ %d\n",
1169 i, bitmap->s_extPosition);
1170 }
1171 }
1172 }
1173 break; 1275 break;
1174 }
1175 } 1276 }
1176 if (i == sbi->s_partitions) 1277
1278 if (i >= sbi->s_partitions) {
1177 udf_debug("Partition (%d) not found in partition map\n", 1279 udf_debug("Partition (%d) not found in partition map\n",
1178 le16_to_cpu(p->partitionNumber)); 1280 partitionNumber);
1179 else 1281 goto out_bh;
1180 udf_debug("Partition (%d:%d type %x) starts at physical %d, " 1282 }
1181 "block length %d\n", 1283
1182 le16_to_cpu(p->partitionNumber), i, 1284 ret = udf_fill_partdesc_info(sb, p, i);
1183 map->s_partition_type, 1285
1184 map->s_partition_root, 1286 /*
1185 map->s_partition_len); 1287 * Now rescan for VIRTUAL or METADATA partitions when SPARABLE and
1186 return 0; 1288 * PHYSICAL partitions are already set up
1289 */
1290 type1_idx = i;
1291 for (i = 0; i < sbi->s_partitions; i++) {
1292 map = &sbi->s_partmaps[i];
1293
1294 if (map->s_partition_num == partitionNumber &&
1295 (map->s_partition_type == UDF_VIRTUAL_MAP15 ||
1296 map->s_partition_type == UDF_VIRTUAL_MAP20 ||
1297 map->s_partition_type == UDF_METADATA_MAP25))
1298 break;
1299 }
1300
1301 if (i >= sbi->s_partitions)
1302 goto out_bh;
1303
1304 ret = udf_fill_partdesc_info(sb, p, i);
1305 if (ret)
1306 goto out_bh;
1307
1308 if (map->s_partition_type == UDF_METADATA_MAP25) {
1309 ret = udf_load_metadata_files(sb, i);
1310 if (ret) {
1311 printk(KERN_ERR "UDF-fs: error loading MetaData "
1312 "partition map %d\n", i);
1313 goto out_bh;
1314 }
1315 } else {
1316 ret = udf_load_vat(sb, i, type1_idx);
1317 if (ret)
1318 goto out_bh;
1319 /*
1320 * Mark filesystem read-only if we have a partition with
1321 * virtual map since we don't handle writing to it (we
1322 * overwrite blocks instead of relocating them).
1323 */
1324 sb->s_flags |= MS_RDONLY;
1325 printk(KERN_NOTICE "UDF-fs: Filesystem marked read-only "
1326 "because writing to pseudooverwrite partition is "
1327 "not implemented.\n");
1328 }
1329out_bh:
1330 /* In case loading failed, we handle cleanup in udf_fill_super */
1331 brelse(bh);
1332 return ret;
1187} 1333}
1188 1334
1189static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh, 1335static int udf_load_logicalvol(struct super_block *sb, sector_t block,
1190 kernel_lb_addr *fileset) 1336 kernel_lb_addr *fileset)
1191{ 1337{
1192 struct logicalVolDesc *lvd; 1338 struct logicalVolDesc *lvd;
@@ -1194,12 +1340,21 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
1194 uint8_t type; 1340 uint8_t type;
1195 struct udf_sb_info *sbi = UDF_SB(sb); 1341 struct udf_sb_info *sbi = UDF_SB(sb);
1196 struct genericPartitionMap *gpm; 1342 struct genericPartitionMap *gpm;
1343 uint16_t ident;
1344 struct buffer_head *bh;
1345 int ret = 0;
1197 1346
1347 bh = udf_read_tagged(sb, block, block, &ident);
1348 if (!bh)
1349 return 1;
1350 BUG_ON(ident != TAG_IDENT_LVD);
1198 lvd = (struct logicalVolDesc *)bh->b_data; 1351 lvd = (struct logicalVolDesc *)bh->b_data;
1199 1352
1200 i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps)); 1353 i = udf_sb_alloc_partition_maps(sb, le32_to_cpu(lvd->numPartitionMaps));
1201 if (i != 0) 1354 if (i != 0) {
1202 return i; 1355 ret = i;
1356 goto out_bh;
1357 }
1203 1358
1204 for (i = 0, offset = 0; 1359 for (i = 0, offset = 0;
1205 i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength); 1360 i < sbi->s_partitions && offset < le32_to_cpu(lvd->mapTableLength);
@@ -1223,12 +1378,12 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
1223 u16 suf = 1378 u16 suf =
1224 le16_to_cpu(((__le16 *)upm2->partIdent. 1379 le16_to_cpu(((__le16 *)upm2->partIdent.
1225 identSuffix)[0]); 1380 identSuffix)[0]);
1226 if (suf == 0x0150) { 1381 if (suf < 0x0200) {
1227 map->s_partition_type = 1382 map->s_partition_type =
1228 UDF_VIRTUAL_MAP15; 1383 UDF_VIRTUAL_MAP15;
1229 map->s_partition_func = 1384 map->s_partition_func =
1230 udf_get_pblock_virt15; 1385 udf_get_pblock_virt15;
1231 } else if (suf == 0x0200) { 1386 } else {
1232 map->s_partition_type = 1387 map->s_partition_type =
1233 UDF_VIRTUAL_MAP20; 1388 UDF_VIRTUAL_MAP20;
1234 map->s_partition_func = 1389 map->s_partition_func =
@@ -1238,7 +1393,6 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
1238 UDF_ID_SPARABLE, 1393 UDF_ID_SPARABLE,
1239 strlen(UDF_ID_SPARABLE))) { 1394 strlen(UDF_ID_SPARABLE))) {
1240 uint32_t loc; 1395 uint32_t loc;
1241 uint16_t ident;
1242 struct sparingTable *st; 1396 struct sparingTable *st;
1243 struct sparablePartitionMap *spm = 1397 struct sparablePartitionMap *spm =
1244 (struct sparablePartitionMap *)gpm; 1398 (struct sparablePartitionMap *)gpm;
@@ -1256,22 +1410,64 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
1256 map->s_type_specific.s_sparing. 1410 map->s_type_specific.s_sparing.
1257 s_spar_map[j] = bh2; 1411 s_spar_map[j] = bh2;
1258 1412
1259 if (bh2 != NULL) { 1413 if (bh2 == NULL)
1260 st = (struct sparingTable *) 1414 continue;
1261 bh2->b_data; 1415
1262 if (ident != 0 || strncmp( 1416 st = (struct sparingTable *)bh2->b_data;
1263 st->sparingIdent.ident, 1417 if (ident != 0 || strncmp(
1264 UDF_ID_SPARING, 1418 st->sparingIdent.ident,
1265 strlen(UDF_ID_SPARING))) { 1419 UDF_ID_SPARING,
1266 brelse(bh2); 1420 strlen(UDF_ID_SPARING))) {
1267 map->s_type_specific. 1421 brelse(bh2);
1268 s_sparing. 1422 map->s_type_specific.s_sparing.
1269 s_spar_map[j] = 1423 s_spar_map[j] = NULL;
1270 NULL;
1271 }
1272 } 1424 }
1273 } 1425 }
1274 map->s_partition_func = udf_get_pblock_spar15; 1426 map->s_partition_func = udf_get_pblock_spar15;
1427 } else if (!strncmp(upm2->partIdent.ident,
1428 UDF_ID_METADATA,
1429 strlen(UDF_ID_METADATA))) {
1430 struct udf_meta_data *mdata =
1431 &map->s_type_specific.s_metadata;
1432 struct metadataPartitionMap *mdm =
1433 (struct metadataPartitionMap *)
1434 &(lvd->partitionMaps[offset]);
1435 udf_debug("Parsing Logical vol part %d "
1436 "type %d id=%s\n", i, type,
1437 UDF_ID_METADATA);
1438
1439 map->s_partition_type = UDF_METADATA_MAP25;
1440 map->s_partition_func = udf_get_pblock_meta25;
1441
1442 mdata->s_meta_file_loc =
1443 le32_to_cpu(mdm->metadataFileLoc);
1444 mdata->s_mirror_file_loc =
1445 le32_to_cpu(mdm->metadataMirrorFileLoc);
1446 mdata->s_bitmap_file_loc =
1447 le32_to_cpu(mdm->metadataBitmapFileLoc);
1448 mdata->s_alloc_unit_size =
1449 le32_to_cpu(mdm->allocUnitSize);
1450 mdata->s_align_unit_size =
1451 le16_to_cpu(mdm->alignUnitSize);
1452 mdata->s_dup_md_flag =
1453 mdm->flags & 0x01;
1454
1455 udf_debug("Metadata Ident suffix=0x%x\n",
1456 (le16_to_cpu(
1457 ((__le16 *)
1458 mdm->partIdent.identSuffix)[0])));
1459 udf_debug("Metadata part num=%d\n",
1460 le16_to_cpu(mdm->partitionNum));
1461 udf_debug("Metadata part alloc unit size=%d\n",
1462 le32_to_cpu(mdm->allocUnitSize));
1463 udf_debug("Metadata file loc=%d\n",
1464 le32_to_cpu(mdm->metadataFileLoc));
1465 udf_debug("Mirror file loc=%d\n",
1466 le32_to_cpu(mdm->metadataMirrorFileLoc));
1467 udf_debug("Bitmap file loc=%d\n",
1468 le32_to_cpu(mdm->metadataBitmapFileLoc));
1469 udf_debug("Duplicate Flag: %d %d\n",
1470 mdata->s_dup_md_flag, mdm->flags);
1275 } else { 1471 } else {
1276 udf_debug("Unknown ident: %s\n", 1472 udf_debug("Unknown ident: %s\n",
1277 upm2->partIdent.ident); 1473 upm2->partIdent.ident);
@@ -1296,7 +1492,9 @@ static int udf_load_logicalvol(struct super_block *sb, struct buffer_head *bh,
1296 if (lvd->integritySeqExt.extLength) 1492 if (lvd->integritySeqExt.extLength)
1297 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt)); 1493 udf_load_logicalvolint(sb, leea_to_cpu(lvd->integritySeqExt));
1298 1494
1299 return 0; 1495out_bh:
1496 brelse(bh);
1497 return ret;
1300} 1498}
1301 1499
1302/* 1500/*
@@ -1345,7 +1543,7 @@ static void udf_load_logicalvolint(struct super_block *sb, kernel_extent_ad loc)
1345 * July 1, 1997 - Andrew E. Mileski 1543 * July 1, 1997 - Andrew E. Mileski
1346 * Written, tested, and released. 1544 * Written, tested, and released.
1347 */ 1545 */
1348static int udf_process_sequence(struct super_block *sb, long block, 1546static noinline int udf_process_sequence(struct super_block *sb, long block,
1349 long lastblock, kernel_lb_addr *fileset) 1547 long lastblock, kernel_lb_addr *fileset)
1350{ 1548{
1351 struct buffer_head *bh = NULL; 1549 struct buffer_head *bh = NULL;
@@ -1354,19 +1552,25 @@ static int udf_process_sequence(struct super_block *sb, long block,
1354 struct generic_desc *gd; 1552 struct generic_desc *gd;
1355 struct volDescPtr *vdp; 1553 struct volDescPtr *vdp;
1356 int done = 0; 1554 int done = 0;
1357 int i, j;
1358 uint32_t vdsn; 1555 uint32_t vdsn;
1359 uint16_t ident; 1556 uint16_t ident;
1360 long next_s = 0, next_e = 0; 1557 long next_s = 0, next_e = 0;
1361 1558
1362 memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); 1559 memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
1363 1560
1364 /* Read the main descriptor sequence */ 1561 /*
1562 * Read the main descriptor sequence and find which descriptors
1563 * are in it.
1564 */
1365 for (; (!done && block <= lastblock); block++) { 1565 for (; (!done && block <= lastblock); block++) {
1366 1566
1367 bh = udf_read_tagged(sb, block, block, &ident); 1567 bh = udf_read_tagged(sb, block, block, &ident);
1368 if (!bh) 1568 if (!bh) {
1369 break; 1569 printk(KERN_ERR "udf: Block %Lu of volume descriptor "
1570 "sequence is corrupted or we could not read "
1571 "it.\n", (unsigned long long)block);
1572 return 1;
1573 }
1370 1574
1371 /* Process each descriptor (ISO 13346 3/8.3-8.4) */ 1575 /* Process each descriptor (ISO 13346 3/8.3-8.4) */
1372 gd = (struct generic_desc *)bh->b_data; 1576 gd = (struct generic_desc *)bh->b_data;
@@ -1432,41 +1636,31 @@ static int udf_process_sequence(struct super_block *sb, long block,
1432 } 1636 }
1433 brelse(bh); 1637 brelse(bh);
1434 } 1638 }
1435 for (i = 0; i < VDS_POS_LENGTH; i++) { 1639 /*
1436 if (vds[i].block) { 1640 * Now read interesting descriptors again and process them
1437 bh = udf_read_tagged(sb, vds[i].block, vds[i].block, 1641 * in a suitable order
1438 &ident); 1642 */
1439 1643 if (!vds[VDS_POS_PRIMARY_VOL_DESC].block) {
1440 if (i == VDS_POS_PRIMARY_VOL_DESC) { 1644 printk(KERN_ERR "udf: Primary Volume Descriptor not found!\n");
1441 udf_load_pvoldesc(sb, bh); 1645 return 1;
1442 } else if (i == VDS_POS_LOGICAL_VOL_DESC) { 1646 }
1443 if (udf_load_logicalvol(sb, bh, fileset)) { 1647 if (udf_load_pvoldesc(sb, vds[VDS_POS_PRIMARY_VOL_DESC].block))
1444 brelse(bh); 1648 return 1;
1445 return 1; 1649
1446 } 1650 if (vds[VDS_POS_LOGICAL_VOL_DESC].block && udf_load_logicalvol(sb,
1447 } else if (i == VDS_POS_PARTITION_DESC) { 1651 vds[VDS_POS_LOGICAL_VOL_DESC].block, fileset))
1448 struct buffer_head *bh2 = NULL; 1652 return 1;
1449 if (udf_load_partdesc(sb, bh)) { 1653
1450 brelse(bh); 1654 if (vds[VDS_POS_PARTITION_DESC].block) {
1451 return 1; 1655 /*
1452 } 1656 * We rescan the whole descriptor sequence to find
1453 for (j = vds[i].block + 1; 1657 * partition descriptor blocks and process them.
1454 j < vds[VDS_POS_TERMINATING_DESC].block; 1658 */
1455 j++) { 1659 for (block = vds[VDS_POS_PARTITION_DESC].block;
1456 bh2 = udf_read_tagged(sb, j, j, &ident); 1660 block < vds[VDS_POS_TERMINATING_DESC].block;
1457 gd = (struct generic_desc *)bh2->b_data; 1661 block++)
1458 if (ident == TAG_IDENT_PD) 1662 if (udf_load_partdesc(sb, block))
1459 if (udf_load_partdesc(sb, 1663 return 1;
1460 bh2)) {
1461 brelse(bh);
1462 brelse(bh2);
1463 return 1;
1464 }
1465 brelse(bh2);
1466 }
1467 }
1468 brelse(bh);
1469 }
1470 } 1664 }
1471 1665
1472 return 0; 1666 return 0;
@@ -1478,6 +1672,7 @@ static int udf_process_sequence(struct super_block *sb, long block,
1478static int udf_check_valid(struct super_block *sb, int novrs, int silent) 1672static int udf_check_valid(struct super_block *sb, int novrs, int silent)
1479{ 1673{
1480 long block; 1674 long block;
1675 struct udf_sb_info *sbi = UDF_SB(sb);
1481 1676
1482 if (novrs) { 1677 if (novrs) {
1483 udf_debug("Validity check skipped because of novrs option\n"); 1678 udf_debug("Validity check skipped because of novrs option\n");
@@ -1485,27 +1680,22 @@ static int udf_check_valid(struct super_block *sb, int novrs, int silent)
1485 } 1680 }
1486 /* Check that it is NSR02 compliant */ 1681 /* Check that it is NSR02 compliant */
1487 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */ 1682 /* Process any "CD-ROM Volume Descriptor Set" (ECMA 167 2/8.3.1) */
1488 else { 1683 block = udf_vrs(sb, silent);
1489 block = udf_vrs(sb, silent); 1684 if (block == -1)
1490 if (block == -1) { 1685 udf_debug("Failed to read byte 32768. Assuming open "
1491 struct udf_sb_info *sbi = UDF_SB(sb); 1686 "disc. Skipping validity check\n");
1492 udf_debug("Failed to read byte 32768. Assuming open " 1687 if (block && !sbi->s_last_block)
1493 "disc. Skipping validity check\n"); 1688 sbi->s_last_block = udf_get_last_block(sb);
1494 if (!sbi->s_last_block) 1689 return !block;
1495 sbi->s_last_block = udf_get_last_block(sb);
1496 return 0;
1497 } else
1498 return !block;
1499 }
1500} 1690}
1501 1691
1502static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset) 1692static int udf_load_sequence(struct super_block *sb, kernel_lb_addr *fileset)
1503{ 1693{
1504 struct anchorVolDescPtr *anchor; 1694 struct anchorVolDescPtr *anchor;
1505 uint16_t ident; 1695 uint16_t ident;
1506 struct buffer_head *bh; 1696 struct buffer_head *bh;
1507 long main_s, main_e, reserve_s, reserve_e; 1697 long main_s, main_e, reserve_s, reserve_e;
1508 int i, j; 1698 int i;
1509 struct udf_sb_info *sbi; 1699 struct udf_sb_info *sbi;
1510 1700
1511 if (!sb) 1701 if (!sb)
@@ -1515,6 +1705,7 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
1515 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) { 1705 for (i = 0; i < ARRAY_SIZE(sbi->s_anchor); i++) {
1516 if (!sbi->s_anchor[i]) 1706 if (!sbi->s_anchor[i])
1517 continue; 1707 continue;
1708
1518 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i], 1709 bh = udf_read_tagged(sb, sbi->s_anchor[i], sbi->s_anchor[i],
1519 &ident); 1710 &ident);
1520 if (!bh) 1711 if (!bh)
@@ -1553,76 +1744,6 @@ static int udf_load_partition(struct super_block *sb, kernel_lb_addr *fileset)
1553 } 1744 }
1554 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]); 1745 udf_debug("Using anchor in block %d\n", sbi->s_anchor[i]);
1555 1746
1556 for (i = 0; i < sbi->s_partitions; i++) {
1557 kernel_lb_addr uninitialized_var(ino);
1558 struct udf_part_map *map = &sbi->s_partmaps[i];
1559 switch (map->s_partition_type) {
1560 case UDF_VIRTUAL_MAP15:
1561 case UDF_VIRTUAL_MAP20:
1562 if (!sbi->s_last_block) {
1563 sbi->s_last_block = udf_get_last_block(sb);
1564 udf_find_anchor(sb);
1565 }
1566
1567 if (!sbi->s_last_block) {
1568 udf_debug("Unable to determine Lastblock (For "
1569 "Virtual Partition)\n");
1570 return 1;
1571 }
1572
1573 for (j = 0; j < sbi->s_partitions; j++) {
1574 struct udf_part_map *map2 = &sbi->s_partmaps[j];
1575 if (j != i &&
1576 map->s_volumeseqnum ==
1577 map2->s_volumeseqnum &&
1578 map->s_partition_num ==
1579 map2->s_partition_num) {
1580 ino.partitionReferenceNum = j;
1581 ino.logicalBlockNum =
1582 sbi->s_last_block -
1583 map2->s_partition_root;
1584 break;
1585 }
1586 }
1587
1588 if (j == sbi->s_partitions)
1589 return 1;
1590
1591 sbi->s_vat_inode = udf_iget(sb, ino);
1592 if (!sbi->s_vat_inode)
1593 return 1;
1594
1595 if (map->s_partition_type == UDF_VIRTUAL_MAP15) {
1596 map->s_type_specific.s_virtual.s_start_offset =
1597 udf_ext0_offset(sbi->s_vat_inode);
1598 map->s_type_specific.s_virtual.s_num_entries =
1599 (sbi->s_vat_inode->i_size - 36) >> 2;
1600 } else if (map->s_partition_type == UDF_VIRTUAL_MAP20) {
1601 uint32_t pos;
1602 struct virtualAllocationTable20 *vat20;
1603
1604 pos = udf_block_map(sbi->s_vat_inode, 0);
1605 bh = sb_bread(sb, pos);
1606 if (!bh)
1607 return 1;
1608 vat20 = (struct virtualAllocationTable20 *)
1609 bh->b_data +
1610 udf_ext0_offset(sbi->s_vat_inode);
1611 map->s_type_specific.s_virtual.s_start_offset =
1612 le16_to_cpu(vat20->lengthHeader) +
1613 udf_ext0_offset(sbi->s_vat_inode);
1614 map->s_type_specific.s_virtual.s_num_entries =
1615 (sbi->s_vat_inode->i_size -
1616 map->s_type_specific.s_virtual.
1617 s_start_offset) >> 2;
1618 brelse(bh);
1619 }
1620 map->s_partition_root = udf_get_pblock(sb, 0, i, 0);
1621 map->s_partition_len =
1622 sbi->s_partmaps[ino.partitionReferenceNum].
1623 s_partition_len;
1624 }
1625 }
1626 return 0; 1747 return 0;
1627} 1748}
1628 1749
@@ -1630,65 +1751,61 @@ static void udf_open_lvid(struct super_block *sb)
1630{ 1751{
1631 struct udf_sb_info *sbi = UDF_SB(sb); 1752 struct udf_sb_info *sbi = UDF_SB(sb);
1632 struct buffer_head *bh = sbi->s_lvid_bh; 1753 struct buffer_head *bh = sbi->s_lvid_bh;
1633 if (bh) { 1754 struct logicalVolIntegrityDesc *lvid;
1634 kernel_timestamp cpu_time; 1755 struct logicalVolIntegrityDescImpUse *lvidiu;
1635 struct logicalVolIntegrityDesc *lvid = 1756 if (!bh)
1636 (struct logicalVolIntegrityDesc *)bh->b_data; 1757 return;
1637 struct logicalVolIntegrityDescImpUse *lvidiu =
1638 udf_sb_lvidiu(sbi);
1639 1758
1640 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1759 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1641 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1760 lvidiu = udf_sb_lvidiu(sbi);
1642 if (udf_time_to_stamp(&cpu_time, CURRENT_TIME))
1643 lvid->recordingDateAndTime = cpu_to_lets(cpu_time);
1644 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
1645 1761
1646 lvid->descTag.descCRC = cpu_to_le16( 1762 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1647 udf_crc((char *)lvid + sizeof(tag), 1763 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1648 le16_to_cpu(lvid->descTag.descCRCLength), 1764 udf_time_to_disk_stamp(&lvid->recordingDateAndTime,
1649 0)); 1765 CURRENT_TIME);
1766 lvid->integrityType = LVID_INTEGRITY_TYPE_OPEN;
1650 1767
1651 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag); 1768 lvid->descTag.descCRC = cpu_to_le16(
1652 mark_buffer_dirty(bh); 1769 crc_itu_t(0, (char *)lvid + sizeof(tag),
1653 } 1770 le16_to_cpu(lvid->descTag.descCRCLength)));
1771
1772 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1773 mark_buffer_dirty(bh);
1654} 1774}
1655 1775
1656static void udf_close_lvid(struct super_block *sb) 1776static void udf_close_lvid(struct super_block *sb)
1657{ 1777{
1658 kernel_timestamp cpu_time;
1659 struct udf_sb_info *sbi = UDF_SB(sb); 1778 struct udf_sb_info *sbi = UDF_SB(sb);
1660 struct buffer_head *bh = sbi->s_lvid_bh; 1779 struct buffer_head *bh = sbi->s_lvid_bh;
1661 struct logicalVolIntegrityDesc *lvid; 1780 struct logicalVolIntegrityDesc *lvid;
1781 struct logicalVolIntegrityDescImpUse *lvidiu;
1662 1782
1663 if (!bh) 1783 if (!bh)
1664 return; 1784 return;
1665 1785
1666 lvid = (struct logicalVolIntegrityDesc *)bh->b_data; 1786 lvid = (struct logicalVolIntegrityDesc *)bh->b_data;
1667 1787
1668 if (lvid->integrityType == LVID_INTEGRITY_TYPE_OPEN) { 1788 if (lvid->integrityType != LVID_INTEGRITY_TYPE_OPEN)
1669 struct logicalVolIntegrityDescImpUse *lvidiu = 1789 return;
1670 udf_sb_lvidiu(sbi); 1790
1671 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX; 1791 lvidiu = udf_sb_lvidiu(sbi);
1672 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX; 1792 lvidiu->impIdent.identSuffix[0] = UDF_OS_CLASS_UNIX;
1673 if (udf_time_to_stamp(&cpu_time, CURRENT_TIME)) 1793 lvidiu->impIdent.identSuffix[1] = UDF_OS_ID_LINUX;
1674 lvid->recordingDateAndTime = cpu_to_lets(cpu_time); 1794 udf_time_to_disk_stamp(&lvid->recordingDateAndTime, CURRENT_TIME);
1675 if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev)) 1795 if (UDF_MAX_WRITE_VERSION > le16_to_cpu(lvidiu->maxUDFWriteRev))
1676 lvidiu->maxUDFWriteRev = 1796 lvidiu->maxUDFWriteRev = cpu_to_le16(UDF_MAX_WRITE_VERSION);
1677 cpu_to_le16(UDF_MAX_WRITE_VERSION); 1797 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev))
1678 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFReadRev)) 1798 lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev);
1679 lvidiu->minUDFReadRev = cpu_to_le16(sbi->s_udfrev); 1799 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev))
1680 if (sbi->s_udfrev > le16_to_cpu(lvidiu->minUDFWriteRev)) 1800 lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev);
1681 lvidiu->minUDFWriteRev = cpu_to_le16(sbi->s_udfrev); 1801 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE);
1682 lvid->integrityType = cpu_to_le32(LVID_INTEGRITY_TYPE_CLOSE); 1802
1683 1803 lvid->descTag.descCRC = cpu_to_le16(
1684 lvid->descTag.descCRC = cpu_to_le16( 1804 crc_itu_t(0, (char *)lvid + sizeof(tag),
1685 udf_crc((char *)lvid + sizeof(tag), 1805 le16_to_cpu(lvid->descTag.descCRCLength)));
1686 le16_to_cpu(lvid->descTag.descCRCLength), 1806
1687 0)); 1807 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1688 1808 mark_buffer_dirty(bh);
1689 lvid->descTag.tagChecksum = udf_tag_checksum(&lvid->descTag);
1690 mark_buffer_dirty(bh);
1691 }
1692} 1809}
1693 1810
1694static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) 1811static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
@@ -1708,22 +1825,35 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
1708 vfree(bitmap); 1825 vfree(bitmap);
1709} 1826}
1710 1827
1711/* 1828static void udf_free_partition(struct udf_part_map *map)
1712 * udf_read_super 1829{
1713 * 1830 int i;
1714 * PURPOSE 1831 struct udf_meta_data *mdata;
1715 * Complete the specified super block. 1832
1716 * 1833 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE)
1717 * PRE-CONDITIONS 1834 iput(map->s_uspace.s_table);
1718 * sb Pointer to superblock to complete - never NULL. 1835 if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
1719 * sb->s_dev Device to read suberblock from. 1836 iput(map->s_fspace.s_table);
1720 * options Pointer to mount options. 1837 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
1721 * silent Silent flag. 1838 udf_sb_free_bitmap(map->s_uspace.s_bitmap);
1722 * 1839 if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
1723 * HISTORY 1840 udf_sb_free_bitmap(map->s_fspace.s_bitmap);
1724 * July 1, 1997 - Andrew E. Mileski 1841 if (map->s_partition_type == UDF_SPARABLE_MAP15)
1725 * Written, tested, and released. 1842 for (i = 0; i < 4; i++)
1726 */ 1843 brelse(map->s_type_specific.s_sparing.s_spar_map[i]);
1844 else if (map->s_partition_type == UDF_METADATA_MAP25) {
1845 mdata = &map->s_type_specific.s_metadata;
1846 iput(mdata->s_metadata_fe);
1847 mdata->s_metadata_fe = NULL;
1848
1849 iput(mdata->s_mirror_fe);
1850 mdata->s_mirror_fe = NULL;
1851
1852 iput(mdata->s_bitmap_fe);
1853 mdata->s_bitmap_fe = NULL;
1854 }
1855}
1856
1727static int udf_fill_super(struct super_block *sb, void *options, int silent) 1857static int udf_fill_super(struct super_block *sb, void *options, int silent)
1728{ 1858{
1729 int i; 1859 int i;
@@ -1776,8 +1906,11 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1776 sbi->s_nls_map = uopt.nls_map; 1906 sbi->s_nls_map = uopt.nls_map;
1777 1907
1778 /* Set the block size for all transfers */ 1908 /* Set the block size for all transfers */
1779 if (!udf_set_blocksize(sb, uopt.blocksize)) 1909 if (!sb_min_blocksize(sb, uopt.blocksize)) {
1910 udf_debug("Bad block size (%d)\n", uopt.blocksize);
1911 printk(KERN_ERR "udf: bad block size (%d)\n", uopt.blocksize);
1780 goto error_out; 1912 goto error_out;
1913 }
1781 1914
1782 if (uopt.session == 0xFFFFFFFF) 1915 if (uopt.session == 0xFFFFFFFF)
1783 sbi->s_session = udf_get_last_session(sb); 1916 sbi->s_session = udf_get_last_session(sb);
@@ -1789,7 +1922,6 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1789 sbi->s_last_block = uopt.lastblock; 1922 sbi->s_last_block = uopt.lastblock;
1790 sbi->s_anchor[0] = sbi->s_anchor[1] = 0; 1923 sbi->s_anchor[0] = sbi->s_anchor[1] = 0;
1791 sbi->s_anchor[2] = uopt.anchor; 1924 sbi->s_anchor[2] = uopt.anchor;
1792 sbi->s_anchor[3] = 256;
1793 1925
1794 if (udf_check_valid(sb, uopt.novrs, silent)) { 1926 if (udf_check_valid(sb, uopt.novrs, silent)) {
1795 /* read volume recognition sequences */ 1927 /* read volume recognition sequences */
@@ -1806,7 +1938,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1806 sb->s_magic = UDF_SUPER_MAGIC; 1938 sb->s_magic = UDF_SUPER_MAGIC;
1807 sb->s_time_gran = 1000; 1939 sb->s_time_gran = 1000;
1808 1940
1809 if (udf_load_partition(sb, &fileset)) { 1941 if (udf_load_sequence(sb, &fileset)) {
1810 printk(KERN_WARNING "UDF-fs: No partition found (1)\n"); 1942 printk(KERN_WARNING "UDF-fs: No partition found (1)\n");
1811 goto error_out; 1943 goto error_out;
1812 } 1944 }
@@ -1856,12 +1988,12 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1856 } 1988 }
1857 1989
1858 if (!silent) { 1990 if (!silent) {
1859 kernel_timestamp ts; 1991 timestamp ts;
1860 udf_time_to_stamp(&ts, sbi->s_record_time); 1992 udf_time_to_disk_stamp(&ts, sbi->s_record_time);
1861 udf_info("UDF: Mounting volume '%s', " 1993 udf_info("UDF: Mounting volume '%s', "
1862 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n", 1994 "timestamp %04u/%02u/%02u %02u:%02u (%x)\n",
1863 sbi->s_volume_ident, ts.year, ts.month, ts.day, 1995 sbi->s_volume_ident, le16_to_cpu(ts.year), ts.month, ts.day,
1864 ts.hour, ts.minute, ts.typeAndTimezone); 1996 ts.hour, ts.minute, le16_to_cpu(ts.typeAndTimezone));
1865 } 1997 }
1866 if (!(sb->s_flags & MS_RDONLY)) 1998 if (!(sb->s_flags & MS_RDONLY))
1867 udf_open_lvid(sb); 1999 udf_open_lvid(sb);
@@ -1890,21 +2022,9 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1890error_out: 2022error_out:
1891 if (sbi->s_vat_inode) 2023 if (sbi->s_vat_inode)
1892 iput(sbi->s_vat_inode); 2024 iput(sbi->s_vat_inode);
1893 if (sbi->s_partitions) { 2025 if (sbi->s_partitions)
1894 struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition]; 2026 for (i = 0; i < sbi->s_partitions; i++)
1895 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) 2027 udf_free_partition(&sbi->s_partmaps[i]);
1896 iput(map->s_uspace.s_table);
1897 if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
1898 iput(map->s_fspace.s_table);
1899 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
1900 udf_sb_free_bitmap(map->s_uspace.s_bitmap);
1901 if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
1902 udf_sb_free_bitmap(map->s_fspace.s_bitmap);
1903 if (map->s_partition_type == UDF_SPARABLE_MAP15)
1904 for (i = 0; i < 4; i++)
1905 brelse(map->s_type_specific.s_sparing.
1906 s_spar_map[i]);
1907 }
1908#ifdef CONFIG_UDF_NLS 2028#ifdef CONFIG_UDF_NLS
1909 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 2029 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
1910 unload_nls(sbi->s_nls_map); 2030 unload_nls(sbi->s_nls_map);
@@ -1920,8 +2040,8 @@ error_out:
1920 return -EINVAL; 2040 return -EINVAL;
1921} 2041}
1922 2042
1923void udf_error(struct super_block *sb, const char *function, 2043static void udf_error(struct super_block *sb, const char *function,
1924 const char *fmt, ...) 2044 const char *fmt, ...)
1925{ 2045{
1926 va_list args; 2046 va_list args;
1927 2047
@@ -1948,19 +2068,6 @@ void udf_warning(struct super_block *sb, const char *function,
1948 sb->s_id, function, error_buf); 2068 sb->s_id, function, error_buf);
1949} 2069}
1950 2070
1951/*
1952 * udf_put_super
1953 *
1954 * PURPOSE
1955 * Prepare for destruction of the superblock.
1956 *
1957 * DESCRIPTION
1958 * Called before the filesystem is unmounted.
1959 *
1960 * HISTORY
1961 * July 1, 1997 - Andrew E. Mileski
1962 * Written, tested, and released.
1963 */
1964static void udf_put_super(struct super_block *sb) 2071static void udf_put_super(struct super_block *sb)
1965{ 2072{
1966 int i; 2073 int i;
@@ -1969,21 +2076,9 @@ static void udf_put_super(struct super_block *sb)
1969 sbi = UDF_SB(sb); 2076 sbi = UDF_SB(sb);
1970 if (sbi->s_vat_inode) 2077 if (sbi->s_vat_inode)
1971 iput(sbi->s_vat_inode); 2078 iput(sbi->s_vat_inode);
1972 if (sbi->s_partitions) { 2079 if (sbi->s_partitions)
1973 struct udf_part_map *map = &sbi->s_partmaps[sbi->s_partition]; 2080 for (i = 0; i < sbi->s_partitions; i++)
1974 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_TABLE) 2081 udf_free_partition(&sbi->s_partmaps[i]);
1975 iput(map->s_uspace.s_table);
1976 if (map->s_partition_flags & UDF_PART_FLAG_FREED_TABLE)
1977 iput(map->s_fspace.s_table);
1978 if (map->s_partition_flags & UDF_PART_FLAG_UNALLOC_BITMAP)
1979 udf_sb_free_bitmap(map->s_uspace.s_bitmap);
1980 if (map->s_partition_flags & UDF_PART_FLAG_FREED_BITMAP)
1981 udf_sb_free_bitmap(map->s_fspace.s_bitmap);
1982 if (map->s_partition_type == UDF_SPARABLE_MAP15)
1983 for (i = 0; i < 4; i++)
1984 brelse(map->s_type_specific.s_sparing.
1985 s_spar_map[i]);
1986 }
1987#ifdef CONFIG_UDF_NLS 2082#ifdef CONFIG_UDF_NLS
1988 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) 2083 if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
1989 unload_nls(sbi->s_nls_map); 2084 unload_nls(sbi->s_nls_map);
@@ -1996,19 +2091,6 @@ static void udf_put_super(struct super_block *sb)
1996 sb->s_fs_info = NULL; 2091 sb->s_fs_info = NULL;
1997} 2092}
1998 2093
1999/*
2000 * udf_stat_fs
2001 *
2002 * PURPOSE
2003 * Return info about the filesystem.
2004 *
2005 * DESCRIPTION
2006 * Called by sys_statfs()
2007 *
2008 * HISTORY
2009 * July 1, 1997 - Andrew E. Mileski
2010 * Written, tested, and released.
2011 */
2012static int udf_statfs(struct dentry *dentry, struct kstatfs *buf) 2094static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2013{ 2095{
2014 struct super_block *sb = dentry->d_sb; 2096 struct super_block *sb = dentry->d_sb;
@@ -2035,10 +2117,6 @@ static int udf_statfs(struct dentry *dentry, struct kstatfs *buf)
2035 return 0; 2117 return 0;
2036} 2118}
2037 2119
2038static unsigned char udf_bitmap_lookup[16] = {
2039 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4
2040};
2041
2042static unsigned int udf_count_free_bitmap(struct super_block *sb, 2120static unsigned int udf_count_free_bitmap(struct super_block *sb,
2043 struct udf_bitmap *bitmap) 2121 struct udf_bitmap *bitmap)
2044{ 2122{
@@ -2048,7 +2126,6 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2048 int block = 0, newblock; 2126 int block = 0, newblock;
2049 kernel_lb_addr loc; 2127 kernel_lb_addr loc;
2050 uint32_t bytes; 2128 uint32_t bytes;
2051 uint8_t value;
2052 uint8_t *ptr; 2129 uint8_t *ptr;
2053 uint16_t ident; 2130 uint16_t ident;
2054 struct spaceBitmapDesc *bm; 2131 struct spaceBitmapDesc *bm;
@@ -2074,13 +2151,10 @@ static unsigned int udf_count_free_bitmap(struct super_block *sb,
2074 ptr = (uint8_t *)bh->b_data; 2151 ptr = (uint8_t *)bh->b_data;
2075 2152
2076 while (bytes > 0) { 2153 while (bytes > 0) {
2077 while ((bytes > 0) && (index < sb->s_blocksize)) { 2154 u32 cur_bytes = min_t(u32, bytes, sb->s_blocksize - index);
2078 value = ptr[index]; 2155 accum += bitmap_weight((const unsigned long *)(ptr + index),
2079 accum += udf_bitmap_lookup[value & 0x0f]; 2156 cur_bytes * 8);
2080 accum += udf_bitmap_lookup[value >> 4]; 2157 bytes -= cur_bytes;
2081 index++;
2082 bytes--;
2083 }
2084 if (bytes) { 2158 if (bytes) {
2085 brelse(bh); 2159 brelse(bh);
2086 newblock = udf_get_lb_pblock(sb, loc, ++block); 2160 newblock = udf_get_lb_pblock(sb, loc, ++block);
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6ec99221e50c..c3265e1385d4 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -23,7 +23,6 @@
23#include <asm/uaccess.h> 23#include <asm/uaccess.h>
24#include <linux/errno.h> 24#include <linux/errno.h>
25#include <linux/fs.h> 25#include <linux/fs.h>
26#include <linux/udf_fs.h>
27#include <linux/time.h> 26#include <linux/time.h>
28#include <linux/mm.h> 27#include <linux/mm.h>
29#include <linux/stat.h> 28#include <linux/stat.h>
diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c
index fe61be17cdab..65e19b4f9424 100644
--- a/fs/udf/truncate.c
+++ b/fs/udf/truncate.c
@@ -22,7 +22,6 @@
22#include "udfdecl.h" 22#include "udfdecl.h"
23#include <linux/fs.h> 23#include <linux/fs.h>
24#include <linux/mm.h> 24#include <linux/mm.h>
25#include <linux/udf_fs.h>
26#include <linux/buffer_head.h> 25#include <linux/buffer_head.h>
27 26
28#include "udf_i.h" 27#include "udf_i.h"
@@ -180,6 +179,24 @@ void udf_discard_prealloc(struct inode *inode)
180 brelse(epos.bh); 179 brelse(epos.bh);
181} 180}
182 181
182static void udf_update_alloc_ext_desc(struct inode *inode,
183 struct extent_position *epos,
184 u32 lenalloc)
185{
186 struct super_block *sb = inode->i_sb;
187 struct udf_sb_info *sbi = UDF_SB(sb);
188
189 struct allocExtDesc *aed = (struct allocExtDesc *) (epos->bh->b_data);
190 int len = sizeof(struct allocExtDesc);
191
192 aed->lengthAllocDescs = cpu_to_le32(lenalloc);
193 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) || sbi->s_udfrev >= 0x0201)
194 len += lenalloc;
195
196 udf_update_tag(epos->bh->b_data, len);
197 mark_buffer_dirty_inode(epos->bh, inode);
198}
199
183void udf_truncate_extents(struct inode *inode) 200void udf_truncate_extents(struct inode *inode)
184{ 201{
185 struct extent_position epos; 202 struct extent_position epos;
@@ -187,7 +204,6 @@ void udf_truncate_extents(struct inode *inode)
187 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc; 204 uint32_t elen, nelen = 0, indirect_ext_len = 0, lenalloc;
188 int8_t etype; 205 int8_t etype;
189 struct super_block *sb = inode->i_sb; 206 struct super_block *sb = inode->i_sb;
190 struct udf_sb_info *sbi = UDF_SB(sb);
191 sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset; 207 sector_t first_block = inode->i_size >> sb->s_blocksize_bits, offset;
192 loff_t byte_offset; 208 loff_t byte_offset;
193 int adsize; 209 int adsize;
@@ -224,35 +240,15 @@ void udf_truncate_extents(struct inode *inode)
224 if (indirect_ext_len) { 240 if (indirect_ext_len) {
225 /* We managed to free all extents in the 241 /* We managed to free all extents in the
226 * indirect extent - free it too */ 242 * indirect extent - free it too */
227 if (!epos.bh) 243 BUG_ON(!epos.bh);
228 BUG();
229 udf_free_blocks(sb, inode, epos.block, 244 udf_free_blocks(sb, inode, epos.block,
230 0, indirect_ext_len); 245 0, indirect_ext_len);
231 } else { 246 } else if (!epos.bh) {
232 if (!epos.bh) { 247 iinfo->i_lenAlloc = lenalloc;
233 iinfo->i_lenAlloc = 248 mark_inode_dirty(inode);
234 lenalloc; 249 } else
235 mark_inode_dirty(inode); 250 udf_update_alloc_ext_desc(inode,
236 } else { 251 &epos, lenalloc);
237 struct allocExtDesc *aed =
238 (struct allocExtDesc *)
239 (epos.bh->b_data);
240 int len =
241 sizeof(struct allocExtDesc);
242
243 aed->lengthAllocDescs =
244 cpu_to_le32(lenalloc);
245 if (!UDF_QUERY_FLAG(sb,
246 UDF_FLAG_STRICT) ||
247 sbi->s_udfrev >= 0x0201)
248 len += lenalloc;
249
250 udf_update_tag(epos.bh->b_data,
251 len);
252 mark_buffer_dirty_inode(
253 epos.bh, inode);
254 }
255 }
256 brelse(epos.bh); 252 brelse(epos.bh);
257 epos.offset = sizeof(struct allocExtDesc); 253 epos.offset = sizeof(struct allocExtDesc);
258 epos.block = eloc; 254 epos.block = eloc;
@@ -272,29 +268,14 @@ void udf_truncate_extents(struct inode *inode)
272 } 268 }
273 269
274 if (indirect_ext_len) { 270 if (indirect_ext_len) {
275 if (!epos.bh) 271 BUG_ON(!epos.bh);
276 BUG();
277 udf_free_blocks(sb, inode, epos.block, 0, 272 udf_free_blocks(sb, inode, epos.block, 0,
278 indirect_ext_len); 273 indirect_ext_len);
279 } else { 274 } else if (!epos.bh) {
280 if (!epos.bh) { 275 iinfo->i_lenAlloc = lenalloc;
281 iinfo->i_lenAlloc = lenalloc; 276 mark_inode_dirty(inode);
282 mark_inode_dirty(inode); 277 } else
283 } else { 278 udf_update_alloc_ext_desc(inode, &epos, lenalloc);
284 struct allocExtDesc *aed =
285 (struct allocExtDesc *)(epos.bh->b_data);
286 aed->lengthAllocDescs = cpu_to_le32(lenalloc);
287 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT) ||
288 sbi->s_udfrev >= 0x0201)
289 udf_update_tag(epos.bh->b_data,
290 lenalloc +
291 sizeof(struct allocExtDesc));
292 else
293 udf_update_tag(epos.bh->b_data,
294 sizeof(struct allocExtDesc));
295 mark_buffer_dirty_inode(epos.bh, inode);
296 }
297 }
298 } else if (inode->i_size) { 279 } else if (inode->i_size) {
299 if (byte_offset) { 280 if (byte_offset) {
300 kernel_long_ad extent; 281 kernel_long_ad extent;
diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h
index ccc52f16bf7d..4f86b1d98a5d 100644
--- a/fs/udf/udf_i.h
+++ b/fs/udf/udf_i.h
@@ -1,10 +1,32 @@
1#ifndef __LINUX_UDF_I_H 1#ifndef _UDF_I_H
2#define __LINUX_UDF_I_H 2#define _UDF_I_H
3
4struct udf_inode_info {
5 struct timespec i_crtime;
6 /* Physical address of inode */
7 kernel_lb_addr i_location;
8 __u64 i_unique;
9 __u32 i_lenEAttr;
10 __u32 i_lenAlloc;
11 __u64 i_lenExtents;
12 __u32 i_next_alloc_block;
13 __u32 i_next_alloc_goal;
14 unsigned i_alloc_type : 3;
15 unsigned i_efe : 1; /* extendedFileEntry */
16 unsigned i_use : 1; /* unallocSpaceEntry */
17 unsigned i_strat4096 : 1;
18 unsigned reserved : 26;
19 union {
20 short_ad *i_sad;
21 long_ad *i_lad;
22 __u8 *i_data;
23 } i_ext;
24 struct inode vfs_inode;
25};
3 26
4#include <linux/udf_fs_i.h>
5static inline struct udf_inode_info *UDF_I(struct inode *inode) 27static inline struct udf_inode_info *UDF_I(struct inode *inode)
6{ 28{
7 return list_entry(inode, struct udf_inode_info, vfs_inode); 29 return list_entry(inode, struct udf_inode_info, vfs_inode);
8} 30}
9 31
10#endif /* !defined(_LINUX_UDF_I_H) */ 32#endif /* _UDF_I_H) */
diff --git a/fs/udf/udf_sb.h b/fs/udf/udf_sb.h
index 737d1c604eea..1c1c514a9725 100644
--- a/fs/udf/udf_sb.h
+++ b/fs/udf/udf_sb.h
@@ -1,10 +1,12 @@
1#ifndef __LINUX_UDF_SB_H 1#ifndef __LINUX_UDF_SB_H
2#define __LINUX_UDF_SB_H 2#define __LINUX_UDF_SB_H
3 3
4#include <linux/mutex.h>
5
4/* Since UDF 2.01 is ISO 13346 based... */ 6/* Since UDF 2.01 is ISO 13346 based... */
5#define UDF_SUPER_MAGIC 0x15013346 7#define UDF_SUPER_MAGIC 0x15013346
6 8
7#define UDF_MAX_READ_VERSION 0x0201 9#define UDF_MAX_READ_VERSION 0x0250
8#define UDF_MAX_WRITE_VERSION 0x0201 10#define UDF_MAX_WRITE_VERSION 0x0201
9 11
10#define UDF_FLAG_USE_EXTENDED_FE 0 12#define UDF_FLAG_USE_EXTENDED_FE 0
@@ -38,6 +40,111 @@
38#define UDF_PART_FLAG_REWRITABLE 0x0040 40#define UDF_PART_FLAG_REWRITABLE 0x0040
39#define UDF_PART_FLAG_OVERWRITABLE 0x0080 41#define UDF_PART_FLAG_OVERWRITABLE 0x0080
40 42
43#define UDF_MAX_BLOCK_LOADED 8
44
45#define UDF_TYPE1_MAP15 0x1511U
46#define UDF_VIRTUAL_MAP15 0x1512U
47#define UDF_VIRTUAL_MAP20 0x2012U
48#define UDF_SPARABLE_MAP15 0x1522U
49#define UDF_METADATA_MAP25 0x2511U
50
51#pragma pack(1) /* XXX(hch): Why? This file just defines in-core structures */
52
53struct udf_meta_data {
54 __u32 s_meta_file_loc;
55 __u32 s_mirror_file_loc;
56 __u32 s_bitmap_file_loc;
57 __u32 s_alloc_unit_size;
58 __u16 s_align_unit_size;
59 __u8 s_dup_md_flag;
60 struct inode *s_metadata_fe;
61 struct inode *s_mirror_fe;
62 struct inode *s_bitmap_fe;
63};
64
65struct udf_sparing_data {
66 __u16 s_packet_len;
67 struct buffer_head *s_spar_map[4];
68};
69
70struct udf_virtual_data {
71 __u32 s_num_entries;
72 __u16 s_start_offset;
73};
74
75struct udf_bitmap {
76 __u32 s_extLength;
77 __u32 s_extPosition;
78 __u16 s_nr_groups;
79 struct buffer_head **s_block_bitmap;
80};
81
82struct udf_part_map {
83 union {
84 struct udf_bitmap *s_bitmap;
85 struct inode *s_table;
86 } s_uspace;
87 union {
88 struct udf_bitmap *s_bitmap;
89 struct inode *s_table;
90 } s_fspace;
91 __u32 s_partition_root;
92 __u32 s_partition_len;
93 __u16 s_partition_type;
94 __u16 s_partition_num;
95 union {
96 struct udf_sparing_data s_sparing;
97 struct udf_virtual_data s_virtual;
98 struct udf_meta_data s_metadata;
99 } s_type_specific;
100 __u32 (*s_partition_func)(struct super_block *, __u32, __u16, __u32);
101 __u16 s_volumeseqnum;
102 __u16 s_partition_flags;
103};
104
105#pragma pack()
106
107struct udf_sb_info {
108 struct udf_part_map *s_partmaps;
109 __u8 s_volume_ident[32];
110
111 /* Overall info */
112 __u16 s_partitions;
113 __u16 s_partition;
114
115 /* Sector headers */
116 __s32 s_session;
117 __u32 s_anchor[3];
118 __u32 s_last_block;
119
120 struct buffer_head *s_lvid_bh;
121
122 /* Default permissions */
123 mode_t s_umask;
124 gid_t s_gid;
125 uid_t s_uid;
126
127 /* Root Info */
128 struct timespec s_record_time;
129
130 /* Fileset Info */
131 __u16 s_serial_number;
132
133 /* highest UDF revision we have recorded to this media */
134 __u16 s_udfrev;
135
136 /* Miscellaneous flags */
137 __u32 s_flags;
138
139 /* Encoding info */
140 struct nls_table *s_nls_map;
141
142 /* VAT inode */
143 struct inode *s_vat_inode;
144
145 struct mutex s_alloc_mutex;
146};
147
41static inline struct udf_sb_info *UDF_SB(struct super_block *sb) 148static inline struct udf_sb_info *UDF_SB(struct super_block *sb)
42{ 149{
43 return sb->s_fs_info; 150 return sb->s_fs_info;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 681dc2b66cdb..f3f45d029277 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -1,17 +1,37 @@
1#ifndef __UDF_DECL_H 1#ifndef __UDF_DECL_H
2#define __UDF_DECL_H 2#define __UDF_DECL_H
3 3
4#include <linux/udf_fs.h>
5#include "ecma_167.h" 4#include "ecma_167.h"
6#include "osta_udf.h" 5#include "osta_udf.h"
7 6
8#include <linux/fs.h> 7#include <linux/fs.h>
9#include <linux/types.h> 8#include <linux/types.h>
10#include <linux/udf_fs_i.h>
11#include <linux/udf_fs_sb.h>
12#include <linux/buffer_head.h> 9#include <linux/buffer_head.h>
10#include <linux/udf_fs_i.h>
13 11
12#include "udf_sb.h"
14#include "udfend.h" 13#include "udfend.h"
14#include "udf_i.h"
15
16#define UDF_PREALLOCATE
17#define UDF_DEFAULT_PREALLOC_BLOCKS 8
18
19#define UDFFS_DEBUG
20
21#ifdef UDFFS_DEBUG
22#define udf_debug(f, a...) \
23do { \
24 printk(KERN_DEBUG "UDF-fs DEBUG %s:%d:%s: ", \
25 __FILE__, __LINE__, __func__); \
26 printk(f, ##a); \
27} while (0)
28#else
29#define udf_debug(f, a...) /**/
30#endif
31
32#define udf_info(f, a...) \
33 printk(KERN_INFO "UDF-fs INFO " f, ##a);
34
15 35
16#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) ) 36#define udf_fixed_to_variable(x) ( ( ( (x) >> 5 ) * 39 ) + ( (x) & 0x0000001F ) )
17#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) ) 37#define udf_variable_to_fixed(x) ( ( ( (x) / 39 ) << 5 ) + ( (x) % 39 ) )
@@ -23,16 +43,24 @@
23#define UDF_NAME_LEN 256 43#define UDF_NAME_LEN 256
24#define UDF_PATH_LEN 1023 44#define UDF_PATH_LEN 1023
25 45
26#define udf_file_entry_alloc_offset(inode)\ 46static inline size_t udf_file_entry_alloc_offset(struct inode *inode)
27 (UDF_I(inode)->i_use ?\ 47{
28 sizeof(struct unallocSpaceEntry) :\ 48 struct udf_inode_info *iinfo = UDF_I(inode);
29 ((UDF_I(inode)->i_efe ?\ 49 if (iinfo->i_use)
30 sizeof(struct extendedFileEntry) :\ 50 return sizeof(struct unallocSpaceEntry);
31 sizeof(struct fileEntry)) + UDF_I(inode)->i_lenEAttr)) 51 else if (iinfo->i_efe)
32 52 return sizeof(struct extendedFileEntry) + iinfo->i_lenEAttr;
33#define udf_ext0_offset(inode)\ 53 else
34 (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB ?\ 54 return sizeof(struct fileEntry) + iinfo->i_lenEAttr;
35 udf_file_entry_alloc_offset(inode) : 0) 55}
56
57static inline size_t udf_ext0_offset(struct inode *inode)
58{
59 if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
60 return udf_file_entry_alloc_offset(inode);
61 else
62 return 0;
63}
36 64
37#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset)) 65#define udf_get_lb_pblock(sb,loc,offset) udf_get_pblock((sb), (loc).logicalBlockNum, (loc).partitionReferenceNum, (offset))
38 66
@@ -83,7 +111,6 @@ struct extent_position {
83}; 111};
84 112
85/* super.c */ 113/* super.c */
86extern void udf_error(struct super_block *, const char *, const char *, ...);
87extern void udf_warning(struct super_block *, const char *, const char *, ...); 114extern void udf_warning(struct super_block *, const char *, const char *, ...);
88 115
89/* namei.c */ 116/* namei.c */
@@ -150,6 +177,8 @@ extern uint32_t udf_get_pblock_virt20(struct super_block *, uint32_t, uint16_t,
150 uint32_t); 177 uint32_t);
151extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t, 178extern uint32_t udf_get_pblock_spar15(struct super_block *, uint32_t, uint16_t,
152 uint32_t); 179 uint32_t);
180extern uint32_t udf_get_pblock_meta25(struct super_block *, uint32_t, uint16_t,
181 uint32_t);
153extern int udf_relocate_blocks(struct super_block *, long, long *); 182extern int udf_relocate_blocks(struct super_block *, long, long *);
154 183
155/* unicode.c */ 184/* unicode.c */
@@ -157,7 +186,7 @@ extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
157extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *, 186extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
158 int); 187 int);
159extern int udf_build_ustr(struct ustr *, dstring *, int); 188extern int udf_build_ustr(struct ustr *, dstring *, int);
160extern int udf_CS0toUTF8(struct ustr *, struct ustr *); 189extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
161 190
162/* ialloc.c */ 191/* ialloc.c */
163extern void udf_free_inode(struct inode *); 192extern void udf_free_inode(struct inode *);
@@ -191,11 +220,9 @@ extern struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize,
191extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int); 220extern long_ad *udf_get_filelongad(uint8_t *, int, uint32_t *, int);
192extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int); 221extern short_ad *udf_get_fileshortad(uint8_t *, int, uint32_t *, int);
193 222
194/* crc.c */
195extern uint16_t udf_crc(uint8_t *, uint32_t, uint16_t);
196
197/* udftime.c */ 223/* udftime.c */
198extern time_t *udf_stamp_to_time(time_t *, long *, kernel_timestamp); 224extern struct timespec *udf_disk_stamp_to_time(struct timespec *dest,
199extern kernel_timestamp *udf_time_to_stamp(kernel_timestamp *, struct timespec); 225 timestamp src);
226extern timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec src);
200 227
201#endif /* __UDF_DECL_H */ 228#endif /* __UDF_DECL_H */
diff --git a/fs/udf/udfend.h b/fs/udf/udfend.h
index c4bd1203f857..489f52fb428c 100644
--- a/fs/udf/udfend.h
+++ b/fs/udf/udfend.h
@@ -24,17 +24,6 @@ static inline lb_addr cpu_to_lelb(kernel_lb_addr in)
24 return out; 24 return out;
25} 25}
26 26
27static inline kernel_timestamp lets_to_cpu(timestamp in)
28{
29 kernel_timestamp out;
30
31 memcpy(&out, &in, sizeof(timestamp));
32 out.typeAndTimezone = le16_to_cpu(in.typeAndTimezone);
33 out.year = le16_to_cpu(in.year);
34
35 return out;
36}
37
38static inline short_ad lesa_to_cpu(short_ad in) 27static inline short_ad lesa_to_cpu(short_ad in)
39{ 28{
40 short_ad out; 29 short_ad out;
@@ -85,15 +74,4 @@ static inline kernel_extent_ad leea_to_cpu(extent_ad in)
85 return out; 74 return out;
86} 75}
87 76
88static inline timestamp cpu_to_lets(kernel_timestamp in)
89{
90 timestamp out;
91
92 memcpy(&out, &in, sizeof(timestamp));
93 out.typeAndTimezone = cpu_to_le16(in.typeAndTimezone);
94 out.year = cpu_to_le16(in.year);
95
96 return out;
97}
98
99#endif /* __UDF_ENDIAN_H */ 77#endif /* __UDF_ENDIAN_H */
diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c
index ce595732ba6f..5f811655c9b5 100644
--- a/fs/udf/udftime.c
+++ b/fs/udf/udftime.c
@@ -85,39 +85,38 @@ extern struct timezone sys_tz;
85#define SECS_PER_HOUR (60 * 60) 85#define SECS_PER_HOUR (60 * 60)
86#define SECS_PER_DAY (SECS_PER_HOUR * 24) 86#define SECS_PER_DAY (SECS_PER_HOUR * 24)
87 87
88time_t *udf_stamp_to_time(time_t *dest, long *dest_usec, kernel_timestamp src) 88struct timespec *udf_disk_stamp_to_time(struct timespec *dest, timestamp src)
89{ 89{
90 int yday; 90 int yday;
91 uint8_t type = src.typeAndTimezone >> 12; 91 u16 typeAndTimezone = le16_to_cpu(src.typeAndTimezone);
92 u16 year = le16_to_cpu(src.year);
93 uint8_t type = typeAndTimezone >> 12;
92 int16_t offset; 94 int16_t offset;
93 95
94 if (type == 1) { 96 if (type == 1) {
95 offset = src.typeAndTimezone << 4; 97 offset = typeAndTimezone << 4;
96 /* sign extent offset */ 98 /* sign extent offset */
97 offset = (offset >> 4); 99 offset = (offset >> 4);
98 if (offset == -2047) /* unspecified offset */ 100 if (offset == -2047) /* unspecified offset */
99 offset = 0; 101 offset = 0;
100 } else { 102 } else
101 offset = 0; 103 offset = 0;
102 }
103 104
104 if ((src.year < EPOCH_YEAR) || 105 if ((year < EPOCH_YEAR) ||
105 (src.year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) { 106 (year >= EPOCH_YEAR + MAX_YEAR_SECONDS)) {
106 *dest = -1;
107 *dest_usec = -1;
108 return NULL; 107 return NULL;
109 } 108 }
110 *dest = year_seconds[src.year - EPOCH_YEAR]; 109 dest->tv_sec = year_seconds[year - EPOCH_YEAR];
111 *dest -= offset * 60; 110 dest->tv_sec -= offset * 60;
112 111
113 yday = ((__mon_yday[__isleap(src.year)][src.month - 1]) + src.day - 1); 112 yday = ((__mon_yday[__isleap(year)][src.month - 1]) + src.day - 1);
114 *dest += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second; 113 dest->tv_sec += (((yday * 24) + src.hour) * 60 + src.minute) * 60 + src.second;
115 *dest_usec = src.centiseconds * 10000 + 114 dest->tv_nsec = 1000 * (src.centiseconds * 10000 +
116 src.hundredsOfMicroseconds * 100 + src.microseconds; 115 src.hundredsOfMicroseconds * 100 + src.microseconds);
117 return dest; 116 return dest;
118} 117}
119 118
120kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts) 119timestamp *udf_time_to_disk_stamp(timestamp *dest, struct timespec ts)
121{ 120{
122 long int days, rem, y; 121 long int days, rem, y;
123 const unsigned short int *ip; 122 const unsigned short int *ip;
@@ -128,7 +127,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
128 if (!dest) 127 if (!dest)
129 return NULL; 128 return NULL;
130 129
131 dest->typeAndTimezone = 0x1000 | (offset & 0x0FFF); 130 dest->typeAndTimezone = cpu_to_le16(0x1000 | (offset & 0x0FFF));
132 131
133 ts.tv_sec += offset * 60; 132 ts.tv_sec += offset * 60;
134 days = ts.tv_sec / SECS_PER_DAY; 133 days = ts.tv_sec / SECS_PER_DAY;
@@ -151,7 +150,7 @@ kernel_timestamp *udf_time_to_stamp(kernel_timestamp *dest, struct timespec ts)
151 - LEAPS_THRU_END_OF(y - 1)); 150 - LEAPS_THRU_END_OF(y - 1));
152 y = yg; 151 y = yg;
153 } 152 }
154 dest->year = y; 153 dest->year = cpu_to_le16(y);
155 ip = __mon_yday[__isleap(y)]; 154 ip = __mon_yday[__isleap(y)];
156 for (y = 11; days < (long int)ip[y]; --y) 155 for (y = 11; days < (long int)ip[y]; --y)
157 continue; 156 continue;
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index e533b11703bf..9fdf8c93c58e 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -23,7 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/string.h> /* for memset */ 24#include <linux/string.h> /* for memset */
25#include <linux/nls.h> 25#include <linux/nls.h>
26#include <linux/udf_fs.h> 26#include <linux/crc-itu-t.h>
27 27
28#include "udf_sb.h" 28#include "udf_sb.h"
29 29
@@ -49,14 +49,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size)
49{ 49{
50 int usesize; 50 int usesize;
51 51
52 if ((!dest) || (!ptr) || (!size)) 52 if (!dest || !ptr || !size)
53 return -1; 53 return -1;
54 BUG_ON(size < 2);
54 55
55 memset(dest, 0, sizeof(struct ustr)); 56 usesize = min_t(size_t, ptr[size - 1], sizeof(dest->u_name));
56 usesize = (size > UDF_NAME_LEN) ? UDF_NAME_LEN : size; 57 usesize = min(usesize, size - 2);
57 dest->u_cmpID = ptr[0]; 58 dest->u_cmpID = ptr[0];
58 dest->u_len = ptr[size - 1]; 59 dest->u_len = usesize;
59 memcpy(dest->u_name, ptr + 1, usesize - 1); 60 memcpy(dest->u_name, ptr + 1, usesize);
61 memset(dest->u_name + usesize, 0, sizeof(dest->u_name) - usesize);
60 62
61 return 0; 63 return 0;
62} 64}
@@ -83,9 +85,6 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
83 * PURPOSE 85 * PURPOSE
84 * Convert OSTA Compressed Unicode to the UTF-8 equivalent. 86 * Convert OSTA Compressed Unicode to the UTF-8 equivalent.
85 * 87 *
86 * DESCRIPTION
87 * This routine is only called by udf_filldir().
88 *
89 * PRE-CONDITIONS 88 * PRE-CONDITIONS
90 * utf Pointer to UTF-8 output buffer. 89 * utf Pointer to UTF-8 output buffer.
91 * ocu Pointer to OSTA Compressed Unicode input buffer 90 * ocu Pointer to OSTA Compressed Unicode input buffer
@@ -99,43 +98,39 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize)
99 * November 12, 1997 - Andrew E. Mileski 98 * November 12, 1997 - Andrew E. Mileski
100 * Written, tested, and released. 99 * Written, tested, and released.
101 */ 100 */
102int udf_CS0toUTF8(struct ustr *utf_o, struct ustr *ocu_i) 101int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i)
103{ 102{
104 uint8_t *ocu; 103 const uint8_t *ocu;
105 uint32_t c;
106 uint8_t cmp_id, ocu_len; 104 uint8_t cmp_id, ocu_len;
107 int i; 105 int i;
108 106
109 ocu = ocu_i->u_name;
110
111 ocu_len = ocu_i->u_len; 107 ocu_len = ocu_i->u_len;
112 cmp_id = ocu_i->u_cmpID;
113 utf_o->u_len = 0;
114
115 if (ocu_len == 0) { 108 if (ocu_len == 0) {
116 memset(utf_o, 0, sizeof(struct ustr)); 109 memset(utf_o, 0, sizeof(struct ustr));
117 utf_o->u_cmpID = 0;
118 utf_o->u_len = 0;
119 return 0; 110 return 0;
120 } 111 }
121 112
122 if ((cmp_id != 8) && (cmp_id != 16)) { 113 cmp_id = ocu_i->u_cmpID;
114 if (cmp_id != 8 && cmp_id != 16) {
115 memset(utf_o, 0, sizeof(struct ustr));
123 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 116 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
124 cmp_id, ocu_i->u_name); 117 cmp_id, ocu_i->u_name);
125 return 0; 118 return 0;
126 } 119 }
127 120
121 ocu = ocu_i->u_name;
122 utf_o->u_len = 0;
128 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { 123 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
129 124
130 /* Expand OSTA compressed Unicode to Unicode */ 125 /* Expand OSTA compressed Unicode to Unicode */
131 c = ocu[i++]; 126 uint32_t c = ocu[i++];
132 if (cmp_id == 16) 127 if (cmp_id == 16)
133 c = (c << 8) | ocu[i++]; 128 c = (c << 8) | ocu[i++];
134 129
135 /* Compress Unicode to UTF-8 */ 130 /* Compress Unicode to UTF-8 */
136 if (c < 0x80U) { 131 if (c < 0x80U)
137 utf_o->u_name[utf_o->u_len++] = (uint8_t)c; 132 utf_o->u_name[utf_o->u_len++] = (uint8_t)c;
138 } else if (c < 0x800U) { 133 else if (c < 0x800U) {
139 utf_o->u_name[utf_o->u_len++] = 134 utf_o->u_name[utf_o->u_len++] =
140 (uint8_t)(0xc0 | (c >> 6)); 135 (uint8_t)(0xc0 | (c >> 6));
141 utf_o->u_name[utf_o->u_len++] = 136 utf_o->u_name[utf_o->u_len++] =
@@ -255,35 +250,32 @@ error_out:
255} 250}
256 251
257static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, 252static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o,
258 struct ustr *ocu_i) 253 const struct ustr *ocu_i)
259{ 254{
260 uint8_t *ocu; 255 const uint8_t *ocu;
261 uint32_t c;
262 uint8_t cmp_id, ocu_len; 256 uint8_t cmp_id, ocu_len;
263 int i; 257 int i;
264 258
265 ocu = ocu_i->u_name;
266 259
267 ocu_len = ocu_i->u_len; 260 ocu_len = ocu_i->u_len;
268 cmp_id = ocu_i->u_cmpID;
269 utf_o->u_len = 0;
270
271 if (ocu_len == 0) { 261 if (ocu_len == 0) {
272 memset(utf_o, 0, sizeof(struct ustr)); 262 memset(utf_o, 0, sizeof(struct ustr));
273 utf_o->u_cmpID = 0;
274 utf_o->u_len = 0;
275 return 0; 263 return 0;
276 } 264 }
277 265
278 if ((cmp_id != 8) && (cmp_id != 16)) { 266 cmp_id = ocu_i->u_cmpID;
267 if (cmp_id != 8 && cmp_id != 16) {
268 memset(utf_o, 0, sizeof(struct ustr));
279 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n", 269 printk(KERN_ERR "udf: unknown compression code (%d) stri=%s\n",
280 cmp_id, ocu_i->u_name); 270 cmp_id, ocu_i->u_name);
281 return 0; 271 return 0;
282 } 272 }
283 273
274 ocu = ocu_i->u_name;
275 utf_o->u_len = 0;
284 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) { 276 for (i = 0; (i < ocu_len) && (utf_o->u_len <= (UDF_NAME_LEN - 3));) {
285 /* Expand OSTA compressed Unicode to Unicode */ 277 /* Expand OSTA compressed Unicode to Unicode */
286 c = ocu[i++]; 278 uint32_t c = ocu[i++];
287 if (cmp_id == 16) 279 if (cmp_id == 16)
288 c = (c << 8) | ocu[i++]; 280 c = (c << 8) | ocu[i++];
289 281
@@ -463,7 +455,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
463 } else if (newIndex > 250) 455 } else if (newIndex > 250)
464 newIndex = 250; 456 newIndex = 250;
465 newName[newIndex++] = CRC_MARK; 457 newName[newIndex++] = CRC_MARK;
466 valueCRC = udf_crc(fidName, fidNameLen, 0); 458 valueCRC = crc_itu_t(0, fidName, fidNameLen);
467 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12]; 459 newName[newIndex++] = hexChar[(valueCRC & 0xf000) >> 12];
468 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8]; 460 newName[newIndex++] = hexChar[(valueCRC & 0x0f00) >> 8];
469 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4]; 461 newName[newIndex++] = hexChar[(valueCRC & 0x00f0) >> 4];
diff --git a/fs/utimes.c b/fs/utimes.c
index b18da9c0b97f..a2bef77dc9c9 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -2,6 +2,7 @@
2#include <linux/file.h> 2#include <linux/file.h>
3#include <linux/fs.h> 3#include <linux/fs.h>
4#include <linux/linkage.h> 4#include <linux/linkage.h>
5#include <linux/mount.h>
5#include <linux/namei.h> 6#include <linux/namei.h>
6#include <linux/sched.h> 7#include <linux/sched.h>
7#include <linux/stat.h> 8#include <linux/stat.h>
@@ -59,6 +60,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
59 struct inode *inode; 60 struct inode *inode;
60 struct iattr newattrs; 61 struct iattr newattrs;
61 struct file *f = NULL; 62 struct file *f = NULL;
63 struct vfsmount *mnt;
62 64
63 error = -EINVAL; 65 error = -EINVAL;
64 if (times && (!nsec_valid(times[0].tv_nsec) || 66 if (times && (!nsec_valid(times[0].tv_nsec) ||
@@ -79,18 +81,20 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
79 if (!f) 81 if (!f)
80 goto out; 82 goto out;
81 dentry = f->f_path.dentry; 83 dentry = f->f_path.dentry;
84 mnt = f->f_path.mnt;
82 } else { 85 } else {
83 error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd); 86 error = __user_walk_fd(dfd, filename, (flags & AT_SYMLINK_NOFOLLOW) ? 0 : LOOKUP_FOLLOW, &nd);
84 if (error) 87 if (error)
85 goto out; 88 goto out;
86 89
87 dentry = nd.path.dentry; 90 dentry = nd.path.dentry;
91 mnt = nd.path.mnt;
88 } 92 }
89 93
90 inode = dentry->d_inode; 94 inode = dentry->d_inode;
91 95
92 error = -EROFS; 96 error = mnt_want_write(mnt);
93 if (IS_RDONLY(inode)) 97 if (error)
94 goto dput_and_out; 98 goto dput_and_out;
95 99
96 /* Don't worry, the checks are done in inode_change_ok() */ 100 /* Don't worry, the checks are done in inode_change_ok() */
@@ -98,7 +102,7 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
98 if (times) { 102 if (times) {
99 error = -EPERM; 103 error = -EPERM;
100 if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) 104 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
101 goto dput_and_out; 105 goto mnt_drop_write_and_out;
102 106
103 if (times[0].tv_nsec == UTIME_OMIT) 107 if (times[0].tv_nsec == UTIME_OMIT)
104 newattrs.ia_valid &= ~ATTR_ATIME; 108 newattrs.ia_valid &= ~ATTR_ATIME;
@@ -118,22 +122,24 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags
118 } else { 122 } else {
119 error = -EACCES; 123 error = -EACCES;
120 if (IS_IMMUTABLE(inode)) 124 if (IS_IMMUTABLE(inode))
121 goto dput_and_out; 125 goto mnt_drop_write_and_out;
122 126
123 if (!is_owner_or_cap(inode)) { 127 if (!is_owner_or_cap(inode)) {
124 if (f) { 128 if (f) {
125 if (!(f->f_mode & FMODE_WRITE)) 129 if (!(f->f_mode & FMODE_WRITE))
126 goto dput_and_out; 130 goto mnt_drop_write_and_out;
127 } else { 131 } else {
128 error = vfs_permission(&nd, MAY_WRITE); 132 error = vfs_permission(&nd, MAY_WRITE);
129 if (error) 133 if (error)
130 goto dput_and_out; 134 goto mnt_drop_write_and_out;
131 } 135 }
132 } 136 }
133 } 137 }
134 mutex_lock(&inode->i_mutex); 138 mutex_lock(&inode->i_mutex);
135 error = notify_change(dentry, &newattrs); 139 error = notify_change(dentry, &newattrs);
136 mutex_unlock(&inode->i_mutex); 140 mutex_unlock(&inode->i_mutex);
141mnt_drop_write_and_out:
142 mnt_drop_write(mnt);
137dput_and_out: 143dput_and_out:
138 if (f) 144 if (f)
139 fput(f); 145 fput(f);
diff --git a/fs/xattr.c b/fs/xattr.c
index 3acab1615460..89a942f07e1b 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -11,6 +11,7 @@
11#include <linux/slab.h> 11#include <linux/slab.h>
12#include <linux/file.h> 12#include <linux/file.h>
13#include <linux/xattr.h> 13#include <linux/xattr.h>
14#include <linux/mount.h>
14#include <linux/namei.h> 15#include <linux/namei.h>
15#include <linux/security.h> 16#include <linux/security.h>
16#include <linux/syscalls.h> 17#include <linux/syscalls.h>
@@ -32,8 +33,6 @@ xattr_permission(struct inode *inode, const char *name, int mask)
32 * filesystem or on an immutable / append-only inode. 33 * filesystem or on an immutable / append-only inode.
33 */ 34 */
34 if (mask & MAY_WRITE) { 35 if (mask & MAY_WRITE) {
35 if (IS_RDONLY(inode))
36 return -EROFS;
37 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 36 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
38 return -EPERM; 37 return -EPERM;
39 } 38 }
@@ -262,7 +261,11 @@ sys_setxattr(char __user *path, char __user *name, void __user *value,
262 error = user_path_walk(path, &nd); 261 error = user_path_walk(path, &nd);
263 if (error) 262 if (error)
264 return error; 263 return error;
265 error = setxattr(nd.path.dentry, name, value, size, flags); 264 error = mnt_want_write(nd.path.mnt);
265 if (!error) {
266 error = setxattr(nd.path.dentry, name, value, size, flags);
267 mnt_drop_write(nd.path.mnt);
268 }
266 path_put(&nd.path); 269 path_put(&nd.path);
267 return error; 270 return error;
268} 271}
@@ -277,7 +280,11 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value,
277 error = user_path_walk_link(path, &nd); 280 error = user_path_walk_link(path, &nd);
278 if (error) 281 if (error)
279 return error; 282 return error;
280 error = setxattr(nd.path.dentry, name, value, size, flags); 283 error = mnt_want_write(nd.path.mnt);
284 if (!error) {
285 error = setxattr(nd.path.dentry, name, value, size, flags);
286 mnt_drop_write(nd.path.mnt);
287 }
281 path_put(&nd.path); 288 path_put(&nd.path);
282 return error; 289 return error;
283} 290}
@@ -295,7 +302,11 @@ sys_fsetxattr(int fd, char __user *name, void __user *value,
295 return error; 302 return error;
296 dentry = f->f_path.dentry; 303 dentry = f->f_path.dentry;
297 audit_inode(NULL, dentry); 304 audit_inode(NULL, dentry);
298 error = setxattr(dentry, name, value, size, flags); 305 error = mnt_want_write(f->f_path.mnt);
306 if (!error) {
307 error = setxattr(dentry, name, value, size, flags);
308 mnt_drop_write(f->f_path.mnt);
309 }
299 fput(f); 310 fput(f);
300 return error; 311 return error;
301} 312}
@@ -482,7 +493,11 @@ sys_removexattr(char __user *path, char __user *name)
482 error = user_path_walk(path, &nd); 493 error = user_path_walk(path, &nd);
483 if (error) 494 if (error)
484 return error; 495 return error;
485 error = removexattr(nd.path.dentry, name); 496 error = mnt_want_write(nd.path.mnt);
497 if (!error) {
498 error = removexattr(nd.path.dentry, name);
499 mnt_drop_write(nd.path.mnt);
500 }
486 path_put(&nd.path); 501 path_put(&nd.path);
487 return error; 502 return error;
488} 503}
@@ -496,7 +511,11 @@ sys_lremovexattr(char __user *path, char __user *name)
496 error = user_path_walk_link(path, &nd); 511 error = user_path_walk_link(path, &nd);
497 if (error) 512 if (error)
498 return error; 513 return error;
499 error = removexattr(nd.path.dentry, name); 514 error = mnt_want_write(nd.path.mnt);
515 if (!error) {
516 error = removexattr(nd.path.dentry, name);
517 mnt_drop_write(nd.path.mnt);
518 }
500 path_put(&nd.path); 519 path_put(&nd.path);
501 return error; 520 return error;
502} 521}
@@ -513,7 +532,11 @@ sys_fremovexattr(int fd, char __user *name)
513 return error; 532 return error;
514 dentry = f->f_path.dentry; 533 dentry = f->f_path.dentry;
515 audit_inode(NULL, dentry); 534 audit_inode(NULL, dentry);
516 error = removexattr(dentry, name); 535 error = mnt_want_write(f->f_path.mnt);
536 if (!error) {
537 error = removexattr(dentry, name);
538 mnt_drop_write(f->f_path.mnt);
539 }
517 fput(f); 540 fput(f);
518 return error; 541 return error;
519} 542}
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 35115bca036e..524021ff5436 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -35,18 +35,6 @@ config XFS_QUOTA
35 with or without the generic quota support enabled (CONFIG_QUOTA) - 35 with or without the generic quota support enabled (CONFIG_QUOTA) -
36 they are completely independent subsystems. 36 they are completely independent subsystems.
37 37
38config XFS_SECURITY
39 bool "XFS Security Label support"
40 depends on XFS_FS
41 help
42 Security labels support alternative access control models
43 implemented by security modules like SELinux. This option
44 enables an extended attribute namespace for inode security
45 labels in the XFS filesystem.
46
47 If you are not using a security module that requires using
48 extended attributes for inode security labels, say N.
49
50config XFS_POSIX_ACL 38config XFS_POSIX_ACL
51 bool "XFS POSIX ACL support" 39 bool "XFS POSIX ACL support"
52 depends on XFS_FS 40 depends on XFS_FS
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c
index e040f1ce1b6a..9b1bb17a0501 100644
--- a/fs/xfs/linux-2.6/kmem.c
+++ b/fs/xfs/linux-2.6/kmem.c
@@ -37,7 +37,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
37#ifdef DEBUG 37#ifdef DEBUG
38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) { 38 if (unlikely(!(flags & KM_LARGE) && (size > PAGE_SIZE))) {
39 printk(KERN_WARNING "Large %s attempt, size=%ld\n", 39 printk(KERN_WARNING "Large %s attempt, size=%ld\n",
40 __FUNCTION__, (long)size); 40 __func__, (long)size);
41 dump_stack(); 41 dump_stack();
42 } 42 }
43#endif 43#endif
@@ -52,7 +52,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags)
52 if (!(++retries % 100)) 52 if (!(++retries % 100))
53 printk(KERN_ERR "XFS: possible memory allocation " 53 printk(KERN_ERR "XFS: possible memory allocation "
54 "deadlock in %s (mode:0x%x)\n", 54 "deadlock in %s (mode:0x%x)\n",
55 __FUNCTION__, lflags); 55 __func__, lflags);
56 congestion_wait(WRITE, HZ/50); 56 congestion_wait(WRITE, HZ/50);
57 } while (1); 57 } while (1);
58} 58}
@@ -129,7 +129,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags)
129 if (!(++retries % 100)) 129 if (!(++retries % 100))
130 printk(KERN_ERR "XFS: possible memory allocation " 130 printk(KERN_ERR "XFS: possible memory allocation "
131 "deadlock in %s (mode:0x%x)\n", 131 "deadlock in %s (mode:0x%x)\n",
132 __FUNCTION__, lflags); 132 __func__, lflags);
133 congestion_wait(WRITE, HZ/50); 133 congestion_wait(WRITE, HZ/50);
134 } while (1); 134 } while (1);
135} 135}
diff --git a/fs/xfs/linux-2.6/sema.h b/fs/xfs/linux-2.6/sema.h
index 2009e6d922ce..3abe7e9ceb33 100644
--- a/fs/xfs/linux-2.6/sema.h
+++ b/fs/xfs/linux-2.6/sema.h
@@ -20,8 +20,8 @@
20 20
21#include <linux/time.h> 21#include <linux/time.h>
22#include <linux/wait.h> 22#include <linux/wait.h>
23#include <linux/semaphore.h>
23#include <asm/atomic.h> 24#include <asm/atomic.h>
24#include <asm/semaphore.h>
25 25
26/* 26/*
27 * sema_t structure just maps to struct semaphore in Linux kernel. 27 * sema_t structure just maps to struct semaphore in Linux kernel.
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index e0519529c26c..a55c3b26d840 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -243,8 +243,12 @@ xfs_end_bio_unwritten(
243 size_t size = ioend->io_size; 243 size_t size = ioend->io_size;
244 244
245 if (likely(!ioend->io_error)) { 245 if (likely(!ioend->io_error)) {
246 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) 246 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
247 xfs_iomap_write_unwritten(ip, offset, size); 247 int error;
248 error = xfs_iomap_write_unwritten(ip, offset, size);
249 if (error)
250 ioend->io_error = error;
251 }
248 xfs_setfilesize(ioend); 252 xfs_setfilesize(ioend);
249 } 253 }
250 xfs_destroy_ioend(ioend); 254 xfs_destroy_ioend(ioend);
@@ -1532,9 +1536,9 @@ xfs_vm_bmap(
1532 struct xfs_inode *ip = XFS_I(inode); 1536 struct xfs_inode *ip = XFS_I(inode);
1533 1537
1534 xfs_itrace_entry(XFS_I(inode)); 1538 xfs_itrace_entry(XFS_I(inode));
1535 xfs_rwlock(ip, VRWLOCK_READ); 1539 xfs_ilock(ip, XFS_IOLOCK_SHARED);
1536 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); 1540 xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
1537 xfs_rwunlock(ip, VRWLOCK_READ); 1541 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1538 return generic_block_bmap(mapping, block, xfs_get_blocks); 1542 return generic_block_bmap(mapping, block, xfs_get_blocks);
1539} 1543}
1540 1544
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e347bfd47c91..52f6846101d5 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -400,7 +400,7 @@ _xfs_buf_lookup_pages(
400 printk(KERN_ERR 400 printk(KERN_ERR
401 "XFS: possible memory allocation " 401 "XFS: possible memory allocation "
402 "deadlock in %s (mode:0x%x)\n", 402 "deadlock in %s (mode:0x%x)\n",
403 __FUNCTION__, gfp_mask); 403 __func__, gfp_mask);
404 404
405 XFS_STATS_INC(xb_page_retries); 405 XFS_STATS_INC(xb_page_retries);
406 xfsbufd_wakeup(0, gfp_mask); 406 xfsbufd_wakeup(0, gfp_mask);
@@ -598,7 +598,7 @@ xfs_buf_get_flags(
598 error = _xfs_buf_map_pages(bp, flags); 598 error = _xfs_buf_map_pages(bp, flags);
599 if (unlikely(error)) { 599 if (unlikely(error)) {
600 printk(KERN_WARNING "%s: failed to map pages\n", 600 printk(KERN_WARNING "%s: failed to map pages\n",
601 __FUNCTION__); 601 __func__);
602 goto no_buffer; 602 goto no_buffer;
603 } 603 }
604 } 604 }
@@ -778,7 +778,7 @@ xfs_buf_get_noaddr(
778 error = _xfs_buf_map_pages(bp, XBF_MAPPED); 778 error = _xfs_buf_map_pages(bp, XBF_MAPPED);
779 if (unlikely(error)) { 779 if (unlikely(error)) {
780 printk(KERN_WARNING "%s: failed to map pages\n", 780 printk(KERN_WARNING "%s: failed to map pages\n",
781 __FUNCTION__); 781 __func__);
782 goto fail_free_mem; 782 goto fail_free_mem;
783 } 783 }
784 784
@@ -1060,7 +1060,7 @@ xfs_buf_iostart(
1060 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC); 1060 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
1061 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC); 1061 bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
1062 xfs_buf_delwri_queue(bp, 1); 1062 xfs_buf_delwri_queue(bp, 1);
1063 return status; 1063 return 0;
1064 } 1064 }
1065 1065
1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \ 1066 bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index a3d207de48b8..841d7883528d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -387,11 +387,15 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
387 return error; 387 return error;
388} 388}
389 389
390static inline int xfs_bdwrite(void *mp, xfs_buf_t *bp) 390/*
391 * No error can be returned from xfs_buf_iostart for delwri
392 * buffers as they are queued and no I/O is issued.
393 */
394static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
391{ 395{
392 bp->b_strat = xfs_bdstrat_cb; 396 bp->b_strat = xfs_bdstrat_cb;
393 bp->b_fspriv3 = mp; 397 bp->b_fspriv3 = mp;
394 return xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC); 398 (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
395} 399}
396 400
397#define XFS_bdstrat(bp) xfs_buf_iorequest(bp) 401#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index e7f3da61c6c3..652721ce0ea5 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -30,7 +30,7 @@ typedef struct cred {
30extern struct cred *sys_cred; 30extern struct cred *sys_cred;
31 31
32/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */ 32/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
33static __inline int capable_cred(cred_t *cr, int cid) 33static inline int capable_cred(cred_t *cr, int cid)
34{ 34{
35 return (cr == sys_cred) ? 1 : capable(cid); 35 return (cr == sys_cred) ? 1 : capable(cid);
36} 36}
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index ca4f66c4de16..265f0168ab76 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -22,6 +22,7 @@
22#include "xfs_trans.h" 22#include "xfs_trans.h"
23#include "xfs_sb.h" 23#include "xfs_sb.h"
24#include "xfs_ag.h" 24#include "xfs_ag.h"
25#include "xfs_dir2.h"
25#include "xfs_dmapi.h" 26#include "xfs_dmapi.h"
26#include "xfs_mount.h" 27#include "xfs_mount.h"
27#include "xfs_export.h" 28#include "xfs_export.h"
@@ -30,8 +31,6 @@
30#include "xfs_inode.h" 31#include "xfs_inode.h"
31#include "xfs_vfsops.h" 32#include "xfs_vfsops.h"
32 33
33static struct dentry dotdot = { .d_name.name = "..", .d_name.len = 2, };
34
35/* 34/*
36 * Note that we only accept fileids which are long enough rather than allow 35 * Note that we only accept fileids which are long enough rather than allow
37 * the parent generation number to default to zero. XFS considers zero a 36 * the parent generation number to default to zero. XFS considers zero a
@@ -66,7 +65,7 @@ xfs_fs_encode_fh(
66 int len; 65 int len;
67 66
68 /* Directories don't need their parent encoded, they have ".." */ 67 /* Directories don't need their parent encoded, they have ".." */
69 if (S_ISDIR(inode->i_mode)) 68 if (S_ISDIR(inode->i_mode) || !connectable)
70 fileid_type = FILEID_INO32_GEN; 69 fileid_type = FILEID_INO32_GEN;
71 else 70 else
72 fileid_type = FILEID_INO32_GEN_PARENT; 71 fileid_type = FILEID_INO32_GEN_PARENT;
@@ -213,17 +212,16 @@ xfs_fs_get_parent(
213 struct dentry *child) 212 struct dentry *child)
214{ 213{
215 int error; 214 int error;
216 bhv_vnode_t *cvp; 215 struct xfs_inode *cip;
217 struct dentry *parent; 216 struct dentry *parent;
218 217
219 cvp = NULL; 218 error = xfs_lookup(XFS_I(child->d_inode), &xfs_name_dotdot, &cip);
220 error = xfs_lookup(XFS_I(child->d_inode), &dotdot, &cvp);
221 if (unlikely(error)) 219 if (unlikely(error))
222 return ERR_PTR(-error); 220 return ERR_PTR(-error);
223 221
224 parent = d_alloc_anon(vn_to_inode(cvp)); 222 parent = d_alloc_anon(cip->i_vnode);
225 if (unlikely(!parent)) { 223 if (unlikely(!parent)) {
226 VN_RELE(cvp); 224 iput(cip->i_vnode);
227 return ERR_PTR(-ENOMEM); 225 return ERR_PTR(-ENOMEM);
228 } 226 }
229 return parent; 227 return parent;
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index edab1ffbb163..05905246434d 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -469,16 +469,11 @@ xfs_file_open_exec(
469 struct inode *inode) 469 struct inode *inode)
470{ 470{
471 struct xfs_mount *mp = XFS_M(inode->i_sb); 471 struct xfs_mount *mp = XFS_M(inode->i_sb);
472 struct xfs_inode *ip = XFS_I(inode);
472 473
473 if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI)) { 474 if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) &&
474 if (DM_EVENT_ENABLED(XFS_I(inode), DM_EVENT_READ)) { 475 DM_EVENT_ENABLED(ip, DM_EVENT_READ))
475 bhv_vnode_t *vp = vn_from_inode(inode); 476 return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
476
477 return -XFS_SEND_DATA(mp, DM_EVENT_READ,
478 vp, 0, 0, 0, NULL);
479 }
480 }
481
482 return 0; 477 return 0;
483} 478}
484#endif /* HAVE_FOP_OPEN_EXEC */ 479#endif /* HAVE_FOP_OPEN_EXEC */
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index ac6d34cc355d..1eefe61f0e10 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -17,18 +17,7 @@
17 */ 17 */
18#include "xfs.h" 18#include "xfs.h"
19#include "xfs_vnodeops.h" 19#include "xfs_vnodeops.h"
20
21/*
22 * The following six includes are needed so that we can include
23 * xfs_inode.h. What a mess..
24 */
25#include "xfs_bmap_btree.h" 20#include "xfs_bmap_btree.h"
26#include "xfs_inum.h"
27#include "xfs_dir2.h"
28#include "xfs_dir2_sf.h"
29#include "xfs_attr_sf.h"
30#include "xfs_dinode.h"
31
32#include "xfs_inode.h" 21#include "xfs_inode.h"
33 22
34int fs_noerr(void) { return 0; } 23int fs_noerr(void) { return 0; }
@@ -42,11 +31,10 @@ xfs_tosspages(
42 xfs_off_t last, 31 xfs_off_t last,
43 int fiopt) 32 int fiopt)
44{ 33{
45 bhv_vnode_t *vp = XFS_ITOV(ip); 34 struct address_space *mapping = ip->i_vnode->i_mapping;
46 struct inode *inode = vn_to_inode(vp);
47 35
48 if (VN_CACHED(vp)) 36 if (mapping->nrpages)
49 truncate_inode_pages(inode->i_mapping, first); 37 truncate_inode_pages(mapping, first);
50} 38}
51 39
52int 40int
@@ -56,15 +44,14 @@ xfs_flushinval_pages(
56 xfs_off_t last, 44 xfs_off_t last,
57 int fiopt) 45 int fiopt)
58{ 46{
59 bhv_vnode_t *vp = XFS_ITOV(ip); 47 struct address_space *mapping = ip->i_vnode->i_mapping;
60 struct inode *inode = vn_to_inode(vp);
61 int ret = 0; 48 int ret = 0;
62 49
63 if (VN_CACHED(vp)) { 50 if (mapping->nrpages) {
64 xfs_iflags_clear(ip, XFS_ITRUNCATED); 51 xfs_iflags_clear(ip, XFS_ITRUNCATED);
65 ret = filemap_write_and_wait(inode->i_mapping); 52 ret = filemap_write_and_wait(mapping);
66 if (!ret) 53 if (!ret)
67 truncate_inode_pages(inode->i_mapping, first); 54 truncate_inode_pages(mapping, first);
68 } 55 }
69 return ret; 56 return ret;
70} 57}
@@ -77,17 +64,16 @@ xfs_flush_pages(
77 uint64_t flags, 64 uint64_t flags,
78 int fiopt) 65 int fiopt)
79{ 66{
80 bhv_vnode_t *vp = XFS_ITOV(ip); 67 struct address_space *mapping = ip->i_vnode->i_mapping;
81 struct inode *inode = vn_to_inode(vp);
82 int ret = 0; 68 int ret = 0;
83 int ret2; 69 int ret2;
84 70
85 if (VN_DIRTY(vp)) { 71 if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
86 xfs_iflags_clear(ip, XFS_ITRUNCATED); 72 xfs_iflags_clear(ip, XFS_ITRUNCATED);
87 ret = filemap_fdatawrite(inode->i_mapping); 73 ret = filemap_fdatawrite(mapping);
88 if (flags & XFS_B_ASYNC) 74 if (flags & XFS_B_ASYNC)
89 return ret; 75 return ret;
90 ret2 = filemap_fdatawait(inode->i_mapping); 76 ret2 = filemap_fdatawait(mapping);
91 if (!ret) 77 if (!ret)
92 ret = ret2; 78 ret = ret2;
93 } 79 }
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index f34bd010eb51..4ddb86b73c6b 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -535,8 +535,6 @@ xfs_attrmulti_attr_set(
535 char *kbuf; 535 char *kbuf;
536 int error = EFAULT; 536 int error = EFAULT;
537 537
538 if (IS_RDONLY(inode))
539 return -EROFS;
540 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 538 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
541 return EPERM; 539 return EPERM;
542 if (len > XATTR_SIZE_MAX) 540 if (len > XATTR_SIZE_MAX)
@@ -562,8 +560,6 @@ xfs_attrmulti_attr_remove(
562 char *name, 560 char *name,
563 __uint32_t flags) 561 __uint32_t flags)
564{ 562{
565 if (IS_RDONLY(inode))
566 return -EROFS;
567 if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) 563 if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
568 return EPERM; 564 return EPERM;
569 return xfs_attr_remove(XFS_I(inode), name, flags); 565 return xfs_attr_remove(XFS_I(inode), name, flags);
@@ -573,6 +569,7 @@ STATIC int
573xfs_attrmulti_by_handle( 569xfs_attrmulti_by_handle(
574 xfs_mount_t *mp, 570 xfs_mount_t *mp,
575 void __user *arg, 571 void __user *arg,
572 struct file *parfilp,
576 struct inode *parinode) 573 struct inode *parinode)
577{ 574{
578 int error; 575 int error;
@@ -626,13 +623,21 @@ xfs_attrmulti_by_handle(
626 &ops[i].am_length, ops[i].am_flags); 623 &ops[i].am_length, ops[i].am_flags);
627 break; 624 break;
628 case ATTR_OP_SET: 625 case ATTR_OP_SET:
626 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
627 if (ops[i].am_error)
628 break;
629 ops[i].am_error = xfs_attrmulti_attr_set(inode, 629 ops[i].am_error = xfs_attrmulti_attr_set(inode,
630 attr_name, ops[i].am_attrvalue, 630 attr_name, ops[i].am_attrvalue,
631 ops[i].am_length, ops[i].am_flags); 631 ops[i].am_length, ops[i].am_flags);
632 mnt_drop_write(parfilp->f_path.mnt);
632 break; 633 break;
633 case ATTR_OP_REMOVE: 634 case ATTR_OP_REMOVE:
635 ops[i].am_error = mnt_want_write(parfilp->f_path.mnt);
636 if (ops[i].am_error)
637 break;
634 ops[i].am_error = xfs_attrmulti_attr_remove(inode, 638 ops[i].am_error = xfs_attrmulti_attr_remove(inode,
635 attr_name, ops[i].am_flags); 639 attr_name, ops[i].am_flags);
640 mnt_drop_write(parfilp->f_path.mnt);
636 break; 641 break;
637 default: 642 default:
638 ops[i].am_error = EINVAL; 643 ops[i].am_error = EINVAL;
@@ -651,314 +656,6 @@ xfs_attrmulti_by_handle(
651 return -error; 656 return -error;
652} 657}
653 658
654/* prototypes for a few of the stack-hungry cases that have
655 * their own functions. Functions are defined after their use
656 * so gcc doesn't get fancy and inline them with -03 */
657
658STATIC int
659xfs_ioc_space(
660 struct xfs_inode *ip,
661 struct inode *inode,
662 struct file *filp,
663 int flags,
664 unsigned int cmd,
665 void __user *arg);
666
667STATIC int
668xfs_ioc_bulkstat(
669 xfs_mount_t *mp,
670 unsigned int cmd,
671 void __user *arg);
672
673STATIC int
674xfs_ioc_fsgeometry_v1(
675 xfs_mount_t *mp,
676 void __user *arg);
677
678STATIC int
679xfs_ioc_fsgeometry(
680 xfs_mount_t *mp,
681 void __user *arg);
682
683STATIC int
684xfs_ioc_xattr(
685 xfs_inode_t *ip,
686 struct file *filp,
687 unsigned int cmd,
688 void __user *arg);
689
690STATIC int
691xfs_ioc_fsgetxattr(
692 xfs_inode_t *ip,
693 int attr,
694 void __user *arg);
695
696STATIC int
697xfs_ioc_getbmap(
698 struct xfs_inode *ip,
699 int flags,
700 unsigned int cmd,
701 void __user *arg);
702
703STATIC int
704xfs_ioc_getbmapx(
705 struct xfs_inode *ip,
706 void __user *arg);
707
708int
709xfs_ioctl(
710 xfs_inode_t *ip,
711 struct file *filp,
712 int ioflags,
713 unsigned int cmd,
714 void __user *arg)
715{
716 struct inode *inode = filp->f_path.dentry->d_inode;
717 xfs_mount_t *mp = ip->i_mount;
718 int error;
719
720 xfs_itrace_entry(XFS_I(inode));
721 switch (cmd) {
722
723 case XFS_IOC_ALLOCSP:
724 case XFS_IOC_FREESP:
725 case XFS_IOC_RESVSP:
726 case XFS_IOC_UNRESVSP:
727 case XFS_IOC_ALLOCSP64:
728 case XFS_IOC_FREESP64:
729 case XFS_IOC_RESVSP64:
730 case XFS_IOC_UNRESVSP64:
731 /*
732 * Only allow the sys admin to reserve space unless
733 * unwritten extents are enabled.
734 */
735 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
736 !capable(CAP_SYS_ADMIN))
737 return -EPERM;
738
739 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
740
741 case XFS_IOC_DIOINFO: {
742 struct dioattr da;
743 xfs_buftarg_t *target =
744 XFS_IS_REALTIME_INODE(ip) ?
745 mp->m_rtdev_targp : mp->m_ddev_targp;
746
747 da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
748 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
749
750 if (copy_to_user(arg, &da, sizeof(da)))
751 return -XFS_ERROR(EFAULT);
752 return 0;
753 }
754
755 case XFS_IOC_FSBULKSTAT_SINGLE:
756 case XFS_IOC_FSBULKSTAT:
757 case XFS_IOC_FSINUMBERS:
758 return xfs_ioc_bulkstat(mp, cmd, arg);
759
760 case XFS_IOC_FSGEOMETRY_V1:
761 return xfs_ioc_fsgeometry_v1(mp, arg);
762
763 case XFS_IOC_FSGEOMETRY:
764 return xfs_ioc_fsgeometry(mp, arg);
765
766 case XFS_IOC_GETVERSION:
767 return put_user(inode->i_generation, (int __user *)arg);
768
769 case XFS_IOC_FSGETXATTR:
770 return xfs_ioc_fsgetxattr(ip, 0, arg);
771 case XFS_IOC_FSGETXATTRA:
772 return xfs_ioc_fsgetxattr(ip, 1, arg);
773 case XFS_IOC_GETXFLAGS:
774 case XFS_IOC_SETXFLAGS:
775 case XFS_IOC_FSSETXATTR:
776 return xfs_ioc_xattr(ip, filp, cmd, arg);
777
778 case XFS_IOC_FSSETDM: {
779 struct fsdmidata dmi;
780
781 if (copy_from_user(&dmi, arg, sizeof(dmi)))
782 return -XFS_ERROR(EFAULT);
783
784 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
785 dmi.fsd_dmstate);
786 return -error;
787 }
788
789 case XFS_IOC_GETBMAP:
790 case XFS_IOC_GETBMAPA:
791 return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
792
793 case XFS_IOC_GETBMAPX:
794 return xfs_ioc_getbmapx(ip, arg);
795
796 case XFS_IOC_FD_TO_HANDLE:
797 case XFS_IOC_PATH_TO_HANDLE:
798 case XFS_IOC_PATH_TO_FSHANDLE:
799 return xfs_find_handle(cmd, arg);
800
801 case XFS_IOC_OPEN_BY_HANDLE:
802 return xfs_open_by_handle(mp, arg, filp, inode);
803
804 case XFS_IOC_FSSETDM_BY_HANDLE:
805 return xfs_fssetdm_by_handle(mp, arg, inode);
806
807 case XFS_IOC_READLINK_BY_HANDLE:
808 return xfs_readlink_by_handle(mp, arg, inode);
809
810 case XFS_IOC_ATTRLIST_BY_HANDLE:
811 return xfs_attrlist_by_handle(mp, arg, inode);
812
813 case XFS_IOC_ATTRMULTI_BY_HANDLE:
814 return xfs_attrmulti_by_handle(mp, arg, inode);
815
816 case XFS_IOC_SWAPEXT: {
817 error = xfs_swapext((struct xfs_swapext __user *)arg);
818 return -error;
819 }
820
821 case XFS_IOC_FSCOUNTS: {
822 xfs_fsop_counts_t out;
823
824 error = xfs_fs_counts(mp, &out);
825 if (error)
826 return -error;
827
828 if (copy_to_user(arg, &out, sizeof(out)))
829 return -XFS_ERROR(EFAULT);
830 return 0;
831 }
832
833 case XFS_IOC_SET_RESBLKS: {
834 xfs_fsop_resblks_t inout;
835 __uint64_t in;
836
837 if (!capable(CAP_SYS_ADMIN))
838 return -EPERM;
839
840 if (copy_from_user(&inout, arg, sizeof(inout)))
841 return -XFS_ERROR(EFAULT);
842
843 /* input parameter is passed in resblks field of structure */
844 in = inout.resblks;
845 error = xfs_reserve_blocks(mp, &in, &inout);
846 if (error)
847 return -error;
848
849 if (copy_to_user(arg, &inout, sizeof(inout)))
850 return -XFS_ERROR(EFAULT);
851 return 0;
852 }
853
854 case XFS_IOC_GET_RESBLKS: {
855 xfs_fsop_resblks_t out;
856
857 if (!capable(CAP_SYS_ADMIN))
858 return -EPERM;
859
860 error = xfs_reserve_blocks(mp, NULL, &out);
861 if (error)
862 return -error;
863
864 if (copy_to_user(arg, &out, sizeof(out)))
865 return -XFS_ERROR(EFAULT);
866
867 return 0;
868 }
869
870 case XFS_IOC_FSGROWFSDATA: {
871 xfs_growfs_data_t in;
872
873 if (!capable(CAP_SYS_ADMIN))
874 return -EPERM;
875
876 if (copy_from_user(&in, arg, sizeof(in)))
877 return -XFS_ERROR(EFAULT);
878
879 error = xfs_growfs_data(mp, &in);
880 return -error;
881 }
882
883 case XFS_IOC_FSGROWFSLOG: {
884 xfs_growfs_log_t in;
885
886 if (!capable(CAP_SYS_ADMIN))
887 return -EPERM;
888
889 if (copy_from_user(&in, arg, sizeof(in)))
890 return -XFS_ERROR(EFAULT);
891
892 error = xfs_growfs_log(mp, &in);
893 return -error;
894 }
895
896 case XFS_IOC_FSGROWFSRT: {
897 xfs_growfs_rt_t in;
898
899 if (!capable(CAP_SYS_ADMIN))
900 return -EPERM;
901
902 if (copy_from_user(&in, arg, sizeof(in)))
903 return -XFS_ERROR(EFAULT);
904
905 error = xfs_growfs_rt(mp, &in);
906 return -error;
907 }
908
909 case XFS_IOC_FREEZE:
910 if (!capable(CAP_SYS_ADMIN))
911 return -EPERM;
912
913 if (inode->i_sb->s_frozen == SB_UNFROZEN)
914 freeze_bdev(inode->i_sb->s_bdev);
915 return 0;
916
917 case XFS_IOC_THAW:
918 if (!capable(CAP_SYS_ADMIN))
919 return -EPERM;
920 if (inode->i_sb->s_frozen != SB_UNFROZEN)
921 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
922 return 0;
923
924 case XFS_IOC_GOINGDOWN: {
925 __uint32_t in;
926
927 if (!capable(CAP_SYS_ADMIN))
928 return -EPERM;
929
930 if (get_user(in, (__uint32_t __user *)arg))
931 return -XFS_ERROR(EFAULT);
932
933 error = xfs_fs_goingdown(mp, in);
934 return -error;
935 }
936
937 case XFS_IOC_ERROR_INJECTION: {
938 xfs_error_injection_t in;
939
940 if (!capable(CAP_SYS_ADMIN))
941 return -EPERM;
942
943 if (copy_from_user(&in, arg, sizeof(in)))
944 return -XFS_ERROR(EFAULT);
945
946 error = xfs_errortag_add(in.errtag, mp);
947 return -error;
948 }
949
950 case XFS_IOC_ERROR_CLEARALL:
951 if (!capable(CAP_SYS_ADMIN))
952 return -EPERM;
953
954 error = xfs_errortag_clearall(mp, 1);
955 return -error;
956
957 default:
958 return -ENOTTY;
959 }
960}
961
962STATIC int 659STATIC int
963xfs_ioc_space( 660xfs_ioc_space(
964 struct xfs_inode *ip, 661 struct xfs_inode *ip,
@@ -1179,85 +876,85 @@ xfs_ioc_fsgetxattr(
1179} 876}
1180 877
1181STATIC int 878STATIC int
1182xfs_ioc_xattr( 879xfs_ioc_fssetxattr(
1183 xfs_inode_t *ip, 880 xfs_inode_t *ip,
1184 struct file *filp, 881 struct file *filp,
1185 unsigned int cmd,
1186 void __user *arg) 882 void __user *arg)
1187{ 883{
1188 struct fsxattr fa; 884 struct fsxattr fa;
1189 struct bhv_vattr *vattr; 885 struct bhv_vattr *vattr;
1190 int error = 0; 886 int error;
1191 int attr_flags; 887 int attr_flags;
1192 unsigned int flags; 888
889 if (copy_from_user(&fa, arg, sizeof(fa)))
890 return -EFAULT;
1193 891
1194 vattr = kmalloc(sizeof(*vattr), GFP_KERNEL); 892 vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
1195 if (unlikely(!vattr)) 893 if (unlikely(!vattr))
1196 return -ENOMEM; 894 return -ENOMEM;
1197 895
1198 switch (cmd) { 896 attr_flags = 0;
1199 case XFS_IOC_FSSETXATTR: { 897 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1200 if (copy_from_user(&fa, arg, sizeof(fa))) { 898 attr_flags |= ATTR_NONBLOCK;
1201 error = -EFAULT;
1202 break;
1203 }
1204 899
1205 attr_flags = 0; 900 vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID;
1206 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 901 vattr->va_xflags = fa.fsx_xflags;
1207 attr_flags |= ATTR_NONBLOCK; 902 vattr->va_extsize = fa.fsx_extsize;
903 vattr->va_projid = fa.fsx_projid;
1208 904
1209 vattr->va_mask = XFS_AT_XFLAGS | XFS_AT_EXTSIZE | XFS_AT_PROJID; 905 error = -xfs_setattr(ip, vattr, attr_flags, NULL);
1210 vattr->va_xflags = fa.fsx_xflags; 906 if (!error)
1211 vattr->va_extsize = fa.fsx_extsize; 907 vn_revalidate(XFS_ITOV(ip)); /* update flags */
1212 vattr->va_projid = fa.fsx_projid; 908 kfree(vattr);
909 return 0;
910}
1213 911
1214 error = xfs_setattr(ip, vattr, attr_flags, NULL); 912STATIC int
1215 if (likely(!error)) 913xfs_ioc_getxflags(
1216 vn_revalidate(XFS_ITOV(ip)); /* update flags */ 914 xfs_inode_t *ip,
1217 error = -error; 915 void __user *arg)
1218 break; 916{
1219 } 917 unsigned int flags;
1220 918
1221 case XFS_IOC_GETXFLAGS: { 919 flags = xfs_di2lxflags(ip->i_d.di_flags);
1222 flags = xfs_di2lxflags(ip->i_d.di_flags); 920 if (copy_to_user(arg, &flags, sizeof(flags)))
1223 if (copy_to_user(arg, &flags, sizeof(flags))) 921 return -EFAULT;
1224 error = -EFAULT; 922 return 0;
1225 break; 923}
1226 }
1227 924
1228 case XFS_IOC_SETXFLAGS: { 925STATIC int
1229 if (copy_from_user(&flags, arg, sizeof(flags))) { 926xfs_ioc_setxflags(
1230 error = -EFAULT; 927 xfs_inode_t *ip,
1231 break; 928 struct file *filp,
1232 } 929 void __user *arg)
930{
931 struct bhv_vattr *vattr;
932 unsigned int flags;
933 int attr_flags;
934 int error;
1233 935
1234 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 936 if (copy_from_user(&flags, arg, sizeof(flags)))
1235 FS_NOATIME_FL | FS_NODUMP_FL | \ 937 return -EFAULT;
1236 FS_SYNC_FL)) {
1237 error = -EOPNOTSUPP;
1238 break;
1239 }
1240 938
1241 attr_flags = 0; 939 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
1242 if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) 940 FS_NOATIME_FL | FS_NODUMP_FL | \
1243 attr_flags |= ATTR_NONBLOCK; 941 FS_SYNC_FL))
942 return -EOPNOTSUPP;
1244 943
1245 vattr->va_mask = XFS_AT_XFLAGS; 944 vattr = kmalloc(sizeof(*vattr), GFP_KERNEL);
1246 vattr->va_xflags = xfs_merge_ioc_xflags(flags, 945 if (unlikely(!vattr))
1247 xfs_ip2xflags(ip)); 946 return -ENOMEM;
1248 947
1249 error = xfs_setattr(ip, vattr, attr_flags, NULL); 948 attr_flags = 0;
1250 if (likely(!error)) 949 if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
1251 vn_revalidate(XFS_ITOV(ip)); /* update flags */ 950 attr_flags |= ATTR_NONBLOCK;
1252 error = -error;
1253 break;
1254 }
1255 951
1256 default: 952 vattr->va_mask = XFS_AT_XFLAGS;
1257 error = -ENOTTY; 953 vattr->va_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));
1258 break;
1259 }
1260 954
955 error = -xfs_setattr(ip, vattr, attr_flags, NULL);
956 if (likely(!error))
957 vn_revalidate(XFS_ITOV(ip)); /* update flags */
1261 kfree(vattr); 958 kfree(vattr);
1262 return error; 959 return error;
1263} 960}
@@ -1332,3 +1029,259 @@ xfs_ioc_getbmapx(
1332 1029
1333 return 0; 1030 return 0;
1334} 1031}
1032
1033int
1034xfs_ioctl(
1035 xfs_inode_t *ip,
1036 struct file *filp,
1037 int ioflags,
1038 unsigned int cmd,
1039 void __user *arg)
1040{
1041 struct inode *inode = filp->f_path.dentry->d_inode;
1042 xfs_mount_t *mp = ip->i_mount;
1043 int error;
1044
1045 xfs_itrace_entry(XFS_I(inode));
1046 switch (cmd) {
1047
1048 case XFS_IOC_ALLOCSP:
1049 case XFS_IOC_FREESP:
1050 case XFS_IOC_RESVSP:
1051 case XFS_IOC_UNRESVSP:
1052 case XFS_IOC_ALLOCSP64:
1053 case XFS_IOC_FREESP64:
1054 case XFS_IOC_RESVSP64:
1055 case XFS_IOC_UNRESVSP64:
1056 /*
1057 * Only allow the sys admin to reserve space unless
1058 * unwritten extents are enabled.
1059 */
1060 if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
1061 !capable(CAP_SYS_ADMIN))
1062 return -EPERM;
1063
1064 return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
1065
1066 case XFS_IOC_DIOINFO: {
1067 struct dioattr da;
1068 xfs_buftarg_t *target =
1069 XFS_IS_REALTIME_INODE(ip) ?
1070 mp->m_rtdev_targp : mp->m_ddev_targp;
1071
1072 da.d_mem = da.d_miniosz = 1 << target->bt_sshift;
1073 da.d_maxiosz = INT_MAX & ~(da.d_miniosz - 1);
1074
1075 if (copy_to_user(arg, &da, sizeof(da)))
1076 return -XFS_ERROR(EFAULT);
1077 return 0;
1078 }
1079
1080 case XFS_IOC_FSBULKSTAT_SINGLE:
1081 case XFS_IOC_FSBULKSTAT:
1082 case XFS_IOC_FSINUMBERS:
1083 return xfs_ioc_bulkstat(mp, cmd, arg);
1084
1085 case XFS_IOC_FSGEOMETRY_V1:
1086 return xfs_ioc_fsgeometry_v1(mp, arg);
1087
1088 case XFS_IOC_FSGEOMETRY:
1089 return xfs_ioc_fsgeometry(mp, arg);
1090
1091 case XFS_IOC_GETVERSION:
1092 return put_user(inode->i_generation, (int __user *)arg);
1093
1094 case XFS_IOC_FSGETXATTR:
1095 return xfs_ioc_fsgetxattr(ip, 0, arg);
1096 case XFS_IOC_FSGETXATTRA:
1097 return xfs_ioc_fsgetxattr(ip, 1, arg);
1098 case XFS_IOC_FSSETXATTR:
1099 return xfs_ioc_fssetxattr(ip, filp, arg);
1100 case XFS_IOC_GETXFLAGS:
1101 return xfs_ioc_getxflags(ip, arg);
1102 case XFS_IOC_SETXFLAGS:
1103 return xfs_ioc_setxflags(ip, filp, arg);
1104
1105 case XFS_IOC_FSSETDM: {
1106 struct fsdmidata dmi;
1107
1108 if (copy_from_user(&dmi, arg, sizeof(dmi)))
1109 return -XFS_ERROR(EFAULT);
1110
1111 error = xfs_set_dmattrs(ip, dmi.fsd_dmevmask,
1112 dmi.fsd_dmstate);
1113 return -error;
1114 }
1115
1116 case XFS_IOC_GETBMAP:
1117 case XFS_IOC_GETBMAPA:
1118 return xfs_ioc_getbmap(ip, ioflags, cmd, arg);
1119
1120 case XFS_IOC_GETBMAPX:
1121 return xfs_ioc_getbmapx(ip, arg);
1122
1123 case XFS_IOC_FD_TO_HANDLE:
1124 case XFS_IOC_PATH_TO_HANDLE:
1125 case XFS_IOC_PATH_TO_FSHANDLE:
1126 return xfs_find_handle(cmd, arg);
1127
1128 case XFS_IOC_OPEN_BY_HANDLE:
1129 return xfs_open_by_handle(mp, arg, filp, inode);
1130
1131 case XFS_IOC_FSSETDM_BY_HANDLE:
1132 return xfs_fssetdm_by_handle(mp, arg, inode);
1133
1134 case XFS_IOC_READLINK_BY_HANDLE:
1135 return xfs_readlink_by_handle(mp, arg, inode);
1136
1137 case XFS_IOC_ATTRLIST_BY_HANDLE:
1138 return xfs_attrlist_by_handle(mp, arg, inode);
1139
1140 case XFS_IOC_ATTRMULTI_BY_HANDLE:
1141 return xfs_attrmulti_by_handle(mp, arg, filp, inode);
1142
1143 case XFS_IOC_SWAPEXT: {
1144 error = xfs_swapext((struct xfs_swapext __user *)arg);
1145 return -error;
1146 }
1147
1148 case XFS_IOC_FSCOUNTS: {
1149 xfs_fsop_counts_t out;
1150
1151 error = xfs_fs_counts(mp, &out);
1152 if (error)
1153 return -error;
1154
1155 if (copy_to_user(arg, &out, sizeof(out)))
1156 return -XFS_ERROR(EFAULT);
1157 return 0;
1158 }
1159
1160 case XFS_IOC_SET_RESBLKS: {
1161 xfs_fsop_resblks_t inout;
1162 __uint64_t in;
1163
1164 if (!capable(CAP_SYS_ADMIN))
1165 return -EPERM;
1166
1167 if (copy_from_user(&inout, arg, sizeof(inout)))
1168 return -XFS_ERROR(EFAULT);
1169
1170 /* input parameter is passed in resblks field of structure */
1171 in = inout.resblks;
1172 error = xfs_reserve_blocks(mp, &in, &inout);
1173 if (error)
1174 return -error;
1175
1176 if (copy_to_user(arg, &inout, sizeof(inout)))
1177 return -XFS_ERROR(EFAULT);
1178 return 0;
1179 }
1180
1181 case XFS_IOC_GET_RESBLKS: {
1182 xfs_fsop_resblks_t out;
1183
1184 if (!capable(CAP_SYS_ADMIN))
1185 return -EPERM;
1186
1187 error = xfs_reserve_blocks(mp, NULL, &out);
1188 if (error)
1189 return -error;
1190
1191 if (copy_to_user(arg, &out, sizeof(out)))
1192 return -XFS_ERROR(EFAULT);
1193
1194 return 0;
1195 }
1196
1197 case XFS_IOC_FSGROWFSDATA: {
1198 xfs_growfs_data_t in;
1199
1200 if (!capable(CAP_SYS_ADMIN))
1201 return -EPERM;
1202
1203 if (copy_from_user(&in, arg, sizeof(in)))
1204 return -XFS_ERROR(EFAULT);
1205
1206 error = xfs_growfs_data(mp, &in);
1207 return -error;
1208 }
1209
1210 case XFS_IOC_FSGROWFSLOG: {
1211 xfs_growfs_log_t in;
1212
1213 if (!capable(CAP_SYS_ADMIN))
1214 return -EPERM;
1215
1216 if (copy_from_user(&in, arg, sizeof(in)))
1217 return -XFS_ERROR(EFAULT);
1218
1219 error = xfs_growfs_log(mp, &in);
1220 return -error;
1221 }
1222
1223 case XFS_IOC_FSGROWFSRT: {
1224 xfs_growfs_rt_t in;
1225
1226 if (!capable(CAP_SYS_ADMIN))
1227 return -EPERM;
1228
1229 if (copy_from_user(&in, arg, sizeof(in)))
1230 return -XFS_ERROR(EFAULT);
1231
1232 error = xfs_growfs_rt(mp, &in);
1233 return -error;
1234 }
1235
1236 case XFS_IOC_FREEZE:
1237 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM;
1239
1240 if (inode->i_sb->s_frozen == SB_UNFROZEN)
1241 freeze_bdev(inode->i_sb->s_bdev);
1242 return 0;
1243
1244 case XFS_IOC_THAW:
1245 if (!capable(CAP_SYS_ADMIN))
1246 return -EPERM;
1247 if (inode->i_sb->s_frozen != SB_UNFROZEN)
1248 thaw_bdev(inode->i_sb->s_bdev, inode->i_sb);
1249 return 0;
1250
1251 case XFS_IOC_GOINGDOWN: {
1252 __uint32_t in;
1253
1254 if (!capable(CAP_SYS_ADMIN))
1255 return -EPERM;
1256
1257 if (get_user(in, (__uint32_t __user *)arg))
1258 return -XFS_ERROR(EFAULT);
1259
1260 error = xfs_fs_goingdown(mp, in);
1261 return -error;
1262 }
1263
1264 case XFS_IOC_ERROR_INJECTION: {
1265 xfs_error_injection_t in;
1266
1267 if (!capable(CAP_SYS_ADMIN))
1268 return -EPERM;
1269
1270 if (copy_from_user(&in, arg, sizeof(in)))
1271 return -XFS_ERROR(EFAULT);
1272
1273 error = xfs_errortag_add(in.errtag, mp);
1274 return -error;
1275 }
1276
1277 case XFS_IOC_ERROR_CLEARALL:
1278 if (!capable(CAP_SYS_ADMIN))
1279 return -EPERM;
1280
1281 error = xfs_errortag_clearall(mp, 1);
1282 return -error;
1283
1284 default:
1285 return -ENOTTY;
1286 }
1287}
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index cc4abd3daa49..a1237dad6430 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -62,12 +62,11 @@ void
62xfs_synchronize_atime( 62xfs_synchronize_atime(
63 xfs_inode_t *ip) 63 xfs_inode_t *ip)
64{ 64{
65 bhv_vnode_t *vp; 65 struct inode *inode = ip->i_vnode;
66 66
67 vp = XFS_ITOV_NULL(ip); 67 if (inode) {
68 if (vp) { 68 ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
69 ip->i_d.di_atime.t_sec = (__int32_t)vp->i_atime.tv_sec; 69 ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
70 ip->i_d.di_atime.t_nsec = (__int32_t)vp->i_atime.tv_nsec;
71 } 70 }
72} 71}
73 72
@@ -80,11 +79,10 @@ void
80xfs_mark_inode_dirty_sync( 79xfs_mark_inode_dirty_sync(
81 xfs_inode_t *ip) 80 xfs_inode_t *ip)
82{ 81{
83 bhv_vnode_t *vp; 82 struct inode *inode = ip->i_vnode;
84 83
85 vp = XFS_ITOV_NULL(ip); 84 if (inode)
86 if (vp) 85 mark_inode_dirty_sync(inode);
87 mark_inode_dirty_sync(vn_to_inode(vp));
88} 86}
89 87
90/* 88/*
@@ -157,13 +155,6 @@ xfs_ichgtime_fast(
157 */ 155 */
158 ASSERT((flags & XFS_ICHGTIME_ACC) == 0); 156 ASSERT((flags & XFS_ICHGTIME_ACC) == 0);
159 157
160 /*
161 * We're not supposed to change timestamps in readonly-mounted
162 * filesystems. Throw it away if anyone asks us.
163 */
164 if (unlikely(IS_RDONLY(inode)))
165 return;
166
167 if (flags & XFS_ICHGTIME_MOD) { 158 if (flags & XFS_ICHGTIME_MOD) {
168 tvp = &inode->i_mtime; 159 tvp = &inode->i_mtime;
169 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec; 160 ip->i_d.di_mtime.t_sec = (__int32_t)tvp->tv_sec;
@@ -215,66 +206,62 @@ xfs_validate_fields(
215 */ 206 */
216STATIC int 207STATIC int
217xfs_init_security( 208xfs_init_security(
218 bhv_vnode_t *vp, 209 struct inode *inode,
219 struct inode *dir) 210 struct inode *dir)
220{ 211{
221 struct inode *ip = vn_to_inode(vp); 212 struct xfs_inode *ip = XFS_I(inode);
222 size_t length; 213 size_t length;
223 void *value; 214 void *value;
224 char *name; 215 char *name;
225 int error; 216 int error;
226 217
227 error = security_inode_init_security(ip, dir, &name, &value, &length); 218 error = security_inode_init_security(inode, dir, &name,
219 &value, &length);
228 if (error) { 220 if (error) {
229 if (error == -EOPNOTSUPP) 221 if (error == -EOPNOTSUPP)
230 return 0; 222 return 0;
231 return -error; 223 return -error;
232 } 224 }
233 225
234 error = xfs_attr_set(XFS_I(ip), name, value, 226 error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
235 length, ATTR_SECURE);
236 if (!error) 227 if (!error)
237 xfs_iflags_set(XFS_I(ip), XFS_IMODIFIED); 228 xfs_iflags_set(ip, XFS_IMODIFIED);
238 229
239 kfree(name); 230 kfree(name);
240 kfree(value); 231 kfree(value);
241 return error; 232 return error;
242} 233}
243 234
244/* 235static void
245 * Determine whether a process has a valid fs_struct (kernel daemons 236xfs_dentry_to_name(
246 * like knfsd don't have an fs_struct). 237 struct xfs_name *namep,
247 * 238 struct dentry *dentry)
248 * XXX(hch): nfsd is broken, better fix it instead.
249 */
250STATIC_INLINE int
251xfs_has_fs_struct(struct task_struct *task)
252{ 239{
253 return (task->fs != init_task.fs); 240 namep->name = dentry->d_name.name;
241 namep->len = dentry->d_name.len;
254} 242}
255 243
256STATIC void 244STATIC void
257xfs_cleanup_inode( 245xfs_cleanup_inode(
258 struct inode *dir, 246 struct inode *dir,
259 bhv_vnode_t *vp, 247 struct inode *inode,
260 struct dentry *dentry, 248 struct dentry *dentry,
261 int mode) 249 int mode)
262{ 250{
263 struct dentry teardown = {}; 251 struct xfs_name teardown;
264 252
265 /* Oh, the horror. 253 /* Oh, the horror.
266 * If we can't add the ACL or we fail in 254 * If we can't add the ACL or we fail in
267 * xfs_init_security we must back out. 255 * xfs_init_security we must back out.
268 * ENOSPC can hit here, among other things. 256 * ENOSPC can hit here, among other things.
269 */ 257 */
270 teardown.d_inode = vn_to_inode(vp); 258 xfs_dentry_to_name(&teardown, dentry);
271 teardown.d_name = dentry->d_name;
272 259
273 if (S_ISDIR(mode)) 260 if (S_ISDIR(mode))
274 xfs_rmdir(XFS_I(dir), &teardown); 261 xfs_rmdir(XFS_I(dir), &teardown, XFS_I(inode));
275 else 262 else
276 xfs_remove(XFS_I(dir), &teardown); 263 xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
277 VN_RELE(vp); 264 iput(inode);
278} 265}
279 266
280STATIC int 267STATIC int
@@ -284,9 +271,10 @@ xfs_vn_mknod(
284 int mode, 271 int mode,
285 dev_t rdev) 272 dev_t rdev)
286{ 273{
287 struct inode *ip; 274 struct inode *inode;
288 bhv_vnode_t *vp = NULL, *dvp = vn_from_inode(dir); 275 struct xfs_inode *ip = NULL;
289 xfs_acl_t *default_acl = NULL; 276 xfs_acl_t *default_acl = NULL;
277 struct xfs_name name;
290 attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS; 278 attrexists_t test_default_acl = _ACL_DEFAULT_EXISTS;
291 int error; 279 int error;
292 280
@@ -297,59 +285,67 @@ xfs_vn_mknod(
297 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff)) 285 if (unlikely(!sysv_valid_dev(rdev) || MAJOR(rdev) & ~0x1ff))
298 return -EINVAL; 286 return -EINVAL;
299 287
300 if (unlikely(test_default_acl && test_default_acl(dvp))) { 288 if (test_default_acl && test_default_acl(dir)) {
301 if (!_ACL_ALLOC(default_acl)) { 289 if (!_ACL_ALLOC(default_acl)) {
302 return -ENOMEM; 290 return -ENOMEM;
303 } 291 }
304 if (!_ACL_GET_DEFAULT(dvp, default_acl)) { 292 if (!_ACL_GET_DEFAULT(dir, default_acl)) {
305 _ACL_FREE(default_acl); 293 _ACL_FREE(default_acl);
306 default_acl = NULL; 294 default_acl = NULL;
307 } 295 }
308 } 296 }
309 297
310 if (IS_POSIXACL(dir) && !default_acl && xfs_has_fs_struct(current)) 298 xfs_dentry_to_name(&name, dentry);
299
300 if (IS_POSIXACL(dir) && !default_acl)
311 mode &= ~current->fs->umask; 301 mode &= ~current->fs->umask;
312 302
313 switch (mode & S_IFMT) { 303 switch (mode & S_IFMT) {
314 case S_IFCHR: case S_IFBLK: case S_IFIFO: case S_IFSOCK: 304 case S_IFCHR:
305 case S_IFBLK:
306 case S_IFIFO:
307 case S_IFSOCK:
315 rdev = sysv_encode_dev(rdev); 308 rdev = sysv_encode_dev(rdev);
316 case S_IFREG: 309 case S_IFREG:
317 error = xfs_create(XFS_I(dir), dentry, mode, rdev, &vp, NULL); 310 error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL);
318 break; 311 break;
319 case S_IFDIR: 312 case S_IFDIR:
320 error = xfs_mkdir(XFS_I(dir), dentry, mode, &vp, NULL); 313 error = xfs_mkdir(XFS_I(dir), &name, mode, &ip, NULL);
321 break; 314 break;
322 default: 315 default:
323 error = EINVAL; 316 error = EINVAL;
324 break; 317 break;
325 } 318 }
326 319
327 if (unlikely(!error)) { 320 if (unlikely(error))
328 error = xfs_init_security(vp, dir); 321 goto out_free_acl;
329 if (error)
330 xfs_cleanup_inode(dir, vp, dentry, mode);
331 }
332 322
333 if (unlikely(default_acl)) { 323 inode = ip->i_vnode;
334 if (!error) { 324
335 error = _ACL_INHERIT(vp, mode, default_acl); 325 error = xfs_init_security(inode, dir);
336 if (!error) 326 if (unlikely(error))
337 xfs_iflags_set(XFS_I(vp), XFS_IMODIFIED); 327 goto out_cleanup_inode;
338 else 328
339 xfs_cleanup_inode(dir, vp, dentry, mode); 329 if (default_acl) {
340 } 330 error = _ACL_INHERIT(inode, mode, default_acl);
331 if (unlikely(error))
332 goto out_cleanup_inode;
333 xfs_iflags_set(ip, XFS_IMODIFIED);
341 _ACL_FREE(default_acl); 334 _ACL_FREE(default_acl);
342 } 335 }
343 336
344 if (likely(!error)) {
345 ASSERT(vp);
346 ip = vn_to_inode(vp);
347 337
348 if (S_ISDIR(mode)) 338 if (S_ISDIR(mode))
349 xfs_validate_fields(ip); 339 xfs_validate_fields(inode);
350 d_instantiate(dentry, ip); 340 d_instantiate(dentry, inode);
351 xfs_validate_fields(dir); 341 xfs_validate_fields(dir);
352 } 342 return -error;
343
344 out_cleanup_inode:
345 xfs_cleanup_inode(dir, inode, dentry, mode);
346 out_free_acl:
347 if (default_acl)
348 _ACL_FREE(default_acl);
353 return -error; 349 return -error;
354} 350}
355 351
@@ -378,13 +374,15 @@ xfs_vn_lookup(
378 struct dentry *dentry, 374 struct dentry *dentry,
379 struct nameidata *nd) 375 struct nameidata *nd)
380{ 376{
381 bhv_vnode_t *cvp; 377 struct xfs_inode *cip;
378 struct xfs_name name;
382 int error; 379 int error;
383 380
384 if (dentry->d_name.len >= MAXNAMELEN) 381 if (dentry->d_name.len >= MAXNAMELEN)
385 return ERR_PTR(-ENAMETOOLONG); 382 return ERR_PTR(-ENAMETOOLONG);
386 383
387 error = xfs_lookup(XFS_I(dir), dentry, &cvp); 384 xfs_dentry_to_name(&name, dentry);
385 error = xfs_lookup(XFS_I(dir), &name, &cip);
388 if (unlikely(error)) { 386 if (unlikely(error)) {
389 if (unlikely(error != ENOENT)) 387 if (unlikely(error != ENOENT))
390 return ERR_PTR(-error); 388 return ERR_PTR(-error);
@@ -392,7 +390,7 @@ xfs_vn_lookup(
392 return NULL; 390 return NULL;
393 } 391 }
394 392
395 return d_splice_alias(vn_to_inode(cvp), dentry); 393 return d_splice_alias(cip->i_vnode, dentry);
396} 394}
397 395
398STATIC int 396STATIC int
@@ -401,23 +399,24 @@ xfs_vn_link(
401 struct inode *dir, 399 struct inode *dir,
402 struct dentry *dentry) 400 struct dentry *dentry)
403{ 401{
404 struct inode *ip; /* inode of guy being linked to */ 402 struct inode *inode; /* inode of guy being linked to */
405 bhv_vnode_t *vp; /* vp of name being linked */ 403 struct xfs_name name;
406 int error; 404 int error;
407 405
408 ip = old_dentry->d_inode; /* inode being linked to */ 406 inode = old_dentry->d_inode;
409 vp = vn_from_inode(ip); 407 xfs_dentry_to_name(&name, dentry);
410 408
411 VN_HOLD(vp); 409 igrab(inode);
412 error = xfs_link(XFS_I(dir), vp, dentry); 410 error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
413 if (unlikely(error)) { 411 if (unlikely(error)) {
414 VN_RELE(vp); 412 iput(inode);
415 } else { 413 return -error;
416 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
417 xfs_validate_fields(ip);
418 d_instantiate(dentry, ip);
419 } 414 }
420 return -error; 415
416 xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
417 xfs_validate_fields(inode);
418 d_instantiate(dentry, inode);
419 return 0;
421} 420}
422 421
423STATIC int 422STATIC int
@@ -426,11 +425,13 @@ xfs_vn_unlink(
426 struct dentry *dentry) 425 struct dentry *dentry)
427{ 426{
428 struct inode *inode; 427 struct inode *inode;
428 struct xfs_name name;
429 int error; 429 int error;
430 430
431 inode = dentry->d_inode; 431 inode = dentry->d_inode;
432 xfs_dentry_to_name(&name, dentry);
432 433
433 error = xfs_remove(XFS_I(dir), dentry); 434 error = xfs_remove(XFS_I(dir), &name, XFS_I(inode));
434 if (likely(!error)) { 435 if (likely(!error)) {
435 xfs_validate_fields(dir); /* size needs update */ 436 xfs_validate_fields(dir); /* size needs update */
436 xfs_validate_fields(inode); 437 xfs_validate_fields(inode);
@@ -444,29 +445,34 @@ xfs_vn_symlink(
444 struct dentry *dentry, 445 struct dentry *dentry,
445 const char *symname) 446 const char *symname)
446{ 447{
447 struct inode *ip; 448 struct inode *inode;
448 bhv_vnode_t *cvp; /* used to lookup symlink to put in dentry */ 449 struct xfs_inode *cip = NULL;
450 struct xfs_name name;
449 int error; 451 int error;
450 mode_t mode; 452 mode_t mode;
451 453
452 cvp = NULL;
453
454 mode = S_IFLNK | 454 mode = S_IFLNK |
455 (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO); 455 (irix_symlink_mode ? 0777 & ~current->fs->umask : S_IRWXUGO);
456 xfs_dentry_to_name(&name, dentry);
456 457
457 error = xfs_symlink(XFS_I(dir), dentry, (char *)symname, mode, 458 error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL);
458 &cvp, NULL); 459 if (unlikely(error))
459 if (likely(!error && cvp)) { 460 goto out;
460 error = xfs_init_security(cvp, dir); 461
461 if (likely(!error)) { 462 inode = cip->i_vnode;
462 ip = vn_to_inode(cvp); 463
463 d_instantiate(dentry, ip); 464 error = xfs_init_security(inode, dir);
464 xfs_validate_fields(dir); 465 if (unlikely(error))
465 xfs_validate_fields(ip); 466 goto out_cleanup_inode;
466 } else { 467
467 xfs_cleanup_inode(dir, cvp, dentry, 0); 468 d_instantiate(dentry, inode);
468 } 469 xfs_validate_fields(dir);
469 } 470 xfs_validate_fields(inode);
471 return 0;
472
473 out_cleanup_inode:
474 xfs_cleanup_inode(dir, inode, dentry, 0);
475 out:
470 return -error; 476 return -error;
471} 477}
472 478
@@ -476,9 +482,12 @@ xfs_vn_rmdir(
476 struct dentry *dentry) 482 struct dentry *dentry)
477{ 483{
478 struct inode *inode = dentry->d_inode; 484 struct inode *inode = dentry->d_inode;
485 struct xfs_name name;
479 int error; 486 int error;
480 487
481 error = xfs_rmdir(XFS_I(dir), dentry); 488 xfs_dentry_to_name(&name, dentry);
489
490 error = xfs_rmdir(XFS_I(dir), &name, XFS_I(inode));
482 if (likely(!error)) { 491 if (likely(!error)) {
483 xfs_validate_fields(inode); 492 xfs_validate_fields(inode);
484 xfs_validate_fields(dir); 493 xfs_validate_fields(dir);
@@ -494,12 +503,15 @@ xfs_vn_rename(
494 struct dentry *ndentry) 503 struct dentry *ndentry)
495{ 504{
496 struct inode *new_inode = ndentry->d_inode; 505 struct inode *new_inode = ndentry->d_inode;
497 bhv_vnode_t *tvp; /* target directory */ 506 struct xfs_name oname;
507 struct xfs_name nname;
498 int error; 508 int error;
499 509
500 tvp = vn_from_inode(ndir); 510 xfs_dentry_to_name(&oname, odentry);
511 xfs_dentry_to_name(&nname, ndentry);
501 512
502 error = xfs_rename(XFS_I(odir), odentry, tvp, ndentry); 513 error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
514 XFS_I(ndir), &nname);
503 if (likely(!error)) { 515 if (likely(!error)) {
504 if (new_inode) 516 if (new_inode)
505 xfs_validate_fields(new_inode); 517 xfs_validate_fields(new_inode);
@@ -700,11 +712,19 @@ xfs_vn_setattr(
700 return -error; 712 return -error;
701} 713}
702 714
715/*
716 * block_truncate_page can return an error, but we can't propagate it
717 * at all here. Leave a complaint + stack trace in the syslog because
718 * this could be bad. If it is bad, we need to propagate the error further.
719 */
703STATIC void 720STATIC void
704xfs_vn_truncate( 721xfs_vn_truncate(
705 struct inode *inode) 722 struct inode *inode)
706{ 723{
707 block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); 724 int error;
725 error = block_truncate_page(inode->i_mapping, inode->i_size,
726 xfs_get_blocks);
727 WARN_ON(error);
708} 728}
709 729
710STATIC int 730STATIC int
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index 3ca39c4e5d2a..e5143323e71f 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -99,7 +99,6 @@
99/* 99/*
100 * Feature macros (disable/enable) 100 * Feature macros (disable/enable)
101 */ 101 */
102#undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */
103#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ 102#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */
104#ifdef CONFIG_SMP 103#ifdef CONFIG_SMP
105#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ 104#define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 166353388490..1ebd8004469c 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,6 +51,7 @@
51#include "xfs_vnodeops.h" 51#include "xfs_vnodeops.h"
52 52
53#include <linux/capability.h> 53#include <linux/capability.h>
54#include <linux/mount.h>
54#include <linux/writeback.h> 55#include <linux/writeback.h>
55 56
56 57
@@ -176,7 +177,6 @@ xfs_read(
176{ 177{
177 struct file *file = iocb->ki_filp; 178 struct file *file = iocb->ki_filp;
178 struct inode *inode = file->f_mapping->host; 179 struct inode *inode = file->f_mapping->host;
179 bhv_vnode_t *vp = XFS_ITOV(ip);
180 xfs_mount_t *mp = ip->i_mount; 180 xfs_mount_t *mp = ip->i_mount;
181 size_t size = 0; 181 size_t size = 0;
182 ssize_t ret = 0; 182 ssize_t ret = 0;
@@ -228,11 +228,11 @@ xfs_read(
228 xfs_ilock(ip, XFS_IOLOCK_SHARED); 228 xfs_ilock(ip, XFS_IOLOCK_SHARED);
229 229
230 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 230 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
231 bhv_vrwlock_t locktype = VRWLOCK_READ;
232 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags); 231 int dmflags = FILP_DELAY_FLAG(file) | DM_SEM_FLAG_RD(ioflags);
232 int iolock = XFS_IOLOCK_SHARED;
233 233
234 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *offset, size, 234 ret = -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *offset, size,
235 dmflags, &locktype); 235 dmflags, &iolock);
236 if (ret) { 236 if (ret) {
237 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 237 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
238 if (unlikely(ioflags & IO_ISDIRECT)) 238 if (unlikely(ioflags & IO_ISDIRECT))
@@ -242,7 +242,7 @@ xfs_read(
242 } 242 }
243 243
244 if (unlikely(ioflags & IO_ISDIRECT)) { 244 if (unlikely(ioflags & IO_ISDIRECT)) {
245 if (VN_CACHED(vp)) 245 if (inode->i_mapping->nrpages)
246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK), 246 ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
247 -1, FI_REMAPF_LOCKED); 247 -1, FI_REMAPF_LOCKED);
248 mutex_unlock(&inode->i_mutex); 248 mutex_unlock(&inode->i_mutex);
@@ -276,7 +276,6 @@ xfs_splice_read(
276 int flags, 276 int flags,
277 int ioflags) 277 int ioflags)
278{ 278{
279 bhv_vnode_t *vp = XFS_ITOV(ip);
280 xfs_mount_t *mp = ip->i_mount; 279 xfs_mount_t *mp = ip->i_mount;
281 ssize_t ret; 280 ssize_t ret;
282 281
@@ -287,11 +286,11 @@ xfs_splice_read(
287 xfs_ilock(ip, XFS_IOLOCK_SHARED); 286 xfs_ilock(ip, XFS_IOLOCK_SHARED);
288 287
289 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) { 288 if (DM_EVENT_ENABLED(ip, DM_EVENT_READ) && !(ioflags & IO_INVIS)) {
290 bhv_vrwlock_t locktype = VRWLOCK_READ; 289 int iolock = XFS_IOLOCK_SHARED;
291 int error; 290 int error;
292 291
293 error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, *ppos, count, 292 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, *ppos, count,
294 FILP_DELAY_FLAG(infilp), &locktype); 293 FILP_DELAY_FLAG(infilp), &iolock);
295 if (error) { 294 if (error) {
296 xfs_iunlock(ip, XFS_IOLOCK_SHARED); 295 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
297 return -error; 296 return -error;
@@ -317,7 +316,6 @@ xfs_splice_write(
317 int flags, 316 int flags,
318 int ioflags) 317 int ioflags)
319{ 318{
320 bhv_vnode_t *vp = XFS_ITOV(ip);
321 xfs_mount_t *mp = ip->i_mount; 319 xfs_mount_t *mp = ip->i_mount;
322 ssize_t ret; 320 ssize_t ret;
323 struct inode *inode = outfilp->f_mapping->host; 321 struct inode *inode = outfilp->f_mapping->host;
@@ -330,11 +328,11 @@ xfs_splice_write(
330 xfs_ilock(ip, XFS_IOLOCK_EXCL); 328 xfs_ilock(ip, XFS_IOLOCK_EXCL);
331 329
332 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) { 330 if (DM_EVENT_ENABLED(ip, DM_EVENT_WRITE) && !(ioflags & IO_INVIS)) {
333 bhv_vrwlock_t locktype = VRWLOCK_WRITE; 331 int iolock = XFS_IOLOCK_EXCL;
334 int error; 332 int error;
335 333
336 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, *ppos, count, 334 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, *ppos, count,
337 FILP_DELAY_FLAG(outfilp), &locktype); 335 FILP_DELAY_FLAG(outfilp), &iolock);
338 if (error) { 336 if (error) {
339 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 337 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
340 return -error; 338 return -error;
@@ -573,14 +571,12 @@ xfs_write(
573 struct file *file = iocb->ki_filp; 571 struct file *file = iocb->ki_filp;
574 struct address_space *mapping = file->f_mapping; 572 struct address_space *mapping = file->f_mapping;
575 struct inode *inode = mapping->host; 573 struct inode *inode = mapping->host;
576 bhv_vnode_t *vp = XFS_ITOV(xip);
577 unsigned long segs = nsegs; 574 unsigned long segs = nsegs;
578 xfs_mount_t *mp; 575 xfs_mount_t *mp;
579 ssize_t ret = 0, error = 0; 576 ssize_t ret = 0, error = 0;
580 xfs_fsize_t isize, new_size; 577 xfs_fsize_t isize, new_size;
581 int iolock; 578 int iolock;
582 int eventsent = 0; 579 int eventsent = 0;
583 bhv_vrwlock_t locktype;
584 size_t ocount = 0, count; 580 size_t ocount = 0, count;
585 loff_t pos; 581 loff_t pos;
586 int need_i_mutex; 582 int need_i_mutex;
@@ -607,11 +603,9 @@ xfs_write(
607relock: 603relock:
608 if (ioflags & IO_ISDIRECT) { 604 if (ioflags & IO_ISDIRECT) {
609 iolock = XFS_IOLOCK_SHARED; 605 iolock = XFS_IOLOCK_SHARED;
610 locktype = VRWLOCK_WRITE_DIRECT;
611 need_i_mutex = 0; 606 need_i_mutex = 0;
612 } else { 607 } else {
613 iolock = XFS_IOLOCK_EXCL; 608 iolock = XFS_IOLOCK_EXCL;
614 locktype = VRWLOCK_WRITE;
615 need_i_mutex = 1; 609 need_i_mutex = 1;
616 mutex_lock(&inode->i_mutex); 610 mutex_lock(&inode->i_mutex);
617 } 611 }
@@ -634,9 +628,8 @@ start:
634 dmflags |= DM_FLAGS_IMUX; 628 dmflags |= DM_FLAGS_IMUX;
635 629
636 xfs_iunlock(xip, XFS_ILOCK_EXCL); 630 xfs_iunlock(xip, XFS_ILOCK_EXCL);
637 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, vp, 631 error = XFS_SEND_DATA(xip->i_mount, DM_EVENT_WRITE, xip,
638 pos, count, 632 pos, count, dmflags, &iolock);
639 dmflags, &locktype);
640 if (error) { 633 if (error) {
641 goto out_unlock_internal; 634 goto out_unlock_internal;
642 } 635 }
@@ -664,10 +657,9 @@ start:
664 return XFS_ERROR(-EINVAL); 657 return XFS_ERROR(-EINVAL);
665 } 658 }
666 659
667 if (!need_i_mutex && (VN_CACHED(vp) || pos > xip->i_size)) { 660 if (!need_i_mutex && (mapping->nrpages || pos > xip->i_size)) {
668 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock); 661 xfs_iunlock(xip, XFS_ILOCK_EXCL|iolock);
669 iolock = XFS_IOLOCK_EXCL; 662 iolock = XFS_IOLOCK_EXCL;
670 locktype = VRWLOCK_WRITE;
671 need_i_mutex = 1; 663 need_i_mutex = 1;
672 mutex_lock(&inode->i_mutex); 664 mutex_lock(&inode->i_mutex);
673 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock); 665 xfs_ilock(xip, XFS_ILOCK_EXCL|iolock);
@@ -679,10 +671,16 @@ start:
679 if (new_size > xip->i_size) 671 if (new_size > xip->i_size)
680 xip->i_new_size = new_size; 672 xip->i_new_size = new_size;
681 673
682 if (likely(!(ioflags & IO_INVIS))) { 674 /*
675 * We're not supposed to change timestamps in readonly-mounted
676 * filesystems. Throw it away if anyone asks us.
677 */
678 if (likely(!(ioflags & IO_INVIS) &&
679 !mnt_want_write(file->f_path.mnt))) {
683 file_update_time(file); 680 file_update_time(file);
684 xfs_ichgtime_fast(xip, inode, 681 xfs_ichgtime_fast(xip, inode,
685 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 682 XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
683 mnt_drop_write(file->f_path.mnt);
686 } 684 }
687 685
688 /* 686 /*
@@ -727,7 +725,7 @@ retry:
727 current->backing_dev_info = mapping->backing_dev_info; 725 current->backing_dev_info = mapping->backing_dev_info;
728 726
729 if ((ioflags & IO_ISDIRECT)) { 727 if ((ioflags & IO_ISDIRECT)) {
730 if (VN_CACHED(vp)) { 728 if (mapping->nrpages) {
731 WARN_ON(need_i_mutex == 0); 729 WARN_ON(need_i_mutex == 0);
732 xfs_inval_cached_trace(xip, pos, -1, 730 xfs_inval_cached_trace(xip, pos, -1,
733 (pos & PAGE_CACHE_MASK), -1); 731 (pos & PAGE_CACHE_MASK), -1);
@@ -744,7 +742,6 @@ retry:
744 mutex_unlock(&inode->i_mutex); 742 mutex_unlock(&inode->i_mutex);
745 743
746 iolock = XFS_IOLOCK_SHARED; 744 iolock = XFS_IOLOCK_SHARED;
747 locktype = VRWLOCK_WRITE_DIRECT;
748 need_i_mutex = 0; 745 need_i_mutex = 0;
749 } 746 }
750 747
@@ -781,15 +778,15 @@ retry:
781 778
782 if (ret == -ENOSPC && 779 if (ret == -ENOSPC &&
783 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) { 780 DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
784 xfs_rwunlock(xip, locktype); 781 xfs_iunlock(xip, iolock);
785 if (need_i_mutex) 782 if (need_i_mutex)
786 mutex_unlock(&inode->i_mutex); 783 mutex_unlock(&inode->i_mutex);
787 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, vp, 784 error = XFS_SEND_NAMESP(xip->i_mount, DM_EVENT_NOSPACE, xip,
788 DM_RIGHT_NULL, vp, DM_RIGHT_NULL, NULL, NULL, 785 DM_RIGHT_NULL, xip, DM_RIGHT_NULL, NULL, NULL,
789 0, 0, 0); /* Delay flag intentionally unused */ 786 0, 0, 0); /* Delay flag intentionally unused */
790 if (need_i_mutex) 787 if (need_i_mutex)
791 mutex_lock(&inode->i_mutex); 788 mutex_lock(&inode->i_mutex);
792 xfs_rwlock(xip, locktype); 789 xfs_ilock(xip, iolock);
793 if (error) 790 if (error)
794 goto out_unlock_internal; 791 goto out_unlock_internal;
795 pos = xip->i_size; 792 pos = xip->i_size;
@@ -817,7 +814,8 @@ retry:
817 /* Handle various SYNC-type writes */ 814 /* Handle various SYNC-type writes */
818 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { 815 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
819 int error2; 816 int error2;
820 xfs_rwunlock(xip, locktype); 817
818 xfs_iunlock(xip, iolock);
821 if (need_i_mutex) 819 if (need_i_mutex)
822 mutex_unlock(&inode->i_mutex); 820 mutex_unlock(&inode->i_mutex);
823 error2 = sync_page_range(inode, mapping, pos, ret); 821 error2 = sync_page_range(inode, mapping, pos, ret);
@@ -825,7 +823,7 @@ retry:
825 error = error2; 823 error = error2;
826 if (need_i_mutex) 824 if (need_i_mutex)
827 mutex_lock(&inode->i_mutex); 825 mutex_lock(&inode->i_mutex);
828 xfs_rwlock(xip, locktype); 826 xfs_ilock(xip, iolock);
829 error2 = xfs_write_sync_logforce(mp, xip); 827 error2 = xfs_write_sync_logforce(mp, xip);
830 if (!error) 828 if (!error)
831 error = error2; 829 error = error2;
@@ -846,7 +844,7 @@ retry:
846 xip->i_d.di_size = xip->i_size; 844 xip->i_d.di_size = xip->i_size;
847 xfs_iunlock(xip, XFS_ILOCK_EXCL); 845 xfs_iunlock(xip, XFS_ILOCK_EXCL);
848 } 846 }
849 xfs_rwunlock(xip, locktype); 847 xfs_iunlock(xip, iolock);
850 out_unlock_mutex: 848 out_unlock_mutex:
851 if (need_i_mutex) 849 if (need_i_mutex)
852 mutex_unlock(&inode->i_mutex); 850 mutex_unlock(&inode->i_mutex);
@@ -884,28 +882,23 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
884} 882}
885 883
886/* 884/*
887 * Wrapper around bdstrat so that we can stop data 885 * Wrapper around bdstrat so that we can stop data from going to disk in case
888 * from going to disk in case we are shutting down the filesystem. 886 * we are shutting down the filesystem. Typically user data goes thru this
889 * Typically user data goes thru this path; one of the exceptions 887 * path; one of the exceptions is the superblock.
890 * is the superblock.
891 */ 888 */
892int 889void
893xfsbdstrat( 890xfsbdstrat(
894 struct xfs_mount *mp, 891 struct xfs_mount *mp,
895 struct xfs_buf *bp) 892 struct xfs_buf *bp)
896{ 893{
897 ASSERT(mp); 894 ASSERT(mp);
898 if (!XFS_FORCED_SHUTDOWN(mp)) { 895 if (!XFS_FORCED_SHUTDOWN(mp)) {
899 /* Grio redirection would go here
900 * if (XFS_BUF_IS_GRIO(bp)) {
901 */
902
903 xfs_buf_iorequest(bp); 896 xfs_buf_iorequest(bp);
904 return 0; 897 return;
905 } 898 }
906 899
907 xfs_buftrace("XFSBDSTRAT IOERROR", bp); 900 xfs_buftrace("XFSBDSTRAT IOERROR", bp);
908 return (xfs_bioerror_relse(bp)); 901 xfs_bioerror_relse(bp);
909} 902}
910 903
911/* 904/*
diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h
index e200253139cf..e1d498b4ba7a 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.h
+++ b/fs/xfs/linux-2.6/xfs_lrw.h
@@ -68,7 +68,8 @@ extern void xfs_inval_cached_trace(struct xfs_inode *,
68#define xfs_inval_cached_trace(ip, offset, len, first, last) 68#define xfs_inval_cached_trace(ip, offset, len, first, last)
69#endif 69#endif
70 70
71extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); 71/* errors from xfsbdstrat() must be extracted from the buffer */
72extern void xfsbdstrat(struct xfs_mount *, struct xfs_buf *);
72extern int xfs_bdstrat_cb(struct xfs_buf *); 73extern int xfs_bdstrat_cb(struct xfs_buf *);
73extern int xfs_dev_is_read_only(struct xfs_mount *, char *); 74extern int xfs_dev_is_read_only(struct xfs_mount *, char *);
74 75
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index 8ba7a2fa6c1d..afd0b0d5fdb2 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -144,8 +144,8 @@ extern void xfs_cleanup_procfs(void);
144# define XFS_STATS_DEC(count) 144# define XFS_STATS_DEC(count)
145# define XFS_STATS_ADD(count, inc) 145# define XFS_STATS_ADD(count, inc)
146 146
147static __inline void xfs_init_procfs(void) { }; 147static inline void xfs_init_procfs(void) { };
148static __inline void xfs_cleanup_procfs(void) { }; 148static inline void xfs_cleanup_procfs(void) { };
149 149
150#endif /* !CONFIG_PROC_FS */ 150#endif /* !CONFIG_PROC_FS */
151 151
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 8831d9518790..865eb708aa95 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -896,7 +896,8 @@ xfs_fs_write_inode(
896 struct inode *inode, 896 struct inode *inode,
897 int sync) 897 int sync)
898{ 898{
899 int error = 0, flags = FLUSH_INODE; 899 int error = 0;
900 int flags = 0;
900 901
901 xfs_itrace_entry(XFS_I(inode)); 902 xfs_itrace_entry(XFS_I(inode));
902 if (sync) { 903 if (sync) {
@@ -934,7 +935,7 @@ xfs_fs_clear_inode(
934 xfs_inactive(ip); 935 xfs_inactive(ip);
935 xfs_iflags_clear(ip, XFS_IMODIFIED); 936 xfs_iflags_clear(ip, XFS_IMODIFIED);
936 if (xfs_reclaim(ip)) 937 if (xfs_reclaim(ip))
937 panic("%s: cannot reclaim 0x%p\n", __FUNCTION__, inode); 938 panic("%s: cannot reclaim 0x%p\n", __func__, inode);
938 } 939 }
939 940
940 ASSERT(XFS_I(inode) == NULL); 941 ASSERT(XFS_I(inode) == NULL);
@@ -1027,8 +1028,7 @@ xfs_sync_worker(
1027 int error; 1028 int error;
1028 1029
1029 if (!(mp->m_flags & XFS_MOUNT_RDONLY)) 1030 if (!(mp->m_flags & XFS_MOUNT_RDONLY))
1030 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR | 1031 error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
1031 SYNC_REFCACHE | SYNC_SUPER);
1032 mp->m_sync_seq++; 1032 mp->m_sync_seq++;
1033 wake_up(&mp->m_wait_single_sync_task); 1033 wake_up(&mp->m_wait_single_sync_task);
1034} 1034}
@@ -1306,7 +1306,7 @@ xfs_fs_fill_super(
1306 void *data, 1306 void *data,
1307 int silent) 1307 int silent)
1308{ 1308{
1309 struct inode *rootvp; 1309 struct inode *root;
1310 struct xfs_mount *mp = NULL; 1310 struct xfs_mount *mp = NULL;
1311 struct xfs_mount_args *args = xfs_args_allocate(sb, silent); 1311 struct xfs_mount_args *args = xfs_args_allocate(sb, silent);
1312 int error; 1312 int error;
@@ -1344,19 +1344,18 @@ xfs_fs_fill_super(
1344 sb->s_time_gran = 1; 1344 sb->s_time_gran = 1;
1345 set_posix_acl_flag(sb); 1345 set_posix_acl_flag(sb);
1346 1346
1347 rootvp = igrab(mp->m_rootip->i_vnode); 1347 root = igrab(mp->m_rootip->i_vnode);
1348 if (!rootvp) { 1348 if (!root) {
1349 error = ENOENT; 1349 error = ENOENT;
1350 goto fail_unmount; 1350 goto fail_unmount;
1351 } 1351 }
1352 1352 if (is_bad_inode(root)) {
1353 sb->s_root = d_alloc_root(vn_to_inode(rootvp)); 1353 error = EINVAL;
1354 if (!sb->s_root) {
1355 error = ENOMEM;
1356 goto fail_vnrele; 1354 goto fail_vnrele;
1357 } 1355 }
1358 if (is_bad_inode(sb->s_root->d_inode)) { 1356 sb->s_root = d_alloc_root(root);
1359 error = EINVAL; 1357 if (!sb->s_root) {
1358 error = ENOMEM;
1360 goto fail_vnrele; 1359 goto fail_vnrele;
1361 } 1360 }
1362 1361
@@ -1378,7 +1377,7 @@ fail_vnrele:
1378 dput(sb->s_root); 1377 dput(sb->s_root);
1379 sb->s_root = NULL; 1378 sb->s_root = NULL;
1380 } else { 1379 } else {
1381 VN_RELE(rootvp); 1380 iput(root);
1382 } 1381 }
1383 1382
1384fail_unmount: 1383fail_unmount:
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index 3efcf45b14ab..3efb7c6d3303 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -50,13 +50,7 @@ extern void xfs_qm_exit(void);
50# define set_posix_acl_flag(sb) do { } while (0) 50# define set_posix_acl_flag(sb) do { } while (0)
51#endif 51#endif
52 52
53#ifdef CONFIG_XFS_SECURITY 53#define XFS_SECURITY_STRING "security attributes, "
54# define XFS_SECURITY_STRING "security attributes, "
55# define ENOSECURITY 0
56#else
57# define XFS_SECURITY_STRING
58# define ENOSECURITY EOPNOTSUPP
59#endif
60 54
61#ifdef CONFIG_XFS_RT 55#ifdef CONFIG_XFS_RT
62# define XFS_REALTIME_STRING "realtime, " 56# define XFS_REALTIME_STRING "realtime, "
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
index 4da03a4e3520..7e60c7776b1c 100644
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ b/fs/xfs/linux-2.6/xfs_vfs.h
@@ -49,7 +49,6 @@ typedef struct bhv_vfs_sync_work {
49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ 49#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ 50#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */ 51#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
52#define SYNC_SUPER 0x0200 /* flush superblock to disk */
53 52
54/* 53/*
55 * When remounting a filesystem read-only or freezing the filesystem, 54 * When remounting a filesystem read-only or freezing the filesystem,
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index b5ea418693b1..8b4d63ce8694 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -23,8 +23,6 @@ struct bhv_vattr;
23struct xfs_iomap; 23struct xfs_iomap;
24struct attrlist_cursor_kern; 24struct attrlist_cursor_kern;
25 25
26typedef struct dentry bhv_vname_t;
27typedef __u64 bhv_vnumber_t;
28typedef struct inode bhv_vnode_t; 26typedef struct inode bhv_vnode_t;
29 27
30#define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) 28#define VN_ISLNK(vp) S_ISLNK((vp)->i_mode)
@@ -46,18 +44,6 @@ static inline struct inode *vn_to_inode(bhv_vnode_t *vnode)
46} 44}
47 45
48/* 46/*
49 * Values for the vop_rwlock/rwunlock flags parameter.
50 */
51typedef enum bhv_vrwlock {
52 VRWLOCK_NONE,
53 VRWLOCK_READ,
54 VRWLOCK_WRITE,
55 VRWLOCK_WRITE_DIRECT,
56 VRWLOCK_TRY_READ,
57 VRWLOCK_TRY_WRITE
58} bhv_vrwlock_t;
59
60/*
61 * Return values for xfs_inactive. A return value of 47 * Return values for xfs_inactive. A return value of
62 * VN_INACTIVE_NOCACHE implies that the file system behavior 48 * VN_INACTIVE_NOCACHE implies that the file system behavior
63 * has disassociated its state and bhv_desc_t from the vnode. 49 * has disassociated its state and bhv_desc_t from the vnode.
@@ -73,12 +59,9 @@ typedef enum bhv_vrwlock {
73#define IO_INVIS 0x00020 /* don't update inode timestamps */ 59#define IO_INVIS 0x00020 /* don't update inode timestamps */
74 60
75/* 61/*
76 * Flags for vop_iflush call 62 * Flags for xfs_inode_flush
77 */ 63 */
78#define FLUSH_SYNC 1 /* wait for flush to complete */ 64#define FLUSH_SYNC 1 /* wait for flush to complete */
79#define FLUSH_INODE 2 /* flush the inode itself */
80#define FLUSH_LOG 4 /* force the last log entry for
81 * this inode out to disk */
82 65
83/* 66/*
84 * Flush/Invalidate options for vop_toss/flush/flushinval_pages. 67 * Flush/Invalidate options for vop_toss/flush/flushinval_pages.
@@ -226,13 +209,6 @@ static inline bhv_vnode_t *vn_grab(bhv_vnode_t *vp)
226} 209}
227 210
228/* 211/*
229 * Vname handling macros.
230 */
231#define VNAME(dentry) ((char *) (dentry)->d_name.name)
232#define VNAMELEN(dentry) ((dentry)->d_name.len)
233#define VNAME_TO_VNODE(dentry) (vn_from_inode((dentry)->d_inode))
234
235/*
236 * Dealing with bad inodes 212 * Dealing with bad inodes
237 */ 213 */
238static inline int VN_BAD(bhv_vnode_t *vp) 214static inline int VN_BAD(bhv_vnode_t *vp)
@@ -303,9 +279,9 @@ extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
303extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *); 279extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
304extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *); 280extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
305#define xfs_itrace_entry(ip) \ 281#define xfs_itrace_entry(ip) \
306 _xfs_itrace_entry(ip, __FUNCTION__, (inst_t *)__return_address) 282 _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
307#define xfs_itrace_exit(ip) \ 283#define xfs_itrace_exit(ip) \
308 _xfs_itrace_exit(ip, __FUNCTION__, (inst_t *)__return_address) 284 _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
309#define xfs_itrace_exit_tag(ip, tag) \ 285#define xfs_itrace_exit_tag(ip, tag) \
310 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address) 286 _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
311#define xfs_itrace_ref(ip) \ 287#define xfs_itrace_ref(ip) \
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index 665babcca6a6..631ebb31b295 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -1291,7 +1291,7 @@ xfs_qm_dqflush(
1291 if (flags & XFS_QMOPT_DELWRI) { 1291 if (flags & XFS_QMOPT_DELWRI) {
1292 xfs_bdwrite(mp, bp); 1292 xfs_bdwrite(mp, bp);
1293 } else if (flags & XFS_QMOPT_ASYNC) { 1293 } else if (flags & XFS_QMOPT_ASYNC) {
1294 xfs_bawrite(mp, bp); 1294 error = xfs_bawrite(mp, bp);
1295 } else { 1295 } else {
1296 error = xfs_bwrite(mp, bp); 1296 error = xfs_bwrite(mp, bp);
1297 } 1297 }
@@ -1439,9 +1439,7 @@ xfs_qm_dqpurge(
1439 uint flags) 1439 uint flags)
1440{ 1440{
1441 xfs_dqhash_t *thishash; 1441 xfs_dqhash_t *thishash;
1442 xfs_mount_t *mp; 1442 xfs_mount_t *mp = dqp->q_mount;
1443
1444 mp = dqp->q_mount;
1445 1443
1446 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp)); 1444 ASSERT(XFS_QM_IS_MPLIST_LOCKED(mp));
1447 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash)); 1445 ASSERT(XFS_DQ_IS_HASH_LOCKED(dqp->q_hash));
@@ -1485,6 +1483,7 @@ xfs_qm_dqpurge(
1485 * we're unmounting, we do care, so we flush it and wait. 1483 * we're unmounting, we do care, so we flush it and wait.
1486 */ 1484 */
1487 if (XFS_DQ_IS_DIRTY(dqp)) { 1485 if (XFS_DQ_IS_DIRTY(dqp)) {
1486 int error;
1488 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY"); 1487 xfs_dqtrace_entry(dqp, "DQPURGE ->DQFLUSH: DQDIRTY");
1489 /* dqflush unlocks dqflock */ 1488 /* dqflush unlocks dqflock */
1490 /* 1489 /*
@@ -1495,7 +1494,10 @@ xfs_qm_dqpurge(
1495 * We don't care about getting disk errors here. We need 1494 * We don't care about getting disk errors here. We need
1496 * to purge this dquot anyway, so we go ahead regardless. 1495 * to purge this dquot anyway, so we go ahead regardless.
1497 */ 1496 */
1498 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC); 1497 error = xfs_qm_dqflush(dqp, XFS_QMOPT_SYNC);
1498 if (error)
1499 xfs_fs_cmn_err(CE_WARN, mp,
1500 "xfs_qm_dqpurge: dquot %p flush failed", dqp);
1499 xfs_dqflock(dqp); 1501 xfs_dqflock(dqp);
1500 } 1502 }
1501 ASSERT(dqp->q_pincount == 0); 1503 ASSERT(dqp->q_pincount == 0);
@@ -1580,12 +1582,18 @@ xfs_qm_dqflock_pushbuf_wait(
1580 XFS_INCORE_TRYLOCK); 1582 XFS_INCORE_TRYLOCK);
1581 if (bp != NULL) { 1583 if (bp != NULL) {
1582 if (XFS_BUF_ISDELAYWRITE(bp)) { 1584 if (XFS_BUF_ISDELAYWRITE(bp)) {
1585 int error;
1583 if (XFS_BUF_ISPINNED(bp)) { 1586 if (XFS_BUF_ISPINNED(bp)) {
1584 xfs_log_force(dqp->q_mount, 1587 xfs_log_force(dqp->q_mount,
1585 (xfs_lsn_t)0, 1588 (xfs_lsn_t)0,
1586 XFS_LOG_FORCE); 1589 XFS_LOG_FORCE);
1587 } 1590 }
1588 xfs_bawrite(dqp->q_mount, bp); 1591 error = xfs_bawrite(dqp->q_mount, bp);
1592 if (error)
1593 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
1594 "xfs_qm_dqflock_pushbuf_wait: "
1595 "pushbuf error %d on dqp %p, bp %p",
1596 error, dqp, bp);
1589 } else { 1597 } else {
1590 xfs_buf_relse(bp); 1598 xfs_buf_relse(bp);
1591 } 1599 }
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1800e8d1f646..36e05ca78412 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -146,6 +146,7 @@ xfs_qm_dquot_logitem_push(
146 xfs_dq_logitem_t *logitem) 146 xfs_dq_logitem_t *logitem)
147{ 147{
148 xfs_dquot_t *dqp; 148 xfs_dquot_t *dqp;
149 int error;
149 150
150 dqp = logitem->qli_dquot; 151 dqp = logitem->qli_dquot;
151 152
@@ -161,7 +162,11 @@ xfs_qm_dquot_logitem_push(
161 * lock without sleeping, then there must not have been 162 * lock without sleeping, then there must not have been
162 * anyone in the process of flushing the dquot. 163 * anyone in the process of flushing the dquot.
163 */ 164 */
164 xfs_qm_dqflush(dqp, XFS_B_DELWRI); 165 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
166 if (error)
167 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
168 "xfs_qm_dquot_logitem_push: push error %d on dqp %p",
169 error, dqp);
165 xfs_dqunlock(dqp); 170 xfs_dqunlock(dqp);
166} 171}
167 172
@@ -262,11 +267,16 @@ xfs_qm_dquot_logitem_pushbuf(
262 XFS_LOG_FORCE); 267 XFS_LOG_FORCE);
263 } 268 }
264 if (dopush) { 269 if (dopush) {
270 int error;
265#ifdef XFSRACEDEBUG 271#ifdef XFSRACEDEBUG
266 delay_for_intr(); 272 delay_for_intr();
267 delay(300); 273 delay(300);
268#endif 274#endif
269 xfs_bawrite(mp, bp); 275 error = xfs_bawrite(mp, bp);
276 if (error)
277 xfs_fs_cmn_err(CE_WARN, mp,
278 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
279 error, qip, bp);
270 } else { 280 } else {
271 xfs_buf_relse(bp); 281 xfs_buf_relse(bp);
272 } 282 }
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 8e9c5ae6504d..40ea56409561 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -304,8 +304,11 @@ xfs_qm_unmount_quotadestroy(
304 * necessary data structures like quotainfo. This is also responsible for 304 * necessary data structures like quotainfo. This is also responsible for
305 * running a quotacheck as necessary. We are guaranteed that the superblock 305 * running a quotacheck as necessary. We are guaranteed that the superblock
306 * is consistently read in at this point. 306 * is consistently read in at this point.
307 *
308 * If we fail here, the mount will continue with quota turned off. We don't
309 * need to inidicate success or failure at all.
307 */ 310 */
308int 311void
309xfs_qm_mount_quotas( 312xfs_qm_mount_quotas(
310 xfs_mount_t *mp, 313 xfs_mount_t *mp,
311 int mfsi_flags) 314 int mfsi_flags)
@@ -313,7 +316,6 @@ xfs_qm_mount_quotas(
313 int error = 0; 316 int error = 0;
314 uint sbf; 317 uint sbf;
315 318
316
317 /* 319 /*
318 * If quotas on realtime volumes is not supported, we disable 320 * If quotas on realtime volumes is not supported, we disable
319 * quotas immediately. 321 * quotas immediately.
@@ -332,7 +334,8 @@ xfs_qm_mount_quotas(
332 * Allocate the quotainfo structure inside the mount struct, and 334 * Allocate the quotainfo structure inside the mount struct, and
333 * create quotainode(s), and change/rev superblock if necessary. 335 * create quotainode(s), and change/rev superblock if necessary.
334 */ 336 */
335 if ((error = xfs_qm_init_quotainfo(mp))) { 337 error = xfs_qm_init_quotainfo(mp);
338 if (error) {
336 /* 339 /*
337 * We must turn off quotas. 340 * We must turn off quotas.
338 */ 341 */
@@ -344,12 +347,11 @@ xfs_qm_mount_quotas(
344 * If any of the quotas are not consistent, do a quotacheck. 347 * If any of the quotas are not consistent, do a quotacheck.
345 */ 348 */
346 if (XFS_QM_NEED_QUOTACHECK(mp) && 349 if (XFS_QM_NEED_QUOTACHECK(mp) &&
347 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) { 350 !(mfsi_flags & XFS_MFSI_NO_QUOTACHECK)) {
348 if ((error = xfs_qm_quotacheck(mp))) { 351 error = xfs_qm_quotacheck(mp);
349 /* Quotacheck has failed and quotas have 352 if (error) {
350 * been disabled. 353 /* Quotacheck failed and disabled quotas. */
351 */ 354 return;
352 return XFS_ERROR(error);
353 } 355 }
354 } 356 }
355 /* 357 /*
@@ -357,12 +359,10 @@ xfs_qm_mount_quotas(
357 * quotachecked status, since we won't be doing accounting for 359 * quotachecked status, since we won't be doing accounting for
358 * that type anymore. 360 * that type anymore.
359 */ 361 */
360 if (!XFS_IS_UQUOTA_ON(mp)) { 362 if (!XFS_IS_UQUOTA_ON(mp))
361 mp->m_qflags &= ~XFS_UQUOTA_CHKD; 363 mp->m_qflags &= ~XFS_UQUOTA_CHKD;
362 } 364 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp)))
363 if (!(XFS_IS_GQUOTA_ON(mp) || XFS_IS_PQUOTA_ON(mp))) {
364 mp->m_qflags &= ~XFS_OQUOTA_CHKD; 365 mp->m_qflags &= ~XFS_OQUOTA_CHKD;
365 }
366 366
367 write_changes: 367 write_changes:
368 /* 368 /*
@@ -392,7 +392,7 @@ xfs_qm_mount_quotas(
392 xfs_fs_cmn_err(CE_WARN, mp, 392 xfs_fs_cmn_err(CE_WARN, mp,
393 "Failed to initialize disk quotas."); 393 "Failed to initialize disk quotas.");
394 } 394 }
395 return XFS_ERROR(error); 395 return;
396} 396}
397 397
398/* 398/*
@@ -1438,7 +1438,7 @@ xfs_qm_qino_alloc(
1438} 1438}
1439 1439
1440 1440
1441STATIC int 1441STATIC void
1442xfs_qm_reset_dqcounts( 1442xfs_qm_reset_dqcounts(
1443 xfs_mount_t *mp, 1443 xfs_mount_t *mp,
1444 xfs_buf_t *bp, 1444 xfs_buf_t *bp,
@@ -1478,8 +1478,6 @@ xfs_qm_reset_dqcounts(
1478 ddq->d_rtbwarns = 0; 1478 ddq->d_rtbwarns = 0;
1479 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1); 1479 ddq = (xfs_disk_dquot_t *) ((xfs_dqblk_t *)ddq + 1);
1480 } 1480 }
1481
1482 return 0;
1483} 1481}
1484 1482
1485STATIC int 1483STATIC int
@@ -1520,7 +1518,7 @@ xfs_qm_dqiter_bufs(
1520 if (error) 1518 if (error)
1521 break; 1519 break;
1522 1520
1523 (void) xfs_qm_reset_dqcounts(mp, bp, firstid, type); 1521 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
1524 xfs_bdwrite(mp, bp); 1522 xfs_bdwrite(mp, bp);
1525 /* 1523 /*
1526 * goto the next block. 1524 * goto the next block.
@@ -1810,7 +1808,7 @@ xfs_qm_dqusage_adjust(
1810 * Now release the inode. This will send it to 'inactive', and 1808 * Now release the inode. This will send it to 'inactive', and
1811 * possibly even free blocks. 1809 * possibly even free blocks.
1812 */ 1810 */
1813 VN_RELE(XFS_ITOV(ip)); 1811 IRELE(ip);
1814 1812
1815 /* 1813 /*
1816 * Goto next inode. 1814 * Goto next inode.
@@ -1880,6 +1878,14 @@ xfs_qm_quotacheck(
1880 } while (! done); 1878 } while (! done);
1881 1879
1882 /* 1880 /*
1881 * We've made all the changes that we need to make incore.
1882 * Flush them down to disk buffers if everything was updated
1883 * successfully.
1884 */
1885 if (!error)
1886 error = xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
1887
1888 /*
1883 * We can get this error if we couldn't do a dquot allocation inside 1889 * We can get this error if we couldn't do a dquot allocation inside
1884 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the 1890 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
1885 * dirty dquots that might be cached, we just want to get rid of them 1891 * dirty dquots that might be cached, we just want to get rid of them
@@ -1890,11 +1896,6 @@ xfs_qm_quotacheck(
1890 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF); 1896 xfs_qm_dqpurge_all(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_QUOTAOFF);
1891 goto error_return; 1897 goto error_return;
1892 } 1898 }
1893 /*
1894 * We've made all the changes that we need to make incore.
1895 * Now flush_them down to disk buffers.
1896 */
1897 xfs_qm_dqflush_all(mp, XFS_QMOPT_DELWRI);
1898 1899
1899 /* 1900 /*
1900 * We didn't log anything, because if we crashed, we'll have to 1901 * We didn't log anything, because if we crashed, we'll have to
@@ -1926,7 +1927,10 @@ xfs_qm_quotacheck(
1926 ASSERT(mp->m_quotainfo != NULL); 1927 ASSERT(mp->m_quotainfo != NULL);
1927 ASSERT(xfs_Gqm != NULL); 1928 ASSERT(xfs_Gqm != NULL);
1928 xfs_qm_destroy_quotainfo(mp); 1929 xfs_qm_destroy_quotainfo(mp);
1929 (void)xfs_mount_reset_sbqflags(mp); 1930 if (xfs_mount_reset_sbqflags(mp)) {
1931 cmn_err(CE_WARN, "XFS quotacheck %s: "
1932 "Failed to reset quota flags.", mp->m_fsname);
1933 }
1930 } else { 1934 } else {
1931 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname); 1935 cmn_err(CE_NOTE, "XFS quotacheck %s: Done.", mp->m_fsname);
1932 } 1936 }
@@ -1968,7 +1972,7 @@ xfs_qm_init_quotainos(
1968 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 1972 if ((error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino,
1969 0, 0, &gip, 0))) { 1973 0, 0, &gip, 0))) {
1970 if (uip) 1974 if (uip)
1971 VN_RELE(XFS_ITOV(uip)); 1975 IRELE(uip);
1972 return XFS_ERROR(error); 1976 return XFS_ERROR(error);
1973 } 1977 }
1974 } 1978 }
@@ -1999,7 +2003,7 @@ xfs_qm_init_quotainos(
1999 sbflags | XFS_SB_GQUOTINO, flags); 2003 sbflags | XFS_SB_GQUOTINO, flags);
2000 if (error) { 2004 if (error) {
2001 if (uip) 2005 if (uip)
2002 VN_RELE(XFS_ITOV(uip)); 2006 IRELE(uip);
2003 2007
2004 return XFS_ERROR(error); 2008 return XFS_ERROR(error);
2005 } 2009 }
@@ -2093,12 +2097,17 @@ xfs_qm_shake_freelist(
2093 * dirty dquots. 2097 * dirty dquots.
2094 */ 2098 */
2095 if (XFS_DQ_IS_DIRTY(dqp)) { 2099 if (XFS_DQ_IS_DIRTY(dqp)) {
2100 int error;
2096 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY"); 2101 xfs_dqtrace_entry(dqp, "DQSHAKE: DQDIRTY");
2097 /* 2102 /*
2098 * We flush it delayed write, so don't bother 2103 * We flush it delayed write, so don't bother
2099 * releasing the mplock. 2104 * releasing the mplock.
2100 */ 2105 */
2101 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2106 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2107 if (error) {
2108 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2109 "xfs_qm_dqflush_all: dquot %p flush failed", dqp);
2110 }
2102 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2111 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2103 dqp = dqp->dq_flnext; 2112 dqp = dqp->dq_flnext;
2104 continue; 2113 continue;
@@ -2265,12 +2274,17 @@ xfs_qm_dqreclaim_one(void)
2265 * dirty dquots. 2274 * dirty dquots.
2266 */ 2275 */
2267 if (XFS_DQ_IS_DIRTY(dqp)) { 2276 if (XFS_DQ_IS_DIRTY(dqp)) {
2277 int error;
2268 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY"); 2278 xfs_dqtrace_entry(dqp, "DQRECLAIM: DQDIRTY");
2269 /* 2279 /*
2270 * We flush it delayed write, so don't bother 2280 * We flush it delayed write, so don't bother
2271 * releasing the freelist lock. 2281 * releasing the freelist lock.
2272 */ 2282 */
2273 (void) xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI); 2283 error = xfs_qm_dqflush(dqp, XFS_QMOPT_DELWRI);
2284 if (error) {
2285 xfs_fs_cmn_err(CE_WARN, dqp->q_mount,
2286 "xfs_qm_dqreclaim: dquot %p flush failed", dqp);
2287 }
2274 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */ 2288 xfs_dqunlock(dqp); /* dqflush unlocks dqflock */
2275 continue; 2289 continue;
2276 } 2290 }
@@ -2378,9 +2392,9 @@ xfs_qm_write_sb_changes(
2378 } 2392 }
2379 2393
2380 xfs_mod_sb(tp, flags); 2394 xfs_mod_sb(tp, flags);
2381 (void) xfs_trans_commit(tp, 0); 2395 error = xfs_trans_commit(tp, 0);
2382 2396
2383 return 0; 2397 return error;
2384} 2398}
2385 2399
2386 2400
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index baf537c1c177..cd2300e374af 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -165,7 +165,7 @@ typedef struct xfs_dquot_acct {
165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--) 165#define XFS_QM_RELE(xqm) ((xqm)->qm_nrefs--)
166 166
167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *); 167extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
168extern int xfs_qm_mount_quotas(xfs_mount_t *, int); 168extern void xfs_qm_mount_quotas(xfs_mount_t *, int);
169extern int xfs_qm_quotacheck(xfs_mount_t *); 169extern int xfs_qm_quotacheck(xfs_mount_t *);
170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *); 170extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
171extern int xfs_qm_unmount_quotas(xfs_mount_t *); 171extern int xfs_qm_unmount_quotas(xfs_mount_t *);
diff --git a/fs/xfs/quota/xfs_qm_stats.h b/fs/xfs/quota/xfs_qm_stats.h
index a50ffabcf554..5b964fc0dc09 100644
--- a/fs/xfs/quota/xfs_qm_stats.h
+++ b/fs/xfs/quota/xfs_qm_stats.h
@@ -45,8 +45,8 @@ extern void xfs_qm_cleanup_procfs(void);
45 45
46# define XQM_STATS_INC(count) do { } while (0) 46# define XQM_STATS_INC(count) do { } while (0)
47 47
48static __inline void xfs_qm_init_procfs(void) { }; 48static inline void xfs_qm_init_procfs(void) { };
49static __inline void xfs_qm_cleanup_procfs(void) { }; 49static inline void xfs_qm_cleanup_procfs(void) { };
50 50
51#endif 51#endif
52 52
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index d2b8be7e75f9..8342823dbdc3 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -279,9 +279,12 @@ xfs_qm_scall_quotaoff(
279 279
280 /* 280 /*
281 * Write the LI_QUOTAOFF log record, and do SB changes atomically, 281 * Write the LI_QUOTAOFF log record, and do SB changes atomically,
282 * and synchronously. 282 * and synchronously. If we fail to write, we should abort the
283 * operation as it cannot be recovered safely if we crash.
283 */ 284 */
284 xfs_qm_log_quotaoff(mp, &qoffstart, flags); 285 error = xfs_qm_log_quotaoff(mp, &qoffstart, flags);
286 if (error)
287 goto out_error;
285 288
286 /* 289 /*
287 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct 290 * Next we clear the XFS_MOUNT_*DQ_ACTIVE bit(s) in the mount struct
@@ -337,7 +340,12 @@ xfs_qm_scall_quotaoff(
337 * So, we have QUOTAOFF start and end logitems; the start 340 * So, we have QUOTAOFF start and end logitems; the start
338 * logitem won't get overwritten until the end logitem appears... 341 * logitem won't get overwritten until the end logitem appears...
339 */ 342 */
340 xfs_qm_log_quotaoff_end(mp, qoffstart, flags); 343 error = xfs_qm_log_quotaoff_end(mp, qoffstart, flags);
344 if (error) {
345 /* We're screwed now. Shutdown is the only option. */
346 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
347 goto out_error;
348 }
341 349
342 /* 350 /*
343 * If quotas is completely disabled, close shop. 351 * If quotas is completely disabled, close shop.
@@ -361,6 +369,7 @@ xfs_qm_scall_quotaoff(
361 XFS_PURGE_INODE(XFS_QI_GQIP(mp)); 369 XFS_PURGE_INODE(XFS_QI_GQIP(mp));
362 XFS_QI_GQIP(mp) = NULL; 370 XFS_QI_GQIP(mp) = NULL;
363 } 371 }
372out_error:
364 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 373 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
365 374
366 return (error); 375 return (error);
@@ -371,12 +380,11 @@ xfs_qm_scall_trunc_qfiles(
371 xfs_mount_t *mp, 380 xfs_mount_t *mp,
372 uint flags) 381 uint flags)
373{ 382{
374 int error; 383 int error = 0, error2 = 0;
375 xfs_inode_t *qip; 384 xfs_inode_t *qip;
376 385
377 if (!capable(CAP_SYS_ADMIN)) 386 if (!capable(CAP_SYS_ADMIN))
378 return XFS_ERROR(EPERM); 387 return XFS_ERROR(EPERM);
379 error = 0;
380 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) { 388 if (!xfs_sb_version_hasquota(&mp->m_sb) || flags == 0) {
381 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags); 389 qdprintk("qtrunc flags=%x m_qflags=%x\n", flags, mp->m_qflags);
382 return XFS_ERROR(EINVAL); 390 return XFS_ERROR(EINVAL);
@@ -384,22 +392,22 @@ xfs_qm_scall_trunc_qfiles(
384 392
385 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) { 393 if ((flags & XFS_DQ_USER) && mp->m_sb.sb_uquotino != NULLFSINO) {
386 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0); 394 error = xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 0, 0, &qip, 0);
387 if (! error) { 395 if (!error) {
388 (void) xfs_truncate_file(mp, qip); 396 error = xfs_truncate_file(mp, qip);
389 VN_RELE(XFS_ITOV(qip)); 397 IRELE(qip);
390 } 398 }
391 } 399 }
392 400
393 if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) && 401 if ((flags & (XFS_DQ_GROUP|XFS_DQ_PROJ)) &&
394 mp->m_sb.sb_gquotino != NULLFSINO) { 402 mp->m_sb.sb_gquotino != NULLFSINO) {
395 error = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0); 403 error2 = xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, 0, 0, &qip, 0);
396 if (! error) { 404 if (!error2) {
397 (void) xfs_truncate_file(mp, qip); 405 error2 = xfs_truncate_file(mp, qip);
398 VN_RELE(XFS_ITOV(qip)); 406 IRELE(qip);
399 } 407 }
400 } 408 }
401 409
402 return (error); 410 return error ? error : error2;
403} 411}
404 412
405 413
@@ -552,13 +560,13 @@ xfs_qm_scall_getqstat(
552 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; 560 out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks;
553 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents; 561 out->qs_uquota.qfs_nextents = uip->i_d.di_nextents;
554 if (tempuqip) 562 if (tempuqip)
555 VN_RELE(XFS_ITOV(uip)); 563 IRELE(uip);
556 } 564 }
557 if (gip) { 565 if (gip) {
558 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks; 566 out->qs_gquota.qfs_nblks = gip->i_d.di_nblocks;
559 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents; 567 out->qs_gquota.qfs_nextents = gip->i_d.di_nextents;
560 if (tempgqip) 568 if (tempgqip)
561 VN_RELE(XFS_ITOV(gip)); 569 IRELE(gip);
562 } 570 }
563 if (mp->m_quotainfo) { 571 if (mp->m_quotainfo) {
564 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp); 572 out->qs_incoredqs = XFS_QI_MPLNDQUOTS(mp);
@@ -726,12 +734,12 @@ xfs_qm_scall_setqlim(
726 xfs_trans_log_dquot(tp, dqp); 734 xfs_trans_log_dquot(tp, dqp);
727 735
728 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT"); 736 xfs_dqtrace_entry(dqp, "Q_SETQLIM: COMMIT");
729 xfs_trans_commit(tp, 0); 737 error = xfs_trans_commit(tp, 0);
730 xfs_qm_dqprint(dqp); 738 xfs_qm_dqprint(dqp);
731 xfs_qm_dqrele(dqp); 739 xfs_qm_dqrele(dqp);
732 mutex_unlock(&(XFS_QI_QOFFLOCK(mp))); 740 mutex_unlock(&(XFS_QI_QOFFLOCK(mp)));
733 741
734 return (0); 742 return error;
735} 743}
736 744
737STATIC int 745STATIC int
@@ -1095,7 +1103,7 @@ again:
1095 * inactive code in hell. 1103 * inactive code in hell.
1096 */ 1104 */
1097 if (vnode_refd) 1105 if (vnode_refd)
1098 VN_RELE(vp); 1106 IRELE(ip);
1099 XFS_MOUNT_ILOCK(mp); 1107 XFS_MOUNT_ILOCK(mp);
1100 /* 1108 /*
1101 * If an inode was inserted or removed, we gotta 1109 * If an inode was inserted or removed, we gotta
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index 129067cfcb86..0b75d302508f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -24,7 +24,7 @@ static int ktrace_zentries;
24void __init 24void __init
25ktrace_init(int zentries) 25ktrace_init(int zentries)
26{ 26{
27 ktrace_zentries = zentries; 27 ktrace_zentries = roundup_pow_of_two(zentries);
28 28
29 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t), 29 ktrace_hdr_zone = kmem_zone_init(sizeof(ktrace_t),
30 "ktrace_hdr"); 30 "ktrace_hdr");
@@ -47,13 +47,16 @@ ktrace_uninit(void)
47 * ktrace_alloc() 47 * ktrace_alloc()
48 * 48 *
49 * Allocate a ktrace header and enough buffering for the given 49 * Allocate a ktrace header and enough buffering for the given
50 * number of entries. 50 * number of entries. Round the number of entries up to a
51 * power of 2 so we can do fast masking to get the index from
52 * the atomic index counter.
51 */ 53 */
52ktrace_t * 54ktrace_t *
53ktrace_alloc(int nentries, unsigned int __nocast sleep) 55ktrace_alloc(int nentries, unsigned int __nocast sleep)
54{ 56{
55 ktrace_t *ktp; 57 ktrace_t *ktp;
56 ktrace_entry_t *ktep; 58 ktrace_entry_t *ktep;
59 int entries;
57 60
58 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep); 61 ktp = (ktrace_t*)kmem_zone_alloc(ktrace_hdr_zone, sleep);
59 62
@@ -70,11 +73,12 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
70 /* 73 /*
71 * Special treatment for buffers with the ktrace_zentries entries 74 * Special treatment for buffers with the ktrace_zentries entries
72 */ 75 */
73 if (nentries == ktrace_zentries) { 76 entries = roundup_pow_of_two(nentries);
77 if (entries == ktrace_zentries) {
74 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone, 78 ktep = (ktrace_entry_t*)kmem_zone_zalloc(ktrace_ent_zone,
75 sleep); 79 sleep);
76 } else { 80 } else {
77 ktep = (ktrace_entry_t*)kmem_zalloc((nentries * sizeof(*ktep)), 81 ktep = (ktrace_entry_t*)kmem_zalloc((entries * sizeof(*ktep)),
78 sleep | KM_LARGE); 82 sleep | KM_LARGE);
79 } 83 }
80 84
@@ -91,8 +95,10 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
91 } 95 }
92 96
93 ktp->kt_entries = ktep; 97 ktp->kt_entries = ktep;
94 ktp->kt_nentries = nentries; 98 ktp->kt_nentries = entries;
95 ktp->kt_index = 0; 99 ASSERT(is_power_of_2(entries));
100 ktp->kt_index_mask = entries - 1;
101 atomic_set(&ktp->kt_index, 0);
96 ktp->kt_rollover = 0; 102 ktp->kt_rollover = 0;
97 return ktp; 103 return ktp;
98} 104}
@@ -151,8 +157,6 @@ ktrace_enter(
151 void *val14, 157 void *val14,
152 void *val15) 158 void *val15)
153{ 159{
154 static DEFINE_SPINLOCK(wrap_lock);
155 unsigned long flags;
156 int index; 160 int index;
157 ktrace_entry_t *ktep; 161 ktrace_entry_t *ktep;
158 162
@@ -161,12 +165,8 @@ ktrace_enter(
161 /* 165 /*
162 * Grab an entry by pushing the index up to the next one. 166 * Grab an entry by pushing the index up to the next one.
163 */ 167 */
164 spin_lock_irqsave(&wrap_lock, flags); 168 index = atomic_add_return(1, &ktp->kt_index);
165 index = ktp->kt_index; 169 index = (index - 1) & ktp->kt_index_mask;
166 if (++ktp->kt_index == ktp->kt_nentries)
167 ktp->kt_index = 0;
168 spin_unlock_irqrestore(&wrap_lock, flags);
169
170 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1) 170 if (!ktp->kt_rollover && index == ktp->kt_nentries - 1)
171 ktp->kt_rollover = 1; 171 ktp->kt_rollover = 1;
172 172
@@ -199,11 +199,12 @@ int
199ktrace_nentries( 199ktrace_nentries(
200 ktrace_t *ktp) 200 ktrace_t *ktp)
201{ 201{
202 if (ktp == NULL) { 202 int index;
203 if (ktp == NULL)
203 return 0; 204 return 0;
204 }
205 205
206 return (ktp->kt_rollover ? ktp->kt_nentries : ktp->kt_index); 206 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
207 return (ktp->kt_rollover ? ktp->kt_nentries : index);
207} 208}
208 209
209/* 210/*
@@ -228,7 +229,7 @@ ktrace_first(ktrace_t *ktp, ktrace_snap_t *ktsp)
228 int nentries; 229 int nentries;
229 230
230 if (ktp->kt_rollover) 231 if (ktp->kt_rollover)
231 index = ktp->kt_index; 232 index = atomic_read(&ktp->kt_index) & ktp->kt_index_mask;
232 else 233 else
233 index = 0; 234 index = 0;
234 235
diff --git a/fs/xfs/support/ktrace.h b/fs/xfs/support/ktrace.h
index 56e72b40a859..741d6947ca60 100644
--- a/fs/xfs/support/ktrace.h
+++ b/fs/xfs/support/ktrace.h
@@ -30,7 +30,8 @@ typedef struct ktrace_entry {
30 */ 30 */
31typedef struct ktrace { 31typedef struct ktrace {
32 int kt_nentries; /* number of entries in trace buf */ 32 int kt_nentries; /* number of entries in trace buf */
33 int kt_index; /* current index in entries */ 33 atomic_t kt_index; /* current index in entries */
34 unsigned int kt_index_mask;
34 int kt_rollover; 35 int kt_rollover;
35 ktrace_entry_t *kt_entries; /* buffer of entries */ 36 ktrace_entry_t *kt_entries; /* buffer of entries */
36} ktrace_t; 37} ktrace_t;
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..765aaf65e2d3 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -22,7 +22,7 @@
22#define STATIC 22#define STATIC
23#define DEBUG 1 23#define DEBUG 1
24#define XFS_BUF_LOCK_TRACKING 1 24#define XFS_BUF_LOCK_TRACKING 1
25/* #define QUOTADEBUG 1 */ 25#define QUOTADEBUG 1
26#endif 26#endif
27 27
28#ifdef CONFIG_XFS_TRACE 28#ifdef CONFIG_XFS_TRACE
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 7272fe39a92d..8e130b9720ae 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -307,12 +307,13 @@ xfs_acl_vset(
307 307
308 VN_HOLD(vp); 308 VN_HOLD(vp);
309 error = xfs_acl_allow_set(vp, kind); 309 error = xfs_acl_allow_set(vp, kind);
310 if (error)
311 goto out;
312 310
313 /* Incoming ACL exists, set file mode based on its value */ 311 /* Incoming ACL exists, set file mode based on its value */
314 if (kind == _ACL_TYPE_ACCESS) 312 if (!error && kind == _ACL_TYPE_ACCESS)
315 xfs_acl_setmode(vp, xfs_acl, &basicperms); 313 error = xfs_acl_setmode(vp, xfs_acl, &basicperms);
314
315 if (error)
316 goto out;
316 317
317 /* 318 /*
318 * If we have more than std unix permissions, set up the actual attr. 319 * If we have more than std unix permissions, set up the actual attr.
@@ -323,7 +324,7 @@ xfs_acl_vset(
323 if (!basicperms) { 324 if (!basicperms) {
324 xfs_acl_set_attr(vp, xfs_acl, kind, &error); 325 xfs_acl_set_attr(vp, xfs_acl, kind, &error);
325 } else { 326 } else {
326 xfs_acl_vremove(vp, _ACL_TYPE_ACCESS); 327 error = -xfs_acl_vremove(vp, _ACL_TYPE_ACCESS);
327 } 328 }
328 329
329out: 330out:
@@ -707,7 +708,9 @@ xfs_acl_inherit(
707 708
708 memcpy(cacl, pdaclp, sizeof(xfs_acl_t)); 709 memcpy(cacl, pdaclp, sizeof(xfs_acl_t));
709 xfs_acl_filter_mode(mode, cacl); 710 xfs_acl_filter_mode(mode, cacl);
710 xfs_acl_setmode(vp, cacl, &basicperms); 711 error = xfs_acl_setmode(vp, cacl, &basicperms);
712 if (error)
713 goto out_error;
711 714
712 /* 715 /*
713 * Set the Default and Access ACL on the file. The mode is already 716 * Set the Default and Access ACL on the file. The mode is already
@@ -720,6 +723,7 @@ xfs_acl_inherit(
720 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); 723 xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error);
721 if (!error && !basicperms) 724 if (!error && !basicperms)
722 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); 725 xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error);
726out_error:
723 _ACL_FREE(cacl); 727 _ACL_FREE(cacl);
724 return error; 728 return error;
725} 729}
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index bdbfbbee4959..1956f83489f1 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -45,7 +45,7 @@
45#define XFSA_FIXUP_BNO_OK 1 45#define XFSA_FIXUP_BNO_OK 1
46#define XFSA_FIXUP_CNT_OK 2 46#define XFSA_FIXUP_CNT_OK 2
47 47
48STATIC int 48STATIC void
49xfs_alloc_search_busy(xfs_trans_t *tp, 49xfs_alloc_search_busy(xfs_trans_t *tp,
50 xfs_agnumber_t agno, 50 xfs_agnumber_t agno,
51 xfs_agblock_t bno, 51 xfs_agblock_t bno,
@@ -55,24 +55,24 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
55ktrace_t *xfs_alloc_trace_buf; 55ktrace_t *xfs_alloc_trace_buf;
56 56
57#define TRACE_ALLOC(s,a) \ 57#define TRACE_ALLOC(s,a) \
58 xfs_alloc_trace_alloc(__FUNCTION__, s, a, __LINE__) 58 xfs_alloc_trace_alloc(__func__, s, a, __LINE__)
59#define TRACE_FREE(s,a,b,x,f) \ 59#define TRACE_FREE(s,a,b,x,f) \
60 xfs_alloc_trace_free(__FUNCTION__, s, mp, a, b, x, f, __LINE__) 60 xfs_alloc_trace_free(__func__, s, mp, a, b, x, f, __LINE__)
61#define TRACE_MODAGF(s,a,f) \ 61#define TRACE_MODAGF(s,a,f) \
62 xfs_alloc_trace_modagf(__FUNCTION__, s, mp, a, f, __LINE__) 62 xfs_alloc_trace_modagf(__func__, s, mp, a, f, __LINE__)
63#define TRACE_BUSY(__FUNCTION__,s,ag,agb,l,sl,tp) \ 63#define TRACE_BUSY(__func__,s,ag,agb,l,sl,tp) \
64 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__) 64 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSY, __LINE__)
65#define TRACE_UNBUSY(__FUNCTION__,s,ag,sl,tp) \ 65#define TRACE_UNBUSY(__func__,s,ag,sl,tp) \
66 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__) 66 xfs_alloc_trace_busy(__func__, s, mp, ag, -1, -1, sl, tp, XFS_ALLOC_KTRACE_UNBUSY, __LINE__)
67#define TRACE_BUSYSEARCH(__FUNCTION__,s,ag,agb,l,sl,tp) \ 67#define TRACE_BUSYSEARCH(__func__,s,ag,agb,l,tp) \
68 xfs_alloc_trace_busy(__FUNCTION__, s, mp, ag, agb, l, sl, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__) 68 xfs_alloc_trace_busy(__func__, s, mp, ag, agb, l, 0, tp, XFS_ALLOC_KTRACE_BUSYSEARCH, __LINE__)
69#else 69#else
70#define TRACE_ALLOC(s,a) 70#define TRACE_ALLOC(s,a)
71#define TRACE_FREE(s,a,b,x,f) 71#define TRACE_FREE(s,a,b,x,f)
72#define TRACE_MODAGF(s,a,f) 72#define TRACE_MODAGF(s,a,f)
73#define TRACE_BUSY(s,a,ag,agb,l,sl,tp) 73#define TRACE_BUSY(s,a,ag,agb,l,sl,tp)
74#define TRACE_UNBUSY(fname,s,ag,sl,tp) 74#define TRACE_UNBUSY(fname,s,ag,sl,tp)
75#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,sl,tp) 75#define TRACE_BUSYSEARCH(fname,s,ag,agb,l,tp)
76#endif /* XFS_ALLOC_TRACE */ 76#endif /* XFS_ALLOC_TRACE */
77 77
78/* 78/*
@@ -93,7 +93,7 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
93 * Compute aligned version of the found extent. 93 * Compute aligned version of the found extent.
94 * Takes alignment and min length into account. 94 * Takes alignment and min length into account.
95 */ 95 */
96STATIC int /* success (>= minlen) */ 96STATIC void
97xfs_alloc_compute_aligned( 97xfs_alloc_compute_aligned(
98 xfs_agblock_t foundbno, /* starting block in found extent */ 98 xfs_agblock_t foundbno, /* starting block in found extent */
99 xfs_extlen_t foundlen, /* length in found extent */ 99 xfs_extlen_t foundlen, /* length in found extent */
@@ -116,7 +116,6 @@ xfs_alloc_compute_aligned(
116 } 116 }
117 *resbno = bno; 117 *resbno = bno;
118 *reslen = len; 118 *reslen = len;
119 return len >= minlen;
120} 119}
121 120
122/* 121/*
@@ -837,9 +836,9 @@ xfs_alloc_ag_vextent_near(
837 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i))) 836 if ((error = xfs_alloc_get_rec(cnt_cur, &ltbno, &ltlen, &i)))
838 goto error0; 837 goto error0;
839 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 838 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
840 if (!xfs_alloc_compute_aligned(ltbno, ltlen, 839 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
841 args->alignment, args->minlen, 840 args->minlen, &ltbnoa, &ltlena);
842 &ltbnoa, &ltlena)) 841 if (ltlena < args->minlen)
843 continue; 842 continue;
844 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); 843 args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen);
845 xfs_alloc_fix_len(args); 844 xfs_alloc_fix_len(args);
@@ -958,9 +957,9 @@ xfs_alloc_ag_vextent_near(
958 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i))) 957 if ((error = xfs_alloc_get_rec(bno_cur_lt, &ltbno, &ltlen, &i)))
959 goto error0; 958 goto error0;
960 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 959 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
961 if (xfs_alloc_compute_aligned(ltbno, ltlen, 960 xfs_alloc_compute_aligned(ltbno, ltlen, args->alignment,
962 args->alignment, args->minlen, 961 args->minlen, &ltbnoa, &ltlena);
963 &ltbnoa, &ltlena)) 962 if (ltlena >= args->minlen)
964 break; 963 break;
965 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i))) 964 if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
966 goto error0; 965 goto error0;
@@ -974,9 +973,9 @@ xfs_alloc_ag_vextent_near(
974 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i))) 973 if ((error = xfs_alloc_get_rec(bno_cur_gt, &gtbno, &gtlen, &i)))
975 goto error0; 974 goto error0;
976 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 975 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
977 if (xfs_alloc_compute_aligned(gtbno, gtlen, 976 xfs_alloc_compute_aligned(gtbno, gtlen, args->alignment,
978 args->alignment, args->minlen, 977 args->minlen, &gtbnoa, &gtlena);
979 &gtbnoa, &gtlena)) 978 if (gtlena >= args->minlen)
980 break; 979 break;
981 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i))) 980 if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
982 goto error0; 981 goto error0;
@@ -2562,9 +2561,10 @@ xfs_alloc_clear_busy(xfs_trans_t *tp,
2562 2561
2563 2562
2564/* 2563/*
2565 * returns non-zero if any of (agno,bno):len is in a busy list 2564 * If we find the extent in the busy list, force the log out to get the
2565 * extent out of the busy list so the caller can use it straight away.
2566 */ 2566 */
2567STATIC int 2567STATIC void
2568xfs_alloc_search_busy(xfs_trans_t *tp, 2568xfs_alloc_search_busy(xfs_trans_t *tp,
2569 xfs_agnumber_t agno, 2569 xfs_agnumber_t agno,
2570 xfs_agblock_t bno, 2570 xfs_agblock_t bno,
@@ -2572,7 +2572,6 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2572{ 2572{
2573 xfs_mount_t *mp; 2573 xfs_mount_t *mp;
2574 xfs_perag_busy_t *bsy; 2574 xfs_perag_busy_t *bsy;
2575 int n;
2576 xfs_agblock_t uend, bend; 2575 xfs_agblock_t uend, bend;
2577 xfs_lsn_t lsn; 2576 xfs_lsn_t lsn;
2578 int cnt; 2577 int cnt;
@@ -2585,21 +2584,18 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2585 uend = bno + len - 1; 2584 uend = bno + len - 1;
2586 2585
2587 /* search pagb_list for this slot, skipping open slots */ 2586 /* search pagb_list for this slot, skipping open slots */
2588 for (bsy = mp->m_perag[agno].pagb_list, n = 0; 2587 for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) {
2589 cnt; bsy++, n++) {
2590 2588
2591 /* 2589 /*
2592 * (start1,length1) within (start2, length2) 2590 * (start1,length1) within (start2, length2)
2593 */ 2591 */
2594 if (bsy->busy_tp != NULL) { 2592 if (bsy->busy_tp != NULL) {
2595 bend = bsy->busy_start + bsy->busy_length - 1; 2593 bend = bsy->busy_start + bsy->busy_length - 1;
2596 if ((bno > bend) || 2594 if ((bno > bend) || (uend < bsy->busy_start)) {
2597 (uend < bsy->busy_start)) {
2598 cnt--; 2595 cnt--;
2599 } else { 2596 } else {
2600 TRACE_BUSYSEARCH("xfs_alloc_search_busy", 2597 TRACE_BUSYSEARCH("xfs_alloc_search_busy",
2601 "found1", agno, bno, len, n, 2598 "found1", agno, bno, len, tp);
2602 tp);
2603 break; 2599 break;
2604 } 2600 }
2605 } 2601 }
@@ -2610,15 +2606,12 @@ xfs_alloc_search_busy(xfs_trans_t *tp,
2610 * transaction that freed the block 2606 * transaction that freed the block
2611 */ 2607 */
2612 if (cnt) { 2608 if (cnt) {
2613 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, n, tp); 2609 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "found", agno, bno, len, tp);
2614 lsn = bsy->busy_tp->t_commit_lsn; 2610 lsn = bsy->busy_tp->t_commit_lsn;
2615 spin_unlock(&mp->m_perag[agno].pagb_lock); 2611 spin_unlock(&mp->m_perag[agno].pagb_lock);
2616 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); 2612 xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC);
2617 } else { 2613 } else {
2618 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, n, tp); 2614 TRACE_BUSYSEARCH("xfs_alloc_search_busy", "not-found", agno, bno, len, tp);
2619 n = -1;
2620 spin_unlock(&mp->m_perag[agno].pagb_lock); 2615 spin_unlock(&mp->m_perag[agno].pagb_lock);
2621 } 2616 }
2622
2623 return n;
2624} 2617}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index e58f321fdae9..36d781ee5fcc 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -2647,14 +2647,6 @@ attr_trusted_capable(
2647} 2647}
2648 2648
2649STATIC int 2649STATIC int
2650attr_secure_capable(
2651 bhv_vnode_t *vp,
2652 cred_t *cred)
2653{
2654 return -ENOSECURITY;
2655}
2656
2657STATIC int
2658attr_system_set( 2650attr_system_set(
2659 bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags) 2651 bhv_vnode_t *vp, char *name, void *data, size_t size, int xflags)
2660{ 2652{
@@ -2724,7 +2716,7 @@ struct attrnames attr_secure = {
2724 .attr_get = attr_generic_get, 2716 .attr_get = attr_generic_get,
2725 .attr_set = attr_generic_set, 2717 .attr_set = attr_generic_set,
2726 .attr_remove = attr_generic_remove, 2718 .attr_remove = attr_generic_remove,
2727 .attr_capable = attr_secure_capable, 2719 .attr_capable = (attrcapable_t)fs_noerr,
2728}; 2720};
2729 2721
2730struct attrnames attr_user = { 2722struct attrnames attr_user = {
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index 96ba6aa4ed8c..303d41e4217b 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -166,7 +166,7 @@ xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes)
166 166
167 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) { 167 if (!(mp->m_flags & XFS_MOUNT_ATTR2)) {
168 if (bytes <= XFS_IFORK_ASIZE(dp)) 168 if (bytes <= XFS_IFORK_ASIZE(dp))
169 return mp->m_attroffset >> 3; 169 return dp->i_d.di_forkoff;
170 return 0; 170 return 0;
171 } 171 }
172 172
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 2def273855a2..eb198c01c35d 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -323,13 +323,13 @@ xfs_bmap_trace_pre_update(
323 int whichfork); /* data or attr fork */ 323 int whichfork); /* data or attr fork */
324 324
325#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \ 325#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) \
326 xfs_bmap_trace_delete(__FUNCTION__,d,ip,i,c,w) 326 xfs_bmap_trace_delete(__func__,d,ip,i,c,w)
327#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \ 327#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) \
328 xfs_bmap_trace_insert(__FUNCTION__,d,ip,i,c,r1,r2,w) 328 xfs_bmap_trace_insert(__func__,d,ip,i,c,r1,r2,w)
329#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \ 329#define XFS_BMAP_TRACE_POST_UPDATE(d,ip,i,w) \
330 xfs_bmap_trace_post_update(__FUNCTION__,d,ip,i,w) 330 xfs_bmap_trace_post_update(__func__,d,ip,i,w)
331#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \ 331#define XFS_BMAP_TRACE_PRE_UPDATE(d,ip,i,w) \
332 xfs_bmap_trace_pre_update(__FUNCTION__,d,ip,i,w) 332 xfs_bmap_trace_pre_update(__func__,d,ip,i,w)
333#else 333#else
334#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w) 334#define XFS_BMAP_TRACE_DELETE(d,ip,i,c,w)
335#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w) 335#define XFS_BMAP_TRACE_INSERT(d,ip,i,c,r1,r2,w)
@@ -2402,7 +2402,7 @@ xfs_bmap_extsize_align(
2402 2402
2403#define XFS_ALLOC_GAP_UNITS 4 2403#define XFS_ALLOC_GAP_UNITS 4
2404 2404
2405STATIC int 2405STATIC void
2406xfs_bmap_adjacent( 2406xfs_bmap_adjacent(
2407 xfs_bmalloca_t *ap) /* bmap alloc argument struct */ 2407 xfs_bmalloca_t *ap) /* bmap alloc argument struct */
2408{ 2408{
@@ -2548,7 +2548,6 @@ xfs_bmap_adjacent(
2548 ap->rval = gotbno; 2548 ap->rval = gotbno;
2549 } 2549 }
2550#undef ISVALID 2550#undef ISVALID
2551 return 0;
2552} 2551}
2553 2552
2554STATIC int 2553STATIC int
@@ -4154,16 +4153,21 @@ xfs_bmap_compute_maxlevels(
4154 * number of leaf entries, is controlled by the type of di_nextents 4153 * number of leaf entries, is controlled by the type of di_nextents
4155 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents 4154 * (a signed 32-bit number, xfs_extnum_t), or by di_anextents
4156 * (a signed 16-bit number, xfs_aextnum_t). 4155 * (a signed 16-bit number, xfs_aextnum_t).
4156 *
4157 * Note that we can no longer assume that if we are in ATTR1 that
4158 * the fork offset of all the inodes will be (m_attroffset >> 3)
4159 * because we could have mounted with ATTR2 and then mounted back
4160 * with ATTR1, keeping the di_forkoff's fixed but probably at
4161 * various positions. Therefore, for both ATTR1 and ATTR2
4162 * we have to assume the worst case scenario of a minimum size
4163 * available.
4157 */ 4164 */
4158 if (whichfork == XFS_DATA_FORK) { 4165 if (whichfork == XFS_DATA_FORK) {
4159 maxleafents = MAXEXTNUM; 4166 maxleafents = MAXEXTNUM;
4160 sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? 4167 sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
4161 XFS_BMDR_SPACE_CALC(MINDBTPTRS) : mp->m_attroffset;
4162 } else { 4168 } else {
4163 maxleafents = MAXAEXTNUM; 4169 maxleafents = MAXAEXTNUM;
4164 sz = (mp->m_flags & XFS_MOUNT_ATTR2) ? 4170 sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
4165 XFS_BMDR_SPACE_CALC(MINABTPTRS) :
4166 mp->m_sb.sb_inodesize - mp->m_attroffset;
4167 } 4171 }
4168 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0); 4172 maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
4169 minleafrecs = mp->m_bmap_dmnr[0]; 4173 minleafrecs = mp->m_bmap_dmnr[0];
@@ -5772,7 +5776,6 @@ xfs_getbmap(
5772 int error; /* return value */ 5776 int error; /* return value */
5773 __int64_t fixlen; /* length for -1 case */ 5777 __int64_t fixlen; /* length for -1 case */
5774 int i; /* extent number */ 5778 int i; /* extent number */
5775 bhv_vnode_t *vp; /* corresponding vnode */
5776 int lock; /* lock state */ 5779 int lock; /* lock state */
5777 xfs_bmbt_irec_t *map; /* buffer for user's data */ 5780 xfs_bmbt_irec_t *map; /* buffer for user's data */
5778 xfs_mount_t *mp; /* file system mount point */ 5781 xfs_mount_t *mp; /* file system mount point */
@@ -5789,7 +5792,6 @@ xfs_getbmap(
5789 int bmapi_flags; /* flags for xfs_bmapi */ 5792 int bmapi_flags; /* flags for xfs_bmapi */
5790 __int32_t oflags; /* getbmapx bmv_oflags field */ 5793 __int32_t oflags; /* getbmapx bmv_oflags field */
5791 5794
5792 vp = XFS_ITOV(ip);
5793 mp = ip->i_mount; 5795 mp = ip->i_mount;
5794 5796
5795 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK; 5797 whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
@@ -5811,7 +5813,7 @@ xfs_getbmap(
5811 if ((interface & BMV_IF_NO_DMAPI_READ) == 0 && 5813 if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
5812 DM_EVENT_ENABLED(ip, DM_EVENT_READ) && 5814 DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
5813 whichfork == XFS_DATA_FORK) { 5815 whichfork == XFS_DATA_FORK) {
5814 error = XFS_SEND_DATA(mp, DM_EVENT_READ, vp, 0, 0, 0, NULL); 5816 error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
5815 if (error) 5817 if (error)
5816 return XFS_ERROR(error); 5818 return XFS_ERROR(error);
5817 } 5819 }
@@ -5869,6 +5871,10 @@ xfs_getbmap(
5869 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */ 5871 /* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
5870 error = xfs_flush_pages(ip, (xfs_off_t)0, 5872 error = xfs_flush_pages(ip, (xfs_off_t)0,
5871 -1, 0, FI_REMAPF); 5873 -1, 0, FI_REMAPF);
5874 if (error) {
5875 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
5876 return error;
5877 }
5872 } 5878 }
5873 5879
5874 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0); 5880 ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
@@ -6162,10 +6168,10 @@ xfs_check_block(
6162 } 6168 }
6163 if (*thispa == *pp) { 6169 if (*thispa == *pp) {
6164 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld", 6170 cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
6165 __FUNCTION__, j, i, 6171 __func__, j, i,
6166 (unsigned long long)be64_to_cpu(*thispa)); 6172 (unsigned long long)be64_to_cpu(*thispa));
6167 panic("%s: ptrs are equal in node\n", 6173 panic("%s: ptrs are equal in node\n",
6168 __FUNCTION__); 6174 __func__);
6169 } 6175 }
6170 } 6176 }
6171 } 6177 }
@@ -6192,7 +6198,7 @@ xfs_bmap_check_leaf_extents(
6192 xfs_mount_t *mp; /* file system mount structure */ 6198 xfs_mount_t *mp; /* file system mount structure */
6193 __be64 *pp; /* pointer to block address */ 6199 __be64 *pp; /* pointer to block address */
6194 xfs_bmbt_rec_t *ep; /* pointer to current extent */ 6200 xfs_bmbt_rec_t *ep; /* pointer to current extent */
6195 xfs_bmbt_rec_t *lastp; /* pointer to previous extent */ 6201 xfs_bmbt_rec_t last = {0, 0}; /* last extent in prev block */
6196 xfs_bmbt_rec_t *nextp; /* pointer to next extent */ 6202 xfs_bmbt_rec_t *nextp; /* pointer to next extent */
6197 int bp_release = 0; 6203 int bp_release = 0;
6198 6204
@@ -6262,7 +6268,6 @@ xfs_bmap_check_leaf_extents(
6262 /* 6268 /*
6263 * Loop over all leaf nodes checking that all extents are in the right order. 6269 * Loop over all leaf nodes checking that all extents are in the right order.
6264 */ 6270 */
6265 lastp = NULL;
6266 for (;;) { 6271 for (;;) {
6267 xfs_fsblock_t nextbno; 6272 xfs_fsblock_t nextbno;
6268 xfs_extnum_t num_recs; 6273 xfs_extnum_t num_recs;
@@ -6283,18 +6288,16 @@ xfs_bmap_check_leaf_extents(
6283 */ 6288 */
6284 6289
6285 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1); 6290 ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
6291 if (i) {
6292 xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
6293 }
6286 for (j = 1; j < num_recs; j++) { 6294 for (j = 1; j < num_recs; j++) {
6287 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1); 6295 nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
6288 if (lastp) { 6296 xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
6289 xfs_btree_check_rec(XFS_BTNUM_BMAP,
6290 (void *)lastp, (void *)ep);
6291 }
6292 xfs_btree_check_rec(XFS_BTNUM_BMAP, (void *)ep,
6293 (void *)(nextp));
6294 lastp = ep;
6295 ep = nextp; 6297 ep = nextp;
6296 } 6298 }
6297 6299
6300 last = *ep;
6298 i += num_recs; 6301 i += num_recs;
6299 if (bp_release) { 6302 if (bp_release) {
6300 bp_release = 0; 6303 bp_release = 0;
@@ -6325,13 +6328,13 @@ xfs_bmap_check_leaf_extents(
6325 return; 6328 return;
6326 6329
6327error0: 6330error0:
6328 cmn_err(CE_WARN, "%s: at error0", __FUNCTION__); 6331 cmn_err(CE_WARN, "%s: at error0", __func__);
6329 if (bp_release) 6332 if (bp_release)
6330 xfs_trans_brelse(NULL, bp); 6333 xfs_trans_brelse(NULL, bp);
6331error_norelse: 6334error_norelse:
6332 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents", 6335 cmn_err(CE_WARN, "%s: BAD after btree leaves for %d extents",
6333 __FUNCTION__, i); 6336 __func__, i);
6334 panic("%s: CORRUPTED BTREE OR SOMETHING", __FUNCTION__); 6337 panic("%s: CORRUPTED BTREE OR SOMETHING", __func__);
6335 return; 6338 return;
6336} 6339}
6337#endif 6340#endif
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 87224b7d7984..6ff70cda451c 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -151,7 +151,7 @@ xfs_bmap_trace_exlist(
151 xfs_extnum_t cnt, /* count of entries in list */ 151 xfs_extnum_t cnt, /* count of entries in list */
152 int whichfork); /* data or attr fork */ 152 int whichfork); /* data or attr fork */
153#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \ 153#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
154 xfs_bmap_trace_exlist(__FUNCTION__,ip,c,w) 154 xfs_bmap_trace_exlist(__func__,ip,c,w)
155#else 155#else
156#define XFS_BMAP_TRACE_EXLIST(ip,c,w) 156#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
157#endif 157#endif
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index bd18987326a3..4f0e849d973e 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -275,21 +275,21 @@ xfs_bmbt_trace_cursor(
275} 275}
276 276
277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \ 277#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
278 xfs_bmbt_trace_argbi(__FUNCTION__, c, b, i, __LINE__) 278 xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \ 279#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
280 xfs_bmbt_trace_argbii(__FUNCTION__, c, b, i, j, __LINE__) 280 xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \ 281#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
282 xfs_bmbt_trace_argfffi(__FUNCTION__, c, o, b, i, j, __LINE__) 282 xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
283#define XFS_BMBT_TRACE_ARGI(c,i) \ 283#define XFS_BMBT_TRACE_ARGI(c,i) \
284 xfs_bmbt_trace_argi(__FUNCTION__, c, i, __LINE__) 284 xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \ 285#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
286 xfs_bmbt_trace_argifk(__FUNCTION__, c, i, f, s, __LINE__) 286 xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \ 287#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
288 xfs_bmbt_trace_argifr(__FUNCTION__, c, i, f, r, __LINE__) 288 xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \ 289#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
290 xfs_bmbt_trace_argik(__FUNCTION__, c, i, k, __LINE__) 290 xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
291#define XFS_BMBT_TRACE_CURSOR(c,s) \ 291#define XFS_BMBT_TRACE_CURSOR(c,s) \
292 xfs_bmbt_trace_cursor(__FUNCTION__, c, s, __LINE__) 292 xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
293#else 293#else
294#define XFS_BMBT_TRACE_ARGBI(c,b,i) 294#define XFS_BMBT_TRACE_ARGBI(c,b,i)
295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) 295#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
@@ -2027,6 +2027,24 @@ xfs_bmbt_increment(
2027 2027
2028/* 2028/*
2029 * Insert the current record at the point referenced by cur. 2029 * Insert the current record at the point referenced by cur.
2030 *
2031 * A multi-level split of the tree on insert will invalidate the original
2032 * cursor. It appears, however, that some callers assume that the cursor is
2033 * always valid. Hence if we do a multi-level split we need to revalidate the
2034 * cursor.
2035 *
2036 * When a split occurs, we will see a new cursor returned. Use that as a
2037 * trigger to determine if we need to revalidate the original cursor. If we get
2038 * a split, then use the original irec to lookup up the path of the record we
2039 * just inserted.
2040 *
2041 * Note that the fact that the btree root is in the inode means that we can
2042 * have the level of the tree change without a "split" occurring at the root
2043 * level. What happens is that the root is migrated to an allocated block and
2044 * the inode root is pointed to it. This means a single split can change the
2045 * level of the tree (level 2 -> level 3) and invalidate the old cursor. Hence
2046 * the level change should be accounted as a split so as to correctly trigger a
2047 * revalidation of the old cursor.
2030 */ 2048 */
2031int /* error */ 2049int /* error */
2032xfs_bmbt_insert( 2050xfs_bmbt_insert(
@@ -2039,11 +2057,14 @@ xfs_bmbt_insert(
2039 xfs_fsblock_t nbno; 2057 xfs_fsblock_t nbno;
2040 xfs_btree_cur_t *ncur; 2058 xfs_btree_cur_t *ncur;
2041 xfs_bmbt_rec_t nrec; 2059 xfs_bmbt_rec_t nrec;
2060 xfs_bmbt_irec_t oirec; /* original irec */
2042 xfs_btree_cur_t *pcur; 2061 xfs_btree_cur_t *pcur;
2062 int splits = 0;
2043 2063
2044 XFS_BMBT_TRACE_CURSOR(cur, ENTRY); 2064 XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
2045 level = 0; 2065 level = 0;
2046 nbno = NULLFSBLOCK; 2066 nbno = NULLFSBLOCK;
2067 oirec = cur->bc_rec.b;
2047 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b); 2068 xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
2048 ncur = NULL; 2069 ncur = NULL;
2049 pcur = cur; 2070 pcur = cur;
@@ -2052,11 +2073,13 @@ xfs_bmbt_insert(
2052 &i))) { 2073 &i))) {
2053 if (pcur != cur) 2074 if (pcur != cur)
2054 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR); 2075 xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
2055 XFS_BMBT_TRACE_CURSOR(cur, ERROR); 2076 goto error0;
2056 return error;
2057 } 2077 }
2058 XFS_WANT_CORRUPTED_GOTO(i == 1, error0); 2078 XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
2059 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) { 2079 if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
2080 /* allocating a new root is effectively a split */
2081 if (cur->bc_nlevels != pcur->bc_nlevels)
2082 splits++;
2060 cur->bc_nlevels = pcur->bc_nlevels; 2083 cur->bc_nlevels = pcur->bc_nlevels;
2061 cur->bc_private.b.allocated += 2084 cur->bc_private.b.allocated +=
2062 pcur->bc_private.b.allocated; 2085 pcur->bc_private.b.allocated;
@@ -2070,10 +2093,21 @@ xfs_bmbt_insert(
2070 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR); 2093 xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
2071 } 2094 }
2072 if (ncur) { 2095 if (ncur) {
2096 splits++;
2073 pcur = ncur; 2097 pcur = ncur;
2074 ncur = NULL; 2098 ncur = NULL;
2075 } 2099 }
2076 } while (nbno != NULLFSBLOCK); 2100 } while (nbno != NULLFSBLOCK);
2101
2102 if (splits > 1) {
2103 /* revalidate the old cursor as we had a multi-level split */
2104 error = xfs_bmbt_lookup_eq(cur, oirec.br_startoff,
2105 oirec.br_startblock, oirec.br_blockcount, &i);
2106 if (error)
2107 goto error0;
2108 ASSERT(i == 1);
2109 }
2110
2077 XFS_BMBT_TRACE_CURSOR(cur, EXIT); 2111 XFS_BMBT_TRACE_CURSOR(cur, EXIT);
2078 *stat = i; 2112 *stat = i;
2079 return 0; 2113 return 0;
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 63debd147eb5..53a71c62025d 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -645,7 +645,12 @@ xfs_buf_item_push(
645 bp = bip->bli_buf; 645 bp = bip->bli_buf;
646 646
647 if (XFS_BUF_ISDELAYWRITE(bp)) { 647 if (XFS_BUF_ISDELAYWRITE(bp)) {
648 xfs_bawrite(bip->bli_item.li_mountp, bp); 648 int error;
649 error = xfs_bawrite(bip->bli_item.li_mountp, bp);
650 if (error)
651 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp,
652 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p",
653 error, bip, bp);
649 } else { 654 } else {
650 xfs_buf_relse(bp); 655 xfs_buf_relse(bp);
651 } 656 }
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c
index e92e73f0e6af..7cb26529766b 100644
--- a/fs/xfs/xfs_dir2.c
+++ b/fs/xfs/xfs_dir2.c
@@ -44,6 +44,7 @@
44#include "xfs_error.h" 44#include "xfs_error.h"
45#include "xfs_vnodeops.h" 45#include "xfs_vnodeops.h"
46 46
47struct xfs_name xfs_name_dotdot = {"..", 2};
47 48
48void 49void
49xfs_dir_mount( 50xfs_dir_mount(
@@ -146,8 +147,7 @@ int
146xfs_dir_createname( 147xfs_dir_createname(
147 xfs_trans_t *tp, 148 xfs_trans_t *tp,
148 xfs_inode_t *dp, 149 xfs_inode_t *dp,
149 char *name, 150 struct xfs_name *name,
150 int namelen,
151 xfs_ino_t inum, /* new entry inode number */ 151 xfs_ino_t inum, /* new entry inode number */
152 xfs_fsblock_t *first, /* bmap's firstblock */ 152 xfs_fsblock_t *first, /* bmap's firstblock */
153 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 153 xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -162,9 +162,9 @@ xfs_dir_createname(
162 return rval; 162 return rval;
163 XFS_STATS_INC(xs_dir_create); 163 XFS_STATS_INC(xs_dir_create);
164 164
165 args.name = name; 165 args.name = name->name;
166 args.namelen = namelen; 166 args.namelen = name->len;
167 args.hashval = xfs_da_hashname(name, namelen); 167 args.hashval = xfs_da_hashname(name->name, name->len);
168 args.inumber = inum; 168 args.inumber = inum;
169 args.dp = dp; 169 args.dp = dp;
170 args.firstblock = first; 170 args.firstblock = first;
@@ -197,8 +197,7 @@ int
197xfs_dir_lookup( 197xfs_dir_lookup(
198 xfs_trans_t *tp, 198 xfs_trans_t *tp,
199 xfs_inode_t *dp, 199 xfs_inode_t *dp,
200 char *name, 200 struct xfs_name *name,
201 int namelen,
202 xfs_ino_t *inum) /* out: inode number */ 201 xfs_ino_t *inum) /* out: inode number */
203{ 202{
204 xfs_da_args_t args; 203 xfs_da_args_t args;
@@ -207,18 +206,14 @@ xfs_dir_lookup(
207 206
208 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 207 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
209 XFS_STATS_INC(xs_dir_lookup); 208 XFS_STATS_INC(xs_dir_lookup);
209 memset(&args, 0, sizeof(xfs_da_args_t));
210 210
211 args.name = name; 211 args.name = name->name;
212 args.namelen = namelen; 212 args.namelen = name->len;
213 args.hashval = xfs_da_hashname(name, namelen); 213 args.hashval = xfs_da_hashname(name->name, name->len);
214 args.inumber = 0;
215 args.dp = dp; 214 args.dp = dp;
216 args.firstblock = NULL;
217 args.flist = NULL;
218 args.total = 0;
219 args.whichfork = XFS_DATA_FORK; 215 args.whichfork = XFS_DATA_FORK;
220 args.trans = tp; 216 args.trans = tp;
221 args.justcheck = args.addname = 0;
222 args.oknoent = 1; 217 args.oknoent = 1;
223 218
224 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) 219 if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
@@ -247,8 +242,7 @@ int
247xfs_dir_removename( 242xfs_dir_removename(
248 xfs_trans_t *tp, 243 xfs_trans_t *tp,
249 xfs_inode_t *dp, 244 xfs_inode_t *dp,
250 char *name, 245 struct xfs_name *name,
251 int namelen,
252 xfs_ino_t ino, 246 xfs_ino_t ino,
253 xfs_fsblock_t *first, /* bmap's firstblock */ 247 xfs_fsblock_t *first, /* bmap's firstblock */
254 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 248 xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -261,9 +255,9 @@ xfs_dir_removename(
261 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 255 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
262 XFS_STATS_INC(xs_dir_remove); 256 XFS_STATS_INC(xs_dir_remove);
263 257
264 args.name = name; 258 args.name = name->name;
265 args.namelen = namelen; 259 args.namelen = name->len;
266 args.hashval = xfs_da_hashname(name, namelen); 260 args.hashval = xfs_da_hashname(name->name, name->len);
267 args.inumber = ino; 261 args.inumber = ino;
268 args.dp = dp; 262 args.dp = dp;
269 args.firstblock = first; 263 args.firstblock = first;
@@ -329,8 +323,7 @@ int
329xfs_dir_replace( 323xfs_dir_replace(
330 xfs_trans_t *tp, 324 xfs_trans_t *tp,
331 xfs_inode_t *dp, 325 xfs_inode_t *dp,
332 char *name, /* name of entry to replace */ 326 struct xfs_name *name, /* name of entry to replace */
333 int namelen,
334 xfs_ino_t inum, /* new inode number */ 327 xfs_ino_t inum, /* new inode number */
335 xfs_fsblock_t *first, /* bmap's firstblock */ 328 xfs_fsblock_t *first, /* bmap's firstblock */
336 xfs_bmap_free_t *flist, /* bmap's freeblock list */ 329 xfs_bmap_free_t *flist, /* bmap's freeblock list */
@@ -345,9 +338,9 @@ xfs_dir_replace(
345 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum))) 338 if ((rval = xfs_dir_ino_validate(tp->t_mountp, inum)))
346 return rval; 339 return rval;
347 340
348 args.name = name; 341 args.name = name->name;
349 args.namelen = namelen; 342 args.namelen = name->len;
350 args.hashval = xfs_da_hashname(name, namelen); 343 args.hashval = xfs_da_hashname(name->name, name->len);
351 args.inumber = inum; 344 args.inumber = inum;
352 args.dp = dp; 345 args.dp = dp;
353 args.firstblock = first; 346 args.firstblock = first;
@@ -374,28 +367,29 @@ xfs_dir_replace(
374 367
375/* 368/*
376 * See if this entry can be added to the directory without allocating space. 369 * See if this entry can be added to the directory without allocating space.
370 * First checks that the caller couldn't reserve enough space (resblks = 0).
377 */ 371 */
378int 372int
379xfs_dir_canenter( 373xfs_dir_canenter(
380 xfs_trans_t *tp, 374 xfs_trans_t *tp,
381 xfs_inode_t *dp, 375 xfs_inode_t *dp,
382 char *name, /* name of entry to add */ 376 struct xfs_name *name, /* name of entry to add */
383 int namelen) 377 uint resblks)
384{ 378{
385 xfs_da_args_t args; 379 xfs_da_args_t args;
386 int rval; 380 int rval;
387 int v; /* type-checking value */ 381 int v; /* type-checking value */
388 382
383 if (resblks)
384 return 0;
385
389 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR); 386 ASSERT((dp->i_d.di_mode & S_IFMT) == S_IFDIR);
387 memset(&args, 0, sizeof(xfs_da_args_t));
390 388
391 args.name = name; 389 args.name = name->name;
392 args.namelen = namelen; 390 args.namelen = name->len;
393 args.hashval = xfs_da_hashname(name, namelen); 391 args.hashval = xfs_da_hashname(name->name, name->len);
394 args.inumber = 0;
395 args.dp = dp; 392 args.dp = dp;
396 args.firstblock = NULL;
397 args.flist = NULL;
398 args.total = 0;
399 args.whichfork = XFS_DATA_FORK; 393 args.whichfork = XFS_DATA_FORK;
400 args.trans = tp; 394 args.trans = tp;
401 args.justcheck = args.addname = args.oknoent = 1; 395 args.justcheck = args.addname = args.oknoent = 1;
diff --git a/fs/xfs/xfs_dir2.h b/fs/xfs/xfs_dir2.h
index b265197e74cf..6392f939029f 100644
--- a/fs/xfs/xfs_dir2.h
+++ b/fs/xfs/xfs_dir2.h
@@ -59,6 +59,8 @@ typedef __uint32_t xfs_dir2_db_t;
59 */ 59 */
60typedef xfs_off_t xfs_dir2_off_t; 60typedef xfs_off_t xfs_dir2_off_t;
61 61
62extern struct xfs_name xfs_name_dotdot;
63
62/* 64/*
63 * Generic directory interface routines 65 * Generic directory interface routines
64 */ 66 */
@@ -68,21 +70,21 @@ extern int xfs_dir_isempty(struct xfs_inode *dp);
68extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp, 70extern int xfs_dir_init(struct xfs_trans *tp, struct xfs_inode *dp,
69 struct xfs_inode *pdp); 71 struct xfs_inode *pdp);
70extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp, 72extern int xfs_dir_createname(struct xfs_trans *tp, struct xfs_inode *dp,
71 char *name, int namelen, xfs_ino_t inum, 73 struct xfs_name *name, xfs_ino_t inum,
72 xfs_fsblock_t *first, 74 xfs_fsblock_t *first,
73 struct xfs_bmap_free *flist, xfs_extlen_t tot); 75 struct xfs_bmap_free *flist, xfs_extlen_t tot);
74extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp, 76extern int xfs_dir_lookup(struct xfs_trans *tp, struct xfs_inode *dp,
75 char *name, int namelen, xfs_ino_t *inum); 77 struct xfs_name *name, xfs_ino_t *inum);
76extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp, 78extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
77 char *name, int namelen, xfs_ino_t ino, 79 struct xfs_name *name, xfs_ino_t ino,
78 xfs_fsblock_t *first, 80 xfs_fsblock_t *first,
79 struct xfs_bmap_free *flist, xfs_extlen_t tot); 81 struct xfs_bmap_free *flist, xfs_extlen_t tot);
80extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, 82extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
81 char *name, int namelen, xfs_ino_t inum, 83 struct xfs_name *name, xfs_ino_t inum,
82 xfs_fsblock_t *first, 84 xfs_fsblock_t *first,
83 struct xfs_bmap_free *flist, xfs_extlen_t tot); 85 struct xfs_bmap_free *flist, xfs_extlen_t tot);
84extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, 86extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp,
85 char *name, int namelen); 87 struct xfs_name *name, uint resblks);
86extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); 88extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino);
87 89
88/* 90/*
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index eb03eab5ca52..3f3785b10804 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -73,7 +73,7 @@ xfs_filestreams_trace(
73#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0) 73#define TRACE4(mp,t,a0,a1,a2,a3) TRACE6(mp,t,a0,a1,a2,a3,0,0)
74#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0) 74#define TRACE5(mp,t,a0,a1,a2,a3,a4) TRACE6(mp,t,a0,a1,a2,a3,a4,0)
75#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \ 75#define TRACE6(mp,t,a0,a1,a2,a3,a4,a5) \
76 xfs_filestreams_trace(mp, t, __FUNCTION__, __LINE__, \ 76 xfs_filestreams_trace(mp, t, __func__, __LINE__, \
77 (__psunsigned_t)a0, (__psunsigned_t)a1, \ 77 (__psunsigned_t)a0, (__psunsigned_t)a1, \
78 (__psunsigned_t)a2, (__psunsigned_t)a3, \ 78 (__psunsigned_t)a2, (__psunsigned_t)a3, \
79 (__psunsigned_t)a4, (__psunsigned_t)a5) 79 (__psunsigned_t)a4, (__psunsigned_t)a5)
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 5a146cb22980..a64dfbd565a5 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -107,6 +107,16 @@ xfs_ialloc_log_di(
107/* 107/*
108 * Allocation group level functions. 108 * Allocation group level functions.
109 */ 109 */
110static inline int
111xfs_ialloc_cluster_alignment(
112 xfs_alloc_arg_t *args)
113{
114 if (xfs_sb_version_hasalign(&args->mp->m_sb) &&
115 args->mp->m_sb.sb_inoalignmt >=
116 XFS_B_TO_FSBT(args->mp, XFS_INODE_CLUSTER_SIZE(args->mp)))
117 return args->mp->m_sb.sb_inoalignmt;
118 return 1;
119}
110 120
111/* 121/*
112 * Allocate new inodes in the allocation group specified by agbp. 122 * Allocate new inodes in the allocation group specified by agbp.
@@ -167,10 +177,24 @@ xfs_ialloc_ag_alloc(
167 args.mod = args.total = args.wasdel = args.isfl = 177 args.mod = args.total = args.wasdel = args.isfl =
168 args.userdata = args.minalignslop = 0; 178 args.userdata = args.minalignslop = 0;
169 args.prod = 1; 179 args.prod = 1;
170 args.alignment = 1; 180
171 /* 181 /*
172 * Allow space for the inode btree to split. 182 * We need to take into account alignment here to ensure that
183 * we don't modify the free list if we fail to have an exact
184 * block. If we don't have an exact match, and every other
185 * allocation attempt fails, we'll end up cancelling
186 * a dirty transaction and shutting down.
187 *
188 * For an exact allocation, alignment must be 1,
189 * however we need to take cluster alignment into account when
190 * fixing up the freelist. Use the minalignslop field to
191 * indicate that extra blocks might be required for alignment,
192 * but not to use them in the actual exact allocation.
173 */ 193 */
194 args.alignment = 1;
195 args.minalignslop = xfs_ialloc_cluster_alignment(&args) - 1;
196
197 /* Allow space for the inode btree to split. */
174 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; 198 args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1;
175 if ((error = xfs_alloc_vextent(&args))) 199 if ((error = xfs_alloc_vextent(&args)))
176 return error; 200 return error;
@@ -191,13 +215,8 @@ xfs_ialloc_ag_alloc(
191 ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); 215 ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN));
192 args.alignment = args.mp->m_dalign; 216 args.alignment = args.mp->m_dalign;
193 isaligned = 1; 217 isaligned = 1;
194 } else if (xfs_sb_version_hasalign(&args.mp->m_sb) && 218 } else
195 args.mp->m_sb.sb_inoalignmt >= 219 args.alignment = xfs_ialloc_cluster_alignment(&args);
196 XFS_B_TO_FSBT(args.mp,
197 XFS_INODE_CLUSTER_SIZE(args.mp)))
198 args.alignment = args.mp->m_sb.sb_inoalignmt;
199 else
200 args.alignment = 1;
201 /* 220 /*
202 * Need to figure out where to allocate the inode blocks. 221 * Need to figure out where to allocate the inode blocks.
203 * Ideally they should be spaced out through the a.g. 222 * Ideally they should be spaced out through the a.g.
@@ -230,12 +249,7 @@ xfs_ialloc_ag_alloc(
230 args.agbno = be32_to_cpu(agi->agi_root); 249 args.agbno = be32_to_cpu(agi->agi_root);
231 args.fsbno = XFS_AGB_TO_FSB(args.mp, 250 args.fsbno = XFS_AGB_TO_FSB(args.mp,
232 be32_to_cpu(agi->agi_seqno), args.agbno); 251 be32_to_cpu(agi->agi_seqno), args.agbno);
233 if (xfs_sb_version_hasalign(&args.mp->m_sb) && 252 args.alignment = xfs_ialloc_cluster_alignment(&args);
234 args.mp->m_sb.sb_inoalignmt >=
235 XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp)))
236 args.alignment = args.mp->m_sb.sb_inoalignmt;
237 else
238 args.alignment = 1;
239 if ((error = xfs_alloc_vextent(&args))) 253 if ((error = xfs_alloc_vextent(&args)))
240 return error; 254 return error;
241 } 255 }
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 8e09b71f4104..e657c5128460 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -78,7 +78,6 @@ xfs_iget_core(
78 xfs_inode_t *ip; 78 xfs_inode_t *ip;
79 xfs_inode_t *iq; 79 xfs_inode_t *iq;
80 int error; 80 int error;
81 xfs_icluster_t *icl, *new_icl = NULL;
82 unsigned long first_index, mask; 81 unsigned long first_index, mask;
83 xfs_perag_t *pag; 82 xfs_perag_t *pag;
84 xfs_agino_t agino; 83 xfs_agino_t agino;
@@ -229,11 +228,9 @@ finish_inode:
229 } 228 }
230 229
231 /* 230 /*
232 * This is a bit messy - we preallocate everything we _might_ 231 * Preload the radix tree so we can insert safely under the
233 * need before we pick up the ici lock. That way we don't have to 232 * write spinlock.
234 * juggle locks and go all the way back to the start.
235 */ 233 */
236 new_icl = kmem_zone_alloc(xfs_icluster_zone, KM_SLEEP);
237 if (radix_tree_preload(GFP_KERNEL)) { 234 if (radix_tree_preload(GFP_KERNEL)) {
238 xfs_idestroy(ip); 235 xfs_idestroy(ip);
239 delay(1); 236 delay(1);
@@ -242,17 +239,6 @@ finish_inode:
242 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); 239 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
243 first_index = agino & mask; 240 first_index = agino & mask;
244 write_lock(&pag->pag_ici_lock); 241 write_lock(&pag->pag_ici_lock);
245
246 /*
247 * Find the cluster if it exists
248 */
249 icl = NULL;
250 if (radix_tree_gang_lookup(&pag->pag_ici_root, (void**)&iq,
251 first_index, 1)) {
252 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) == first_index)
253 icl = iq->i_cluster;
254 }
255
256 /* 242 /*
257 * insert the new inode 243 * insert the new inode
258 */ 244 */
@@ -267,30 +253,13 @@ finish_inode:
267 } 253 }
268 254
269 /* 255 /*
270 * These values _must_ be set before releasing ihlock! 256 * These values _must_ be set before releasing the radix tree lock!
271 */ 257 */
272 ip->i_udquot = ip->i_gdquot = NULL; 258 ip->i_udquot = ip->i_gdquot = NULL;
273 xfs_iflags_set(ip, XFS_INEW); 259 xfs_iflags_set(ip, XFS_INEW);
274 260
275 ASSERT(ip->i_cluster == NULL);
276
277 if (!icl) {
278 spin_lock_init(&new_icl->icl_lock);
279 INIT_HLIST_HEAD(&new_icl->icl_inodes);
280 icl = new_icl;
281 new_icl = NULL;
282 } else {
283 ASSERT(!hlist_empty(&icl->icl_inodes));
284 }
285 spin_lock(&icl->icl_lock);
286 hlist_add_head(&ip->i_cnode, &icl->icl_inodes);
287 ip->i_cluster = icl;
288 spin_unlock(&icl->icl_lock);
289
290 write_unlock(&pag->pag_ici_lock); 261 write_unlock(&pag->pag_ici_lock);
291 radix_tree_preload_end(); 262 radix_tree_preload_end();
292 if (new_icl)
293 kmem_zone_free(xfs_icluster_zone, new_icl);
294 263
295 /* 264 /*
296 * Link ip to its mount and thread it on the mount's inode list. 265 * Link ip to its mount and thread it on the mount's inode list.
@@ -529,18 +498,6 @@ xfs_iextract(
529 xfs_put_perag(mp, pag); 498 xfs_put_perag(mp, pag);
530 499
531 /* 500 /*
532 * Remove from cluster list
533 */
534 mp = ip->i_mount;
535 spin_lock(&ip->i_cluster->icl_lock);
536 hlist_del(&ip->i_cnode);
537 spin_unlock(&ip->i_cluster->icl_lock);
538
539 /* was last inode in cluster? */
540 if (hlist_empty(&ip->i_cluster->icl_inodes))
541 kmem_zone_free(xfs_icluster_zone, ip->i_cluster);
542
543 /*
544 * Remove from mount's inode list. 501 * Remove from mount's inode list.
545 */ 502 */
546 XFS_MOUNT_ILOCK(mp); 503 XFS_MOUNT_ILOCK(mp);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index f43a6e01d68f..ca12acb90394 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -55,7 +55,6 @@
55 55
56kmem_zone_t *xfs_ifork_zone; 56kmem_zone_t *xfs_ifork_zone;
57kmem_zone_t *xfs_inode_zone; 57kmem_zone_t *xfs_inode_zone;
58kmem_zone_t *xfs_icluster_zone;
59 58
60/* 59/*
61 * Used in xfs_itruncate(). This is the maximum number of extents 60 * Used in xfs_itruncate(). This is the maximum number of extents
@@ -126,6 +125,90 @@ xfs_inobp_check(
126#endif 125#endif
127 126
128/* 127/*
128 * Find the buffer associated with the given inode map
129 * We do basic validation checks on the buffer once it has been
130 * retrieved from disk.
131 */
132STATIC int
133xfs_imap_to_bp(
134 xfs_mount_t *mp,
135 xfs_trans_t *tp,
136 xfs_imap_t *imap,
137 xfs_buf_t **bpp,
138 uint buf_flags,
139 uint imap_flags)
140{
141 int error;
142 int i;
143 int ni;
144 xfs_buf_t *bp;
145
146 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
147 (int)imap->im_len, buf_flags, &bp);
148 if (error) {
149 if (error != EAGAIN) {
150 cmn_err(CE_WARN,
151 "xfs_imap_to_bp: xfs_trans_read_buf()returned "
152 "an error %d on %s. Returning error.",
153 error, mp->m_fsname);
154 } else {
155 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
156 }
157 return error;
158 }
159
160 /*
161 * Validate the magic number and version of every inode in the buffer
162 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
163 */
164#ifdef DEBUG
165 ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
166#else /* usual case */
167 ni = 1;
168#endif
169
170 for (i = 0; i < ni; i++) {
171 int di_ok;
172 xfs_dinode_t *dip;
173
174 dip = (xfs_dinode_t *)xfs_buf_offset(bp,
175 (i << mp->m_sb.sb_inodelog));
176 di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
177 XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
178 if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
179 XFS_ERRTAG_ITOBP_INOTOBP,
180 XFS_RANDOM_ITOBP_INOTOBP))) {
181 if (imap_flags & XFS_IMAP_BULKSTAT) {
182 xfs_trans_brelse(tp, bp);
183 return XFS_ERROR(EINVAL);
184 }
185 XFS_CORRUPTION_ERROR("xfs_imap_to_bp",
186 XFS_ERRLEVEL_HIGH, mp, dip);
187#ifdef DEBUG
188 cmn_err(CE_PANIC,
189 "Device %s - bad inode magic/vsn "
190 "daddr %lld #%d (magic=%x)",
191 XFS_BUFTARG_NAME(mp->m_ddev_targp),
192 (unsigned long long)imap->im_blkno, i,
193 be16_to_cpu(dip->di_core.di_magic));
194#endif
195 xfs_trans_brelse(tp, bp);
196 return XFS_ERROR(EFSCORRUPTED);
197 }
198 }
199
200 xfs_inobp_check(mp, bp);
201
202 /*
203 * Mark the buffer as an inode buffer now that it looks good
204 */
205 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
206
207 *bpp = bp;
208 return 0;
209}
210
211/*
129 * This routine is called to map an inode number within a file 212 * This routine is called to map an inode number within a file
130 * system to the buffer containing the on-disk version of the 213 * system to the buffer containing the on-disk version of the
131 * inode. It returns a pointer to the buffer containing the 214 * inode. It returns a pointer to the buffer containing the
@@ -147,72 +230,19 @@ xfs_inotobp(
147 xfs_buf_t **bpp, 230 xfs_buf_t **bpp,
148 int *offset) 231 int *offset)
149{ 232{
150 int di_ok;
151 xfs_imap_t imap; 233 xfs_imap_t imap;
152 xfs_buf_t *bp; 234 xfs_buf_t *bp;
153 int error; 235 int error;
154 xfs_dinode_t *dip;
155 236
156 /*
157 * Call the space management code to find the location of the
158 * inode on disk.
159 */
160 imap.im_blkno = 0; 237 imap.im_blkno = 0;
161 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP); 238 error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
162 if (error != 0) { 239 if (error)
163 cmn_err(CE_WARN,
164 "xfs_inotobp: xfs_imap() returned an "
165 "error %d on %s. Returning error.", error, mp->m_fsname);
166 return error; 240 return error;
167 }
168 241
169 /* 242 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
170 * If the inode number maps to a block outside the bounds of the 243 if (error)
171 * file system then return NULL rather than calling read_buf
172 * and panicing when we get an error from the driver.
173 */
174 if ((imap.im_blkno + imap.im_len) >
175 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
176 cmn_err(CE_WARN,
177 "xfs_inotobp: inode number (%llu + %d) maps to a block outside the bounds "
178 "of the file system %s. Returning EINVAL.",
179 (unsigned long long)imap.im_blkno,
180 imap.im_len, mp->m_fsname);
181 return XFS_ERROR(EINVAL);
182 }
183
184 /*
185 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will
186 * default to just a read_buf() call.
187 */
188 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
189 (int)imap.im_len, XFS_BUF_LOCK, &bp);
190
191 if (error) {
192 cmn_err(CE_WARN,
193 "xfs_inotobp: xfs_trans_read_buf() returned an "
194 "error %d on %s. Returning error.", error, mp->m_fsname);
195 return error; 244 return error;
196 }
197 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 0);
198 di_ok =
199 be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
200 XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
201 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, XFS_ERRTAG_ITOBP_INOTOBP,
202 XFS_RANDOM_ITOBP_INOTOBP))) {
203 XFS_CORRUPTION_ERROR("xfs_inotobp", XFS_ERRLEVEL_LOW, mp, dip);
204 xfs_trans_brelse(tp, bp);
205 cmn_err(CE_WARN,
206 "xfs_inotobp: XFS_TEST_ERROR() returned an "
207 "error on %s. Returning EFSCORRUPTED.", mp->m_fsname);
208 return XFS_ERROR(EFSCORRUPTED);
209 }
210 245
211 xfs_inobp_check(mp, bp);
212
213 /*
214 * Set *dipp to point to the on-disk inode in the buffer.
215 */
216 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 246 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
217 *bpp = bp; 247 *bpp = bp;
218 *offset = imap.im_boffset; 248 *offset = imap.im_boffset;
@@ -248,46 +278,21 @@ xfs_itobp(
248 xfs_dinode_t **dipp, 278 xfs_dinode_t **dipp,
249 xfs_buf_t **bpp, 279 xfs_buf_t **bpp,
250 xfs_daddr_t bno, 280 xfs_daddr_t bno,
251 uint imap_flags) 281 uint imap_flags,
282 uint buf_flags)
252{ 283{
253 xfs_imap_t imap; 284 xfs_imap_t imap;
254 xfs_buf_t *bp; 285 xfs_buf_t *bp;
255 int error; 286 int error;
256 int i;
257 int ni;
258 287
259 if (ip->i_blkno == (xfs_daddr_t)0) { 288 if (ip->i_blkno == (xfs_daddr_t)0) {
260 /*
261 * Call the space management code to find the location of the
262 * inode on disk.
263 */
264 imap.im_blkno = bno; 289 imap.im_blkno = bno;
265 if ((error = xfs_imap(mp, tp, ip->i_ino, &imap, 290 error = xfs_imap(mp, tp, ip->i_ino, &imap,
266 XFS_IMAP_LOOKUP | imap_flags))) 291 XFS_IMAP_LOOKUP | imap_flags);
292 if (error)
267 return error; 293 return error;
268 294
269 /* 295 /*
270 * If the inode number maps to a block outside the bounds
271 * of the file system then return NULL rather than calling
272 * read_buf and panicing when we get an error from the
273 * driver.
274 */
275 if ((imap.im_blkno + imap.im_len) >
276 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
277#ifdef DEBUG
278 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
279 "(imap.im_blkno (0x%llx) "
280 "+ imap.im_len (0x%llx)) > "
281 " XFS_FSB_TO_BB(mp, "
282 "mp->m_sb.sb_dblocks) (0x%llx)",
283 (unsigned long long) imap.im_blkno,
284 (unsigned long long) imap.im_len,
285 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
286#endif /* DEBUG */
287 return XFS_ERROR(EINVAL);
288 }
289
290 /*
291 * Fill in the fields in the inode that will be used to 296 * Fill in the fields in the inode that will be used to
292 * map the inode to its buffer from now on. 297 * map the inode to its buffer from now on.
293 */ 298 */
@@ -305,76 +310,17 @@ xfs_itobp(
305 } 310 }
306 ASSERT(bno == 0 || bno == imap.im_blkno); 311 ASSERT(bno == 0 || bno == imap.im_blkno);
307 312
308 /* 313 error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
309 * Read in the buffer. If tp is NULL, xfs_trans_read_buf() will 314 if (error)
310 * default to just a read_buf() call.
311 */
312 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap.im_blkno,
313 (int)imap.im_len, XFS_BUF_LOCK, &bp);
314 if (error) {
315#ifdef DEBUG
316 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_itobp: "
317 "xfs_trans_read_buf() returned error %d, "
318 "imap.im_blkno 0x%llx, imap.im_len 0x%llx",
319 error, (unsigned long long) imap.im_blkno,
320 (unsigned long long) imap.im_len);
321#endif /* DEBUG */
322 return error; 315 return error;
323 }
324
325 /*
326 * Validate the magic number and version of every inode in the buffer
327 * (if DEBUG kernel) or the first inode in the buffer, otherwise.
328 * No validation is done here in userspace (xfs_repair).
329 */
330#if !defined(__KERNEL__)
331 ni = 0;
332#elif defined(DEBUG)
333 ni = BBTOB(imap.im_len) >> mp->m_sb.sb_inodelog;
334#else /* usual case */
335 ni = 1;
336#endif
337
338 for (i = 0; i < ni; i++) {
339 int di_ok;
340 xfs_dinode_t *dip;
341 316
342 dip = (xfs_dinode_t *)xfs_buf_offset(bp, 317 if (!bp) {
343 (i << mp->m_sb.sb_inodelog)); 318 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
344 di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC && 319 ASSERT(tp == NULL);
345 XFS_DINODE_GOOD_VERSION(dip->di_core.di_version); 320 *bpp = NULL;
346 if (unlikely(XFS_TEST_ERROR(!di_ok, mp, 321 return EAGAIN;
347 XFS_ERRTAG_ITOBP_INOTOBP,
348 XFS_RANDOM_ITOBP_INOTOBP))) {
349 if (imap_flags & XFS_IMAP_BULKSTAT) {
350 xfs_trans_brelse(tp, bp);
351 return XFS_ERROR(EINVAL);
352 }
353#ifdef DEBUG
354 cmn_err(CE_ALERT,
355 "Device %s - bad inode magic/vsn "
356 "daddr %lld #%d (magic=%x)",
357 XFS_BUFTARG_NAME(mp->m_ddev_targp),
358 (unsigned long long)imap.im_blkno, i,
359 be16_to_cpu(dip->di_core.di_magic));
360#endif
361 XFS_CORRUPTION_ERROR("xfs_itobp", XFS_ERRLEVEL_HIGH,
362 mp, dip);
363 xfs_trans_brelse(tp, bp);
364 return XFS_ERROR(EFSCORRUPTED);
365 }
366 } 322 }
367 323
368 xfs_inobp_check(mp, bp);
369
370 /*
371 * Mark the buffer as an inode buffer now that it looks good
372 */
373 XFS_BUF_SET_VTYPE(bp, B_FS_INO);
374
375 /*
376 * Set *dipp to point to the on-disk inode in the buffer.
377 */
378 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 324 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
379 *bpp = bp; 325 *bpp = bp;
380 return 0; 326 return 0;
@@ -878,7 +824,7 @@ xfs_iread(
878 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will 824 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
879 * know that this is a new incore inode. 825 * know that this is a new incore inode.
880 */ 826 */
881 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); 827 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
882 if (error) { 828 if (error) {
883 kmem_zone_free(xfs_inode_zone, ip); 829 kmem_zone_free(xfs_inode_zone, ip);
884 return error; 830 return error;
@@ -1518,51 +1464,50 @@ xfs_itruncate_start(
1518} 1464}
1519 1465
1520/* 1466/*
1521 * Shrink the file to the given new_size. The new 1467 * Shrink the file to the given new_size. The new size must be smaller than
1522 * size must be smaller than the current size. 1468 * the current size. This will free up the underlying blocks in the removed
1523 * This will free up the underlying blocks 1469 * range after a call to xfs_itruncate_start() or xfs_atruncate_start().
1524 * in the removed range after a call to xfs_itruncate_start()
1525 * or xfs_atruncate_start().
1526 * 1470 *
1527 * The transaction passed to this routine must have made 1471 * The transaction passed to this routine must have made a permanent log
1528 * a permanent log reservation of at least XFS_ITRUNCATE_LOG_RES. 1472 * reservation of at least XFS_ITRUNCATE_LOG_RES. This routine may commit the
1529 * This routine may commit the given transaction and 1473 * given transaction and start new ones, so make sure everything involved in
1530 * start new ones, so make sure everything involved in 1474 * the transaction is tidy before calling here. Some transaction will be
1531 * the transaction is tidy before calling here. 1475 * returned to the caller to be committed. The incoming transaction must
1532 * Some transaction will be returned to the caller to be 1476 * already include the inode, and both inode locks must be held exclusively.
1533 * committed. The incoming transaction must already include 1477 * The inode must also be "held" within the transaction. On return the inode
1534 * the inode, and both inode locks must be held exclusively. 1478 * will be "held" within the returned transaction. This routine does NOT
1535 * The inode must also be "held" within the transaction. On 1479 * require any disk space to be reserved for it within the transaction.
1536 * return the inode will be "held" within the returned transaction.
1537 * This routine does NOT require any disk space to be reserved
1538 * for it within the transaction.
1539 * 1480 *
1540 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, 1481 * The fork parameter must be either xfs_attr_fork or xfs_data_fork, and it
1541 * and it indicates the fork which is to be truncated. For the 1482 * indicates the fork which is to be truncated. For the attribute fork we only
1542 * attribute fork we only support truncation to size 0. 1483 * support truncation to size 0.
1543 * 1484 *
1544 * We use the sync parameter to indicate whether or not the first 1485 * We use the sync parameter to indicate whether or not the first transaction
1545 * transaction we perform might have to be synchronous. For the attr fork, 1486 * we perform might have to be synchronous. For the attr fork, it needs to be
1546 * it needs to be so if the unlink of the inode is not yet known to be 1487 * so if the unlink of the inode is not yet known to be permanent in the log.
1547 * permanent in the log. This keeps us from freeing and reusing the 1488 * This keeps us from freeing and reusing the blocks of the attribute fork
1548 * blocks of the attribute fork before the unlink of the inode becomes 1489 * before the unlink of the inode becomes permanent.
1549 * permanent.
1550 * 1490 *
1551 * For the data fork, we normally have to run synchronously if we're 1491 * For the data fork, we normally have to run synchronously if we're being
1552 * being called out of the inactive path or we're being called 1492 * called out of the inactive path or we're being called out of the create path
1553 * out of the create path where we're truncating an existing file. 1493 * where we're truncating an existing file. Either way, the truncate needs to
1554 * Either way, the truncate needs to be sync so blocks don't reappear 1494 * be sync so blocks don't reappear in the file with altered data in case of a
1555 * in the file with altered data in case of a crash. wsync filesystems 1495 * crash. wsync filesystems can run the first case async because anything that
1556 * can run the first case async because anything that shrinks the inode 1496 * shrinks the inode has to run sync so by the time we're called here from
1557 * has to run sync so by the time we're called here from inactive, the 1497 * inactive, the inode size is permanently set to 0.
1558 * inode size is permanently set to 0.
1559 * 1498 *
1560 * Calls from the truncate path always need to be sync unless we're 1499 * Calls from the truncate path always need to be sync unless we're in a wsync
1561 * in a wsync filesystem and the file has already been unlinked. 1500 * filesystem and the file has already been unlinked.
1562 * 1501 *
1563 * The caller is responsible for correctly setting the sync parameter. 1502 * The caller is responsible for correctly setting the sync parameter. It gets
1564 * It gets too hard for us to guess here which path we're being called 1503 * too hard for us to guess here which path we're being called out of just
1565 * out of just based on inode state. 1504 * based on inode state.
1505 *
1506 * If we get an error, we must return with the inode locked and linked into the
1507 * current transaction. This keeps things simple for the higher level code,
1508 * because it always knows that the inode is locked and held in the transaction
1509 * that returns to it whether errors occur or not. We don't mark the inode
1510 * dirty on error so that transactions can be easily aborted if possible.
1566 */ 1511 */
1567int 1512int
1568xfs_itruncate_finish( 1513xfs_itruncate_finish(
@@ -1741,65 +1686,51 @@ xfs_itruncate_finish(
1741 */ 1686 */
1742 error = xfs_bmap_finish(tp, &free_list, &committed); 1687 error = xfs_bmap_finish(tp, &free_list, &committed);
1743 ntp = *tp; 1688 ntp = *tp;
1689 if (committed) {
1690 /* link the inode into the next xact in the chain */
1691 xfs_trans_ijoin(ntp, ip,
1692 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1693 xfs_trans_ihold(ntp, ip);
1694 }
1695
1744 if (error) { 1696 if (error) {
1745 /* 1697 /*
1746 * If the bmap finish call encounters an error, 1698 * If the bmap finish call encounters an error, return
1747 * return to the caller where the transaction 1699 * to the caller where the transaction can be properly
1748 * can be properly aborted. We just need to 1700 * aborted. We just need to make sure we're not
1749 * make sure we're not holding any resources 1701 * holding any resources that we were not when we came
1750 * that we were not when we came in. 1702 * in.
1751 * 1703 *
1752 * Aborting from this point might lose some 1704 * Aborting from this point might lose some blocks in
1753 * blocks in the file system, but oh well. 1705 * the file system, but oh well.
1754 */ 1706 */
1755 xfs_bmap_cancel(&free_list); 1707 xfs_bmap_cancel(&free_list);
1756 if (committed) {
1757 /*
1758 * If the passed in transaction committed
1759 * in xfs_bmap_finish(), then we want to
1760 * add the inode to this one before returning.
1761 * This keeps things simple for the higher
1762 * level code, because it always knows that
1763 * the inode is locked and held in the
1764 * transaction that returns to it whether
1765 * errors occur or not. We don't mark the
1766 * inode dirty so that this transaction can
1767 * be easily aborted if possible.
1768 */
1769 xfs_trans_ijoin(ntp, ip,
1770 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1771 xfs_trans_ihold(ntp, ip);
1772 }
1773 return error; 1708 return error;
1774 } 1709 }
1775 1710
1776 if (committed) { 1711 if (committed) {
1777 /* 1712 /*
1778 * The first xact was committed, 1713 * Mark the inode dirty so it will be logged and
1779 * so add the inode to the new one. 1714 * moved forward in the log as part of every commit.
1780 * Mark it dirty so it will be logged
1781 * and moved forward in the log as
1782 * part of every commit.
1783 */ 1715 */
1784 xfs_trans_ijoin(ntp, ip,
1785 XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1786 xfs_trans_ihold(ntp, ip);
1787 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE); 1716 xfs_trans_log_inode(ntp, ip, XFS_ILOG_CORE);
1788 } 1717 }
1718
1789 ntp = xfs_trans_dup(ntp); 1719 ntp = xfs_trans_dup(ntp);
1790 (void) xfs_trans_commit(*tp, 0); 1720 error = xfs_trans_commit(*tp, 0);
1791 *tp = ntp; 1721 *tp = ntp;
1792 error = xfs_trans_reserve(ntp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 1722
1793 XFS_TRANS_PERM_LOG_RES, 1723 /* link the inode into the next transaction in the chain */
1794 XFS_ITRUNCATE_LOG_COUNT);
1795 /*
1796 * Add the inode being truncated to the next chained
1797 * transaction.
1798 */
1799 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); 1724 xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
1800 xfs_trans_ihold(ntp, ip); 1725 xfs_trans_ihold(ntp, ip);
1726
1727 if (!error)
1728 error = xfs_trans_reserve(ntp, 0,
1729 XFS_ITRUNCATE_LOG_RES(mp), 0,
1730 XFS_TRANS_PERM_LOG_RES,
1731 XFS_ITRUNCATE_LOG_COUNT);
1801 if (error) 1732 if (error)
1802 return (error); 1733 return error;
1803 } 1734 }
1804 /* 1735 /*
1805 * Only update the size in the case of the data fork, but 1736 * Only update the size in the case of the data fork, but
@@ -1967,7 +1898,7 @@ xfs_iunlink(
1967 * Here we put the head pointer into our next pointer, 1898 * Here we put the head pointer into our next pointer,
1968 * and then we fall through to point the head at us. 1899 * and then we fall through to point the head at us.
1969 */ 1900 */
1970 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 1901 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
1971 if (error) 1902 if (error)
1972 return error; 1903 return error;
1973 1904
@@ -2075,7 +2006,7 @@ xfs_iunlink_remove(
2075 * of dealing with the buffer when there is no need to 2006 * of dealing with the buffer when there is no need to
2076 * change it. 2007 * change it.
2077 */ 2008 */
2078 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2009 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2079 if (error) { 2010 if (error) {
2080 cmn_err(CE_WARN, 2011 cmn_err(CE_WARN,
2081 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2012 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2137,7 +2068,7 @@ xfs_iunlink_remove(
2137 * Now last_ibp points to the buffer previous to us on 2068 * Now last_ibp points to the buffer previous to us on
2138 * the unlinked list. Pull us from the list. 2069 * the unlinked list. Pull us from the list.
2139 */ 2070 */
2140 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2071 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2141 if (error) { 2072 if (error) {
2142 cmn_err(CE_WARN, 2073 cmn_err(CE_WARN,
2143 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2074 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2172,13 +2103,6 @@ xfs_iunlink_remove(
2172 return 0; 2103 return 0;
2173} 2104}
2174 2105
2175STATIC_INLINE int xfs_inode_clean(xfs_inode_t *ip)
2176{
2177 return (((ip->i_itemp == NULL) ||
2178 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
2179 (ip->i_update_core == 0));
2180}
2181
2182STATIC void 2106STATIC void
2183xfs_ifree_cluster( 2107xfs_ifree_cluster(
2184 xfs_inode_t *free_ip, 2108 xfs_inode_t *free_ip,
@@ -2400,7 +2324,7 @@ xfs_ifree(
2400 2324
2401 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2325 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2402 2326
2403 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); 2327 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2404 if (error) 2328 if (error)
2405 return error; 2329 return error;
2406 2330
@@ -2678,14 +2602,31 @@ xfs_imap(
2678 fsbno = imap->im_blkno ? 2602 fsbno = imap->im_blkno ?
2679 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK; 2603 XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
2680 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags); 2604 error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
2681 if (error != 0) { 2605 if (error)
2682 return error; 2606 return error;
2683 } 2607
2684 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno); 2608 imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
2685 imap->im_len = XFS_FSB_TO_BB(mp, len); 2609 imap->im_len = XFS_FSB_TO_BB(mp, len);
2686 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno); 2610 imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
2687 imap->im_ioffset = (ushort)off; 2611 imap->im_ioffset = (ushort)off;
2688 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog); 2612 imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
2613
2614 /*
2615 * If the inode number maps to a block outside the bounds
2616 * of the file system then return NULL rather than calling
2617 * read_buf and panicing when we get an error from the
2618 * driver.
2619 */
2620 if ((imap->im_blkno + imap->im_len) >
2621 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
2622 xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
2623 "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
2624 " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
2625 (unsigned long long) imap->im_blkno,
2626 (unsigned long long) imap->im_len,
2627 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
2628 return EINVAL;
2629 }
2689 return 0; 2630 return 0;
2690} 2631}
2691 2632
@@ -2826,38 +2767,41 @@ xfs_iunpin(
2826} 2767}
2827 2768
2828/* 2769/*
2829 * This is called to wait for the given inode to be unpinned. 2770 * This is called to unpin an inode. It can be directed to wait or to return
2830 * It will sleep until this happens. The caller must have the 2771 * immediately without waiting for the inode to be unpinned. The caller must
2831 * inode locked in at least shared mode so that the buffer cannot 2772 * have the inode locked in at least shared mode so that the buffer cannot be
2832 * be subsequently pinned once someone is waiting for it to be 2773 * subsequently pinned once someone is waiting for it to be unpinned.
2833 * unpinned.
2834 */ 2774 */
2835STATIC void 2775STATIC void
2836xfs_iunpin_wait( 2776__xfs_iunpin_wait(
2837 xfs_inode_t *ip) 2777 xfs_inode_t *ip,
2778 int wait)
2838{ 2779{
2839 xfs_inode_log_item_t *iip; 2780 xfs_inode_log_item_t *iip = ip->i_itemp;
2840 xfs_lsn_t lsn;
2841 2781
2842 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); 2782 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2843 2783 if (atomic_read(&ip->i_pincount) == 0)
2844 if (atomic_read(&ip->i_pincount) == 0) {
2845 return; 2784 return;
2846 }
2847 2785
2848 iip = ip->i_itemp; 2786 /* Give the log a push to start the unpinning I/O */
2849 if (iip && iip->ili_last_lsn) { 2787 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
2850 lsn = iip->ili_last_lsn; 2788 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2851 } else { 2789 if (wait)
2852 lsn = (xfs_lsn_t)0; 2790 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2853 } 2791}
2854 2792
2855 /* 2793static inline void
2856 * Give the log a push so we don't wait here too long. 2794xfs_iunpin_wait(
2857 */ 2795 xfs_inode_t *ip)
2858 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); 2796{
2797 __xfs_iunpin_wait(ip, 1);
2798}
2859 2799
2860 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2800static inline void
2801xfs_iunpin_nowait(
2802 xfs_inode_t *ip)
2803{
2804 __xfs_iunpin_wait(ip, 0);
2861} 2805}
2862 2806
2863 2807
@@ -2932,7 +2876,7 @@ xfs_iextents_copy(
2932 * format indicates the current state of the fork. 2876 * format indicates the current state of the fork.
2933 */ 2877 */
2934/*ARGSUSED*/ 2878/*ARGSUSED*/
2935STATIC int 2879STATIC void
2936xfs_iflush_fork( 2880xfs_iflush_fork(
2937 xfs_inode_t *ip, 2881 xfs_inode_t *ip,
2938 xfs_dinode_t *dip, 2882 xfs_dinode_t *dip,
@@ -2953,16 +2897,16 @@ xfs_iflush_fork(
2953 static const short extflag[2] = 2897 static const short extflag[2] =
2954 { XFS_ILOG_DEXT, XFS_ILOG_AEXT }; 2898 { XFS_ILOG_DEXT, XFS_ILOG_AEXT };
2955 2899
2956 if (iip == NULL) 2900 if (!iip)
2957 return 0; 2901 return;
2958 ifp = XFS_IFORK_PTR(ip, whichfork); 2902 ifp = XFS_IFORK_PTR(ip, whichfork);
2959 /* 2903 /*
2960 * This can happen if we gave up in iformat in an error path, 2904 * This can happen if we gave up in iformat in an error path,
2961 * for the attribute fork. 2905 * for the attribute fork.
2962 */ 2906 */
2963 if (ifp == NULL) { 2907 if (!ifp) {
2964 ASSERT(whichfork == XFS_ATTR_FORK); 2908 ASSERT(whichfork == XFS_ATTR_FORK);
2965 return 0; 2909 return;
2966 } 2910 }
2967 cp = XFS_DFORK_PTR(dip, whichfork); 2911 cp = XFS_DFORK_PTR(dip, whichfork);
2968 mp = ip->i_mount; 2912 mp = ip->i_mount;
@@ -3023,8 +2967,145 @@ xfs_iflush_fork(
3023 ASSERT(0); 2967 ASSERT(0);
3024 break; 2968 break;
3025 } 2969 }
2970}
2971
2972STATIC int
2973xfs_iflush_cluster(
2974 xfs_inode_t *ip,
2975 xfs_buf_t *bp)
2976{
2977 xfs_mount_t *mp = ip->i_mount;
2978 xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
2979 unsigned long first_index, mask;
2980 int ilist_size;
2981 xfs_inode_t **ilist;
2982 xfs_inode_t *iq;
2983 int nr_found;
2984 int clcount = 0;
2985 int bufwasdelwri;
2986 int i;
2987
2988 ASSERT(pag->pagi_inodeok);
2989 ASSERT(pag->pag_ici_init);
2990
2991 ilist_size = XFS_INODE_CLUSTER_SIZE(mp) * sizeof(xfs_inode_t *);
2992 ilist = kmem_alloc(ilist_size, KM_MAYFAIL);
2993 if (!ilist)
2994 return 0;
2995
2996 mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
2997 first_index = XFS_INO_TO_AGINO(mp, ip->i_ino) & mask;
2998 read_lock(&pag->pag_ici_lock);
2999 /* really need a gang lookup range call here */
3000 nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void**)ilist,
3001 first_index,
3002 XFS_INODE_CLUSTER_SIZE(mp));
3003 if (nr_found == 0)
3004 goto out_free;
3005
3006 for (i = 0; i < nr_found; i++) {
3007 iq = ilist[i];
3008 if (iq == ip)
3009 continue;
3010 /* if the inode lies outside this cluster, we're done. */
3011 if ((XFS_INO_TO_AGINO(mp, iq->i_ino) & mask) != first_index)
3012 break;
3013 /*
3014 * Do an un-protected check to see if the inode is dirty and
3015 * is a candidate for flushing. These checks will be repeated
3016 * later after the appropriate locks are acquired.
3017 */
3018 if (xfs_inode_clean(iq) && xfs_ipincount(iq) == 0)
3019 continue;
3020
3021 /*
3022 * Try to get locks. If any are unavailable or it is pinned,
3023 * then this inode cannot be flushed and is skipped.
3024 */
3025
3026 if (!xfs_ilock_nowait(iq, XFS_ILOCK_SHARED))
3027 continue;
3028 if (!xfs_iflock_nowait(iq)) {
3029 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3030 continue;
3031 }
3032 if (xfs_ipincount(iq)) {
3033 xfs_ifunlock(iq);
3034 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3035 continue;
3036 }
3037
3038 /*
3039 * arriving here means that this inode can be flushed. First
3040 * re-check that it's dirty before flushing.
3041 */
3042 if (!xfs_inode_clean(iq)) {
3043 int error;
3044 error = xfs_iflush_int(iq, bp);
3045 if (error) {
3046 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3047 goto cluster_corrupt_out;
3048 }
3049 clcount++;
3050 } else {
3051 xfs_ifunlock(iq);
3052 }
3053 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3054 }
3055
3056 if (clcount) {
3057 XFS_STATS_INC(xs_icluster_flushcnt);
3058 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3059 }
3026 3060
3061out_free:
3062 read_unlock(&pag->pag_ici_lock);
3063 kmem_free(ilist, ilist_size);
3027 return 0; 3064 return 0;
3065
3066
3067cluster_corrupt_out:
3068 /*
3069 * Corruption detected in the clustering loop. Invalidate the
3070 * inode buffer and shut down the filesystem.
3071 */
3072 read_unlock(&pag->pag_ici_lock);
3073 /*
3074 * Clean up the buffer. If it was B_DELWRI, just release it --
3075 * brelse can handle it with no problems. If not, shut down the
3076 * filesystem before releasing the buffer.
3077 */
3078 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
3079 if (bufwasdelwri)
3080 xfs_buf_relse(bp);
3081
3082 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3083
3084 if (!bufwasdelwri) {
3085 /*
3086 * Just like incore_relse: if we have b_iodone functions,
3087 * mark the buffer as an error and call them. Otherwise
3088 * mark it as stale and brelse.
3089 */
3090 if (XFS_BUF_IODONE_FUNC(bp)) {
3091 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3092 XFS_BUF_UNDONE(bp);
3093 XFS_BUF_STALE(bp);
3094 XFS_BUF_SHUT(bp);
3095 XFS_BUF_ERROR(bp,EIO);
3096 xfs_biodone(bp);
3097 } else {
3098 XFS_BUF_STALE(bp);
3099 xfs_buf_relse(bp);
3100 }
3101 }
3102
3103 /*
3104 * Unlocks the flush lock
3105 */
3106 xfs_iflush_abort(iq);
3107 kmem_free(ilist, ilist_size);
3108 return XFS_ERROR(EFSCORRUPTED);
3028} 3109}
3029 3110
3030/* 3111/*
@@ -3046,11 +3127,7 @@ xfs_iflush(
3046 xfs_dinode_t *dip; 3127 xfs_dinode_t *dip;
3047 xfs_mount_t *mp; 3128 xfs_mount_t *mp;
3048 int error; 3129 int error;
3049 /* REFERENCED */ 3130 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
3050 xfs_inode_t *iq;
3051 int clcount; /* count of inodes clustered */
3052 int bufwasdelwri;
3053 struct hlist_node *entry;
3054 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; 3131 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3055 3132
3056 XFS_STATS_INC(xs_iflush_count); 3133 XFS_STATS_INC(xs_iflush_count);
@@ -3067,8 +3144,7 @@ xfs_iflush(
3067 * If the inode isn't dirty, then just release the inode 3144 * If the inode isn't dirty, then just release the inode
3068 * flush lock and do nothing. 3145 * flush lock and do nothing.
3069 */ 3146 */
3070 if ((ip->i_update_core == 0) && 3147 if (xfs_inode_clean(ip)) {
3071 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3072 ASSERT((iip != NULL) ? 3148 ASSERT((iip != NULL) ?
3073 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1); 3149 !(iip->ili_item.li_flags & XFS_LI_IN_AIL) : 1);
3074 xfs_ifunlock(ip); 3150 xfs_ifunlock(ip);
@@ -3076,11 +3152,21 @@ xfs_iflush(
3076 } 3152 }
3077 3153
3078 /* 3154 /*
3079 * We can't flush the inode until it is unpinned, so 3155 * We can't flush the inode until it is unpinned, so wait for it if we
3080 * wait for it. We know noone new can pin it, because 3156 * are allowed to block. We know noone new can pin it, because we are
3081 * we are holding the inode lock shared and you need 3157 * holding the inode lock shared and you need to hold it exclusively to
3082 * to hold it exclusively to pin the inode. 3158 * pin the inode.
3159 *
3160 * If we are not allowed to block, force the log out asynchronously so
3161 * that when we come back the inode will be unpinned. If other inodes
3162 * in the same cluster are dirty, they will probably write the inode
3163 * out for us if they occur after the log force completes.
3083 */ 3164 */
3165 if (noblock && xfs_ipincount(ip)) {
3166 xfs_iunpin_nowait(ip);
3167 xfs_ifunlock(ip);
3168 return EAGAIN;
3169 }
3084 xfs_iunpin_wait(ip); 3170 xfs_iunpin_wait(ip);
3085 3171
3086 /* 3172 /*
@@ -3097,15 +3183,6 @@ xfs_iflush(
3097 } 3183 }
3098 3184
3099 /* 3185 /*
3100 * Get the buffer containing the on-disk inode.
3101 */
3102 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
3103 if (error) {
3104 xfs_ifunlock(ip);
3105 return error;
3106 }
3107
3108 /*
3109 * Decide how buffer will be flushed out. This is done before 3186 * Decide how buffer will be flushed out. This is done before
3110 * the call to xfs_iflush_int because this field is zeroed by it. 3187 * the call to xfs_iflush_int because this field is zeroed by it.
3111 */ 3188 */
@@ -3121,6 +3198,7 @@ xfs_iflush(
3121 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3198 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3122 flags = 0; 3199 flags = 0;
3123 break; 3200 break;
3201 case XFS_IFLUSH_ASYNC_NOBLOCK:
3124 case XFS_IFLUSH_ASYNC: 3202 case XFS_IFLUSH_ASYNC:
3125 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3203 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3126 flags = INT_ASYNC; 3204 flags = INT_ASYNC;
@@ -3140,6 +3218,7 @@ xfs_iflush(
3140 case XFS_IFLUSH_DELWRI: 3218 case XFS_IFLUSH_DELWRI:
3141 flags = INT_DELWRI; 3219 flags = INT_DELWRI;
3142 break; 3220 break;
3221 case XFS_IFLUSH_ASYNC_NOBLOCK:
3143 case XFS_IFLUSH_ASYNC: 3222 case XFS_IFLUSH_ASYNC:
3144 flags = INT_ASYNC; 3223 flags = INT_ASYNC;
3145 break; 3224 break;
@@ -3154,94 +3233,41 @@ xfs_iflush(
3154 } 3233 }
3155 3234
3156 /* 3235 /*
3157 * First flush out the inode that xfs_iflush was called with. 3236 * Get the buffer containing the on-disk inode.
3158 */ 3237 */
3159 error = xfs_iflush_int(ip, bp); 3238 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
3160 if (error) { 3239 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3161 goto corrupt_out; 3240 if (error || !bp) {
3241 xfs_ifunlock(ip);
3242 return error;
3162 } 3243 }
3163 3244
3164 /* 3245 /*
3165 * inode clustering: 3246 * First flush out the inode that xfs_iflush was called with.
3166 * see if other inodes can be gathered into this write
3167 */ 3247 */
3168 spin_lock(&ip->i_cluster->icl_lock); 3248 error = xfs_iflush_int(ip, bp);
3169 ip->i_cluster->icl_buf = bp; 3249 if (error)
3170 3250 goto corrupt_out;
3171 clcount = 0;
3172 hlist_for_each_entry(iq, entry, &ip->i_cluster->icl_inodes, i_cnode) {
3173 if (iq == ip)
3174 continue;
3175
3176 /*
3177 * Do an un-protected check to see if the inode is dirty and
3178 * is a candidate for flushing. These checks will be repeated
3179 * later after the appropriate locks are acquired.
3180 */
3181 iip = iq->i_itemp;
3182 if ((iq->i_update_core == 0) &&
3183 ((iip == NULL) ||
3184 !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
3185 xfs_ipincount(iq) == 0) {
3186 continue;
3187 }
3188
3189 /*
3190 * Try to get locks. If any are unavailable,
3191 * then this inode cannot be flushed and is skipped.
3192 */
3193
3194 /* get inode locks (just i_lock) */
3195 if (xfs_ilock_nowait(iq, XFS_ILOCK_SHARED)) {
3196 /* get inode flush lock */
3197 if (xfs_iflock_nowait(iq)) {
3198 /* check if pinned */
3199 if (xfs_ipincount(iq) == 0) {
3200 /* arriving here means that
3201 * this inode can be flushed.
3202 * first re-check that it's
3203 * dirty
3204 */
3205 iip = iq->i_itemp;
3206 if ((iq->i_update_core != 0)||
3207 ((iip != NULL) &&
3208 (iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3209 clcount++;
3210 error = xfs_iflush_int(iq, bp);
3211 if (error) {
3212 xfs_iunlock(iq,
3213 XFS_ILOCK_SHARED);
3214 goto cluster_corrupt_out;
3215 }
3216 } else {
3217 xfs_ifunlock(iq);
3218 }
3219 } else {
3220 xfs_ifunlock(iq);
3221 }
3222 }
3223 xfs_iunlock(iq, XFS_ILOCK_SHARED);
3224 }
3225 }
3226 spin_unlock(&ip->i_cluster->icl_lock);
3227
3228 if (clcount) {
3229 XFS_STATS_INC(xs_icluster_flushcnt);
3230 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3231 }
3232 3251
3233 /* 3252 /*
3234 * If the buffer is pinned then push on the log so we won't 3253 * If the buffer is pinned then push on the log now so we won't
3235 * get stuck waiting in the write for too long. 3254 * get stuck waiting in the write for too long.
3236 */ 3255 */
3237 if (XFS_BUF_ISPINNED(bp)){ 3256 if (XFS_BUF_ISPINNED(bp))
3238 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); 3257 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3239 } 3258
3259 /*
3260 * inode clustering:
3261 * see if other inodes can be gathered into this write
3262 */
3263 error = xfs_iflush_cluster(ip, bp);
3264 if (error)
3265 goto cluster_corrupt_out;
3240 3266
3241 if (flags & INT_DELWRI) { 3267 if (flags & INT_DELWRI) {
3242 xfs_bdwrite(mp, bp); 3268 xfs_bdwrite(mp, bp);
3243 } else if (flags & INT_ASYNC) { 3269 } else if (flags & INT_ASYNC) {
3244 xfs_bawrite(mp, bp); 3270 error = xfs_bawrite(mp, bp);
3245 } else { 3271 } else {
3246 error = xfs_bwrite(mp, bp); 3272 error = xfs_bwrite(mp, bp);
3247 } 3273 }
@@ -3250,52 +3276,11 @@ xfs_iflush(
3250corrupt_out: 3276corrupt_out:
3251 xfs_buf_relse(bp); 3277 xfs_buf_relse(bp);
3252 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 3278 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3253 xfs_iflush_abort(ip);
3254 /*
3255 * Unlocks the flush lock
3256 */
3257 return XFS_ERROR(EFSCORRUPTED);
3258
3259cluster_corrupt_out: 3279cluster_corrupt_out:
3260 /* Corruption detected in the clustering loop. Invalidate the
3261 * inode buffer and shut down the filesystem.
3262 */
3263 spin_unlock(&ip->i_cluster->icl_lock);
3264
3265 /*
3266 * Clean up the buffer. If it was B_DELWRI, just release it --
3267 * brelse can handle it with no problems. If not, shut down the
3268 * filesystem before releasing the buffer.
3269 */
3270 if ((bufwasdelwri= XFS_BUF_ISDELAYWRITE(bp))) {
3271 xfs_buf_relse(bp);
3272 }
3273
3274 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
3275
3276 if(!bufwasdelwri) {
3277 /*
3278 * Just like incore_relse: if we have b_iodone functions,
3279 * mark the buffer as an error and call them. Otherwise
3280 * mark it as stale and brelse.
3281 */
3282 if (XFS_BUF_IODONE_FUNC(bp)) {
3283 XFS_BUF_CLR_BDSTRAT_FUNC(bp);
3284 XFS_BUF_UNDONE(bp);
3285 XFS_BUF_STALE(bp);
3286 XFS_BUF_SHUT(bp);
3287 XFS_BUF_ERROR(bp,EIO);
3288 xfs_biodone(bp);
3289 } else {
3290 XFS_BUF_STALE(bp);
3291 xfs_buf_relse(bp);
3292 }
3293 }
3294
3295 xfs_iflush_abort(iq);
3296 /* 3280 /*
3297 * Unlocks the flush lock 3281 * Unlocks the flush lock
3298 */ 3282 */
3283 xfs_iflush_abort(ip);
3299 return XFS_ERROR(EFSCORRUPTED); 3284 return XFS_ERROR(EFSCORRUPTED);
3300} 3285}
3301 3286
@@ -3325,8 +3310,7 @@ xfs_iflush_int(
3325 * If the inode isn't dirty, then just release the inode 3310 * If the inode isn't dirty, then just release the inode
3326 * flush lock and do nothing. 3311 * flush lock and do nothing.
3327 */ 3312 */
3328 if ((ip->i_update_core == 0) && 3313 if (xfs_inode_clean(ip)) {
3329 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL))) {
3330 xfs_ifunlock(ip); 3314 xfs_ifunlock(ip);
3331 return 0; 3315 return 0;
3332 } 3316 }
@@ -3459,16 +3443,9 @@ xfs_iflush_int(
3459 } 3443 }
3460 } 3444 }
3461 3445
3462 if (xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp) == EFSCORRUPTED) { 3446 xfs_iflush_fork(ip, dip, iip, XFS_DATA_FORK, bp);
3463 goto corrupt_out; 3447 if (XFS_IFORK_Q(ip))
3464 } 3448 xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3465
3466 if (XFS_IFORK_Q(ip)) {
3467 /*
3468 * The only error from xfs_iflush_fork is on the data fork.
3469 */
3470 (void) xfs_iflush_fork(ip, dip, iip, XFS_ATTR_FORK, bp);
3471 }
3472 xfs_inobp_check(mp, bp); 3449 xfs_inobp_check(mp, bp);
3473 3450
3474 /* 3451 /*
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index bfcd72cbaeea..93c37697a72c 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -133,19 +133,6 @@ typedef struct dm_attrs_s {
133} dm_attrs_t; 133} dm_attrs_t;
134 134
135/* 135/*
136 * This is the xfs inode cluster structure. This structure is used by
137 * xfs_iflush to find inodes that share a cluster and can be flushed to disk at
138 * the same time.
139 */
140typedef struct xfs_icluster {
141 struct hlist_head icl_inodes; /* list of inodes on cluster */
142 xfs_daddr_t icl_blkno; /* starting block number of
143 * the cluster */
144 struct xfs_buf *icl_buf; /* the inode buffer */
145 spinlock_t icl_lock; /* inode list lock */
146} xfs_icluster_t;
147
148/*
149 * This is the xfs in-core inode structure. 136 * This is the xfs in-core inode structure.
150 * Most of the on-disk inode is embedded in the i_d field. 137 * Most of the on-disk inode is embedded in the i_d field.
151 * 138 *
@@ -240,10 +227,6 @@ typedef struct xfs_inode {
240 atomic_t i_pincount; /* inode pin count */ 227 atomic_t i_pincount; /* inode pin count */
241 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */ 228 wait_queue_head_t i_ipin_wait; /* inode pinning wait queue */
242 spinlock_t i_flags_lock; /* inode i_flags lock */ 229 spinlock_t i_flags_lock; /* inode i_flags lock */
243#ifdef HAVE_REFCACHE
244 struct xfs_inode **i_refcache; /* ptr to entry in ref cache */
245 struct xfs_inode *i_release; /* inode to unref */
246#endif
247 /* Miscellaneous state. */ 230 /* Miscellaneous state. */
248 unsigned short i_flags; /* see defined flags below */ 231 unsigned short i_flags; /* see defined flags below */
249 unsigned char i_update_core; /* timestamps/size is dirty */ 232 unsigned char i_update_core; /* timestamps/size is dirty */
@@ -252,8 +235,6 @@ typedef struct xfs_inode {
252 unsigned int i_delayed_blks; /* count of delay alloc blks */ 235 unsigned int i_delayed_blks; /* count of delay alloc blks */
253 236
254 xfs_icdinode_t i_d; /* most of ondisk inode */ 237 xfs_icdinode_t i_d; /* most of ondisk inode */
255 xfs_icluster_t *i_cluster; /* cluster list header */
256 struct hlist_node i_cnode; /* cluster link node */
257 238
258 xfs_fsize_t i_size; /* in-memory size */ 239 xfs_fsize_t i_size; /* in-memory size */
259 xfs_fsize_t i_new_size; /* size when write completes */ 240 xfs_fsize_t i_new_size; /* size when write completes */
@@ -461,6 +442,7 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
461#define XFS_IFLUSH_SYNC 3 442#define XFS_IFLUSH_SYNC 3
462#define XFS_IFLUSH_ASYNC 4 443#define XFS_IFLUSH_ASYNC 4
463#define XFS_IFLUSH_DELWRI 5 444#define XFS_IFLUSH_DELWRI 5
445#define XFS_IFLUSH_ASYNC_NOBLOCK 6
464 446
465/* 447/*
466 * Flags for xfs_itruncate_start(). 448 * Flags for xfs_itruncate_start().
@@ -515,7 +497,7 @@ int xfs_finish_reclaim_all(struct xfs_mount *, int);
515 */ 497 */
516int xfs_itobp(struct xfs_mount *, struct xfs_trans *, 498int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
517 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **, 499 xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
518 xfs_daddr_t, uint); 500 xfs_daddr_t, uint, uint);
519int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, 501int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
520 xfs_inode_t **, xfs_daddr_t, uint); 502 xfs_inode_t **, xfs_daddr_t, uint);
521int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int); 503int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
@@ -597,7 +579,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
597#define xfs_inobp_check(mp, bp) 579#define xfs_inobp_check(mp, bp)
598#endif /* DEBUG */ 580#endif /* DEBUG */
599 581
600extern struct kmem_zone *xfs_icluster_zone;
601extern struct kmem_zone *xfs_ifork_zone; 582extern struct kmem_zone *xfs_ifork_zone;
602extern struct kmem_zone *xfs_inode_zone; 583extern struct kmem_zone *xfs_inode_zone;
603extern struct kmem_zone *xfs_ili_zone; 584extern struct kmem_zone *xfs_ili_zone;
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 2c775b4ae9e6..93b5db453ea2 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -40,6 +40,7 @@
40#include "xfs_btree.h" 40#include "xfs_btree.h"
41#include "xfs_ialloc.h" 41#include "xfs_ialloc.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43#include "xfs_error.h"
43 44
44 45
45kmem_zone_t *xfs_ili_zone; /* inode log item zone */ 46kmem_zone_t *xfs_ili_zone; /* inode log item zone */
@@ -813,7 +814,12 @@ xfs_inode_item_pushbuf(
813 XFS_LOG_FORCE); 814 XFS_LOG_FORCE);
814 } 815 }
815 if (dopush) { 816 if (dopush) {
816 xfs_bawrite(mp, bp); 817 int error;
818 error = xfs_bawrite(mp, bp);
819 if (error)
820 xfs_fs_cmn_err(CE_WARN, mp,
821 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
822 error, iip, bp);
817 } else { 823 } else {
818 xfs_buf_relse(bp); 824 xfs_buf_relse(bp);
819 } 825 }
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index bfe92ea17952..40513077ab36 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -168,6 +168,14 @@ static inline int xfs_ilog_fext(int w)
168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT); 168 return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
169} 169}
170 170
171static inline int xfs_inode_clean(xfs_inode_t *ip)
172{
173 return (!ip->i_itemp ||
174 !(ip->i_itemp->ili_format.ilf_fields & XFS_ILOG_ALL)) &&
175 !ip->i_update_core;
176}
177
178
171#ifdef __KERNEL__ 179#ifdef __KERNEL__
172 180
173extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *); 181extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index fde37f87d52f..fb3cf1191419 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,8 +802,11 @@ xfs_iomap_write_allocate(
802 */ 802 */
803 nimaps = 1; 803 nimaps = 1;
804 end_fsb = XFS_B_TO_FSB(mp, ip->i_size); 804 end_fsb = XFS_B_TO_FSB(mp, ip->i_size);
805 xfs_bmap_last_offset(NULL, ip, &last_block, 805 error = xfs_bmap_last_offset(NULL, ip, &last_block,
806 XFS_DATA_FORK); 806 XFS_DATA_FORK);
807 if (error)
808 goto trans_cancel;
809
807 last_block = XFS_FILEOFF_MAX(last_block, end_fsb); 810 last_block = XFS_FILEOFF_MAX(last_block, end_fsb);
808 if ((map_start_fsb + count_fsb) > last_block) { 811 if ((map_start_fsb + count_fsb) > last_block) {
809 count_fsb = last_block - map_start_fsb; 812 count_fsb = last_block - map_start_fsb;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index f615e04364f4..eb85bdedad0c 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -129,7 +129,7 @@ xfs_bulkstat_one_iget(
129 return error; 129 return error;
130} 130}
131 131
132STATIC int 132STATIC void
133xfs_bulkstat_one_dinode( 133xfs_bulkstat_one_dinode(
134 xfs_mount_t *mp, /* mount point for filesystem */ 134 xfs_mount_t *mp, /* mount point for filesystem */
135 xfs_ino_t ino, /* inode number to get data for */ 135 xfs_ino_t ino, /* inode number to get data for */
@@ -198,8 +198,6 @@ xfs_bulkstat_one_dinode(
198 buf->bs_blocks = be64_to_cpu(dic->di_nblocks); 198 buf->bs_blocks = be64_to_cpu(dic->di_nblocks);
199 break; 199 break;
200 } 200 }
201
202 return 0;
203} 201}
204 202
205STATIC int 203STATIC int
@@ -614,7 +612,8 @@ xfs_bulkstat(
614 xfs_buf_relse(bp); 612 xfs_buf_relse(bp);
615 error = xfs_itobp(mp, NULL, ip, 613 error = xfs_itobp(mp, NULL, ip,
616 &dip, &bp, bno, 614 &dip, &bp, bno,
617 XFS_IMAP_BULKSTAT); 615 XFS_IMAP_BULKSTAT,
616 XFS_BUF_LOCK);
618 if (!error) 617 if (!error)
619 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize; 618 clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
620 kmem_zone_free(xfs_inode_zone, ip); 619 kmem_zone_free(xfs_inode_zone, ip);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 31f2b04f2c97..afaee301b0ee 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -41,6 +41,7 @@
41#include "xfs_inode.h" 41#include "xfs_inode.h"
42#include "xfs_rw.h" 42#include "xfs_rw.h"
43 43
44kmem_zone_t *xfs_log_ticket_zone;
44 45
45#define xlog_write_adv_cnt(ptr, len, off, bytes) \ 46#define xlog_write_adv_cnt(ptr, len, off, bytes) \
46 { (ptr) += (bytes); \ 47 { (ptr) += (bytes); \
@@ -73,8 +74,6 @@ STATIC int xlog_state_get_iclog_space(xlog_t *log,
73 xlog_ticket_t *ticket, 74 xlog_ticket_t *ticket,
74 int *continued_write, 75 int *continued_write,
75 int *logoffsetp); 76 int *logoffsetp);
76STATIC void xlog_state_put_ticket(xlog_t *log,
77 xlog_ticket_t *tic);
78STATIC int xlog_state_release_iclog(xlog_t *log, 77STATIC int xlog_state_release_iclog(xlog_t *log,
79 xlog_in_core_t *iclog); 78 xlog_in_core_t *iclog);
80STATIC void xlog_state_switch_iclogs(xlog_t *log, 79STATIC void xlog_state_switch_iclogs(xlog_t *log,
@@ -101,7 +100,6 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
101 100
102 101
103/* local ticket functions */ 102/* local ticket functions */
104STATIC void xlog_state_ticket_alloc(xlog_t *log);
105STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log, 103STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log,
106 int unit_bytes, 104 int unit_bytes,
107 int count, 105 int count,
@@ -330,7 +328,7 @@ xfs_log_done(xfs_mount_t *mp,
330 */ 328 */
331 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); 329 xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
332 xlog_ungrant_log_space(log, ticket); 330 xlog_ungrant_log_space(log, ticket);
333 xlog_state_put_ticket(log, ticket); 331 xlog_ticket_put(log, ticket);
334 } else { 332 } else {
335 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)"); 333 xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
336 xlog_regrant_reserve_log_space(log, ticket); 334 xlog_regrant_reserve_log_space(log, ticket);
@@ -384,7 +382,27 @@ _xfs_log_force(
384 return xlog_state_sync_all(log, flags, log_flushed); 382 return xlog_state_sync_all(log, flags, log_flushed);
385 else 383 else
386 return xlog_state_sync(log, lsn, flags, log_flushed); 384 return xlog_state_sync(log, lsn, flags, log_flushed);
387} /* xfs_log_force */ 385} /* _xfs_log_force */
386
387/*
388 * Wrapper for _xfs_log_force(), to be used when caller doesn't care
389 * about errors or whether the log was flushed or not. This is the normal
390 * interface to use when trying to unpin items or move the log forward.
391 */
392void
393xfs_log_force(
394 xfs_mount_t *mp,
395 xfs_lsn_t lsn,
396 uint flags)
397{
398 int error;
399 error = _xfs_log_force(mp, lsn, flags, NULL);
400 if (error) {
401 xfs_fs_cmn_err(CE_WARN, mp, "xfs_log_force: "
402 "error %d returned.", error);
403 }
404}
405
388 406
389/* 407/*
390 * Attaches a new iclog I/O completion callback routine during 408 * Attaches a new iclog I/O completion callback routine during
@@ -397,12 +415,10 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
397 void *iclog_hndl, /* iclog to hang callback off */ 415 void *iclog_hndl, /* iclog to hang callback off */
398 xfs_log_callback_t *cb) 416 xfs_log_callback_t *cb)
399{ 417{
400 xlog_t *log = mp->m_log;
401 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl; 418 xlog_in_core_t *iclog = (xlog_in_core_t *)iclog_hndl;
402 int abortflg; 419 int abortflg;
403 420
404 cb->cb_next = NULL; 421 spin_lock(&iclog->ic_callback_lock);
405 spin_lock(&log->l_icloglock);
406 abortflg = (iclog->ic_state & XLOG_STATE_IOERROR); 422 abortflg = (iclog->ic_state & XLOG_STATE_IOERROR);
407 if (!abortflg) { 423 if (!abortflg) {
408 ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) || 424 ASSERT_ALWAYS((iclog->ic_state == XLOG_STATE_ACTIVE) ||
@@ -411,7 +427,7 @@ xfs_log_notify(xfs_mount_t *mp, /* mount of partition */
411 *(iclog->ic_callback_tail) = cb; 427 *(iclog->ic_callback_tail) = cb;
412 iclog->ic_callback_tail = &(cb->cb_next); 428 iclog->ic_callback_tail = &(cb->cb_next);
413 } 429 }
414 spin_unlock(&log->l_icloglock); 430 spin_unlock(&iclog->ic_callback_lock);
415 return abortflg; 431 return abortflg;
416} /* xfs_log_notify */ 432} /* xfs_log_notify */
417 433
@@ -471,6 +487,8 @@ xfs_log_reserve(xfs_mount_t *mp,
471 /* may sleep if need to allocate more tickets */ 487 /* may sleep if need to allocate more tickets */
472 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt, 488 internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
473 client, flags); 489 client, flags);
490 if (!internal_ticket)
491 return XFS_ERROR(ENOMEM);
474 internal_ticket->t_trans_type = t_type; 492 internal_ticket->t_trans_type = t_type;
475 *ticket = internal_ticket; 493 *ticket = internal_ticket;
476 xlog_trace_loggrant(log, internal_ticket, 494 xlog_trace_loggrant(log, internal_ticket,
@@ -636,7 +654,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
636 if (mp->m_flags & XFS_MOUNT_RDONLY) 654 if (mp->m_flags & XFS_MOUNT_RDONLY)
637 return 0; 655 return 0;
638 656
639 xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC); 657 error = _xfs_log_force(mp, 0, XFS_LOG_FORCE|XFS_LOG_SYNC, NULL);
658 ASSERT(error || !(XLOG_FORCED_SHUTDOWN(log)));
640 659
641#ifdef DEBUG 660#ifdef DEBUG
642 first_iclog = iclog = log->l_iclog; 661 first_iclog = iclog = log->l_iclog;
@@ -675,10 +694,10 @@ xfs_log_unmount_write(xfs_mount_t *mp)
675 694
676 spin_lock(&log->l_icloglock); 695 spin_lock(&log->l_icloglock);
677 iclog = log->l_iclog; 696 iclog = log->l_iclog;
678 iclog->ic_refcnt++; 697 atomic_inc(&iclog->ic_refcnt);
679 spin_unlock(&log->l_icloglock); 698 spin_unlock(&log->l_icloglock);
680 xlog_state_want_sync(log, iclog); 699 xlog_state_want_sync(log, iclog);
681 (void) xlog_state_release_iclog(log, iclog); 700 error = xlog_state_release_iclog(log, iclog);
682 701
683 spin_lock(&log->l_icloglock); 702 spin_lock(&log->l_icloglock);
684 if (!(iclog->ic_state == XLOG_STATE_ACTIVE || 703 if (!(iclog->ic_state == XLOG_STATE_ACTIVE ||
@@ -695,7 +714,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
695 if (tic) { 714 if (tic) {
696 xlog_trace_loggrant(log, tic, "unmount rec"); 715 xlog_trace_loggrant(log, tic, "unmount rec");
697 xlog_ungrant_log_space(log, tic); 716 xlog_ungrant_log_space(log, tic);
698 xlog_state_put_ticket(log, tic); 717 xlog_ticket_put(log, tic);
699 } 718 }
700 } else { 719 } else {
701 /* 720 /*
@@ -713,11 +732,11 @@ xfs_log_unmount_write(xfs_mount_t *mp)
713 */ 732 */
714 spin_lock(&log->l_icloglock); 733 spin_lock(&log->l_icloglock);
715 iclog = log->l_iclog; 734 iclog = log->l_iclog;
716 iclog->ic_refcnt++; 735 atomic_inc(&iclog->ic_refcnt);
717 spin_unlock(&log->l_icloglock); 736 spin_unlock(&log->l_icloglock);
718 737
719 xlog_state_want_sync(log, iclog); 738 xlog_state_want_sync(log, iclog);
720 (void) xlog_state_release_iclog(log, iclog); 739 error = xlog_state_release_iclog(log, iclog);
721 740
722 spin_lock(&log->l_icloglock); 741 spin_lock(&log->l_icloglock);
723 742
@@ -732,7 +751,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
732 } 751 }
733 } 752 }
734 753
735 return 0; 754 return error;
736} /* xfs_log_unmount_write */ 755} /* xfs_log_unmount_write */
737 756
738/* 757/*
@@ -1210,7 +1229,6 @@ xlog_alloc_log(xfs_mount_t *mp,
1210 spin_lock_init(&log->l_icloglock); 1229 spin_lock_init(&log->l_icloglock);
1211 spin_lock_init(&log->l_grant_lock); 1230 spin_lock_init(&log->l_grant_lock);
1212 initnsema(&log->l_flushsema, 0, "ic-flush"); 1231 initnsema(&log->l_flushsema, 0, "ic-flush");
1213 xlog_state_ticket_alloc(log); /* wait until after icloglock inited */
1214 1232
1215 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ 1233 /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */
1216 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); 1234 ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0);
@@ -1240,9 +1258,9 @@ xlog_alloc_log(xfs_mount_t *mp,
1240 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1); 1258 XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
1241 iclog->ic_bp = bp; 1259 iclog->ic_bp = bp;
1242 iclog->hic_data = bp->b_addr; 1260 iclog->hic_data = bp->b_addr;
1243 1261#ifdef DEBUG
1244 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); 1262 log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
1245 1263#endif
1246 head = &iclog->ic_header; 1264 head = &iclog->ic_header;
1247 memset(head, 0, sizeof(xlog_rec_header_t)); 1265 memset(head, 0, sizeof(xlog_rec_header_t));
1248 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM); 1266 head->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
@@ -1253,10 +1271,11 @@ xlog_alloc_log(xfs_mount_t *mp,
1253 head->h_fmt = cpu_to_be32(XLOG_FMT); 1271 head->h_fmt = cpu_to_be32(XLOG_FMT);
1254 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t)); 1272 memcpy(&head->h_fs_uuid, &mp->m_sb.sb_uuid, sizeof(uuid_t));
1255 1273
1256
1257 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize; 1274 iclog->ic_size = XFS_BUF_SIZE(bp) - log->l_iclog_hsize;
1258 iclog->ic_state = XLOG_STATE_ACTIVE; 1275 iclog->ic_state = XLOG_STATE_ACTIVE;
1259 iclog->ic_log = log; 1276 iclog->ic_log = log;
1277 atomic_set(&iclog->ic_refcnt, 0);
1278 spin_lock_init(&iclog->ic_callback_lock);
1260 iclog->ic_callback_tail = &(iclog->ic_callback); 1279 iclog->ic_callback_tail = &(iclog->ic_callback);
1261 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize; 1280 iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
1262 1281
@@ -1405,7 +1424,7 @@ xlog_sync(xlog_t *log,
1405 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); 1424 int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
1406 1425
1407 XFS_STATS_INC(xs_log_writes); 1426 XFS_STATS_INC(xs_log_writes);
1408 ASSERT(iclog->ic_refcnt == 0); 1427 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
1409 1428
1410 /* Add for LR header */ 1429 /* Add for LR header */
1411 count_init = log->l_iclog_hsize + iclog->ic_offset; 1430 count_init = log->l_iclog_hsize + iclog->ic_offset;
@@ -1538,7 +1557,6 @@ STATIC void
1538xlog_dealloc_log(xlog_t *log) 1557xlog_dealloc_log(xlog_t *log)
1539{ 1558{
1540 xlog_in_core_t *iclog, *next_iclog; 1559 xlog_in_core_t *iclog, *next_iclog;
1541 xlog_ticket_t *tic, *next_tic;
1542 int i; 1560 int i;
1543 1561
1544 iclog = log->l_iclog; 1562 iclog = log->l_iclog;
@@ -1559,22 +1577,6 @@ xlog_dealloc_log(xlog_t *log)
1559 spinlock_destroy(&log->l_icloglock); 1577 spinlock_destroy(&log->l_icloglock);
1560 spinlock_destroy(&log->l_grant_lock); 1578 spinlock_destroy(&log->l_grant_lock);
1561 1579
1562 /* XXXsup take a look at this again. */
1563 if ((log->l_ticket_cnt != log->l_ticket_tcnt) &&
1564 !XLOG_FORCED_SHUTDOWN(log)) {
1565 xfs_fs_cmn_err(CE_WARN, log->l_mp,
1566 "xlog_dealloc_log: (cnt: %d, total: %d)",
1567 log->l_ticket_cnt, log->l_ticket_tcnt);
1568 /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */
1569
1570 } else {
1571 tic = log->l_unmount_free;
1572 while (tic) {
1573 next_tic = tic->t_next;
1574 kmem_free(tic, PAGE_SIZE);
1575 tic = next_tic;
1576 }
1577 }
1578 xfs_buf_free(log->l_xbuf); 1580 xfs_buf_free(log->l_xbuf);
1579#ifdef XFS_LOG_TRACE 1581#ifdef XFS_LOG_TRACE
1580 if (log->l_trace != NULL) { 1582 if (log->l_trace != NULL) {
@@ -1987,7 +1989,7 @@ xlog_state_clean_log(xlog_t *log)
1987 if (iclog->ic_state == XLOG_STATE_DIRTY) { 1989 if (iclog->ic_state == XLOG_STATE_DIRTY) {
1988 iclog->ic_state = XLOG_STATE_ACTIVE; 1990 iclog->ic_state = XLOG_STATE_ACTIVE;
1989 iclog->ic_offset = 0; 1991 iclog->ic_offset = 0;
1990 iclog->ic_callback = NULL; /* don't need to free */ 1992 ASSERT(iclog->ic_callback == NULL);
1991 /* 1993 /*
1992 * If the number of ops in this iclog indicate it just 1994 * If the number of ops in this iclog indicate it just
1993 * contains the dummy transaction, we can 1995 * contains the dummy transaction, we can
@@ -2190,37 +2192,40 @@ xlog_state_do_callback(
2190 be64_to_cpu(iclog->ic_header.h_lsn); 2192 be64_to_cpu(iclog->ic_header.h_lsn);
2191 spin_unlock(&log->l_grant_lock); 2193 spin_unlock(&log->l_grant_lock);
2192 2194
2193 /*
2194 * Keep processing entries in the callback list
2195 * until we come around and it is empty. We
2196 * need to atomically see that the list is
2197 * empty and change the state to DIRTY so that
2198 * we don't miss any more callbacks being added.
2199 */
2200 spin_lock(&log->l_icloglock);
2201 } else { 2195 } else {
2196 spin_unlock(&log->l_icloglock);
2202 ioerrors++; 2197 ioerrors++;
2203 } 2198 }
2204 cb = iclog->ic_callback;
2205 2199
2200 /*
2201 * Keep processing entries in the callback list until
2202 * we come around and it is empty. We need to
2203 * atomically see that the list is empty and change the
2204 * state to DIRTY so that we don't miss any more
2205 * callbacks being added.
2206 */
2207 spin_lock(&iclog->ic_callback_lock);
2208 cb = iclog->ic_callback;
2206 while (cb) { 2209 while (cb) {
2207 iclog->ic_callback_tail = &(iclog->ic_callback); 2210 iclog->ic_callback_tail = &(iclog->ic_callback);
2208 iclog->ic_callback = NULL; 2211 iclog->ic_callback = NULL;
2209 spin_unlock(&log->l_icloglock); 2212 spin_unlock(&iclog->ic_callback_lock);
2210 2213
2211 /* perform callbacks in the order given */ 2214 /* perform callbacks in the order given */
2212 for (; cb; cb = cb_next) { 2215 for (; cb; cb = cb_next) {
2213 cb_next = cb->cb_next; 2216 cb_next = cb->cb_next;
2214 cb->cb_func(cb->cb_arg, aborted); 2217 cb->cb_func(cb->cb_arg, aborted);
2215 } 2218 }
2216 spin_lock(&log->l_icloglock); 2219 spin_lock(&iclog->ic_callback_lock);
2217 cb = iclog->ic_callback; 2220 cb = iclog->ic_callback;
2218 } 2221 }
2219 2222
2220 loopdidcallbacks++; 2223 loopdidcallbacks++;
2221 funcdidcallbacks++; 2224 funcdidcallbacks++;
2222 2225
2226 spin_lock(&log->l_icloglock);
2223 ASSERT(iclog->ic_callback == NULL); 2227 ASSERT(iclog->ic_callback == NULL);
2228 spin_unlock(&iclog->ic_callback_lock);
2224 if (!(iclog->ic_state & XLOG_STATE_IOERROR)) 2229 if (!(iclog->ic_state & XLOG_STATE_IOERROR))
2225 iclog->ic_state = XLOG_STATE_DIRTY; 2230 iclog->ic_state = XLOG_STATE_DIRTY;
2226 2231
@@ -2241,7 +2246,7 @@ xlog_state_do_callback(
2241 repeats = 0; 2246 repeats = 0;
2242 xfs_fs_cmn_err(CE_WARN, log->l_mp, 2247 xfs_fs_cmn_err(CE_WARN, log->l_mp,
2243 "%s: possible infinite loop (%d iterations)", 2248 "%s: possible infinite loop (%d iterations)",
2244 __FUNCTION__, flushcnt); 2249 __func__, flushcnt);
2245 } 2250 }
2246 } while (!ioerrors && loopdidcallbacks); 2251 } while (!ioerrors && loopdidcallbacks);
2247 2252
@@ -2309,7 +2314,7 @@ xlog_state_done_syncing(
2309 2314
2310 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING || 2315 ASSERT(iclog->ic_state == XLOG_STATE_SYNCING ||
2311 iclog->ic_state == XLOG_STATE_IOERROR); 2316 iclog->ic_state == XLOG_STATE_IOERROR);
2312 ASSERT(iclog->ic_refcnt == 0); 2317 ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
2313 ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2); 2318 ASSERT(iclog->ic_bwritecnt == 1 || iclog->ic_bwritecnt == 2);
2314 2319
2315 2320
@@ -2391,7 +2396,7 @@ restart:
2391 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE); 2396 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE);
2392 head = &iclog->ic_header; 2397 head = &iclog->ic_header;
2393 2398
2394 iclog->ic_refcnt++; /* prevents sync */ 2399 atomic_inc(&iclog->ic_refcnt); /* prevents sync */
2395 log_offset = iclog->ic_offset; 2400 log_offset = iclog->ic_offset;
2396 2401
2397 /* On the 1st write to an iclog, figure out lsn. This works 2402 /* On the 1st write to an iclog, figure out lsn. This works
@@ -2423,12 +2428,12 @@ restart:
2423 xlog_state_switch_iclogs(log, iclog, iclog->ic_size); 2428 xlog_state_switch_iclogs(log, iclog, iclog->ic_size);
2424 2429
2425 /* If I'm the only one writing to this iclog, sync it to disk */ 2430 /* If I'm the only one writing to this iclog, sync it to disk */
2426 if (iclog->ic_refcnt == 1) { 2431 if (atomic_read(&iclog->ic_refcnt) == 1) {
2427 spin_unlock(&log->l_icloglock); 2432 spin_unlock(&log->l_icloglock);
2428 if ((error = xlog_state_release_iclog(log, iclog))) 2433 if ((error = xlog_state_release_iclog(log, iclog)))
2429 return error; 2434 return error;
2430 } else { 2435 } else {
2431 iclog->ic_refcnt--; 2436 atomic_dec(&iclog->ic_refcnt);
2432 spin_unlock(&log->l_icloglock); 2437 spin_unlock(&log->l_icloglock);
2433 } 2438 }
2434 goto restart; 2439 goto restart;
@@ -2792,18 +2797,6 @@ xlog_ungrant_log_space(xlog_t *log,
2792 2797
2793 2798
2794/* 2799/*
2795 * Atomically put back used ticket.
2796 */
2797STATIC void
2798xlog_state_put_ticket(xlog_t *log,
2799 xlog_ticket_t *tic)
2800{
2801 spin_lock(&log->l_icloglock);
2802 xlog_ticket_put(log, tic);
2803 spin_unlock(&log->l_icloglock);
2804} /* xlog_state_put_ticket */
2805
2806/*
2807 * Flush iclog to disk if this is the last reference to the given iclog and 2800 * Flush iclog to disk if this is the last reference to the given iclog and
2808 * the WANT_SYNC bit is set. 2801 * the WANT_SYNC bit is set.
2809 * 2802 *
@@ -2813,33 +2806,35 @@ xlog_state_put_ticket(xlog_t *log,
2813 * 2806 *
2814 */ 2807 */
2815STATIC int 2808STATIC int
2816xlog_state_release_iclog(xlog_t *log, 2809xlog_state_release_iclog(
2817 xlog_in_core_t *iclog) 2810 xlog_t *log,
2811 xlog_in_core_t *iclog)
2818{ 2812{
2819 int sync = 0; /* do we sync? */ 2813 int sync = 0; /* do we sync? */
2820 2814
2821 xlog_assign_tail_lsn(log->l_mp); 2815 if (iclog->ic_state & XLOG_STATE_IOERROR)
2816 return XFS_ERROR(EIO);
2822 2817
2823 spin_lock(&log->l_icloglock); 2818 ASSERT(atomic_read(&iclog->ic_refcnt) > 0);
2819 if (!atomic_dec_and_lock(&iclog->ic_refcnt, &log->l_icloglock))
2820 return 0;
2824 2821
2825 if (iclog->ic_state & XLOG_STATE_IOERROR) { 2822 if (iclog->ic_state & XLOG_STATE_IOERROR) {
2826 spin_unlock(&log->l_icloglock); 2823 spin_unlock(&log->l_icloglock);
2827 return XFS_ERROR(EIO); 2824 return XFS_ERROR(EIO);
2828 } 2825 }
2829
2830 ASSERT(iclog->ic_refcnt > 0);
2831 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE || 2826 ASSERT(iclog->ic_state == XLOG_STATE_ACTIVE ||
2832 iclog->ic_state == XLOG_STATE_WANT_SYNC); 2827 iclog->ic_state == XLOG_STATE_WANT_SYNC);
2833 2828
2834 if (--iclog->ic_refcnt == 0 && 2829 if (iclog->ic_state == XLOG_STATE_WANT_SYNC) {
2835 iclog->ic_state == XLOG_STATE_WANT_SYNC) { 2830 /* update tail before writing to iclog */
2831 xlog_assign_tail_lsn(log->l_mp);
2836 sync++; 2832 sync++;
2837 iclog->ic_state = XLOG_STATE_SYNCING; 2833 iclog->ic_state = XLOG_STATE_SYNCING;
2838 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn); 2834 iclog->ic_header.h_tail_lsn = cpu_to_be64(log->l_tail_lsn);
2839 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn); 2835 xlog_verify_tail_lsn(log, iclog, log->l_tail_lsn);
2840 /* cycle incremented when incrementing curr_block */ 2836 /* cycle incremented when incrementing curr_block */
2841 } 2837 }
2842
2843 spin_unlock(&log->l_icloglock); 2838 spin_unlock(&log->l_icloglock);
2844 2839
2845 /* 2840 /*
@@ -2849,11 +2844,9 @@ xlog_state_release_iclog(xlog_t *log,
2849 * this iclog has consistent data, so we ignore IOERROR 2844 * this iclog has consistent data, so we ignore IOERROR
2850 * flags after this point. 2845 * flags after this point.
2851 */ 2846 */
2852 if (sync) { 2847 if (sync)
2853 return xlog_sync(log, iclog); 2848 return xlog_sync(log, iclog);
2854 }
2855 return 0; 2849 return 0;
2856
2857} /* xlog_state_release_iclog */ 2850} /* xlog_state_release_iclog */
2858 2851
2859 2852
@@ -2953,7 +2946,8 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
2953 * previous iclog and go to sleep. 2946 * previous iclog and go to sleep.
2954 */ 2947 */
2955 if (iclog->ic_state == XLOG_STATE_DIRTY || 2948 if (iclog->ic_state == XLOG_STATE_DIRTY ||
2956 (iclog->ic_refcnt == 0 && iclog->ic_offset == 0)) { 2949 (atomic_read(&iclog->ic_refcnt) == 0
2950 && iclog->ic_offset == 0)) {
2957 iclog = iclog->ic_prev; 2951 iclog = iclog->ic_prev;
2958 if (iclog->ic_state == XLOG_STATE_ACTIVE || 2952 if (iclog->ic_state == XLOG_STATE_ACTIVE ||
2959 iclog->ic_state == XLOG_STATE_DIRTY) 2953 iclog->ic_state == XLOG_STATE_DIRTY)
@@ -2961,14 +2955,14 @@ xlog_state_sync_all(xlog_t *log, uint flags, int *log_flushed)
2961 else 2955 else
2962 goto maybe_sleep; 2956 goto maybe_sleep;
2963 } else { 2957 } else {
2964 if (iclog->ic_refcnt == 0) { 2958 if (atomic_read(&iclog->ic_refcnt) == 0) {
2965 /* We are the only one with access to this 2959 /* We are the only one with access to this
2966 * iclog. Flush it out now. There should 2960 * iclog. Flush it out now. There should
2967 * be a roundoff of zero to show that someone 2961 * be a roundoff of zero to show that someone
2968 * has already taken care of the roundoff from 2962 * has already taken care of the roundoff from
2969 * the previous sync. 2963 * the previous sync.
2970 */ 2964 */
2971 iclog->ic_refcnt++; 2965 atomic_inc(&iclog->ic_refcnt);
2972 lsn = be64_to_cpu(iclog->ic_header.h_lsn); 2966 lsn = be64_to_cpu(iclog->ic_header.h_lsn);
2973 xlog_state_switch_iclogs(log, iclog, 0); 2967 xlog_state_switch_iclogs(log, iclog, 0);
2974 spin_unlock(&log->l_icloglock); 2968 spin_unlock(&log->l_icloglock);
@@ -3100,7 +3094,7 @@ try_again:
3100 already_slept = 1; 3094 already_slept = 1;
3101 goto try_again; 3095 goto try_again;
3102 } else { 3096 } else {
3103 iclog->ic_refcnt++; 3097 atomic_inc(&iclog->ic_refcnt);
3104 xlog_state_switch_iclogs(log, iclog, 0); 3098 xlog_state_switch_iclogs(log, iclog, 0);
3105 spin_unlock(&log->l_icloglock); 3099 spin_unlock(&log->l_icloglock);
3106 if (xlog_state_release_iclog(log, iclog)) 3100 if (xlog_state_release_iclog(log, iclog))
@@ -3172,92 +3166,19 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
3172 */ 3166 */
3173 3167
3174/* 3168/*
3175 * Algorithm doesn't take into account page size. ;-( 3169 * Free a used ticket.
3176 */
3177STATIC void
3178xlog_state_ticket_alloc(xlog_t *log)
3179{
3180 xlog_ticket_t *t_list;
3181 xlog_ticket_t *next;
3182 xfs_caddr_t buf;
3183 uint i = (PAGE_SIZE / sizeof(xlog_ticket_t)) - 2;
3184
3185 /*
3186 * The kmem_zalloc may sleep, so we shouldn't be holding the
3187 * global lock. XXXmiken: may want to use zone allocator.
3188 */
3189 buf = (xfs_caddr_t) kmem_zalloc(PAGE_SIZE, KM_SLEEP);
3190
3191 spin_lock(&log->l_icloglock);
3192
3193 /* Attach 1st ticket to Q, so we can keep track of allocated memory */
3194 t_list = (xlog_ticket_t *)buf;
3195 t_list->t_next = log->l_unmount_free;
3196 log->l_unmount_free = t_list++;
3197 log->l_ticket_cnt++;
3198 log->l_ticket_tcnt++;
3199
3200 /* Next ticket becomes first ticket attached to ticket free list */
3201 if (log->l_freelist != NULL) {
3202 ASSERT(log->l_tail != NULL);
3203 log->l_tail->t_next = t_list;
3204 } else {
3205 log->l_freelist = t_list;
3206 }
3207 log->l_ticket_cnt++;
3208 log->l_ticket_tcnt++;
3209
3210 /* Cycle through rest of alloc'ed memory, building up free Q */
3211 for ( ; i > 0; i--) {
3212 next = t_list + 1;
3213 t_list->t_next = next;
3214 t_list = next;
3215 log->l_ticket_cnt++;
3216 log->l_ticket_tcnt++;
3217 }
3218 t_list->t_next = NULL;
3219 log->l_tail = t_list;
3220 spin_unlock(&log->l_icloglock);
3221} /* xlog_state_ticket_alloc */
3222
3223
3224/*
3225 * Put ticket into free list
3226 *
3227 * Assumption: log lock is held around this call.
3228 */ 3170 */
3229STATIC void 3171STATIC void
3230xlog_ticket_put(xlog_t *log, 3172xlog_ticket_put(xlog_t *log,
3231 xlog_ticket_t *ticket) 3173 xlog_ticket_t *ticket)
3232{ 3174{
3233 sv_destroy(&ticket->t_sema); 3175 sv_destroy(&ticket->t_sema);
3234 3176 kmem_zone_free(xfs_log_ticket_zone, ticket);
3235 /*
3236 * Don't think caching will make that much difference. It's
3237 * more important to make debug easier.
3238 */
3239#if 0
3240 /* real code will want to use LIFO for caching */
3241 ticket->t_next = log->l_freelist;
3242 log->l_freelist = ticket;
3243 /* no need to clear fields */
3244#else
3245 /* When we debug, it is easier if tickets are cycled */
3246 ticket->t_next = NULL;
3247 if (log->l_tail) {
3248 log->l_tail->t_next = ticket;
3249 } else {
3250 ASSERT(log->l_freelist == NULL);
3251 log->l_freelist = ticket;
3252 }
3253 log->l_tail = ticket;
3254#endif /* DEBUG */
3255 log->l_ticket_cnt++;
3256} /* xlog_ticket_put */ 3177} /* xlog_ticket_put */
3257 3178
3258 3179
3259/* 3180/*
3260 * Grab ticket off freelist or allocation some more 3181 * Allocate and initialise a new log ticket.
3261 */ 3182 */
3262STATIC xlog_ticket_t * 3183STATIC xlog_ticket_t *
3263xlog_ticket_get(xlog_t *log, 3184xlog_ticket_get(xlog_t *log,
@@ -3269,21 +3190,9 @@ xlog_ticket_get(xlog_t *log,
3269 xlog_ticket_t *tic; 3190 xlog_ticket_t *tic;
3270 uint num_headers; 3191 uint num_headers;
3271 3192
3272 alloc: 3193 tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
3273 if (log->l_freelist == NULL) 3194 if (!tic)
3274 xlog_state_ticket_alloc(log); /* potentially sleep */ 3195 return NULL;
3275
3276 spin_lock(&log->l_icloglock);
3277 if (log->l_freelist == NULL) {
3278 spin_unlock(&log->l_icloglock);
3279 goto alloc;
3280 }
3281 tic = log->l_freelist;
3282 log->l_freelist = tic->t_next;
3283 if (log->l_freelist == NULL)
3284 log->l_tail = NULL;
3285 log->l_ticket_cnt--;
3286 spin_unlock(&log->l_icloglock);
3287 3196
3288 /* 3197 /*
3289 * Permanent reservations have up to 'cnt'-1 active log operations 3198 * Permanent reservations have up to 'cnt'-1 active log operations
@@ -3611,8 +3520,8 @@ xfs_log_force_umount(
3611 * before we mark the filesystem SHUTDOWN and wake 3520 * before we mark the filesystem SHUTDOWN and wake
3612 * everybody up to tell the bad news. 3521 * everybody up to tell the bad news.
3613 */ 3522 */
3614 spin_lock(&log->l_grant_lock);
3615 spin_lock(&log->l_icloglock); 3523 spin_lock(&log->l_icloglock);
3524 spin_lock(&log->l_grant_lock);
3616 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN; 3525 mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
3617 XFS_BUF_DONE(mp->m_sb_bp); 3526 XFS_BUF_DONE(mp->m_sb_bp);
3618 /* 3527 /*
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 4cdac048df5e..d1d678ecb63e 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -142,8 +142,9 @@ int _xfs_log_force(struct xfs_mount *mp,
142 xfs_lsn_t lsn, 142 xfs_lsn_t lsn,
143 uint flags, 143 uint flags,
144 int *log_forced); 144 int *log_forced);
145#define xfs_log_force(mp, lsn, flags) \ 145void xfs_log_force(struct xfs_mount *mp,
146 _xfs_log_force(mp, lsn, flags, NULL); 146 xfs_lsn_t lsn,
147 uint flags);
147int xfs_log_mount(struct xfs_mount *mp, 148int xfs_log_mount(struct xfs_mount *mp,
148 struct xfs_buftarg *log_target, 149 struct xfs_buftarg *log_target,
149 xfs_daddr_t start_block, 150 xfs_daddr_t start_block,
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index c6244cc733c0..8952a392b5f3 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -242,7 +242,7 @@ typedef struct xlog_res {
242 242
243typedef struct xlog_ticket { 243typedef struct xlog_ticket {
244 sv_t t_sema; /* sleep on this semaphore : 20 */ 244 sv_t t_sema; /* sleep on this semaphore : 20 */
245 struct xlog_ticket *t_next; /* :4|8 */ 245 struct xlog_ticket *t_next; /* :4|8 */
246 struct xlog_ticket *t_prev; /* :4|8 */ 246 struct xlog_ticket *t_prev; /* :4|8 */
247 xlog_tid_t t_tid; /* transaction identifier : 4 */ 247 xlog_tid_t t_tid; /* transaction identifier : 4 */
248 int t_curr_res; /* current reservation in bytes : 4 */ 248 int t_curr_res; /* current reservation in bytes : 4 */
@@ -324,6 +324,19 @@ typedef struct xlog_rec_ext_header {
324 * - ic_offset is the current number of bytes written to in this iclog. 324 * - ic_offset is the current number of bytes written to in this iclog.
325 * - ic_refcnt is bumped when someone is writing to the log. 325 * - ic_refcnt is bumped when someone is writing to the log.
326 * - ic_state is the state of the iclog. 326 * - ic_state is the state of the iclog.
327 *
328 * Because of cacheline contention on large machines, we need to separate
329 * various resources onto different cachelines. To start with, make the
330 * structure cacheline aligned. The following fields can be contended on
331 * by independent processes:
332 *
333 * - ic_callback_*
334 * - ic_refcnt
335 * - fields protected by the global l_icloglock
336 *
337 * so we need to ensure that these fields are located in separate cachelines.
338 * We'll put all the read-only and l_icloglock fields in the first cacheline,
339 * and move everything else out to subsequent cachelines.
327 */ 340 */
328typedef struct xlog_iclog_fields { 341typedef struct xlog_iclog_fields {
329 sv_t ic_forcesema; 342 sv_t ic_forcesema;
@@ -332,17 +345,22 @@ typedef struct xlog_iclog_fields {
332 struct xlog_in_core *ic_prev; 345 struct xlog_in_core *ic_prev;
333 struct xfs_buf *ic_bp; 346 struct xfs_buf *ic_bp;
334 struct log *ic_log; 347 struct log *ic_log;
335 xfs_log_callback_t *ic_callback;
336 xfs_log_callback_t **ic_callback_tail;
337#ifdef XFS_LOG_TRACE
338 struct ktrace *ic_trace;
339#endif
340 int ic_size; 348 int ic_size;
341 int ic_offset; 349 int ic_offset;
342 int ic_refcnt;
343 int ic_bwritecnt; 350 int ic_bwritecnt;
344 ushort_t ic_state; 351 ushort_t ic_state;
345 char *ic_datap; /* pointer to iclog data */ 352 char *ic_datap; /* pointer to iclog data */
353#ifdef XFS_LOG_TRACE
354 struct ktrace *ic_trace;
355#endif
356
357 /* Callback structures need their own cacheline */
358 spinlock_t ic_callback_lock ____cacheline_aligned_in_smp;
359 xfs_log_callback_t *ic_callback;
360 xfs_log_callback_t **ic_callback_tail;
361
362 /* reference counts need their own cacheline */
363 atomic_t ic_refcnt ____cacheline_aligned_in_smp;
346} xlog_iclog_fields_t; 364} xlog_iclog_fields_t;
347 365
348typedef union xlog_in_core2 { 366typedef union xlog_in_core2 {
@@ -366,6 +384,7 @@ typedef struct xlog_in_core {
366#define ic_bp hic_fields.ic_bp 384#define ic_bp hic_fields.ic_bp
367#define ic_log hic_fields.ic_log 385#define ic_log hic_fields.ic_log
368#define ic_callback hic_fields.ic_callback 386#define ic_callback hic_fields.ic_callback
387#define ic_callback_lock hic_fields.ic_callback_lock
369#define ic_callback_tail hic_fields.ic_callback_tail 388#define ic_callback_tail hic_fields.ic_callback_tail
370#define ic_trace hic_fields.ic_trace 389#define ic_trace hic_fields.ic_trace
371#define ic_size hic_fields.ic_size 390#define ic_size hic_fields.ic_size
@@ -383,43 +402,46 @@ typedef struct xlog_in_core {
383 * that round off problems won't occur when releasing partial reservations. 402 * that round off problems won't occur when releasing partial reservations.
384 */ 403 */
385typedef struct log { 404typedef struct log {
405 /* The following fields don't need locking */
406 struct xfs_mount *l_mp; /* mount point */
407 struct xfs_buf *l_xbuf; /* extra buffer for log
408 * wrapping */
409 struct xfs_buftarg *l_targ; /* buftarg of log */
410 uint l_flags;
411 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
412 struct xfs_buf_cancel **l_buf_cancel_table;
413 int l_iclog_hsize; /* size of iclog header */
414 int l_iclog_heads; /* # of iclog header sectors */
415 uint l_sectbb_log; /* log2 of sector size in BBs */
416 uint l_sectbb_mask; /* sector size (in BBs)
417 * alignment mask */
418 int l_iclog_size; /* size of log in bytes */
419 int l_iclog_size_log; /* log power size of log */
420 int l_iclog_bufs; /* number of iclog buffers */
421 xfs_daddr_t l_logBBstart; /* start block of log */
422 int l_logsize; /* size of log in bytes */
423 int l_logBBsize; /* size of log in BB chunks */
424
386 /* The following block of fields are changed while holding icloglock */ 425 /* The following block of fields are changed while holding icloglock */
387 sema_t l_flushsema; /* iclog flushing semaphore */ 426 sema_t l_flushsema ____cacheline_aligned_in_smp;
427 /* iclog flushing semaphore */
388 int l_flushcnt; /* # of procs waiting on this 428 int l_flushcnt; /* # of procs waiting on this
389 * sema */ 429 * sema */
390 int l_ticket_cnt; /* free ticket count */
391 int l_ticket_tcnt; /* total ticket count */
392 int l_covered_state;/* state of "covering disk 430 int l_covered_state;/* state of "covering disk
393 * log entries" */ 431 * log entries" */
394 xlog_ticket_t *l_freelist; /* free list of tickets */
395 xlog_ticket_t *l_unmount_free;/* kmem_free these addresses */
396 xlog_ticket_t *l_tail; /* free list of tickets */
397 xlog_in_core_t *l_iclog; /* head log queue */ 432 xlog_in_core_t *l_iclog; /* head log queue */
398 spinlock_t l_icloglock; /* grab to change iclog state */ 433 spinlock_t l_icloglock; /* grab to change iclog state */
399 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed 434 xfs_lsn_t l_tail_lsn; /* lsn of 1st LR with unflushed
400 * buffers */ 435 * buffers */
401 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */ 436 xfs_lsn_t l_last_sync_lsn;/* lsn of last LR on disk */
402 struct xfs_mount *l_mp; /* mount point */
403 struct xfs_buf *l_xbuf; /* extra buffer for log
404 * wrapping */
405 struct xfs_buftarg *l_targ; /* buftarg of log */
406 xfs_daddr_t l_logBBstart; /* start block of log */
407 int l_logsize; /* size of log in bytes */
408 int l_logBBsize; /* size of log in BB chunks */
409 int l_curr_cycle; /* Cycle number of log writes */ 437 int l_curr_cycle; /* Cycle number of log writes */
410 int l_prev_cycle; /* Cycle number before last 438 int l_prev_cycle; /* Cycle number before last
411 * block increment */ 439 * block increment */
412 int l_curr_block; /* current logical log block */ 440 int l_curr_block; /* current logical log block */
413 int l_prev_block; /* previous logical log block */ 441 int l_prev_block; /* previous logical log block */
414 int l_iclog_size; /* size of log in bytes */
415 int l_iclog_size_log; /* log power size of log */
416 int l_iclog_bufs; /* number of iclog buffers */
417
418 /* The following field are used for debugging; need to hold icloglock */
419 char *l_iclog_bak[XLOG_MAX_ICLOGS];
420 442
421 /* The following block of fields are changed while holding grant_lock */ 443 /* The following block of fields are changed while holding grant_lock */
422 spinlock_t l_grant_lock; 444 spinlock_t l_grant_lock ____cacheline_aligned_in_smp;
423 xlog_ticket_t *l_reserve_headq; 445 xlog_ticket_t *l_reserve_headq;
424 xlog_ticket_t *l_write_headq; 446 xlog_ticket_t *l_write_headq;
425 int l_grant_reserve_cycle; 447 int l_grant_reserve_cycle;
@@ -427,19 +449,16 @@ typedef struct log {
427 int l_grant_write_cycle; 449 int l_grant_write_cycle;
428 int l_grant_write_bytes; 450 int l_grant_write_bytes;
429 451
430 /* The following fields don't need locking */
431#ifdef XFS_LOG_TRACE 452#ifdef XFS_LOG_TRACE
432 struct ktrace *l_trace; 453 struct ktrace *l_trace;
433 struct ktrace *l_grant_trace; 454 struct ktrace *l_grant_trace;
434#endif 455#endif
435 uint l_flags; 456
436 uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ 457 /* The following field are used for debugging; need to hold icloglock */
437 struct xfs_buf_cancel **l_buf_cancel_table; 458#ifdef DEBUG
438 int l_iclog_hsize; /* size of iclog header */ 459 char *l_iclog_bak[XLOG_MAX_ICLOGS];
439 int l_iclog_heads; /* # of iclog header sectors */ 460#endif
440 uint l_sectbb_log; /* log2 of sector size in BBs */ 461
441 uint l_sectbb_mask; /* sector size (in BBs)
442 * alignment mask */
443} xlog_t; 462} xlog_t;
444 463
445#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR) 464#define XLOG_FORCED_SHUTDOWN(log) ((log)->l_flags & XLOG_IO_ERROR)
@@ -459,6 +478,8 @@ extern struct xfs_buf *xlog_get_bp(xlog_t *, int);
459extern void xlog_put_bp(struct xfs_buf *); 478extern void xlog_put_bp(struct xfs_buf *);
460extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *); 479extern int xlog_bread(xlog_t *, xfs_daddr_t, int, struct xfs_buf *);
461 480
481extern kmem_zone_t *xfs_log_ticket_zone;
482
462/* iclog tracing */ 483/* iclog tracing */
463#define XLOG_TRACE_GRAB_FLUSH 1 484#define XLOG_TRACE_GRAB_FLUSH 1
464#define XLOG_TRACE_REL_FLUSH 2 485#define XLOG_TRACE_REL_FLUSH 2
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index b2b70eba282c..e65ab4af0955 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -46,6 +46,7 @@
46#include "xfs_trans_priv.h" 46#include "xfs_trans_priv.h"
47#include "xfs_quota.h" 47#include "xfs_quota.h"
48#include "xfs_rw.h" 48#include "xfs_rw.h"
49#include "xfs_utils.h"
49 50
50STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *); 51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
51STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t); 52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
@@ -120,7 +121,8 @@ xlog_bread(
120 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp); 121 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
121 122
122 xfsbdstrat(log->l_mp, bp); 123 xfsbdstrat(log->l_mp, bp);
123 if ((error = xfs_iowait(bp))) 124 error = xfs_iowait(bp);
125 if (error)
124 xfs_ioerror_alert("xlog_bread", log->l_mp, 126 xfs_ioerror_alert("xlog_bread", log->l_mp,
125 bp, XFS_BUF_ADDR(bp)); 127 bp, XFS_BUF_ADDR(bp));
126 return error; 128 return error;
@@ -191,7 +193,7 @@ xlog_header_check_dump(
191{ 193{
192 int b; 194 int b;
193 195
194 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __FUNCTION__); 196 cmn_err(CE_DEBUG, "%s: SB : uuid = ", __func__);
195 for (b = 0; b < 16; b++) 197 for (b = 0; b < 16; b++)
196 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]); 198 cmn_err(CE_DEBUG, "%02x", ((uchar_t *)&mp->m_sb.sb_uuid)[b]);
197 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT); 199 cmn_err(CE_DEBUG, ", fmt = %d\n", XLOG_FMT);
@@ -1160,10 +1162,14 @@ xlog_write_log_records(
1160 if (j == 0 && (start_block + endcount > ealign)) { 1162 if (j == 0 && (start_block + endcount > ealign)) {
1161 offset = XFS_BUF_PTR(bp); 1163 offset = XFS_BUF_PTR(bp);
1162 balign = BBTOB(ealign - start_block); 1164 balign = BBTOB(ealign - start_block);
1163 XFS_BUF_SET_PTR(bp, offset + balign, BBTOB(sectbb)); 1165 error = XFS_BUF_SET_PTR(bp, offset + balign,
1164 if ((error = xlog_bread(log, ealign, sectbb, bp))) 1166 BBTOB(sectbb));
1167 if (!error)
1168 error = xlog_bread(log, ealign, sectbb, bp);
1169 if (!error)
1170 error = XFS_BUF_SET_PTR(bp, offset, bufblks);
1171 if (error)
1165 break; 1172 break;
1166 XFS_BUF_SET_PTR(bp, offset, bufblks);
1167 } 1173 }
1168 1174
1169 offset = xlog_align(log, start_block, endcount, bp); 1175 offset = xlog_align(log, start_block, endcount, bp);
@@ -2280,7 +2286,9 @@ xlog_recover_do_inode_trans(
2280 * invalidate the buffer when we write it out below. 2286 * invalidate the buffer when we write it out below.
2281 */ 2287 */
2282 imap.im_blkno = 0; 2288 imap.im_blkno = 0;
2283 xfs_imap(log->l_mp, NULL, ino, &imap, 0); 2289 error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
2290 if (error)
2291 goto error;
2284 } 2292 }
2285 2293
2286 /* 2294 /*
@@ -2964,7 +2972,7 @@ xlog_recover_process_data(
2964 * Process an extent free intent item that was recovered from 2972 * Process an extent free intent item that was recovered from
2965 * the log. We need to free the extents that it describes. 2973 * the log. We need to free the extents that it describes.
2966 */ 2974 */
2967STATIC void 2975STATIC int
2968xlog_recover_process_efi( 2976xlog_recover_process_efi(
2969 xfs_mount_t *mp, 2977 xfs_mount_t *mp,
2970 xfs_efi_log_item_t *efip) 2978 xfs_efi_log_item_t *efip)
@@ -2972,6 +2980,7 @@ xlog_recover_process_efi(
2972 xfs_efd_log_item_t *efdp; 2980 xfs_efd_log_item_t *efdp;
2973 xfs_trans_t *tp; 2981 xfs_trans_t *tp;
2974 int i; 2982 int i;
2983 int error = 0;
2975 xfs_extent_t *extp; 2984 xfs_extent_t *extp;
2976 xfs_fsblock_t startblock_fsb; 2985 xfs_fsblock_t startblock_fsb;
2977 2986
@@ -2995,23 +3004,32 @@ xlog_recover_process_efi(
2995 * free the memory associated with it. 3004 * free the memory associated with it.
2996 */ 3005 */
2997 xfs_efi_release(efip, efip->efi_format.efi_nextents); 3006 xfs_efi_release(efip, efip->efi_format.efi_nextents);
2998 return; 3007 return XFS_ERROR(EIO);
2999 } 3008 }
3000 } 3009 }
3001 3010
3002 tp = xfs_trans_alloc(mp, 0); 3011 tp = xfs_trans_alloc(mp, 0);
3003 xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0); 3012 error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
3013 if (error)
3014 goto abort_error;
3004 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents); 3015 efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
3005 3016
3006 for (i = 0; i < efip->efi_format.efi_nextents; i++) { 3017 for (i = 0; i < efip->efi_format.efi_nextents; i++) {
3007 extp = &(efip->efi_format.efi_extents[i]); 3018 extp = &(efip->efi_format.efi_extents[i]);
3008 xfs_free_extent(tp, extp->ext_start, extp->ext_len); 3019 error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
3020 if (error)
3021 goto abort_error;
3009 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start, 3022 xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
3010 extp->ext_len); 3023 extp->ext_len);
3011 } 3024 }
3012 3025
3013 efip->efi_flags |= XFS_EFI_RECOVERED; 3026 efip->efi_flags |= XFS_EFI_RECOVERED;
3014 xfs_trans_commit(tp, 0); 3027 error = xfs_trans_commit(tp, 0);
3028 return error;
3029
3030abort_error:
3031 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3032 return error;
3015} 3033}
3016 3034
3017/* 3035/*
@@ -3059,7 +3077,7 @@ xlog_recover_check_ail(
3059 * everything already in the AIL, we stop processing as soon as 3077 * everything already in the AIL, we stop processing as soon as
3060 * we see something other than an EFI in the AIL. 3078 * we see something other than an EFI in the AIL.
3061 */ 3079 */
3062STATIC void 3080STATIC int
3063xlog_recover_process_efis( 3081xlog_recover_process_efis(
3064 xlog_t *log) 3082 xlog_t *log)
3065{ 3083{
@@ -3067,6 +3085,7 @@ xlog_recover_process_efis(
3067 xfs_efi_log_item_t *efip; 3085 xfs_efi_log_item_t *efip;
3068 int gen; 3086 int gen;
3069 xfs_mount_t *mp; 3087 xfs_mount_t *mp;
3088 int error = 0;
3070 3089
3071 mp = log->l_mp; 3090 mp = log->l_mp;
3072 spin_lock(&mp->m_ail_lock); 3091 spin_lock(&mp->m_ail_lock);
@@ -3091,11 +3110,14 @@ xlog_recover_process_efis(
3091 } 3110 }
3092 3111
3093 spin_unlock(&mp->m_ail_lock); 3112 spin_unlock(&mp->m_ail_lock);
3094 xlog_recover_process_efi(mp, efip); 3113 error = xlog_recover_process_efi(mp, efip);
3114 if (error)
3115 return error;
3095 spin_lock(&mp->m_ail_lock); 3116 spin_lock(&mp->m_ail_lock);
3096 lip = xfs_trans_next_ail(mp, lip, &gen, NULL); 3117 lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
3097 } 3118 }
3098 spin_unlock(&mp->m_ail_lock); 3119 spin_unlock(&mp->m_ail_lock);
3120 return error;
3099} 3121}
3100 3122
3101/* 3123/*
@@ -3115,21 +3137,18 @@ xlog_recover_clear_agi_bucket(
3115 int error; 3137 int error;
3116 3138
3117 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET); 3139 tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3118 xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0); 3140 error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
3119 3141 if (!error)
3120 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3142 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3121 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), 3143 XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
3122 XFS_FSS_TO_BB(mp, 1), 0, &agibp); 3144 XFS_FSS_TO_BB(mp, 1), 0, &agibp);
3123 if (error) { 3145 if (error)
3124 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3146 goto out_abort;
3125 return;
3126 }
3127 3147
3148 error = EINVAL;
3128 agi = XFS_BUF_TO_AGI(agibp); 3149 agi = XFS_BUF_TO_AGI(agibp);
3129 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC) { 3150 if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
3130 xfs_trans_cancel(tp, XFS_TRANS_ABORT); 3151 goto out_abort;
3131 return;
3132 }
3133 3152
3134 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO); 3153 agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3135 offset = offsetof(xfs_agi_t, agi_unlinked) + 3154 offset = offsetof(xfs_agi_t, agi_unlinked) +
@@ -3137,7 +3156,17 @@ xlog_recover_clear_agi_bucket(
3137 xfs_trans_log_buf(tp, agibp, offset, 3156 xfs_trans_log_buf(tp, agibp, offset,
3138 (offset + sizeof(xfs_agino_t) - 1)); 3157 (offset + sizeof(xfs_agino_t) - 1));
3139 3158
3140 (void) xfs_trans_commit(tp, 0); 3159 error = xfs_trans_commit(tp, 0);
3160 if (error)
3161 goto out_error;
3162 return;
3163
3164out_abort:
3165 xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3166out_error:
3167 xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
3168 "failed to clear agi %d. Continuing.", agno);
3169 return;
3141} 3170}
3142 3171
3143/* 3172/*
@@ -3214,7 +3243,8 @@ xlog_recover_process_iunlinks(
3214 * next inode in the bucket. 3243 * next inode in the bucket.
3215 */ 3244 */
3216 error = xfs_itobp(mp, NULL, ip, &dip, 3245 error = xfs_itobp(mp, NULL, ip, &dip,
3217 &ibp, 0, 0); 3246 &ibp, 0, 0,
3247 XFS_BUF_LOCK);
3218 ASSERT(error || (dip != NULL)); 3248 ASSERT(error || (dip != NULL));
3219 } 3249 }
3220 3250
@@ -3247,7 +3277,7 @@ xlog_recover_process_iunlinks(
3247 if (ip->i_d.di_mode == 0) 3277 if (ip->i_d.di_mode == 0)
3248 xfs_iput_new(ip, 0); 3278 xfs_iput_new(ip, 0);
3249 else 3279 else
3250 VN_RELE(XFS_ITOV(ip)); 3280 IRELE(ip);
3251 } else { 3281 } else {
3252 /* 3282 /*
3253 * We can't read in the inode 3283 * We can't read in the inode
@@ -3445,7 +3475,7 @@ xlog_valid_rec_header(
3445 (!rhead->h_version || 3475 (!rhead->h_version ||
3446 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) { 3476 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3447 xlog_warn("XFS: %s: unrecognised log version (%d).", 3477 xlog_warn("XFS: %s: unrecognised log version (%d).",
3448 __FUNCTION__, be32_to_cpu(rhead->h_version)); 3478 __func__, be32_to_cpu(rhead->h_version));
3449 return XFS_ERROR(EIO); 3479 return XFS_ERROR(EIO);
3450 } 3480 }
3451 3481
@@ -3604,15 +3634,19 @@ xlog_do_recovery_pass(
3604 * _first_, then the log start (LR header end) 3634 * _first_, then the log start (LR header end)
3605 * - order is important. 3635 * - order is important.
3606 */ 3636 */
3637 wrapped_hblks = hblks - split_hblks;
3607 bufaddr = XFS_BUF_PTR(hbp); 3638 bufaddr = XFS_BUF_PTR(hbp);
3608 XFS_BUF_SET_PTR(hbp, 3639 error = XFS_BUF_SET_PTR(hbp,
3609 bufaddr + BBTOB(split_hblks), 3640 bufaddr + BBTOB(split_hblks),
3610 BBTOB(hblks - split_hblks)); 3641 BBTOB(hblks - split_hblks));
3611 wrapped_hblks = hblks - split_hblks; 3642 if (!error)
3612 error = xlog_bread(log, 0, wrapped_hblks, hbp); 3643 error = xlog_bread(log, 0,
3644 wrapped_hblks, hbp);
3645 if (!error)
3646 error = XFS_BUF_SET_PTR(hbp, bufaddr,
3647 BBTOB(hblks));
3613 if (error) 3648 if (error)
3614 goto bread_err2; 3649 goto bread_err2;
3615 XFS_BUF_SET_PTR(hbp, bufaddr, BBTOB(hblks));
3616 if (!offset) 3650 if (!offset)
3617 offset = xlog_align(log, 0, 3651 offset = xlog_align(log, 0,
3618 wrapped_hblks, hbp); 3652 wrapped_hblks, hbp);
@@ -3664,13 +3698,18 @@ xlog_do_recovery_pass(
3664 * - order is important. 3698 * - order is important.
3665 */ 3699 */
3666 bufaddr = XFS_BUF_PTR(dbp); 3700 bufaddr = XFS_BUF_PTR(dbp);
3667 XFS_BUF_SET_PTR(dbp, 3701 error = XFS_BUF_SET_PTR(dbp,
3668 bufaddr + BBTOB(split_bblks), 3702 bufaddr + BBTOB(split_bblks),
3669 BBTOB(bblks - split_bblks)); 3703 BBTOB(bblks - split_bblks));
3670 if ((error = xlog_bread(log, wrapped_hblks, 3704 if (!error)
3671 bblks - split_bblks, dbp))) 3705 error = xlog_bread(log, wrapped_hblks,
3706 bblks - split_bblks,
3707 dbp);
3708 if (!error)
3709 error = XFS_BUF_SET_PTR(dbp, bufaddr,
3710 h_size);
3711 if (error)
3672 goto bread_err2; 3712 goto bread_err2;
3673 XFS_BUF_SET_PTR(dbp, bufaddr, h_size);
3674 if (!offset) 3713 if (!offset)
3675 offset = xlog_align(log, wrapped_hblks, 3714 offset = xlog_align(log, wrapped_hblks,
3676 bblks - split_bblks, dbp); 3715 bblks - split_bblks, dbp);
@@ -3826,7 +3865,8 @@ xlog_do_recover(
3826 XFS_BUF_READ(bp); 3865 XFS_BUF_READ(bp);
3827 XFS_BUF_UNASYNC(bp); 3866 XFS_BUF_UNASYNC(bp);
3828 xfsbdstrat(log->l_mp, bp); 3867 xfsbdstrat(log->l_mp, bp);
3829 if ((error = xfs_iowait(bp))) { 3868 error = xfs_iowait(bp);
3869 if (error) {
3830 xfs_ioerror_alert("xlog_do_recover", 3870 xfs_ioerror_alert("xlog_do_recover",
3831 log->l_mp, bp, XFS_BUF_ADDR(bp)); 3871 log->l_mp, bp, XFS_BUF_ADDR(bp));
3832 ASSERT(0); 3872 ASSERT(0);
@@ -3917,7 +3957,14 @@ xlog_recover_finish(
3917 * rather than accepting new requests. 3957 * rather than accepting new requests.
3918 */ 3958 */
3919 if (log->l_flags & XLOG_RECOVERY_NEEDED) { 3959 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3920 xlog_recover_process_efis(log); 3960 int error;
3961 error = xlog_recover_process_efis(log);
3962 if (error) {
3963 cmn_err(CE_ALERT,
3964 "Failed to recover EFIs on filesystem: %s",
3965 log->l_mp->m_fsname);
3966 return error;
3967 }
3921 /* 3968 /*
3922 * Sync the log to get all the EFIs out of the AIL. 3969 * Sync the log to get all the EFIs out of the AIL.
3923 * This isn't absolutely necessary, but it helps in 3970 * This isn't absolutely necessary, but it helps in
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 8ed164eb9544..2fec452afbcc 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,8 +43,9 @@
43#include "xfs_rw.h" 43#include "xfs_rw.h"
44#include "xfs_quota.h" 44#include "xfs_quota.h"
45#include "xfs_fsops.h" 45#include "xfs_fsops.h"
46#include "xfs_utils.h"
46 47
47STATIC void xfs_mount_log_sb(xfs_mount_t *, __int64_t); 48STATIC int xfs_mount_log_sb(xfs_mount_t *, __int64_t);
48STATIC int xfs_uuid_mount(xfs_mount_t *); 49STATIC int xfs_uuid_mount(xfs_mount_t *);
49STATIC void xfs_uuid_unmount(xfs_mount_t *mp); 50STATIC void xfs_uuid_unmount(xfs_mount_t *mp);
50STATIC void xfs_unmountfs_wait(xfs_mount_t *); 51STATIC void xfs_unmountfs_wait(xfs_mount_t *);
@@ -57,7 +58,7 @@ STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t,
57STATIC void xfs_icsb_sync_counters(xfs_mount_t *); 58STATIC void xfs_icsb_sync_counters(xfs_mount_t *);
58STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, 59STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t,
59 int64_t, int); 60 int64_t, int);
60STATIC int xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); 61STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t);
61 62
62#else 63#else
63 64
@@ -956,7 +957,6 @@ xfs_mountfs(
956{ 957{
957 xfs_sb_t *sbp = &(mp->m_sb); 958 xfs_sb_t *sbp = &(mp->m_sb);
958 xfs_inode_t *rip; 959 xfs_inode_t *rip;
959 bhv_vnode_t *rvp = NULL;
960 __uint64_t resblks; 960 __uint64_t resblks;
961 __int64_t update_flags = 0LL; 961 __int64_t update_flags = 0LL;
962 uint quotamount, quotaflags; 962 uint quotamount, quotaflags;
@@ -964,11 +964,6 @@ xfs_mountfs(
964 int uuid_mounted = 0; 964 int uuid_mounted = 0;
965 int error = 0; 965 int error = 0;
966 966
967 if (mp->m_sb_bp == NULL) {
968 error = xfs_readsb(mp, mfsi_flags);
969 if (error)
970 return error;
971 }
972 xfs_mount_common(mp, sbp); 967 xfs_mount_common(mp, sbp);
973 968
974 /* 969 /*
@@ -1163,7 +1158,6 @@ xfs_mountfs(
1163 } 1158 }
1164 1159
1165 ASSERT(rip != NULL); 1160 ASSERT(rip != NULL);
1166 rvp = XFS_ITOV(rip);
1167 1161
1168 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) { 1162 if (unlikely((rip->i_d.di_mode & S_IFMT) != S_IFDIR)) {
1169 cmn_err(CE_WARN, "XFS: corrupted root inode"); 1163 cmn_err(CE_WARN, "XFS: corrupted root inode");
@@ -1195,8 +1189,13 @@ xfs_mountfs(
1195 /* 1189 /*
1196 * If fs is not mounted readonly, then update the superblock changes. 1190 * If fs is not mounted readonly, then update the superblock changes.
1197 */ 1191 */
1198 if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) 1192 if (update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
1199 xfs_mount_log_sb(mp, update_flags); 1193 error = xfs_mount_log_sb(mp, update_flags);
1194 if (error) {
1195 cmn_err(CE_WARN, "XFS: failed to write sb changes");
1196 goto error4;
1197 }
1198 }
1200 1199
1201 /* 1200 /*
1202 * Initialise the XFS quota management subsystem for this mount 1201 * Initialise the XFS quota management subsystem for this mount
@@ -1233,12 +1232,15 @@ xfs_mountfs(
1233 * 1232 *
1234 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. 1233 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
1235 * This may drive us straight to ENOSPC on mount, but that implies 1234 * This may drive us straight to ENOSPC on mount, but that implies
1236 * we were already there on the last unmount. 1235 * we were already there on the last unmount. Warn if this occurs.
1237 */ 1236 */
1238 resblks = mp->m_sb.sb_dblocks; 1237 resblks = mp->m_sb.sb_dblocks;
1239 do_div(resblks, 20); 1238 do_div(resblks, 20);
1240 resblks = min_t(__uint64_t, resblks, 1024); 1239 resblks = min_t(__uint64_t, resblks, 1024);
1241 xfs_reserve_blocks(mp, &resblks, NULL); 1240 error = xfs_reserve_blocks(mp, &resblks, NULL);
1241 if (error)
1242 cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
1243 "Continuing without a reserve pool.");
1242 1244
1243 return 0; 1245 return 0;
1244 1246
@@ -1246,7 +1248,7 @@ xfs_mountfs(
1246 /* 1248 /*
1247 * Free up the root inode. 1249 * Free up the root inode.
1248 */ 1250 */
1249 VN_RELE(rvp); 1251 IRELE(rip);
1250 error3: 1252 error3:
1251 xfs_log_unmount_dealloc(mp); 1253 xfs_log_unmount_dealloc(mp);
1252 error2: 1254 error2:
@@ -1274,6 +1276,7 @@ int
1274xfs_unmountfs(xfs_mount_t *mp, struct cred *cr) 1276xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1275{ 1277{
1276 __uint64_t resblks; 1278 __uint64_t resblks;
1279 int error = 0;
1277 1280
1278 /* 1281 /*
1279 * We can potentially deadlock here if we have an inode cluster 1282 * We can potentially deadlock here if we have an inode cluster
@@ -1317,9 +1320,15 @@ xfs_unmountfs(xfs_mount_t *mp, struct cred *cr)
1317 * value does not matter.... 1320 * value does not matter....
1318 */ 1321 */
1319 resblks = 0; 1322 resblks = 0;
1320 xfs_reserve_blocks(mp, &resblks, NULL); 1323 error = xfs_reserve_blocks(mp, &resblks, NULL);
1324 if (error)
1325 cmn_err(CE_WARN, "XFS: Unable to free reserved block pool. "
1326 "Freespace may not be correct on next mount.");
1321 1327
1322 xfs_log_sbcount(mp, 1); 1328 error = xfs_log_sbcount(mp, 1);
1329 if (error)
1330 cmn_err(CE_WARN, "XFS: Unable to update superblock counters. "
1331 "Freespace may not be correct on next mount.");
1323 xfs_unmountfs_writesb(mp); 1332 xfs_unmountfs_writesb(mp);
1324 xfs_unmountfs_wait(mp); /* wait for async bufs */ 1333 xfs_unmountfs_wait(mp); /* wait for async bufs */
1325 xfs_log_unmount(mp); /* Done! No more fs ops. */ 1334 xfs_log_unmount(mp); /* Done! No more fs ops. */
@@ -1411,9 +1420,8 @@ xfs_log_sbcount(
1411 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS); 1420 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1412 if (sync) 1421 if (sync)
1413 xfs_trans_set_sync(tp); 1422 xfs_trans_set_sync(tp);
1414 xfs_trans_commit(tp, 0); 1423 error = xfs_trans_commit(tp, 0);
1415 1424 return error;
1416 return 0;
1417} 1425}
1418 1426
1419STATIC void 1427STATIC void
@@ -1462,7 +1470,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
1462 XFS_BUF_UNASYNC(sbp); 1470 XFS_BUF_UNASYNC(sbp);
1463 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp); 1471 ASSERT(XFS_BUF_TARGET(sbp) == mp->m_ddev_targp);
1464 xfsbdstrat(mp, sbp); 1472 xfsbdstrat(mp, sbp);
1465 /* Nevermind errors we might get here. */
1466 error = xfs_iowait(sbp); 1473 error = xfs_iowait(sbp);
1467 if (error) 1474 if (error)
1468 xfs_ioerror_alert("xfs_unmountfs_writesb", 1475 xfs_ioerror_alert("xfs_unmountfs_writesb",
@@ -1911,24 +1918,27 @@ xfs_uuid_unmount(
1911 * be altered by the mount options, as well as any potential sb_features2 1918 * be altered by the mount options, as well as any potential sb_features2
1912 * fixup. Only the first superblock is updated. 1919 * fixup. Only the first superblock is updated.
1913 */ 1920 */
1914STATIC void 1921STATIC int
1915xfs_mount_log_sb( 1922xfs_mount_log_sb(
1916 xfs_mount_t *mp, 1923 xfs_mount_t *mp,
1917 __int64_t fields) 1924 __int64_t fields)
1918{ 1925{
1919 xfs_trans_t *tp; 1926 xfs_trans_t *tp;
1927 int error;
1920 1928
1921 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID | 1929 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1922 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2)); 1930 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2));
1923 1931
1924 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); 1932 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1925 if (xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, 1933 error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0,
1926 XFS_DEFAULT_LOG_COUNT)) { 1934 XFS_DEFAULT_LOG_COUNT);
1935 if (error) {
1927 xfs_trans_cancel(tp, 0); 1936 xfs_trans_cancel(tp, 0);
1928 return; 1937 return error;
1929 } 1938 }
1930 xfs_mod_sb(tp, fields); 1939 xfs_mod_sb(tp, fields);
1931 xfs_trans_commit(tp, 0); 1940 error = xfs_trans_commit(tp, 0);
1941 return error;
1932} 1942}
1933 1943
1934 1944
@@ -2189,7 +2199,7 @@ xfs_icsb_counter_disabled(
2189 return test_bit(field, &mp->m_icsb_counters); 2199 return test_bit(field, &mp->m_icsb_counters);
2190} 2200}
2191 2201
2192STATIC int 2202STATIC void
2193xfs_icsb_disable_counter( 2203xfs_icsb_disable_counter(
2194 xfs_mount_t *mp, 2204 xfs_mount_t *mp,
2195 xfs_sb_field_t field) 2205 xfs_sb_field_t field)
@@ -2207,7 +2217,7 @@ xfs_icsb_disable_counter(
2207 * the m_icsb_mutex. 2217 * the m_icsb_mutex.
2208 */ 2218 */
2209 if (xfs_icsb_counter_disabled(mp, field)) 2219 if (xfs_icsb_counter_disabled(mp, field))
2210 return 0; 2220 return;
2211 2221
2212 xfs_icsb_lock_all_counters(mp); 2222 xfs_icsb_lock_all_counters(mp);
2213 if (!test_and_set_bit(field, &mp->m_icsb_counters)) { 2223 if (!test_and_set_bit(field, &mp->m_icsb_counters)) {
@@ -2230,8 +2240,6 @@ xfs_icsb_disable_counter(
2230 } 2240 }
2231 2241
2232 xfs_icsb_unlock_all_counters(mp); 2242 xfs_icsb_unlock_all_counters(mp);
2233
2234 return 0;
2235} 2243}
2236 2244
2237STATIC void 2245STATIC void
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 1d8a4728d847..1ed575110ff0 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -66,17 +66,17 @@ struct xfs_mru_cache;
66 * Prototypes and functions for the Data Migration subsystem. 66 * Prototypes and functions for the Data Migration subsystem.
67 */ 67 */
68 68
69typedef int (*xfs_send_data_t)(int, bhv_vnode_t *, 69typedef int (*xfs_send_data_t)(int, struct xfs_inode *,
70 xfs_off_t, size_t, int, bhv_vrwlock_t *); 70 xfs_off_t, size_t, int, int *);
71typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint); 71typedef int (*xfs_send_mmap_t)(struct vm_area_struct *, uint);
72typedef int (*xfs_send_destroy_t)(bhv_vnode_t *, dm_right_t); 72typedef int (*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
73typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *, 73typedef int (*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
74 bhv_vnode_t *, 74 struct xfs_inode *, dm_right_t,
75 dm_right_t, bhv_vnode_t *, dm_right_t, 75 struct xfs_inode *, dm_right_t,
76 char *, char *, mode_t, int, int); 76 const char *, const char *, mode_t, int, int);
77typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t, 77typedef int (*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
78 char *, char *); 78 char *, char *);
79typedef void (*xfs_send_unmount_t)(struct xfs_mount *, bhv_vnode_t *, 79typedef void (*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
80 dm_right_t, mode_t, int, int); 80 dm_right_t, mode_t, int, int);
81 81
82typedef struct xfs_dmops { 82typedef struct xfs_dmops {
@@ -88,20 +88,20 @@ typedef struct xfs_dmops {
88 xfs_send_unmount_t xfs_send_unmount; 88 xfs_send_unmount_t xfs_send_unmount;
89} xfs_dmops_t; 89} xfs_dmops_t;
90 90
91#define XFS_SEND_DATA(mp, ev,vp,off,len,fl,lock) \ 91#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
92 (*(mp)->m_dm_ops->xfs_send_data)(ev,vp,off,len,fl,lock) 92 (*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
93#define XFS_SEND_MMAP(mp, vma,fl) \ 93#define XFS_SEND_MMAP(mp, vma,fl) \
94 (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl) 94 (*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
95#define XFS_SEND_DESTROY(mp, vp,right) \ 95#define XFS_SEND_DESTROY(mp, ip,right) \
96 (*(mp)->m_dm_ops->xfs_send_destroy)(vp,right) 96 (*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
97#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ 97#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
98 (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl) 98 (*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
99#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \ 99#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
100 (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) 100 (*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
101#define XFS_SEND_MOUNT(mp,right,path,name) \ 101#define XFS_SEND_MOUNT(mp,right,path,name) \
102 (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name) 102 (*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
103#define XFS_SEND_UNMOUNT(mp, vp,right,mode,rval,fl) \ 103#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
104 (*(mp)->m_dm_ops->xfs_send_unmount)(mp,vp,right,mode,rval,fl) 104 (*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
105 105
106 106
107/* 107/*
@@ -220,7 +220,7 @@ extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int);
220#endif 220#endif
221 221
222typedef struct xfs_ail { 222typedef struct xfs_ail {
223 xfs_ail_entry_t xa_ail; 223 struct list_head xa_ail;
224 uint xa_gen; 224 uint xa_gen;
225 struct task_struct *xa_task; 225 struct task_struct *xa_task;
226 xfs_lsn_t xa_target; 226 xfs_lsn_t xa_target;
@@ -401,7 +401,7 @@ typedef struct xfs_mount {
401 401
402/* 402/*
403 * Allow large block sizes to be reported to userspace programs if the 403 * Allow large block sizes to be reported to userspace programs if the
404 * "largeio" mount option is used. 404 * "largeio" mount option is used.
405 * 405 *
406 * If compatibility mode is specified, simply return the basic unit of caching 406 * If compatibility mode is specified, simply return the basic unit of caching
407 * so that we don't get inefficient read/modify/write I/O from user apps. 407 * so that we don't get inefficient read/modify/write I/O from user apps.
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index 7eb157a59f9e..ee371890d85d 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -36,7 +36,6 @@
36#include "xfs_bmap.h" 36#include "xfs_bmap.h"
37#include "xfs_error.h" 37#include "xfs_error.h"
38#include "xfs_quota.h" 38#include "xfs_quota.h"
39#include "xfs_refcache.h"
40#include "xfs_utils.h" 39#include "xfs_utils.h"
41#include "xfs_trans_space.h" 40#include "xfs_trans_space.h"
42#include "xfs_vnodeops.h" 41#include "xfs_vnodeops.h"
@@ -84,25 +83,23 @@ int xfs_rename_skip, xfs_rename_nskip;
84 */ 83 */
85STATIC int 84STATIC int
86xfs_lock_for_rename( 85xfs_lock_for_rename(
87 xfs_inode_t *dp1, /* old (source) directory inode */ 86 xfs_inode_t *dp1, /* in: old (source) directory inode */
88 xfs_inode_t *dp2, /* new (target) directory inode */ 87 xfs_inode_t *dp2, /* in: new (target) directory inode */
89 bhv_vname_t *vname1,/* old entry name */ 88 xfs_inode_t *ip1, /* in: inode of old entry */
90 bhv_vname_t *vname2,/* new entry name */ 89 struct xfs_name *name2, /* in: new entry name */
91 xfs_inode_t **ipp1, /* inode of old entry */ 90 xfs_inode_t **ipp2, /* out: inode of new entry, if it
92 xfs_inode_t **ipp2, /* inode of new entry, if it
93 already exists, NULL otherwise. */ 91 already exists, NULL otherwise. */
94 xfs_inode_t **i_tab,/* array of inode returned, sorted */ 92 xfs_inode_t **i_tab,/* out: array of inode returned, sorted */
95 int *num_inodes) /* number of inodes in array */ 93 int *num_inodes) /* out: number of inodes in array */
96{ 94{
97 xfs_inode_t *ip1, *ip2, *temp; 95 xfs_inode_t *ip2 = NULL;
96 xfs_inode_t *temp;
98 xfs_ino_t inum1, inum2; 97 xfs_ino_t inum1, inum2;
99 int error; 98 int error;
100 int i, j; 99 int i, j;
101 uint lock_mode; 100 uint lock_mode;
102 int diff_dirs = (dp1 != dp2); 101 int diff_dirs = (dp1 != dp2);
103 102
104 ip2 = NULL;
105
106 /* 103 /*
107 * First, find out the current inums of the entries so that we 104 * First, find out the current inums of the entries so that we
108 * can determine the initial locking order. We'll have to 105 * can determine the initial locking order. We'll have to
@@ -110,27 +107,20 @@ xfs_lock_for_rename(
110 * to see if we still have the right inodes, directories, etc. 107 * to see if we still have the right inodes, directories, etc.
111 */ 108 */
112 lock_mode = xfs_ilock_map_shared(dp1); 109 lock_mode = xfs_ilock_map_shared(dp1);
113 error = xfs_get_dir_entry(vname1, &ip1); 110 IHOLD(ip1);
114 if (error) { 111 xfs_itrace_ref(ip1);
115 xfs_iunlock_map_shared(dp1, lock_mode);
116 return error;
117 }
118 112
119 inum1 = ip1->i_ino; 113 inum1 = ip1->i_ino;
120 114
121 ASSERT(ip1);
122 xfs_itrace_ref(ip1);
123
124 /* 115 /*
125 * Unlock dp1 and lock dp2 if they are different. 116 * Unlock dp1 and lock dp2 if they are different.
126 */ 117 */
127
128 if (diff_dirs) { 118 if (diff_dirs) {
129 xfs_iunlock_map_shared(dp1, lock_mode); 119 xfs_iunlock_map_shared(dp1, lock_mode);
130 lock_mode = xfs_ilock_map_shared(dp2); 120 lock_mode = xfs_ilock_map_shared(dp2);
131 } 121 }
132 122
133 error = xfs_dir_lookup_int(dp2, lock_mode, vname2, &inum2, &ip2); 123 error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2);
134 if (error == ENOENT) { /* target does not need to exist. */ 124 if (error == ENOENT) { /* target does not need to exist. */
135 inum2 = 0; 125 inum2 = 0;
136 } else if (error) { 126 } else if (error) {
@@ -162,6 +152,7 @@ xfs_lock_for_rename(
162 *num_inodes = 4; 152 *num_inodes = 4;
163 i_tab[3] = ip2; 153 i_tab[3] = ip2;
164 } 154 }
155 *ipp2 = i_tab[3];
165 156
166 /* 157 /*
167 * Sort the elements via bubble sort. (Remember, there are at 158 * Sort the elements via bubble sort. (Remember, there are at
@@ -199,21 +190,6 @@ xfs_lock_for_rename(
199 xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); 190 xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED);
200 } 191 }
201 192
202 /*
203 * Set the return value. Null out any unused entries in i_tab.
204 */
205 *ipp1 = *ipp2 = NULL;
206 for (i=0; i < *num_inodes; i++) {
207 if (i_tab[i]->i_ino == inum1) {
208 *ipp1 = i_tab[i];
209 }
210 if (i_tab[i]->i_ino == inum2) {
211 *ipp2 = i_tab[i];
212 }
213 }
214 for (;i < 4; i++) {
215 i_tab[i] = NULL;
216 }
217 return 0; 193 return 0;
218} 194}
219 195
@@ -223,13 +199,13 @@ xfs_lock_for_rename(
223int 199int
224xfs_rename( 200xfs_rename(
225 xfs_inode_t *src_dp, 201 xfs_inode_t *src_dp,
226 bhv_vname_t *src_vname, 202 struct xfs_name *src_name,
227 bhv_vnode_t *target_dir_vp, 203 xfs_inode_t *src_ip,
228 bhv_vname_t *target_vname) 204 xfs_inode_t *target_dp,
205 struct xfs_name *target_name)
229{ 206{
230 bhv_vnode_t *src_dir_vp = XFS_ITOV(src_dp);
231 xfs_trans_t *tp; 207 xfs_trans_t *tp;
232 xfs_inode_t *target_dp, *src_ip, *target_ip; 208 xfs_inode_t *target_ip;
233 xfs_mount_t *mp = src_dp->i_mount; 209 xfs_mount_t *mp = src_dp->i_mount;
234 int new_parent; /* moving to a new dir */ 210 int new_parent; /* moving to a new dir */
235 int src_is_directory; /* src_name is a directory */ 211 int src_is_directory; /* src_name is a directory */
@@ -243,29 +219,16 @@ xfs_rename(
243 int spaceres; 219 int spaceres;
244 int target_link_zero = 0; 220 int target_link_zero = 0;
245 int num_inodes; 221 int num_inodes;
246 char *src_name = VNAME(src_vname);
247 char *target_name = VNAME(target_vname);
248 int src_namelen = VNAMELEN(src_vname);
249 int target_namelen = VNAMELEN(target_vname);
250 222
251 xfs_itrace_entry(src_dp); 223 xfs_itrace_entry(src_dp);
252 xfs_itrace_entry(xfs_vtoi(target_dir_vp)); 224 xfs_itrace_entry(target_dp);
253
254 /*
255 * Find the XFS behavior descriptor for the target directory
256 * vnode since it was not handed to us.
257 */
258 target_dp = xfs_vtoi(target_dir_vp);
259 if (target_dp == NULL) {
260 return XFS_ERROR(EXDEV);
261 }
262 225
263 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) || 226 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_RENAME) ||
264 DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) { 227 DM_EVENT_ENABLED(target_dp, DM_EVENT_RENAME)) {
265 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME, 228 error = XFS_SEND_NAMESP(mp, DM_EVENT_RENAME,
266 src_dir_vp, DM_RIGHT_NULL, 229 src_dp, DM_RIGHT_NULL,
267 target_dir_vp, DM_RIGHT_NULL, 230 target_dp, DM_RIGHT_NULL,
268 src_name, target_name, 231 src_name->name, target_name->name,
269 0, 0, 0); 232 0, 0, 0);
270 if (error) { 233 if (error) {
271 return error; 234 return error;
@@ -282,10 +245,8 @@ xfs_rename(
282 * does not exist in the source directory. 245 * does not exist in the source directory.
283 */ 246 */
284 tp = NULL; 247 tp = NULL;
285 error = xfs_lock_for_rename(src_dp, target_dp, src_vname, 248 error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name,
286 target_vname, &src_ip, &target_ip, inodes, 249 &target_ip, inodes, &num_inodes);
287 &num_inodes);
288
289 if (error) { 250 if (error) {
290 /* 251 /*
291 * We have nothing locked, no inode references, and 252 * We have nothing locked, no inode references, and
@@ -331,7 +292,7 @@ xfs_rename(
331 XFS_BMAP_INIT(&free_list, &first_block); 292 XFS_BMAP_INIT(&free_list, &first_block);
332 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); 293 tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME);
333 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 294 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
334 spaceres = XFS_RENAME_SPACE_RES(mp, target_namelen); 295 spaceres = XFS_RENAME_SPACE_RES(mp, target_name->len);
335 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0, 296 error = xfs_trans_reserve(tp, spaceres, XFS_RENAME_LOG_RES(mp), 0,
336 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT); 297 XFS_TRANS_PERM_LOG_RES, XFS_RENAME_LOG_COUNT);
337 if (error == ENOSPC) { 298 if (error == ENOSPC) {
@@ -365,10 +326,10 @@ xfs_rename(
365 * them when they unlock the inodes. Also, we need to be careful 326 * them when they unlock the inodes. Also, we need to be careful
366 * not to add an inode to the transaction more than once. 327 * not to add an inode to the transaction more than once.
367 */ 328 */
368 VN_HOLD(src_dir_vp); 329 IHOLD(src_dp);
369 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); 330 xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL);
370 if (new_parent) { 331 if (new_parent) {
371 VN_HOLD(target_dir_vp); 332 IHOLD(target_dp);
372 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); 333 xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL);
373 } 334 }
374 if ((src_ip != src_dp) && (src_ip != target_dp)) { 335 if ((src_ip != src_dp) && (src_ip != target_dp)) {
@@ -389,9 +350,8 @@ xfs_rename(
389 * If there's no space reservation, check the entry will 350 * If there's no space reservation, check the entry will
390 * fit before actually inserting it. 351 * fit before actually inserting it.
391 */ 352 */
392 if (spaceres == 0 && 353 error = xfs_dir_canenter(tp, target_dp, target_name, spaceres);
393 (error = xfs_dir_canenter(tp, target_dp, target_name, 354 if (error)
394 target_namelen)))
395 goto error_return; 355 goto error_return;
396 /* 356 /*
397 * If target does not exist and the rename crosses 357 * If target does not exist and the rename crosses
@@ -399,8 +359,8 @@ xfs_rename(
399 * to account for the ".." reference from the new entry. 359 * to account for the ".." reference from the new entry.
400 */ 360 */
401 error = xfs_dir_createname(tp, target_dp, target_name, 361 error = xfs_dir_createname(tp, target_dp, target_name,
402 target_namelen, src_ip->i_ino, 362 src_ip->i_ino, &first_block,
403 &first_block, &free_list, spaceres); 363 &free_list, spaceres);
404 if (error == ENOSPC) 364 if (error == ENOSPC)
405 goto error_return; 365 goto error_return;
406 if (error) 366 if (error)
@@ -439,7 +399,7 @@ xfs_rename(
439 * name at the destination directory, remove it first. 399 * name at the destination directory, remove it first.
440 */ 400 */
441 error = xfs_dir_replace(tp, target_dp, target_name, 401 error = xfs_dir_replace(tp, target_dp, target_name,
442 target_namelen, src_ip->i_ino, 402 src_ip->i_ino,
443 &first_block, &free_list, spaceres); 403 &first_block, &free_list, spaceres);
444 if (error) 404 if (error)
445 goto abort_return; 405 goto abort_return;
@@ -476,7 +436,8 @@ xfs_rename(
476 * Rewrite the ".." entry to point to the new 436 * Rewrite the ".." entry to point to the new
477 * directory. 437 * directory.
478 */ 438 */
479 error = xfs_dir_replace(tp, src_ip, "..", 2, target_dp->i_ino, 439 error = xfs_dir_replace(tp, src_ip, &xfs_name_dotdot,
440 target_dp->i_ino,
480 &first_block, &free_list, spaceres); 441 &first_block, &free_list, spaceres);
481 ASSERT(error != EEXIST); 442 ASSERT(error != EEXIST);
482 if (error) 443 if (error)
@@ -512,8 +473,8 @@ xfs_rename(
512 goto abort_return; 473 goto abort_return;
513 } 474 }
514 475
515 error = xfs_dir_removename(tp, src_dp, src_name, src_namelen, 476 error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino,
516 src_ip->i_ino, &first_block, &free_list, spaceres); 477 &first_block, &free_list, spaceres);
517 if (error) 478 if (error)
518 goto abort_return; 479 goto abort_return;
519 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 480 xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -580,10 +541,8 @@ xfs_rename(
580 * the vnode references. 541 * the vnode references.
581 */ 542 */
582 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 543 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
583 if (target_ip != NULL) { 544 if (target_ip != NULL)
584 xfs_refcache_purge_ip(target_ip);
585 IRELE(target_ip); 545 IRELE(target_ip);
586 }
587 /* 546 /*
588 * Let interposed file systems know about removed links. 547 * Let interposed file systems know about removed links.
589 */ 548 */
@@ -598,9 +557,9 @@ std_return:
598 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) || 557 if (DM_EVENT_ENABLED(src_dp, DM_EVENT_POSTRENAME) ||
599 DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) { 558 DM_EVENT_ENABLED(target_dp, DM_EVENT_POSTRENAME)) {
600 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME, 559 (void) XFS_SEND_NAMESP (mp, DM_EVENT_POSTRENAME,
601 src_dir_vp, DM_RIGHT_NULL, 560 src_dp, DM_RIGHT_NULL,
602 target_dir_vp, DM_RIGHT_NULL, 561 target_dp, DM_RIGHT_NULL,
603 src_name, target_name, 562 src_name->name, target_name->name,
604 0, error, 0); 563 0, error, 0);
605 } 564 }
606 return error; 565 return error;
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 47082c01872d..a0dc6e5bc5b9 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -44,6 +44,7 @@
44#include "xfs_rw.h" 44#include "xfs_rw.h"
45#include "xfs_inode_item.h" 45#include "xfs_inode_item.h"
46#include "xfs_trans_space.h" 46#include "xfs_trans_space.h"
47#include "xfs_utils.h"
47 48
48 49
49/* 50/*
@@ -123,14 +124,14 @@ xfs_growfs_rt_alloc(
123 XFS_GROWRTALLOC_LOG_RES(mp), 0, 124 XFS_GROWRTALLOC_LOG_RES(mp), 0,
124 XFS_TRANS_PERM_LOG_RES, 125 XFS_TRANS_PERM_LOG_RES,
125 XFS_DEFAULT_PERM_LOG_COUNT))) 126 XFS_DEFAULT_PERM_LOG_COUNT)))
126 goto error_exit; 127 goto error_cancel;
127 cancelflags = XFS_TRANS_RELEASE_LOG_RES; 128 cancelflags = XFS_TRANS_RELEASE_LOG_RES;
128 /* 129 /*
129 * Lock the inode. 130 * Lock the inode.
130 */ 131 */
131 if ((error = xfs_trans_iget(mp, tp, ino, 0, 132 if ((error = xfs_trans_iget(mp, tp, ino, 0,
132 XFS_ILOCK_EXCL, &ip))) 133 XFS_ILOCK_EXCL, &ip)))
133 goto error_exit; 134 goto error_cancel;
134 XFS_BMAP_INIT(&flist, &firstblock); 135 XFS_BMAP_INIT(&flist, &firstblock);
135 /* 136 /*
136 * Allocate blocks to the bitmap file. 137 * Allocate blocks to the bitmap file.
@@ -143,14 +144,16 @@ xfs_growfs_rt_alloc(
143 if (!error && nmap < 1) 144 if (!error && nmap < 1)
144 error = XFS_ERROR(ENOSPC); 145 error = XFS_ERROR(ENOSPC);
145 if (error) 146 if (error)
146 goto error_exit; 147 goto error_cancel;
147 /* 148 /*
148 * Free any blocks freed up in the transaction, then commit. 149 * Free any blocks freed up in the transaction, then commit.
149 */ 150 */
150 error = xfs_bmap_finish(&tp, &flist, &committed); 151 error = xfs_bmap_finish(&tp, &flist, &committed);
151 if (error) 152 if (error)
152 goto error_exit; 153 goto error_cancel;
153 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 154 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
155 if (error)
156 goto error;
154 /* 157 /*
155 * Now we need to clear the allocated blocks. 158 * Now we need to clear the allocated blocks.
156 * Do this one block per transaction, to keep it simple. 159 * Do this one block per transaction, to keep it simple.
@@ -165,13 +168,13 @@ xfs_growfs_rt_alloc(
165 */ 168 */
166 if ((error = xfs_trans_reserve(tp, 0, 169 if ((error = xfs_trans_reserve(tp, 0,
167 XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0))) 170 XFS_GROWRTZERO_LOG_RES(mp), 0, 0, 0)))
168 goto error_exit; 171 goto error_cancel;
169 /* 172 /*
170 * Lock the bitmap inode. 173 * Lock the bitmap inode.
171 */ 174 */
172 if ((error = xfs_trans_iget(mp, tp, ino, 0, 175 if ((error = xfs_trans_iget(mp, tp, ino, 0,
173 XFS_ILOCK_EXCL, &ip))) 176 XFS_ILOCK_EXCL, &ip)))
174 goto error_exit; 177 goto error_cancel;
175 /* 178 /*
176 * Get a buffer for the block. 179 * Get a buffer for the block.
177 */ 180 */
@@ -180,14 +183,16 @@ xfs_growfs_rt_alloc(
180 mp->m_bsize, 0); 183 mp->m_bsize, 0);
181 if (bp == NULL) { 184 if (bp == NULL) {
182 error = XFS_ERROR(EIO); 185 error = XFS_ERROR(EIO);
183 goto error_exit; 186 goto error_cancel;
184 } 187 }
185 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize); 188 memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
186 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1); 189 xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
187 /* 190 /*
188 * Commit the transaction. 191 * Commit the transaction.
189 */ 192 */
190 xfs_trans_commit(tp, 0); 193 error = xfs_trans_commit(tp, 0);
194 if (error)
195 goto error;
191 } 196 }
192 /* 197 /*
193 * Go on to the next extent, if any. 198 * Go on to the next extent, if any.
@@ -195,8 +200,9 @@ xfs_growfs_rt_alloc(
195 oblocks = map.br_startoff + map.br_blockcount; 200 oblocks = map.br_startoff + map.br_blockcount;
196 } 201 }
197 return 0; 202 return 0;
198error_exit: 203error_cancel:
199 xfs_trans_cancel(tp, cancelflags); 204 xfs_trans_cancel(tp, cancelflags);
205error:
200 return error; 206 return error;
201} 207}
202 208
@@ -1875,6 +1881,7 @@ xfs_growfs_rt(
1875 xfs_trans_t *tp; /* transaction pointer */ 1881 xfs_trans_t *tp; /* transaction pointer */
1876 1882
1877 sbp = &mp->m_sb; 1883 sbp = &mp->m_sb;
1884 cancelflags = 0;
1878 /* 1885 /*
1879 * Initial error checking. 1886 * Initial error checking.
1880 */ 1887 */
@@ -2041,13 +2048,15 @@ xfs_growfs_rt(
2041 */ 2048 */
2042 mp->m_rsumlevels = nrsumlevels; 2049 mp->m_rsumlevels = nrsumlevels;
2043 mp->m_rsumsize = nrsumsize; 2050 mp->m_rsumsize = nrsumsize;
2044 /* 2051
2045 * Commit the transaction. 2052 error = xfs_trans_commit(tp, 0);
2046 */ 2053 if (error) {
2047 xfs_trans_commit(tp, 0); 2054 tp = NULL;
2055 break;
2056 }
2048 } 2057 }
2049 2058
2050 if (error) 2059 if (error && tp)
2051 xfs_trans_cancel(tp, cancelflags); 2060 xfs_trans_cancel(tp, cancelflags);
2052 2061
2053 /* 2062 /*
@@ -2278,7 +2287,7 @@ xfs_rtmount_inodes(
2278 ASSERT(sbp->sb_rsumino != NULLFSINO); 2287 ASSERT(sbp->sb_rsumino != NULLFSINO);
2279 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0); 2288 error = xfs_iget(mp, NULL, sbp->sb_rsumino, 0, 0, &mp->m_rsumip, 0);
2280 if (error) { 2289 if (error) {
2281 VN_RELE(XFS_ITOV(mp->m_rbmip)); 2290 IRELE(mp->m_rbmip);
2282 return error; 2291 return error;
2283 } 2292 }
2284 ASSERT(mp->m_rsumip != NULL); 2293 ASSERT(mp->m_rsumip != NULL);
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index cd3ece6cc918..b0f31c09a76d 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -126,11 +126,11 @@ xfs_write_sync_logforce(
126 * when we return. 126 * when we return.
127 */ 127 */
128 if (iip && iip->ili_last_lsn) { 128 if (iip && iip->ili_last_lsn) {
129 xfs_log_force(mp, iip->ili_last_lsn, 129 error = _xfs_log_force(mp, iip->ili_last_lsn,
130 XFS_LOG_FORCE | XFS_LOG_SYNC); 130 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
131 } else if (xfs_ipincount(ip) > 0) { 131 } else if (xfs_ipincount(ip) > 0) {
132 xfs_log_force(mp, (xfs_lsn_t)0, 132 error = _xfs_log_force(mp, (xfs_lsn_t)0,
133 XFS_LOG_FORCE | XFS_LOG_SYNC); 133 XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
134 } 134 }
135 135
136 } else { 136 } else {
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 7f40628d85c7..0804207c7391 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -113,13 +113,8 @@ struct xfs_mount;
113struct xfs_trans; 113struct xfs_trans;
114struct xfs_dquot_acct; 114struct xfs_dquot_acct;
115 115
116typedef struct xfs_ail_entry {
117 struct xfs_log_item *ail_forw; /* AIL forw pointer */
118 struct xfs_log_item *ail_back; /* AIL back pointer */
119} xfs_ail_entry_t;
120
121typedef struct xfs_log_item { 116typedef struct xfs_log_item {
122 xfs_ail_entry_t li_ail; /* AIL pointers */ 117 struct list_head li_ail; /* AIL pointers */
123 xfs_lsn_t li_lsn; /* last on-disk lsn */ 118 xfs_lsn_t li_lsn; /* last on-disk lsn */
124 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/ 119 struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
125 struct xfs_mount *li_mountp; /* ptr to fs mount */ 120 struct xfs_mount *li_mountp; /* ptr to fs mount */
@@ -341,7 +336,6 @@ typedef struct xfs_trans {
341 unsigned int t_rtx_res; /* # of rt extents resvd */ 336 unsigned int t_rtx_res; /* # of rt extents resvd */
342 unsigned int t_rtx_res_used; /* # of resvd rt extents used */ 337 unsigned int t_rtx_res_used; /* # of resvd rt extents used */
343 xfs_log_ticket_t t_ticket; /* log mgr ticket */ 338 xfs_log_ticket_t t_ticket; /* log mgr ticket */
344 sema_t t_sema; /* sema for commit completion */
345 xfs_lsn_t t_lsn; /* log seq num of start of 339 xfs_lsn_t t_lsn; /* log seq num of start of
346 * transaction. */ 340 * transaction. */
347 xfs_lsn_t t_commit_lsn; /* log seq num of end of 341 xfs_lsn_t t_commit_lsn; /* log seq num of end of
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 76d470d8a1e6..1f77c00af566 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,13 +28,13 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31STATIC void xfs_ail_insert(xfs_ail_entry_t *, xfs_log_item_t *); 31STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_entry_t *, xfs_log_item_t *); 32STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_entry_t *); 33STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_entry_t *, xfs_log_item_t *); 34STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
35 35
36#ifdef DEBUG 36#ifdef DEBUG
37STATIC void xfs_ail_check(xfs_ail_entry_t *, xfs_log_item_t *); 37STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
38#else 38#else
39#define xfs_ail_check(a,l) 39#define xfs_ail_check(a,l)
40#endif /* DEBUG */ 40#endif /* DEBUG */
@@ -57,7 +57,7 @@ xfs_trans_tail_ail(
57 xfs_log_item_t *lip; 57 xfs_log_item_t *lip;
58 58
59 spin_lock(&mp->m_ail_lock); 59 spin_lock(&mp->m_ail_lock);
60 lip = xfs_ail_min(&(mp->m_ail.xa_ail)); 60 lip = xfs_ail_min(&mp->m_ail);
61 if (lip == NULL) { 61 if (lip == NULL) {
62 lsn = (xfs_lsn_t)0; 62 lsn = (xfs_lsn_t)0;
63 } else { 63 } else {
@@ -91,7 +91,7 @@ xfs_trans_push_ail(
91{ 91{
92 xfs_log_item_t *lip; 92 xfs_log_item_t *lip;
93 93
94 lip = xfs_ail_min(&mp->m_ail.xa_ail); 94 lip = xfs_ail_min(&mp->m_ail);
95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) { 95 if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0) 96 if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
97 xfsaild_wakeup(mp, threshold_lsn); 97 xfsaild_wakeup(mp, threshold_lsn);
@@ -111,15 +111,17 @@ xfs_trans_first_push_ail(
111{ 111{
112 xfs_log_item_t *lip; 112 xfs_log_item_t *lip;
113 113
114 lip = xfs_ail_min(&(mp->m_ail.xa_ail)); 114 lip = xfs_ail_min(&mp->m_ail);
115 *gen = (int)mp->m_ail.xa_gen; 115 *gen = (int)mp->m_ail.xa_gen;
116 if (lsn == 0) 116 if (lsn == 0)
117 return lip; 117 return lip;
118 118
119 while (lip && (XFS_LSN_CMP(lip->li_lsn, lsn) < 0)) 119 list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
120 lip = lip->li_ail.ail_forw; 120 if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
121 return lip;
122 }
121 123
122 return lip; 124 return NULL;
123} 125}
124 126
125/* 127/*
@@ -329,7 +331,7 @@ xfs_trans_unlocked_item(
329 * the call to xfs_log_move_tail() doesn't do anything if there's 331 * the call to xfs_log_move_tail() doesn't do anything if there's
330 * not enough free space to wake people up so we're safe calling it. 332 * not enough free space to wake people up so we're safe calling it.
331 */ 333 */
332 min_lip = xfs_ail_min(&mp->m_ail.xa_ail); 334 min_lip = xfs_ail_min(&mp->m_ail);
333 335
334 if (min_lip == lip) 336 if (min_lip == lip)
335 xfs_log_move_tail(mp, 1); 337 xfs_log_move_tail(mp, 1);
@@ -357,15 +359,13 @@ xfs_trans_update_ail(
357 xfs_log_item_t *lip, 359 xfs_log_item_t *lip,
358 xfs_lsn_t lsn) __releases(mp->m_ail_lock) 360 xfs_lsn_t lsn) __releases(mp->m_ail_lock)
359{ 361{
360 xfs_ail_entry_t *ailp;
361 xfs_log_item_t *dlip=NULL; 362 xfs_log_item_t *dlip=NULL;
362 xfs_log_item_t *mlip; /* ptr to minimum lip */ 363 xfs_log_item_t *mlip; /* ptr to minimum lip */
363 364
364 ailp = &(mp->m_ail.xa_ail); 365 mlip = xfs_ail_min(&mp->m_ail);
365 mlip = xfs_ail_min(ailp);
366 366
367 if (lip->li_flags & XFS_LI_IN_AIL) { 367 if (lip->li_flags & XFS_LI_IN_AIL) {
368 dlip = xfs_ail_delete(ailp, lip); 368 dlip = xfs_ail_delete(&mp->m_ail, lip);
369 ASSERT(dlip == lip); 369 ASSERT(dlip == lip);
370 } else { 370 } else {
371 lip->li_flags |= XFS_LI_IN_AIL; 371 lip->li_flags |= XFS_LI_IN_AIL;
@@ -373,11 +373,11 @@ xfs_trans_update_ail(
373 373
374 lip->li_lsn = lsn; 374 lip->li_lsn = lsn;
375 375
376 xfs_ail_insert(ailp, lip); 376 xfs_ail_insert(&mp->m_ail, lip);
377 mp->m_ail.xa_gen++; 377 mp->m_ail.xa_gen++;
378 378
379 if (mlip == dlip) { 379 if (mlip == dlip) {
380 mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); 380 mlip = xfs_ail_min(&mp->m_ail);
381 spin_unlock(&mp->m_ail_lock); 381 spin_unlock(&mp->m_ail_lock);
382 xfs_log_move_tail(mp, mlip->li_lsn); 382 xfs_log_move_tail(mp, mlip->li_lsn);
383 } else { 383 } else {
@@ -407,14 +407,12 @@ xfs_trans_delete_ail(
407 xfs_mount_t *mp, 407 xfs_mount_t *mp,
408 xfs_log_item_t *lip) __releases(mp->m_ail_lock) 408 xfs_log_item_t *lip) __releases(mp->m_ail_lock)
409{ 409{
410 xfs_ail_entry_t *ailp;
411 xfs_log_item_t *dlip; 410 xfs_log_item_t *dlip;
412 xfs_log_item_t *mlip; 411 xfs_log_item_t *mlip;
413 412
414 if (lip->li_flags & XFS_LI_IN_AIL) { 413 if (lip->li_flags & XFS_LI_IN_AIL) {
415 ailp = &(mp->m_ail.xa_ail); 414 mlip = xfs_ail_min(&mp->m_ail);
416 mlip = xfs_ail_min(ailp); 415 dlip = xfs_ail_delete(&mp->m_ail, lip);
417 dlip = xfs_ail_delete(ailp, lip);
418 ASSERT(dlip == lip); 416 ASSERT(dlip == lip);
419 417
420 418
@@ -423,7 +421,7 @@ xfs_trans_delete_ail(
423 mp->m_ail.xa_gen++; 421 mp->m_ail.xa_gen++;
424 422
425 if (mlip == dlip) { 423 if (mlip == dlip) {
426 mlip = xfs_ail_min(&(mp->m_ail.xa_ail)); 424 mlip = xfs_ail_min(&mp->m_ail);
427 spin_unlock(&mp->m_ail_lock); 425 spin_unlock(&mp->m_ail_lock);
428 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0)); 426 xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
429 } else { 427 } else {
@@ -440,7 +438,7 @@ xfs_trans_delete_ail(
440 else { 438 else {
441 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp, 439 xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
442 "%s: attempting to delete a log item that is not in the AIL", 440 "%s: attempting to delete a log item that is not in the AIL",
443 __FUNCTION__); 441 __func__);
444 spin_unlock(&mp->m_ail_lock); 442 spin_unlock(&mp->m_ail_lock);
445 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); 443 xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
446 } 444 }
@@ -461,7 +459,7 @@ xfs_trans_first_ail(
461{ 459{
462 xfs_log_item_t *lip; 460 xfs_log_item_t *lip;
463 461
464 lip = xfs_ail_min(&(mp->m_ail.xa_ail)); 462 lip = xfs_ail_min(&mp->m_ail);
465 *gen = (int)mp->m_ail.xa_gen; 463 *gen = (int)mp->m_ail.xa_gen;
466 464
467 return lip; 465 return lip;
@@ -485,9 +483,9 @@ xfs_trans_next_ail(
485 483
486 ASSERT(mp && lip && gen); 484 ASSERT(mp && lip && gen);
487 if (mp->m_ail.xa_gen == *gen) { 485 if (mp->m_ail.xa_gen == *gen) {
488 nlip = xfs_ail_next(&(mp->m_ail.xa_ail), lip); 486 nlip = xfs_ail_next(&mp->m_ail, lip);
489 } else { 487 } else {
490 nlip = xfs_ail_min(&(mp->m_ail).xa_ail); 488 nlip = xfs_ail_min(&mp->m_ail);
491 *gen = (int)mp->m_ail.xa_gen; 489 *gen = (int)mp->m_ail.xa_gen;
492 if (restarts != NULL) { 490 if (restarts != NULL) {
493 XFS_STATS_INC(xs_push_ail_restarts); 491 XFS_STATS_INC(xs_push_ail_restarts);
@@ -517,8 +515,7 @@ int
517xfs_trans_ail_init( 515xfs_trans_ail_init(
518 xfs_mount_t *mp) 516 xfs_mount_t *mp)
519{ 517{
520 mp->m_ail.xa_ail.ail_forw = (xfs_log_item_t*)&mp->m_ail.xa_ail; 518 INIT_LIST_HEAD(&mp->m_ail.xa_ail);
521 mp->m_ail.xa_ail.ail_back = (xfs_log_item_t*)&mp->m_ail.xa_ail;
522 return xfsaild_start(mp); 519 return xfsaild_start(mp);
523} 520}
524 521
@@ -537,7 +534,7 @@ xfs_trans_ail_destroy(
537 */ 534 */
538STATIC void 535STATIC void
539xfs_ail_insert( 536xfs_ail_insert(
540 xfs_ail_entry_t *base, 537 xfs_ail_t *ailp,
541 xfs_log_item_t *lip) 538 xfs_log_item_t *lip)
542/* ARGSUSED */ 539/* ARGSUSED */
543{ 540{
@@ -546,27 +543,22 @@ xfs_ail_insert(
546 /* 543 /*
547 * If the list is empty, just insert the item. 544 * If the list is empty, just insert the item.
548 */ 545 */
549 if (base->ail_back == (xfs_log_item_t*)base) { 546 if (list_empty(&ailp->xa_ail)) {
550 base->ail_forw = lip; 547 list_add(&lip->li_ail, &ailp->xa_ail);
551 base->ail_back = lip;
552 lip->li_ail.ail_forw = (xfs_log_item_t*)base;
553 lip->li_ail.ail_back = (xfs_log_item_t*)base;
554 return; 548 return;
555 } 549 }
556 550
557 next_lip = base->ail_back; 551 list_for_each_entry_reverse(next_lip, &ailp->xa_ail, li_ail) {
558 while ((next_lip != (xfs_log_item_t*)base) && 552 if (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)
559 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) > 0)) { 553 break;
560 next_lip = next_lip->li_ail.ail_back;
561 } 554 }
562 ASSERT((next_lip == (xfs_log_item_t*)base) || 555
556 ASSERT((&next_lip->li_ail == &ailp->xa_ail) ||
563 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0)); 557 (XFS_LSN_CMP(next_lip->li_lsn, lip->li_lsn) <= 0));
564 lip->li_ail.ail_forw = next_lip->li_ail.ail_forw;
565 lip->li_ail.ail_back = next_lip;
566 next_lip->li_ail.ail_forw = lip;
567 lip->li_ail.ail_forw->li_ail.ail_back = lip;
568 558
569 xfs_ail_check(base, lip); 559 list_add(&lip->li_ail, &next_lip->li_ail);
560
561 xfs_ail_check(ailp, lip);
570 return; 562 return;
571} 563}
572 564
@@ -576,15 +568,13 @@ xfs_ail_insert(
576/*ARGSUSED*/ 568/*ARGSUSED*/
577STATIC xfs_log_item_t * 569STATIC xfs_log_item_t *
578xfs_ail_delete( 570xfs_ail_delete(
579 xfs_ail_entry_t *base, 571 xfs_ail_t *ailp,
580 xfs_log_item_t *lip) 572 xfs_log_item_t *lip)
581/* ARGSUSED */ 573/* ARGSUSED */
582{ 574{
583 xfs_ail_check(base, lip); 575 xfs_ail_check(ailp, lip);
584 lip->li_ail.ail_forw->li_ail.ail_back = lip->li_ail.ail_back; 576
585 lip->li_ail.ail_back->li_ail.ail_forw = lip->li_ail.ail_forw; 577 list_del(&lip->li_ail);
586 lip->li_ail.ail_forw = NULL;
587 lip->li_ail.ail_back = NULL;
588 578
589 return lip; 579 return lip;
590} 580}
@@ -595,14 +585,13 @@ xfs_ail_delete(
595 */ 585 */
596STATIC xfs_log_item_t * 586STATIC xfs_log_item_t *
597xfs_ail_min( 587xfs_ail_min(
598 xfs_ail_entry_t *base) 588 xfs_ail_t *ailp)
599/* ARGSUSED */ 589/* ARGSUSED */
600{ 590{
601 register xfs_log_item_t *forw = base->ail_forw; 591 if (list_empty(&ailp->xa_ail))
602 if (forw == (xfs_log_item_t*)base) {
603 return NULL; 592 return NULL;
604 } 593
605 return forw; 594 return list_first_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
606} 595}
607 596
608/* 597/*
@@ -612,15 +601,14 @@ xfs_ail_min(
612 */ 601 */
613STATIC xfs_log_item_t * 602STATIC xfs_log_item_t *
614xfs_ail_next( 603xfs_ail_next(
615 xfs_ail_entry_t *base, 604 xfs_ail_t *ailp,
616 xfs_log_item_t *lip) 605 xfs_log_item_t *lip)
617/* ARGSUSED */ 606/* ARGSUSED */
618{ 607{
619 if (lip->li_ail.ail_forw == (xfs_log_item_t*)base) { 608 if (lip->li_ail.next == &ailp->xa_ail)
620 return NULL; 609 return NULL;
621 }
622 return lip->li_ail.ail_forw;
623 610
611 return list_first_entry(&lip->li_ail, xfs_log_item_t, li_ail);
624} 612}
625 613
626#ifdef DEBUG 614#ifdef DEBUG
@@ -629,57 +617,40 @@ xfs_ail_next(
629 */ 617 */
630STATIC void 618STATIC void
631xfs_ail_check( 619xfs_ail_check(
632 xfs_ail_entry_t *base, 620 xfs_ail_t *ailp,
633 xfs_log_item_t *lip) 621 xfs_log_item_t *lip)
634{ 622{
635 xfs_log_item_t *prev_lip; 623 xfs_log_item_t *prev_lip;
636 624
637 prev_lip = base->ail_forw; 625 if (list_empty(&ailp->xa_ail))
638 if (prev_lip == (xfs_log_item_t*)base) {
639 /*
640 * Make sure the pointers are correct when the list
641 * is empty.
642 */
643 ASSERT(base->ail_back == (xfs_log_item_t*)base);
644 return; 626 return;
645 }
646 627
647 /* 628 /*
648 * Check the next and previous entries are valid. 629 * Check the next and previous entries are valid.
649 */ 630 */
650 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); 631 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
651 prev_lip = lip->li_ail.ail_back; 632 prev_lip = list_entry(lip->li_ail.prev, xfs_log_item_t, li_ail);
652 if (prev_lip != (xfs_log_item_t*)base) { 633 if (&prev_lip->li_ail != &ailp->xa_ail)
653 ASSERT(prev_lip->li_ail.ail_forw == lip);
654 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); 634 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
655 } 635
656 prev_lip = lip->li_ail.ail_forw; 636 prev_lip = list_entry(lip->li_ail.next, xfs_log_item_t, li_ail);
657 if (prev_lip != (xfs_log_item_t*)base) { 637 if (&prev_lip->li_ail != &ailp->xa_ail)
658 ASSERT(prev_lip->li_ail.ail_back == lip);
659 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); 638 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0);
660 }
661 639
662 640
663#ifdef XFS_TRANS_DEBUG 641#ifdef XFS_TRANS_DEBUG
664 /* 642 /*
665 * Walk the list checking forward and backward pointers, 643 * Walk the list checking lsn ordering, and that every entry has the
666 * lsn ordering, and that every entry has the XFS_LI_IN_AIL 644 * XFS_LI_IN_AIL flag set. This is really expensive, so only do it
667 * flag set. This is really expensive, so only do it when 645 * when specifically debugging the transaction subsystem.
668 * specifically debugging the transaction subsystem.
669 */ 646 */
670 prev_lip = (xfs_log_item_t*)base; 647 prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail);
671 while (lip != (xfs_log_item_t*)base) { 648 list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
672 if (prev_lip != (xfs_log_item_t*)base) { 649 if (&prev_lip->li_ail != &ailp->xa_ail)
673 ASSERT(prev_lip->li_ail.ail_forw == lip);
674 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); 650 ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0);
675 }
676 ASSERT(lip->li_ail.ail_back == prev_lip);
677 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); 651 ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0);
678 prev_lip = lip; 652 prev_lip = lip;
679 lip = lip->li_ail.ail_forw;
680 } 653 }
681 ASSERT(lip == (xfs_log_item_t*)base);
682 ASSERT(base->ail_back == prev_lip);
683#endif /* XFS_TRANS_DEBUG */ 654#endif /* XFS_TRANS_DEBUG */
684} 655}
685#endif /* DEBUG */ 656#endif /* DEBUG */
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 60b6b898022b..cb0c5839154b 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -304,7 +304,8 @@ xfs_trans_read_buf(
304 if (tp == NULL) { 304 if (tp == NULL) {
305 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY); 305 bp = xfs_buf_read_flags(target, blkno, len, flags | BUF_BUSY);
306 if (!bp) 306 if (!bp)
307 return XFS_ERROR(ENOMEM); 307 return (flags & XFS_BUF_TRYLOCK) ?
308 EAGAIN : XFS_ERROR(ENOMEM);
308 309
309 if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { 310 if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) {
310 xfs_ioerror_alert("xfs_trans_read_buf", mp, 311 xfs_ioerror_alert("xfs_trans_read_buf", mp,
@@ -353,17 +354,15 @@ xfs_trans_read_buf(
353 ASSERT(!XFS_BUF_ISASYNC(bp)); 354 ASSERT(!XFS_BUF_ISASYNC(bp));
354 XFS_BUF_READ(bp); 355 XFS_BUF_READ(bp);
355 xfsbdstrat(tp->t_mountp, bp); 356 xfsbdstrat(tp->t_mountp, bp);
356 xfs_iowait(bp); 357 error = xfs_iowait(bp);
357 if (XFS_BUF_GETERROR(bp) != 0) { 358 if (error) {
358 xfs_ioerror_alert("xfs_trans_read_buf", mp, 359 xfs_ioerror_alert("xfs_trans_read_buf", mp,
359 bp, blkno); 360 bp, blkno);
360 error = XFS_BUF_GETERROR(bp);
361 xfs_buf_relse(bp); 361 xfs_buf_relse(bp);
362 /* 362 /*
363 * We can gracefully recover from most 363 * We can gracefully recover from most read
364 * read errors. Ones we can't are those 364 * errors. Ones we can't are those that happen
365 * that happen after the transaction's 365 * after the transaction's already dirty.
366 * already dirty.
367 */ 366 */
368 if (tp->t_flags & XFS_TRANS_DIRTY) 367 if (tp->t_flags & XFS_TRANS_DIRTY)
369 xfs_force_shutdown(tp->t_mountp, 368 xfs_force_shutdown(tp->t_mountp,
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index 5c89be475464..0f5191644ab2 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -160,4 +160,9 @@ typedef enum {
160 XFS_BTNUM_MAX 160 XFS_BTNUM_MAX
161} xfs_btnum_t; 161} xfs_btnum_t;
162 162
163struct xfs_name {
164 const char *name;
165 int len;
166};
167
163#endif /* __XFS_TYPES_H__ */ 168#endif /* __XFS_TYPES_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 18a85e746680..2b8dc7e40772 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -40,34 +40,12 @@
40#include "xfs_itable.h" 40#include "xfs_itable.h"
41#include "xfs_utils.h" 41#include "xfs_utils.h"
42 42
43/*
44 * xfs_get_dir_entry is used to get a reference to an inode given
45 * its parent directory inode and the name of the file. It does
46 * not lock the child inode, and it unlocks the directory before
47 * returning. The directory's generation number is returned for
48 * use by a later call to xfs_lock_dir_and_entry.
49 */
50int
51xfs_get_dir_entry(
52 bhv_vname_t *dentry,
53 xfs_inode_t **ipp)
54{
55 bhv_vnode_t *vp;
56
57 vp = VNAME_TO_VNODE(dentry);
58
59 *ipp = xfs_vtoi(vp);
60 if (!*ipp)
61 return XFS_ERROR(ENOENT);
62 VN_HOLD(vp);
63 return 0;
64}
65 43
66int 44int
67xfs_dir_lookup_int( 45xfs_dir_lookup_int(
68 xfs_inode_t *dp, 46 xfs_inode_t *dp,
69 uint lock_mode, 47 uint lock_mode,
70 bhv_vname_t *dentry, 48 struct xfs_name *name,
71 xfs_ino_t *inum, 49 xfs_ino_t *inum,
72 xfs_inode_t **ipp) 50 xfs_inode_t **ipp)
73{ 51{
@@ -75,7 +53,7 @@ xfs_dir_lookup_int(
75 53
76 xfs_itrace_entry(dp); 54 xfs_itrace_entry(dp);
77 55
78 error = xfs_dir_lookup(NULL, dp, VNAME(dentry), VNAMELEN(dentry), inum); 56 error = xfs_dir_lookup(NULL, dp, name, inum);
79 if (!error) { 57 if (!error) {
80 /* 58 /*
81 * Unlock the directory. We do this because we can't 59 * Unlock the directory. We do this because we can't
diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h
index f857fcccb723..175b126d2cab 100644
--- a/fs/xfs/xfs_utils.h
+++ b/fs/xfs/xfs_utils.h
@@ -21,15 +21,14 @@
21#define IRELE(ip) VN_RELE(XFS_ITOV(ip)) 21#define IRELE(ip) VN_RELE(XFS_ITOV(ip))
22#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) 22#define IHOLD(ip) VN_HOLD(XFS_ITOV(ip))
23 23
24extern int xfs_get_dir_entry (bhv_vname_t *, xfs_inode_t **); 24extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *,
25extern int xfs_dir_lookup_int (xfs_inode_t *, uint, bhv_vname_t *, xfs_ino_t *, 25 xfs_ino_t *, xfs_inode_t **);
26 xfs_inode_t **); 26extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *);
27extern int xfs_truncate_file (xfs_mount_t *, xfs_inode_t *); 27extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
28extern int xfs_dir_ialloc (xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t,
29 xfs_dev_t, cred_t *, prid_t, int, 28 xfs_dev_t, cred_t *, prid_t, int,
30 xfs_inode_t **, int *); 29 xfs_inode_t **, int *);
31extern int xfs_droplink (xfs_trans_t *, xfs_inode_t *); 30extern int xfs_droplink(xfs_trans_t *, xfs_inode_t *);
32extern int xfs_bumplink (xfs_trans_t *, xfs_inode_t *); 31extern int xfs_bumplink(xfs_trans_t *, xfs_inode_t *);
33extern void xfs_bump_ino_vers2 (xfs_trans_t *, xfs_inode_t *); 32extern void xfs_bump_ino_vers2(xfs_trans_t *, xfs_inode_t *);
34 33
35#endif /* __XFS_UTILS_H__ */ 34#endif /* __XFS_UTILS_H__ */
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
index 7094caff13cf..fc48158fe479 100644
--- a/fs/xfs/xfs_vfsops.c
+++ b/fs/xfs/xfs_vfsops.c
@@ -43,7 +43,6 @@
43#include "xfs_error.h" 43#include "xfs_error.h"
44#include "xfs_bmap.h" 44#include "xfs_bmap.h"
45#include "xfs_rw.h" 45#include "xfs_rw.h"
46#include "xfs_refcache.h"
47#include "xfs_buf_item.h" 46#include "xfs_buf_item.h"
48#include "xfs_log_priv.h" 47#include "xfs_log_priv.h"
49#include "xfs_dir2_trace.h" 48#include "xfs_dir2_trace.h"
@@ -56,6 +55,7 @@
56#include "xfs_fsops.h" 55#include "xfs_fsops.h"
57#include "xfs_vnodeops.h" 56#include "xfs_vnodeops.h"
58#include "xfs_vfsops.h" 57#include "xfs_vfsops.h"
58#include "xfs_utils.h"
59 59
60 60
61int __init 61int __init
@@ -69,15 +69,17 @@ xfs_init(void)
69 /* 69 /*
70 * Initialize all of the zone allocators we use. 70 * Initialize all of the zone allocators we use.
71 */ 71 */
72 xfs_log_ticket_zone = kmem_zone_init(sizeof(xlog_ticket_t),
73 "xfs_log_ticket");
72 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t), 74 xfs_bmap_free_item_zone = kmem_zone_init(sizeof(xfs_bmap_free_item_t),
73 "xfs_bmap_free_item"); 75 "xfs_bmap_free_item");
74 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t), 76 xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
75 "xfs_btree_cur"); 77 "xfs_btree_cur");
76 xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans"); 78 xfs_da_state_zone = kmem_zone_init(sizeof(xfs_da_state_t),
77 xfs_da_state_zone = 79 "xfs_da_state");
78 kmem_zone_init(sizeof(xfs_da_state_t), "xfs_da_state");
79 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf"); 80 xfs_dabuf_zone = kmem_zone_init(sizeof(xfs_dabuf_t), "xfs_dabuf");
80 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork"); 81 xfs_ifork_zone = kmem_zone_init(sizeof(xfs_ifork_t), "xfs_ifork");
82 xfs_trans_zone = kmem_zone_init(sizeof(xfs_trans_t), "xfs_trans");
81 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl"); 83 xfs_acl_zone_init(xfs_acl_zone, "xfs_acl");
82 xfs_mru_cache_init(); 84 xfs_mru_cache_init();
83 xfs_filestream_init(); 85 xfs_filestream_init();
@@ -113,9 +115,6 @@ xfs_init(void)
113 xfs_ili_zone = 115 xfs_ili_zone =
114 kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili", 116 kmem_zone_init_flags(sizeof(xfs_inode_log_item_t), "xfs_ili",
115 KM_ZONE_SPREAD, NULL); 117 KM_ZONE_SPREAD, NULL);
116 xfs_icluster_zone =
117 kmem_zone_init_flags(sizeof(xfs_icluster_t), "xfs_icluster",
118 KM_ZONE_SPREAD, NULL);
119 118
120 /* 119 /*
121 * Allocate global trace buffers. 120 * Allocate global trace buffers.
@@ -153,11 +152,9 @@ xfs_cleanup(void)
153 extern kmem_zone_t *xfs_inode_zone; 152 extern kmem_zone_t *xfs_inode_zone;
154 extern kmem_zone_t *xfs_efd_zone; 153 extern kmem_zone_t *xfs_efd_zone;
155 extern kmem_zone_t *xfs_efi_zone; 154 extern kmem_zone_t *xfs_efi_zone;
156 extern kmem_zone_t *xfs_icluster_zone;
157 155
158 xfs_cleanup_procfs(); 156 xfs_cleanup_procfs();
159 xfs_sysctl_unregister(); 157 xfs_sysctl_unregister();
160 xfs_refcache_destroy();
161 xfs_filestream_uninit(); 158 xfs_filestream_uninit();
162 xfs_mru_cache_uninit(); 159 xfs_mru_cache_uninit();
163 xfs_acl_zone_destroy(xfs_acl_zone); 160 xfs_acl_zone_destroy(xfs_acl_zone);
@@ -189,7 +186,6 @@ xfs_cleanup(void)
189 kmem_zone_destroy(xfs_efi_zone); 186 kmem_zone_destroy(xfs_efi_zone);
190 kmem_zone_destroy(xfs_ifork_zone); 187 kmem_zone_destroy(xfs_ifork_zone);
191 kmem_zone_destroy(xfs_ili_zone); 188 kmem_zone_destroy(xfs_ili_zone);
192 kmem_zone_destroy(xfs_icluster_zone);
193} 189}
194 190
195/* 191/*
@@ -573,7 +569,7 @@ xfs_unmount(
573#ifdef HAVE_DMAPI 569#ifdef HAVE_DMAPI
574 if (mp->m_flags & XFS_MOUNT_DMAPI) { 570 if (mp->m_flags & XFS_MOUNT_DMAPI) {
575 error = XFS_SEND_PREUNMOUNT(mp, 571 error = XFS_SEND_PREUNMOUNT(mp,
576 rvp, DM_RIGHT_NULL, rvp, DM_RIGHT_NULL, 572 rip, DM_RIGHT_NULL, rip, DM_RIGHT_NULL,
577 NULL, NULL, 0, 0, 573 NULL, NULL, 0, 0,
578 (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))? 574 (mp->m_dmevmask & (1<<DM_EVENT_PREUNMOUNT))?
579 0:DM_FLAGS_UNWANTED); 575 0:DM_FLAGS_UNWANTED);
@@ -584,11 +580,6 @@ xfs_unmount(
584 0 : DM_FLAGS_UNWANTED; 580 0 : DM_FLAGS_UNWANTED;
585 } 581 }
586#endif 582#endif
587 /*
588 * First blow any referenced inode from this file system
589 * out of the reference cache, and delete the timer.
590 */
591 xfs_refcache_purge_mp(mp);
592 583
593 /* 584 /*
594 * Blow away any referenced inode in the filestreams cache. 585 * Blow away any referenced inode in the filestreams cache.
@@ -607,7 +598,7 @@ xfs_unmount(
607 /* 598 /*
608 * Drop the reference count 599 * Drop the reference count
609 */ 600 */
610 VN_RELE(rvp); 601 IRELE(rip);
611 602
612 /* 603 /*
613 * If we're forcing a shutdown, typically because of a media error, 604 * If we're forcing a shutdown, typically because of a media error,
@@ -629,7 +620,7 @@ out:
629 /* Note: mp structure must still exist for 620 /* Note: mp structure must still exist for
630 * XFS_SEND_UNMOUNT() call. 621 * XFS_SEND_UNMOUNT() call.
631 */ 622 */
632 XFS_SEND_UNMOUNT(mp, error == 0 ? rvp : NULL, 623 XFS_SEND_UNMOUNT(mp, error == 0 ? rip : NULL,
633 DM_RIGHT_NULL, 0, error, unmount_event_flags); 624 DM_RIGHT_NULL, 0, error, unmount_event_flags);
634 } 625 }
635 if (xfs_unmountfs_needed) { 626 if (xfs_unmountfs_needed) {
@@ -646,13 +637,12 @@ out:
646 return XFS_ERROR(error); 637 return XFS_ERROR(error);
647} 638}
648 639
649STATIC int 640STATIC void
650xfs_quiesce_fs( 641xfs_quiesce_fs(
651 xfs_mount_t *mp) 642 xfs_mount_t *mp)
652{ 643{
653 int count = 0, pincount; 644 int count = 0, pincount;
654 645
655 xfs_refcache_purge_mp(mp);
656 xfs_flush_buftarg(mp->m_ddev_targp, 0); 646 xfs_flush_buftarg(mp->m_ddev_targp, 0);
657 xfs_finish_reclaim_all(mp, 0); 647 xfs_finish_reclaim_all(mp, 0);
658 648
@@ -671,8 +661,6 @@ xfs_quiesce_fs(
671 count++; 661 count++;
672 } 662 }
673 } while (count < 2); 663 } while (count < 2);
674
675 return 0;
676} 664}
677 665
678/* 666/*
@@ -684,6 +672,8 @@ void
684xfs_attr_quiesce( 672xfs_attr_quiesce(
685 xfs_mount_t *mp) 673 xfs_mount_t *mp)
686{ 674{
675 int error = 0;
676
687 /* wait for all modifications to complete */ 677 /* wait for all modifications to complete */
688 while (atomic_read(&mp->m_active_trans) > 0) 678 while (atomic_read(&mp->m_active_trans) > 0)
689 delay(100); 679 delay(100);
@@ -694,7 +684,11 @@ xfs_attr_quiesce(
694 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0); 684 ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
695 685
696 /* Push the superblock and write an unmount record */ 686 /* Push the superblock and write an unmount record */
697 xfs_log_sbcount(mp, 1); 687 error = xfs_log_sbcount(mp, 1);
688 if (error)
689 xfs_fs_cmn_err(CE_WARN, mp,
690 "xfs_attr_quiesce: failed to log sb changes. "
691 "Frozen image may not be consistent.");
698 xfs_log_unmount_write(mp); 692 xfs_log_unmount_write(mp);
699 xfs_unmountfs_writesb(mp); 693 xfs_unmountfs_writesb(mp);
700} 694}
@@ -790,8 +784,8 @@ xfs_unmount_flush(
790 goto fscorrupt_out2; 784 goto fscorrupt_out2;
791 785
792 if (rbmip) { 786 if (rbmip) {
793 VN_RELE(XFS_ITOV(rbmip)); 787 IRELE(rbmip);
794 VN_RELE(XFS_ITOV(rsumip)); 788 IRELE(rsumip);
795 } 789 }
796 790
797 xfs_iunlock(rip, XFS_ILOCK_EXCL); 791 xfs_iunlock(rip, XFS_ILOCK_EXCL);
@@ -1169,10 +1163,10 @@ xfs_sync_inodes(
1169 * above, then wait until after we've unlocked 1163 * above, then wait until after we've unlocked
1170 * the inode to release the reference. This is 1164 * the inode to release the reference. This is
1171 * because we can be already holding the inode 1165 * because we can be already holding the inode
1172 * lock when VN_RELE() calls xfs_inactive(). 1166 * lock when IRELE() calls xfs_inactive().
1173 * 1167 *
1174 * Make sure to drop the mount lock before calling 1168 * Make sure to drop the mount lock before calling
1175 * VN_RELE() so that we don't trip over ourselves if 1169 * IRELE() so that we don't trip over ourselves if
1176 * we have to go for the mount lock again in the 1170 * we have to go for the mount lock again in the
1177 * inactive code. 1171 * inactive code.
1178 */ 1172 */
@@ -1180,7 +1174,7 @@ xfs_sync_inodes(
1180 IPOINTER_INSERT(ip, mp); 1174 IPOINTER_INSERT(ip, mp);
1181 } 1175 }
1182 1176
1183 VN_RELE(vp); 1177 IRELE(ip);
1184 1178
1185 vnode_refed = B_FALSE; 1179 vnode_refed = B_FALSE;
1186 } 1180 }
@@ -1323,30 +1317,8 @@ xfs_syncsub(
1323 } 1317 }
1324 1318
1325 /* 1319 /*
1326 * If this is the periodic sync, then kick some entries out of
1327 * the reference cache. This ensures that idle entries are
1328 * eventually kicked out of the cache.
1329 */
1330 if (flags & SYNC_REFCACHE) {
1331 if (flags & SYNC_WAIT)
1332 xfs_refcache_purge_mp(mp);
1333 else
1334 xfs_refcache_purge_some(mp);
1335 }
1336
1337 /*
1338 * If asked, update the disk superblock with incore counter values if we
1339 * are using non-persistent counters so that they don't get too far out
1340 * of sync if we crash or get a forced shutdown. We don't want to force
1341 * this to disk, just get a transaction into the iclogs....
1342 */
1343 if (flags & SYNC_SUPER)
1344 xfs_log_sbcount(mp, 0);
1345
1346 /*
1347 * Now check to see if the log needs a "dummy" transaction. 1320 * Now check to see if the log needs a "dummy" transaction.
1348 */ 1321 */
1349
1350 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) { 1322 if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
1351 xfs_trans_t *tp; 1323 xfs_trans_t *tp;
1352 xfs_inode_t *ip; 1324 xfs_inode_t *ip;
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 64c5953feca4..6650601c64f7 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -48,7 +48,6 @@
48#include "xfs_quota.h" 48#include "xfs_quota.h"
49#include "xfs_utils.h" 49#include "xfs_utils.h"
50#include "xfs_rtalloc.h" 50#include "xfs_rtalloc.h"
51#include "xfs_refcache.h"
52#include "xfs_trans_space.h" 51#include "xfs_trans_space.h"
53#include "xfs_log_priv.h" 52#include "xfs_log_priv.h"
54#include "xfs_filestream.h" 53#include "xfs_filestream.h"
@@ -327,7 +326,7 @@ xfs_setattr(
327 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && 326 if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) &&
328 !(flags & ATTR_DMI)) { 327 !(flags & ATTR_DMI)) {
329 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR; 328 int dmflags = AT_DELAY_FLAG(flags) | DM_SEM_FLAG_WR;
330 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, vp, 329 code = XFS_SEND_DATA(mp, DM_EVENT_TRUNCATE, ip,
331 vap->va_size, 0, dmflags, NULL); 330 vap->va_size, 0, dmflags, NULL);
332 if (code) { 331 if (code) {
333 lock_flags = 0; 332 lock_flags = 0;
@@ -634,6 +633,15 @@ xfs_setattr(
634 * Truncate file. Must have write permission and not be a directory. 633 * Truncate file. Must have write permission and not be a directory.
635 */ 634 */
636 if (mask & XFS_AT_SIZE) { 635 if (mask & XFS_AT_SIZE) {
636 /*
637 * Only change the c/mtime if we are changing the size
638 * or we are explicitly asked to change it. This handles
639 * the semantic difference between truncate() and ftruncate()
640 * as implemented in the VFS.
641 */
642 if (vap->va_size != ip->i_size || (mask & XFS_AT_CTIME))
643 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
644
637 if (vap->va_size > ip->i_size) { 645 if (vap->va_size > ip->i_size) {
638 xfs_igrow_finish(tp, ip, vap->va_size, 646 xfs_igrow_finish(tp, ip, vap->va_size,
639 !(flags & ATTR_DMI)); 647 !(flags & ATTR_DMI));
@@ -662,10 +670,6 @@ xfs_setattr(
662 */ 670 */
663 xfs_iflags_set(ip, XFS_ITRUNCATED); 671 xfs_iflags_set(ip, XFS_ITRUNCATED);
664 } 672 }
665 /*
666 * Have to do this even if the file's size doesn't change.
667 */
668 timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
669 } 673 }
670 674
671 /* 675 /*
@@ -877,7 +881,7 @@ xfs_setattr(
877 881
878 if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) && 882 if (DM_EVENT_ENABLED(ip, DM_EVENT_ATTRIBUTE) &&
879 !(flags & ATTR_DMI)) { 883 !(flags & ATTR_DMI)) {
880 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, vp, DM_RIGHT_NULL, 884 (void) XFS_SEND_NAMESP(mp, DM_EVENT_ATTRIBUTE, ip, DM_RIGHT_NULL,
881 NULL, DM_RIGHT_NULL, NULL, NULL, 885 NULL, DM_RIGHT_NULL, NULL, NULL,
882 0, 0, AT_DELAY_FLAG(flags)); 886 0, 0, AT_DELAY_FLAG(flags));
883 } 887 }
@@ -1443,28 +1447,22 @@ xfs_inactive_attrs(
1443 tp = *tpp; 1447 tp = *tpp;
1444 mp = ip->i_mount; 1448 mp = ip->i_mount;
1445 ASSERT(ip->i_d.di_forkoff != 0); 1449 ASSERT(ip->i_d.di_forkoff != 0);
1446 xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1450 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1447 xfs_iunlock(ip, XFS_ILOCK_EXCL); 1451 xfs_iunlock(ip, XFS_ILOCK_EXCL);
1452 if (error)
1453 goto error_unlock;
1448 1454
1449 error = xfs_attr_inactive(ip); 1455 error = xfs_attr_inactive(ip);
1450 if (error) { 1456 if (error)
1451 *tpp = NULL; 1457 goto error_unlock;
1452 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1453 return error; /* goto out */
1454 }
1455 1458
1456 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); 1459 tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE);
1457 error = xfs_trans_reserve(tp, 0, 1460 error = xfs_trans_reserve(tp, 0,
1458 XFS_IFREE_LOG_RES(mp), 1461 XFS_IFREE_LOG_RES(mp),
1459 0, XFS_TRANS_PERM_LOG_RES, 1462 0, XFS_TRANS_PERM_LOG_RES,
1460 XFS_INACTIVE_LOG_COUNT); 1463 XFS_INACTIVE_LOG_COUNT);
1461 if (error) { 1464 if (error)
1462 ASSERT(XFS_FORCED_SHUTDOWN(mp)); 1465 goto error_cancel;
1463 xfs_trans_cancel(tp, 0);
1464 *tpp = NULL;
1465 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1466 return error;
1467 }
1468 1466
1469 xfs_ilock(ip, XFS_ILOCK_EXCL); 1467 xfs_ilock(ip, XFS_ILOCK_EXCL);
1470 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); 1468 xfs_trans_ijoin(tp, ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
@@ -1475,6 +1473,14 @@ xfs_inactive_attrs(
1475 1473
1476 *tpp = tp; 1474 *tpp = tp;
1477 return 0; 1475 return 0;
1476
1477error_cancel:
1478 ASSERT(XFS_FORCED_SHUTDOWN(mp));
1479 xfs_trans_cancel(tp, 0);
1480error_unlock:
1481 *tpp = NULL;
1482 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
1483 return error;
1478} 1484}
1479 1485
1480int 1486int
@@ -1520,12 +1526,6 @@ xfs_release(
1520 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE); 1526 xfs_flush_pages(ip, 0, -1, XFS_B_ASYNC, FI_NONE);
1521 } 1527 }
1522 1528
1523#ifdef HAVE_REFCACHE
1524 /* If we are in the NFS reference cache then don't do this now */
1525 if (ip->i_refcache)
1526 return 0;
1527#endif
1528
1529 if (ip->i_d.di_nlink != 0) { 1529 if (ip->i_d.di_nlink != 0) {
1530 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) && 1530 if ((((ip->i_d.di_mode & S_IFMT) == S_IFREG) &&
1531 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 || 1531 ((ip->i_size > 0) || (VN_CACHED(vp) > 0 ||
@@ -1588,9 +1588,8 @@ xfs_inactive(
1588 1588
1589 mp = ip->i_mount; 1589 mp = ip->i_mount;
1590 1590
1591 if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY)) { 1591 if (ip->i_d.di_nlink == 0 && DM_EVENT_ENABLED(ip, DM_EVENT_DESTROY))
1592 (void) XFS_SEND_DESTROY(mp, vp, DM_RIGHT_NULL); 1592 XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
1593 }
1594 1593
1595 error = 0; 1594 error = 0;
1596 1595
@@ -1744,11 +1743,18 @@ xfs_inactive(
1744 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1); 1743 XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_ICOUNT, -1);
1745 1744
1746 /* 1745 /*
1747 * Just ignore errors at this point. There is 1746 * Just ignore errors at this point. There is nothing we can
1748 * nothing we can do except to try to keep going. 1747 * do except to try to keep going. Make sure it's not a silent
1748 * error.
1749 */ 1749 */
1750 (void) xfs_bmap_finish(&tp, &free_list, &committed); 1750 error = xfs_bmap_finish(&tp, &free_list, &committed);
1751 (void) xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); 1751 if (error)
1752 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1753 "xfs_bmap_finish() returned error %d", error);
1754 error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES);
1755 if (error)
1756 xfs_fs_cmn_err(CE_NOTE, mp, "xfs_inactive: "
1757 "xfs_trans_commit() returned error %d", error);
1752 } 1758 }
1753 /* 1759 /*
1754 * Release the dquots held by inode, if any. 1760 * Release the dquots held by inode, if any.
@@ -1765,8 +1771,8 @@ xfs_inactive(
1765int 1771int
1766xfs_lookup( 1772xfs_lookup(
1767 xfs_inode_t *dp, 1773 xfs_inode_t *dp,
1768 bhv_vname_t *dentry, 1774 struct xfs_name *name,
1769 bhv_vnode_t **vpp) 1775 xfs_inode_t **ipp)
1770{ 1776{
1771 xfs_inode_t *ip; 1777 xfs_inode_t *ip;
1772 xfs_ino_t e_inum; 1778 xfs_ino_t e_inum;
@@ -1779,9 +1785,9 @@ xfs_lookup(
1779 return XFS_ERROR(EIO); 1785 return XFS_ERROR(EIO);
1780 1786
1781 lock_mode = xfs_ilock_map_shared(dp); 1787 lock_mode = xfs_ilock_map_shared(dp);
1782 error = xfs_dir_lookup_int(dp, lock_mode, dentry, &e_inum, &ip); 1788 error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip);
1783 if (!error) { 1789 if (!error) {
1784 *vpp = XFS_ITOV(ip); 1790 *ipp = ip;
1785 xfs_itrace_ref(ip); 1791 xfs_itrace_ref(ip);
1786 } 1792 }
1787 xfs_iunlock_map_shared(dp, lock_mode); 1793 xfs_iunlock_map_shared(dp, lock_mode);
@@ -1791,19 +1797,16 @@ xfs_lookup(
1791int 1797int
1792xfs_create( 1798xfs_create(
1793 xfs_inode_t *dp, 1799 xfs_inode_t *dp,
1794 bhv_vname_t *dentry, 1800 struct xfs_name *name,
1795 mode_t mode, 1801 mode_t mode,
1796 xfs_dev_t rdev, 1802 xfs_dev_t rdev,
1797 bhv_vnode_t **vpp, 1803 xfs_inode_t **ipp,
1798 cred_t *credp) 1804 cred_t *credp)
1799{ 1805{
1800 char *name = VNAME(dentry); 1806 xfs_mount_t *mp = dp->i_mount;
1801 xfs_mount_t *mp = dp->i_mount;
1802 bhv_vnode_t *dir_vp = XFS_ITOV(dp);
1803 xfs_inode_t *ip; 1807 xfs_inode_t *ip;
1804 bhv_vnode_t *vp = NULL;
1805 xfs_trans_t *tp; 1808 xfs_trans_t *tp;
1806 int error; 1809 int error;
1807 xfs_bmap_free_t free_list; 1810 xfs_bmap_free_t free_list;
1808 xfs_fsblock_t first_block; 1811 xfs_fsblock_t first_block;
1809 boolean_t unlock_dp_on_error = B_FALSE; 1812 boolean_t unlock_dp_on_error = B_FALSE;
@@ -1813,17 +1816,14 @@ xfs_create(
1813 xfs_prid_t prid; 1816 xfs_prid_t prid;
1814 struct xfs_dquot *udqp, *gdqp; 1817 struct xfs_dquot *udqp, *gdqp;
1815 uint resblks; 1818 uint resblks;
1816 int namelen;
1817 1819
1818 ASSERT(!*vpp); 1820 ASSERT(!*ipp);
1819 xfs_itrace_entry(dp); 1821 xfs_itrace_entry(dp);
1820 1822
1821 namelen = VNAMELEN(dentry);
1822
1823 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { 1823 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
1824 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, 1824 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
1825 dir_vp, DM_RIGHT_NULL, NULL, 1825 dp, DM_RIGHT_NULL, NULL,
1826 DM_RIGHT_NULL, name, NULL, 1826 DM_RIGHT_NULL, name->name, NULL,
1827 mode, 0, 0); 1827 mode, 0, 0);
1828 1828
1829 if (error) 1829 if (error)
@@ -1855,7 +1855,7 @@ xfs_create(
1855 1855
1856 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); 1856 tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE);
1857 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 1857 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
1858 resblks = XFS_CREATE_SPACE_RES(mp, namelen); 1858 resblks = XFS_CREATE_SPACE_RES(mp, name->len);
1859 /* 1859 /*
1860 * Initially assume that the file does not exist and 1860 * Initially assume that the file does not exist and
1861 * reserve the resources for that case. If that is not 1861 * reserve the resources for that case. If that is not
@@ -1888,7 +1888,8 @@ xfs_create(
1888 if (error) 1888 if (error)
1889 goto error_return; 1889 goto error_return;
1890 1890
1891 if (resblks == 0 && (error = xfs_dir_canenter(tp, dp, name, namelen))) 1891 error = xfs_dir_canenter(tp, dp, name, resblks);
1892 if (error)
1892 goto error_return; 1893 goto error_return;
1893 error = xfs_dir_ialloc(&tp, dp, mode, 1, 1894 error = xfs_dir_ialloc(&tp, dp, mode, 1,
1894 rdev, credp, prid, resblks > 0, 1895 rdev, credp, prid, resblks > 0,
@@ -1914,11 +1915,11 @@ xfs_create(
1914 * the transaction cancel unlocking dp so don't do it explicitly in the 1915 * the transaction cancel unlocking dp so don't do it explicitly in the
1915 * error path. 1916 * error path.
1916 */ 1917 */
1917 VN_HOLD(dir_vp); 1918 IHOLD(dp);
1918 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 1919 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
1919 unlock_dp_on_error = B_FALSE; 1920 unlock_dp_on_error = B_FALSE;
1920 1921
1921 error = xfs_dir_createname(tp, dp, name, namelen, ip->i_ino, 1922 error = xfs_dir_createname(tp, dp, name, ip->i_ino,
1922 &first_block, &free_list, resblks ? 1923 &first_block, &free_list, resblks ?
1923 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 1924 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
1924 if (error) { 1925 if (error) {
@@ -1952,7 +1953,6 @@ xfs_create(
1952 * vnode to the caller, we bump the vnode ref count now. 1953 * vnode to the caller, we bump the vnode ref count now.
1953 */ 1954 */
1954 IHOLD(ip); 1955 IHOLD(ip);
1955 vp = XFS_ITOV(ip);
1956 1956
1957 error = xfs_bmap_finish(&tp, &free_list, &committed); 1957 error = xfs_bmap_finish(&tp, &free_list, &committed);
1958 if (error) { 1958 if (error) {
@@ -1970,17 +1970,17 @@ xfs_create(
1970 XFS_QM_DQRELE(mp, udqp); 1970 XFS_QM_DQRELE(mp, udqp);
1971 XFS_QM_DQRELE(mp, gdqp); 1971 XFS_QM_DQRELE(mp, gdqp);
1972 1972
1973 *vpp = vp; 1973 *ipp = ip;
1974 1974
1975 /* Fallthrough to std_return with error = 0 */ 1975 /* Fallthrough to std_return with error = 0 */
1976 1976
1977std_return: 1977std_return:
1978 if ((*vpp || (error != 0 && dm_event_sent != 0)) && 1978 if ((*ipp || (error != 0 && dm_event_sent != 0)) &&
1979 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { 1979 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
1980 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, 1980 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
1981 dir_vp, DM_RIGHT_NULL, 1981 dp, DM_RIGHT_NULL,
1982 *vpp ? vp:NULL, 1982 *ipp ? ip : NULL,
1983 DM_RIGHT_NULL, name, NULL, 1983 DM_RIGHT_NULL, name->name, NULL,
1984 mode, error, 0); 1984 mode, error, 0);
1985 } 1985 }
1986 return error; 1986 return error;
@@ -2272,46 +2272,32 @@ int remove_which_error_return = 0;
2272int 2272int
2273xfs_remove( 2273xfs_remove(
2274 xfs_inode_t *dp, 2274 xfs_inode_t *dp,
2275 bhv_vname_t *dentry) 2275 struct xfs_name *name,
2276 xfs_inode_t *ip)
2276{ 2277{
2277 bhv_vnode_t *dir_vp = XFS_ITOV(dp);
2278 char *name = VNAME(dentry);
2279 xfs_mount_t *mp = dp->i_mount; 2278 xfs_mount_t *mp = dp->i_mount;
2280 xfs_inode_t *ip;
2281 xfs_trans_t *tp = NULL; 2279 xfs_trans_t *tp = NULL;
2282 int error = 0; 2280 int error = 0;
2283 xfs_bmap_free_t free_list; 2281 xfs_bmap_free_t free_list;
2284 xfs_fsblock_t first_block; 2282 xfs_fsblock_t first_block;
2285 int cancel_flags; 2283 int cancel_flags;
2286 int committed; 2284 int committed;
2287 int dm_di_mode = 0;
2288 int link_zero; 2285 int link_zero;
2289 uint resblks; 2286 uint resblks;
2290 int namelen;
2291 2287
2292 xfs_itrace_entry(dp); 2288 xfs_itrace_entry(dp);
2293 2289
2294 if (XFS_FORCED_SHUTDOWN(mp)) 2290 if (XFS_FORCED_SHUTDOWN(mp))
2295 return XFS_ERROR(EIO); 2291 return XFS_ERROR(EIO);
2296 2292
2297 namelen = VNAMELEN(dentry);
2298
2299 if (!xfs_get_dir_entry(dentry, &ip)) {
2300 dm_di_mode = ip->i_d.di_mode;
2301 IRELE(ip);
2302 }
2303
2304 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { 2293 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2305 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dir_vp, 2294 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, dp, DM_RIGHT_NULL,
2306 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 2295 NULL, DM_RIGHT_NULL, name->name, NULL,
2307 name, NULL, dm_di_mode, 0, 0); 2296 ip->i_d.di_mode, 0, 0);
2308 if (error) 2297 if (error)
2309 return error; 2298 return error;
2310 } 2299 }
2311 2300
2312 /* From this point on, return through std_return */
2313 ip = NULL;
2314
2315 /* 2301 /*
2316 * We need to get a reference to ip before we get our log 2302 * We need to get a reference to ip before we get our log
2317 * reservation. The reason for this is that we cannot call 2303 * reservation. The reason for this is that we cannot call
@@ -2324,13 +2310,7 @@ xfs_remove(
2324 * when we call xfs_iget. Instead we get an unlocked reference 2310 * when we call xfs_iget. Instead we get an unlocked reference
2325 * to the inode before getting our log reservation. 2311 * to the inode before getting our log reservation.
2326 */ 2312 */
2327 error = xfs_get_dir_entry(dentry, &ip); 2313 IHOLD(ip);
2328 if (error) {
2329 REMOVE_DEBUG_TRACE(__LINE__);
2330 goto std_return;
2331 }
2332
2333 dm_di_mode = ip->i_d.di_mode;
2334 2314
2335 xfs_itrace_entry(ip); 2315 xfs_itrace_entry(ip);
2336 xfs_itrace_ref(ip); 2316 xfs_itrace_ref(ip);
@@ -2398,7 +2378,7 @@ xfs_remove(
2398 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. 2378 * Entry must exist since we did a lookup in xfs_lock_dir_and_entry.
2399 */ 2379 */
2400 XFS_BMAP_INIT(&free_list, &first_block); 2380 XFS_BMAP_INIT(&free_list, &first_block);
2401 error = xfs_dir_removename(tp, dp, name, namelen, ip->i_ino, 2381 error = xfs_dir_removename(tp, dp, name, ip->i_ino,
2402 &first_block, &free_list, 0); 2382 &first_block, &free_list, 0);
2403 if (error) { 2383 if (error) {
2404 ASSERT(error != ENOENT); 2384 ASSERT(error != ENOENT);
@@ -2449,14 +2429,6 @@ xfs_remove(
2449 } 2429 }
2450 2430
2451 /* 2431 /*
2452 * Before we drop our extra reference to the inode, purge it
2453 * from the refcache if it is there. By waiting until afterwards
2454 * to do the IRELE, we ensure that we won't go inactive in the
2455 * xfs_refcache_purge_ip routine (although that would be OK).
2456 */
2457 xfs_refcache_purge_ip(ip);
2458
2459 /*
2460 * If we are using filestreams, kill the stream association. 2432 * If we are using filestreams, kill the stream association.
2461 * If the file is still open it may get a new one but that 2433 * If the file is still open it may get a new one but that
2462 * will get killed on last close in xfs_close() so we don't 2434 * will get killed on last close in xfs_close() so we don't
@@ -2472,9 +2444,9 @@ xfs_remove(
2472 std_return: 2444 std_return:
2473 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 2445 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
2474 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, 2446 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
2475 dir_vp, DM_RIGHT_NULL, 2447 dp, DM_RIGHT_NULL,
2476 NULL, DM_RIGHT_NULL, 2448 NULL, DM_RIGHT_NULL,
2477 name, NULL, dm_di_mode, error, 0); 2449 name->name, NULL, ip->i_d.di_mode, error, 0);
2478 } 2450 }
2479 return error; 2451 return error;
2480 2452
@@ -2495,14 +2467,6 @@ xfs_remove(
2495 cancel_flags |= XFS_TRANS_ABORT; 2467 cancel_flags |= XFS_TRANS_ABORT;
2496 xfs_trans_cancel(tp, cancel_flags); 2468 xfs_trans_cancel(tp, cancel_flags);
2497 2469
2498 /*
2499 * Before we drop our extra reference to the inode, purge it
2500 * from the refcache if it is there. By waiting until afterwards
2501 * to do the IRELE, we ensure that we won't go inactive in the
2502 * xfs_refcache_purge_ip routine (although that would be OK).
2503 */
2504 xfs_refcache_purge_ip(ip);
2505
2506 IRELE(ip); 2470 IRELE(ip);
2507 2471
2508 goto std_return; 2472 goto std_return;
@@ -2511,12 +2475,10 @@ xfs_remove(
2511int 2475int
2512xfs_link( 2476xfs_link(
2513 xfs_inode_t *tdp, 2477 xfs_inode_t *tdp,
2514 bhv_vnode_t *src_vp, 2478 xfs_inode_t *sip,
2515 bhv_vname_t *dentry) 2479 struct xfs_name *target_name)
2516{ 2480{
2517 bhv_vnode_t *target_dir_vp = XFS_ITOV(tdp);
2518 xfs_mount_t *mp = tdp->i_mount; 2481 xfs_mount_t *mp = tdp->i_mount;
2519 xfs_inode_t *sip = xfs_vtoi(src_vp);
2520 xfs_trans_t *tp; 2482 xfs_trans_t *tp;
2521 xfs_inode_t *ips[2]; 2483 xfs_inode_t *ips[2];
2522 int error; 2484 int error;
@@ -2525,23 +2487,20 @@ xfs_link(
2525 int cancel_flags; 2487 int cancel_flags;
2526 int committed; 2488 int committed;
2527 int resblks; 2489 int resblks;
2528 char *target_name = VNAME(dentry);
2529 int target_namelen;
2530 2490
2531 xfs_itrace_entry(tdp); 2491 xfs_itrace_entry(tdp);
2532 xfs_itrace_entry(xfs_vtoi(src_vp)); 2492 xfs_itrace_entry(sip);
2533 2493
2534 target_namelen = VNAMELEN(dentry); 2494 ASSERT(!S_ISDIR(sip->i_d.di_mode));
2535 ASSERT(!VN_ISDIR(src_vp));
2536 2495
2537 if (XFS_FORCED_SHUTDOWN(mp)) 2496 if (XFS_FORCED_SHUTDOWN(mp))
2538 return XFS_ERROR(EIO); 2497 return XFS_ERROR(EIO);
2539 2498
2540 if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) { 2499 if (DM_EVENT_ENABLED(tdp, DM_EVENT_LINK)) {
2541 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK, 2500 error = XFS_SEND_NAMESP(mp, DM_EVENT_LINK,
2542 target_dir_vp, DM_RIGHT_NULL, 2501 tdp, DM_RIGHT_NULL,
2543 src_vp, DM_RIGHT_NULL, 2502 sip, DM_RIGHT_NULL,
2544 target_name, NULL, 0, 0, 0); 2503 target_name->name, NULL, 0, 0, 0);
2545 if (error) 2504 if (error)
2546 return error; 2505 return error;
2547 } 2506 }
@@ -2556,7 +2515,7 @@ xfs_link(
2556 2515
2557 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); 2516 tp = xfs_trans_alloc(mp, XFS_TRANS_LINK);
2558 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2517 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2559 resblks = XFS_LINK_SPACE_RES(mp, target_namelen); 2518 resblks = XFS_LINK_SPACE_RES(mp, target_name->len);
2560 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0, 2519 error = xfs_trans_reserve(tp, resblks, XFS_LINK_LOG_RES(mp), 0,
2561 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT); 2520 XFS_TRANS_PERM_LOG_RES, XFS_LINK_LOG_COUNT);
2562 if (error == ENOSPC) { 2521 if (error == ENOSPC) {
@@ -2584,8 +2543,8 @@ xfs_link(
2584 * xfs_trans_cancel will both unlock the inodes and 2543 * xfs_trans_cancel will both unlock the inodes and
2585 * decrement the associated ref counts. 2544 * decrement the associated ref counts.
2586 */ 2545 */
2587 VN_HOLD(src_vp); 2546 IHOLD(sip);
2588 VN_HOLD(target_dir_vp); 2547 IHOLD(tdp);
2589 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); 2548 xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL);
2590 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); 2549 xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL);
2591 2550
@@ -2608,15 +2567,14 @@ xfs_link(
2608 goto error_return; 2567 goto error_return;
2609 } 2568 }
2610 2569
2611 if (resblks == 0 && 2570 error = xfs_dir_canenter(tp, tdp, target_name, resblks);
2612 (error = xfs_dir_canenter(tp, tdp, target_name, target_namelen))) 2571 if (error)
2613 goto error_return; 2572 goto error_return;
2614 2573
2615 XFS_BMAP_INIT(&free_list, &first_block); 2574 XFS_BMAP_INIT(&free_list, &first_block);
2616 2575
2617 error = xfs_dir_createname(tp, tdp, target_name, target_namelen, 2576 error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino,
2618 sip->i_ino, &first_block, &free_list, 2577 &first_block, &free_list, resblks);
2619 resblks);
2620 if (error) 2578 if (error)
2621 goto abort_return; 2579 goto abort_return;
2622 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 2580 xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -2650,9 +2608,9 @@ xfs_link(
2650std_return: 2608std_return:
2651 if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) { 2609 if (DM_EVENT_ENABLED(sip, DM_EVENT_POSTLINK)) {
2652 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK, 2610 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTLINK,
2653 target_dir_vp, DM_RIGHT_NULL, 2611 tdp, DM_RIGHT_NULL,
2654 src_vp, DM_RIGHT_NULL, 2612 sip, DM_RIGHT_NULL,
2655 target_name, NULL, 0, error, 0); 2613 target_name->name, NULL, 0, error, 0);
2656 } 2614 }
2657 return error; 2615 return error;
2658 2616
@@ -2669,17 +2627,13 @@ std_return:
2669int 2627int
2670xfs_mkdir( 2628xfs_mkdir(
2671 xfs_inode_t *dp, 2629 xfs_inode_t *dp,
2672 bhv_vname_t *dentry, 2630 struct xfs_name *dir_name,
2673 mode_t mode, 2631 mode_t mode,
2674 bhv_vnode_t **vpp, 2632 xfs_inode_t **ipp,
2675 cred_t *credp) 2633 cred_t *credp)
2676{ 2634{
2677 bhv_vnode_t *dir_vp = XFS_ITOV(dp);
2678 char *dir_name = VNAME(dentry);
2679 int dir_namelen = VNAMELEN(dentry);
2680 xfs_mount_t *mp = dp->i_mount; 2635 xfs_mount_t *mp = dp->i_mount;
2681 xfs_inode_t *cdp; /* inode of created dir */ 2636 xfs_inode_t *cdp; /* inode of created dir */
2682 bhv_vnode_t *cvp; /* vnode of created dir */
2683 xfs_trans_t *tp; 2637 xfs_trans_t *tp;
2684 int cancel_flags; 2638 int cancel_flags;
2685 int error; 2639 int error;
@@ -2700,8 +2654,8 @@ xfs_mkdir(
2700 2654
2701 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) { 2655 if (DM_EVENT_ENABLED(dp, DM_EVENT_CREATE)) {
2702 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE, 2656 error = XFS_SEND_NAMESP(mp, DM_EVENT_CREATE,
2703 dir_vp, DM_RIGHT_NULL, NULL, 2657 dp, DM_RIGHT_NULL, NULL,
2704 DM_RIGHT_NULL, dir_name, NULL, 2658 DM_RIGHT_NULL, dir_name->name, NULL,
2705 mode, 0, 0); 2659 mode, 0, 0);
2706 if (error) 2660 if (error)
2707 return error; 2661 return error;
@@ -2730,7 +2684,7 @@ xfs_mkdir(
2730 2684
2731 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR); 2685 tp = xfs_trans_alloc(mp, XFS_TRANS_MKDIR);
2732 cancel_flags = XFS_TRANS_RELEASE_LOG_RES; 2686 cancel_flags = XFS_TRANS_RELEASE_LOG_RES;
2733 resblks = XFS_MKDIR_SPACE_RES(mp, dir_namelen); 2687 resblks = XFS_MKDIR_SPACE_RES(mp, dir_name->len);
2734 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0, 2688 error = xfs_trans_reserve(tp, resblks, XFS_MKDIR_LOG_RES(mp), 0,
2735 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT); 2689 XFS_TRANS_PERM_LOG_RES, XFS_MKDIR_LOG_COUNT);
2736 if (error == ENOSPC) { 2690 if (error == ENOSPC) {
@@ -2762,8 +2716,8 @@ xfs_mkdir(
2762 if (error) 2716 if (error)
2763 goto error_return; 2717 goto error_return;
2764 2718
2765 if (resblks == 0 && 2719 error = xfs_dir_canenter(tp, dp, dir_name, resblks);
2766 (error = xfs_dir_canenter(tp, dp, dir_name, dir_namelen))) 2720 if (error)
2767 goto error_return; 2721 goto error_return;
2768 /* 2722 /*
2769 * create the directory inode. 2723 * create the directory inode.
@@ -2786,15 +2740,15 @@ xfs_mkdir(
2786 * from here on will result in the transaction cancel 2740 * from here on will result in the transaction cancel
2787 * unlocking dp so don't do it explicitly in the error path. 2741 * unlocking dp so don't do it explicitly in the error path.
2788 */ 2742 */
2789 VN_HOLD(dir_vp); 2743 IHOLD(dp);
2790 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 2744 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
2791 unlock_dp_on_error = B_FALSE; 2745 unlock_dp_on_error = B_FALSE;
2792 2746
2793 XFS_BMAP_INIT(&free_list, &first_block); 2747 XFS_BMAP_INIT(&free_list, &first_block);
2794 2748
2795 error = xfs_dir_createname(tp, dp, dir_name, dir_namelen, cdp->i_ino, 2749 error = xfs_dir_createname(tp, dp, dir_name, cdp->i_ino,
2796 &first_block, &free_list, resblks ? 2750 &first_block, &free_list, resblks ?
2797 resblks - XFS_IALLOC_SPACE_RES(mp) : 0); 2751 resblks - XFS_IALLOC_SPACE_RES(mp) : 0);
2798 if (error) { 2752 if (error) {
2799 ASSERT(error != ENOSPC); 2753 ASSERT(error != ENOSPC);
2800 goto error1; 2754 goto error1;
@@ -2817,11 +2771,9 @@ xfs_mkdir(
2817 if (error) 2771 if (error)
2818 goto error2; 2772 goto error2;
2819 2773
2820 cvp = XFS_ITOV(cdp);
2821
2822 created = B_TRUE; 2774 created = B_TRUE;
2823 2775
2824 *vpp = cvp; 2776 *ipp = cdp;
2825 IHOLD(cdp); 2777 IHOLD(cdp);
2826 2778
2827 /* 2779 /*
@@ -2858,10 +2810,10 @@ std_return:
2858 if ((created || (error != 0 && dm_event_sent != 0)) && 2810 if ((created || (error != 0 && dm_event_sent != 0)) &&
2859 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) { 2811 DM_EVENT_ENABLED(dp, DM_EVENT_POSTCREATE)) {
2860 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE, 2812 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTCREATE,
2861 dir_vp, DM_RIGHT_NULL, 2813 dp, DM_RIGHT_NULL,
2862 created ? XFS_ITOV(cdp):NULL, 2814 created ? cdp : NULL,
2863 DM_RIGHT_NULL, 2815 DM_RIGHT_NULL,
2864 dir_name, NULL, 2816 dir_name->name, NULL,
2865 mode, error, 0); 2817 mode, error, 0);
2866 } 2818 }
2867 return error; 2819 return error;
@@ -2885,20 +2837,17 @@ std_return:
2885int 2837int
2886xfs_rmdir( 2838xfs_rmdir(
2887 xfs_inode_t *dp, 2839 xfs_inode_t *dp,
2888 bhv_vname_t *dentry) 2840 struct xfs_name *name,
2841 xfs_inode_t *cdp)
2889{ 2842{
2890 bhv_vnode_t *dir_vp = XFS_ITOV(dp); 2843 bhv_vnode_t *dir_vp = XFS_ITOV(dp);
2891 char *name = VNAME(dentry);
2892 int namelen = VNAMELEN(dentry);
2893 xfs_mount_t *mp = dp->i_mount; 2844 xfs_mount_t *mp = dp->i_mount;
2894 xfs_inode_t *cdp; /* child directory */
2895 xfs_trans_t *tp; 2845 xfs_trans_t *tp;
2896 int error; 2846 int error;
2897 xfs_bmap_free_t free_list; 2847 xfs_bmap_free_t free_list;
2898 xfs_fsblock_t first_block; 2848 xfs_fsblock_t first_block;
2899 int cancel_flags; 2849 int cancel_flags;
2900 int committed; 2850 int committed;
2901 int dm_di_mode = S_IFDIR;
2902 int last_cdp_link; 2851 int last_cdp_link;
2903 uint resblks; 2852 uint resblks;
2904 2853
@@ -2907,24 +2856,15 @@ xfs_rmdir(
2907 if (XFS_FORCED_SHUTDOWN(mp)) 2856 if (XFS_FORCED_SHUTDOWN(mp))
2908 return XFS_ERROR(EIO); 2857 return XFS_ERROR(EIO);
2909 2858
2910 if (!xfs_get_dir_entry(dentry, &cdp)) {
2911 dm_di_mode = cdp->i_d.di_mode;
2912 IRELE(cdp);
2913 }
2914
2915 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) { 2859 if (DM_EVENT_ENABLED(dp, DM_EVENT_REMOVE)) {
2916 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE, 2860 error = XFS_SEND_NAMESP(mp, DM_EVENT_REMOVE,
2917 dir_vp, DM_RIGHT_NULL, 2861 dp, DM_RIGHT_NULL,
2918 NULL, DM_RIGHT_NULL, 2862 NULL, DM_RIGHT_NULL, name->name,
2919 name, NULL, dm_di_mode, 0, 0); 2863 NULL, cdp->i_d.di_mode, 0, 0);
2920 if (error) 2864 if (error)
2921 return XFS_ERROR(error); 2865 return XFS_ERROR(error);
2922 } 2866 }
2923 2867
2924 /* Return through std_return after this point. */
2925
2926 cdp = NULL;
2927
2928 /* 2868 /*
2929 * We need to get a reference to cdp before we get our log 2869 * We need to get a reference to cdp before we get our log
2930 * reservation. The reason for this is that we cannot call 2870 * reservation. The reason for this is that we cannot call
@@ -2937,13 +2877,7 @@ xfs_rmdir(
2937 * when we call xfs_iget. Instead we get an unlocked reference 2877 * when we call xfs_iget. Instead we get an unlocked reference
2938 * to the inode before getting our log reservation. 2878 * to the inode before getting our log reservation.
2939 */ 2879 */
2940 error = xfs_get_dir_entry(dentry, &cdp); 2880 IHOLD(cdp);
2941 if (error) {
2942 REMOVE_DEBUG_TRACE(__LINE__);
2943 goto std_return;
2944 }
2945 mp = dp->i_mount;
2946 dm_di_mode = cdp->i_d.di_mode;
2947 2881
2948 /* 2882 /*
2949 * Get the dquots for the inodes. 2883 * Get the dquots for the inodes.
@@ -3020,7 +2954,7 @@ xfs_rmdir(
3020 goto error_return; 2954 goto error_return;
3021 } 2955 }
3022 2956
3023 error = xfs_dir_removename(tp, dp, name, namelen, cdp->i_ino, 2957 error = xfs_dir_removename(tp, dp, name, cdp->i_ino,
3024 &first_block, &free_list, resblks); 2958 &first_block, &free_list, resblks);
3025 if (error) 2959 if (error)
3026 goto error1; 2960 goto error1;
@@ -3098,9 +3032,9 @@ xfs_rmdir(
3098 std_return: 3032 std_return:
3099 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) { 3033 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTREMOVE)) {
3100 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE, 3034 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTREMOVE,
3101 dir_vp, DM_RIGHT_NULL, 3035 dp, DM_RIGHT_NULL,
3102 NULL, DM_RIGHT_NULL, 3036 NULL, DM_RIGHT_NULL,
3103 name, NULL, dm_di_mode, 3037 name->name, NULL, cdp->i_d.di_mode,
3104 error, 0); 3038 error, 0);
3105 } 3039 }
3106 return error; 3040 return error;
@@ -3118,13 +3052,12 @@ xfs_rmdir(
3118int 3052int
3119xfs_symlink( 3053xfs_symlink(
3120 xfs_inode_t *dp, 3054 xfs_inode_t *dp,
3121 bhv_vname_t *dentry, 3055 struct xfs_name *link_name,
3122 char *target_path, 3056 const char *target_path,
3123 mode_t mode, 3057 mode_t mode,
3124 bhv_vnode_t **vpp, 3058 xfs_inode_t **ipp,
3125 cred_t *credp) 3059 cred_t *credp)
3126{ 3060{
3127 bhv_vnode_t *dir_vp = XFS_ITOV(dp);
3128 xfs_mount_t *mp = dp->i_mount; 3061 xfs_mount_t *mp = dp->i_mount;
3129 xfs_trans_t *tp; 3062 xfs_trans_t *tp;
3130 xfs_inode_t *ip; 3063 xfs_inode_t *ip;
@@ -3140,17 +3073,15 @@ xfs_symlink(
3140 int nmaps; 3073 int nmaps;
3141 xfs_bmbt_irec_t mval[SYMLINK_MAPS]; 3074 xfs_bmbt_irec_t mval[SYMLINK_MAPS];
3142 xfs_daddr_t d; 3075 xfs_daddr_t d;
3143 char *cur_chunk; 3076 const char *cur_chunk;
3144 int byte_cnt; 3077 int byte_cnt;
3145 int n; 3078 int n;
3146 xfs_buf_t *bp; 3079 xfs_buf_t *bp;
3147 xfs_prid_t prid; 3080 xfs_prid_t prid;
3148 struct xfs_dquot *udqp, *gdqp; 3081 struct xfs_dquot *udqp, *gdqp;
3149 uint resblks; 3082 uint resblks;
3150 char *link_name = VNAME(dentry);
3151 int link_namelen;
3152 3083
3153 *vpp = NULL; 3084 *ipp = NULL;
3154 error = 0; 3085 error = 0;
3155 ip = NULL; 3086 ip = NULL;
3156 tp = NULL; 3087 tp = NULL;
@@ -3160,44 +3091,17 @@ xfs_symlink(
3160 if (XFS_FORCED_SHUTDOWN(mp)) 3091 if (XFS_FORCED_SHUTDOWN(mp))
3161 return XFS_ERROR(EIO); 3092 return XFS_ERROR(EIO);
3162 3093
3163 link_namelen = VNAMELEN(dentry);
3164
3165 /* 3094 /*
3166 * Check component lengths of the target path name. 3095 * Check component lengths of the target path name.
3167 */ 3096 */
3168 pathlen = strlen(target_path); 3097 pathlen = strlen(target_path);
3169 if (pathlen >= MAXPATHLEN) /* total string too long */ 3098 if (pathlen >= MAXPATHLEN) /* total string too long */
3170 return XFS_ERROR(ENAMETOOLONG); 3099 return XFS_ERROR(ENAMETOOLONG);
3171 if (pathlen >= MAXNAMELEN) { /* is any component too long? */
3172 int len, total;
3173 char *path;
3174
3175 for (total = 0, path = target_path; total < pathlen;) {
3176 /*
3177 * Skip any slashes.
3178 */
3179 while(*path == '/') {
3180 total++;
3181 path++;
3182 }
3183
3184 /*
3185 * Count up to the next slash or end of path.
3186 * Error out if the component is bigger than MAXNAMELEN.
3187 */
3188 for(len = 0; *path != '/' && total < pathlen;total++, path++) {
3189 if (++len >= MAXNAMELEN) {
3190 error = ENAMETOOLONG;
3191 return error;
3192 }
3193 }
3194 }
3195 }
3196 3100
3197 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) { 3101 if (DM_EVENT_ENABLED(dp, DM_EVENT_SYMLINK)) {
3198 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dir_vp, 3102 error = XFS_SEND_NAMESP(mp, DM_EVENT_SYMLINK, dp,
3199 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL, 3103 DM_RIGHT_NULL, NULL, DM_RIGHT_NULL,
3200 link_name, target_path, 0, 0, 0); 3104 link_name->name, target_path, 0, 0, 0);
3201 if (error) 3105 if (error)
3202 return error; 3106 return error;
3203 } 3107 }
@@ -3229,7 +3133,7 @@ xfs_symlink(
3229 fs_blocks = 0; 3133 fs_blocks = 0;
3230 else 3134 else
3231 fs_blocks = XFS_B_TO_FSB(mp, pathlen); 3135 fs_blocks = XFS_B_TO_FSB(mp, pathlen);
3232 resblks = XFS_SYMLINK_SPACE_RES(mp, link_namelen, fs_blocks); 3136 resblks = XFS_SYMLINK_SPACE_RES(mp, link_name->len, fs_blocks);
3233 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0, 3137 error = xfs_trans_reserve(tp, resblks, XFS_SYMLINK_LOG_RES(mp), 0,
3234 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT); 3138 XFS_TRANS_PERM_LOG_RES, XFS_SYMLINK_LOG_COUNT);
3235 if (error == ENOSPC && fs_blocks == 0) { 3139 if (error == ENOSPC && fs_blocks == 0) {
@@ -3263,8 +3167,8 @@ xfs_symlink(
3263 /* 3167 /*
3264 * Check for ability to enter directory entry, if no space reserved. 3168 * Check for ability to enter directory entry, if no space reserved.
3265 */ 3169 */
3266 if (resblks == 0 && 3170 error = xfs_dir_canenter(tp, dp, link_name, resblks);
3267 (error = xfs_dir_canenter(tp, dp, link_name, link_namelen))) 3171 if (error)
3268 goto error_return; 3172 goto error_return;
3269 /* 3173 /*
3270 * Initialize the bmap freelist prior to calling either 3174 * Initialize the bmap freelist prior to calling either
@@ -3289,7 +3193,7 @@ xfs_symlink(
3289 * transaction cancel unlocking dp so don't do it explicitly in the 3193 * transaction cancel unlocking dp so don't do it explicitly in the
3290 * error path. 3194 * error path.
3291 */ 3195 */
3292 VN_HOLD(dir_vp); 3196 IHOLD(dp);
3293 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); 3197 xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
3294 unlock_dp_on_error = B_FALSE; 3198 unlock_dp_on_error = B_FALSE;
3295 3199
@@ -3356,8 +3260,8 @@ xfs_symlink(
3356 /* 3260 /*
3357 * Create the directory entry for the symlink. 3261 * Create the directory entry for the symlink.
3358 */ 3262 */
3359 error = xfs_dir_createname(tp, dp, link_name, link_namelen, ip->i_ino, 3263 error = xfs_dir_createname(tp, dp, link_name, ip->i_ino,
3360 &first_block, &free_list, resblks); 3264 &first_block, &free_list, resblks);
3361 if (error) 3265 if (error)
3362 goto error1; 3266 goto error1;
3363 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); 3267 xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
@@ -3399,19 +3303,14 @@ xfs_symlink(
3399std_return: 3303std_return:
3400 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) { 3304 if (DM_EVENT_ENABLED(dp, DM_EVENT_POSTSYMLINK)) {
3401 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK, 3305 (void) XFS_SEND_NAMESP(mp, DM_EVENT_POSTSYMLINK,
3402 dir_vp, DM_RIGHT_NULL, 3306 dp, DM_RIGHT_NULL,
3403 error ? NULL : XFS_ITOV(ip), 3307 error ? NULL : ip,
3404 DM_RIGHT_NULL, link_name, target_path, 3308 DM_RIGHT_NULL, link_name->name,
3405 0, error, 0); 3309 target_path, 0, error, 0);
3406 } 3310 }
3407 3311
3408 if (!error) { 3312 if (!error)
3409 bhv_vnode_t *vp; 3313 *ipp = ip;
3410
3411 ASSERT(ip);
3412 vp = XFS_ITOV(ip);
3413 *vpp = vp;
3414 }
3415 return error; 3314 return error;
3416 3315
3417 error2: 3316 error2:
@@ -3431,60 +3330,11 @@ std_return:
3431} 3330}
3432 3331
3433int 3332int
3434xfs_rwlock(
3435 xfs_inode_t *ip,
3436 bhv_vrwlock_t locktype)
3437{
3438 if (S_ISDIR(ip->i_d.di_mode))
3439 return 1;
3440 if (locktype == VRWLOCK_WRITE) {
3441 xfs_ilock(ip, XFS_IOLOCK_EXCL);
3442 } else if (locktype == VRWLOCK_TRY_READ) {
3443 return xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED);
3444 } else if (locktype == VRWLOCK_TRY_WRITE) {
3445 return xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL);
3446 } else {
3447 ASSERT((locktype == VRWLOCK_READ) ||
3448 (locktype == VRWLOCK_WRITE_DIRECT));
3449 xfs_ilock(ip, XFS_IOLOCK_SHARED);
3450 }
3451
3452 return 1;
3453}
3454
3455
3456void
3457xfs_rwunlock(
3458 xfs_inode_t *ip,
3459 bhv_vrwlock_t locktype)
3460{
3461 if (S_ISDIR(ip->i_d.di_mode))
3462 return;
3463 if (locktype == VRWLOCK_WRITE) {
3464 /*
3465 * In the write case, we may have added a new entry to
3466 * the reference cache. This might store a pointer to
3467 * an inode to be released in this inode. If it is there,
3468 * clear the pointer and release the inode after unlocking
3469 * this one.
3470 */
3471 xfs_refcache_iunlock(ip, XFS_IOLOCK_EXCL);
3472 } else {
3473 ASSERT((locktype == VRWLOCK_READ) ||
3474 (locktype == VRWLOCK_WRITE_DIRECT));
3475 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
3476 }
3477 return;
3478}
3479
3480
3481int
3482xfs_inode_flush( 3333xfs_inode_flush(
3483 xfs_inode_t *ip, 3334 xfs_inode_t *ip,
3484 int flags) 3335 int flags)
3485{ 3336{
3486 xfs_mount_t *mp = ip->i_mount; 3337 xfs_mount_t *mp = ip->i_mount;
3487 xfs_inode_log_item_t *iip = ip->i_itemp;
3488 int error = 0; 3338 int error = 0;
3489 3339
3490 if (XFS_FORCED_SHUTDOWN(mp)) 3340 if (XFS_FORCED_SHUTDOWN(mp))
@@ -3494,33 +3344,9 @@ xfs_inode_flush(
3494 * Bypass inodes which have already been cleaned by 3344 * Bypass inodes which have already been cleaned by
3495 * the inode flush clustering code inside xfs_iflush 3345 * the inode flush clustering code inside xfs_iflush
3496 */ 3346 */
3497 if ((ip->i_update_core == 0) && 3347 if (xfs_inode_clean(ip))
3498 ((iip == NULL) || !(iip->ili_format.ilf_fields & XFS_ILOG_ALL)))
3499 return 0; 3348 return 0;
3500 3349
3501 if (flags & FLUSH_LOG) {
3502 if (iip && iip->ili_last_lsn) {
3503 xlog_t *log = mp->m_log;
3504 xfs_lsn_t sync_lsn;
3505 int log_flags = XFS_LOG_FORCE;
3506
3507 spin_lock(&log->l_grant_lock);
3508 sync_lsn = log->l_last_sync_lsn;
3509 spin_unlock(&log->l_grant_lock);
3510
3511 if ((XFS_LSN_CMP(iip->ili_last_lsn, sync_lsn) > 0)) {
3512 if (flags & FLUSH_SYNC)
3513 log_flags |= XFS_LOG_SYNC;
3514 error = xfs_log_force(mp, iip->ili_last_lsn, log_flags);
3515 if (error)
3516 return error;
3517 }
3518
3519 if (ip->i_update_core == 0)
3520 return 0;
3521 }
3522 }
3523
3524 /* 3350 /*
3525 * We make this non-blocking if the inode is contended, 3351 * We make this non-blocking if the inode is contended,
3526 * return EAGAIN to indicate to the caller that they 3352 * return EAGAIN to indicate to the caller that they
@@ -3528,30 +3354,22 @@ xfs_inode_flush(
3528 * blocking on inodes inside another operation right 3354 * blocking on inodes inside another operation right
3529 * now, they get caught later by xfs_sync. 3355 * now, they get caught later by xfs_sync.
3530 */ 3356 */
3531 if (flags & FLUSH_INODE) { 3357 if (flags & FLUSH_SYNC) {
3532 int flush_flags; 3358 xfs_ilock(ip, XFS_ILOCK_SHARED);
3533 3359 xfs_iflock(ip);
3534 if (flags & FLUSH_SYNC) { 3360 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3535 xfs_ilock(ip, XFS_ILOCK_SHARED); 3361 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3536 xfs_iflock(ip); 3362 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3537 } else if (xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
3538 if (xfs_ipincount(ip) || !xfs_iflock_nowait(ip)) {
3539 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3540 return EAGAIN;
3541 }
3542 } else {
3543 return EAGAIN; 3363 return EAGAIN;
3544 } 3364 }
3545 3365 } else {
3546 if (flags & FLUSH_SYNC) 3366 return EAGAIN;
3547 flush_flags = XFS_IFLUSH_SYNC;
3548 else
3549 flush_flags = XFS_IFLUSH_ASYNC;
3550
3551 error = xfs_iflush(ip, flush_flags);
3552 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3553 } 3367 }
3554 3368
3369 error = xfs_iflush(ip, (flags & FLUSH_SYNC) ? XFS_IFLUSH_SYNC
3370 : XFS_IFLUSH_ASYNC_NOBLOCK);
3371 xfs_iunlock(ip, XFS_ILOCK_SHARED);
3372
3555 return error; 3373 return error;
3556} 3374}
3557 3375
@@ -3694,12 +3512,12 @@ xfs_finish_reclaim(
3694 * We get the flush lock regardless, though, just to make sure 3512 * We get the flush lock regardless, though, just to make sure
3695 * we don't free it while it is being flushed. 3513 * we don't free it while it is being flushed.
3696 */ 3514 */
3697 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 3515 if (!locked) {
3698 if (!locked) { 3516 xfs_ilock(ip, XFS_ILOCK_EXCL);
3699 xfs_ilock(ip, XFS_ILOCK_EXCL); 3517 xfs_iflock(ip);
3700 xfs_iflock(ip); 3518 }
3701 }
3702 3519
3520 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
3703 if (ip->i_update_core || 3521 if (ip->i_update_core ||
3704 ((ip->i_itemp != NULL) && 3522 ((ip->i_itemp != NULL) &&
3705 (ip->i_itemp->ili_format.ilf_fields != 0))) { 3523 (ip->i_itemp->ili_format.ilf_fields != 0))) {
@@ -3719,17 +3537,11 @@ xfs_finish_reclaim(
3719 ASSERT(ip->i_update_core == 0); 3537 ASSERT(ip->i_update_core == 0);
3720 ASSERT(ip->i_itemp == NULL || 3538 ASSERT(ip->i_itemp == NULL ||
3721 ip->i_itemp->ili_format.ilf_fields == 0); 3539 ip->i_itemp->ili_format.ilf_fields == 0);
3722 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3723 } else if (locked) {
3724 /*
3725 * We are not interested in doing an iflush if we're
3726 * in the process of shutting down the filesystem forcibly.
3727 * So, just reclaim the inode.
3728 */
3729 xfs_ifunlock(ip);
3730 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3731 } 3540 }
3732 3541
3542 xfs_ifunlock(ip);
3543 xfs_iunlock(ip, XFS_ILOCK_EXCL);
3544
3733 reclaim: 3545 reclaim:
3734 xfs_ireclaim(ip); 3546 xfs_ireclaim(ip);
3735 return 0; 3547 return 0;
@@ -3845,9 +3657,8 @@ xfs_alloc_file_space(
3845 end_dmi_offset = offset+len; 3657 end_dmi_offset = offset+len;
3846 if (end_dmi_offset > ip->i_size) 3658 if (end_dmi_offset > ip->i_size)
3847 end_dmi_offset = ip->i_size; 3659 end_dmi_offset = ip->i_size;
3848 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, XFS_ITOV(ip), 3660 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip, offset,
3849 offset, end_dmi_offset - offset, 3661 end_dmi_offset - offset, 0, NULL);
3850 0, NULL);
3851 if (error) 3662 if (error)
3852 return error; 3663 return error;
3853 } 3664 }
@@ -3956,8 +3767,8 @@ dmapi_enospc_check:
3956 if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 && 3767 if (error == ENOSPC && (attr_flags & ATTR_DMI) == 0 &&
3957 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) { 3768 DM_EVENT_ENABLED(ip, DM_EVENT_NOSPACE)) {
3958 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE, 3769 error = XFS_SEND_NAMESP(mp, DM_EVENT_NOSPACE,
3959 XFS_ITOV(ip), DM_RIGHT_NULL, 3770 ip, DM_RIGHT_NULL,
3960 XFS_ITOV(ip), DM_RIGHT_NULL, 3771 ip, DM_RIGHT_NULL,
3961 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */ 3772 NULL, NULL, 0, 0, 0); /* Delay flag intentionally unused */
3962 if (error == 0) 3773 if (error == 0)
3963 goto retry; /* Maybe DMAPI app. has made space */ 3774 goto retry; /* Maybe DMAPI app. has made space */
@@ -4021,7 +3832,8 @@ xfs_zero_remaining_bytes(
4021 XFS_BUF_READ(bp); 3832 XFS_BUF_READ(bp);
4022 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock)); 3833 XFS_BUF_SET_ADDR(bp, XFS_FSB_TO_DB(ip, imap.br_startblock));
4023 xfsbdstrat(mp, bp); 3834 xfsbdstrat(mp, bp);
4024 if ((error = xfs_iowait(bp))) { 3835 error = xfs_iowait(bp);
3836 if (error) {
4025 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)", 3837 xfs_ioerror_alert("xfs_zero_remaining_bytes(read)",
4026 mp, bp, XFS_BUF_ADDR(bp)); 3838 mp, bp, XFS_BUF_ADDR(bp));
4027 break; 3839 break;
@@ -4033,7 +3845,8 @@ xfs_zero_remaining_bytes(
4033 XFS_BUF_UNREAD(bp); 3845 XFS_BUF_UNREAD(bp);
4034 XFS_BUF_WRITE(bp); 3846 XFS_BUF_WRITE(bp);
4035 xfsbdstrat(mp, bp); 3847 xfsbdstrat(mp, bp);
4036 if ((error = xfs_iowait(bp))) { 3848 error = xfs_iowait(bp);
3849 if (error) {
4037 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)", 3850 xfs_ioerror_alert("xfs_zero_remaining_bytes(write)",
4038 mp, bp, XFS_BUF_ADDR(bp)); 3851 mp, bp, XFS_BUF_ADDR(bp));
4039 break; 3852 break;
@@ -4102,7 +3915,7 @@ xfs_free_file_space(
4102 DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) { 3915 DM_EVENT_ENABLED(ip, DM_EVENT_WRITE)) {
4103 if (end_dmi_offset > ip->i_size) 3916 if (end_dmi_offset > ip->i_size)
4104 end_dmi_offset = ip->i_size; 3917 end_dmi_offset = ip->i_size;
4105 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, vp, 3918 error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, ip,
4106 offset, end_dmi_offset - offset, 3919 offset, end_dmi_offset - offset,
4107 AT_DELAY_FLAG(attr_flags), NULL); 3920 AT_DELAY_FLAG(attr_flags), NULL);
4108 if (error) 3921 if (error)
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 4e3970f0e5e3..24c53923dc2c 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -23,31 +23,32 @@ int xfs_fsync(struct xfs_inode *ip, int flag, xfs_off_t start,
23 xfs_off_t stop); 23 xfs_off_t stop);
24int xfs_release(struct xfs_inode *ip); 24int xfs_release(struct xfs_inode *ip);
25int xfs_inactive(struct xfs_inode *ip); 25int xfs_inactive(struct xfs_inode *ip);
26int xfs_lookup(struct xfs_inode *dp, bhv_vname_t *dentry, 26int xfs_lookup(struct xfs_inode *dp, struct xfs_name *name,
27 bhv_vnode_t **vpp); 27 struct xfs_inode **ipp);
28int xfs_create(struct xfs_inode *dp, bhv_vname_t *dentry, mode_t mode, 28int xfs_create(struct xfs_inode *dp, struct xfs_name *name, mode_t mode,
29 xfs_dev_t rdev, bhv_vnode_t **vpp, struct cred *credp); 29 xfs_dev_t rdev, struct xfs_inode **ipp, struct cred *credp);
30int xfs_remove(struct xfs_inode *dp, bhv_vname_t *dentry); 30int xfs_remove(struct xfs_inode *dp, struct xfs_name *name,
31int xfs_link(struct xfs_inode *tdp, bhv_vnode_t *src_vp, 31 struct xfs_inode *ip);
32 bhv_vname_t *dentry); 32int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
33int xfs_mkdir(struct xfs_inode *dp, bhv_vname_t *dentry, 33 struct xfs_name *target_name);
34 mode_t mode, bhv_vnode_t **vpp, struct cred *credp); 34int xfs_mkdir(struct xfs_inode *dp, struct xfs_name *dir_name,
35int xfs_rmdir(struct xfs_inode *dp, bhv_vname_t *dentry); 35 mode_t mode, struct xfs_inode **ipp, struct cred *credp);
36int xfs_rmdir(struct xfs_inode *dp, struct xfs_name *name,
37 struct xfs_inode *cdp);
36int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize, 38int xfs_readdir(struct xfs_inode *dp, void *dirent, size_t bufsize,
37 xfs_off_t *offset, filldir_t filldir); 39 xfs_off_t *offset, filldir_t filldir);
38int xfs_symlink(struct xfs_inode *dp, bhv_vname_t *dentry, 40int xfs_symlink(struct xfs_inode *dp, struct xfs_name *link_name,
39 char *target_path, mode_t mode, bhv_vnode_t **vpp, 41 const char *target_path, mode_t mode, struct xfs_inode **ipp,
40 struct cred *credp); 42 struct cred *credp);
41int xfs_rwlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
42void xfs_rwunlock(struct xfs_inode *ip, bhv_vrwlock_t locktype);
43int xfs_inode_flush(struct xfs_inode *ip, int flags); 43int xfs_inode_flush(struct xfs_inode *ip, int flags);
44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state); 44int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
45int xfs_reclaim(struct xfs_inode *ip); 45int xfs_reclaim(struct xfs_inode *ip);
46int xfs_change_file_space(struct xfs_inode *ip, int cmd, 46int xfs_change_file_space(struct xfs_inode *ip, int cmd,
47 xfs_flock64_t *bf, xfs_off_t offset, 47 xfs_flock64_t *bf, xfs_off_t offset,
48 struct cred *credp, int attr_flags); 48 struct cred *credp, int attr_flags);
49int xfs_rename(struct xfs_inode *src_dp, bhv_vname_t *src_vname, 49int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
50 bhv_vnode_t *target_dir_vp, bhv_vname_t *target_vname); 50 struct xfs_inode *src_ip, struct xfs_inode *target_dp,
51 struct xfs_name *target_name);
51int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, 52int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value,
52 int *valuelenp, int flags, cred_t *cred); 53 int *valuelenp, int flags, cred_t *cred);
53int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, 54int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,